# railseek6/test_enhanced_processor.py

"""
Test Enhanced Document Processor with Bee Classification
Uploads test.docx and verifies bee classification is searchable
"""

import requests
import time
import os

# Configuration
# Both values may be overridden through environment variables so the script
# can target another server or credential without editing the source. The
# fallbacks are the original hard-coded local-development values.
LIGHTRAG_URL = os.environ.get("LIGHTRAG_URL", "http://localhost:3015")
API_KEY = os.environ.get("LIGHTRAG_API_KEY", "jleu1212")
# Header dict attached to every HTTP request issued by this script.
HEADERS = {"X-API-Key": API_KEY}

def clear_existing_documents():
    """Delete every document currently listed by the server.

    Fetches ``GET {LIGHTRAG_URL}/documents`` and issues one ``DELETE`` request
    per listed entry that carries an ``id`` key. This is best-effort cleanup:
    every failure is printed and swallowed so the caller can carry on.

    Returns:
        None. Progress and failures are reported on stdout only.
    """
    print("🗑️  CLEARING EXISTING DOCUMENTS...")

    try:
        response = requests.get(f"{LIGHTRAG_URL}/documents", headers=HEADERS, timeout=10)
        if response.status_code != 200:
            print("❌ Could not get documents list")
            return

        documents = response.json()
        if not isinstance(documents, list):
            # Nothing can be deleted from a payload we do not understand, so
            # do not claim that the store was cleared.
            print("⚠️  Unexpected documents payload; nothing was deleted")
            return

        not_deleted = 0
        for doc in documents:
            if not isinstance(doc, dict) or 'id' not in doc:
                # Without an id there is no URL to send the DELETE to.
                not_deleted += 1
                print("⚠️  Skipping document entry without an id")
                continue

            delete_response = requests.delete(
                f"{LIGHTRAG_URL}/documents/{doc['id']}",
                headers=HEADERS,
                timeout=10
            )
            if delete_response.status_code == 200:
                print(f"✅ Deleted document: {doc.get('filename', 'Unknown')}")
            else:
                not_deleted += 1
                print(f"❌ Failed to delete document: {delete_response.status_code}")

        # Only announce success when every listed entry was actually removed.
        if not_deleted:
            print(f"⚠️  {not_deleted} document(s) could not be deleted")
        else:
            print("✅ All documents cleared")
    except Exception as e:
        # Deliberately broad: cleanup must never abort the test run.
        print(f"❌ Error clearing documents: {e}")

def upload_test_document():
    """Send test.docx to the server's upload endpoint.

    The file is looked up relative to the current working directory and
    posted as a multipart ``file`` field to ``{LIGHTRAG_URL}/documents/upload``.

    Returns:
        True when the server answers with HTTP 200 and a JSON body;
        False when the file is missing, the server answers with any other
        status, or any exception is raised along the way.
    """
    print("📤 UPLOADING TEST DOCUMENT WITH ENHANCED PROCESSOR...")

    docx_path = "test.docx"
    docx_mime = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'

    if not os.path.exists(docx_path):
        print(f"❌ Test file {docx_path} not found")
        return False

    try:
        with open(docx_path, 'rb') as handle:
            reply = requests.post(
                f"{LIGHTRAG_URL}/documents/upload",
                files={'file': (docx_path, handle, docx_mime)},
                headers=HEADERS,
                timeout=60,  # uploads trigger processing, so allow extra time
            )

        # Guard clause: anything other than 200 is reported as a failure.
        if reply.status_code != 200:
            print(f"❌ Upload failed: {reply.status_code} - {reply.text}")
            return False

        print("✅ Document uploaded successfully")
        payload = reply.json()
        print(f"   Upload result: {payload}")
        return True
    except Exception as err:
        print(f"❌ Upload error: {err}")
        return False

def wait_for_processing(max_attempts=20, poll_interval=6):
    """Poll the document list until test.docx reports status 'processed'.

    Args:
        max_attempts: Number of status checks to perform before giving up.
        poll_interval: Seconds to sleep between two consecutive checks. The
            defaults (20 checks, 6 seconds apart) keep the original budget
            of roughly two minutes.

    Returns:
        True as soon as a listed document whose filename contains
        "test.docx" has status 'processed'; False when that has not
        happened after ``max_attempts`` checks.
    """
    print("⏳ WAITING FOR DOCUMENT PROCESSING...")

    for attempt in range(max_attempts):
        try:
            response = requests.get(f"{LIGHTRAG_URL}/documents", headers=HEADERS, timeout=10)
            if response.status_code == 200:
                documents = response.json()
                if isinstance(documents, list):
                    for doc in documents:
                        # A missing or null filename must not abort this check.
                        filename = doc.get('filename') or ''
                        if 'test.docx' not in filename.lower():
                            continue
                        status = doc.get('status', 'unknown')
                        print(f"📄 Document status: {status}")
                        if status == 'processed':
                            print("✅ Document processing completed")
                            return True
        except Exception as e:
            # Transient errors are reported and the next attempt retries.
            print(f"⚠️  Status check error: {e}")

        # Sleep only between attempts; waiting after the last one would just
        # delay the timeout report.
        if attempt < max_attempts - 1:
            time.sleep(poll_interval)

    print("❌ Timeout waiting for processing")
    return False

def _scan_results_for_bee(hits):
    """Print each hit whose content mentions 'bee' or 'classification'; return True if any did."""
    found = False
    for hit in hits:
        if not isinstance(hit, dict):
            continue
        # Content may be missing or null; treat both as empty text.
        content = str(hit.get('content') or '').lower()
        if 'bee' in content or 'classification' in content:
            # Record the match before printing so that a formatting problem
            # in the report can never lose a genuine hit.
            found = True
            score = hit.get('score', 0)
            score_text = f"{score:.4f}" if isinstance(score, (int, float)) else str(score)
            print(f"🎯 BEE FOUND: Score {score_text}")
            print(f"   Content: {content[:200]}...")
    return found

def test_bee_search():
    """Run a fixed set of search queries and look for bee-related content.

    Each query is posted to ``{LIGHTRAG_URL}/search`` in "hybrid" mode with
    ``top_k`` 10. A result counts as a match when its lowercased ``content``
    contains the substring "bee" or "classification".

    Returns:
        True if at least one result of at least one query matched,
        otherwise False. Errors on individual queries are printed and do
        not stop the remaining queries.
    """
    print("🔍 TESTING BEE SEARCH...")

    search_queries = [
        "bee",
        "Bee",
        "classification",
        "photo of a bee",
        "Entity: Bee",
        "insect",
        "animal"
    ]

    bee_found = False

    for query in search_queries:
        try:
            search_payload = {
                "query": query,
                "top_k": 10,
                "mode": "hybrid"  # Use hybrid mode which worked in diagnostics
            }

            response = requests.post(
                f"{LIGHTRAG_URL}/search",
                json=search_payload,
                headers=HEADERS,
                timeout=30
            )

            if response.status_code != 200:
                print(f"❌ '{query}' search failed: {response.status_code}")
                continue

            results = response.json()
            # Only a JSON object can carry a 'results' list; anything else
            # is treated as "no results" rather than raising.
            hits = results.get('results') if isinstance(results, dict) else None
            if not hits:
                print(f"❌ '{query}': No results")
                continue

            print(f"✅ '{query}': Found {len(hits)} results")
            if _scan_results_for_bee(hits):
                bee_found = True

        except Exception as e:
            # Deliberately broad: one failing query must not end the run.
            print(f"❌ '{query}' search error: {e}")

    return bee_found

def main():
    """Drive the full scenario: reset, upload, wait, search, and report.

    Returns:
        True when the bee search found matching content; False when the
        upload failed or no matching content was found.
    """
    rule = "=" * 60

    print("🧪 TESTING ENHANCED DOCUMENT PROCESSOR")
    print(rule)

    # Start from an empty document store.
    clear_existing_documents()

    # Without a successful upload there is nothing to search for.
    uploaded = upload_test_document()
    if not uploaded:
        print("❌ Document upload failed")
        return False

    # A processing timeout is not fatal; the search still runs.
    finished = wait_for_processing()
    if not finished:
        print("⚠️  Processing timeout, but continuing with search...")

    bee_found = test_bee_search()

    print("\n" + rule)
    print("📊 TEST RESULTS")
    print(rule)

    # Select the whole verdict once; the lines are printed in the same order
    # as before, around the fixed "next steps" section.
    if bee_found:
        outcome = True
        summary_lines = (
            "🎉 SUCCESS: Bee classification is now searchable!",
            "   The enhanced document processor is working correctly.",
            "   The Web UI should now detect bee classification.",
        )
        closing_line = "\n✅ TEST PASSED: Web UI should now detect bee classification"
    else:
        outcome = False
        summary_lines = (
            "❌ ISSUE: Bee classification still not searchable",
            "   There may be an issue with the enhanced processor",
            "   or the image classification is not running.",
        )
        closing_line = "\n❌ TEST FAILED: Further investigation needed"

    for line in summary_lines:
        print(line)

    print("\n💡 Next steps:")
    print("   1. Open the Web UI at http://localhost:3015/webui")
    print("   2. Search for 'bee' to verify classification appears")

    print(closing_line)
    return outcome

if __name__ == "__main__":
    # Script entry point: run the scenario and print a one-line verdict.
    success = main()
    if success:
        print("\n🎉 The enhanced document processor is working correctly!")
    else:
        print("\n⚠️  The enhanced document processor needs investigation.")
    # Propagate the outcome as the process exit status so shells and CI
    # jobs can detect a failed run (0 = passed, 1 = failed).
    raise SystemExit(0 if success else 1)