# railseek6/test_final_workflow.py

"""
Final Test Workflow for Document Processing Pipeline
Tests the complete workflow with enhanced entity extraction for bee classification
"""

import os
import sys
import time
import requests
import json
from pathlib import Path

# Configuration
# Every setting can be overridden through an environment variable; the
# defaults are the values this script used when they were hard-coded.
# A trailing slash is stripped so that f"{LIGHTRAG_URL}/path" never yields "//".
LIGHTRAG_URL = os.environ.get("LIGHTRAG_URL", "http://localhost:3016").rstrip("/")
# NOTE: the default key is kept only for backward compatibility with local
# runs; supply LIGHTRAG_API_KEY instead of relying on a key stored in source.
API_KEY = os.environ.get("LIGHTRAG_API_KEY", "jleu1212")
# Path of the document uploaded by the workflow (relative to the working directory).
TEST_FILE = os.environ.get("LIGHTRAG_TEST_FILE", "test.docx")
# Headers sent with authenticated API requests.
HEADERS = {"X-API-Key": API_KEY}

def check_server_status():
    """Report whether the server's root URL answers with HTTP 200.

    Returns:
        True when the GET request completes with status 200. False for any
        other status, or when the request raises (the error is printed).
    """
    try:
        reply = requests.get(f"{LIGHTRAG_URL}/", headers=HEADERS, timeout=10)
        is_up = reply.status_code == 200
    except Exception as err:
        print(f"❌ Server not reachable: {err}")
        is_up = False
    return is_up

def upload_document():
    """Send TEST_FILE to the server's /documents/upload endpoint.

    Returns:
        True if the server replies with HTTP 200 and the reply body parses
        as JSON. False if the file does not exist, the server replies with
        another status, or any exception is raised along the way. Every
        outcome prints a message.
    """
    print(f"📤 Uploading {TEST_FILE} to LightRAG...")

    file_missing = not os.path.exists(TEST_FILE)
    if file_missing:
        print(f"❌ Test file {TEST_FILE} not found")
        return False

    try:
        # The file is always declared as a Word (.docx) document.
        docx_mime = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
        with open(TEST_FILE, 'rb') as handle:
            reply = requests.post(
                f"{LIGHTRAG_URL}/documents/upload",
                files={'file': (TEST_FILE, handle, docx_mime)},
                headers=HEADERS,
                timeout=30,
            )

        # Guard clause: anything other than 200 counts as a failed upload.
        if reply.status_code != 200:
            print(f"❌ Upload failed: {reply.status_code} - {reply.text}")
            return False

        print("✅ Document uploaded successfully")
        payload = reply.json()
        print(f"   Upload result: {payload}")
        return True

    except Exception as err:
        print(f"❌ Upload error: {err}")
        return False

def wait_for_processing(max_wait=60, poll_interval=5):
    """Poll the /documents endpoint until the listed documents have settled.

    The document list is fetched up to ``max_wait // poll_interval`` times
    (always at least once), sleeping ``poll_interval`` seconds between
    attempts. A poll succeeds when the server returns a non-empty list and
    none of its entries reports an in-progress status.

    Args:
        max_wait: Approximate total number of seconds to keep polling.
        poll_interval: Seconds to sleep between two polls. A non-positive
            value results in a single poll.

    Returns:
        True once a poll shows documents and none of them is still being
        processed; False if that never happens within the allotted attempts.
    """
    print("⏳ Waiting for document processing...")

    # NOTE(review): these status names are assumed to be what the server
    # reports while work is in flight — confirm against the server's API.
    # Any other (or missing) status is treated as settled, so entries without
    # a status field keep the previous "documents are visible" behaviour.
    in_progress = {"pending", "processing"}

    if poll_interval > 0:
        # Always poll at least once, even when max_wait < poll_interval.
        attempts = max(1, int(max_wait // poll_interval))
    else:
        attempts = 1

    for attempt in range(attempts):
        # Sleep only *between* polls; there is nothing to wait for before the
        # first one or after the last one.
        if attempt:
            time.sleep(poll_interval)

        try:
            response = requests.get(f"{LIGHTRAG_URL}/documents", headers=HEADERS, timeout=10)
            if response.status_code != 200:
                continue

            documents = response.json()
            if not documents:
                continue

            print(f"📄 Found {len(documents)} documents in system")
            still_running = False
            for doc in documents:
                status = doc.get('status', 'Unknown')
                print(f"   - {doc.get('filename', 'Unknown')}: {status}")
                if str(status).lower() in in_progress:
                    still_running = True

            if not still_running:
                return True

        except Exception as e:
            # Best-effort polling: report the problem and try again.
            print(f"⚠️  Status check error: {e}")

    print("❌ Timeout waiting for processing")
    return False

def search_for_bee():
    """Query the /search endpoint for "bee" and inspect the returned hits.

    Returns:
        True when at least one hit's lower-cased content contains "bee" or
        "classification". False when the request fails, returns a non-200
        status, yields no hits, or no hit matches. Progress and results are
        printed along the way.
    """
    print("🔍 Searching for 'bee' in documents...")

    try:
        reply = requests.post(
            f"{LIGHTRAG_URL}/search",
            json={"query": "bee", "top_k": 10, "mode": "local"},
            headers=HEADERS,
            timeout=30,
        )

        if reply.status_code != 200:
            print(f"❌ Search failed: {reply.status_code} - {reply.text}")
            return False

        body = reply.json()
        print("✅ Search completed successfully")

        # Hits live under the "results" key; a missing or empty entry means no hits.
        hits = body["results"] if body and "results" in body else None
        if not hits:
            print("❌ No search results found")
            return False

        print(f"📊 Found {len(hits)} results for 'bee':")
        for rank, hit in enumerate(hits, start=1):
            print(f"   {rank}. Score: {hit.get('score', 0):.4f}")
            print(f"      Content: {hit.get('content', '')[:200]}...")
            if "metadata" in hit:
                print(f"      Metadata: {hit.get('metadata', {})}")
            print()

        # A hit matches when either keyword appears in its lower-cased content.
        lowered_texts = (hit.get("content", "").lower() for hit in hits)
        matched = any(
            "bee" in text or "classification" in text for text in lowered_texts
        )

        if not matched:
            print("❌ Bee classification not found in search results")
            return False

        print("🎉 SUCCESS: Bee classification found in search results!")
        return True

    except Exception as err:
        print(f"❌ Search error: {err}")
        return False

def search_for_entities():
    """Run a fixed list of entity-related queries against /search.

    Each query is attempted independently; a failure of one query is printed
    and does not stop the remaining ones.

    Returns:
        True if any query produced at least one hit, otherwise False.
    """
    print("🔍 Searching for entity-related terms...")

    queries = (
        "bee image classification",
        "insect",
        "animal",
        "photo of a bee",
        "Entity: Bee",
    )

    # Only the number of hits matters for the return value.
    hit_count = 0

    for term in queries:
        try:
            reply = requests.post(
                f"{LIGHTRAG_URL}/search",
                json={"query": term, "top_k": 5, "mode": "local"},
                headers=HEADERS,
                timeout=30,
            )

            if reply.status_code != 200:
                print(f"❌ Search for '{term}' failed: {reply.status_code}")
                continue

            body = reply.json()
            hits = body["results"] if body and "results" in body else None
            if not hits:
                print(f"❌ No results for '{term}'")
                continue

            print(f"✅ Found {len(hits)} results for '{term}':")
            for hit in hits:
                print(f"   - Score: {hit.get('score', 0):.4f}")
                print(f"     Content: {hit.get('content', '')[:150]}...")
                hit_count += 1

        except Exception as err:
            print(f"❌ Search for '{term}' error: {err}")

    return hit_count > 0

def test_web_ui():
    """Check that GET /webui on the server answers with HTTP 200.

    The request is sent without the API-key headers.

    Returns:
        True on a 200 response; False on any other status or when the
        request raises (the error is printed).
    """
    print("🌐 Testing Web UI access...")

    try:
        reply = requests.get(f"{LIGHTRAG_URL}/webui", timeout=10)
        reachable = reply.status_code == 200
        if reachable:
            print("✅ Web UI is accessible")
        else:
            print(f"❌ Web UI not accessible: {reply.status_code}")
        return reachable
    except Exception as err:
        print(f"❌ Web UI test error: {err}")
        return False

def main():
    """Run the whole workflow and print a summary of every step.

    Steps: server check, upload, wait for processing, "bee" search, entity
    search, Web UI check. The run stops early (returning False) when the
    server is unreachable or the upload fails; a processing timeout is only
    reported and the searches still run.

    Returns:
        The result of the "bee" search (True when it found a match), or
        False when the run stopped early.
    """
    print("=" * 60)
    print("🚀 FINAL DOCUMENT PROCESSING WORKFLOW TEST")
    print("=" * 60)
    print(f"📡 Server: {LIGHTRAG_URL}")
    # Never echo the full credential: console output tends to end up in logs.
    masked_key = f"{API_KEY[:2]}***" if API_KEY else "(not set)"
    print(f"🔑 API Key: {masked_key}")
    print(f"📄 Test File: {TEST_FILE}")
    print()

    # Step 1: Check server status
    print("1. Checking server status...")
    server_ok = check_server_status()
    if not server_ok:
        print("❌ Cannot proceed - server not running")
        return False

    # Step 2: Upload document
    print("\n2. Uploading document...")
    upload_ok = upload_document()
    if not upload_ok:
        return False

    # Step 3: Wait for processing
    print("\n3. Waiting for document processing...")
    processing_ok = wait_for_processing()
    if not processing_ok:
        print("⚠️  Processing timeout, but continuing with search...")

    # Step 4: Search for bee
    print("\n4. Testing search functionality...")
    bee_found = search_for_bee()

    # Step 5: Search for entities
    print("\n5. Testing entity search...")
    entities_found = search_for_entities()

    # Step 6: Test Web UI
    print("\n6. Testing Web UI...")
    webui_accessible = test_web_ui()

    # Final results: each line reports the outcome recorded above (no second
    # request to the server), and its icon reflects that outcome.
    print("\n" + "=" * 60)
    print("📊 TEST RESULTS SUMMARY")
    print("=" * 60)
    summary = [
        ("Server Status", server_ok, "OK", "FAILED"),
        ("Document Upload", upload_ok, "SUCCESS", "FAILED"),
        ("Document Processing", processing_ok, "COMPLETE", "TIMED OUT"),
        ("Bee Search", bee_found, "FOUND", "NOT FOUND"),
        ("Entity Search", entities_found, "FOUND", "NOT FOUND"),
        ("Web UI", webui_accessible, "ACCESSIBLE", "INACCESSIBLE"),
    ]
    for label, passed, good_text, bad_text in summary:
        icon = "✅" if passed else "❌"
        print(f"{icon} {label}: {good_text if passed else bad_text}")

    if bee_found:
        print("\n🎉 SUCCESS: Enhanced document processing with entity extraction is working!")
        print("   Bee classification should now be searchable in the Web UI")
    else:
        print("\n❌ ISSUE: Bee classification not found in search results")
        print("   This may indicate that the enhanced entity extraction needs further tuning")

    print("\n💡 Next steps:")
    # Derive the link from the configured server rather than repeating the URL.
    print(f"   - Open the Web UI at {LIGHTRAG_URL}/webui")
    print("   - Search for 'bee' to verify classification appears")
    print("   - Check that the first image is recognized as a bee")

    return bee_found

if __name__ == "__main__":
    # Map the workflow result onto a process exit status: 0 on success, 1 otherwise.
    exit_code = 1
    if main():
        exit_code = 0
    sys.exit(exit_code)