# railseek6/test_webui_bee_search.py

#!/usr/bin/env python3
"""
Test script to verify Web UI search functionality for bee classification
This tests the complete pipeline including LightRAG's document processing
"""

import requests
import json
import time
import os
from pathlib import Path

# Configuration
# Every setting can be overridden through an environment variable so the
# script can target another server, document or credential without editing
# the source. The defaults are the values this script has always used.
LIGHTRAG_URL = os.environ.get("LIGHTRAG_URL", "http://localhost:3015").rstrip("/")
TEST_DOCX_PATH = os.environ.get("TEST_DOCX_PATH", "test.docx")
# NOTE(review): prefer supplying LIGHTRAG_API_KEY via the environment instead
# of relying on the key committed here.
API_KEY = os.environ.get("LIGHTRAG_API_KEY", "jleu1212")  # Default is the API key from zrun.bat

# MIME type sent with the uploaded .docx file.
_DOCX_MIME_TYPE = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'


def _auth_headers(json_body=False):
    """Return request headers carrying the API key (plus a JSON content type if asked)."""
    headers = {'X-API-Key': API_KEY}
    if json_body:
        headers['Content-Type'] = 'application/json'
    return headers


def _server_is_healthy():
    """Return True when GET /health answers with HTTP 200, printing the outcome."""
    try:
        response = requests.get(f"{LIGHTRAG_URL}/health", timeout=10)
    except Exception as e:
        print(f"❌ Cannot connect to LightRAG server: {e}")
        return False

    if response.status_code != 200:
        print("❌ LightRAG server is not responding")
        return False
    print("✅ LightRAG server is running")
    return True


def _upload_test_document(timeout=60):
    """POST TEST_DOCX_PATH to /documents/upload; return True on HTTP 200."""
    print("\n📤 Uploading test document...")
    try:
        with open(TEST_DOCX_PATH, 'rb') as f:
            files = {'file': (TEST_DOCX_PATH, f, _DOCX_MIME_TYPE)}
            # A timeout keeps a stalled server from hanging the script forever.
            response = requests.post(
                f"{LIGHTRAG_URL}/documents/upload",
                files=files,
                headers=_auth_headers(),
                timeout=timeout,
            )

        if response.status_code != 200:
            print(f"❌ Upload failed: {response.status_code} - {response.text}")
            return False

        track_id = response.json().get('track_id')
        print(f"✅ Document uploaded successfully, track_id: {track_id}")
        return True
    except Exception as e:
        # Deliberately broad: any failure here (file, network, bad JSON) fails the test.
        print(f"❌ Upload error: {e}")
        return False


def _wait_for_indexing(max_wait_time=120, wait_interval=5):
    """Poll /documents/pipeline_status until it reports not busy or time runs out.

    Returns True when the pipeline reported ``busy`` as false, False on timeout.
    Polling errors are printed and retried rather than treated as fatal.
    """
    print("\n⏳ Waiting for indexing to complete...")
    # A monotonic deadline bounds the total wait, including time spent in requests.
    deadline = time.monotonic() + max_wait_time

    while time.monotonic() < deadline:
        try:
            status_response = requests.get(
                f"{LIGHTRAG_URL}/documents/pipeline_status",
                headers=_auth_headers(),
                timeout=10,
            )

            if status_response.status_code == 200:
                status_data = status_response.json()
                if not status_data.get('busy', False):
                    print("✅ Indexing completed")
                    return True
                job_name = status_data.get('job_name', 'Unknown')
                print(f"⏳ Still indexing... ({job_name})")
            else:
                print(f"⚠️  Could not get pipeline status: {status_response.status_code}")

        except Exception as e:
            print(f"⚠️  Error checking pipeline status: {e}")

        time.sleep(wait_interval)

    print("❌ Indexing timeout - proceeding with search anyway")
    return False


def _post_search(query, top_k, timeout):
    """POST a local-mode query to /search and return the raw response."""
    payload = {
        "query": query,
        "top_k": top_k,
        "mode": "local"
    }
    return requests.post(
        f"{LIGHTRAG_URL}/search",
        json=payload,
        headers=_auth_headers(json_body=True),
        timeout=timeout,
    )


def _report_bee_results(results):
    """Print every search hit and return True if any hit mentions the classification."""
    bee_found = False
    for i, result in enumerate(results):
        # Tolerate null/missing fields so one odd hit does not abort the analysis.
        content = str(result.get('content') or '')
        score = result.get('score', 0)
        source = result.get('source', 'Unknown')
        score_text = f"{score:.4f}" if isinstance(score, (int, float)) else str(score)

        print(f"\nResult {i+1} (Score: {score_text}, Source: {source}):")
        print(f"Content preview: {content[:200]}...")

        # NOTE(review): a hit containing only "classification" (without "bee")
        # also counts as a match - confirm this is the intended pass criterion.
        lowered = content.lower()
        if 'bee' in lowered or 'classification' in lowered:
            bee_found = True
            print("🎯 BEE CLASSIFICATION DETECTED IN SEARCH RESULT!")
    return bee_found


def _try_alternative_queries():
    """Run a few fallback queries and print what each returns (diagnostics only)."""
    print("\n🔍 Trying alternative search queries...")
    alternative_queries = ["classification", "image", "photo", "clipart"]

    for alt_query in alternative_queries:
        # Each query is isolated so one failure does not hide the remaining diagnostics.
        try:
            alt_response = _post_search(alt_query, top_k=5, timeout=10)
            if alt_response.status_code != 200:
                print(f"Query '{alt_query}': HTTP {alt_response.status_code}")
                continue

            alt_results = alt_response.json().get('results') or []
            if not alt_results:
                print(f"Query '{alt_query}': No results")
                continue

            print(f"Query '{alt_query}': Found {len(alt_results)} results")
            for result in alt_results[:2]:  # Show first 2 results
                content_preview = str(result.get('content') or '')[:150]
                print(f"  - {content_preview}...")
        except Exception as e:
            print(f"⚠️  Query '{alt_query}' failed: {e}")


def test_webui_search():
    """Test complete Web UI search pipeline for bee classification.

    Steps: check the server's /health endpoint, confirm the test document
    exists, upload it, wait for the indexing pipeline to go idle, then search
    for "bee" and inspect the hits. If nothing matches, a few alternative
    queries are run purely as diagnostics.

    Returns:
        True when at least one hit for "bee" contains "bee" or
        "classification" (case-insensitive); False on any failure along the
        way. Progress is reported with print(); exceptions are caught and
        reported rather than propagated.
    """
    print("🧪 Testing Web UI Bee Search Pipeline")
    print("=" * 50)

    # Check if LightRAG server is running
    if not _server_is_healthy():
        return False

    # Check if test document exists
    if not os.path.exists(TEST_DOCX_PATH):
        print(f"❌ Test document not found: {TEST_DOCX_PATH}")
        return False
    print(f"✅ Test document found: {TEST_DOCX_PATH}")

    if not _upload_test_document():
        return False

    # A timeout here is not fatal: the search is attempted regardless.
    _wait_for_indexing()

    # Test search for "bee"
    print("\n🔍 Testing search for 'bee'...")
    try:
        search_response = _post_search("bee", top_k=10, timeout=30)

        if search_response.status_code != 200:
            print(f"❌ Search failed: {search_response.status_code} - {search_response.text}")
            return False

        results = search_response.json().get('results') or []
        print(f"✅ Search completed, found {len(results)} results")

        if _report_bee_results(results):
            print("\n✅ SUCCESS: Bee classification is searchable in Web UI!")
            return True

        print("\n❌ Bee classification not found in search results")
        print("This might indicate:")
        print("- Classification metadata not properly indexed")
        print("- Search query needs adjustment")
        print("- Indexing may not have completed")

        _try_alternative_queries()
        return False

    except Exception as e:
        # Deliberately broad: this is the script's top-level boundary for the search step.
        print(f"❌ Search error: {e}")
        return False

def check_document_status():
    """Print a per-status summary of the documents known to the server.

    Sends GET /documents with the API key and, for each status in the
    response's ``statuses`` mapping, prints the document count followed by the
    ``file_path`` and ``id`` of up to three documents. Returns nothing;
    failures are printed rather than raised.
    """
    print("\n📊 Checking document status...")
    try:
        # A timeout keeps an unresponsive server from hanging the script.
        response = requests.get(
            f"{LIGHTRAG_URL}/documents",
            headers={'X-API-Key': API_KEY},
            timeout=30,
        )

        if response.status_code != 200:
            print(f"❌ Could not get document status: {response.status_code}")
            return

        # Treat a missing or null mapping / list as empty instead of crashing.
        statuses = response.json().get('statuses') or {}
        for status, docs in statuses.items():
            docs = docs or []
            print(f"{status}: {len(docs)} documents")
            for doc in docs[:3]:  # Show first 3 documents of each status
                print(f"  - {doc.get('file_path', 'Unknown')} (ID: {doc.get('id', 'Unknown')})")

    except Exception as e:
        # Deliberately broad: this is a best-effort diagnostic and must not abort the run.
        print(f"❌ Error checking document status: {e}")

if __name__ == "__main__":
    # Script entry point: run the search test, print the server's document
    # summary, then report the verdict and exit with a matching status code.
    print("Web UI Bee Search Test")
    print("This test verifies that bee classification is searchable through the Web UI")
    print("Make sure LightRAG server is running on port 3015")
    print()

    success = test_webui_search()
    check_document_status()

    if success:
        print("\n🎉 TEST PASSED: Bee classification is successfully searchable in Web UI!")
    else:
        print("\n💥 TEST FAILED: Bee classification is not searchable in Web UI")
        print("\nTroubleshooting steps:")
        print("1. Check that LightRAG server is running on port 3015")
        print("2. Verify the document processor is using our custom implementation")
        print("3. Check if the test.docx file contains the bee image")
        print("4. Verify that classification metadata is being added to the content")
        print("5. Check LightRAG logs for any processing errors")

    # Exit non-zero on failure so shells and CI can detect a failed run.
    raise SystemExit(0 if success else 1)