# railseek6/diagnose_webui_issue.py

"""
Diagnostic Script for Web UI Bee Classification Issue
Investigates why Web UI doesn't detect bee classification while test.py can
"""

import os
import sys
import requests
import json
from pathlib import Path

# Configuration
# Base URL that every request in this script is built from.
LIGHTRAG_URL = "http://localhost:3015"
# NOTE(review): the API key is hard-coded in source — consider loading it from
# the environment before sharing or committing this script.
API_KEY = "jleu1212"
# Headers passed to every request; the key travels in the X-API-Key header.
HEADERS = {"X-API-Key": API_KEY}

def check_server_status():
    """Probe the server root URL and report whether it answered with HTTP 200.

    Returns:
        True when the GET request completes with status 200; False for any
        other status code or when the request raises.
    """
    print("🔍 CHECKING SERVER STATUS")
    print("=" * 50)

    try:
        reply = requests.get(f"{LIGHTRAG_URL}/", headers=HEADERS, timeout=10)
        reachable = reply.status_code == 200
        if reachable:
            print("✅ Server is running on port 3015")
        else:
            print(f"❌ Server status: {reply.status_code}")
        return reachable
    except Exception as exc:
        # Any failure (connection refused, timeout, ...) counts as "not up".
        print(f"❌ Server not reachable: {exc}")
        return False

def list_documents():
    """List all documents reported by the server and print a short summary.

    For every entry the filename, status and id are printed, and entries
    whose filename contains ``test.docx`` (case-insensitive) are flagged.

    Returns:
        The decoded JSON list of documents, or an empty list when the request
        raises, the status code is not 200, or the payload is not a list.
    """
    print("\n📄 LISTING DOCUMENTS IN SYSTEM")
    print("=" * 50)

    try:
        response = requests.get(f"{LIGHTRAG_URL}/documents", headers=HEADERS, timeout=10)
        if response.status_code != 200:
            print(f"❌ Failed to get documents: {response.status_code}")
            return []

        documents = response.json()
        if not isinstance(documents, list):
            # Callers iterate the result and call .get() on each entry, so
            # anything other than a list cannot be used; say so explicitly
            # instead of failing later with an opaque AttributeError.
            print(
                "❌ Error listing documents: unexpected response format "
                f"({type(documents).__name__})"
            )
            return []

        print(f"📊 Found {len(documents)} documents:")

        for doc in documents:
            print(f"   - {doc.get('filename', 'Unknown')}")
            print(f"     Status: {doc.get('status', 'Unknown')}")
            print(f"     ID: {doc.get('id', 'Unknown')}")

            # The key may be present with a null value, in which case
            # .get(key, '') still returns None; `or ''` keeps .lower() safe
            # so one odd entry does not discard the whole listing.
            if 'test.docx' in (doc.get('filename') or '').lower():
                print("     🎯 THIS IS THE TEST DOCUMENT")

        return documents
    except Exception as e:
        print(f"❌ Error listing documents: {e}")
        return []

def check_document_content(doc_id):
    """Fetch a document's details and chunks and look for bee keywords.

    Prints the document's filename, status and metadata, previews the first
    few chunks, and scans every chunk for the keywords ``bee`` or
    ``classification`` (case-insensitive substring match).

    Args:
        doc_id: Identifier interpolated into the ``/documents/{doc_id}`` URLs.

    Returns:
        True when at least one chunk contains a keyword; False otherwise,
        including when either request fails or raises.
    """
    print(f"\n🔍 CHECKING DOCUMENT CONTENT FOR ID: {doc_id}")
    print("=" * 50)

    # Only this many chunks are echoed to keep the output readable; the
    # keyword scan itself covers all chunks.
    preview_limit = 5

    try:
        # Try to get document details
        response = requests.get(f"{LIGHTRAG_URL}/documents/{doc_id}", headers=HEADERS, timeout=10)
        if response.status_code != 200:
            print(f"❌ Could not get document details: {response.status_code}")
            return False

        doc_detail = response.json()
        print("📋 Document details:")
        print(f"   - Filename: {doc_detail.get('filename')}")
        print(f"   - Status: {doc_detail.get('status')}")
        print(f"   - Metadata: {doc_detail.get('metadata', {})}")

        # Try to get chunks to see actual content
        chunks_response = requests.get(f"{LIGHTRAG_URL}/documents/{doc_id}/chunks", headers=HEADERS, timeout=10)
        if chunks_response.status_code != 200:
            print(f"❌ Could not get chunks: {chunks_response.status_code}")
            return False

        chunks = chunks_response.json()
        print(f"📝 Found {len(chunks)} chunks:")

        bee_found = False
        for number, chunk in enumerate(chunks, start=1):
            # A null content value must not abort the scan of later chunks.
            content = chunk.get('content') or ''
            if number <= preview_limit:
                print(f"   Chunk {number}: {content[:200]}...")

            # Scan every chunk: the result decides whether the caller
            # re-uploads, so a match beyond the preview window still counts.
            lowered = content.lower()
            if 'bee' in lowered or 'classification' in lowered:
                bee_found = True
                print(f"   🎯 BEE CLASSIFICATION FOUND IN CHUNK {number}")

        if not bee_found:
            print("   ❌ No bee classification found in chunks")

        return bee_found

    except Exception as e:
        print(f"❌ Error checking document content: {e}")

    return False

def test_search_methods():
    """Run a fixed set of queries against /search and print what comes back.

    Each query is first sent as a "standard" search (mode ``local``,
    ``top_k`` 10) with per-result output, then once per mode in
    ``local``/``hybrid``/``semantic`` (``top_k`` 5) with a result count only.
    Failures are printed and never raised.
    """
    print("\n🔍 TESTING DIFFERENT SEARCH METHODS")
    print("=" * 50)

    def post_search(text, search_mode, limit):
        # Single place that builds and sends the /search request.
        return requests.post(
            f"{LIGHTRAG_URL}/search",
            json={"query": text, "top_k": limit, "mode": search_mode},
            headers=HEADERS,
            timeout=30,
        )

    for query in ("bee", "Bee", "classification", "image", "photo of a bee", "Entity: Bee"):
        print(f"\n🔍 Searching for: '{query}'")

        # Method 1: standard search, with a line of detail per hit.
        try:
            reply = post_search(query, "local", 10)
            if reply.status_code != 200:
                print(f"❌ STANDARD SEARCH failed: {reply.status_code}")
            else:
                body = reply.json()
                if not body.get('results'):
                    print("❌ STANDARD SEARCH: No results")
                else:
                    print(f"✅ STANDARD SEARCH: Found {len(body['results'])} results")
                    for hit in body['results']:
                        text = hit.get('content', '')
                        relevance = hit.get('score', 0)
                        print(f"   - Score: {relevance:.4f}, Content: {text[:100]}...")
        except Exception as e:
            print(f"❌ STANDARD SEARCH error: {e}")

        # Method 2: same query across the individual modes, counts only.
        for mode in ("local", "hybrid", "semantic"):
            label = mode.upper()
            try:
                reply = post_search(query, mode, 5)
                if reply.status_code != 200:
                    print(f"❌ {label} MODE failed: {reply.status_code}")
                    continue

                body = reply.json()
                if body.get('results'):
                    print(f"✅ {label} MODE: Found {len(body['results'])} results")
                else:
                    print(f"❌ {label} MODE: No results")
            except Exception as e:
                print(f"❌ {label} MODE error: {e}")

def check_processing_logs():
    """Query the /status endpoint and print the decoded response.

    Prints the status code instead when it is not 200, and the exception
    message when the request raises. Returns nothing.
    """
    print("\n📋 CHECKING PROCESSING STATUS")
    print("=" * 50)

    try:
        reply = requests.get(f"{LIGHTRAG_URL}/status", headers=HEADERS, timeout=10)
        if reply.status_code != 200:
            print(f"❌ Could not get status: {reply.status_code}")
            return

        print(f"📊 Server status: {reply.json()}")
    except Exception as exc:
        print(f"❌ Error checking status: {exc}")

def reupload_test_document():
    """Upload ``test.docx`` from the current directory to /documents/upload.

    Returns:
        True when the server answers the upload with status 200; False when
        the file is missing, the status differs, or anything raises.
    """
    print("\n🔄 RE-UPLOADING TEST DOCUMENT")
    print("=" * 50)

    docx_name = "test.docx"
    docx_mime = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'

    if not os.path.exists(docx_name):
        print(f"❌ Test file {docx_name} not found")
        return False

    try:
        # The file must stay open while requests streams it as multipart.
        with open(docx_name, 'rb') as handle:
            reply = requests.post(
                f"{LIGHTRAG_URL}/documents/upload",
                files={'file': (docx_name, handle, docx_mime)},
                headers=HEADERS,
                timeout=30,
            )

        succeeded = reply.status_code == 200
        if succeeded:
            print("✅ Document re-uploaded successfully")
            print(f"   Upload result: {reply.json()}")
        else:
            print(f"❌ Upload failed: {reply.status_code} - {reply.text}")
        return succeeded
    except Exception as exc:
        print(f"❌ Upload error: {exc}")
        return False

def main():
    """Run the diagnostic steps in order and print a summary.

    Steps: check the server, list documents, inspect the first document whose
    filename contains ``test.docx``, exercise the search endpoint, query the
    status endpoint, and re-upload ``test.docx`` when no bee keywords were
    found in its chunks. Returns early when the server is not reachable.
    """
    print("🔧 WEB UI BEE CLASSIFICATION DIAGNOSTIC")
    print("=" * 60)
    print("This script investigates why the Web UI doesn't detect")
    print("bee classification while test.py can.")
    print()

    # Step 1: Check server status
    if not check_server_status():
        print("❌ Cannot proceed - server not available")
        return

    # Step 2: List documents
    documents = list_documents()

    # Step 3: Check if test.docx exists and its content.
    # `or ''` guards against entries whose filename is present but null;
    # this lookup runs outside any try block, so a None here would otherwise
    # crash the whole script with AttributeError.
    test_doc_id = next(
        (
            doc.get('id')
            for doc in documents
            if 'test.docx' in (doc.get('filename') or '').lower()
        ),
        None,
    )

    if test_doc_id:
        bee_in_content = check_document_content(test_doc_id)
    else:
        print("❌ test.docx not found in documents")
        bee_in_content = False

    # Step 4: Test different search methods
    test_search_methods()

    # Step 5: Check processing logs
    check_processing_logs()

    # Step 6: If bee not found, re-upload the document
    if not bee_in_content:
        print("\n🔄 Bee classification not found in current document")
        print("   Attempting to re-upload with enhanced processor...")
        reupload_test_document()

    print("\n" + "=" * 60)
    print("📊 DIAGNOSTIC SUMMARY")
    print("=" * 60)

    if bee_in_content:
        print("✅ Bee classification is present in document content")
        print("❌ But Web UI search is not finding it")
        print("\n💡 Possible issues:")
        print("   - Web UI might be using different search parameters")
        print("   - Entity extraction might be filtering out classification text")
        print("   - Search indexing might need to be refreshed")
    else:
        print("❌ Bee classification is NOT present in document content")
        print("\n💡 Possible issues:")
        print("   - Document was processed before enhanced processor was active")
        print("   - Image classification is not running properly")
        print("   - Enhanced entity extraction is not working")

    print("\n🔧 Recommended actions:")
    print("   1. Check server logs for document processing details")
    print("   2. Verify the enhanced document processor is being used")
    print("   3. Try re-uploading test.docx")
    print("   4. Check if OpenCLIP classifier is available and working")

# Run the diagnostic only when executed as a script, not when imported.
if __name__ == "__main__":
    main()