# File: railseek6/diagnose_webui_issue.py
# Size: 292 lines, 10 KiB
# Language: Python

"""
Diagnostic Script for Web UI Bee Classification Issue
Investigates why Web UI doesn't detect bee classification while test.py can
"""
import os
import sys
import requests
import json
from pathlib import Path
# Configuration
# Both settings can be overridden from the environment so the script can be
# pointed at another server or key without editing the file. The defaults
# are the values that were previously hard-coded here.
LIGHTRAG_URL = os.environ.get("LIGHTRAG_URL", "http://localhost:3015")
API_KEY = os.environ.get("LIGHTRAG_API_KEY", "jleu1212")
# Header dictionary sent with every request issued by this script.
HEADERS = {"X-API-Key": API_KEY}
def check_server_status():
    """Report whether the server answers on its root endpoint.

    Sends a GET to ``LIGHTRAG_URL`` + ``/`` with ``HEADERS`` and a 10 second
    timeout, and prints the outcome.

    Returns:
        bool: True if the reply has status code 200; False for any other
        status code or if the request itself fails.
    """
    print("🔍 CHECKING SERVER STATUS")
    print("=" * 50)
    try:
        # Only the network call can fail here, so keep the try body minimal.
        response = requests.get(f"{LIGHTRAG_URL}/", headers=HEADERS, timeout=10)
    except requests.RequestException as e:
        print(f"❌ Server not reachable: {e}")
        return False

    if response.status_code == 200:
        # Report the configured URL instead of a literal port number so the
        # message cannot drift from the LIGHTRAG_URL setting.
        print(f"✅ Server is running at {LIGHTRAG_URL}")
        return True

    print(f"❌ Server status: {response.status_code}")
    return False
def list_documents():
    """Fetch the document listing from the server and print a summary.

    Issues a GET to ``/documents`` and, for each returned entry, prints its
    ``filename``, ``status`` and ``id`` fields, flagging any entry whose
    filename contains ``test.docx`` (case-insensitive).

    Returns:
        The decoded JSON payload on HTTP 200; an empty list when the status
        code is anything else or when any exception is raised along the way.
    """
    print("\n📄 LISTING DOCUMENTS IN SYSTEM")
    print("=" * 50)
    try:
        reply = requests.get(f"{LIGHTRAG_URL}/documents", headers=HEADERS, timeout=10)
        if reply.status_code != 200:
            print(f"❌ Failed to get documents: {reply.status_code}")
            return []

        listing = reply.json()
        print(f"📊 Found {len(listing)} documents:")
        for entry in listing:
            shown_name = entry.get('filename', 'Unknown')
            print(f" - {shown_name}")
            shown_status = entry.get('status', 'Unknown')
            print(f" Status: {shown_status}")
            shown_id = entry.get('id', 'Unknown')
            print(f" ID: {shown_id}")
            # Highlight the document this diagnostic is concerned with.
            lowered_name = entry.get('filename', '').lower()
            if 'test.docx' in lowered_name:
                print(" 🎯 THIS IS THE TEST DOCUMENT")
        return listing
    except Exception as err:
        print(f"❌ Error listing documents: {err}")
        return []
def check_document_content(doc_id, max_chunks=5):
    """Print a document's details and scan its leading chunks for bee keywords.

    Fetches ``/documents/{doc_id}`` and prints the ``filename``, ``status``
    and ``metadata`` fields, then fetches ``/documents/{doc_id}/chunks`` and
    prints the first 200 characters of each inspected chunk.

    Args:
        doc_id: Identifier interpolated into the two request URLs.
        max_chunks: Number of leading chunks to print and scan. Defaults to
            5; ``None`` scans every chunk, since it is used as a slice bound.

    Returns:
        bool: True if at least one inspected chunk's ``content`` contains
        ``bee`` or ``classification``, compared case-insensitively as plain
        substrings (so a word such as "been" also counts). False otherwise,
        including when either request returns a non-200 status.
    """
    print(f"\n🔍 CHECKING DOCUMENT CONTENT FOR ID: {doc_id}")
    print("=" * 50)
    # Initialised before the try block so that a failure part-way through
    # the scan still reports a match found in an earlier chunk.
    bee_found = False
    try:
        # Try to get document details
        response = requests.get(f"{LIGHTRAG_URL}/documents/{doc_id}", headers=HEADERS, timeout=10)
        if response.status_code != 200:
            print(f"❌ Could not get document details: {response.status_code}")
            return False

        doc_detail = response.json()
        print(f"📋 Document details:")
        print(f" - Filename: {doc_detail.get('filename')}")
        print(f" - Status: {doc_detail.get('status')}")
        print(f" - Metadata: {doc_detail.get('metadata', {})}")

        # Try to get chunks to see actual content
        chunks_response = requests.get(f"{LIGHTRAG_URL}/documents/{doc_id}/chunks", headers=HEADERS, timeout=10)
        if chunks_response.status_code != 200:
            print(f"❌ Could not get chunks: {chunks_response.status_code}")
            return False

        chunks = chunks_response.json()
        print(f"📝 Found {len(chunks)} chunks:")
        for number, chunk in enumerate(chunks[:max_chunks], start=1):
            # ``or ''`` also covers a chunk whose content is present but
            # null, which would otherwise raise on slicing / lower().
            content = chunk.get('content') or ''
            print(f" Chunk {number}: {content[:200]}...")
            # Check for bee keywords
            lowered = content.lower()
            if 'bee' in lowered or 'classification' in lowered:
                bee_found = True
                print(f" 🎯 BEE CLASSIFICATION FOUND IN CHUNK {number}")
        if not bee_found:
            print(" ❌ No bee classification found in chunks")
    except Exception as e:
        print(f"❌ Error checking document content: {e}")
    return bee_found
def test_search_methods():
    """Run a fixed set of queries against ``/search`` and print the outcomes.

    For every query, one "standard" search is issued (``top_k`` 10, mode
    ``local``) and each hit's score and first 100 content characters are
    printed. The same query is then repeated with ``top_k`` 5 in the
    ``local``, ``hybrid`` and ``semantic`` modes, printing only hit counts.
    Every request is wrapped so that a failure is printed and the run
    continues. Nothing is returned.
    """

    def _post_search(text, limit, search_mode):
        # Single place that builds and sends the search request.
        body = {
            "query": text,
            "top_k": limit,
            "mode": search_mode
        }
        return requests.post(
            f"{LIGHTRAG_URL}/search",
            json=body,
            headers=HEADERS,
            timeout=30
        )

    print("\n🔍 TESTING DIFFERENT SEARCH METHODS")
    print("=" * 50)
    for query in ("bee", "Bee", "classification", "image", "photo of a bee", "Entity: Bee"):
        print(f"\n🔍 Searching for: '{query}'")

        # Method 1: Standard search
        try:
            reply = _post_search(query, 10, "local")
            if reply.status_code != 200:
                print(f"❌ STANDARD SEARCH failed: {reply.status_code}")
            else:
                payload = reply.json()
                hits = payload.get('results')
                if not hits:
                    print("❌ STANDARD SEARCH: No results")
                else:
                    print(f"✅ STANDARD SEARCH: Found {len(hits)} results")
                    for hit in hits:
                        text = hit.get('content', '')
                        relevance = hit.get('score', 0)
                        print(f" - Score: {relevance:.4f}, Content: {text[:100]}...")
        except Exception as err:
            print(f"❌ STANDARD SEARCH error: {err}")

        # Method 2: Try with different modes
        for mode in ("local", "hybrid", "semantic"):
            label = mode.upper()
            try:
                reply = _post_search(query, 5, mode)
                if reply.status_code != 200:
                    print(f"{label} MODE failed: {reply.status_code}")
                    continue
                hits = reply.json().get('results')
                if hits:
                    print(f"{label} MODE: Found {len(hits)} results")
                else:
                    print(f"{label} MODE: No results")
            except Exception as err:
                print(f"{label} MODE error: {err}")
def check_processing_logs():
    """Query the server's ``/status`` endpoint and print what it returns.

    On HTTP 200 the decoded JSON body is printed; any other status code, or
    any exception, is reported with an error line. Nothing is returned.
    """
    print("\n📋 CHECKING PROCESSING STATUS")
    print("=" * 50)
    try:
        # Try to get processing status
        reply = requests.get(f"{LIGHTRAG_URL}/status", headers=HEADERS, timeout=10)
        if reply.status_code != 200:
            print(f"❌ Could not get status: {reply.status_code}")
        else:
            print(f"📊 Server status: {reply.json()}")
    except Exception as err:
        print(f"❌ Error checking status: {err}")
def reupload_test_document():
    """Upload ``test.docx`` from the current directory to the server.

    The file is posted as a multipart ``file`` field to
    ``/documents/upload`` with the Word document MIME type.

    Returns:
        bool: True if the server answered 200 and its JSON body could be
        decoded and printed; False if the file is missing, the status code
        differs, or any exception occurs.
    """
    print("\n🔄 RE-UPLOADING TEST DOCUMENT")
    print("=" * 50)

    test_file = "test.docx"
    if not os.path.exists(test_file):
        print(f"❌ Test file {test_file} not found")
        return False

    docx_mime = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
    try:
        # The request must be sent while the file handle is still open.
        with open(test_file, 'rb') as handle:
            reply = requests.post(
                f"{LIGHTRAG_URL}/documents/upload",
                files={'file': (test_file, handle, docx_mime)},
                headers=HEADERS,
                timeout=30
            )

        uploaded = reply.status_code == 200
        if uploaded:
            print("✅ Document re-uploaded successfully")
            print(f" Upload result: {reply.json()}")
        else:
            print(f"❌ Upload failed: {reply.status_code} - {reply.text}")
        return uploaded
    except Exception as err:
        print(f"❌ Upload error: {err}")
        return False
def main():
    """Run every diagnostic step in order and print a closing summary.

    Steps: check the server, list documents, inspect the first document
    whose filename contains ``test.docx``, exercise the search endpoint,
    query the status endpoint, and re-upload the test document when no bee
    classification was found. Returns early (with None) if the server check
    fails.
    """
    banner = (
        "🔧 WEB UI BEE CLASSIFICATION DIAGNOSTIC",
        "=" * 60,
        "This script investigates why the Web UI doesn't detect",
        "bee classification while test.py can.",
        "",
    )
    for line in banner:
        print(line)

    # Step 1: Check server status
    if not check_server_status():
        print("❌ Cannot proceed - server not available")
        return

    # Step 2: List documents
    documents = list_documents()

    # Step 3: Check if test.docx exists and its content
    test_doc_id = next(
        (
            doc.get('id')
            for doc in documents
            if 'test.docx' in doc.get('filename', '').lower()
        ),
        None,
    )
    if not test_doc_id:
        print("❌ test.docx not found in documents")
        bee_in_content = False
    else:
        bee_in_content = check_document_content(test_doc_id)

    # Step 4: Test different search methods
    test_search_methods()

    # Step 5: Check processing logs
    check_processing_logs()

    # Step 6: If bee not found, re-upload the document
    if not bee_in_content:
        print("\n🔄 Bee classification not found in current document")
        print(" Attempting to re-upload with enhanced processor...")
        reupload_test_document()

    header = ("\n" + "=" * 60, "📊 DIAGNOSTIC SUMMARY", "=" * 60)
    if bee_in_content:
        findings = (
            "✅ Bee classification is present in document content",
            "❌ But Web UI search is not finding it",
            "\n💡 Possible issues:",
            " - Web UI might be using different search parameters",
            " - Entity extraction might be filtering out classification text",
            " - Search indexing might need to be refreshed",
        )
    else:
        findings = (
            "❌ Bee classification is NOT present in document content",
            "\n💡 Possible issues:",
            " - Document was processed before enhanced processor was active",
            " - Image classification is not running properly",
            " - Enhanced entity extraction is not working",
        )
    actions = (
        "\n🔧 Recommended actions:",
        " 1. Check server logs for document processing details",
        " 2. Verify the enhanced document processor is being used",
        " 3. Try re-uploading test.docx",
        " 4. Check if OpenCLIP classifier is available and working",
    )
    for line in header + findings + actions:
        print(line)


if __name__ == "__main__":
    main()