292 lines
10 KiB
Python
292 lines
10 KiB
Python
"""
|
|
Diagnostic Script for Web UI Bee Classification Issue

Investigates why the Web UI doesn't detect the bee classification while test.py can.
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import requests
|
|
import json
|
|
from pathlib import Path
|
|
|
|
# Configuration
# Each value can be overridden through an environment variable so the script
# can be pointed at another server or key without editing the file. The
# defaults are the values that were previously hard-coded here.
# A trailing slash is stripped because every request appends "/<path>".
LIGHTRAG_URL = os.environ.get("LIGHTRAG_URL", "http://localhost:3015").rstrip("/")
# NOTE(review): the default key is a credential committed to source; prefer
# supplying LIGHTRAG_API_KEY from the environment.
API_KEY = os.environ.get("LIGHTRAG_API_KEY", "jleu1212")
# Header dictionary passed to every request made by this script.
HEADERS = {"X-API-Key": API_KEY}
|
|
|
|
def check_server_status():
    """Probe the server root URL and report whether it answered with HTTP 200.

    Returns:
        True when the GET request on the root URL comes back with status
        200; False for any other status code or when an exception is raised.
    """
    print("🔍 CHECKING SERVER STATUS")
    print("=" * 50)

    try:
        reply = requests.get(f"{LIGHTRAG_URL}/", headers=HEADERS, timeout=10)
        is_up = reply.status_code == 200
        if is_up:
            print("✅ Server is running on port 3015")
        else:
            print(f"❌ Server status: {reply.status_code}")
        return is_up
    except Exception as exc:
        # Any failure is reported the same way: not reachable.
        print(f"❌ Server not reachable: {exc}")
        return False
|
|
|
|
def list_documents():
    """Fetch the document listing from the server and print one entry per item.

    Each entry's filename, status and id are printed; entries whose filename
    contains "test.docx" (case-insensitive) are flagged as the test document.

    Returns:
        The decoded JSON body of the /documents response on HTTP 200, or an
        empty list when the status differs or an exception is raised.
    """
    print("\n📄 LISTING DOCUMENTS IN SYSTEM")
    print("=" * 50)

    try:
        reply = requests.get(f"{LIGHTRAG_URL}/documents", headers=HEADERS, timeout=10)
        if reply.status_code != 200:
            print(f"❌ Failed to get documents: {reply.status_code}")
            return []

        docs = reply.json()
        print(f"📊 Found {len(docs)} documents:")

        for entry in docs:
            print(f" - {entry.get('filename', 'Unknown')}")
            print(f" Status: {entry.get('status', 'Unknown')}")
            print(f" ID: {entry.get('id', 'Unknown')}")

            # Flag the document this diagnostic is interested in.
            is_test_doc = 'test.docx' in entry.get('filename', '').lower()
            if is_test_doc:
                print(" 🎯 THIS IS THE TEST DOCUMENT")

        return docs
    except Exception as exc:
        print(f"❌ Error listing documents: {exc}")
        return []
|
|
|
|
def check_document_content(doc_id):
    """Fetch a document's details and chunks and look for bee keywords.

    Prints the document's filename, status and metadata, then requests its
    chunks. A preview of the first five chunks is printed, but every chunk
    is searched for the keywords, so a match beyond the preview window is
    not reported as missing.

    Args:
        doc_id: Identifier interpolated into the /documents/{doc_id} URLs.

    Returns:
        True if any chunk's content contains "bee" or "classification"
        (case-insensitive). False otherwise, including when either request
        returns a non-200 status or an exception is raised.
    """
    print(f"\n🔍 CHECKING DOCUMENT CONTENT FOR ID: {doc_id}")
    print("=" * 50)

    preview_limit = 5  # only this many chunks are echoed to the console
    keywords = ("bee", "classification")

    try:
        # Try to get document details
        response = requests.get(f"{LIGHTRAG_URL}/documents/{doc_id}", headers=HEADERS, timeout=10)
        if response.status_code != 200:
            print(f"❌ Could not get document details: {response.status_code}")
            return False

        doc_detail = response.json()
        print("📋 Document details:")
        print(f" - Filename: {doc_detail.get('filename')}")
        print(f" - Status: {doc_detail.get('status')}")
        print(f" - Metadata: {doc_detail.get('metadata', {})}")

        # Try to get chunks to see actual content
        chunks_response = requests.get(f"{LIGHTRAG_URL}/documents/{doc_id}/chunks", headers=HEADERS, timeout=10)
        if chunks_response.status_code != 200:
            print(f"❌ Could not get chunks: {chunks_response.status_code}")
            return False

        chunks = chunks_response.json()
        print(f"📝 Found {len(chunks)} chunks:")

        bee_found = False
        for number, chunk in enumerate(chunks, start=1):
            content = chunk.get('content', '')
            if number <= preview_limit:
                print(f" Chunk {number}: {content[:200]}...")

            # Keyword check runs on every chunk, not just the previewed ones.
            lowered = content.lower()
            if any(keyword in lowered for keyword in keywords):
                bee_found = True
                print(f" 🎯 BEE CLASSIFICATION FOUND IN CHUNK {number}")

        if not bee_found:
            print(" ❌ No bee classification found in chunks")

        return bee_found

    except Exception as e:
        print(f"❌ Error checking document content: {e}")

    return False
|
|
|
|
def _post_search(query, mode, top_k):
    """Send one query to the /search endpoint and return the raw response."""
    payload = {
        "query": query,
        "top_k": top_k,
        "mode": mode,
    }
    return requests.post(
        f"{LIGHTRAG_URL}/search",
        json=payload,
        headers=HEADERS,
        timeout=30,
    )


def test_search_methods():
    """Run a fixed set of queries against /search and print what comes back.

    For every query a "standard" search (mode "local", top_k 10) is issued
    and each result's score and first 100 characters of content are printed.
    The same query is then repeated with top_k 5 in the modes "local",
    "hybrid" and "semantic", printing only the number of results.

    Each request has its own try/except, so a failing request is reported
    and the remaining searches still run. Nothing is returned.
    """
    print("\n🔍 TESTING DIFFERENT SEARCH METHODS")
    print("=" * 50)

    search_queries = ["bee", "Bee", "classification", "image", "photo of a bee", "Entity: Bee"]

    for query in search_queries:
        print(f"\n🔍 Searching for: '{query}'")

        # Method 1: Standard search
        try:
            response = _post_search(query, "local", 10)
            if response.status_code == 200:
                hits = response.json().get('results')
                if hits:
                    print(f"✅ STANDARD SEARCH: Found {len(hits)} results")
                    for hit in hits:
                        content = hit.get('content', '')
                        score = hit.get('score', 0)
                        print(f" - Score: {score:.4f}, Content: {content[:100]}...")
                else:
                    print("❌ STANDARD SEARCH: No results")
            else:
                print(f"❌ STANDARD SEARCH failed: {response.status_code}")
        except Exception as e:
            print(f"❌ STANDARD SEARCH error: {e}")

        # Method 2: Try with different modes
        for mode in ["local", "hybrid", "semantic"]:
            try:
                response = _post_search(query, mode, 5)
                if response.status_code == 200:
                    hits = response.json().get('results')
                    if hits:
                        print(f"✅ {mode.upper()} MODE: Found {len(hits)} results")
                    else:
                        print(f"❌ {mode.upper()} MODE: No results")
                else:
                    print(f"❌ {mode.upper()} MODE failed: {response.status_code}")
            except Exception as e:
                print(f"❌ {mode.upper()} MODE error: {e}")
|
|
|
|
def check_processing_logs():
    """Query the server's /status endpoint and print the decoded reply.

    On HTTP 200 the JSON body is printed; any other status code or an
    exception is reported on the console instead. Nothing is returned.
    """
    print("\n📋 CHECKING PROCESSING STATUS")
    print("=" * 50)

    try:
        # Try to get processing status
        reply = requests.get(f"{LIGHTRAG_URL}/status", headers=HEADERS, timeout=10)
        if reply.status_code != 200:
            print(f"❌ Could not get status: {reply.status_code}")
        else:
            server_state = reply.json()
            print(f"📊 Server status: {server_state}")
    except Exception as exc:
        print(f"❌ Error checking status: {exc}")
|
|
|
|
def reupload_test_document(test_file="test.docx"):
    """Upload a .docx file to the server's /documents/upload endpoint.

    Args:
        test_file: Path of the document to upload. Defaults to "test.docx"
            relative to the current working directory, which was the
            previously hard-coded value.

    Returns:
        True when the upload request returns HTTP 200. False when the file
        does not exist, the server answers with another status code, or an
        exception is raised while reading or sending the file.
    """
    print("\n🔄 RE-UPLOADING TEST DOCUMENT")
    print("=" * 50)

    if not os.path.exists(test_file):
        print(f"❌ Test file {test_file} not found")
        return False

    docx_mime = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
    # Send only the file name, not any directory part of the local path.
    upload_name = os.path.basename(test_file)

    try:
        with open(test_file, 'rb') as f:
            files = {'file': (upload_name, f, docx_mime)}
            response = requests.post(
                f"{LIGHTRAG_URL}/documents/upload",
                files=files,
                headers=HEADERS,
                timeout=30,
            )

        if response.status_code != 200:
            print(f"❌ Upload failed: {response.status_code} - {response.text}")
            return False

        print("✅ Document re-uploaded successfully")
        try:
            result = response.json()
        except ValueError:
            # A 200 reply with a non-JSON body is still a successful upload;
            # show the raw text rather than reporting an upload error.
            result = response.text
        print(f" Upload result: {result}")
        return True

    except Exception as e:
        print(f"❌ Upload error: {e}")
        return False
|
|
|
|
def main():
    """Run every diagnostic step in order and print a summary.

    Steps: check the server, list documents, inspect the content of the
    first document whose filename contains "test.docx", run the search
    tests, query the status endpoint, and re-upload the test document when
    no bee keyword was found. Stops early if the server is not available.
    """
    rule = "=" * 60

    print("🔧 WEB UI BEE CLASSIFICATION DIAGNOSTIC")
    print(rule)
    print("This script investigates why the Web UI doesn't detect")
    print("bee classification while test.py can.")
    print()

    # Step 1: Check server status
    if not check_server_status():
        print("❌ Cannot proceed - server not available")
        return

    # Step 2: List documents
    documents = list_documents()

    # Step 3: Check if test.docx exists and its content
    test_doc_id = next(
        (
            entry.get('id')
            for entry in documents
            if 'test.docx' in entry.get('filename', '').lower()
        ),
        None,
    )

    if test_doc_id:
        bee_in_content = check_document_content(test_doc_id)
    else:
        print("❌ test.docx not found in documents")
        bee_in_content = False

    # Step 4: Test different search methods
    test_search_methods()

    # Step 5: Check processing logs
    check_processing_logs()

    # Step 6: If bee not found, re-upload the document
    if not bee_in_content:
        print("\n🔄 Bee classification not found in current document")
        print(" Attempting to re-upload with enhanced processor...")
        reupload_test_document()

    print("\n" + rule)
    print("📊 DIAGNOSTIC SUMMARY")
    print(rule)

    if bee_in_content:
        summary_lines = (
            "✅ Bee classification is present in document content",
            "❌ But Web UI search is not finding it",
            "\n💡 Possible issues:",
            " - Web UI might be using different search parameters",
            " - Entity extraction might be filtering out classification text",
            " - Search indexing might need to be refreshed",
        )
    else:
        summary_lines = (
            "❌ Bee classification is NOT present in document content",
            "\n💡 Possible issues:",
            " - Document was processed before enhanced processor was active",
            " - Image classification is not running properly",
            " - Enhanced entity extraction is not working",
        )
    for line in summary_lines:
        print(line)

    recommended_actions = (
        "\n🔧 Recommended actions:",
        " 1. Check server logs for document processing details",
        " 2. Verify the enhanced document processor is being used",
        " 3. Try re-uploading test.docx",
        " 4. Check if OpenCLIP classifier is available and working",
    )
    for line in recommended_actions:
        print(line)
|
|
|
|
# Run the diagnostic only when this file is executed as a script.
if __name__ == "__main__":
    main()