# Source file: railseek6/test_enhanced_processor.py
#
# Repository viewer listing: 204 lines, 7.1 KiB, Python

"""
Test Enhanced Document Processor with Bee Classification
Uploads test.docx and verifies bee classification is searchable
"""
import os
import re
import time

import requests
# Configuration
# Both values can be overridden through the environment so the script can be
# pointed at another server or credential without editing the source.  The
# fallbacks are the values the script has always used.
# A trailing slash is stripped because every request appends "/<endpoint>".
LIGHTRAG_URL = os.environ.get("LIGHTRAG_URL", "http://localhost:3015").rstrip("/")
API_KEY = os.environ.get("LIGHTRAG_API_KEY", "jleu1212")
# Header dict passed as ``headers=`` on every request made by this script.
HEADERS = {"X-API-Key": API_KEY}
def clear_existing_documents():
    """Delete every document the server lists so the test starts clean.

    Fetches ``GET /documents`` and issues one ``DELETE /documents/{id}`` for
    each listed entry that carries an ``id``.  This is a best-effort step:
    every problem is printed and swallowed so the rest of the run can
    continue.

    Returns:
        None.
    """
    print("🗑️ CLEARING EXISTING DOCUMENTS...")
    try:
        response = requests.get(f"{LIGHTRAG_URL}/documents", headers=HEADERS, timeout=10)
        if response.status_code != 200:
            print("❌ Could not get documents list")
            return

        documents = response.json()
        if not isinstance(documents, list):
            # Nothing can be deleted from a payload that is not a list, so
            # do not claim that the store was cleared.
            print(f"⚠️ Unexpected documents payload ({type(documents).__name__}); nothing deleted")
            return

        failed = 0
        for doc in documents:
            # Skip entries that are not dicts or have no id instead of
            # letting one malformed entry abort the whole sweep.
            if not isinstance(doc, dict) or 'id' not in doc:
                continue
            delete_response = requests.delete(
                f"{LIGHTRAG_URL}/documents/{doc['id']}",
                headers=HEADERS,
                timeout=10,
            )
            if delete_response.status_code == 200:
                print(f"✅ Deleted document: {doc.get('filename', 'Unknown')}")
            else:
                failed += 1
                print(f"❌ Failed to delete document: {delete_response.status_code}")

        # Only report success when every attempted deletion succeeded.
        if failed:
            print(f"⚠️ {failed} document(s) could not be deleted")
        else:
            print("✅ All documents cleared")
    except Exception as e:
        # Deliberately broad: clearing is optional preparation, not the test.
        print(f"❌ Error clearing documents: {e}")
def upload_test_document(test_file="test.docx"):
    """Upload a .docx file to the server's ``/documents/upload`` endpoint.

    Args:
        test_file: Path of the document to upload.  Defaults to
            ``"test.docx"`` in the current working directory, the file this
            script has always used.

    Returns:
        True when the server answers HTTP 200.  False when the file does not
        exist, the server answers with any other status, or the request
        raises.
    """
    print("📤 UPLOADING TEST DOCUMENT WITH ENHANCED PROCESSOR...")
    if not os.path.exists(test_file):
        print(f"❌ Test file {test_file} not found")
        return False

    docx_mime = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
    try:
        with open(test_file, 'rb') as f:
            response = requests.post(
                f"{LIGHTRAG_URL}/documents/upload",
                # Send only the base name so a path argument does not leak
                # directory components into the stored filename.
                files={'file': (os.path.basename(test_file), f, docx_mime)},
                headers=HEADERS,
                timeout=60,  # Longer timeout for processing
            )
    except Exception as e:
        print(f"❌ Upload error: {e}")
        return False

    if response.status_code != 200:
        print(f"❌ Upload failed: {response.status_code} - {response.text}")
        return False

    print("✅ Document uploaded successfully")
    try:
        result = response.json()
    except ValueError:
        # A 200 answer with a non-JSON body is still a successful upload;
        # show the raw text instead of reporting a failure.
        result = response.text
    print(f" Upload result: {result}")
    return True
def wait_for_processing(filename="test.docx", max_attempts=20, poll_interval=6):
    """Poll ``GET /documents`` until the uploaded document is processed.

    Args:
        filename: Name to look for; matched case-insensitively as a substring
            of each listed entry's ``filename``.  Defaults to ``"test.docx"``.
        max_attempts: Maximum number of polls.  Defaults to 20.
        poll_interval: Seconds to sleep between polls.  Defaults to 6, which
            with 20 attempts gives roughly two minutes in total.

    Returns:
        True as soon as a matching entry reports status ``'processed'``;
        False when all attempts are used up first.
    """
    print("⏳ WAITING FOR DOCUMENT PROCESSING...")
    needle = filename.lower()
    for attempt in range(max_attempts):
        try:
            response = requests.get(f"{LIGHTRAG_URL}/documents", headers=HEADERS, timeout=10)
            if response.status_code == 200:
                documents = response.json()
                if isinstance(documents, list):
                    for doc in documents:
                        # ``or ''`` guards against an explicit null filename,
                        # which would otherwise abort this poll's scan.
                        if needle not in str(doc.get('filename') or '').lower():
                            continue
                        status = doc.get('status', 'unknown')
                        print(f"📄 Document status: {status}")
                        if status == 'processed':
                            print("✅ Document processing completed")
                            return True
        except Exception as e:
            # A failed poll is not fatal; report it and try again.
            print(f"⚠️ Status check error: {e}")
        # Sleeping after the final poll would only delay the timeout report.
        if attempt < max_attempts - 1:
            time.sleep(poll_interval)
    print("❌ Timeout waiting for processing")
    return False
def test_bee_search():
    """Run a set of searches and report whether bee-related content comes back.

    Each query is posted to ``/search`` in hybrid mode with ``top_k`` 10.  A
    result counts as a hit when its lower-cased content contains the word
    "bee"/"bees" or the substring "classification".  Errors for one query are
    printed and do not stop the remaining queries.

    Returns:
        True if at least one result of any query is a hit, otherwise False.
    """
    print("🔍 TESTING BEE SEARCH...")
    search_queries = [
        "bee",
        "Bee",
        "classification",
        "photo of a bee",
        "Entity: Bee",
        "insect",
        "animal",
    ]
    # A plain substring test would also accept "been", "beer", "beef", ...
    # and turn unrelated text into a false PASS.  Require that "bee"/"bees"
    # is not followed by another letter (compounds such as "honeybee" still
    # match).  Content is lower-cased before matching, so [a-z] is enough.
    bee_word = re.compile(r"bees?(?![a-z])")
    bee_found = False
    for query in search_queries:
        try:
            search_payload = {
                "query": query,
                "top_k": 10,
                "mode": "hybrid",  # Use hybrid mode which worked in diagnostics
            }
            response = requests.post(
                f"{LIGHTRAG_URL}/search",
                json=search_payload,
                headers=HEADERS,
                timeout=30,
            )
            if response.status_code != 200:
                print(f"'{query}' search failed: {response.status_code}")
                continue

            hits = response.json().get('results')
            if not hits:
                print(f"'{query}': No results")
                continue

            print(f"'{query}': Found {len(hits)} results")
            # Check if any result contains bee-related content
            for hit in hits:
                # ``or`` fallbacks keep a null content/score from raising and
                # skipping the rest of this query's results.
                content = str(hit.get('content') or '').lower()
                score = hit.get('score') or 0
                if bee_word.search(content) or 'classification' in content:
                    print(f"🎯 BEE FOUND: Score {score:.4f}")
                    print(f" Content: {content[:200]}...")
                    bee_found = True
        except Exception as e:
            # Deliberately broad: one failing query must not end the run.
            print(f"'{query}' search error: {e}")
    return bee_found
def main():
    """Run the full scenario: clear, upload, wait, search, then report.

    Returns:
        False if the upload fails (later steps are skipped); otherwise True
        when the bee search found a hit and False when it did not.  A
        processing timeout is reported but does not stop the search step.
    """
    rule = "=" * 60
    print("🧪 TESTING ENHANCED DOCUMENT PROCESSOR")
    print(rule)

    # Step 1: start from an empty document store
    clear_existing_documents()

    # Step 2: upload; nothing else makes sense if this fails
    uploaded = upload_test_document()
    if not uploaded:
        print("❌ Document upload failed")
        return False

    # Step 3: give the server time to process, but search regardless
    processed = wait_for_processing()
    if not processed:
        print("⚠️ Processing timeout, but continuing with search...")

    # Step 4: the actual check
    searchable = test_bee_search()

    print("\n" + rule)
    print("📊 TEST RESULTS")
    print(rule)

    if searchable:
        summary = (
            "🎉 SUCCESS: Bee classification is now searchable!",
            " The enhanced document processor is working correctly.",
            " The Web UI should now detect bee classification.",
        )
        verdict = "\n✅ TEST PASSED: Web UI should now detect bee classification"
    else:
        summary = (
            "❌ ISSUE: Bee classification still not searchable",
            " There may be an issue with the enhanced processor",
            " or the image classification is not running.",
        )
        verdict = "\n❌ TEST FAILED: Further investigation needed"

    next_steps = (
        "\n💡 Next steps:",
        " 1. Open the Web UI at http://localhost:3015/webui",
        " 2. Search for 'bee' to verify classification appears",
    )

    # Same output order as before: outcome summary, next steps, verdict.
    for line in summary + next_steps + (verdict,):
        print(line)
    return bool(searchable)
if __name__ == "__main__":
    # Run the scenario and expose the outcome as the process exit status
    # (0 = passed, 1 = failed) so shells and CI jobs can react to a failure.
    # The script used to exit 0 unconditionally.
    success = main()
    if success:
        print("\n🎉 The enhanced document processor is working correctly!")
    else:
        print("\n⚠️ The enhanced document processor needs investigation.")
    raise SystemExit(0 if success else 1)