Files
railseek6/test_final_ocr_upload.py

157 lines
5.6 KiB
Python

#!/usr/bin/env python3
"""
Final test script to upload ocr.pdf and verify upload, indexing, and search functionality
"""
import requests
import json
import time
import sys
def _check_health(base_url, timeout):
    """Step 1: return True when GET ``{base_url}/health`` answers HTTP 200."""
    print("\n1. Checking server health...")
    try:
        response = requests.get(f"{base_url}/health", timeout=timeout)
        if response.status_code != 200:
            print(f" ❌ Server health check failed: {response.status_code}")
            return False
        print(" ✅ Server is healthy")
        print(f" Status: {response.json().get('status', 'unknown')}")
        return True
    except Exception as e:
        # Reporting boundary: connection errors, timeouts and a non-JSON body
        # all count as a failed health check rather than crashing the script.
        print(f" ❌ Server health check error: {e}")
        return False


def _upload_pdf(base_url, api_key, pdf_path, timeout):
    """Step 2: POST the PDF to ``/api/upload``; return True on HTTP 200."""
    # Function-scope import so this block does not depend on a file-level
    # ``import os``.
    import os

    file_name = os.path.basename(pdf_path)
    print(f"\n2. Uploading {file_name}...")
    try:
        # The file must stay open while requests streams the multipart body.
        with open(pdf_path, "rb") as f:
            response = requests.post(
                f"{base_url}/api/upload",
                files={"file": (file_name, f, "application/pdf")},
                headers={"X-API-Key": api_key},
                timeout=timeout,
            )
        if response.status_code != 200:
            print(f" ❌ Upload failed: {response.status_code}")
            print(f" Response: {response.text}")
            return False
        print(" ✅ File uploaded successfully")
        print(f" Response: {response.json()}")
        return True
    except Exception as e:
        # Covers a missing/unreadable file as well as HTTP/JSON errors.
        print(f" ❌ Upload error: {e}")
        return False


def _wait_for_indexing(base_url, api_key, max_wait_time, wait_interval, timeout):
    """Step 3: poll ``/api/documents`` until a terminal status or the deadline.

    Returns "PROCESSED" or "FAILED" when that status is observed, or None when
    ``max_wait_time`` seconds pass without seeing either.
    """
    print("\n3. Waiting for indexing to complete...")
    headers = {"X-API-Key": api_key}
    # A monotonic clock measures real elapsed time, so slow status requests
    # count against the budget (not just the sleeps between them).
    started = time.monotonic()
    deadline = started + max_wait_time
    while time.monotonic() < deadline:
        try:
            response = requests.get(
                f"{base_url}/api/documents", headers=headers, timeout=timeout
            )
            if response.status_code == 200:
                documents = response.json()
                if documents:
                    # NOTE(review): only the first listed document is
                    # inspected; this assumes it is the file just uploaded --
                    # confirm against the server's ordering/response schema.
                    doc_status = documents[0].get("status", "unknown")
                    elapsed = int(time.monotonic() - started)
                    print(f" Document status: {doc_status} (elapsed: {elapsed}s)")
                    if doc_status == "PROCESSED":
                        print(" ✅ Indexing completed successfully")
                        return doc_status
                    if doc_status == "FAILED":
                        print(" ❌ Indexing failed")
                        return doc_status
                else:
                    print(" No documents found, waiting...")
            else:
                print(f" Status check failed: {response.status_code}")
        except Exception as e:
            # A transient error on one poll should not abort the whole wait.
            print(f" Status check error: {e}")
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        # Never sleep past the deadline (and never pass a negative duration).
        time.sleep(max(0, min(wait_interval, remaining)))
    print(" ⚠️ Indexing timeout - proceeding with search test")
    return None


def _search_until_hit(base_url, api_key, queries, timeout):
    """Step 4: POST each query to ``/api/search``; True on the first non-empty result."""
    print("\n4. Testing search functionality...")
    headers = {"X-API-Key": api_key, "Content-Type": "application/json"}
    for query in queries:
        try:
            response = requests.post(
                f"{base_url}/api/search",
                json={"query": query, "top_k": 5},
                headers=headers,
                timeout=timeout,
            )
            if response.status_code != 200:
                print(f" ❌ Search failed for '{query}': {response.status_code}")
                continue
            results = response.json()
        except Exception as e:
            print(f" ❌ Search error for '{query}': {e}")
            continue
        if not results:
            print(f" ⚠️ No results for '{query}'")
            continue
        print(f" ✅ Search successful for '{query}': Found {len(results)} results")
        # The preview is purely informational: a result with an unexpected
        # shape must not be reported as a search error after a real hit.
        try:
            content_preview = str(results[0].get("content", ""))[:100] + "..."
            print(f" First result preview: {content_preview}")
        except Exception as e:
            print(f" ⚠️ Could not build result preview: {e}")
        return True
    return False


def _check_webui(base_url, timeout):
    """Step 5: return True when GET ``{base_url}/webui/`` answers HTTP 200."""
    print("\n5. Testing web UI accessibility...")
    try:
        response = requests.get(f"{base_url}/webui/", timeout=timeout)
    except Exception as e:
        print(f" ❌ Web UI access error: {e}")
        return False
    if response.status_code == 200:
        print(" ✅ Web UI is accessible")
        return True
    print(f" ⚠️ Web UI returned status: {response.status_code}")
    return False


def test_ocr_pdf_workflow(
    base_url: str = "http://localhost:3015",
    api_key: str = "jleu1212",
    pdf_path: str = "ocr.pdf",
    max_wait_time: float = 120,
    wait_interval: float = 5,
    request_timeout: float = 120,
) -> bool:
    """Test complete OCR PDF workflow: upload, indexing, and search.

    Runs five steps against the server and prints progress for each: health
    check, PDF upload, waiting for indexing, search queries, web UI check.

    Args:
        base_url: Root URL of the server under test (trailing "/" is ignored).
        api_key: Value sent in the ``X-API-Key`` header.
        pdf_path: Path of the PDF file to upload.
        max_wait_time: Maximum seconds to wait for indexing to finish.
        wait_interval: Seconds to sleep between indexing status polls.
        request_timeout: Timeout in seconds passed to every HTTP request so an
            unresponsive server cannot hang the script indefinitely.

    Returns:
        False if the health check or upload fails, or if indexing reports
        "FAILED". Otherwise True exactly when at least one search query
        returned results. An indexing timeout or an inaccessible web UI is
        reported in the summary but does not change the result.
    """
    base_url = base_url.rstrip("/")
    print("=== Testing LightRAG OCR PDF Workflow ===")

    # Steps 1-2 are prerequisites: nothing else is meaningful without them.
    if not _check_health(base_url, request_timeout):
        return False
    if not _upload_pdf(base_url, api_key, pdf_path, request_timeout):
        return False

    indexing_status = _wait_for_indexing(
        base_url, api_key, max_wait_time, wait_interval, request_timeout
    )
    if indexing_status == "FAILED":
        return False

    test_queries = (
        "artificial intelligence",
        "machine learning",
        "neural networks",
        "data science",
    )
    search_success = _search_until_hit(
        base_url, api_key, test_queries, request_timeout
    )
    webui_ok = _check_webui(base_url, request_timeout)

    # Final summary: report what each step actually did instead of assuming
    # every step passed whenever search succeeded.
    indexing_mark = "✓" if indexing_status == "PROCESSED" else "⚠ (timed out)"
    search_mark = "✓" if search_success else "✗"
    webui_mark = "✓" if webui_ok else "⚠ (not accessible)"
    print("\n=== Test Summary ===")
    if search_success:
        print("✅ SUCCESS: OCR PDF workflow is working correctly!")
    else:
        print("❌ FAILURE: Some tests failed")
    print(" - File upload: ✓")
    print(f" - Indexing: {indexing_mark}")
    print(f" - Search: {search_mark}")
    print(f" - Web UI: {webui_mark}")
    return search_success
if __name__ == "__main__":
    # Script entry point: run the workflow and hand its outcome to the shell
    # as the process exit status (0 when it returned a truthy value, else 1).
    workflow_passed = test_ocr_pdf_workflow()
    sys.exit(int(not workflow_passed))