276 lines
10 KiB
Python
276 lines
10 KiB
Python
#!/usr/bin/env python3
"""
Direct OCR PDF upload, indexing, and search test.

This test bypasses authentication issues and tests the core OCR functionality.
"""
|
|
|
|
import requests
|
|
import time
|
|
import json
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# Configuration
# Root URL of the server under test; endpoint paths are appended to it.
BASE_URL = "http://localhost:3015"
# Path of the PDF to upload; a relative path resolves against the current
# working directory.
OCR_PDF_PATH = "ocr.pdf"
|
|
|
|
class OCRWorkflowTester:
    """Drive an unauthenticated upload -> processing -> search smoke test.

    Each check prints a human-readable status line and reports its outcome
    through its return value instead of raising, so ``run_complete_test``
    can always print a full summary.

    Attributes:
        base_url: Root URL of the server under test.
        pdf_path: Path of the PDF that gets uploaded.
        timeout: Timeout in seconds passed to every HTTP request.
        session: ``requests.Session`` shared by all requests.
    """

    # Seconds to sleep between two /health polls while waiting for processing.
    POLL_INTERVAL = 1.0
    # Minimum seconds between two progress lines while waiting for processing.
    REPORT_INTERVAL = 10.0
    # Queries tried in order; the search step stops at the first one with content.
    SEARCH_QUERIES = (
        "LightRAG",
        "OCR technology",
        "document processing",
        "text extraction",
        "Retrieval-Augmented Generation",
    )

    def __init__(self, base_url=None, pdf_path=None, timeout=60.0):
        """Create a tester.

        Args:
            base_url: Server root URL; defaults to the module-level ``BASE_URL``.
            pdf_path: PDF to upload; defaults to the module-level ``OCR_PDF_PATH``.
            timeout: Timeout in seconds for each HTTP request, so a stalled
                server cannot hang the test indefinitely.
        """
        # Resolved at call time (not as default arguments) so the defaults
        # always follow the current module constants.
        self.base_url = BASE_URL if base_url is None else base_url
        self.pdf_path = OCR_PDF_PATH if pdf_path is None else pdf_path
        self.timeout = timeout
        self.session = requests.Session()

    @staticmethod
    def _print_response_body(response, json_label="Error"):
        """Print a response body: parsed JSON when it decodes, raw text otherwise."""
        try:
            body = response.json()
        except ValueError:
            # JSON decoding failures are ValueError subclasses; anything else
            # (e.g. KeyboardInterrupt) must not be swallowed here.
            print(f" Response: {response.text}")
        else:
            print(f" {json_label}: {body}")

    def check_server_status(self):
        """Check if server is accessible.

        Returns:
            True when ``GET /`` answers with status 200 or 307, False on any
            other status or when the request fails.
        """
        print("🔍 Checking server accessibility...")
        try:
            response = self.session.get(f"{self.base_url}/", timeout=self.timeout)
        except Exception as e:
            print(f"❌ Cannot connect to server: {e}")
            return False

        if response.status_code in (200, 307):  # 307 for redirect
            print("✅ Server is accessible")
            return True

        print(f"❌ Server returned status: {response.status_code}")
        return False

    def check_health_no_auth(self):
        """Try to check health without authentication.

        Returns:
            The parsed ``/health`` JSON payload on status 200, otherwise None
            (non-200 status, undecodable body, or request failure).
        """
        print("\n🏥 Checking server health (no auth)...")
        try:
            response = self.session.get(
                f"{self.base_url}/health", timeout=self.timeout
            )
            if response.status_code == 200:
                health_data = response.json()
                print("✅ Server is healthy")
                print(f" Status: {health_data.get('status', 'unknown')}")
                print(f" Auth Mode: {health_data.get('auth_mode', 'unknown')}")
                return health_data

            print(f"⚠️ Health check returned: {response.status_code}")
            # Show whatever the server sent back; it may still be informative.
            self._print_response_body(response, json_label="Response")
            return None
        except Exception as e:
            print(f"❌ Health check error: {e}")
            return None

    def upload_ocr_pdf_direct(self):
        """Try to upload the OCR PDF without authentication.

        Returns:
            True when ``POST /documents/upload`` answers with status 200,
            False when the file is missing, the status differs, or the
            request fails.
        """
        print(f"\n📤 Attempting to upload OCR PDF: {self.pdf_path}")

        pdf = Path(self.pdf_path)
        if not pdf.is_file():
            print(f"❌ OCR PDF file not found: {self.pdf_path}")
            return False

        try:
            with pdf.open('rb') as file:
                # Send only the basename so a directory prefix in pdf_path
                # does not leak into the uploaded file name.
                files = {'file': (pdf.name, file, 'application/pdf')}
                response = self.session.post(
                    f"{self.base_url}/documents/upload",
                    files=files,
                    timeout=self.timeout,
                )

            if response.status_code == 200:
                result = response.json()
                print("✅ Upload successful")
                print(f" Status: {result.get('status', 'unknown')}")
                print(f" Message: {result.get('message', 'No message')}")
                return True

            print(f"❌ Upload failed: {response.status_code}")
            self._print_response_body(response)
            return False
        except Exception as e:
            print(f"❌ Upload error: {e}")
            return False

    def monitor_processing(self, max_wait=120):
        """Poll ``/health`` until the pipeline is idle or ``max_wait`` elapses.

        Args:
            max_wait: Maximum wall-clock seconds to wait.

        Returns:
            True as soon as a 200 health response does not report
            ``pipeline_busy``; False on timeout or on a request/decoding error.
        """
        print(f"\n⏳ Monitoring processing (max {max_wait}s)...")

        # Measure real elapsed time: counting loop iterations would overrun
        # max_wait by the latency of every health request.
        start = time.monotonic()
        elapsed = 0.0
        next_report = 0.0
        while elapsed < max_wait:
            try:
                # Try to get health status to check pipeline
                response = self.session.get(
                    f"{self.base_url}/health", timeout=self.timeout
                )
                if response.status_code == 200:
                    if not response.json().get('pipeline_busy', False):
                        print("✅ Processing appears complete")
                        return True
                    progress = f" Still processing... ({int(elapsed)}s)"
                else:
                    # Keep polling, but make the reason for the wait visible.
                    progress = (
                        f" ⚠️ Health check returned: {response.status_code}"
                        f" ({int(elapsed)}s)"
                    )
            except Exception as e:
                print(f"❌ Error monitoring processing: {e}")
                return False

            if elapsed >= next_report:
                print(progress)
                next_report = elapsed + self.REPORT_INTERVAL

            remaining = max_wait - (time.monotonic() - start)
            time.sleep(max(0.0, min(self.POLL_INTERVAL, remaining)))
            elapsed = time.monotonic() - start

        print("⚠️ Processing timeout reached")
        return False

    @staticmethod
    def _report_search_results(results):
        """Print a preview of one query result; return True if it has content."""
        if not isinstance(results, dict):
            print(" ⚠️ No results found")
            return False

        chunks = results.get('chunks')
        if chunks:
            print(f" ✅ Found {len(chunks)} results")

            # Show first result preview
            first_chunk = chunks[0]
            if isinstance(first_chunk, dict):
                text = first_chunk.get('text') or ''
                score = first_chunk.get('score', 0)
            else:
                text, score = first_chunk, 0
            content = str(text)[:150] + "..."
            # A missing or non-numeric score must not turn a hit into an error.
            is_number = isinstance(score, (int, float))
            score_text = f"{score:.3f}" if is_number else str(score)
            print(f" 📄 Preview: {content}")
            print(f" 📊 Score: {score_text}")
            return True

        if 'response' in results:
            # Check if LLM responded with content
            response_text = results['response']
            if response_text is not None and "[no-context]" not in str(response_text):
                print(" ✅ LLM generated response")
                print(f" 🤖 Response: {str(response_text)[:150]}...")
                return True
            print(" ⚠️ No context found for query")
            return False

        print(" ⚠️ No results found")
        return False

    def test_search_without_auth(self):
        """Test search functionality without authentication.

        Posts each query in ``SEARCH_QUERIES`` to ``/query``.

        Returns:
            True at the first query whose response carries chunks or a
            response text without the ``[no-context]`` marker; False if no
            query does.
        """
        print("\n🔍 Testing search without authentication...")

        headers = {"Content-Type": "application/json"}

        for query in self.SEARCH_QUERIES:
            print(f"\n Testing query: '{query}'")
            payload = {
                "query": query,
                "top_k": 5,
                "only_need_context": True,
            }
            try:
                response = self.session.post(
                    f"{self.base_url}/query",
                    json=payload,
                    headers=headers,
                    timeout=self.timeout,
                )
                if response.status_code != 200:
                    print(f" ❌ Search failed: {response.status_code}")
                    self._print_response_body(response)
                    continue

                if self._report_search_results(response.json()):
                    return True  # At least one successful search
            except Exception as e:
                # One failing query must not abort the remaining ones.
                print(f" ❌ Search error: {e}")

        return False

    def check_webui_access(self):
        """Check if Web UI is accessible.

        Returns:
            True when ``GET /webui/`` answers with status 200, else False.
        """
        print("\n🌐 Checking Web UI accessibility...")
        try:
            response = self.session.get(
                f"{self.base_url}/webui/", timeout=self.timeout
            )
        except Exception as e:
            print(f"❌ Web UI access error: {e}")
            return False

        if response.status_code == 200:
            print("✅ Web UI is accessible")
            return True

        print(f"⚠️ Web UI returned: {response.status_code}")
        return False

    def run_complete_test(self):
        """Run complete OCR workflow test.

        Returns:
            True only when both the upload and the search step succeeded.
            The processing result is reported in the summary but does not
            affect the return value.
        """
        print("🚀 Starting Direct OCR PDF Workflow Test")
        print("=" * 60)

        # Step 1: Check server accessibility
        if not self.check_server_status():
            return False

        # Step 2: Check Web UI
        self.check_webui_access()

        # Step 3: Check health (may fail due to auth); informational only
        self.check_health_no_auth()

        # Step 4: Upload OCR PDF
        upload_success = self.upload_ocr_pdf_direct()

        # Step 5: Monitor processing (pointless if nothing was uploaded)
        processing_success = upload_success and self.monitor_processing()

        # Step 6: Test search
        search_success = self.test_search_without_auth()

        # Summary
        print("\n" + "=" * 60)
        print("🎯 DIRECT OCR WORKFLOW TEST SUMMARY")
        print("=" * 60)

        print(f"📊 Upload: {'✅ Success' if upload_success else '❌ Failed'}")
        print(f"📊 Processing: {'✅ Complete' if processing_success else '❌ Failed/Timeout'}")
        print(f"📊 Search: {'✅ Working' if search_success else '❌ No results'}")

        if upload_success and search_success:
            print("\n✅ SUCCESS: OCR PDF workflow is functional!")
            print(" - Upload successful")
            print(" - Search returning results")
            return True

        if upload_success:
            print("\n⚠️ PARTIAL SUCCESS: Upload worked but search issues")
            return False

        print("\n❌ FAILED: Could not complete workflow")
        return False
|
|
|
|
def main():
    """Run the OCR workflow test and exit with a matching status code.

    Prints follow-up guidance after the run, then exits the process with
    status 0 when ``run_complete_test`` returned True and 1 otherwise.
    """
    tester = OCRWorkflowTester()
    success = tester.run_complete_test()

    if success:
        print("\n🎉 OCR PDF direct workflow test PASSED!")
        print("\n📋 Next steps:")
        # Derive the URL from the tester so it cannot drift from the
        # server that was actually tested.
        print(f" 1. Access Web UI at: {tester.base_url}/webui/")
        # Credentials are deliberately not printed: account secrets must not
        # live in source code or end up in test logs.
        print(" 2. Log in with your configured account credentials")
        print(" 3. Upload documents and test search")
        sys.exit(0)

    print("\n💥 OCR PDF direct workflow test had issues.")
    print("\n🔧 Troubleshooting:")
    print(" - Check server authentication configuration")
    print(" - Verify .env file settings")
    print(" - Check database connections")
    sys.exit(1)
|
|
|
|
# Run the workflow test only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()