Files
railseek6/test_ocr_direct_workflow.py

276 lines
10 KiB
Python

#!/usr/bin/env python3
"""
Direct OCR PDF upload, indexing, and search test
This test bypasses authentication issues and tests the core OCR functionality
"""
import requests
import time
import json
import sys
from pathlib import Path
# Configuration
# Root URL of the server under test; every endpoint path used by the tester
# ("/", "/health", "/documents/upload", "/query", "/webui/") is appended to it.
BASE_URL = "http://localhost:3015"
# PDF that gets uploaded. The path is relative, so it is resolved against the
# current working directory when the script runs.
OCR_PDF_PATH = "ocr.pdf"
class OCRWorkflowTester:
    """Smoke-test the upload -> processing -> search workflow without auth.

    Every step prints a human-readable progress line and reports its outcome
    as a return value instead of raising, so one failing step never aborts
    the remaining diagnostics.
    """

    # Seconds to wait for an ordinary request before giving up. Without a
    # timeout ``requests`` waits forever on a stalled server.
    DEFAULT_TIMEOUT = 30
    # Uploads transfer the whole PDF, so they get a more generous limit.
    UPLOAD_TIMEOUT = 300

    def __init__(self, base_url=None, timeout=None):
        """Create a tester bound to one server.

        Args:
            base_url: Root URL of the server; defaults to ``BASE_URL``.
            timeout: Per-request timeout in seconds; defaults to
                ``DEFAULT_TIMEOUT``.
        """
        self.base_url = BASE_URL if base_url is None else base_url
        self.timeout = self.DEFAULT_TIMEOUT if timeout is None else timeout
        self.session = requests.Session()

    @staticmethod
    def _print_response_body(response, json_label):
        """Print a response body, as parsed JSON when possible, else raw text."""
        try:
            body = response.json()
        except ValueError:
            # ``Response.json()`` signals an unparsable body with a ValueError
            # subclass; fall back to the raw text in that case.
            print(f" Response: {response.text}")
        else:
            print(f" {json_label}: {body}")

    def check_server_status(self):
        """Check if server is accessible.

        Returns:
            True when ``GET /`` answers with 200 or 307, False otherwise
            (including when the connection fails).
        """
        print("🔍 Checking server accessibility...")
        try:
            response = self.session.get(f"{self.base_url}/", timeout=self.timeout)
        except Exception as e:
            print(f"❌ Cannot connect to server: {e}")
            return False
        if response.status_code in (200, 307):  # 307 for redirect
            print("✅ Server is accessible")
            return True
        print(f"❌ Server returned status: {response.status_code}")
        return False

    def check_health_no_auth(self):
        """Try to check health without authentication.

        Returns:
            The parsed ``/health`` JSON payload on HTTP 200, otherwise None.
        """
        print("\n🏥 Checking server health (no auth)...")
        try:
            response = self.session.get(
                f"{self.base_url}/health", timeout=self.timeout
            )
            if response.status_code == 200:
                health_data = response.json()
                print("✅ Server is healthy")
                print(f" Status: {health_data.get('status', 'unknown')}")
                print(f" Auth Mode: {health_data.get('auth_mode', 'unknown')}")
                return health_data
            print(f"⚠️ Health check returned: {response.status_code}")
            # Try to parse anyway: the body may explain the failure.
            self._print_response_body(response, "Response")
            return None
        except Exception as e:
            print(f"❌ Health check error: {e}")
            return None

    def upload_ocr_pdf_direct(self):
        """Try to upload OCR PDF without authentication.

        Returns:
            True when ``POST /documents/upload`` answers 200, False when the
            file is missing, the request fails, or any other status comes back.
        """
        print(f"\n📤 Attempting to upload OCR PDF: {OCR_PDF_PATH}")
        if not Path(OCR_PDF_PATH).exists():
            print(f"❌ OCR PDF file not found: {OCR_PDF_PATH}")
            return False
        try:
            with open(OCR_PDF_PATH, 'rb') as file:
                files = {'file': (OCR_PDF_PATH, file, 'application/pdf')}
                response = self.session.post(
                    f"{self.base_url}/documents/upload",
                    files=files,
                    timeout=max(self.timeout, self.UPLOAD_TIMEOUT),
                )
            if response.status_code == 200:
                result = response.json()
                print("✅ Upload successful")
                print(f" Status: {result.get('status', 'unknown')}")
                print(f" Message: {result.get('message', 'No message')}")
                return True
            print(f"❌ Upload failed: {response.status_code}")
            self._print_response_body(response, "Error")
            return False
        except Exception as e:
            print(f"❌ Upload error: {e}")
            return False

    def monitor_processing(self, max_wait=120):
        """Monitor document processing status.

        Polls ``/health`` about once per second until its ``pipeline_busy``
        field is falsy or ``max_wait`` seconds of wall-clock time have passed.

        Args:
            max_wait: Maximum number of seconds to keep polling.

        Returns:
            True when the pipeline reports idle, False on timeout or when a
            poll raises.
        """
        print(f"\n⏳ Monitoring processing (max {max_wait}s)...")
        started = time.monotonic()
        next_report = 0
        while True:
            # Measure real elapsed time so slow health requests count against
            # the budget instead of silently extending it.
            elapsed = time.monotonic() - started
            if elapsed >= max_wait:
                break
            try:
                # Try to get health status to check pipeline
                response = self.session.get(
                    f"{self.base_url}/health", timeout=self.timeout
                )
                if response.status_code == 200:
                    health_data = response.json()
                    # NOTE(review): a pipeline that has not started yet would
                    # also look idle here - confirm against the server API.
                    if not health_data.get('pipeline_busy', False):
                        print("✅ Processing appears complete")
                        return True
            except Exception as e:
                print(f"❌ Error monitoring processing: {e}")
                return False
            if elapsed >= next_report:  # Print status roughly every 10 seconds
                print(f" Still processing... ({int(elapsed)}s)")
                next_report = elapsed + 10
            time.sleep(1)
        print("⚠️ Processing timeout reached")
        return False

    def test_search_without_auth(self):
        """Test search functionality without authentication.

        Sends each sample query to ``POST /query`` and stops at the first one
        that yields content.

        Returns:
            True as soon as one query returns at least one chunk or a
            non-empty response without the ``[no-context]`` marker; False when
            no query does.
        """
        print("\n🔍 Testing search without authentication...")
        test_queries = [
            "LightRAG",
            "OCR technology",
            "document processing",
            "text extraction",
            "Retrieval-Augmented Generation",
        ]
        headers = {"Content-Type": "application/json"}
        for query in test_queries:
            print(f"\n Testing query: '{query}'")
            try:
                payload = {
                    "query": query,
                    "top_k": 5,
                    "only_need_context": True,
                }
                response = self.session.post(
                    f"{self.base_url}/query",
                    json=payload,
                    headers=headers,
                    timeout=self.timeout,
                )
                if response.status_code != 200:
                    print(f" ❌ Search failed: {response.status_code}")
                    self._print_response_body(response, "Error")
                    continue

                results = response.json()
                chunks = results.get('chunks') if isinstance(results, dict) else None
                # Check if we got actual content
                if chunks:
                    print(f" ✅ Found {len(chunks)} results")
                    # Show first result preview
                    first_chunk = chunks[0]
                    content = str(first_chunk.get('text') or '')[:150] + "..."
                    score = first_chunk.get('score', 0)
                    print(f" 📄 Preview: {content}")
                    # A missing or non-numeric score must not turn a
                    # successful hit into a reported search error.
                    if isinstance(score, (int, float)):
                        print(f" 📊 Score: {score:.3f}")
                    else:
                        print(f" 📊 Score: {score}")
                    return True  # At least one successful search
                if isinstance(results, dict) and 'response' in results:
                    # Check if LLM responded with content
                    response_text = results['response']
                    has_content = (
                        isinstance(response_text, str)
                        and response_text != ""
                        and "[no-context]" not in response_text
                    )
                    if has_content:
                        print(" ✅ LLM generated response")
                        print(f" 🤖 Response: {response_text[:150]}...")
                        return True
                    print(" ⚠️ No context found for query")
                else:
                    print(" ⚠️ No results found")
            except Exception as e:
                print(f" ❌ Search error: {e}")
        return False

    def check_webui_access(self):
        """Check if Web UI is accessible.

        Returns:
            True when ``GET /webui/`` answers 200, False otherwise.
        """
        print("\n🌐 Checking Web UI accessibility...")
        try:
            response = self.session.get(
                f"{self.base_url}/webui/", timeout=self.timeout
            )
        except Exception as e:
            print(f"❌ Web UI access error: {e}")
            return False
        if response.status_code == 200:
            print("✅ Web UI is accessible")
            return True
        print(f"⚠️ Web UI returned: {response.status_code}")
        return False

    def run_complete_test(self):
        """Run complete OCR workflow test.

        Returns:
            True only when both the upload and the search step succeed. The
            processing result is shown in the summary but does not affect the
            verdict. Returns False immediately if the server is unreachable.
        """
        print("🚀 Starting Direct OCR PDF Workflow Test")
        print("=" * 60)
        # Step 1: Check server accessibility
        if not self.check_server_status():
            return False
        # Step 2: Check Web UI (informational only)
        self.check_webui_access()
        # Step 3: Check health (may fail due to auth; informational only)
        self.check_health_no_auth()
        # Step 4: Upload OCR PDF
        upload_success = self.upload_ocr_pdf_direct()
        # Step 5: Monitor processing (pointless when nothing was uploaded)
        processing_success = self.monitor_processing() if upload_success else False
        # Step 6: Test search (runs even after a failed upload)
        search_success = self.test_search_without_auth()
        # Summary
        print("\n" + "=" * 60)
        print("🎯 DIRECT OCR WORKFLOW TEST SUMMARY")
        print("=" * 60)
        print(f"📊 Upload: {'✅ Success' if upload_success else '❌ Failed'}")
        print(f"📊 Processing: {'✅ Complete' if processing_success else '❌ Failed/Timeout'}")
        print(f"📊 Search: {'✅ Working' if search_success else '❌ No results'}")
        if upload_success and search_success:
            print("\n✅ SUCCESS: OCR PDF workflow is functional!")
            print(" - Upload successful")
            print(" - Search returning results")
            return True
        if upload_success:
            print("\n⚠️ PARTIAL SUCCESS: Upload worked but search issues")
            return False
        print("\n❌ FAILED: Could not complete workflow")
        return False
def main():
    """Run the complete workflow test and exit with a matching status code.

    Exits with status 0 when the workflow test passes and 1 otherwise, after
    printing either follow-up steps or troubleshooting hints.
    """
    tester = OCRWorkflowTester()
    success = tester.run_complete_test()
    if success:
        print("\n🎉 OCR PDF direct workflow test PASSED!")
        print("\n📋 Next steps:")
        # Point at the server that was actually tested rather than a second,
        # hard-coded copy of the URL.
        print(f" 1. Access Web UI at: {tester.base_url}/webui/")
        # Credentials are deliberately not printed: secrets do not belong in
        # source code or in test output.
        print(" 2. Log in with the account configured for this server")
        print(" 3. Upload documents and test search")
        sys.exit(0)
    else:
        print("\n💥 OCR PDF direct workflow test had issues.")
        print("\n🔧 Troubleshooting:")
        print(" - Check server authentication configuration")
        print(" - Verify .env file settings")
        print(" - Check database connections")
        sys.exit(1)


# Run the workflow only when executed as a script, not when imported.
if __name__ == "__main__":
    main()