Files
railseek6/test_webui_ocr_workflow.py

298 lines
11 KiB
Python

#!/usr/bin/env python3
"""
Test OCR PDF upload, indexing, and search through Web UI simulation
This script simulates the complete web UI workflow for OCR PDF processing
"""
import json
import os
import sys
import time
from pathlib import Path

import requests
# Configuration
# Every value can be overridden through an environment variable so the script
# can target another server, account, or document without editing the source.
# The fallbacks are the values that used to be hard-coded, so running with a
# clean environment behaves exactly as before.
# NOTE(review): the fallback credentials are committed in plain text; prefer
# supplying WEBUI_USERNAME / WEBUI_PASSWORD from the environment.
# A trailing slash is dropped because endpoint paths are appended as "/...".
BASE_URL = os.environ.get("WEBUI_BASE_URL", "http://localhost:3015").rstrip("/")
USERNAME = os.environ.get("WEBUI_USERNAME", "jleu3482")
PASSWORD = os.environ.get("WEBUI_PASSWORD", "jleu1212")
OCR_PDF_PATH = os.environ.get("WEBUI_OCR_PDF_PATH", "ocr.pdf")
class WebUITester:
    """Drive the Web UI's HTTP API through a full OCR PDF workflow.

    The workflow is: log in (``/login``), check ``/health``, upload the PDF
    (``/documents/upload``), poll ``/health`` until the pipeline is idle,
    read ``/documents/status_counts`` and finally run a few ``/query``
    searches. Every step prints its progress and reports success as a
    boolean (or ``None`` / a dict for the status step) instead of raising.
    """

    # (connect, read) timeout in seconds applied to every HTTP call. Without
    # a timeout ``requests`` waits indefinitely on a stalled server. The read
    # timeout is generous because a query may have to wait on a slow model.
    REQUEST_TIMEOUT = (10, 300)

    def __init__(self):
        """Create a tester bound to ``BASE_URL`` with a fresh HTTP session."""
        self.base_url = BASE_URL
        self.session = requests.Session()
        # Filled in by login(); stays None until a token has been obtained.
        self.access_token = None

    def _auth_headers(self, extra=None):
        """Return request headers for an authenticated call.

        The ``Authorization`` header is only added once a token is known, so
        a literal ``"Bearer None"`` is never sent. ``extra`` is an optional
        dict of additional headers merged on top.
        """
        headers = {}
        if self.access_token:
            headers["Authorization"] = f"Bearer {self.access_token}"
        if extra:
            headers.update(extra)
        return headers

    @staticmethod
    def _status_total(status_counts, name):
        """Sum the counts whose status key equals ``name`` ignoring case.

        NOTE(review): the server's key casing is not visible from this
        script, so both ``"PROCESSED"`` and ``"processed"`` are accepted.
        """
        wanted = name.upper()
        return sum(
            count
            for status, count in status_counts.items()
            if str(status).upper() == wanted
        )

    def login(self):
        """Login to get JWT token.

        Posts ``USERNAME`` / ``PASSWORD`` as form data to ``/login`` and
        stores the returned ``access_token`` on the instance.

        Returns:
            True when the server answered 200, False on any other status or
            on any error while talking to the server.
        """
        print("🔐 Logging in to Web UI...")
        credentials = {
            "username": USERNAME,
            "password": PASSWORD,
        }
        # Broad catch is deliberate: this is a script-level boundary that
        # turns any failure into a printed message and a False result.
        try:
            response = self.session.post(
                f"{self.base_url}/login",
                data=credentials,
                timeout=self.REQUEST_TIMEOUT,
            )
            if response.status_code != 200:
                print(f"❌ Login failed: {response.status_code} - {response.text}")
                return False
            result = response.json()
            self.access_token = result.get("access_token")
            print("✅ Login successful")
            print(f" Auth Mode: {result.get('auth_mode', 'unknown')}")
            if not self.access_token:
                # Keep going: later requests are simply sent unauthenticated.
                print(" ⚠️ No access token returned; continuing without Authorization header")
            return True
        except Exception as e:
            print(f"❌ Login error: {e}")
            return False

    def check_server_health(self):
        """Check server health.

        Calls ``/health`` and prints the LLM, embedding and rerank bindings
        found under ``configuration``. Missing entries are printed as
        ``unknown`` rather than turning a healthy response into a failure.

        Returns:
            True on a 200 response, False otherwise or on error.
        """
        print("\n🏥 Checking server health...")
        try:
            response = self.session.get(
                f"{self.base_url}/health",
                headers=self._auth_headers(),
                timeout=self.REQUEST_TIMEOUT,
            )
            if response.status_code != 200:
                print(f"❌ Health check failed: {response.status_code}")
                return False
            configuration = response.json().get('configuration') or {}
            print("✅ Server is healthy")
            for label, key in (
                ("LLM", 'llm_binding'),
                ("Embedding", 'embedding_binding'),
                ("Rerank", 'rerank_binding'),
            ):
                print(f" {label}: {configuration.get(key, 'unknown')}")
            return True
        except Exception as e:
            print(f"❌ Health check error: {e}")
            return False

    def upload_ocr_pdf(self):
        """Upload OCR PDF through web UI API.

        Sends the file at ``OCR_PDF_PATH`` as a multipart upload to
        ``/documents/upload``.

        Returns:
            True on a 200 response; False when the file is missing, the
            server answers with another status, or an error occurs.
        """
        print(f"\n📤 Uploading OCR PDF: {OCR_PDF_PATH}")
        pdf_path = Path(OCR_PDF_PATH)
        if not pdf_path.exists():
            print(f"❌ OCR PDF file not found: {OCR_PDF_PATH}")
            return False
        try:
            with pdf_path.open('rb') as file:
                # Send only the base name so a directory prefix in
                # OCR_PDF_PATH does not leak into the uploaded filename.
                files = {'file': (pdf_path.name, file, 'application/pdf')}
                response = self.session.post(
                    f"{self.base_url}/documents/upload",
                    files=files,
                    headers=self._auth_headers(),
                    timeout=self.REQUEST_TIMEOUT,
                )
            if response.status_code != 200:
                print(f"❌ Upload failed: {response.status_code} - {response.text}")
                return False
            result = response.json()
            print("✅ Upload successful")
            print(f" Status: {result.get('status', 'unknown')}")
            print(f" Message: {result.get('message', 'No message')}")
            if result.get('track_id'):
                print(f" Track ID: {result.get('track_id')}")
            return True
        except Exception as e:
            print(f"❌ Upload error: {e}")
            return False

    def wait_for_indexing(self, max_wait=180):
        """Wait for document indexing to complete.

        Polls ``/health`` about once per second until ``pipeline_busy`` is
        falsy. The limit is measured in elapsed wall-clock time, so slow
        health responses count against ``max_wait`` as well.

        NOTE(review): the first poll may run before the server has flagged
        the pipeline as busy, which would be reported as "completed";
        confirm against the server's behaviour.

        Args:
            max_wait: Maximum number of seconds to keep polling.

        Returns:
            True once the pipeline is idle; False on timeout or on any
            error while polling.
        """
        print(f"\n⏳ Waiting for indexing (max {max_wait}s)...")
        started = time.monotonic()
        next_report = 0  # elapsed-seconds mark for the next progress line
        while True:
            elapsed = time.monotonic() - started
            if elapsed >= max_wait:
                break
            try:
                # Check pipeline status
                response = self.session.get(
                    f"{self.base_url}/health",
                    headers=self._auth_headers(),
                    timeout=self.REQUEST_TIMEOUT,
                )
                if response.status_code == 200:
                    if not response.json().get('pipeline_busy', False):
                        print("✅ Indexing completed!")
                        return True
                if elapsed >= next_report:  # Print status every 10 seconds
                    print(f" Still indexing... ({int(elapsed)}s)")
                    next_report = int(elapsed) + 10
                time.sleep(1)
            except Exception as e:
                print(f"❌ Error checking indexing status: {e}")
                return False
        print("❌ Indexing timeout reached")
        return False

    def check_document_status(self):
        """Check document processing status.

        Returns:
            The ``status_counts`` mapping from ``/documents/status_counts``
            (empty dict when the key is absent), or None when the request
            fails or does not return 200.
        """
        print("\n📊 Checking document status...")
        try:
            response = self.session.get(
                f"{self.base_url}/documents/status_counts",
                headers=self._auth_headers(),
                timeout=self.REQUEST_TIMEOUT,
            )
            if response.status_code != 200:
                print(f"❌ Failed to get status: {response.status_code}")
                return None
            status_counts = response.json().get('status_counts', {})
            print("📈 Document Status Counts:")
            for status, count in status_counts.items():
                print(f" {status}: {count}")
            return status_counts
        except Exception as e:
            print(f"❌ Error checking document status: {e}")
            return None

    def _report_query_result(self, results):
        """Print the outcome of one ``/query`` response body.

        Returns True when the body holds usable content: a non-empty
        ``chunks`` list, or a non-blank ``response`` string that does not
        contain the ``[no-context]`` marker.
        """
        chunks = results.get('chunks')
        if chunks:
            print(f" ✅ Found {len(chunks)} results")
            # Show first result preview
            first_chunk = chunks[0]
            content = (first_chunk.get('text') or '')[:150] + "..."
            score = first_chunk.get('score', 0)
            print(f" 📄 Preview: {content}")
            if isinstance(score, (int, float)):
                print(f" 📊 Score: {score:.3f}")
            else:
                # A missing or non-numeric score must not void a real hit.
                print(f" 📊 Score: {score}")
            return True
        if 'response' in results:
            # Check if LLM responded with content
            answer = results['response']
            if answer is None:
                response_text = ""
            elif isinstance(answer, str):
                response_text = answer
            else:
                response_text = str(answer)
            if response_text.strip() and "[no-context]" not in response_text:
                print(" ✅ LLM generated response")
                print(f" 🤖 Response: {response_text[:150]}...")
                return True
            print(" ⚠️ No context found for query")
            return False
        print(" ⚠️ No results found")
        return False

    def search_ocr_content(self):
        """Search for OCR content using web UI search.

        Posts a fixed list of queries to ``/query`` (``top_k`` 5,
        ``only_need_context`` true) and counts those that return content.
        A failure of one query is printed and does not stop the others.

        Returns:
            True when at least one query returned content.
        """
        print("\n🔍 Testing search functionality...")
        test_queries = [
            "LightRAG",
            "OCR technology",
            "document processing",
            "text extraction",
            "Retrieval-Augmented Generation",
        ]
        headers = self._auth_headers({"Content-Type": "application/json"})
        successful_searches = 0
        for query in test_queries:
            print(f"\n Testing query: '{query}'")
            try:
                payload = {
                    "query": query,
                    "top_k": 5,
                    "only_need_context": True,
                }
                response = self.session.post(
                    f"{self.base_url}/query",
                    json=payload,
                    headers=headers,
                    timeout=self.REQUEST_TIMEOUT,
                )
                if response.status_code != 200:
                    print(f" ❌ Search failed: {response.status_code} - {response.text}")
                    continue
                if self._report_query_result(response.json()):
                    successful_searches += 1
            except Exception as e:
                print(f" ❌ Search error: {e}")
        print(f"\n📊 Search Summary: {successful_searches}/{len(test_queries)} queries successful")
        return successful_searches > 0

    def run_complete_test(self):
        """Run complete Web UI OCR workflow test.

        Login, health check, upload and indexing are mandatory: the run
        stops with False at the first one that fails. Document status and
        search are then both evaluated and summarised.

        Returns:
            True only when at least one document is reported as processed
            and at least one search returned content.
        """
        print("🚀 Starting Complete Web UI OCR PDF Workflow Test")
        print("=" * 70)
        # Steps 1-4: login, server health, upload, wait for indexing.
        mandatory_steps = (
            self.login,
            self.check_server_health,
            self.upload_ocr_pdf,
            self.wait_for_indexing,
        )
        for step in mandatory_steps:
            if not step():
                return False
        # Step 5: Check document status
        status_counts = self.check_document_status()
        # Step 6: Test search
        search_success = self.search_ocr_content()
        # Summary
        print("\n" + "=" * 70)
        print("🎯 WEB UI OCR WORKFLOW TEST SUMMARY")
        print("=" * 70)
        if not status_counts:
            print("❌ FAILED: Could not complete workflow")
            return False
        processed = self._status_total(status_counts, 'PROCESSED')
        failed = self._status_total(status_counts, 'FAILED')
        print(f"📊 Documents: {processed} processed, {failed} failed")
        if processed > 0 and search_success:
            print("✅ SUCCESS: OCR PDF workflow completed successfully!")
            print(" - Upload successful")
            print(" - Indexing completed")
            print(" - Search returning results")
            return True
        print("⚠️ PARTIAL SUCCESS: Some steps completed but issues detected")
        return False
def main():
    """Run the full workflow test and exit with 0 on success, 1 on failure."""
    passed = WebUITester().run_complete_test()
    if not passed:
        print("\n💥 OCR PDF Web UI workflow test FAILED!")
        sys.exit(1)
    print("\n🎉 OCR PDF Web UI workflow test PASSED!")
    sys.exit(0)


# Only run the workflow when executed as a script, not when imported.
if __name__ == "__main__":
    main()