298 lines
11 KiB
Python
298 lines
11 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Test OCR PDF upload, indexing, and search through Web UI simulation
|
|
This script simulates the complete web UI workflow for OCR PDF processing
|
|
"""
|
|
|
|
import json
import os
import sys
import time
from pathlib import Path

import requests
|
|
|
|
# Configuration
#
# Every value can be overridden through an environment variable so the target
# server and the credentials do not have to be edited into this file. The
# literals below are only the defaults used when the variable is unset.
# A trailing slash is stripped from the base URL because every endpoint is
# appended as "/<path>".
BASE_URL = os.environ.get("WEBUI_TEST_BASE_URL", "http://localhost:3015").rstrip("/")
USERNAME = os.environ.get("WEBUI_TEST_USERNAME", "jleu3482")
PASSWORD = os.environ.get("WEBUI_TEST_PASSWORD", "jleu1212")
OCR_PDF_PATH = os.environ.get("WEBUI_TEST_OCR_PDF", "ocr.pdf")
|
|
|
|
class WebUITester:
    """Drive the server's HTTP API through an OCR PDF upload/index/search cycle.

    The workflow is: log in, check ``/health``, upload the PDF, wait until the
    pipeline is no longer busy, read the document status counts and run a few
    search queries. Every step prints its progress and reports the outcome
    through its return value instead of raising, so the caller can always
    print a summary.

    Attributes:
        base_url: Root URL of the server under test.
        session: ``requests.Session`` shared by all calls.
        access_token: Token taken from the ``/login`` response; ``None`` until
            :meth:`login` has stored one.
    """

    # Seconds a single HTTP request may stall before it is abandoned. Without
    # an explicit timeout ``requests`` waits indefinitely, so an unresponsive
    # server would hang the whole test run.
    REQUEST_TIMEOUT = 60

    # Seconds to sleep between two polls in wait_for_indexing().
    POLL_INTERVAL = 1.0

    # Seconds between two "Still indexing..." progress lines.
    PROGRESS_INTERVAL = 10

    def __init__(self):
        """Create the HTTP session; no request is sent yet."""
        self.base_url = BASE_URL
        self.session = requests.Session()
        self.access_token = None

    def _auth_headers(self, extra=None):
        """Return request headers, adding the bearer token only if one is held.

        Args:
            extra: Optional mapping of additional headers to include.

        Returns:
            dict: A new header dictionary; ``extra`` is not modified.
        """
        headers = dict(extra or {})
        if self.access_token:
            headers["Authorization"] = f"Bearer {self.access_token}"
        return headers

    def login(self):
        """Log in and remember the access token from the response.

        Returns:
            bool: ``True`` when ``/login`` answers with status 200, ``False``
            for any other status or when the request or decoding fails.
        """
        print("🔐 Logging in to Web UI...")
        credentials = {"username": USERNAME, "password": PASSWORD}

        try:
            response = self.session.post(
                f"{self.base_url}/login",
                data=credentials,
                timeout=self.REQUEST_TIMEOUT,
            )
            if response.status_code != 200:
                print(f"❌ Login failed: {response.status_code} - {response.text}")
                return False

            result = response.json()
            self.access_token = result.get("access_token")
            print("✅ Login successful")
            print(f" Auth Mode: {result.get('auth_mode', 'unknown')}")
            if not self.access_token:
                # Still treated as a successful login; later requests simply
                # go out without an Authorization header.
                print("⚠️ No access token in login response")
            return True
        except Exception as exc:
            # Best-effort script: report the problem and let the caller stop.
            print(f"❌ Login error: {exc}")
            return False

    def check_server_health(self):
        """Query ``/health`` and print the reported bindings.

        Missing ``configuration`` entries are shown as ``unknown`` rather than
        turning a 200 answer into a failed health check.

        Returns:
            bool: ``True`` when ``/health`` answers with status 200.
        """
        print("\n🏥 Checking server health...")

        try:
            response = self.session.get(
                f"{self.base_url}/health",
                headers=self._auth_headers(),
                timeout=self.REQUEST_TIMEOUT,
            )
            if response.status_code != 200:
                print(f"❌ Health check failed: {response.status_code}")
                return False

            configuration = response.json().get("configuration") or {}
            print("✅ Server is healthy")
            print(f" LLM: {configuration.get('llm_binding', 'unknown')}")
            print(f" Embedding: {configuration.get('embedding_binding', 'unknown')}")
            print(f" Rerank: {configuration.get('rerank_binding', 'unknown')}")
            return True
        except Exception as exc:
            print(f"❌ Health check error: {exc}")
            return False

    def upload_ocr_pdf(self):
        """Upload the PDF at ``OCR_PDF_PATH`` to ``/documents/upload``.

        Returns:
            bool: ``True`` when the server answers with status 200, ``False``
            when the file is missing, the status differs or an error occurs.
        """
        print(f"\n📤 Uploading OCR PDF: {OCR_PDF_PATH}")

        pdf_path = Path(OCR_PDF_PATH)
        if not pdf_path.is_file():
            print(f"❌ OCR PDF file not found: {OCR_PDF_PATH}")
            return False

        try:
            with pdf_path.open("rb") as pdf_file:
                # Send only the file name, not any directory part of the path.
                files = {"file": (pdf_path.name, pdf_file, "application/pdf")}
                response = self.session.post(
                    f"{self.base_url}/documents/upload",
                    files=files,
                    headers=self._auth_headers(),
                    timeout=self.REQUEST_TIMEOUT,
                )

            if response.status_code != 200:
                print(f"❌ Upload failed: {response.status_code} - {response.text}")
                return False

            result = response.json()
            print("✅ Upload successful")
            print(f" Status: {result.get('status', 'unknown')}")
            print(f" Message: {result.get('message', 'No message')}")
            if result.get("track_id"):
                print(f" Track ID: {result.get('track_id')}")
            return True
        except Exception as exc:
            print(f"❌ Upload error: {exc}")
            return False

    def wait_for_indexing(self, max_wait=180):
        """Poll ``/health`` until ``pipeline_busy`` is false or time runs out.

        The limit is measured against a monotonic clock, so the time spent in
        the requests themselves counts toward ``max_wait``.

        Args:
            max_wait: Maximum number of seconds to wait. With ``0`` no poll is
                made and the call reports a timeout.

        Returns:
            bool: ``True`` once a 200 answer reports the pipeline as not busy,
            ``False`` on timeout or when a poll raises.
        """
        print(f"\n⏳ Waiting for indexing (max {max_wait}s)...")

        headers = self._auth_headers()
        started = time.monotonic()
        deadline = started + max_wait
        next_progress = 0.0

        # NOTE(review): a poll made before the server has started processing
        # the upload would also see "not busy" - confirm the server sets
        # pipeline_busy before the upload request returns.
        while time.monotonic() < deadline:
            try:
                response = self.session.get(
                    f"{self.base_url}/health",
                    headers=headers,
                    timeout=self.REQUEST_TIMEOUT,
                )
                # Non-200 answers are ignored and polled again.
                if response.status_code == 200:
                    # A missing key is treated as "not busy".
                    if not response.json().get("pipeline_busy", False):
                        print("✅ Indexing completed!")
                        return True
            except Exception as exc:
                print(f"❌ Error checking indexing status: {exc}")
                return False

            elapsed = time.monotonic() - started
            if elapsed >= next_progress:
                print(f" Still indexing... ({int(elapsed)}s)")
                next_progress = elapsed + self.PROGRESS_INTERVAL

            remaining = deadline - time.monotonic()
            if remaining <= 0:
                break
            # Never sleep past the deadline.
            time.sleep(min(self.POLL_INTERVAL, remaining))

        print("❌ Indexing timeout reached")
        return False

    def check_document_status(self):
        """Fetch and print the per-status document counts.

        Returns:
            dict | None: The ``status_counts`` mapping from
            ``/documents/status_counts`` (empty if the key is absent), or
            ``None`` when the status is not 200 or an error occurs.
        """
        print("\n📊 Checking document status...")

        try:
            response = self.session.get(
                f"{self.base_url}/documents/status_counts",
                headers=self._auth_headers(),
                timeout=self.REQUEST_TIMEOUT,
            )
            if response.status_code != 200:
                print(f"❌ Failed to get status: {response.status_code}")
                return None

            status_counts = response.json().get("status_counts", {})
            print("📈 Document Status Counts:")
            for status, count in status_counts.items():
                print(f" {status}: {count}")
            return status_counts
        except Exception as exc:
            print(f"❌ Error checking document status: {exc}")
            return None

    @staticmethod
    def _report_query_result(results):
        """Print what one ``/query`` answer contained.

        Args:
            results: Decoded JSON body of a ``/query`` response.

        Returns:
            bool: ``True`` when the answer holds at least one chunk, or a
            non-empty ``response`` text without the ``[no-context]`` marker.
        """
        if not isinstance(results, dict):
            print(" ⚠️ No results found")
            return False

        chunks = results.get("chunks")
        if chunks:
            print(f" ✅ Found {len(chunks)} results")

            # Show a preview of the first result only.
            first_chunk = chunks[0]
            preview = str(first_chunk.get("text") or "")[:150] + "..."
            score = first_chunk.get("score", 0)
            # Fixed-point formatting is applied only to numeric scores;
            # anything else is printed as-is instead of raising.
            if isinstance(score, (int, float)):
                score_text = f"{score:.3f}"
            else:
                score_text = str(score)
            print(f" 📄 Preview: {preview}")
            print(f" 📊 Score: {score_text}")
            return True

        if "response" in results:
            response_text = str(results["response"] or "")
            # An empty answer carries no content, so it is not a hit either.
            if response_text and "[no-context]" not in response_text:
                print(" ✅ LLM generated response")
                print(f" 🤖 Response: {response_text[:150]}...")
                return True
            print(" ⚠️ No context found for query")
            return False

        print(" ⚠️ No results found")
        return False

    def search_ocr_content(self):
        """Run a fixed list of queries against ``/query``.

        Returns:
            bool: ``True`` when at least one query returned content.
        """
        print("\n🔍 Testing search functionality...")

        test_queries = [
            "LightRAG",
            "OCR technology",
            "document processing",
            "text extraction",
            "Retrieval-Augmented Generation",
        ]
        headers = self._auth_headers({"Content-Type": "application/json"})
        successful_searches = 0

        for query in test_queries:
            print(f"\n Testing query: '{query}'")
            payload = {
                "query": query,
                "top_k": 5,
                "only_need_context": True,
            }
            try:
                response = self.session.post(
                    f"{self.base_url}/query",
                    json=payload,
                    headers=headers,
                    timeout=self.REQUEST_TIMEOUT,
                )
                if response.status_code != 200:
                    print(f" ❌ Search failed: {response.status_code} - {response.text}")
                    continue
                if self._report_query_result(response.json()):
                    successful_searches += 1
            except Exception as exc:
                # One failing query must not stop the remaining ones.
                print(f" ❌ Search error: {exc}")

        print(f"\n📊 Search Summary: {successful_searches}/{len(test_queries)} queries successful")
        return successful_searches > 0

    def run_complete_test(self):
        """Run all workflow steps in order and print a summary.

        Login, health check, upload and the indexing wait each abort the run
        when they fail. The status check and the search always both run, and
        their results decide the verdict.

        Returns:
            bool: ``True`` only when at least one document is reported as
            ``PROCESSED`` and at least one search returned content.
        """
        print("🚀 Starting Complete Web UI OCR PDF Workflow Test")
        print("=" * 70)

        # Steps 1-4: each one is a precondition for everything after it.
        prerequisites = (
            self.login,
            self.check_server_health,
            self.upload_ocr_pdf,
            self.wait_for_indexing,
        )
        for step in prerequisites:
            if not step():
                return False

        # Step 5: document status counts.
        status_counts = self.check_document_status()

        # Step 6: search.
        search_success = self.search_ocr_content()

        print("\n" + "=" * 70)
        print("🎯 WEB UI OCR WORKFLOW TEST SUMMARY")
        print("=" * 70)

        if not status_counts:
            print("❌ FAILED: Could not complete workflow")
            return False

        # NOTE(review): assumes the server reports upper-case status keys and
        # that the counts cover all documents, not only this upload - confirm.
        processed = status_counts.get("PROCESSED", 0)
        failed = status_counts.get("FAILED", 0)
        print(f"📊 Documents: {processed} processed, {failed} failed")

        if processed > 0 and search_success:
            print("✅ SUCCESS: OCR PDF workflow completed successfully!")
            print(" - Upload successful")
            print(" - Indexing completed")
            print(" - Search returning results")
            return True

        print("⚠️ PARTIAL SUCCESS: Some steps completed but issues detected")
        return False
|
|
|
|
def main():
    """Run the workflow test and exit with status 0 on success, 1 on failure."""
    passed = WebUITester().run_complete_test()

    if not passed:
        print("\n💥 OCR PDF Web UI workflow test FAILED!")
        sys.exit(1)

    print("\n🎉 OCR PDF Web UI workflow test PASSED!")
    sys.exit(0)


if __name__ == "__main__":
    main()