"""
Complete OCR PDF Upload, Indexing, and Search Test for LightRAG Web UI

Tests the entire workflow: upload ocr.pdf → indexing → search functionality.
Uses the API endpoints that match the server's route structure.
"""

import json
import os
import sys
import time
from pathlib import Path

import requests

# Configuration
# Connection settings may be overridden through environment variables so that
# credentials do not have to be edited into the script; when a variable is
# unset the original hard-coded value is used.
BASE_URL = os.environ.get("LIGHTRAG_BASE_URL", "http://localhost:3015")
USERNAME = os.environ.get("LIGHTRAG_USERNAME", "jleu3482")
PASSWORD = os.environ.get("LIGHTRAG_PASSWORD", "jleu1212")
# Path of the PDF to upload; a relative path is resolved against the current
# working directory.
OCR_PDF_PATH = os.environ.get("LIGHTRAG_OCR_PDF", "ocr.pdf")
# Queries sent to the /query endpoint once indexing has finished.
TEST_QUERIES = [
    "LightRAG",
    "OCR",
    "document processing",
    "text extraction",
]

class LightRAGWebUITest:
    """Drive a LightRAG server through login, upload, indexing and search.

    Each step method prints its progress and returns ``True`` or ``False``
    rather than raising, so ``run_complete_test`` can run the steps in order
    and print a pass/fail summary for all of them.
    """

    def __init__(self):
        """Create the HTTP session and copy the module-level settings."""
        self.session = requests.Session()
        self.base_url = BASE_URL
        self.username = USERNAME
        self.password = PASSWORD
        # Set by login(); the other steps send it as a bearer token.
        self.access_token = None

    def _auth_headers(self):
        """Return the ``Authorization`` header carrying the current token."""
        return {"Authorization": f"Bearer {self.access_token}"}

    def login(self):
        """Login and get JWT token.

        Posts the credentials as form data to ``/login`` and stores the
        ``access_token`` field of the JSON reply on ``self.access_token``.

        Returns:
            True when the server answered 200 with a non-empty token,
            False on any other status, a missing token, or an exception.
        """
        print("=== Logging In ===")
        try:
            # Use form data for OAuth2 password flow
            form_data = {
                "username": self.username,
                "password": self.password,
            }
            headers = {
                "Content-Type": "application/x-www-form-urlencoded",
            }
            response = self.session.post(
                f"{self.base_url}/login",
                data=form_data,
                headers=headers,
            )

            if response.status_code != 200:
                print(f"❌ Login failed: {response.status_code} - {response.text}")
                return False

            login_data = response.json()
            self.access_token = login_data.get("access_token")
            if self.access_token:
                print("✅ Login successful")
                return True
            print("❌ Login failed: No access token received")
            return False
        except Exception as e:
            print(f"❌ Login error: {e}")
            return False

    def test_health(self):
        """Test server health.

        Calls ``/health`` with the bearer token and prints the LLM,
        embedding and rerank bindings found under ``configuration``.

        Returns:
            True on a 200 reply, False on any other status or an exception.
        """
        print("=== Testing Server Health ===")
        try:
            response = self.session.get(
                f"{self.base_url}/health", headers=self._auth_headers()
            )
            if response.status_code != 200:
                print(f"❌ Server health check failed: {response.status_code}")
                return False

            health_data = response.json()
            # Looked up once; every binding below is read from this mapping.
            configuration = health_data.get('configuration', {})
            print("✅ Server is healthy")
            print(f" LLM Binding: {configuration.get('llm_binding', 'N/A')}")
            print(f" Embedding Binding: {configuration.get('embedding_binding', 'N/A')}")
            print(f" Rerank Binding: {configuration.get('rerank_binding', 'N/A')}")
            return True
        except Exception as e:
            print(f"❌ Server health check error: {e}")
            return False

    def test_webui_accessibility(self):
        """Test web UI accessibility.

        Returns:
            True when ``GET /webui/`` answers 200, False on any other status
            or an exception.
        """
        print("\n=== Testing Web UI Accessibility ===")
        try:
            # Test web UI access
            response = self.session.get(f"{self.base_url}/webui/")
            if response.status_code == 200:
                print("✅ Web UI accessible")
                return True
            print(f"❌ Web UI access failed: {response.status_code}")
            return False
        except Exception as e:
            print(f"❌ Web UI access error: {e}")
            return False

    def upload_ocr_pdf(self, pdf_path=None):
        """Upload the OCR PDF to ``/documents/upload``.

        Args:
            pdf_path: File to upload. Defaults to the module-level
                ``OCR_PDF_PATH`` when omitted.

        Returns:
            True on a 200 reply; False when the file does not exist, the
            server answers with another status, or an exception occurs.
        """
        print("\n=== Uploading OCR PDF ===")

        if pdf_path is None:
            pdf_path = OCR_PDF_PATH

        if not os.path.exists(pdf_path):
            print(f"❌ OCR PDF file not found: {pdf_path}")
            return False

        try:
            # The context manager guarantees the handle is closed even when
            # the request raises; previously the file was never closed.
            with open(pdf_path, 'rb') as pdf_file:
                files = {
                    'file': (
                        os.path.basename(pdf_path),
                        pdf_file,
                        'application/pdf',
                    )
                }
                print(f"📤 Uploading {pdf_path}...")
                response = self.session.post(
                    f"{self.base_url}/documents/upload",
                    files=files,
                    headers=self._auth_headers(),
                )

            if response.status_code == 200:
                result = response.json()
                print(f"✅ Upload successful: {result}")
                return True
            print(f"❌ Upload failed: {response.status_code} - {response.text}")
            return False
        except Exception as e:
            print(f"❌ Upload error: {e}")
            return False

    def wait_for_indexing(self, timeout=120, poll_interval=5):
        """Wait for document indexing to complete.

        Polls ``/documents/pipeline_status`` and ``/documents`` until either
        reports completion or ``timeout`` seconds have elapsed.

        Args:
            timeout: Maximum number of seconds to keep polling.
            poll_interval: Seconds to wait between polls; the final wait is
                shortened so the method does not overrun ``timeout``.

        Returns:
            True as soon as the pipeline reports ``busy`` false, or the
            document list shows at least one processed document and none
            pending or processing. False when the deadline passes first.

        NOTE(review): an idle pipeline is taken as "finished" even if it has
        not started on the upload yet - confirm the server marks the pipeline
        busy before the upload request returns.
        """
        print(f"\n=== Waiting for Indexing (max {timeout}s) ===")

        headers = self._auth_headers()
        # Monotonic clock: unaffected by wall-clock adjustments during the wait.
        deadline = time.monotonic() + timeout

        while time.monotonic() < deadline:
            try:
                # Check pipeline status
                response = self.session.get(
                    f"{self.base_url}/documents/pipeline_status",
                    headers=headers,
                )
                if response.status_code == 200:
                    pipeline_status = response.json()
                    busy = pipeline_status.get('busy', False)
                    latest_message = pipeline_status.get('latest_message', '')
                    print(f"🔄 Pipeline status: busy={busy}, message='{latest_message}'")

                    if not busy:
                        print("✅ Pipeline processing completed!")
                        return True

                # Check document status
                response = self.session.get(
                    f"{self.base_url}/documents",
                    headers=headers,
                )
                if response.status_code == 200:
                    documents = response.json()
                    statuses = documents.get('statuses', {})

                    processed_count = len(statuses.get('PROCESSED', []))
                    pending_count = len(statuses.get('PENDING', []))
                    processing_count = len(statuses.get('PROCESSING', []))

                    print(f"📊 Documents: {processed_count} processed, {pending_count} pending, {processing_count} processing")

                    if pending_count == 0 and processing_count == 0 and processed_count > 0:
                        print("✅ All documents processed!")
                        return True
            except Exception as e:
                # A failed poll is reported and retried after the usual wait.
                print(f"⚠️ Error checking indexing status: {e}")

            # Wait between checks, but never beyond the deadline.
            remaining = deadline - time.monotonic()
            if remaining > 0:
                time.sleep(min(poll_interval, remaining))

        print("⏰ Indexing timeout reached")
        return False

    def test_search_queries(self, queries=None):
        """Test search functionality with OCR content using query endpoint.

        Posts each query to ``/query`` with ``top_k`` 5 and
        ``only_need_context`` true. A query counts as successful when the
        JSON reply contains a non-empty ``chunks`` entry.

        Args:
            queries: Iterable of query strings. Defaults to the module-level
                ``TEST_QUERIES`` when omitted.

        Returns:
            True when at least one query was successful, otherwise False.
        """
        print("\n=== Testing Search Queries ===")

        queries = list(TEST_QUERIES if queries is None else queries)
        headers = {
            **self._auth_headers(),
            "Content-Type": "application/json",
        }
        successful_searches = 0

        for query in queries:
            print(f"\n🔍 Testing query: '{query}'")
            try:
                payload = {
                    "query": query,
                    "top_k": 5,
                    "only_need_context": True,  # Only return context for search
                }
                response = self.session.post(
                    f"{self.base_url}/query",
                    json=payload,
                    headers=headers,
                )

                if response.status_code != 200:
                    print(f"❌ Search failed: {response.status_code} - {response.text}")
                    continue

                results = response.json()
                # The query endpoint returns different structure, check for chunks
                if results and 'chunks' in results and len(results['chunks']) > 0:
                    print(f"✅ Search successful: Found {len(results['chunks'])} chunks")
                    successful_searches += 1

                    # Show first chunk snippet
                    first_chunk = results['chunks'][0]
                    content_preview = first_chunk.get('text', '')[:200] + "..."
                    print(f" 📄 First chunk preview: {content_preview}")
                else:
                    print(f"⚠️ Search returned no results for: '{query}'")
                    print(f" Response: {results}")
            except Exception as e:
                print(f"❌ Search error for '{query}': {e}")

        print(f"\n📊 Search Summary: {successful_searches}/{len(queries)} queries successful")
        return successful_searches > 0

    def check_database_storage(self):
        """Verify that the server reports at least one processed document.

        Reads ``status_counts`` from ``/documents/status_counts``.

        Returns:
            True when the ``PROCESSED`` count is greater than zero; False
            when it is zero or missing, the status is not 200, or an
            exception occurs.
        """
        print("\n=== Checking Database Storage ===")

        try:
            # Check document status counts
            response = self.session.get(
                f"{self.base_url}/documents/status_counts",
                headers=self._auth_headers(),
            )

            if response.status_code != 200:
                print(f"❌ Could not get status counts: {response.status_code}")
                return False

            status_counts = response.json().get('status_counts', {})
            print(f"📊 Document Status Counts: {status_counts}")

            processed_count = status_counts.get('PROCESSED', 0)
            if processed_count > 0:
                print("✅ Data stored in databases")
                return True
            print("⚠️ No processed documents found")
            return False
        except Exception as e:
            print(f"❌ Database check error: {e}")
            return False

    def run_complete_test(self):
        """Run the complete OCR PDF workflow test.

        Runs login, health, web UI, upload, indexing, search and storage
        checks in that order and prints a summary table. A failed login
        aborts immediately; indexing is skipped (recorded as failed) when the
        upload failed, and search is skipped when indexing failed.

        Returns:
            True only when every step passed.
        """
        print("🚀 Starting Complete OCR PDF Web UI Workflow Test")
        print("=" * 60)

        test_results = {}

        # Step 1: Login first
        test_results['login'] = self.login()
        if not test_results['login']:
            print("❌ Login failed, cannot proceed with other tests")
            return False

        # Step 2: Test server health
        test_results['health'] = self.test_health()

        # Step 3: Test web UI accessibility
        test_results['webui'] = self.test_webui_accessibility()

        # Step 4: Upload OCR PDF
        test_results['upload'] = self.upload_ocr_pdf()

        # Step 5: Wait for indexing
        if test_results['upload']:
            test_results['indexing'] = self.wait_for_indexing()
        else:
            test_results['indexing'] = False

        # Step 6: Test search queries
        if test_results['indexing']:
            test_results['search'] = self.test_search_queries()
        else:
            test_results['search'] = False

        # Step 7: Check database storage
        test_results['storage'] = self.check_database_storage()

        # Summary
        print("\n" + "=" * 60)
        print("📋 TEST SUMMARY")
        print("=" * 60)

        for test_name, result in test_results.items():
            status = "✅ PASS" if result else "❌ FAIL"
            print(f"{test_name.upper():<12} : {status}")

        overall_success = all(test_results.values())

        if overall_success:
            print("\n🎉 ALL TESTS PASSED! OCR PDF workflow is working correctly.")
            print(" - Login successful")
            print(" - Upload successful")
            print(" - Indexing completed")
            print(" - Search functionality working")
            print(" - Data stored in databases")
        else:
            print("\n⚠️ SOME TESTS FAILED. Check the logs above for details.")

        return overall_success

def main():
    """Run the complete workflow test and exit with its result.

    The process exit status is 0 when every step passed and 1 otherwise.
    """
    test = LightRAGWebUITest()
    success = test.run_complete_test()

    # Exit with appropriate code. sys.exit is used instead of the builtin
    # exit(), which is injected by the site module for interactive use and is
    # not available when Python runs without it (e.g. `python -S`).
    sys.exit(0 if success else 1)

# Run the workflow only when executed as a script, not when imported.
if __name__ == "__main__":
    main()