"""
Complete OCR PDF Upload, Indexing, and Search Test for LightRAG Web UI.

Tests the entire workflow: upload ocr.pdf → indexing → search functionality.
Uses correct API endpoints based on server structure.

The server URL, credentials and PDF path default to the values below and can
be overridden with the environment variables LIGHTRAG_BASE_URL,
LIGHTRAG_USERNAME, LIGHTRAG_PASSWORD and LIGHTRAG_OCR_PDF_PATH.
"""

import json
import os
import sys
import time
from pathlib import Path

import requests

# Configuration (environment variables take precedence over the defaults)
BASE_URL = os.environ.get("LIGHTRAG_BASE_URL", "http://localhost:3015")
USERNAME = os.environ.get("LIGHTRAG_USERNAME", "jleu3482")
PASSWORD = os.environ.get("LIGHTRAG_PASSWORD", "jleu1212")
OCR_PDF_PATH = os.environ.get("LIGHTRAG_OCR_PDF_PATH", "ocr.pdf")
TEST_QUERIES = [
    "LightRAG",
    "OCR",
    "document processing",
    "text extraction"
]

# Seconds to wait for any single HTTP request; without a timeout `requests`
# blocks indefinitely when the server stops responding.
REQUEST_TIMEOUT = 60


class LightRAGWebUITest:
    """Drive the login → upload → indexing → search workflow over HTTP.

    Every step prints its progress and returns a bool instead of raising, so
    `run_complete_test` can always print a full summary.
    """

    def __init__(self):
        self.session = requests.Session()
        self.base_url = BASE_URL
        self.username = USERNAME
        self.password = PASSWORD
        self.access_token = None  # set by login()

    def _auth_headers(self) -> dict:
        """Return the Authorization header built from the current token."""
        return {"Authorization": f"Bearer {self.access_token}"}

    def login(self) -> bool:
        """Login and get JWT token.

        Returns:
            True when the server answered 200 and the JSON body contained a
            non-empty "access_token" (stored on ``self.access_token``).
        """
        print("=== Logging In ===")
        try:
            # Use form data for OAuth2 password flow
            form_data = {
                "username": self.username,
                "password": self.password
            }
            headers = {
                "Content-Type": "application/x-www-form-urlencoded"
            }

            response = self.session.post(
                f"{self.base_url}/login",
                data=form_data,
                headers=headers,
                timeout=REQUEST_TIMEOUT
            )

            if response.status_code != 200:
                print(f"❌ Login failed: {response.status_code} - {response.text}")
                return False

            self.access_token = response.json().get("access_token")
            if self.access_token:
                print("✅ Login successful")
                return True

            print("❌ Login failed: No access token received")
            return False
        except Exception as e:
            # Best-effort step: report and let the caller decide what to do.
            print(f"❌ Login error: {e}")
            return False

    def test_health(self) -> bool:
        """Test server health and print the bindings it reports.

        Returns:
            True when GET /health answered 200.
        """
        print("=== Testing Server Health ===")
        try:
            response = self.session.get(
                f"{self.base_url}/health",
                headers=self._auth_headers(),
                timeout=REQUEST_TIMEOUT
            )
            if response.status_code != 200:
                print(f"❌ Server health check failed: {response.status_code}")
                return False

            configuration = response.json().get('configuration', {})
            print("✅ Server is healthy")
            print(f" LLM Binding: {configuration.get('llm_binding', 'N/A')}")
            print(f" Embedding Binding: {configuration.get('embedding_binding', 'N/A')}")
            print(f" Rerank Binding: {configuration.get('rerank_binding', 'N/A')}")
            return True
        except Exception as e:
            print(f"❌ Server health check error: {e}")
            return False

    def test_webui_accessibility(self) -> bool:
        """Test web UI accessibility.

        Returns:
            True when GET /webui/ answered 200.
        """
        print("\n=== Testing Web UI Accessibility ===")
        try:
            # Test web UI access
            response = self.session.get(
                f"{self.base_url}/webui/",
                timeout=REQUEST_TIMEOUT
            )
            if response.status_code == 200:
                print("✅ Web UI accessible")
                return True

            print(f"❌ Web UI access failed: {response.status_code}")
            return False
        except Exception as e:
            print(f"❌ Web UI access error: {e}")
            return False

    def upload_ocr_pdf(self) -> bool:
        """Upload the file at OCR_PDF_PATH to /documents/upload.

        Returns:
            True when the upload request answered 200; False when the file is
            missing, the server rejected it, or the request failed.
        """
        print("\n=== Uploading OCR PDF ===")

        if not os.path.exists(OCR_PDF_PATH):
            print(f"❌ OCR PDF file not found: {OCR_PDF_PATH}")
            return False

        try:
            # The context manager guarantees the handle is closed even when
            # the request raises.
            with open(OCR_PDF_PATH, 'rb') as pdf_file:
                # Prepare file for upload
                files = {
                    'file': (os.path.basename(OCR_PDF_PATH), pdf_file, 'application/pdf')
                }

                print(f"📤 Uploading {OCR_PDF_PATH}...")
                response = self.session.post(
                    f"{self.base_url}/documents/upload",
                    files=files,
                    headers=self._auth_headers(),
                    timeout=REQUEST_TIMEOUT
                )

            if response.status_code == 200:
                result = response.json()
                print(f"✅ Upload successful: {result}")
                return True

            print(f"❌ Upload failed: {response.status_code} - {response.text}")
            return False
        except Exception as e:
            print(f"❌ Upload error: {e}")
            return False

    def wait_for_indexing(self, timeout=120) -> bool:
        """Wait for document indexing to complete.

        Polls the pipeline status and the document list every 5 seconds.

        Args:
            timeout: Maximum number of seconds to keep polling.

        Returns:
            True as soon as the pipeline reports not busy, or the document
            list shows at least one processed document and nothing pending or
            processing; False when ``timeout`` elapses first.
        """
        print(f"\n=== Waiting for Indexing (max {timeout}s) ===")

        headers = self._auth_headers()
        start_time = time.time()

        while time.time() - start_time < timeout:
            try:
                # Check pipeline status
                response = self.session.get(
                    f"{self.base_url}/documents/pipeline_status",
                    headers=headers,
                    timeout=REQUEST_TIMEOUT
                )
                if response.status_code == 200:
                    pipeline_status = response.json()
                    busy = pipeline_status.get('busy', False)
                    latest_message = pipeline_status.get('latest_message', '')

                    print(f"🔄 Pipeline status: busy={busy}, message='{latest_message}'")

                    # NOTE(review): an idle pipeline is treated as "done"; if
                    # the server has not started processing yet this may
                    # return early — confirm against the server behaviour.
                    if not busy:
                        print("✅ Pipeline processing completed!")
                        return True

                # Check document status
                response = self.session.get(
                    f"{self.base_url}/documents",
                    headers=headers,
                    timeout=REQUEST_TIMEOUT
                )
                if response.status_code == 200:
                    statuses = response.json().get('statuses', {})

                    processed_count = len(statuses.get('PROCESSED', []))
                    pending_count = len(statuses.get('PENDING', []))
                    processing_count = len(statuses.get('PROCESSING', []))

                    print(f"📊 Documents: {processed_count} processed, {pending_count} pending, {processing_count} processing")

                    if pending_count == 0 and processing_count == 0 and processed_count > 0:
                        print("✅ All documents processed!")
                        return True

                time.sleep(5)  # Wait 5 seconds between checks

            except Exception as e:
                # A failed poll is not fatal; retry until the deadline.
                print(f"⚠️ Error checking indexing status: {e}")
                time.sleep(5)

        print("⏰ Indexing timeout reached")
        return False

    def test_search_queries(self) -> bool:
        """Test search functionality with OCR content using query endpoint.

        Posts every entry of TEST_QUERIES to /query and counts the responses
        that contain a non-empty 'chunks' entry.

        Returns:
            True when at least one query produced chunks.
        """
        print("\n=== Testing Search Queries ===")

        headers = {
            "Authorization": f"Bearer {self.access_token}",
            "Content-Type": "application/json"
        }

        successful_searches = 0

        for query in TEST_QUERIES:
            print(f"\n🔍 Testing query: '{query}'")

            try:
                payload = {
                    "query": query,
                    "top_k": 5,
                    "only_need_context": True  # Only return context for search
                }

                response = self.session.post(
                    f"{self.base_url}/query",
                    json=payload,
                    headers=headers,
                    timeout=REQUEST_TIMEOUT
                )

                if response.status_code != 200:
                    print(f"❌ Search failed: {response.status_code} - {response.text}")
                    continue

                results = response.json()
                # The query endpoint returns different structure, check for chunks
                if results and 'chunks' in results and results['chunks']:
                    chunks = results['chunks']
                    print(f"✅ Search successful: Found {len(chunks)} chunks")
                    successful_searches += 1

                    # Show first chunk snippet
                    content_preview = chunks[0].get('text', '')[:200] + "..."
                    print(f" 📄 First chunk preview: {content_preview}")
                else:
                    print(f"⚠️ Search returned no results for: '{query}'")
                    print(f" Response: {results}")

            except Exception as e:
                print(f"❌ Search error for '{query}': {e}")

        print(f"\n📊 Search Summary: {successful_searches}/{len(TEST_QUERIES)} queries successful")
        return successful_searches > 0

    def check_database_storage(self) -> bool:
        """Verify that the server reports at least one processed document.

        Returns:
            True when /documents/status_counts answered 200 and its
            'status_counts' entry has a positive 'PROCESSED' value.
        """
        print("\n=== Checking Database Storage ===")

        try:
            # Check document status counts
            response = self.session.get(
                f"{self.base_url}/documents/status_counts",
                headers=self._auth_headers(),
                timeout=REQUEST_TIMEOUT
            )
            if response.status_code != 200:
                print(f"❌ Could not get status counts: {response.status_code}")
                return False

            status_counts = response.json().get('status_counts', {})
            print(f"📊 Document Status Counts: {status_counts}")

            if status_counts.get('PROCESSED', 0) > 0:
                print("✅ Data stored in databases")
                return True

            print("⚠️ No processed documents found")
            return False

        except Exception as e:
            print(f"❌ Database check error: {e}")
            return False

    def run_complete_test(self) -> bool:
        """Run the complete OCR PDF workflow test.

        Steps that depend on an earlier step (indexing on upload, search on
        indexing) are recorded as failed without running when their
        prerequisite failed. A failed login aborts the run immediately.

        Returns:
            True only when every step passed.
        """
        print("🚀 Starting Complete OCR PDF Web UI Workflow Test")
        print("=" * 60)

        test_results = {}

        # Step 1: Login first
        test_results['login'] = self.login()
        if not test_results['login']:
            print("❌ Login failed, cannot proceed with other tests")
            return False

        # Step 2: Test server health
        test_results['health'] = self.test_health()

        # Step 3: Test web UI accessibility
        test_results['webui'] = self.test_webui_accessibility()

        # Step 4: Upload OCR PDF
        test_results['upload'] = self.upload_ocr_pdf()

        # Step 5: Wait for indexing
        if test_results['upload']:
            test_results['indexing'] = self.wait_for_indexing()
        else:
            test_results['indexing'] = False

        # Step 6: Test search queries
        if test_results['indexing']:
            test_results['search'] = self.test_search_queries()
        else:
            test_results['search'] = False

        # Step 7: Check database storage
        test_results['storage'] = self.check_database_storage()

        # Summary
        print("\n" + "=" * 60)
        print("📋 TEST SUMMARY")
        print("=" * 60)

        for test_name, result in test_results.items():
            status = "✅ PASS" if result else "❌ FAIL"
            print(f"{test_name.upper():<12} : {status}")

        overall_success = all(test_results.values())

        if overall_success:
            print("\n🎉 ALL TESTS PASSED! OCR PDF workflow is working correctly.")
            print(" - Login successful")
            print(" - Upload successful")
            print(" - Indexing completed")
            print(" - Search functionality working")
            print(" - Data stored in databases")
        else:
            print("\n⚠️ SOME TESTS FAILED. Check the logs above for details.")

        return overall_success


def main():
    """Main test execution: run the workflow and exit 0 on success, 1 otherwise."""
    test = LightRAGWebUITest()
    success = test.run_complete_test()

    # Exit with appropriate code; sys.exit is used because the bare `exit`
    # helper is only injected by the `site` module.
    sys.exit(0 if success else 1)


if __name__ == "__main__":
    main()