#!/usr/bin/env python3
"""
Test OCR PDF upload, indexing, and search through Web UI simulation
This script simulates the complete web UI workflow for OCR PDF processing
"""

import json
import os
import sys
import time
from pathlib import Path

import requests

# Configuration
# Every value can be overridden through the environment; the fallbacks are the
# values this script has always used, so running it unchanged behaves as before.
# NOTE(review): the fallback credentials are committed in source. Consider
# removing them once every caller supplies WEBUI_TEST_USERNAME/PASSWORD.
BASE_URL = os.environ.get("WEBUI_TEST_BASE_URL", "http://localhost:3015")
USERNAME = os.environ.get("WEBUI_TEST_USERNAME", "jleu3482")
PASSWORD = os.environ.get("WEBUI_TEST_PASSWORD", "jleu1212")
OCR_PDF_PATH = os.environ.get("WEBUI_TEST_OCR_PDF", "ocr.pdf")

# (connect, read) timeout in seconds applied to every HTTP call. Without a
# timeout, requests waits indefinitely on an unresponsive server. The read
# timeout is generous because upload and query calls may be slow.
REQUEST_TIMEOUT = (10, 300)


class WebUITester:
    """Drive the login, health, upload, indexing and search steps over HTTP.

    Each step method prints its progress and reports success through its
    return value instead of raising, so ``run_complete_test`` can stop at the
    first failing step.
    """

    def __init__(self, base_url=BASE_URL, username=USERNAME, password=PASSWORD,
                 pdf_path=OCR_PDF_PATH, timeout=REQUEST_TIMEOUT):
        """Create a tester bound to one server and one PDF file.

        Args:
            base_url: Root URL of the server; a trailing slash is ignored.
            username: Login user name sent to ``/login``.
            password: Login password sent to ``/login``.
            pdf_path: Path of the PDF file to upload.
            timeout: Timeout passed to every ``requests`` call.
        """
        self.base_url = base_url.rstrip("/")
        self.username = username
        self.password = password
        self.pdf_path = pdf_path
        self.timeout = timeout
        self.session = requests.Session()
        self.access_token = None

    def _auth_headers(self, extra=None):
        """Return request headers, adding the bearer token only if one is set."""
        headers = dict(extra or {})
        # Sending the literal string "Bearer None" is never useful, so the
        # header is simply left out when no token was obtained.
        if self.access_token:
            headers["Authorization"] = f"Bearer {self.access_token}"
        return headers

    @staticmethod
    def _status_count(status_counts, name):
        """Sum the counts whose status key equals *name*, ignoring case."""
        # NOTE(review): the casing of the server's status keys is not visible
        # from this script, so both casings are accepted.
        wanted = name.upper()
        return sum(
            count for key, count in status_counts.items()
            if str(key).upper() == wanted
        )

    def login(self):
        """Login to get JWT token.

        Returns:
            True when the server answered 200 to ``/login``, False otherwise.
            The token from the response (if any) is stored on the instance.
        """
        print("🔐 Logging in to Web UI...")

        login_data = {
            "username": self.username,
            "password": self.password,
        }

        try:
            response = self.session.post(
                f"{self.base_url}/login",
                data=login_data,
                timeout=self.timeout,
            )
            if response.status_code != 200:
                print(f"❌ Login failed: {response.status_code} - {response.text}")
                return False

            result = response.json()
            self.access_token = result.get("access_token")
            print("✅ Login successful")
            print(f" Auth Mode: {result.get('auth_mode', 'unknown')}")
            if not self.access_token:
                print(" ⚠️ No access token in login response; "
                      "continuing without Authorization header")
            return True
        except Exception as e:
            # Step boundary: any failure is reported and turned into False.
            print(f"❌ Login error: {e}")
            return False

    def check_server_health(self):
        """Check server health.

        Returns:
            True when ``/health`` answers 200, False otherwise.
        """
        print("\n🏥 Checking server health...")

        try:
            response = self.session.get(
                f"{self.base_url}/health",
                headers=self._auth_headers(),
                timeout=self.timeout,
            )
            if response.status_code != 200:
                print(f"❌ Health check failed: {response.status_code}")
                return False

            health_data = response.json()
            # The configuration values are only displayed; a missing key must
            # not turn a healthy response into a failed check.
            configuration = health_data.get('configuration') or {}
            print("✅ Server is healthy")
            print(f" LLM: {configuration.get('llm_binding', 'unknown')}")
            print(f" Embedding: {configuration.get('embedding_binding', 'unknown')}")
            print(f" Rerank: {configuration.get('rerank_binding', 'unknown')}")
            return True
        except Exception as e:
            print(f"❌ Health check error: {e}")
            return False

    def upload_ocr_pdf(self):
        """Upload OCR PDF through web UI API.

        Returns:
            True when ``/documents/upload`` answers 200, False when the file is
            missing, the request fails or the server answers another status.
        """
        print(f"\n📤 Uploading OCR PDF: {self.pdf_path}")

        pdf = Path(self.pdf_path)
        if not pdf.is_file():
            print(f"❌ OCR PDF file not found: {self.pdf_path}")
            return False

        try:
            with pdf.open('rb') as file:
                # Only the base name is sent as the upload file name.
                files = {'file': (pdf.name, file, 'application/pdf')}
                response = self.session.post(
                    f"{self.base_url}/documents/upload",
                    files=files,
                    headers=self._auth_headers(),
                    timeout=self.timeout,
                )

            if response.status_code != 200:
                print(f"❌ Upload failed: {response.status_code} - {response.text}")
                return False

            result = response.json()
            print("✅ Upload successful")
            print(f" Status: {result.get('status', 'unknown')}")
            print(f" Message: {result.get('message', 'No message')}")
            if result.get('track_id'):
                print(f" Track ID: {result.get('track_id')}")
            return True
        except Exception as e:
            print(f"❌ Upload error: {e}")
            return False

    def wait_for_indexing(self, max_wait=180, poll_interval=1.0):
        """Wait for document indexing to complete.

        Polls ``/health`` until its ``pipeline_busy`` field is falsy or absent.

        Args:
            max_wait: Upper bound, in seconds of wall-clock time, to keep polling.
            poll_interval: Seconds to sleep between two polls.

        Returns:
            True once the pipeline is reported as not busy, False on a request
            error or when ``max_wait`` elapses first.
        """
        print(f"\n⏳ Waiting for indexing (max {max_wait}s)...")

        # NOTE(review): a pipeline that has not started yet would also report
        # "not busy"; confirm the server is already busy when upload returns.
        started = time.monotonic()
        polls = 0
        while True:
            try:
                # Check pipeline status
                response = self.session.get(
                    f"{self.base_url}/health",
                    headers=self._auth_headers(),
                    timeout=self.timeout,
                )
                if response.status_code == 200:
                    health_data = response.json()
                    if not health_data.get('pipeline_busy', False):
                        print("✅ Indexing completed!")
                        return True
            except Exception as e:
                print(f"❌ Error checking indexing status: {e}")
                return False

            # Measure real elapsed time: the request itself takes time, so
            # counting loop iterations would overshoot max_wait.
            elapsed = time.monotonic() - started
            if elapsed >= max_wait:
                break

            if polls % 10 == 0:  # Print status every 10 polls
                print(f" Still indexing... ({int(elapsed)}s)")
            polls += 1

            time.sleep(max(0.0, min(poll_interval, max_wait - elapsed)))

        print("❌ Indexing timeout reached")
        return False

    def check_document_status(self):
        """Check document processing status.

        Returns:
            The ``status_counts`` mapping from ``/documents/status_counts``
            (empty when the field is absent), or None on any failure.
        """
        print("\n📊 Checking document status...")

        try:
            response = self.session.get(
                f"{self.base_url}/documents/status_counts",
                headers=self._auth_headers(),
                timeout=self.timeout,
            )
            if response.status_code != 200:
                print(f"❌ Failed to get status: {response.status_code}")
                return None

            status_counts = response.json().get('status_counts', {})
            print("📈 Document Status Counts:")
            for status, count in status_counts.items():
                print(f" {status}: {count}")
            return status_counts
        except Exception as e:
            print(f"❌ Error checking document status: {e}")
            return None

    def search_ocr_content(self):
        """Search for OCR content using web UI search.

        Posts a fixed list of queries to ``/query`` and counts a query as
        successful when the answer carries a non-empty ``chunks`` list, or a
        non-empty ``response`` that does not contain ``[no-context]``.

        Returns:
            True when at least one query was successful.
        """
        print("\n🔍 Testing search functionality...")

        test_queries = [
            "LightRAG",
            "OCR technology",
            "document processing",
            "text extraction",
            "Retrieval-Augmented Generation",
        ]

        headers = self._auth_headers({"Content-Type": "application/json"})

        successful_searches = 0

        for query in test_queries:
            print(f"\n Testing query: '{query}'")

            try:
                payload = {
                    "query": query,
                    "top_k": 5,
                    "only_need_context": True,
                }
                response = self.session.post(
                    f"{self.base_url}/query",
                    json=payload,
                    headers=headers,
                    timeout=self.timeout,
                )

                if response.status_code != 200:
                    print(f" ❌ Search failed: {response.status_code} - {response.text}")
                    continue

                results = response.json()
                chunks = results.get('chunks')

                # Check if we got actual content
                if chunks:
                    print(f" ✅ Found {len(chunks)} results")
                    successful_searches += 1

                    # Show first result preview
                    first_chunk = chunks[0]
                    content = (first_chunk.get('text') or '')[:150] + "..."
                    score = first_chunk.get('score', 0)
                    # A missing or non-numeric score must not abort the query
                    # after it has already been counted as successful.
                    if isinstance(score, (int, float)):
                        score_text = f"{score:.3f}"
                    else:
                        score_text = str(score)
                    print(f" 📄 Preview: {content}")
                    print(f" 📊 Score: {score_text}")
                elif 'response' in results:
                    # Check if LLM responded with content
                    response_text = results['response']
                    if response_text and "[no-context]" not in response_text:
                        print(" ✅ LLM generated response")
                        successful_searches += 1
                        print(f" 🤖 Response: {response_text[:150]}...")
                    else:
                        print(" ⚠️ No context found for query")
                else:
                    print(" ⚠️ No results found")
            except Exception as e:
                print(f" ❌ Search error: {e}")

        print(f"\n📊 Search Summary: {successful_searches}/{len(test_queries)} queries successful")
        return successful_searches > 0

    def run_complete_test(self):
        """Run complete Web UI OCR workflow test.

        Returns:
            True only when every step succeeded, at least one document is
            reported as processed and at least one search returned content.
        """
        print("🚀 Starting Complete Web UI OCR PDF Workflow Test")
        print("=" * 70)

        # Steps 1-4 are prerequisites for each other: stop at the first failure.
        prerequisite_steps = (
            self.login,                # Step 1: Login
            self.check_server_health,  # Step 2: Check server health
            self.upload_ocr_pdf,       # Step 3: Upload OCR PDF
            self.wait_for_indexing,    # Step 4: Wait for indexing
        )
        for step in prerequisite_steps:
            if not step():
                return False

        # Step 5: Check document status
        status_counts = self.check_document_status()

        # Step 6: Test search
        search_success = self.search_ocr_content()

        # Summary
        print("\n" + "=" * 70)
        print("🎯 WEB UI OCR WORKFLOW TEST SUMMARY")
        print("=" * 70)

        if not status_counts:
            print("❌ FAILED: Could not complete workflow")
            return False

        processed = self._status_count(status_counts, 'PROCESSED')
        failed = self._status_count(status_counts, 'FAILED')

        print(f"📊 Documents: {processed} processed, {failed} failed")

        if processed > 0 and search_success:
            print("✅ SUCCESS: OCR PDF workflow completed successfully!")
            print(" - Upload successful")
            print(" - Indexing completed")
            print(" - Search returning results")
            return True

        print("⚠️ PARTIAL SUCCESS: Some steps completed but issues detected")
        return False


def main():
    """Run the workflow test and exit with status 0 on success, 1 on failure."""
    # The progress messages contain emoji; on a console whose encoding cannot
    # represent them, substitute a placeholder instead of crashing.
    reconfigure = getattr(sys.stdout, "reconfigure", None)
    if callable(reconfigure):
        reconfigure(errors="replace")

    tester = WebUITester()
    try:
        success = tester.run_complete_test()
    finally:
        # Release the pooled HTTP connections held by the session.
        tester.session.close()

    if success:
        print("\n🎉 OCR PDF Web UI workflow test PASSED!")
        sys.exit(0)
    else:
        print("\n💥 OCR PDF Web UI workflow test FAILED!")
        sys.exit(1)


if __name__ == "__main__":
    main()