#!/usr/bin/env python3
"""
Direct OCR PDF upload, indexing, and search test.

This test bypasses authentication issues and tests the core OCR
functionality: it uploads a PDF, waits for the server's pipeline to go idle,
and then issues a handful of search queries.
"""

import requests
import time
import json
import sys
from pathlib import Path
from typing import Optional

# Configuration
BASE_URL = "http://localhost:3015"
OCR_PDF_PATH = "ocr.pdf"

# Seconds to wait for a response. ``requests`` has no default timeout, so
# without these a stalled server would block the test forever.
REQUEST_TIMEOUT = 10
LONG_REQUEST_TIMEOUT = 120  # uploads and queries may take noticeably longer


class OCRWorkflowTester:
    """Drive the upload -> processing -> search workflow against one server.

    Every step prints its progress and reports success as a return value
    instead of raising, so the workflow can continue and summarise failures.
    """

    def __init__(
        self,
        base_url: str = BASE_URL,
        pdf_path: str = OCR_PDF_PATH,
        timeout: float = REQUEST_TIMEOUT,
        long_timeout: float = LONG_REQUEST_TIMEOUT,
    ) -> None:
        """Create a tester.

        Args:
            base_url: Root URL of the server under test.
            pdf_path: Path of the PDF file to upload.
            timeout: Timeout in seconds for lightweight GET requests.
            long_timeout: Timeout in seconds for upload and query requests.
        """
        self.base_url = base_url
        self.pdf_path = pdf_path
        self.timeout = timeout
        self.long_timeout = long_timeout
        self.session = requests.Session()

    @staticmethod
    def _print_response_body(response, json_label: str) -> None:
        """Print the response body as JSON under *json_label*, else as text."""
        try:
            body = response.json()
        except ValueError:
            # Response.json() signals a non-JSON body with a ValueError
            # subclass; fall back to the raw text.
            print(f" Response: {response.text}")
        else:
            print(f" {json_label}: {body}")

    def check_server_status(self) -> bool:
        """Check if server is accessible.

        Returns:
            True when ``GET /`` answers with status 200 or 307, else False.
        """
        print("🔍 Checking server accessibility...")
        try:
            response = self.session.get(
                f"{self.base_url}/", timeout=self.timeout
            )
            if response.status_code in (200, 307):  # 307 for redirect
                print("✅ Server is accessible")
                return True
            print(f"❌ Server returned status: {response.status_code}")
            return False
        except Exception as e:
            print(f"❌ Cannot connect to server: {e}")
            return False

    def check_health_no_auth(self) -> Optional[dict]:
        """Try to check health without authentication.

        Returns:
            The parsed ``/health`` JSON payload on status 200, otherwise None.
        """
        print("\n🏥 Checking server health (no auth)...")
        try:
            response = self.session.get(
                f"{self.base_url}/health", timeout=self.timeout
            )
            if response.status_code == 200:
                health_data = response.json()
                print("✅ Server is healthy")
                print(f" Status: {health_data.get('status', 'unknown')}")
                print(f" Auth Mode: {health_data.get('auth_mode', 'unknown')}")
                return health_data
            print(f"⚠️ Health check returned: {response.status_code}")
            # Try to parse anyway
            self._print_response_body(response, "Response")
            return None
        except Exception as e:
            print(f"❌ Health check error: {e}")
            return None

    def upload_ocr_pdf_direct(self) -> bool:
        """Try to upload OCR PDF without authentication.

        Returns:
            True when the upload endpoint answers with status 200; False when
            the file is missing, the request fails, or any other status is
            returned.
        """
        print(f"\n📤 Attempting to upload OCR PDF: {self.pdf_path}")

        pdf = Path(self.pdf_path)
        if not pdf.exists():
            print(f"❌ OCR PDF file not found: {self.pdf_path}")
            return False

        try:
            with pdf.open('rb') as file:
                # Send only the base name so a path with directories does not
                # leak into the multipart filename.
                files = {'file': (pdf.name, file, 'application/pdf')}
                response = self.session.post(
                    f"{self.base_url}/documents/upload",
                    files=files,
                    timeout=self.long_timeout,
                )

            if response.status_code == 200:
                result = response.json()
                print("✅ Upload successful")
                print(f" Status: {result.get('status', 'unknown')}")
                print(f" Message: {result.get('message', 'No message')}")
                return True
            print(f"❌ Upload failed: {response.status_code}")
            self._print_response_body(response, "Error")
            return False
        except Exception as e:
            print(f"❌ Upload error: {e}")
            return False

    def monitor_processing(self, max_wait=120, poll_interval=1.0) -> bool:
        """Monitor document processing status.

        Polls ``/health`` until its ``pipeline_busy`` field is falsy.

        Args:
            max_wait: Maximum number of seconds to wait, measured on a
                monotonic clock so slow responses count against the budget.
            poll_interval: Seconds to sleep between polls.

        Returns:
            True once the pipeline reports idle; False on timeout or when a
            poll raises.
        """
        print(f"\n⏳ Monitoring processing (max {max_wait}s)...")

        # NOTE(review): an idle pipeline is taken as "done"; if the server has
        # not yet started processing the upload this may report completion
        # too early - confirm against the server's behaviour.
        started = time.monotonic()
        next_report = 0.0
        while True:
            elapsed = time.monotonic() - started
            if elapsed >= max_wait:
                break
            try:
                # Try to get health status to check pipeline
                response = self.session.get(
                    f"{self.base_url}/health", timeout=self.timeout
                )
                if response.status_code == 200:
                    health_data = response.json()
                    if not health_data.get('pipeline_busy', False):
                        print("✅ Processing appears complete")
                        return True
            except Exception as e:
                print(f"❌ Error monitoring processing: {e}")
                return False

            if elapsed >= next_report:  # Print status roughly every 10 seconds
                print(f" Still processing... ({int(elapsed)}s)")
                next_report = elapsed + 10

            # Never sleep past the deadline.
            remaining = max_wait - (time.monotonic() - started)
            time.sleep(max(0.0, min(poll_interval, remaining)))

        print("⚠️ Processing timeout reached")
        return False

    def test_search_without_auth(self) -> bool:
        """Test search functionality without authentication.

        Returns:
            True as soon as one query yields chunks or an LLM response that
            does not contain ``[no-context]``; False if no query does.
        """
        print("\n🔍 Testing search without authentication...")

        test_queries = [
            "LightRAG",
            "OCR technology",
            "document processing",
            "text extraction",
            "Retrieval-Augmented Generation",
        ]

        headers = {"Content-Type": "application/json"}

        for query in test_queries:
            print(f"\n Testing query: '{query}'")
            try:
                payload = {
                    "query": query,
                    "top_k": 5,
                    "only_need_context": True,
                }
                response = self.session.post(
                    f"{self.base_url}/query",
                    json=payload,
                    headers=headers,
                    timeout=self.long_timeout,
                )

                if response.status_code != 200:
                    print(f" ❌ Search failed: {response.status_code}")
                    self._print_response_body(response, "Error")
                    continue

                results = response.json()

                # Check if we got actual content
                if 'chunks' in results and len(results['chunks']) > 0:
                    chunks = results['chunks']
                    print(f" ✅ Found {len(chunks)} results")
                    # Show first result preview
                    first_chunk = chunks[0]
                    content = first_chunk.get('text', '')[:150] + "..."
                    score = first_chunk.get('score', 0)
                    print(f" 📄 Preview: {content}")
                    print(f" 📊 Score: {score:.3f}")
                    return True  # At least one successful search
                elif 'response' in results:
                    # Check if LLM responded with content
                    response_text = results['response']
                    if "[no-context]" not in response_text:
                        print(" ✅ LLM generated response")
                        print(f" 🤖 Response: {response_text[:150]}...")
                        return True
                    print(" ⚠️ No context found for query")
                else:
                    print(" ⚠️ No results found")
            except Exception as e:
                # Best effort: one failing query must not stop the others.
                print(f" ❌ Search error: {e}")

        return False

    def check_webui_access(self) -> bool:
        """Check if Web UI is accessible.

        Returns:
            True when ``GET /webui/`` answers with status 200, else False.
        """
        print("\n🌐 Checking Web UI accessibility...")
        try:
            response = self.session.get(
                f"{self.base_url}/webui/", timeout=self.timeout
            )
            if response.status_code == 200:
                print("✅ Web UI is accessible")
                return True
            print(f"⚠️ Web UI returned: {response.status_code}")
            return False
        except Exception as e:
            print(f"❌ Web UI access error: {e}")
            return False

    def run_complete_test(self) -> bool:
        """Run complete OCR workflow test.

        Returns:
            True only when both the upload and at least one search succeed.
        """
        print("🚀 Starting Direct OCR PDF Workflow Test")
        print("=" * 60)

        # Step 1: Check server accessibility
        if not self.check_server_status():
            return False

        # Step 2: Check Web UI (informational only)
        self.check_webui_access()

        # Step 3: Check health (may fail due to auth; informational only)
        self.check_health_no_auth()

        # Step 4: Upload OCR PDF
        upload_success = self.upload_ocr_pdf_direct()

        # Step 5: Monitor processing (skipped when nothing was uploaded)
        processing_success = (
            self.monitor_processing() if upload_success else False
        )

        # Step 6: Test search
        search_success = self.test_search_without_auth()

        # Summary
        print("\n" + "=" * 60)
        print("🎯 DIRECT OCR WORKFLOW TEST SUMMARY")
        print("=" * 60)

        print(f"📊 Upload: {'✅ Success' if upload_success else '❌ Failed'}")
        print(f"📊 Processing: {'✅ Complete' if processing_success else '❌ Failed/Timeout'}")
        print(f"📊 Search: {'✅ Working' if search_success else '❌ No results'}")

        if upload_success and search_success:
            print("\n✅ SUCCESS: OCR PDF workflow is functional!")
            print(" - Upload successful")
            print(" - Search returning results")
            return True
        if upload_success:
            print("\n⚠️ PARTIAL SUCCESS: Upload worked but search issues")
            return False
        print("\n❌ FAILED: Could not complete workflow")
        return False


def main():
    """Run the workflow test and exit with 0 on success, 1 otherwise."""
    tester = OCRWorkflowTester()
    try:
        success = tester.run_complete_test()
    finally:
        # Release the pooled connections held by the session.
        tester.session.close()

    if success:
        print("\n🎉 OCR PDF direct workflow test PASSED!")
        print("\n📋 Next steps:")
        print(f" 1. Access Web UI at: {BASE_URL}/webui/")
        # NOTE(review): plaintext credentials are hard-coded and printed
        # here; consider moving them to configuration and rotating them.
        print(" 2. Use credentials: jleu3482 / jleu1212")
        print(" 3. Upload documents and test search")
        sys.exit(0)
    else:
        print("\n💥 OCR PDF direct workflow test had issues.")
        print("\n🔧 Troubleshooting:")
        print(" - Check server authentication configuration")
        print(" - Verify .env file settings")
        print(" - Check database connections")
        sys.exit(1)


if __name__ == "__main__":
    main()