"""End-to-end smoke test for an OCR PDF upload / indexing / search service.

The script talks to an HTTP server (default ``http://localhost:3015``) and
walks through: health check -> PDF upload -> polling the document list until
indexing finishes -> a handful of search queries -> a final status check.
All progress is reported through the root ``logging`` logger.
"""

import base64
import json
import logging
import os
import time
from pathlib import Path

import requests

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='[%(asctime)s] [%(levelname)s] %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)


class OCRWorkflowTester:
    """Drive the upload / index / search workflow against one server.

    Every public method logs its outcome and reports success as a return
    value instead of raising, so the caller can chain the steps with plain
    ``if`` statements.
    """

    def __init__(self, base_url="http://localhost:3015", username="jleu3482",
                 password="jleu1212", timeout=30):
        """Store connection settings and prepare an authenticated session.

        Args:
            base_url: Root URL of the server, without a trailing slash.
            username: User name for HTTP Basic authentication.
            password: Password for HTTP Basic authentication.
            timeout: Seconds passed as ``timeout`` to every HTTP request so
                an unresponsive server cannot block the script forever.
        """
        self.base_url = base_url
        self.username = username
        self.password = password
        self.timeout = timeout
        self.session = requests.Session()
        self._setup_auth()

    def __enter__(self):
        """Return ``self`` so the tester can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the HTTP session; exceptions are not suppressed."""
        self.close()
        return False

    def close(self):
        """Close the underlying ``requests.Session``."""
        self.session.close()

    def _setup_auth(self):
        """Install Basic-auth and JSON content-type headers on the session."""
        credentials = f"{self.username}:{self.password}"
        encoded_credentials = base64.b64encode(credentials.encode()).decode()
        self.session.headers.update({
            "Authorization": f"Basic {encoded_credentials}",
            "Content-Type": "application/json"
        })

    def check_server_status(self):
        """Return True if ``GET {base_url}/health`` answers with HTTP 200.

        Any other status code, or any exception raised while making the
        request, is logged and reported as False.
        """
        logging.info("🔍 Checking server status...")
        try:
            response = self.session.get(
                f"{self.base_url}/health", timeout=self.timeout
            )
        except Exception as e:
            logging.error(f"❌ Server connection failed: {e}")
            return False

        if response.status_code == 200:
            logging.info("✅ Server is running and accessible")
            return True

        logging.error(f"❌ Server returned status: {response.status_code}")
        return False

    def upload_ocr_pdf(self, file_path="ocr.pdf"):
        """Upload a PDF to ``{base_url}/documents/upload`` as multipart data.

        Args:
            file_path: Path of the PDF to send; it is also used verbatim as
                the multipart file name.

        Returns:
            True when the server answers HTTP 200 with a JSON body, False
            when the file is missing, the status is not 200, or any
            exception occurs (all cases are logged).
        """
        logging.info(f"📤 Uploading OCR PDF: {file_path}")

        if not Path(file_path).exists():
            logging.error(f"❌ File not found: {file_path}")
            return False

        try:
            with open(file_path, 'rb') as f:
                files = {'file': (file_path, f, 'application/pdf')}
                # The upload deliberately bypasses self.session: the
                # session's default JSON Content-Type header would otherwise
                # take precedence over the multipart one that requests
                # generates for ``files=``. Credentials go through ``auth=``.
                response = requests.post(
                    f"{self.base_url}/documents/upload",
                    files=files,
                    auth=(self.username, self.password),
                    timeout=self.timeout,
                )

            if response.status_code == 200:
                logging.info("✅ Upload successful")
                result = response.json()
                logging.info(f"📊 Upload result: {json.dumps(result, indent=2)}")
                return True

            logging.error(f"❌ Upload failed: {response.status_code} - {response.text}")
            return False
        except Exception as e:
            logging.error(f"❌ Upload error: {e}")
            return False

    def monitor_indexing(self, max_wait=60, poll_interval=5):
        """Poll ``{base_url}/documents`` until the first document is indexed.

        Only the first entry of the returned document list is inspected.

        Args:
            max_wait: Approximate polling budget in seconds; it determines
                the number of attempts (``max_wait // poll_interval``, but
                never fewer than one).
            poll_interval: Seconds to sleep between attempts; must be
                positive.

        Returns:
            True as soon as the document reports status ``'completed'``;
            False if it reports ``'failed'`` or the attempts run out.

        Raises:
            ValueError: If ``poll_interval`` is not positive.
        """
        if poll_interval <= 0:
            raise ValueError("poll_interval must be positive")

        logging.info("⏳ Monitoring indexing progress...")

        attempts = max(1, int(max_wait // poll_interval))
        for attempt in range(1, attempts + 1):
            try:
                response = self.session.get(
                    f"{self.base_url}/documents", timeout=self.timeout
                )
                if response.status_code == 200:
                    documents = response.json()
                    if documents:
                        doc = documents[0]
                        status = doc.get('status', 'unknown')
                        logging.info(f"📄 Document status: {status}")

                        if status == 'completed':
                            logging.info("✅ Indexing completed successfully")
                            return True
                        if status == 'failed':
                            logging.error("❌ Indexing failed")
                            return False
                    else:
                        logging.info("📭 No documents found yet")
                else:
                    logging.error(f"❌ Status check failed: {response.status_code}")
            except Exception as e:
                # Best effort: a transient error should not abort polling.
                logging.error(f"❌ Monitoring error: {e}")

            # No point in sleeping once the last attempt has been made.
            if attempt < attempts:
                time.sleep(poll_interval)

        logging.warning("⚠️ Indexing timeout reached")
        return False

    def test_search(self, queries):
        """POST each query to ``{base_url}/search`` and collect the outcome.

        Args:
            queries: Iterable of query strings; each is sent with
                ``top_k`` set to 3.

        Returns:
            A dict keyed by query. Successful entries hold ``'success'``
            (True), ``'results_count'`` and ``'sample_content'`` (first 200
            characters of the first hit's ``'content'``, or ``'No results'``);
            failed entries hold ``'success'`` (False) and ``'error'``.
        """
        logging.info("🔍 Testing search functionality...")

        results = {}
        for query in queries:
            logging.info(f"🔎 Searching for: '{query}'")

            try:
                payload = {
                    "query": query,
                    "top_k": 3
                }
                response = self.session.post(
                    f"{self.base_url}/search",
                    json=payload,
                    timeout=self.timeout,
                )

                if response.status_code == 200:
                    result = response.json()
                    # ``or []`` also covers a payload with "results": null.
                    hits = result.get('results') or []
                    if hits:
                        sample = hits[0].get('content', '')[:200]
                    else:
                        sample = 'No results'
                    results[query] = {
                        'success': True,
                        'results_count': len(hits),
                        'sample_content': sample
                    }
                    logging.info(f"✅ Search successful - {len(hits)} results")
                    logging.info(f"📝 Sample: {results[query]['sample_content']}...")
                else:
                    results[query] = {
                        'success': False,
                        'error': f"Status {response.status_code}: {response.text}"
                    }
                    logging.error(f"❌ Search failed: {response.status_code} - {response.text}")
            except Exception as e:
                results[query] = {
                    'success': False,
                    'error': str(e)
                }
                logging.error(f"❌ Search error: {e}")

        return results

    def check_database_status(self):
        """Probe the document list and the search endpoint once each.

        Returns:
            True when both requests completed without raising (HTTP error
            statuses are only logged); False when an exception occurred.
        """
        logging.info("🗄️ Checking database connections...")

        try:
            # Check document count
            response = self.session.get(
                f"{self.base_url}/documents", timeout=self.timeout
            )
            if response.status_code == 200:
                documents = response.json()
                logging.info(f"📊 Documents in system: {len(documents)}")
            else:
                logging.warning(
                    f"⚠️ Document listing returned status: {response.status_code}"
                )

            # Check search index
            test_response = self.session.post(
                f"{self.base_url}/search",
                json={"query": "test", "top_k": 1},
                timeout=self.timeout,
            )
            if test_response.status_code == 200:
                logging.info("✅ Search index is operational")
            else:
                logging.warning("⚠️ Search index may have issues")

            return True
        except Exception as e:
            logging.error(f"❌ Database check failed: {e}")
            return False


def main():
    """Run the full validation workflow with default settings and log a summary."""
    logging.info("🚀 STARTING OCR PDF WORKFLOW VALIDATION")
    logging.info("=" * 70)

    # The context manager guarantees the HTTP session is closed on every path.
    with OCRWorkflowTester() as tester:
        # Test server connectivity
        if not tester.check_server_status():
            logging.error("❌ Cannot proceed - server not accessible")
            return

        # Upload OCR PDF
        if tester.upload_ocr_pdf():
            # Give the server a head start before polling for the status.
            logging.info("⏳ Waiting for indexing to complete...")
            time.sleep(10)

            if tester.monitor_indexing():
                # Test search with OCR-specific queries
                search_queries = [
                    "OCR",
                    "text extraction",
                    "document processing",
                    "optical character recognition",
                    "PDF conversion"
                ]

                search_results = tester.test_search(search_queries)

                # Check database status
                tester.check_database_status()

                # Print summary
                logging.info("=" * 70)
                logging.info("📋 TEST RESULTS SUMMARY")
                logging.info("=" * 70)

                successful_searches = sum(
                    1 for result in search_results.values() if result['success']
                )
                logging.info("✅ Upload: SUCCESS")
                logging.info("✅ Indexing: SUCCESS")
                logging.info(f"🔍 Search: {successful_searches}/{len(search_queries)} queries successful")

                for query, result in search_results.items():
                    status = "✅" if result['success'] else "❌"
                    logging.info(f"   {status} '{query}': {result.get('results_count', 'N/A')} results")
            else:
                logging.error("❌ Indexing failed or timed out")
        else:
            logging.error("❌ Upload failed - cannot proceed with testing")

        # Footer values come from the tester so they cannot drift from the
        # settings that were actually used.
        logging.info("=" * 70)
        logging.info(f"🌐 Web UI: {tester.base_url}/webui/")
        logging.info(f"👤 Username: {tester.username}")
        # NOTE(review): this writes the password to the log in plain text;
        # acceptable only for a local test fixture - confirm before reuse.
        logging.info(f"🔑 Password: {tester.password}")
        logging.info("📁 Test file: ocr.pdf")


if __name__ == "__main__":
    main()