"""End-to-end check: upload ``ocr.pdf`` to a local server and verify OCR + search.

The script logs in, clears existing documents, uploads ``ocr.pdf``, polls the
document status endpoint until the file is reported completed or failed, and
finally issues a few sample search queries.
"""

import os
import time

import requests

base_url = 'http://localhost:3015'

# File uploaded by the test; it is also the value matched against ``file_path``
# in the status listing.
_OCR_FILE = 'ocr.pdf'
# 60 polls x 10 seconds = 10 minutes budget for CPU processing.
_MAX_ATTEMPTS = 60
_POLL_INTERVAL_SECONDS = 10


def wait_for_server(timeout=60):
    """Wait for server to be ready.

    Polls ``GET {base_url}/`` every 2 seconds until it answers with HTTP 200.

    Args:
        timeout: Maximum number of seconds to keep polling.

    Returns:
        True once the server answered with status 200, False if ``timeout``
        elapsed first.
    """
    print("Waiting for server to start...")
    # Monotonic clock: the deadline is unaffected by wall-clock adjustments.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            response = requests.get(f'{base_url}/', timeout=5)
        except requests.exceptions.RequestException:
            # Server not reachable yet; keep polling. Only request errors are
            # swallowed, so Ctrl-C still interrupts the wait.
            pass
        else:
            if response.status_code == 200:
                print("✓ Server is ready")
                return True
        time.sleep(2)

    print("✗ Server did not start within timeout")
    return False


def _find_doc(docs, file_path):
    """Return the first entry of ``docs`` whose 'file_path' equals ``file_path``, else None."""
    return next((doc for doc in docs if doc.get('file_path') == file_path), None)


def _run_search_checks(headers):
    """Run the sample search queries and print up to two hits for each.

    Search problems are reported but never fail the test.
    """
    print("\n=== Testing Search ===")
    for query in ("table", "data", "information", "document"):
        print(f"Searching for: '{query}'")
        try:
            search_response = requests.post(
                f'{base_url}/search',
                json={'query': query, 'top_k': 5},
                headers=headers,
                timeout=10,
            )
            if search_response.status_code != 200:
                print(f" Search failed: {search_response.text}")
                continue
            results = search_response.json().get('results', [])
        except requests.exceptions.RequestException as e:
            # Handled per query so a flaky search is not mistaken for a
            # status-polling connection error.
            print(f" Search failed: {e}")
            continue

        print(f" Results: {len(results)}")
        for rank, result in enumerate(results[:2], start=1):  # Show first 2 results
            score = result.get('score')
            # A missing or non-numeric score must not crash the report.
            if isinstance(score, (int, float)):
                score_text = f"{score:.3f}"
            else:
                score_text = str(score)
            print(f" {rank}. Score: {score_text}")
            print(f" Text: {result.get('text', '')[:100]}...")


def _wait_for_ocr_completion(headers):
    """Poll ``GET {base_url}/documents`` until the OCR file is completed or failed.

    Returns:
        True when the file shows up under 'completed'; False when it shows up
        under 'failed' or the attempts are exhausted.
    """
    for attempt in range(1, _MAX_ATTEMPTS + 1):
        try:
            docs_response = requests.get(
                f'{base_url}/documents', headers=headers, timeout=10
            )
            if docs_response.status_code == 200:
                statuses = docs_response.json().get('statuses', {})
            else:
                # Non-200 responses carry no usable status; retry after the pause.
                statuses = {}
        except requests.exceptions.RequestException as e:
            print(f" Connection error (attempt {attempt}/{_MAX_ATTEMPTS}): {e}")
            time.sleep(_POLL_INTERVAL_SECONDS)
            continue

        done = _find_doc(statuses.get('completed', []), _OCR_FILE)
        if done is not None:
            print("✓ OCR processing completed!")
            print(f" File: {done.get('file_path')}")
            print(f" Size: {done.get('file_size')}")
            print(f" Chunks: {done.get('chunk_count')}")
            return True

        if _find_doc(statuses.get('processing', []), _OCR_FILE) is not None:
            print(f" Processing... ({attempt}/{_MAX_ATTEMPTS})")
        else:
            # Not in processing, check failed
            broken = _find_doc(statuses.get('failed', []), _OCR_FILE)
            if broken is not None:
                print("✗ OCR processing failed!")
                print(f" Error: {broken.get('error_msg', 'Unknown error')}")
                return False

        time.sleep(_POLL_INTERVAL_SECONDS)  # Check every 10 seconds

    print("✗ OCR processing timed out")
    return False


def test_ocr_with_cpu_mode():
    """Test OCR PDF upload with CPU-only processing.

    Returns:
        True when ``ocr.pdf`` was uploaded and reported as completed (the
        search queries are then run for information only); False on any
        failure, including the server not becoming ready.
    """
    print("=== TESTING OCR PDF WITH CPU-ONLY MODE ===")
    print("Target file: ocr.pdf (scanned table document)")

    # Wait for server
    if not wait_for_server():
        return False

    # Login
    # NOTE(review): credentials are hard-coded in the script; consider moving
    # them to configuration.
    login_data = {'username': 'jleu3482', 'password': 'jleu1212'}

    try:
        login_response = requests.post(f'{base_url}/login', data=login_data, timeout=10)
        if login_response.status_code != 200:
            print(f"✗ Login failed: {login_response.text}")
            return False

        token = login_response.json().get('access_token')
        headers = {'Authorization': f'Bearer {token}'}
        print("✓ Login successful")

        # Clear existing documents first
        print("=== Clearing existing documents ===")
        clear_response = requests.delete(f'{base_url}/documents', headers=headers, timeout=10)
        print(f"Clear status: {clear_response.status_code}")
        if clear_response.status_code == 200:
            print("✓ Documents cleared")

        # Upload OCR PDF
        print("\n=== Uploading OCR PDF ===")
        print(f"File: ocr.pdf ({os.path.getsize(_OCR_FILE)} bytes)")
        with open(_OCR_FILE, 'rb') as f:
            files = {'file': (_OCR_FILE, f, 'application/pdf')}
            upload_response = requests.post(
                f'{base_url}/documents/upload', files=files, headers=headers, timeout=30
            )

        print(f"Upload status: {upload_response.status_code}")
        if upload_response.status_code != 200:
            print(f"✗ Upload failed: {upload_response.text}")
            return False

        upload_data = upload_response.json()
        print(f"Upload response: {upload_data}")
        if not upload_data.get('track_id'):
            print("✗ No track ID returned")
            return False

        print("\n=== Monitoring OCR Processing (CPU Mode) ===")
        print("Processing will be slower but more reliable...")

        # Monitor processing with longer timeout for CPU processing
        if not _wait_for_ocr_completion(headers):
            return False

        # Now test search
        _run_search_checks(headers)
        return True

    except Exception as e:
        # Top-level boundary of the script: report anything unexpected
        # (missing file, malformed JSON, ...) as a test failure.
        print(f"✗ Error during OCR test: {e}")
        return False


if __name__ == "__main__":
    success = test_ocr_with_cpu_mode()
    if success:
        print("\n🎉 SUCCESS: OCR PDF with scanned table processed successfully!")
    else:
        print("\n❌ FAILED: OCR processing did not complete successfully")