"""Manual integration test: upload ``ocr.pdf`` and wait for OCR to finish.

The script logs in to the service at ``base_url``, uploads the PDF, polls the
document list until the file shows up as completed or failed (or a timeout
expires), and finally runs one sample search against the indexed content.
"""

import requests
import time
import os

base_url = 'http://localhost:3015'

# Name of the PDF to upload; also the 'file_path' value looked for in the
# document status lists returned by the service.
PDF_NAME = 'ocr.pdf'

# Polling budget for the OCR job and the pause between two status checks.
MAX_WAIT_SECONDS = 900  # 15 minutes
POLL_INTERVAL_SECONDS = 10


def _find_doc(docs, file_path):
    """Return the first entry of *docs* whose 'file_path' equals *file_path*.

    Returns None when there is no match. A missing/None list is treated as
    empty so a null status list does not abort the whole run.
    """
    for doc in docs or []:
        if doc.get('file_path') == file_path:
            return doc
    return None


def _run_search(headers):
    """Run the sample search query and print the top three hits."""
    print("\n=== Testing Search ===")
    search_data = {'query': 'table data information', 'top_k': 10}
    search_response = requests.post(
        f'{base_url}/search', json=search_data, headers=headers, timeout=30
    )
    if search_response.status_code != 200:
        print(f"Search failed: {search_response.text}")
        return

    results = search_response.json().get('results', [])
    print(f"Found {len(results)} search results")
    for rank, result in enumerate(results[:3], start=1):  # Show top 3
        print(f"\nResult {rank}:")
        # A hit without a numeric score must not raise: the resulting
        # TypeError used to be reported as a failed OCR run.
        score = result.get('score')
        score_text = f"{score:.3f}" if isinstance(score, (int, float)) else "n/a"
        print(f" Score: {score_text}")
        text = result.get('text') or ''
        print(f" Text: {text[:200]}{'...' if len(text) > 200 else ''}")
        # 'source' may be present but null, so fall back to an empty dict.
        source = result.get('source') or {}
        print(f" Source: {source.get('file_path', 'Unknown')}")


def _wait_for_processing(headers):
    """Poll the document list until PDF_NAME completes, fails, or times out.

    Returns True once the file is listed as completed (the sample search is
    run before returning), False when it is listed as failed or when
    MAX_WAIT_SECONDS elapse.
    """
    # monotonic() is immune to system clock adjustments during the long wait.
    start_time = time.monotonic()
    while time.monotonic() - start_time < MAX_WAIT_SECONDS:
        try:
            docs_response = requests.get(
                f'{base_url}/documents', headers=headers, timeout=30
            )
            elapsed = int(time.monotonic() - start_time)
            if docs_response.status_code == 200:
                statuses = docs_response.json().get('statuses') or {}

                completed_doc = _find_doc(statuses.get('completed'), PDF_NAME)
                if completed_doc is not None:
                    print(f"\n✓ OCR processing completed in {elapsed} seconds!")
                    print(f" File: {completed_doc.get('file_path')}")
                    print(f" Size: {completed_doc.get('file_size')}")
                    print(f" Chunks: {completed_doc.get('chunk_count')}")
                    # Kept inside the try: a connection error during the
                    # search is retried on the next poll rather than fatal.
                    _run_search(headers)
                    return True

                if _find_doc(statuses.get('processing'), PDF_NAME) is not None:
                    print(f" Still processing... ({elapsed}s elapsed)")
                else:
                    failed_doc = _find_doc(statuses.get('failed'), PDF_NAME)
                    if failed_doc is not None:
                        print(f"\n✗ OCR processing failed after {elapsed}s!")
                        print(f" Error: {failed_doc.get('error_msg', 'Unknown error')}")
                        return False
                    # Not in any list yet, might be queued
                    print(f" Waiting for processing to start... ({elapsed}s)")
            else:
                # Previously ignored silently; report it and keep polling.
                print(f" Status check returned HTTP {docs_response.status_code}")
        except requests.exceptions.RequestException as e:
            print(f" Connection error: {e}")
        time.sleep(POLL_INTERVAL_SECONDS)

    print(f"\n✗ OCR processing timed out after {MAX_WAIT_SECONDS} seconds")
    return False


def test_ocr_pdf_simple():
    """Simple test for OCR PDF upload without clearing documents first.

    Logs in, uploads PDF_NAME from the current directory, then monitors the
    processing status.

    Credentials default to the values this script has always used and can be
    overridden with the OCR_TEST_USERNAME / OCR_TEST_PASSWORD environment
    variables.

    Returns:
        True when the document is reported as completed, False on login or
        upload failure, a missing track id, a failed or timed-out OCR job, or
        any unexpected exception (which is printed, not raised).
    """
    print("=== SIMPLE OCR PDF TEST ===")
    print("Testing ocr.pdf upload and processing")

    # Login
    login_data = {
        'username': os.environ.get('OCR_TEST_USERNAME', 'jleu3482'),
        'password': os.environ.get('OCR_TEST_PASSWORD', 'jleu1212'),
    }

    try:
        login_response = requests.post(f'{base_url}/login', data=login_data, timeout=30)
        if login_response.status_code != 200:
            print(f"✗ Login failed: {login_response.text}")
            return False

        token = login_response.json().get('access_token')
        headers = {'Authorization': f'Bearer {token}'}
        print("✓ Login successful")

        # Upload OCR PDF directly (skip clearing to avoid timeout)
        print("\n=== Uploading OCR PDF ===")
        print(f"File: ocr.pdf ({os.path.getsize(PDF_NAME)} bytes)")

        with open(PDF_NAME, 'rb') as f:
            files = {'file': (PDF_NAME, f, 'application/pdf')}
            upload_response = requests.post(
                f'{base_url}/documents/upload', files=files, headers=headers, timeout=60
            )

        print(f"Upload status: {upload_response.status_code}")
        if upload_response.status_code != 200:
            print(f"✗ Upload failed: {upload_response.text}")
            return False

        upload_data = upload_response.json()
        print(f"Upload response: {upload_data}")

        # The track id is only checked for presence; progress is followed
        # through the document list, matched by file name.
        if not upload_data.get('track_id'):
            print("✗ No track ID returned")
            return False

        print("\n=== Monitoring OCR Processing ===")
        print("Started OCR processing...")
        print("This may take several minutes for CPU-based OCR...")
        return _wait_for_processing(headers)

    except Exception as e:
        # Top-level boundary of the script: report and signal failure.
        print(f"✗ Error during OCR test: {e}")
        return False


if __name__ == "__main__":
    print("Starting OCR PDF test...")
    print("Note: This test uploads ocr.pdf with the scanned table")
    print(" and monitors processing for up to 15 minutes.")
    print(" OCR processing on CPU may be slow but more reliable.\n")

    success = test_ocr_pdf_simple()

    if success:
        print("\n🎉 SUCCESS: OCR PDF with scanned table processed and searchable!")
    else:
        print("\n❌ OCR processing failed or timed out")