"""End-to-end smoke test for the OCR workflow on a scanned-table PDF.

The script logs in, uploads ``ocr.pdf`` from the current directory, polls
the document list until the upload is reported as completed or failed, runs
a handful of search queries, and prints a summary of the processed document.

The login credentials default to the values baked into this script but can
be overridden with the ``OCR_TEST_USERNAME`` / ``OCR_TEST_PASSWORD``
environment variables so they do not have to live in source control.
"""

import json
import os
import sys
import time

import requests

# Polling cadence while waiting for the server to process the upload.
_POLL_INTERVAL_SECONDS = 10
_MAX_WAIT_SECONDS = 120  # 2 minutes max


def _find_document(docs, pdf_file):
    """Return the first entry of *docs* whose ``file_path`` is *pdf_file*, else None."""
    # ``docs`` may be None when the server sends a JSON null for a status list.
    return next(
        (doc for doc in (docs or []) if doc.get('file_path') == pdf_file),
        None,
    )


def _login(base_url):
    """Authenticate against ``/login``.

    Returns:
        The ``Authorization`` header dict on success, or ``None`` when the
        request fails, the status is not 200, or no token is returned.
    """
    login_data = {
        'username': os.environ.get('OCR_TEST_USERNAME', 'jleu3482'),
        'password': os.environ.get('OCR_TEST_PASSWORD', 'jleu1212'),
    }
    # Broad catch is deliberate: this is a diagnostic script and every step
    # reports the problem and fails the run instead of raising a traceback.
    try:
        login_response = requests.post(
            f'{base_url}/login', data=login_data, timeout=30
        )
        if login_response.status_code != 200:
            print(f'❌ Login failed: {login_response.status_code}')
            return None
        token = login_response.json().get('access_token')
    except Exception as e:
        print(f'❌ Login error: {e}')
        return None

    if not token:
        # Without this guard every later request would carry "Bearer None".
        print('❌ Login failed: no access_token in response')
        return None

    print('✅ Login successful')
    return {'Authorization': f'Bearer {token}'}


def _upload_document(base_url, pdf_file, headers):
    """Upload *pdf_file* to ``/documents/upload``; return True on HTTP 200."""
    try:
        print(f"Uploading {pdf_file}...")
        with open(pdf_file, 'rb') as file:
            files = {'file': (pdf_file, file, 'application/pdf')}
            upload_response = requests.post(
                f'{base_url}/documents/upload',
                files=files,
                headers=headers,
                timeout=60,
            )

        print(f"Upload Status: {upload_response.status_code}")
        if upload_response.status_code != 200:
            print(f'❌ Upload failed: {upload_response.text}')
            return False

        # Parse before announcing success so a malformed body is reported
        # as an upload error rather than a success.
        upload_result = upload_response.json()
        print('✅ Upload successful')
        print(f"Response: {json.dumps(upload_result, indent=2)}")
        return True
    except Exception as e:
        print(f'❌ Upload error: {e}')
        return False


def _wait_for_processing(base_url, pdf_file, headers):
    """Poll ``/documents`` until *pdf_file* is completed, failed, or times out.

    Returns:
        True when the document shows up in the completed list (and not in
        the failed list); False on failure or after ``_MAX_WAIT_SECONDS``.
    """
    # NOTE(review): documents are matched by ``file_path`` only, so a record
    # left over from an earlier upload of the same file name would also
    # match - confirm whether the API exposes a per-upload identifier.
    # NOTE(review): assumes the server groups documents under the
    # 'completed' / 'processing' / 'failed' keys - confirm against the API.
    wait_time = 0
    while wait_time < _MAX_WAIT_SECONDS:
        print(f"Waiting for processing... ({wait_time}s/{_MAX_WAIT_SECONDS}s)")
        time.sleep(_POLL_INTERVAL_SECONDS)
        wait_time += _POLL_INTERVAL_SECONDS

        # A failed poll is only a warning: keep trying until the deadline.
        try:
            status_response = requests.get(
                f'{base_url}/documents', headers=headers, timeout=30
            )
            if status_response.status_code != 200:
                print(f"⚠️ Status check failed: {status_response.status_code}")
                continue

            statuses = status_response.json().get('statuses') or {}
            completed = statuses.get('completed') or []
            processing = statuses.get('processing') or []
            failed = statuses.get('failed') or []
            print(
                f"📊 Status - Completed: {len(completed)}, "
                f"Processing: {len(processing)}, Failed: {len(failed)}"
            )

            completed_doc = _find_document(completed, pdf_file)
            if completed_doc is not None:
                print("✅ Document processing completed!")
                print(f"Document details: {json.dumps(completed_doc, indent=2)}")

            # A matching failed entry fails the run even if a completed
            # entry was also found.
            failed_doc = _find_document(failed, pdf_file)
            if failed_doc is not None:
                print(
                    "❌ Document processing failed: "
                    f"{failed_doc.get('error_msg', 'Unknown error')}"
                )
                return False

            if completed_doc is not None:
                return True
        except Exception as e:
            print(f"⚠️ Status check error: {e}")

    print("❌ Processing timeout - document not processed within expected time")
    return False


def _run_search_queries(base_url, headers):
    """Send a few generic queries to ``/query`` and print each outcome.

    Search problems are reported but never fail the workflow.
    """
    # Test queries based on expected content from scanned table
    test_queries = [
        "table data",
        "document content",
        "information in the pdf",
        "what does this document contain",
    ]

    for query in test_queries:
        print(f"\nTesting query: '{query}'")
        try:
            query_data = {'query': query, 'top_k': 5}
            search_response = requests.post(
                f'{base_url}/query', json=query_data, headers=headers, timeout=30
            )
            if search_response.status_code != 200:
                print(f'❌ Search failed: {search_response.text}')
                continue

            results = search_response.json()
            print("✅ Search successful")

            if not isinstance(results, dict):
                print(f"Unexpected response format: {type(results)}")
                print(f"Raw result: {results}")
                continue

            response_text = results.get('response', 'No response field')
            print(f"Response: {response_text}")

            # Coerce to str so a null / non-string 'response' field does
            # not make the substring check raise TypeError.
            if '[no-context]' in str(response_text):
                print("⚠️ No relevant content found in document")
            else:
                print("🎉 Content found and retrieved!")
        except Exception as e:
            print(f'❌ Search error: {e}')


def _print_document_details(base_url, pdf_file, headers):
    """Print a summary of the completed record for *pdf_file*, if any."""
    try:
        status_response = requests.get(
            f'{base_url}/documents', headers=headers, timeout=30
        )
        if status_response.status_code != 200:
            return

        statuses = status_response.json().get('statuses') or {}
        doc = _find_document(statuses.get('completed'), pdf_file)
        if doc is None:
            return

        print("📄 Document Summary:")
        print(f" - File: {doc.get('file_path')}")
        print(f" - Status: {doc.get('status')}")
        print(f" - Content Length: {doc.get('content_length', 'N/A')}")
        print(f" - Chunks Count: {doc.get('chunks_count', 'N/A')}")
        print(f" - Created: {doc.get('created_at')}")
        print(f" - Updated: {doc.get('updated_at')}")
    except Exception as e:
        print(f"⚠️ Could not get document details: {e}")


def test_ocr_workflow():
    """Test complete OCR workflow for scanned table PDF.

    Returns:
        True when login, upload and processing all succeed; False as soon
        as one of those steps fails. Search and summary problems are only
        printed and do not affect the result.
    """
    base_url = 'http://localhost:3015'
    pdf_file = 'ocr.pdf'

    print("=" * 60)
    print("OCR PDF WORKFLOW TEST - SCANNED TABLE DOCUMENT")
    print("=" * 60)

    # Check if ocr.pdf exists
    if not os.path.exists(pdf_file):
        print(f"❌ {pdf_file} not found in current directory")
        return False

    print(f"📄 Testing with: {pdf_file}")
    print(f"📊 File size: {os.path.getsize(pdf_file)} bytes")

    # Login first
    print("\n1. 🔐 AUTHENTICATION")
    headers = _login(base_url)
    if headers is None:
        return False

    # Upload ocr.pdf
    print("\n2. 📤 DOCUMENT UPLOAD")
    if not _upload_document(base_url, pdf_file, headers):
        return False

    # Wait for processing
    print("\n3. ⏳ WAITING FOR PROCESSING")
    if not _wait_for_processing(base_url, pdf_file, headers):
        return False

    # Test search functionality
    print("\n4. 🔍 SEARCH FUNCTIONALITY TEST")
    _run_search_queries(base_url, headers)

    # Get detailed document information
    print("\n5. 📋 DOCUMENT DETAILS")
    _print_document_details(base_url, pdf_file, headers)

    print("\n" + "=" * 60)
    print("WORKFLOW TEST COMPLETED")
    print("=" * 60)

    return True


if __name__ == "__main__":
    success = test_ocr_workflow()
    sys.exit(0 if success else 1)