"""End-to-end smoke test: upload an OCR PDF, wait for processing, run queries."""

import json
import os
import time

import requests

# Server and fixture used by the test.
_BASE_URL = 'http://localhost:3015'
_PDF_FILE = 'ocr.pdf'

# Timeouts in seconds. `requests` never times out unless told to, so every
# call gets an explicit limit; uploads get a larger one because they send the
# whole file body.
_REQUEST_TIMEOUT = 30
_UPLOAD_TIMEOUT = 120

# Polling budget for document processing (seconds).
_MAX_WAIT = 180  # 3 minutes max
_POLL_INTERVAL = 10

_BANNER = "=" * 60

_TEST_QUERIES = (
    "table data",
    "document content",
    "information in the pdf",
    "what does this document contain",
)


def _login(base_url):
    """Log in and return the Authorization headers, or None if login fails."""
    # NOTE(review): credentials are hard-coded in the source; consider moving
    # them to configuration or environment variables.
    login_data = {'username': 'jleu3482', 'password': 'jleu1212'}
    login_response = requests.post(
        f'{base_url}/login', data=login_data, timeout=_REQUEST_TIMEOUT
    )
    if login_response.status_code != 200:
        print(f"❌ Login failed: {login_response.text}")
        return None

    token = login_response.json().get('access_token')
    if not token:
        # Without this guard the script would send "Bearer None" and fail
        # later with a misleading upload error.
        print("❌ Login failed: response did not contain an access token")
        return None

    print("✅ Login successful")
    return {'Authorization': f'Bearer {token}'}


def _upload_pdf(base_url, pdf_file, headers):
    """Upload ``pdf_file`` to the server; return True when it answers 200."""
    print(f"📤 Uploading {pdf_file}...")
    with open(pdf_file, 'rb') as f:
        files = {'file': (pdf_file, f, 'application/pdf')}
        upload_response = requests.post(
            f'{base_url}/documents/upload',
            files=files,
            headers=headers,
            timeout=_UPLOAD_TIMEOUT,
        )

    print(f"📤 Upload status: {upload_response.status_code}")
    if upload_response.status_code != 200:
        print(f"❌ Upload failed: {upload_response.text}")
        return False

    print("✅ Upload successful")
    upload_result = upload_response.json()
    print(f"Upload response: {json.dumps(upload_result, indent=2)}")
    return True


def _wait_for_processing(base_url, pdf_file, headers):
    """Poll ``/documents`` until ``pdf_file`` is completed.

    Returns True once the document appears in the ``completed`` list, and
    False if it appears in the ``failed`` list or the polling budget
    (``_MAX_WAIT`` seconds of sleep) is exhausted.
    """
    print("\n⏳ Waiting for document processing...")
    wait_time = 0
    while wait_time < _MAX_WAIT:
        print(f"⏰ Waiting... ({wait_time}s/{_MAX_WAIT}s)")
        time.sleep(_POLL_INTERVAL)
        wait_time += _POLL_INTERVAL

        try:
            status_response = requests.get(
                f'{base_url}/documents',
                headers=headers,
                timeout=_REQUEST_TIMEOUT,
            )
            if status_response.status_code != 200:
                print(
                    "⚠️ Status check returned HTTP "
                    f"{status_response.status_code}"
                )
                continue

            statuses = status_response.json().get('statuses', {})
            completed = statuses.get('completed', [])
            processing = statuses.get('processing', [])
            failed = statuses.get('failed', [])
            print(
                f"📊 Status - Completed: {len(completed)}, "
                f"Processing: {len(processing)}, Failed: {len(failed)}"
            )

            # A failure entry takes precedence over a completed entry for the
            # same file, so it is checked first.
            for doc in failed:
                if doc.get('file_path') == pdf_file:
                    print(f"❌ Document processing failed: {doc}")
                    return False

            for doc in completed:
                if doc.get('file_path') == pdf_file:
                    print("🎉 Document processing completed!")
                    print(f"Document details: {json.dumps(doc, indent=2)}")
                    return True
        except Exception as e:
            # Best effort: a transient error on one poll must not abort the
            # wait; report it and try again on the next interval.
            print(f"⚠️ Status check error: {e}")

    print(f"⏰ Processing timeout after {_MAX_WAIT} seconds")
    return False


def _run_search_queries(base_url, headers):
    """Send each test query to ``/query`` and print what comes back.

    Results are informational only: a failed or empty search is reported but
    does not change the outcome of the overall test.
    """
    print("\n🔍 Testing search functionality...")
    for query in _TEST_QUERIES:
        print(f"\nTesting query: '{query}'")
        try:
            query_data = {'query': query, 'top_k': 5}
            search_response = requests.post(
                f'{base_url}/query',
                json=query_data,
                headers=headers,
                timeout=_REQUEST_TIMEOUT,
            )
            if search_response.status_code != 200:
                print(f'❌ Search failed: {search_response.text}')
                continue

            results = search_response.json()
            print("✅ Search successful")
            if not isinstance(results, dict):
                print(f"Unexpected response format: {type(results)}")
                continue

            response_text = results.get('response', 'No response field')
            print(f"Response: {response_text}")

            if not isinstance(response_text, str):
                # e.g. a null "response" field; a substring test on it would
                # raise TypeError.
                print(
                    "Unexpected response field type: "
                    f"{type(response_text)}"
                )
            elif '[no-context]' in response_text:
                print("⚠️ No relevant content found in document")
            else:
                print("🎉 Content found and retrieved!")
        except Exception as e:
            # Best effort: keep going with the remaining queries.
            print(f'❌ Search error: {e}')


def test_ocr_upload():
    """Test OCR PDF upload and processing.

    Runs the full workflow against the server at ``_BASE_URL``: checks that
    ``_PDF_FILE`` exists locally, logs in, uploads the file, polls until the
    document is processed, then issues a few search queries.

    Returns:
        True when login, upload and processing all succeed; False when the
        file is missing, any of those steps fails, processing times out, or
        an unexpected exception is raised. Search results do not affect the
        return value.
    """
    base_url = _BASE_URL
    pdf_file = _PDF_FILE

    print(_BANNER)
    print("OCR PDF UPLOAD TEST")
    print(_BANNER)

    # Check if ocr.pdf exists
    if not os.path.exists(pdf_file):
        print(f"❌ File {pdf_file} not found")
        return False

    print(f"📄 Found OCR PDF: {pdf_file} ({os.path.getsize(pdf_file)} bytes)")

    try:
        headers = _login(base_url)
        if headers is None:
            return False

        if not _upload_pdf(base_url, pdf_file, headers):
            return False

        if not _wait_for_processing(base_url, pdf_file, headers):
            return False

        _run_search_queries(base_url, headers)

        print("\n" + _BANNER)
        print("OCR WORKFLOW TEST COMPLETED SUCCESSFULLY")
        print(_BANNER)
        return True
    except Exception as e:
        # Top-level boundary for the script: report and signal failure.
        print(f"❌ Test failed with error: {e}")
        return False


if __name__ == "__main__":
    success = test_ocr_upload()
    # SystemExit instead of the interactive-only exit() helper from `site`.
    raise SystemExit(0 if success else 1)