Files
railseek6/test_ocr_upload.py

142 lines
5.5 KiB
Python

import requests
import json
import time
import os
def _login(base_url, username, password, timeout):
    """Log in via POST {base_url}/login and return the auth headers, or None on failure."""
    login_data = {'username': username, 'password': password}
    login_response = requests.post(f'{base_url}/login', data=login_data, timeout=timeout)
    if login_response.status_code != 200:
        print(f"❌ Login failed: {login_response.text}")
        return None
    token = login_response.json().get('access_token')
    if not token:
        # Without this guard every later request would carry "Bearer None".
        print("❌ Login failed: no access_token in response")
        return None
    print("✅ Login successful")
    return {'Authorization': f'Bearer {token}'}


def _upload_pdf(base_url, pdf_file, headers, timeout):
    """Upload pdf_file to {base_url}/documents/upload; return True on HTTP 200."""
    print(f"📤 Uploading {pdf_file}...")
    with open(pdf_file, 'rb') as f:
        files = {'file': (pdf_file, f, 'application/pdf')}
        # The request must be sent while the file handle is still open.
        upload_response = requests.post(
            f'{base_url}/documents/upload',
            files=files,
            headers=headers,
            timeout=timeout,
        )
    print(f"📤 Upload status: {upload_response.status_code}")
    if upload_response.status_code != 200:
        print(f"❌ Upload failed: {upload_response.text}")
        return False
    print("✅ Upload successful")
    upload_result = upload_response.json()
    print(f"Upload response: {json.dumps(upload_result, indent=2)}")
    return True


def _wait_for_processing(base_url, pdf_file, headers, max_wait, poll_interval, timeout):
    """Poll {base_url}/documents until pdf_file is listed as completed or failed.

    Returns True when a document whose 'file_path' equals pdf_file appears in
    the 'completed' status list; False when it appears in 'failed' or when
    max_wait seconds elapse first.
    """
    print("\n⏳ Waiting for document processing...")
    step = max(poll_interval, 1)  # a non-positive interval would never terminate
    wait_time = 0
    while wait_time < max_wait:
        print(f"⏰ Waiting... ({wait_time}s/{max_wait}s)")
        time.sleep(step)
        wait_time += step
        try:
            status_response = requests.get(f'{base_url}/documents', headers=headers, timeout=timeout)
            if status_response.status_code != 200:
                # Previously ignored silently; surface it but keep polling.
                print(f"⚠️ Status check returned HTTP {status_response.status_code}")
                continue
            statuses = status_response.json().get('statuses', {})
            completed = statuses.get('completed', [])
            processing = statuses.get('processing', [])
            failed = statuses.get('failed', [])
            print(f"📊 Status - Completed: {len(completed)}, Processing: {len(processing)}, Failed: {len(failed)}")
            for doc in completed:
                if doc.get('file_path') == pdf_file:
                    print("🎉 Document processing completed!")
                    print(f"Document details: {json.dumps(doc, indent=2)}")
                    return True
            for doc in failed:
                if doc.get('file_path') == pdf_file:
                    print(f"❌ Document processing failed: {doc}")
                    return False
        except Exception as e:
            # Polling is best-effort: a transient error must not abort the wait.
            print(f"⚠️ Status check error: {e}")
    print(f"⏰ Processing timeout after {max_wait} seconds")
    return False


def _run_search_queries(base_url, headers, queries, timeout):
    """POST each query to {base_url}/query and print the outcome (informational only)."""
    for query in queries:
        print(f"\nTesting query: '{query}'")
        try:
            query_data = {'query': query, 'top_k': 5}
            search_response = requests.post(f'{base_url}/query', json=query_data, headers=headers, timeout=timeout)
            if search_response.status_code != 200:
                print(f'❌ Search failed: {search_response.text}')
                continue
            results = search_response.json()
            print("✅ Search successful")
            if not isinstance(results, dict):
                print(f"Unexpected response format: {type(results)}")
                continue
            response_text = results.get('response', 'No response field')
            print(f"Response: {response_text}")
            # str() guards against a non-string 'response' value.
            if '[no-context]' in str(response_text):
                print("⚠️ No relevant content found in document")
            else:
                print("🎉 Content found and retrieved!")
        except Exception as e:
            print(f'❌ Search error: {e}')


def test_ocr_upload(base_url='http://localhost:3015', pdf_file='ocr.pdf',
                    username='jleu3482', password='jleu1212',
                    max_wait=180, poll_interval=10,
                    request_timeout=30, upload_timeout=120):
    """Test OCR PDF upload and processing.

    Logs in, uploads ``pdf_file``, polls the document list until the file is
    reported as completed or failed, then runs a few sample search queries.

    Args:
        base_url: Root URL of the server under test.
        pdf_file: Path of the PDF to upload. The same string is sent as the
            upload filename and compared against each document's 'file_path'.
        username: Login user name.
        password: Login password.
            NOTE(review): the default credentials are hard-coded for backward
            compatibility; prefer passing them in rather than committing
            real ones.
        max_wait: Maximum number of seconds to wait for processing.
        poll_interval: Seconds to sleep between status checks.
        request_timeout: Timeout in seconds for login, status and search requests.
        upload_timeout: Timeout in seconds for the upload request.

    Returns:
        True if login, upload and processing all succeeded; False otherwise.
        Search results are printed but do not affect the return value.
    """
    print("=" * 60)
    print("OCR PDF UPLOAD TEST")
    print("=" * 60)

    # Check that the PDF exists before contacting the server.
    if not os.path.exists(pdf_file):
        print(f"❌ File {pdf_file} not found")
        return False
    print(f"📄 Found OCR PDF: {pdf_file} ({os.path.getsize(pdf_file)} bytes)")

    try:
        headers = _login(base_url, username, password, request_timeout)
        if headers is None:
            return False

        if not _upload_pdf(base_url, pdf_file, headers, upload_timeout):
            return False

        if not _wait_for_processing(base_url, pdf_file, headers,
                                    max_wait, poll_interval, request_timeout):
            return False

        print("\n🔍 Testing search functionality...")
        test_queries = [
            "table data",
            "document content",
            "information in the pdf",
            "what does this document contain",
        ]
        _run_search_queries(base_url, headers, test_queries, request_timeout)

        print("\n" + "=" * 60)
        print("OCR WORKFLOW TEST COMPLETED SUCCESSFULLY")
        print("=" * 60)
        return True
    except Exception as e:
        # Top-level boundary: report any unexpected error as a test failure.
        print(f"❌ Test failed with error: {e}")
        return False
if __name__ == "__main__":
    # Report the test outcome as the process exit status: 0 on success, 1 on failure.
    # SystemExit is raised directly because the bare exit() helper is installed by
    # the site module and is not guaranteed to exist (e.g. under `python -S`).
    success = test_ocr_upload()
    raise SystemExit(0 if success else 1)