# Source listing: railseek6/test_ocr_upload_fresh.py (79 lines, 3.8 KiB, Python)

import requests
import os
import time
# Root URL of the locally running server that every request in this script targets.
base_url = "http://localhost:3015"
def test_ocr_upload():
    """Upload a PDF to the server at ``base_url`` and watch it get processed.

    Steps: log in with form data, record how many documents are already
    listed as completed/failed, upload ``test_meaningful.pdf`` from the
    current working directory, then poll ``/documents`` once per second for
    up to 60 polls.

    Returns:
        bool: True when the completed count rises above its pre-upload
        value during monitoring. False on login failure, a missing PDF,
        a rejected upload, a newly failed document, or when monitoring
        runs out of polls.

    Raises:
        requests.exceptions.RequestException: connection errors and
            timeouts are not caught here and propagate to the caller.
    """
    # requests waits indefinitely unless a timeout is given, so an
    # unresponsive server would otherwise hang this script forever.
    request_timeout = 30
    upload_timeout = 120
    monitor_seconds = 60

    print("Testing OCR PDF upload with GPU-accelerated PaddleOCR...")

    # Login with form data
    # NOTE(review): credentials are hard-coded in the source; consider
    # moving them out of version control.
    login_data = {'username': 'jleu3482', 'password': 'jleu1212'}
    login_response = requests.post(
        f'{base_url}/login', data=login_data, timeout=request_timeout
    )
    print(f'Login status: {login_response.status_code}')
    if login_response.status_code != 200:
        print(f'Login failed: {login_response.text}')
        return False

    token = login_response.json().get('access_token')
    headers = {'Authorization': f'Bearer {token}'}
    print('✓ Login successful')

    # Record what the server already holds. Documents completed or failed
    # before this upload must not decide the outcome of this run.
    baseline_completed = 0
    baseline_failed = 0
    baseline_failed_docs = []
    docs_response = requests.get(
        f'{base_url}/documents', headers=headers, timeout=request_timeout
    )
    print(f'Initial document status: {docs_response.status_code}')
    if docs_response.status_code == 200:
        statuses = docs_response.json().get('statuses', {})
        baseline_failed_docs = list(statuses.get("failed", []))
        baseline_completed = len(statuses.get("completed", []))
        baseline_processing = len(statuses.get("processing", []))
        baseline_failed = len(baseline_failed_docs)
        print(
            f'Initial status - Completed: {baseline_completed}, '
            f'Processing: {baseline_processing}, Failed: {baseline_failed}'
        )

    # Upload OCR PDF - use a different file to avoid duplicates
    pdf_path = 'test_meaningful.pdf'
    if not os.path.exists(pdf_path):
        print(f'PDF file not found: {pdf_path}')
        print("Available PDF files:")
        for name in os.listdir('.'):
            if name.lower().endswith('.pdf'):
                print(f" - {name}")
        return False

    print(f'Uploading PDF: {pdf_path}')
    with open(pdf_path, 'rb') as f:
        files = {'file': (os.path.basename(pdf_path), f, 'application/pdf')}
        upload_response = requests.post(
            f'{base_url}/documents/upload',
            files=files,
            headers=headers,
            timeout=upload_timeout,
        )
    print(f'Upload status: {upload_response.status_code}')
    print(f'Upload response: {upload_response.text}')
    if upload_response.status_code != 200:
        # Previously this path fell off the end of the function and
        # returned None; report the failure explicitly instead.
        return False

    print("\nChecking document status...")
    # Monitor progress to catch OCR processing
    print(f"Monitoring OCR processing for {monitor_seconds} seconds...")
    for elapsed in range(1, monitor_seconds + 1):
        time.sleep(1)
        docs_response = requests.get(
            f'{base_url}/documents', headers=headers, timeout=request_timeout
        )
        if docs_response.status_code != 200:
            # A failed poll is skipped; the next iteration tries again.
            continue

        statuses = docs_response.json().get('statuses', {})
        failed_docs = statuses.get("failed", [])
        processing = len(statuses.get("processing", []))
        completed = len(statuses.get("completed", []))
        failed = len(failed_docs)
        print(f'Progress after {elapsed}s: Processing={processing}, Completed={completed}, Failed={failed}')

        # Compare against the pre-upload counts so that only documents
        # finishing during this run count as success or failure.
        if completed > baseline_completed:
            print("✓ OCR processing completed successfully!")
            return True
        if failed > baseline_failed:
            print("✗ OCR processing failed!")
            # Show error details for entries that were not failed before
            # the upload; fall back to the whole list if none stand out.
            new_failures = [
                doc for doc in failed_docs if doc not in baseline_failed_docs
            ] or failed_docs
            for doc in new_failures:
                print(f"  Failed document: {doc.get('file_path')} - {doc.get('error_msg', 'Unknown error')}")
            return False

    print(f"⚠ OCR processing timed out after {monitor_seconds} seconds")
    return False
if __name__ == "__main__":
    # Turn the boolean outcome into the process exit status so a shell or
    # CI job can tell a failed run from a successful one (0 = success).
    raise SystemExit(0 if test_ocr_upload() else 1)