Files
railseek6/test_ocr_simple.py

137 lines
7.3 KiB
Python

import requests
import time
import os
# Root URL of the server under test; the endpoint paths used by the test
# below (/login, /documents/upload, /documents, /search) are appended to it.
base_url = 'http://localhost:3015'
_PDF_NAME = 'ocr.pdf'   # file uploaded and then looked up by its 'file_path'
_POLL_INTERVAL = 10     # seconds between document-status checks
_MAX_WAIT = 900         # overall polling budget: 15 minutes


def _find_document(docs, file_path):
    """Return the first entry of *docs* whose 'file_path' equals *file_path*, else None."""
    return next((doc for doc in docs or [] if doc.get('file_path') == file_path), None)


def _report_search(headers):
    """Run one sample query and print the top three hits.

    Best-effort: any failure is printed and swallowed so that it cannot turn
    an already-completed OCR run into a reported failure.
    """
    print("\n=== Testing Search ===")
    search_data = {'query': 'table data information', 'top_k': 10}
    try:
        search_response = requests.post(
            f'{base_url}/search', json=search_data, headers=headers, timeout=30
        )
        if search_response.status_code != 200:
            print(f"Search failed: {search_response.text}")
            return
        results = search_response.json().get('results') or []
    except (requests.exceptions.RequestException, ValueError) as e:
        print(f"Search failed: {e}")
        return

    print(f"Found {len(results)} search results")
    for i, result in enumerate(results[:3]):  # Show top 3
        print(f"\nResult {i+1}:")
        # The score may be missing or non-numeric; only apply the float
        # format when it is actually a number.
        score = result.get('score')
        score_text = f"{score:.3f}" if isinstance(score, (int, float)) else str(score)
        print(f" Score: {score_text}")
        text = result.get('text') or ''
        print(f" Text: {text[:200]}{'...' if len(text) > 200 else ''}")
        source = result.get('source') or {}
        print(f" Source: {source.get('file_path', 'Unknown')}")


def _wait_for_ocr(headers):
    """Poll /documents until ocr.pdf is completed or failed, or the budget runs out.

    Returns True when the document shows up under 'completed' (after running
    the sample search), False when it shows up under 'failed' or when
    _MAX_WAIT seconds elapse.
    """
    start_time = time.time()
    while time.time() - start_time < _MAX_WAIT:
        try:
            docs_response = requests.get(f'{base_url}/documents', headers=headers, timeout=30)
            statuses = None
            if docs_response.status_code == 200:
                statuses = docs_response.json().get('statuses') or {}
        except requests.exceptions.RequestException as e:
            print(f" Connection error: {e}")
            time.sleep(_POLL_INTERVAL)
            continue

        elapsed = int(time.time() - start_time)
        if statuses is None:
            print(f" Status check returned HTTP {docs_response.status_code} ({elapsed}s)")
        else:
            done = _find_document(statuses.get('completed'), _PDF_NAME)
            if done is not None:
                print(f"\n✓ OCR processing completed in {elapsed} seconds!")
                print(f" File: {done.get('file_path')}")
                print(f" Size: {done.get('file_size')}")
                print(f" Chunks: {done.get('chunk_count')}")
                _report_search(headers)
                return True

            if _find_document(statuses.get('processing'), _PDF_NAME) is not None:
                print(f" Still processing... ({elapsed}s elapsed)")
            else:
                broken = _find_document(statuses.get('failed'), _PDF_NAME)
                if broken is not None:
                    print(f"\n✗ OCR processing failed after {elapsed}s!")
                    print(f" Error: {broken.get('error_msg', 'Unknown error')}")
                    return False
                # Not in any list yet, might be queued
                print(f" Waiting for processing to start... ({elapsed}s)")

        # Always pause between polls, including after a non-200 response.
        time.sleep(_POLL_INTERVAL)

    print(f"\n✗ OCR processing timed out after {_MAX_WAIT} seconds")
    return False


def test_ocr_pdf_simple():
    """Simple test for OCR PDF upload without clearing documents first.

    Logs in, uploads ``ocr.pdf`` from the current working directory, polls
    the document list for up to 15 minutes and, once the file is reported as
    completed, runs one sample search.

    Returns:
        True if the document reached the 'completed' status; False on login
        or upload failure, a missing track id, a 'failed' status, a timeout,
        or any unexpected exception (which is printed, not raised).
    """
    print("=== SIMPLE OCR PDF TEST ===")
    print("Testing ocr.pdf upload and processing")

    # NOTE(review): credentials are hard-coded in the test; consider moving
    # them to configuration.
    login_data = {'username': 'jleu3482', 'password': 'jleu1212'}
    try:
        login_response = requests.post(f'{base_url}/login', data=login_data, timeout=30)
        if login_response.status_code != 200:
            print(f"✗ Login failed: {login_response.text}")
            return False

        token = login_response.json().get('access_token')
        if not token:
            # Without this check the requests below would send "Bearer None".
            print("✗ Login failed: no access_token in response")
            return False
        headers = {'Authorization': f'Bearer {token}'}
        print("✓ Login successful")

        # Upload OCR PDF directly (skip clearing to avoid timeout)
        print("\n=== Uploading OCR PDF ===")
        print(f"File: ocr.pdf ({os.path.getsize(_PDF_NAME)} bytes)")
        with open(_PDF_NAME, 'rb') as f:
            files = {'file': (_PDF_NAME, f, 'application/pdf')}
            upload_response = requests.post(
                f'{base_url}/documents/upload', files=files, headers=headers, timeout=60
            )
        print(f"Upload status: {upload_response.status_code}")
        if upload_response.status_code != 200:
            print(f"✗ Upload failed: {upload_response.text}")
            return False

        upload_data = upload_response.json()
        print(f"Upload response: {upload_data}")
        if not upload_data.get('track_id'):
            print("✗ No track ID returned")
            return False

        print("\n=== Monitoring OCR Processing ===")
        print("Started OCR processing...")
        print("This may take several minutes for CPU-based OCR...")
        return _wait_for_ocr(headers)
    except Exception as e:
        # Top-level boundary: the caller only inspects the boolean result.
        print(f"✗ Error during OCR test: {e}")
        return False
if __name__ == "__main__":
    # Print the banner describing what the run does, then execute the test
    # and report a one-line verdict.
    for banner_line in (
        "Starting OCR PDF test...",
        "Note: This test uploads ocr.pdf with the scanned table",
        " and monitors processing for up to 15 minutes.",
        " OCR processing on CPU may be slow but more reliable.\n",
    ):
        print(banner_line)

    succeeded = test_ocr_pdf_simple()
    print(
        "\n🎉 SUCCESS: OCR PDF with scanned table processed and searchable!"
        if succeeded
        else "\n❌ OCR processing failed or timed out"
    )