# File listing header (from the repository viewer):
# railseek6/test_ocr_cpu_mode.py
#
# 148 lines
# 7.4 KiB
# Python

import requests
import time
import os
# Root URL of the server under test; the requests in this script are built from it.
base_url = 'http://localhost:3015'
def wait_for_server(timeout=60):
    """Poll the server root URL until it answers with HTTP 200.

    Args:
        timeout: Maximum number of seconds to keep polling before giving up.

    Returns:
        True as soon as a GET on ``base_url`` returns status 200, False if
        that has not happened once ``timeout`` seconds have elapsed.
    """
    print("Waiting for server to start...")
    # monotonic() is immune to wall-clock adjustments during the wait.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            response = requests.get(f'{base_url}/', timeout=5)
        except requests.exceptions.RequestException:
            # Server not reachable yet (refused, timed out, ...): retry.
            # Deliberately narrower than a bare ``except`` so Ctrl-C and
            # genuine programming errors are not silently swallowed.
            pass
        else:
            if response.status_code == 200:
                print("✓ Server is ready")
                return True
        time.sleep(2)
    print("✗ Server did not start within timeout")
    return False
def _find_document(documents, file_path):
    """Return the first entry of ``documents`` whose 'file_path' matches, else None."""
    return next(
        (doc for doc in documents if doc.get('file_path') == file_path),
        None,
    )


def _run_search_queries(headers):
    """POST a few generic queries to /search and print the first two hits of each."""
    print("\n=== Testing Search ===")
    for query in ("table", "data", "information", "document"):
        print(f"Searching for: '{query}'")
        search_response = requests.post(
            f'{base_url}/search',
            json={'query': query, 'top_k': 5},
            headers=headers,
            timeout=10,
        )
        if search_response.status_code != 200:
            print(f" Search failed: {search_response.text}")
            continue
        results = search_response.json().get('results', [])
        print(f" Results: {len(results)}")
        for rank, result in enumerate(results[:2], start=1):  # Show first 2 results
            score = result.get('score')
            # A hit without a numeric score must not raise here: the resulting
            # TypeError would be reported as a failed OCR run by the caller.
            if isinstance(score, (int, float)):
                score_text = f"{score:.3f}"
            else:
                score_text = str(score)
            print(f" {rank}. Score: {score_text}")
            print(f" Text: {(result.get('text') or '')[:100]}...")


def test_ocr_with_cpu_mode():
    """Upload ``ocr.pdf`` and wait for the server to finish OCR-processing it.

    Steps, all against ``base_url``: wait for the server, log in, delete the
    existing documents, upload ``ocr.pdf`` from the current directory, poll
    ``/documents`` until the file shows up as completed or failed, then run a
    few search queries as a smoke test.

    Returns:
        True if the document reached the 'completed' list; False on any
        failure (server not ready, login/upload rejected, no track id,
        processing failed, polling timed out, or an unexpected exception).
    """
    print("=== TESTING OCR PDF WITH CPU-ONLY MODE ===")
    print("Target file: ocr.pdf (scanned table document)")

    if not wait_for_server():
        # Explicit False keeps the return type consistent with every other
        # path (a bare ``return`` would hand back None).
        return False

    # NOTE(review): credentials are hard-coded in the test; consider moving
    # them to environment variables or a local config file.
    login_data = {'username': 'jleu3482', 'password': 'jleu1212'}
    try:
        login_response = requests.post(f'{base_url}/login', data=login_data, timeout=10)
        if login_response.status_code != 200:
            print(f"✗ Login failed: {login_response.text}")
            return False
        token = login_response.json().get('access_token')
        headers = {'Authorization': f'Bearer {token}'}
        print("✓ Login successful")

        # Start from an empty document store so the status lists only
        # contain what this run uploads.
        print("=== Clearing existing documents ===")
        clear_response = requests.delete(f'{base_url}/documents', headers=headers, timeout=10)
        print(f"Clear status: {clear_response.status_code}")
        if clear_response.status_code == 200:
            print("✓ Documents cleared")

        print("\n=== Uploading OCR PDF ===")
        print(f"File: ocr.pdf ({os.path.getsize('ocr.pdf')} bytes)")
        with open('ocr.pdf', 'rb') as f:
            files = {'file': ('ocr.pdf', f, 'application/pdf')}
            upload_response = requests.post(
                f'{base_url}/documents/upload', files=files, headers=headers, timeout=30
            )
        print(f"Upload status: {upload_response.status_code}")
        if upload_response.status_code != 200:
            print(f"✗ Upload failed: {upload_response.text}")
            return False
        upload_data = upload_response.json()
        print(f"Upload response: {upload_data}")
        if not upload_data.get('track_id'):
            print("✗ No track ID returned")
            return False

        print("\n=== Monitoring OCR Processing (CPU Mode) ===")
        print("Processing will be slower but more reliable...")
        max_attempts = 60  # 60 polls x 10 s sleep = 10 minutes for CPU processing
        for attempt in range(max_attempts):
            try:
                docs_response = requests.get(f'{base_url}/documents', headers=headers, timeout=10)
                if docs_response.status_code == 200:
                    statuses = docs_response.json().get('statuses', {})

                    finished = _find_document(statuses.get('completed', []), 'ocr.pdf')
                    if finished is not None:
                        print("✓ OCR processing completed!")
                        print(f" File: {finished.get('file_path')}")
                        print(f" Size: {finished.get('file_size')}")
                        print(f" Chunks: {finished.get('chunk_count')}")
                        _run_search_queries(headers)
                        return True

                    if _find_document(statuses.get('processing', []), 'ocr.pdf') is not None:
                        print(f" Processing... ({attempt + 1}/{max_attempts})")
                    else:
                        # Not in processing, so it may already have failed.
                        broken = _find_document(statuses.get('failed', []), 'ocr.pdf')
                        if broken is not None:
                            print("✗ OCR processing failed!")
                            print(f" Error: {broken.get('error_msg', 'Unknown error')}")
                            return False
                time.sleep(10)  # Check every 10 seconds
            except requests.exceptions.RequestException as e:
                # Transient connection problems only cost one attempt.
                print(f" Connection error (attempt {attempt + 1}/{max_attempts}): {e}")
                time.sleep(10)
        print("✗ OCR processing timed out")
        return False
    except Exception as e:
        # Top-level boundary of the script: report anything unexpected
        # (missing ocr.pdf, malformed JSON, ...) as a failed run.
        print(f"✗ Error during OCR test: {e}")
        return False
if __name__ == "__main__":
    # Script entry point: run the end-to-end check and print a one-line verdict.
    success = test_ocr_with_cpu_mode()
    print(
        "\n🎉 SUCCESS: OCR PDF with scanned table processed successfully!"
        if success
        else "\n❌ FAILED: OCR processing did not complete successfully"
    )