148 lines
7.4 KiB
Python
148 lines
7.4 KiB
Python
import requests
|
|
import time
|
|
import os
|
|
|
|
# Root URL of the server under test; every request in this script is built from it.
base_url = 'http://localhost:3015'
|
|
|
|
def wait_for_server(timeout=60, poll_interval=2):
    """Poll the server root URL until it answers with HTTP 200.

    Args:
        timeout: Maximum number of seconds to keep polling.
        poll_interval: Seconds to sleep between two attempts.

    Returns:
        True as soon as a GET on ``base_url`` returns status 200, False if
        that has not happened once ``timeout`` seconds have elapsed.
    """
    print("Waiting for server to start...")
    # A monotonic clock keeps the deadline immune to system clock changes.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            response = requests.get(f'{base_url}/', timeout=5)
        except requests.exceptions.RequestException:
            # Server is not reachable yet; only network-level failures are
            # ignored so Ctrl-C and programming errors still propagate.
            pass
        else:
            if response.status_code == 200:
                print("✓ Server is ready")
                return True
        time.sleep(poll_interval)
    print("✗ Server did not start within timeout")
    return False
|
|
|
|
def _find_document(documents, file_path):
    """Return the first entry whose 'file_path' equals *file_path*, else None."""
    return next(
        (doc for doc in documents if doc.get('file_path') == file_path),
        None,
    )


def _wait_for_ocr_completion(headers, file_path, max_attempts=60, poll_interval=10):
    """Poll ``/documents`` until *file_path* is listed as completed or failed.

    Returns the completed document entry, or None when the document is
    reported as failed or is still not completed after ``max_attempts`` polls.
    """
    for attempt in range(max_attempts):
        try:
            docs_response = requests.get(f'{base_url}/documents', headers=headers, timeout=10)
            if docs_response.status_code == 200:
                statuses = docs_response.json().get('statuses', {})

                finished = _find_document(statuses.get('completed', []), file_path)
                if finished is not None:
                    return finished

                if _find_document(statuses.get('processing', []), file_path) is not None:
                    print(f" Processing... ({attempt + 1}/{max_attempts})")
                else:
                    # Not in processing, check failed
                    broken = _find_document(statuses.get('failed', []), file_path)
                    if broken is not None:
                        print("✗ OCR processing failed!")
                        print(f" Error: {broken.get('error_msg', 'Unknown error')}")
                        return None
        except requests.exceptions.RequestException as e:
            print(f" Connection error (attempt {attempt + 1}/{max_attempts}): {e}")

        # Always pause between polls, whatever the outcome of this attempt.
        time.sleep(poll_interval)

    print("✗ OCR processing timed out")
    return None


def _run_search_checks(headers):
    """Send a few sample queries to ``/search`` and print the first two hits.

    Search problems are only reported; they never raise, so they cannot
    turn an already completed OCR run into a failure.
    """
    for query in ("table", "data", "information", "document"):
        print(f"Searching for: '{query}'")
        search_data = {'query': query, 'top_k': 5}
        try:
            search_response = requests.post(f'{base_url}/search', json=search_data, headers=headers, timeout=10)
            if search_response.status_code != 200:
                print(f" Search failed: {search_response.text}")
                continue
            results = search_response.json().get('results', [])
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers an undecodable JSON body.
            print(f" Search failed: {e}")
            continue

        print(f" Results: {len(results)}")
        for position, result in enumerate(results[:2], start=1):  # Show first 2 results
            score = result.get('score')
            # A hit without a numeric score must not crash the formatting.
            score_text = f"{score:.3f}" if isinstance(score, (int, float)) else "n/a"
            print(f" {position}. Score: {score_text}")
            print(f" Text: {result.get('text', '')[:100]}...")


def test_ocr_with_cpu_mode():
    """Upload ``ocr.pdf``, wait for OCR processing to finish, then run searches.

    Steps: wait for the server, log in, delete existing documents, upload
    ``ocr.pdf`` from the current directory, poll ``/documents`` until the file
    is completed or failed, and finally issue a few sample search queries.

    Returns:
        True when the document reaches the completed state; False on any
        failure (server not ready, login or upload rejected, missing track
        id, processing failed or timed out, or an unexpected exception).
    """
    file_path = 'ocr.pdf'
    print("=== TESTING OCR PDF WITH CPU-ONLY MODE ===")
    print("Target file: ocr.pdf (scanned table document)")

    # Wait for server
    if not wait_for_server():
        return False

    # Login
    # NOTE(review): credentials are hard-coded in the script; consider moving
    # them to configuration before sharing this file.
    login_data = {'username': 'jleu3482', 'password': 'jleu1212'}
    try:
        login_response = requests.post(f'{base_url}/login', data=login_data, timeout=10)
        if login_response.status_code != 200:
            print(f"✗ Login failed: {login_response.text}")
            return False

        token = login_response.json().get('access_token')
        headers = {'Authorization': f'Bearer {token}'}
        print("✓ Login successful")

        # Clear existing documents first
        print("=== Clearing existing documents ===")
        clear_response = requests.delete(f'{base_url}/documents', headers=headers, timeout=10)
        print(f"Clear status: {clear_response.status_code}")
        if clear_response.status_code == 200:
            print("✓ Documents cleared")

        # Upload OCR PDF
        print("\n=== Uploading OCR PDF ===")
        print(f"File: ocr.pdf ({os.path.getsize(file_path)} bytes)")

        with open(file_path, 'rb') as f:
            files = {'file': (file_path, f, 'application/pdf')}
            upload_response = requests.post(f'{base_url}/documents/upload', files=files, headers=headers, timeout=30)

        print(f"Upload status: {upload_response.status_code}")
        if upload_response.status_code != 200:
            print(f"✗ Upload failed: {upload_response.text}")
            return False

        upload_data = upload_response.json()
        print(f"Upload response: {upload_data}")
        if not upload_data.get('track_id'):
            print("✗ No track ID returned")
            return False

        print("\n=== Monitoring OCR Processing (CPU Mode) ===")
        print("Processing will be slower but more reliable...")

        # 60 polls, 10 seconds apart: about 10 minutes for CPU processing.
        document = _wait_for_ocr_completion(headers, file_path, max_attempts=60, poll_interval=10)
        if document is None:
            return False

        print("✓ OCR processing completed!")
        print(f" File: {document.get('file_path')}")
        print(f" Size: {document.get('file_size')}")
        print(f" Chunks: {document.get('chunk_count')}")

        # Now test search
        print("\n=== Testing Search ===")
        _run_search_checks(headers)
        return True

    except Exception as e:
        # Top-level boundary of the script: report and signal failure.
        print(f"✗ Error during OCR test: {e}")
        return False
|
|
|
|
if __name__ == "__main__":
    # Run the end-to-end OCR check and print a one-line verdict.
    outcome = test_ocr_with_cpu_mode()
    verdict = (
        "\n🎉 SUCCESS: OCR PDF with scanned table processed successfully!"
        if outcome
        else "\n❌ FAILED: OCR processing did not complete successfully"
    )
    print(verdict)