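"""End-to-end check of the OCR workflow for a scanned-table PDF.

Logs in to the server at http://localhost:3015, uploads ``ocr.pdf`` from the
current directory, waits for it to be processed, runs a few sample queries
and prints the stored document details.
"""
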
import json
import os
import sys
import time

import requests


def _find_document(docs, file_path):
    """Return the first entry of *docs* whose 'file_path' equals *file_path*, or None."""
    return next((doc for doc in docs if doc.get('file_path') == file_path), None)


def _login(base_url, username, password):
    """POST the credentials to ``/login`` and return the auth headers.

    Returns a dict with the ``Authorization`` header on success, or None when
    the request fails, the status is not 200, or the response carries no
    ``access_token``.
    """
    try:
        response = requests.post(
            f'{base_url}/login',
            data={'username': username, 'password': password},
            timeout=30,
        )
        if response.status_code != 200:
            print(f'❌ Login failed: {response.status_code}')
            return None
        token = response.json().get('access_token')
    except Exception as exc:  # script boundary: report and fail the workflow
        print(f'❌ Login error: {exc}')
        return None

    # Without this guard a missing token would be sent as "Bearer None" and
    # only surface later as an unrelated-looking failure.
    if not token:
        print('❌ Login failed: response did not contain an access_token')
        return None

    print('✅ Login successful')
    return {'Authorization': f'Bearer {token}'}


def _upload(base_url, pdf_file, headers):
    """Upload *pdf_file* to ``/documents/upload``; return True on HTTP 200."""
    try:
        print(f"Uploading {pdf_file}...")
        with open(pdf_file, 'rb') as file:
            files = {'file': (pdf_file, file, 'application/pdf')}
            response = requests.post(
                f'{base_url}/documents/upload',
                files=files,
                headers=headers,
                timeout=60,
            )

        print(f"Upload Status: {response.status_code}")
        if response.status_code != 200:
            print(f'❌ Upload failed: {response.text}')
            return False

        upload_result = response.json()
        print('✅ Upload successful')
        print(f"Response: {json.dumps(upload_result, indent=2)}")
    except Exception as exc:  # script boundary: report and fail the workflow
        print(f'❌ Upload error: {exc}')
        return False
    return True


def _wait_for_processing(base_url, pdf_file, headers, max_wait=120, poll_interval=10):
    """Poll ``/documents`` until *pdf_file* is listed as completed or failed.

    Sleeps *poll_interval* seconds before each check and gives up once
    *max_wait* seconds of sleeping have accumulated. Returns True when the
    document appears under ``statuses['completed']``, False when it appears
    under ``statuses['failed']`` or the wait budget is exhausted. A failed
    status request is reported and polling continues.
    """
    waited = 0
    while waited < max_wait:
        print(f"Waiting for processing... ({waited}s/{max_wait}s)")
        time.sleep(poll_interval)
        waited += poll_interval

        try:
            response = requests.get(f'{base_url}/documents', headers=headers, timeout=30)
            if response.status_code != 200:
                print(f"⚠️ Status check failed: {response.status_code}")
                continue

            statuses = response.json().get('statuses', {})
            completed = statuses.get('completed', [])
            processing = statuses.get('processing', [])
            failed = statuses.get('failed', [])
            print(f"📊 Status - Completed: {len(completed)}, Processing: {len(processing)}, Failed: {len(failed)}")

            # NOTE(review): matching is by file_path only, so an entry left
            # over from an earlier upload of the same file would also match.
            done = _find_document(completed, pdf_file)
            if done is not None:
                print("✅ Document processing completed!")
                print(f"Document details: {json.dumps(done, indent=2)}")
                return True

            broken = _find_document(failed, pdf_file)
            if broken is not None:
                print(f"❌ Document processing failed: {broken.get('error_msg', 'Unknown error')}")
                return False
        except Exception as exc:  # transient errors must not abort polling
            print(f"⚠️ Status check error: {exc}")

    print("❌ Processing timeout - document not processed within expected time")
    return False


def _run_queries(base_url, queries, headers):
    """POST each query to ``/query`` and print the outcome.

    Returns the number of queries that were answered with HTTP 200.
    """
    answered = 0
    for query in queries:
        print(f"\nTesting query: '{query}'")
        try:
            response = requests.post(
                f'{base_url}/query',
                json={'query': query, 'top_k': 5},
                headers=headers,
                timeout=30,
            )
            if response.status_code != 200:
                print(f'❌ Search failed: {response.text}')
                continue

            results = response.json()
            print("✅ Search successful")
            answered += 1

            if not isinstance(results, dict):
                print(f"Unexpected response format: {type(results)}")
                print(f"Raw result: {results}")
                continue

            # str() keeps the printed text unchanged and makes the substring
            # test below safe when 'response' is not a string (e.g. null).
            response_text = str(results.get('response', 'No response field'))
            print(f"Response: {response_text}")

            if '[no-context]' in response_text:
                print("⚠️ No relevant content found in document")
            else:
                print("🎉 Content found and retrieved!")
        except Exception as exc:  # one bad query must not stop the others
            print(f'❌ Search error: {exc}')
    return answered


def _print_document_details(base_url, pdf_file, headers):
    """Fetch ``/documents`` and print a summary of *pdf_file* (best effort)."""
    try:
        response = requests.get(f'{base_url}/documents', headers=headers, timeout=30)
        if response.status_code != 200:
            print(f"⚠️ Could not get document details: HTTP {response.status_code}")
            return

        completed_docs = response.json().get('statuses', {}).get('completed', [])
        doc = _find_document(completed_docs, pdf_file)
        if doc is None:
            print(f"⚠️ Could not get document details: {pdf_file} not listed as completed")
            return

        print("📄 Document Summary:")
        print(f" - File: {doc.get('file_path')}")
        print(f" - Status: {doc.get('status')}")
        print(f" - Content Length: {doc.get('content_length', 'N/A')}")
        print(f" - Chunks Count: {doc.get('chunks_count', 'N/A')}")
        print(f" - Created: {doc.get('created_at')}")
        print(f" - Updated: {doc.get('updated_at')}")
    except Exception as exc:  # informational step only; never fails the run
        print(f"⚠️ Could not get document details: {exc}")


def test_ocr_workflow():
    """Test complete OCR workflow for scanned table PDF.

    Runs five steps against the server at ``http://localhost:3015`` using
    ``ocr.pdf`` from the current directory: login, upload, wait for
    processing, sample queries, and a document summary. Progress is printed
    to stdout.

    Returns:
        True when login, upload and processing succeed and at least one
        query is answered with HTTP 200; False otherwise (including when
        ``ocr.pdf`` is missing).
    """
    base_url = 'http://localhost:3015'
    pdf_file = 'ocr.pdf'

    print("=" * 60)
    print("OCR PDF WORKFLOW TEST - SCANNED TABLE DOCUMENT")
    print("=" * 60)

    # Check if ocr.pdf exists
    if not os.path.exists(pdf_file):
        print(f"❌ {pdf_file} not found in current directory")
        return False

    print(f"📄 Testing with: {pdf_file}")
    print(f"📊 File size: {os.path.getsize(pdf_file)} bytes")

    # Login first
    print("\n1. 🔐 AUTHENTICATION")
    # NOTE(review): credentials are hard-coded in the source; consider moving
    # them to configuration or the environment.
    headers = _login(base_url, 'jleu3482', 'jleu1212')
    if headers is None:
        return False

    # Upload ocr.pdf
    print("\n2. 📤 DOCUMENT UPLOAD")
    if not _upload(base_url, pdf_file, headers):
        return False

    # Wait for processing (2 minutes max, checked every 10 seconds)
    print("\n3. ⏳ WAITING FOR PROCESSING")
    if not _wait_for_processing(base_url, pdf_file, headers, max_wait=120, poll_interval=10):
        return False

    # Test search functionality
    print("\n4. 🔍 SEARCH FUNCTIONALITY TEST")

    # Test queries based on expected content from scanned table
    test_queries = [
        "table data",
        "document content",
        "information in the pdf",
        "what does this document contain",
    ]
    if _run_queries(base_url, test_queries, headers) == 0:
        # Previously the workflow reported success even when no query worked.
        print("❌ No search query completed successfully")
        return False

    # Get detailed document information
    print("\n5. 📋 DOCUMENT DETAILS")
    _print_document_details(base_url, pdf_file, headers)

    print("\n" + "=" * 60)
    print("WORKFLOW TEST COMPLETED")
    print("=" * 60)
    return True


if __name__ == "__main__":
    # Run the workflow when executed as a script and mirror its result in the
    # process exit status (0 = success, 1 = failure).
    success = test_ocr_workflow()
    sys.exit(0 if success else 1)