Files
railseek6/final_ocr_validation_test.py

403 lines
16 KiB
Python

#!/usr/bin/env python3
"""
Final OCR PDF Upload and Search Validation Test
Comprehensive test that handles authentication and tests the complete workflow
"""
import requests
import json
import time
import sys
import os
import base64
from pathlib import Path
# Configuration
# Base URL of the server under test; every endpoint path is appended to it.
BASE_URL = "http://localhost:3015"
# PDF that gets uploaded. The path is relative, so it is resolved against the
# current working directory of the process running this script.
OCR_PDF_PATH = "ocr.pdf"
# Query text sent in the search request payload.
TEST_QUERY = "document processing"
# Upper bound, in seconds, on how long document processing is polled.
MAX_WAIT_TIME = 300
# Seconds slept between two consecutive processing-status polls.
POLL_INTERVAL = 10
# Authentication
# NOTE(review): these credentials are hard-coded in source; consider loading
# them from the environment before this file is shared or committed.
USERNAME = "jleu3482"
PASSWORD = "jleu1212"
class OCRWorkflowValidator:
    """End-to-end validator for the OCR PDF upload and search workflow.

    One instance owns a single ``requests.Session`` shared by every step,
    the document ID returned by the upload step, and the bearer token (when
    token authentication succeeded).  Each public step method logs its
    progress through :meth:`log_step` and returns ``True`` or ``False``.
    """

    def __init__(self):
        """Create the shared HTTP session; no request is sent here."""
        self.session = requests.Session()
        self.doc_id = None
        self.auth_token = None

    def log_step(self, message, status="INFO"):
        """Print ``message`` prefixed with a local timestamp and ``status``."""
        timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
        print(f"[{timestamp}] [{status}] {message}")

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _extract_doc_id(payload):
        """Return the document ID found in an upload response, or ``None``.

        The keys ``document_id``, ``id`` and ``doc_id`` are probed in that
        order and the first one holding a non-``None`` value wins.  Anything
        that is not a dict yields ``None``.
        """
        if not isinstance(payload, dict):
            return None
        for key in ("document_id", "id", "doc_id"):
            value = payload.get(key)
            if value is not None:
                return value
        return None

    @staticmethod
    def _result_preview(result, limit=100):
        """Return a printable preview of one search result.

        For a dict the ``content`` value is preferred, then ``text``, then the
        string form of the whole dict.  The chosen value is always converted
        to ``str`` so that ``None`` or numeric payloads cannot break slicing.
        Text longer than ``limit`` characters is cut and suffixed with "...".
        """
        content = result
        if isinstance(result, dict):
            for key in ("content", "text"):
                if result.get(key) is not None:
                    content = result[key]
                    break
        text = content if isinstance(content, str) else str(content)
        if len(text) > limit:
            return text[:limit] + "..."
        return text

    def _request_auth(self):
        """Return the per-request ``auth`` value for upload and search calls.

        An explicit ``auth=`` argument makes Requests overwrite the
        ``Authorization`` header, so basic credentials are supplied only when
        no bearer token was obtained; otherwise the token would be discarded.
        """
        if self.auth_token:
            return None
        return (USERNAME, PASSWORD)

    def _restore_headers(self, snapshot):
        """Reset the session headers to the previously captured ``snapshot``."""
        self.session.headers.clear()
        self.session.headers.update(snapshot)

    # ------------------------------------------------------------------
    # Workflow steps
    # ------------------------------------------------------------------
    def setup_authentication(self):
        """Try basic, token and custom-header authentication in that order.

        Returns:
            ``True`` as soon as one method gets a 200 response (or a token),
            ``False`` when every method failed.
        """
        self.log_step("Setting up authentication...")

        # Method 1: Basic Auth
        try:
            self.session.auth = (USERNAME, PASSWORD)
            # Test if basic auth works
            response = self.session.get(f"{BASE_URL}/", timeout=5)
            if response.status_code == 200:
                self.log_step("✓ Basic authentication configured")
                return True
        except Exception as e:
            self.log_step(f"Basic auth failed: {e}", "WARNING")

        # Method 2: Token-based auth (if available)
        token_endpoints = [
            f"{BASE_URL}/auth/token",
            f"{BASE_URL}/api/token",
            f"{BASE_URL}/token",
        ]
        for endpoint in token_endpoints:
            try:
                auth_data = {"username": USERNAME, "password": PASSWORD}
                response = self.session.post(endpoint, data=auth_data, timeout=5)
                if response.status_code != 200:
                    continue
                token_data = response.json()
                if 'access_token' in token_data:
                    self.auth_token = token_data['access_token']
                    # Session-level basic auth would overwrite the bearer
                    # header on every request, so drop it first.
                    self.session.auth = None
                    self.session.headers.update(
                        {'Authorization': f'Bearer {self.auth_token}'}
                    )
                    self.log_step("✓ Token authentication configured")
                    return True
            except (requests.exceptions.RequestException, ValueError, TypeError) as e:
                # Connection problems, non-JSON bodies and unexpected payload
                # shapes only disqualify this endpoint; keep probing.
                self.log_step(f"Token endpoint {endpoint} failed: {e}", "WARNING")
                continue

        # Method 3: Custom headers
        basic_value = base64.b64encode(f"{USERNAME}:{PASSWORD}".encode()).decode()
        auth_headers = [
            {'X-API-Key': PASSWORD},
            {'Authorization': f'Basic {basic_value}'},
            {'X-Username': USERNAME, 'X-Password': PASSWORD},
        ]
        # Snapshot the headers so a failed candidate never leaks into the
        # next attempt and the session's default headers survive the probing.
        baseline_headers = dict(self.session.headers)
        try:
            for headers in auth_headers:
                self._restore_headers(baseline_headers)
                self.session.headers.update(headers)
                response = self.session.get(f"{BASE_URL}/", timeout=5)
                if response.status_code == 200:
                    self.log_step("✓ Custom header authentication configured")
                    return True
        except Exception as e:
            self.log_step(f"Custom header auth failed: {e}", "WARNING")
        # Reset headers if none worked
        self._restore_headers(baseline_headers)

        self.log_step("✗ All authentication methods failed", "ERROR")
        return False

    def check_server_status(self):
        """Return ``True`` if any of the probe endpoints answers with 200."""
        self.log_step("Checking server status...")
        endpoints_to_try = [
            "/",
            "/health",
            "/api/health",
        ]
        for endpoint in endpoints_to_try:
            try:
                response = self.session.get(f"{BASE_URL}{endpoint}", timeout=5)
                self.log_step(f"✓ Endpoint {endpoint}: {response.status_code}")
                if response.status_code == 200:
                    return True
            except requests.exceptions.RequestException as e:
                self.log_step(f"✗ Endpoint {endpoint}: {e}", "WARNING")
                continue
        self.log_step("✗ No working endpoints found", "ERROR")
        return False

    def verify_ocr_pdf_exists(self):
        """Return ``True`` if the configured PDF exists and is not empty."""
        self.log_step("Verifying OCR PDF file...")
        if not os.path.exists(OCR_PDF_PATH):
            self.log_step(f"✗ OCR PDF file not found: {OCR_PDF_PATH}", "ERROR")
            return False
        file_size = os.path.getsize(OCR_PDF_PATH)
        if file_size == 0:
            self.log_step("✗ OCR PDF file is empty", "ERROR")
            return False
        self.log_step(f"✓ OCR PDF file verified ({file_size} bytes)")
        return True

    def test_webui_login(self):
        """Return ``True`` if ``/webui/`` answers with status 200."""
        self.log_step("Testing web UI access...")
        try:
            response = self.session.get(f"{BASE_URL}/webui/", timeout=10)
            if response.status_code == 200:
                self.log_step("✓ Web UI is accessible")
                return True
            self.log_step(f"✗ Web UI returned {response.status_code}", "WARNING")
            return False
        except Exception as e:
            self.log_step(f"✗ Web UI access failed: {e}", "WARNING")
            return False

    def upload_ocr_pdf_direct(self):
        """Upload the PDF as multipart form data, trying several endpoints.

        On the first endpoint that answers 200/201 with a JSON body the
        document ID (if present) is stored in ``self.doc_id``.

        Returns:
            ``True`` after the first successful upload, ``False`` when the
            file is missing/empty or every endpoint failed.
        """
        self.log_step("Uploading OCR PDF file directly...")
        if not self.verify_ocr_pdf_exists():
            return False

        # Try multiple upload endpoints with different auth methods
        upload_endpoints = [
            f"{BASE_URL}/documents/upload",
            f"{BASE_URL}/upload",
            f"{BASE_URL}/api/upload",
        ]
        try:
            with open(OCR_PDF_PATH, 'rb') as file:
                for endpoint in upload_endpoints:
                    try:
                        self.log_step(f"Trying upload endpoint: {endpoint}")
                        # The previous attempt consumed the stream; without
                        # rewinding, later endpoints would get an empty file.
                        file.seek(0)
                        files = {
                            'file': (
                                os.path.basename(OCR_PDF_PATH),
                                file,
                                'application/pdf',
                            )
                        }
                        response = self.session.post(
                            endpoint,
                            files=files,
                            auth=self._request_auth(),
                            timeout=30,
                        )
                        if response.status_code not in (200, 201):
                            self.log_step(
                                f"Upload endpoint {endpoint} returned {response.status_code}: {response.text}",
                                "WARNING",
                            )
                            continue
                        try:
                            result = response.json()
                        except ValueError as e:
                            # A success status with a non-JSON body is not a
                            # usable upload response; try the next endpoint.
                            self.log_step(
                                f"Upload endpoint {endpoint} returned a non-JSON body: {e}",
                                "WARNING",
                            )
                            continue
                        self.log_step("✓ OCR PDF upload successful")
                        # Extract document ID from response
                        extracted_id = self._extract_doc_id(result)
                        if extracted_id is not None:
                            self.doc_id = extracted_id
                        if self.doc_id:
                            self.log_step(f"Document ID: {self.doc_id}")
                        return True
                    except requests.exceptions.RequestException as e:
                        self.log_step(f"Upload endpoint {endpoint} failed: {e}", "WARNING")
                        continue
            self.log_step("✗ All upload endpoints failed", "ERROR")
            return False
        except Exception as e:
            self.log_step(f"✗ Upload failed: {e}", "ERROR")
            return False

    def check_document_processing(self):
        """Poll ``/documents`` until the uploaded document reaches a final state.

        Polling stops after ``MAX_WAIT_TIME`` seconds, sleeping
        ``POLL_INTERVAL`` seconds between attempts.

        Returns:
            ``True`` when the status is completed/processed/indexed, ``False``
            on a failed/error status, a missing document ID, or a timeout.
        """
        self.log_step("Checking document processing status...")
        if not self.doc_id:
            self.log_step("✗ No document ID available", "ERROR")
            return False

        start_time = time.time()
        while time.time() - start_time < MAX_WAIT_TIME:
            try:
                # Check documents list
                response = self.session.get(f"{BASE_URL}/documents", timeout=10)
                if response.status_code == 200:
                    documents = response.json()
                    if isinstance(documents, list):
                        for doc in documents:
                            if str(doc.get('id')) != str(self.doc_id):
                                continue
                            status = doc.get('status', 'unknown')
                            self.log_step(f"Document status: {status}")
                            if status in ('completed', 'processed', 'indexed'):
                                self.log_step("✓ Document processing completed")
                                return True
                            if status in ('failed', 'error'):
                                self.log_step(f"✗ Processing failed: {status}", "ERROR")
                                return False
                            if status in ('processing', 'indexing'):
                                self.log_step(f"Still processing... ({status})")
                            # The matching document was found; stop scanning.
                            break
                elapsed = int(time.time() - start_time)
                self.log_step(f"Waiting for processing... ({elapsed}s elapsed)")
                time.sleep(POLL_INTERVAL)
            except Exception as e:
                self.log_step(f"Error checking status: {e}", "WARNING")
                time.sleep(POLL_INTERVAL)

        self.log_step("✗ Processing timeout reached", "ERROR")
        return False

    def test_search_functionality(self):
        """POST the test query to several search endpoints.

        Returns:
            ``True`` for the first endpoint that answers 200 with a non-empty
            JSON list (previews of up to three results are logged), ``False``
            when no endpoint produced results.
        """
        self.log_step("Testing search functionality...")
        search_payload = {
            "query": TEST_QUERY,
            "top_k": 5,
        }
        search_endpoints = [
            f"{BASE_URL}/search",
            f"{BASE_URL}/query",
            f"{BASE_URL}/api/search",
        ]
        for endpoint in search_endpoints:
            try:
                self.log_step(f"Testing search endpoint: {endpoint}")
                response = self.session.post(
                    endpoint,
                    json=search_payload,
                    auth=self._request_auth(),
                    timeout=15,
                )
                if response.status_code != 200:
                    self.log_step(
                        f"Search endpoint returned {response.status_code}: {response.text}",
                        "WARNING",
                    )
                    continue
                results = response.json()
                self.log_step("✓ Search request successful")
                # Validate search results structure
                if isinstance(results, list) and len(results) > 0:
                    self.log_step(f"✓ Search returned {len(results)} results")
                    # Check if results contain relevant content
                    for i, result in enumerate(results[:3]):
                        content_preview = self._result_preview(result)
                        self.log_step(f"Result {i+1}: {content_preview}")
                    return True
                # Continue to next endpoint
                self.log_step("✗ Search returned no results", "WARNING")
            except (requests.exceptions.RequestException, ValueError) as e:
                # ValueError covers a 200 response whose body is not JSON.
                self.log_step(f"Search endpoint {endpoint} failed: {e}", "WARNING")
                continue
        self.log_step("✗ All search endpoints failed", "ERROR")
        return False

    def verify_database_integration(self):
        """Return ``True`` if ``/documents/<doc_id>`` answers 200 with JSON.

        The returned document details are logged as indented JSON.  Without a
        document ID, or on any failure, a warning is logged and ``False`` is
        returned.
        """
        self.log_step("Verifying database integration...")
        # Check if we can access document details
        if self.doc_id:
            try:
                response = self.session.get(
                    f"{BASE_URL}/documents/{self.doc_id}", timeout=10
                )
                if response.status_code == 200:
                    doc_details = response.json()
                    self.log_step("✓ Document details accessible")
                    self.log_step(f"Document metadata: {json.dumps(doc_details, indent=2)}")
                    return True
            except Exception as e:
                self.log_step(f"Document details check failed: {e}", "WARNING")
        self.log_step("✗ Database integration verification incomplete", "WARNING")
        return False

    def run_complete_validation(self):
        """Run every workflow step in order and log a summary report.

        A failing step does not stop the run; the remaining steps still
        execute so the report is as complete as possible.

        Returns:
            ``True`` when all steps passed or at most two failed, ``False``
            otherwise.
        """
        self.log_step("Starting Final OCR PDF Upload and Search Validation")
        self.log_step("=" * 60)
        steps = [
            ("Server Status Check", self.check_server_status),
            ("Authentication Setup", self.setup_authentication),
            ("Web UI Access Test", self.test_webui_login),
            ("OCR PDF Verification", self.verify_ocr_pdf_exists),
            ("PDF Upload", self.upload_ocr_pdf_direct),
            ("Document Processing", self.check_document_processing),
            ("Search Functionality", self.test_search_functionality),
            ("Database Integration", self.verify_database_integration),
        ]
        results = []
        for step_name, step_func in steps:
            self.log_step(f"Executing: {step_name}")
            success = step_func()
            results.append((step_name, success))
            if not success:
                # Don't break, continue to gather more information
                self.log_step(f"✗ Workflow failed at: {step_name}", "ERROR")

        # Generate final report
        self.log_step("=" * 60)
        self.log_step("FINAL VALIDATION RESULTS SUMMARY")
        self.log_step("=" * 60)
        total = len(results)
        for step_name, success in results:
            status = "✓ PASS" if success else "✗ FAIL"
            self.log_step(f"{step_name}: {status}")
        passed = sum(1 for _, success in results if success)
        success_rate = (passed / total) * 100
        self.log_step(f"Success Rate: {passed}/{total} ({success_rate:.1f}%)")

        if passed == total:
            self.log_step("🎉 COMPLETE WORKFLOW VALIDATION SUCCESSFUL!", "SUCCESS")
            return True
        if passed >= total - 2:  # Allow 2 failures for non-critical steps
            self.log_step("⚠️ PARTIAL WORKFLOW VALIDATION - Most functionality working", "WARNING")
            return True
        self.log_step("❌ WORKFLOW VALIDATION FAILED - Major issues detected", "ERROR")
        return False
def main():
    """Run the validator and turn its outcome into a process exit code.

    Exits with 0 when the validation reports success and with 1 otherwise,
    including when the run is interrupted by the user or raises an
    unexpected exception (both cases are logged first).
    """
    runner = OCRWorkflowValidator()

    try:
        exit_code = 0 if runner.run_complete_validation() else 1
    except KeyboardInterrupt:
        runner.log_step("Validation interrupted by user", "WARNING")
        exit_code = 1
    except Exception as err:
        runner.log_step(f"Unexpected error: {err}", "ERROR")
        exit_code = 1

    sys.exit(exit_code)


# Only run the workflow when executed as a script, not when imported.
if __name__ == "__main__":
    main()