403 lines
16 KiB
Python
403 lines
16 KiB
Python
#!/usr/bin/env python3
|
|
"""
Final OCR PDF Upload and Search Validation Test

Comprehensive test that handles authentication and tests the complete workflow.
"""
|
|
|
|
import requests
|
|
import json
|
|
import time
|
|
import sys
|
|
import os
|
|
import base64
|
|
from pathlib import Path
|
|
|
|
# Configuration
# The server address can be overridden through the environment so the script
# can target another deployment without editing this file.
BASE_URL = os.environ.get("OCR_TEST_BASE_URL", "http://localhost:3015")
OCR_PDF_PATH = "ocr.pdf"
TEST_QUERY = "document processing"
MAX_WAIT_TIME = 300  # seconds to wait for document processing before giving up
POLL_INTERVAL = 10  # seconds to sleep between processing-status polls

# Authentication
# NOTE(review): the fallback credentials below are committed in source. Prefer
# supplying OCR_TEST_USERNAME / OCR_TEST_PASSWORD in the environment and
# consider rotating these defaults.
USERNAME = os.environ.get("OCR_TEST_USERNAME", "jleu3482")
PASSWORD = os.environ.get("OCR_TEST_PASSWORD", "jleu1212")
|
|
|
|
class OCRWorkflowValidator:
    """Run an upload -> processing -> search check against the server at BASE_URL.

    The validator keeps one ``requests.Session`` for the whole run, remembers
    the identifier of the uploaded document in ``doc_id`` and, when token
    authentication succeeds, the bearer token in ``auth_token``.  Every step
    method returns ``True`` on success and ``False`` on failure and reports
    progress through :meth:`log_step`.
    """

    # Response keys probed, in this order, for the uploaded document's ID.
    _DOC_ID_KEYS = ("document_id", "id", "doc_id")

    def __init__(self):
        """Create the HTTP session and reset the per-run state."""
        self.session = requests.Session()
        self.doc_id = None
        self.auth_token = None

    def log_step(self, message, status="INFO"):
        """Print ``message`` prefixed with a local timestamp and ``status``."""
        timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
        print(f"[{timestamp}] [{status}] {message}")

    # ------------------------------------------------------------------
    # Authentication
    # ------------------------------------------------------------------
    def setup_authentication(self) -> bool:
        """Try basic, token and custom-header authentication in that order.

        Returns:
            True as soon as one method gets a 200 from the server, False when
            none does.  On failure the basic credentials are left on the
            session so the remaining steps still run best-effort.
        """
        self.log_step("Setting up authentication...")

        if (
            self._try_basic_auth()
            or self._try_token_auth()
            or self._try_header_auth()
        ):
            return True

        # Later steps still run after a failed auth setup; keep sending the
        # basic credentials with them, as the session did before.
        self.session.auth = (USERNAME, PASSWORD)
        self.log_step("✗ All authentication methods failed", "ERROR")
        return False

    def _try_basic_auth(self) -> bool:
        """Probe the root URL with session-level HTTP basic auth."""
        self.session.auth = (USERNAME, PASSWORD)
        try:
            response = self.session.get(f"{BASE_URL}/", timeout=5)
        except requests.exceptions.RequestException as exc:
            self.log_step(f"Basic auth failed: {exc}", "WARNING")
        else:
            if response.status_code == 200:
                self.log_step("✓ Basic authentication configured")
                return True

        # Session-level auth overrides any Authorization header, so it must
        # be removed before a bearer token or custom header can take effect.
        self.session.auth = None
        return False

    def _try_token_auth(self) -> bool:
        """Request a bearer token from the known token endpoints."""
        credentials = {"username": USERNAME, "password": PASSWORD}
        token_endpoints = [
            f"{BASE_URL}/auth/token",
            f"{BASE_URL}/api/token",
            f"{BASE_URL}/token",
        ]

        for endpoint in token_endpoints:
            try:
                response = self.session.post(endpoint, data=credentials, timeout=5)
                if response.status_code != 200:
                    continue
                token_data = response.json()
            except (requests.exceptions.RequestException, ValueError) as exc:
                # ValueError covers a 200 response whose body is not JSON.
                self.log_step(f"Token auth failed: {exc}", "WARNING")
                continue

            token = token_data.get("access_token") if isinstance(token_data, dict) else None
            if token:
                self.auth_token = token
                self.session.headers.update({"Authorization": f"Bearer {token}"})
                self.log_step("✓ Token authentication configured")
                return True

        return False

    def _try_header_auth(self) -> bool:
        """Probe the root URL with a few custom authentication headers.

        Each candidate is sent only with its own probe request; it is stored
        on the session only once the server accepted it, so rejected
        candidates never leak into later requests and the session's default
        headers are left intact.
        """
        basic_value = base64.b64encode(f"{USERNAME}:{PASSWORD}".encode()).decode()
        candidates = [
            {"X-API-Key": PASSWORD},
            {"Authorization": f"Basic {basic_value}"},
            {"X-Username": USERNAME, "X-Password": PASSWORD},
        ]

        for headers in candidates:
            try:
                response = self.session.get(f"{BASE_URL}/", headers=headers, timeout=5)
            except requests.exceptions.RequestException as exc:
                self.log_step(f"Custom header auth failed: {exc}", "WARNING")
                return False
            if response.status_code == 200:
                self.session.headers.update(headers)
                self.log_step("✓ Custom header authentication configured")
                return True

        return False

    def _request_auth(self):
        """Return explicit per-request credentials, or None to use the session's.

        Passing basic credentials on a request replaces the Authorization
        header, so they are withheld once a bearer token has been obtained.
        """
        if self.auth_token:
            return None
        return (USERNAME, PASSWORD)

    # ------------------------------------------------------------------
    # Individual workflow steps
    # ------------------------------------------------------------------
    def check_server_status(self) -> bool:
        """Return True when any of the known health endpoints answers 200."""
        self.log_step("Checking server status...")

        for endpoint in ("/", "/health", "/api/health"):
            try:
                response = self.session.get(f"{BASE_URL}{endpoint}", timeout=5)
            except requests.exceptions.RequestException as exc:
                self.log_step(f"✗ Endpoint {endpoint}: {exc}", "WARNING")
                continue

            if response.status_code == 200:
                self.log_step(f"✓ Endpoint {endpoint}: {response.status_code}")
                return True
            # A reachable endpoint with a non-200 status is not a success.
            self.log_step(f"✗ Endpoint {endpoint}: {response.status_code}", "WARNING")

        self.log_step("✗ No working endpoints found", "ERROR")
        return False

    def verify_ocr_pdf_exists(self) -> bool:
        """Check that OCR_PDF_PATH is an existing, non-empty file.

        Only presence and size are checked; the content is not parsed.
        """
        self.log_step("Verifying OCR PDF file...")

        if not os.path.isfile(OCR_PDF_PATH):
            self.log_step(f"✗ OCR PDF file not found: {OCR_PDF_PATH}", "ERROR")
            return False

        file_size = os.path.getsize(OCR_PDF_PATH)
        if file_size == 0:
            self.log_step("✗ OCR PDF file is empty", "ERROR")
            return False

        self.log_step(f"✓ OCR PDF file verified ({file_size} bytes)")
        return True

    def test_webui_login(self) -> bool:
        """Return True when ``GET {BASE_URL}/webui/`` answers 200."""
        self.log_step("Testing web UI access...")
        try:
            response = self.session.get(f"{BASE_URL}/webui/", timeout=10)
        except requests.exceptions.RequestException as exc:
            self.log_step(f"✗ Web UI access failed: {exc}", "WARNING")
            return False

        if response.status_code == 200:
            self.log_step("✓ Web UI is accessible")
            return True

        self.log_step(f"✗ Web UI returned {response.status_code}", "WARNING")
        return False

    def upload_ocr_pdf_direct(self) -> bool:
        """Upload the PDF as multipart form data, trying each upload endpoint.

        Returns:
            True once an endpoint answers 200/201 with a JSON body carrying a
            document ID (stored in ``self.doc_id``); False otherwise.
        """
        self.log_step("Uploading OCR PDF file directly...")

        if not self.verify_ocr_pdf_exists():
            return False

        upload_endpoints = [
            f"{BASE_URL}/documents/upload",
            f"{BASE_URL}/upload",
            f"{BASE_URL}/api/upload",
        ]

        try:
            with open(OCR_PDF_PATH, "rb") as file:
                files = {"file": (os.path.basename(OCR_PDF_PATH), file, "application/pdf")}
                for endpoint in upload_endpoints:
                    # A previous attempt consumed the handle; rewind so this
                    # endpoint receives the whole file instead of zero bytes.
                    file.seek(0)
                    if self._upload_to(endpoint, files):
                        return True
        except OSError as exc:
            self.log_step(f"✗ Upload failed: {exc}", "ERROR")
            return False

        self.log_step("✗ All upload endpoints failed", "ERROR")
        return False

    def _upload_to(self, endpoint, files) -> bool:
        """POST ``files`` to one endpoint; True when a document ID came back."""
        self.log_step(f"Trying upload endpoint: {endpoint}")
        try:
            response = self.session.post(
                endpoint,
                files=files,
                auth=self._request_auth(),
                timeout=30,
            )
        except requests.exceptions.RequestException as exc:
            self.log_step(f"Upload endpoint {endpoint} failed: {exc}", "WARNING")
            return False

        if response.status_code not in (200, 201):
            self.log_step(
                f"Upload endpoint {endpoint} returned {response.status_code}: {response.text}",
                "WARNING",
            )
            return False

        try:
            result = response.json()
        except ValueError as exc:
            # A non-JSON success body must not abort the remaining endpoints.
            self.log_step(f"Upload endpoint {endpoint} returned invalid JSON: {exc}", "WARNING")
            return False

        self.log_step("✓ OCR PDF upload successful")

        doc_id = self._extract_doc_id(result)
        if doc_id:
            self.doc_id = doc_id
            self.log_step(f"Document ID: {self.doc_id}")
            return True

        self.log_step(f"Upload endpoint {endpoint} response contained no document ID", "WARNING")
        return False

    def _extract_doc_id(self, payload):
        """Return the first document ID found under the known keys, else None."""
        if not isinstance(payload, dict):
            return None
        for key in self._DOC_ID_KEYS:
            if key in payload:
                return payload[key]
        return None

    def check_document_processing(self) -> bool:
        """Poll ``/documents`` until the uploaded document reaches a final state.

        Polls every POLL_INTERVAL seconds for at most MAX_WAIT_TIME seconds.

        Returns:
            True when the document's status is completed/processed/indexed;
            False when it is failed/error, when no document ID is known, or
            when the time limit is reached.
        """
        self.log_step("Checking document processing status...")

        if not self.doc_id:
            self.log_step("✗ No document ID available", "ERROR")
            return False

        start_time = time.time()
        while time.time() - start_time < MAX_WAIT_TIME:
            try:
                response = self.session.get(f"{BASE_URL}/documents", timeout=10)
                status = None
                if response.status_code == 200:
                    status = self._status_from_listing(response.json())
            except (requests.exceptions.RequestException, ValueError) as exc:
                self.log_step(f"Error checking status: {exc}", "WARNING")
            else:
                if status is not None:
                    self.log_step(f"Document status: {status}")
                    if status in ("completed", "processed", "indexed"):
                        self.log_step("✓ Document processing completed")
                        return True
                    if status in ("failed", "error"):
                        self.log_step(f"✗ Processing failed: {status}", "ERROR")
                        return False
                    if status in ("processing", "indexing"):
                        self.log_step(f"Still processing... ({status})")

                elapsed = int(time.time() - start_time)
                self.log_step(f"Waiting for processing... ({elapsed}s elapsed)")

            time.sleep(POLL_INTERVAL)

        self.log_step("✗ Processing timeout reached", "ERROR")
        return False

    def _status_from_listing(self, documents):
        """Return the status of the uploaded document in a listing, else None.

        Only a JSON list of objects is understood; an entry matches when its
        ``id`` equals ``self.doc_id`` (compared as strings).  A matching entry
        without a ``status`` key yields ``'unknown'``.
        """
        if not isinstance(documents, list):
            return None
        wanted = str(self.doc_id)
        for doc in documents:
            if isinstance(doc, dict) and str(doc.get("id")) == wanted:
                return doc.get("status", "unknown")
        return None

    def test_search_functionality(self) -> bool:
        """POST TEST_QUERY to each search endpoint until one returns results.

        Returns:
            True when an endpoint answers 200 with a non-empty JSON list,
            False when every endpoint fails or returns nothing.
        """
        self.log_step("Testing search functionality...")

        search_payload = {"query": TEST_QUERY, "top_k": 5}
        search_endpoints = [
            f"{BASE_URL}/search",
            f"{BASE_URL}/query",
            f"{BASE_URL}/api/search",
        ]

        for endpoint in search_endpoints:
            self.log_step(f"Testing search endpoint: {endpoint}")
            try:
                response = self.session.post(
                    endpoint,
                    json=search_payload,
                    auth=self._request_auth(),
                    timeout=15,
                )
                if response.status_code != 200:
                    self.log_step(
                        f"Search endpoint returned {response.status_code}: {response.text}",
                        "WARNING",
                    )
                    continue
                results = response.json()
            except (requests.exceptions.RequestException, ValueError) as exc:
                self.log_step(f"Search endpoint {endpoint} failed: {exc}", "WARNING")
                continue

            self.log_step("✓ Search request successful")

            if not (isinstance(results, list) and results):
                self.log_step("✗ Search returned no results", "WARNING")
                continue  # try the next endpoint

            self.log_step(f"✓ Search returned {len(results)} results")
            for index, result in enumerate(results[:3], start=1):
                self.log_step(f"Result {index}: {self._preview(result)}")
            return True

        self.log_step("✗ All search endpoints failed", "ERROR")
        return False

    @staticmethod
    def _preview(result, limit=100):
        """Return at most ``limit`` characters describing one search result."""
        if isinstance(result, dict):
            content = result.get("content", result.get("text", str(result)))
        else:
            content = result
        # The field may hold None or a nested structure; always slice text.
        text = content if isinstance(content, str) else str(content)
        return text[:limit] + "..." if len(text) > limit else text

    def verify_database_integration(self) -> bool:
        """Return True when ``/documents/{doc_id}`` answers 200 with JSON."""
        self.log_step("Verifying database integration...")

        if self.doc_id:
            try:
                response = self.session.get(f"{BASE_URL}/documents/{self.doc_id}", timeout=10)
                if response.status_code == 200:
                    doc_details = response.json()
                    self.log_step("✓ Document details accessible")
                    self.log_step(f"Document metadata: {json.dumps(doc_details, indent=2)}")
                    return True
                self.log_step(
                    f"Document details returned {response.status_code}", "WARNING"
                )
            except (requests.exceptions.RequestException, ValueError) as exc:
                self.log_step(f"Document details check failed: {exc}", "WARNING")

        self.log_step("✗ Database integration verification incomplete", "WARNING")
        return False

    # ------------------------------------------------------------------
    # Orchestration
    # ------------------------------------------------------------------
    def run_complete_validation(self) -> bool:
        """Run every step, print a summary and return the overall verdict.

        All steps are executed even after a failure so the report is complete.
        A step that raises is recorded as failed instead of aborting the run.

        Returns:
            True when at most two steps failed, False otherwise.
        """
        self.log_step("Starting Final OCR PDF Upload and Search Validation")
        self.log_step("=" * 60)

        steps = [
            ("Server Status Check", self.check_server_status),
            ("Authentication Setup", self.setup_authentication),
            ("Web UI Access Test", self.test_webui_login),
            ("OCR PDF Verification", self.verify_ocr_pdf_exists),
            ("PDF Upload", self.upload_ocr_pdf_direct),
            ("Document Processing", self.check_document_processing),
            ("Search Functionality", self.test_search_functionality),
            ("Database Integration", self.verify_database_integration),
        ]

        results = []
        for step_name, step_func in steps:
            self.log_step(f"Executing: {step_name}")
            try:
                success = step_func()
            except Exception as exc:  # boundary: keep gathering information
                self.log_step(f"✗ {step_name} raised an unexpected error: {exc}", "ERROR")
                success = False
            results.append((step_name, success))

            if not success:
                # Deliberately no break: later steps still add information.
                self.log_step(f"✗ Workflow failed at: {step_name}", "ERROR")

        self.log_step("=" * 60)
        self.log_step("FINAL VALIDATION RESULTS SUMMARY")
        self.log_step("=" * 60)

        for step_name, success in results:
            status = "✓ PASS" if success else "✗ FAIL"
            self.log_step(f"{step_name}: {status}")

        total = len(results)
        passed = sum(1 for _, success in results if success)
        success_rate = (passed / total) * 100
        self.log_step(f"Success Rate: {passed}/{total} ({success_rate:.1f}%)")

        if passed == total:
            self.log_step("🎉 COMPLETE WORKFLOW VALIDATION SUCCESSFUL!", "SUCCESS")
            return True
        if passed >= total - 2:
            # NOTE(review): any two failures are tolerated here, regardless of
            # which steps failed; confirm that is the intended policy.
            self.log_step("⚠️ PARTIAL WORKFLOW VALIDATION - Most functionality working", "WARNING")
            return True

        self.log_step("❌ WORKFLOW VALIDATION FAILED - Major issues detected", "ERROR")
        return False
|
|
|
|
def main():
    """Run the validation workflow and exit with a process status code.

    Exits with 0 when ``run_complete_validation`` returns a truthy value and
    with 1 otherwise, including when the run is interrupted from the keyboard
    or fails with an unexpected exception.
    """
    validator = OCRWorkflowValidator()

    # Only the workflow itself is guarded; sys.exit stays outside the try so
    # the handlers below deal purely with workflow failures.
    try:
        success = validator.run_complete_validation()
    except KeyboardInterrupt:
        validator.log_step("Validation interrupted by user", "WARNING")
        sys.exit(1)
    except Exception as e:
        validator.log_step(f"Unexpected error: {e}", "ERROR")
        sys.exit(1)

    sys.exit(0 if success else 1)
|
|
|
|
# Run the validation only when executed as a script, not when imported.
if __name__ == "__main__":
    main()