Files
railseek6/test_ocr_workflow_fixed.py

323 lines
12 KiB
Python

#!/usr/bin/env python3
"""
Fixed OCR PDF Upload and Search Validation Script
Uses correct endpoints and handles current server state
"""
import requests
import json
import time
import sys
import os
from pathlib import Path
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Root URL of the server under test; endpoint paths are appended to it.
BASE_URL = "http://localhost:3015"

# PDF that gets uploaded (a relative path resolves against the working dir).
OCR_PDF_PATH = "ocr.pdf"

# Query text posted to the search endpoints once indexing has finished.
TEST_QUERY = "document processing"

# Upper bound, in seconds, on how long to wait for indexing to complete.
MAX_WAIT_TIME = 5 * 60

# Pause, in seconds, between two consecutive indexing-status polls.
POLL_INTERVAL = 10
class OCRWorkflowValidator:
    """Smoke-test the upload -> index -> search workflow of the server at BASE_URL.

    The validator uploads the PDF at ``OCR_PDF_PATH``, polls until the server
    reports the document as indexed (or ``MAX_WAIT_TIME`` elapses) and finally
    posts ``TEST_QUERY`` to a handful of candidate search endpoints.

    Every step method returns ``True`` on success and ``False`` on failure and
    reports progress through :meth:`log_step`; none of them raises for an
    ordinary network or decoding problem.

    NOTE(review): the response schemas handled here (``document_id`` / ``id`` /
    ``doc_id`` keys, ``status`` / ``indexed`` / ``message`` fields, list-shaped
    search results) are the ones this script has always probed for; they are
    not confirmed against the server's API documentation.
    """

    # Lower-cased status strings, shared by the status endpoint and the
    # documents-list fallback so both agree on what "indexed" means.
    _DONE_STATUSES = frozenset({"completed", "done", "indexed"})
    _BUSY_STATUSES = frozenset({"processing", "indexing"})
    _FAILED_STATUSES = frozenset({"failed", "error"})

    # Keys that may carry the document identifier, in order of preference.
    _DOC_ID_KEYS = ("document_id", "id", "doc_id")

    # Maximum number of characters of a search hit shown in the log.
    _PREVIEW_CHARS = 100

    def __init__(self):
        # One session so that connections are reused across all requests.
        self.session = requests.Session()
        # Identifier of the uploaded document; set by upload_ocr_pdf().
        self.doc_id = None

    def log_step(self, message, status="INFO"):
        """Print ``message`` prefixed with a local timestamp and ``status``."""
        timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
        print(f"[{timestamp}] [{status}] {message}")

    # ------------------------------------------------------------------
    # Pure helpers (no I/O)
    # ------------------------------------------------------------------
    @classmethod
    def _extract_doc_id(cls, payload):
        """Return the first non-empty document id found in ``payload``, else None.

        Anything that is not a dict (list, string, ``None``...) yields ``None``
        instead of raising.
        """
        if not isinstance(payload, dict):
            return None
        for key in cls._DOC_ID_KEYS:
            value = payload.get(key)
            if value:
                return value
        return None

    @classmethod
    def _classify_status(cls, record):
        """Classify a status record as ``"done"``, ``"failed"``, ``"busy"`` or ``"unknown"``.

        ``record["status"]`` is compared case-insensitively and only when it is
        a string. An ``indexed`` field that is exactly ``True`` also counts as
        done, unless the status string explicitly reports a failure.
        """
        if not isinstance(record, dict):
            return "unknown"
        raw = record.get("status")
        status = raw.lower() if isinstance(raw, str) else None
        if status in cls._DONE_STATUSES:
            return "done"
        if status in cls._FAILED_STATUSES:
            return "failed"
        if record.get("indexed") is True:
            return "done"
        if status in cls._BUSY_STATUSES:
            return "busy"
        return "unknown"

    @classmethod
    def _preview(cls, result):
        """Return at most ``_PREVIEW_CHARS`` characters describing a search hit.

        For a dict the ``content`` field is used, then ``text``, then the dict
        itself; the chosen value is always converted with ``str`` so that
        ``None`` or numeric content cannot break slicing.
        """
        if isinstance(result, dict):
            content = result.get("content", result.get("text", result))
        else:
            content = result
        text = content if isinstance(content, str) else str(content)
        if len(text) > cls._PREVIEW_CHARS:
            return text[:cls._PREVIEW_CHARS] + "..."
        return text

    # ------------------------------------------------------------------
    # Workflow steps
    # ------------------------------------------------------------------
    def check_server_status(self):
        """Return True as soon as one of the probe endpoints answers HTTP 200."""
        self.log_step("Checking server status...")
        # Try multiple endpoints to see what's available.
        endpoints_to_try = [
            "/",
            "/health",
            "/api/health",
            "/documents",
            "/webui/",
        ]
        for endpoint in endpoints_to_try:
            try:
                response = self.session.get(f"{BASE_URL}{endpoint}", timeout=5)
            except requests.exceptions.RequestException as e:
                self.log_step(f"✗ Endpoint {endpoint}: {e}", "WARNING")
                continue
            self.log_step(f"✓ Endpoint {endpoint}: {response.status_code}")
            if response.status_code == 200:
                return True
        self.log_step("✗ No working endpoints found", "ERROR")
        return False

    def verify_ocr_pdf_exists(self):
        """Return True when ``OCR_PDF_PATH`` is an existing, non-empty regular file.

        Only presence and size are checked; the PDF content is not parsed.
        """
        self.log_step("Verifying OCR PDF file...")
        # isfile() rather than exists(): a directory of that name must not pass.
        if not os.path.isfile(OCR_PDF_PATH):
            self.log_step(f"✗ OCR PDF file not found: {OCR_PDF_PATH}", "ERROR")
            return False
        file_size = os.path.getsize(OCR_PDF_PATH)
        if file_size == 0:
            self.log_step("✗ OCR PDF file is empty", "ERROR")
            return False
        self.log_step(f"✓ OCR PDF file verified ({file_size} bytes)")
        return True

    def upload_ocr_pdf(self):
        """Upload the PDF to ``/documents/upload`` and remember its document id.

        Returns True when the server answers 200 or 201. The document id is
        taken from the JSON response when present; otherwise the documents list
        is consulted. A successful upload without a discoverable id still
        returns True (``self.doc_id`` stays ``None``).
        """
        self.log_step("Uploading OCR PDF file...")
        if not self.verify_ocr_pdf_exists():
            return False
        # Use the endpoint we saw in logs: /documents/upload
        upload_endpoint = f"{BASE_URL}/documents/upload"
        try:
            with open(OCR_PDF_PATH, 'rb') as file:
                files = {'file': (os.path.basename(OCR_PDF_PATH), file, 'application/pdf')}
                self.log_step(f"Uploading to: {upload_endpoint}")
                # The request must be sent while the file handle is still open.
                response = self.session.post(
                    upload_endpoint,
                    files=files,
                    timeout=30,
                )
        except (OSError, requests.exceptions.RequestException) as e:
            self.log_step(f"✗ Upload failed: {e}", "ERROR")
            return False

        if response.status_code not in (200, 201):
            self.log_step(f"✗ Upload failed with status {response.status_code}: {response.text}", "ERROR")
            return False

        self.log_step("✓ OCR PDF upload successful")
        try:
            result = response.json()
        except ValueError:
            # The upload itself succeeded; a non-JSON body only means the id
            # has to be looked up elsewhere.
            result = None
        self.doc_id = self._extract_doc_id(result)
        if self.doc_id:
            self.log_step(f"Document ID: {self.doc_id}")
        else:
            # If no ID in response, try to get it from documents list.
            self.log_step("No document ID in response, checking documents list")
            if self.get_document_list():
                self.log_step(f"Found document in list: {self.doc_id}")
        return True

    def get_document_list(self):
        """Set ``self.doc_id`` from the first entry of ``GET /documents``.

        Returns True when an id was found, False otherwise.

        NOTE(review): this assumes the first listed document is the one just
        uploaded; the code does not verify that ordering.
        """
        try:
            response = self.session.get(f"{BASE_URL}/documents", timeout=10)
            documents = response.json() if response.status_code == 200 else None
        except (requests.exceptions.RequestException, ValueError) as e:
            self.log_step(f"Failed to get document list: {e}", "WARNING")
            return False
        if isinstance(documents, list) and documents:
            # Assume the first document is our uploaded one.
            first = documents[0]
            if isinstance(first, dict) and 'id' in first:
                self.doc_id = first['id']
                return True
        self.log_step("No documents found in list", "WARNING")
        return False

    def wait_for_indexing(self):
        """Poll until the uploaded document is indexed, fails, or time runs out.

        Returns True once the status endpoint or the documents list reports the
        document as indexed; False when no document id is known, the server
        reports a failure, or ``MAX_WAIT_TIME`` seconds have elapsed.
        """
        self.log_step("Waiting for document indexing to complete...")
        if not self.doc_id:
            self.log_step("✗ No document ID available to check indexing status", "ERROR")
            return False

        status_endpoint = f"{BASE_URL}/documents/{self.doc_id}/status"
        # monotonic() is immune to wall-clock adjustments during the wait.
        started = time.monotonic()
        deadline = started + MAX_WAIT_TIME
        while time.monotonic() < deadline:
            try:
                response = self.session.get(status_endpoint, timeout=10)
                if response.status_code == 200:
                    status_data = response.json()
                    verdict = self._classify_status(status_data)
                    if verdict == "done":
                        self.log_step("✓ Document indexing completed")
                        return True
                    if verdict == "failed":
                        self.log_step(f"✗ Indexing failed: {status_data.get('message', 'Unknown error')}", "ERROR")
                        return False
                    if verdict == "busy":
                        self.log_step(f"Indexing in progress... ({status_data['status'].lower()})")
            except (requests.exceptions.RequestException, ValueError) as e:
                # A transient network error or a non-JSON body is not fatal:
                # report it and keep polling.
                self.log_step(f"Status check failed, will retry: {e}", "WARNING")

            # Also check documents list for status.
            if self.check_document_status_in_list():
                self.log_step("✓ Document indexed (from list)")
                return True

            elapsed = int(time.monotonic() - started)
            self.log_step(f"Waiting... ({elapsed}s elapsed)")
            # Never sleep past the deadline.
            time.sleep(max(0.0, min(POLL_INTERVAL, deadline - time.monotonic())))

        self.log_step("✗ Indexing timeout reached", "ERROR")
        return False

    def check_document_status_in_list(self):
        """Return True when ``GET /documents`` lists ``self.doc_id`` as indexed.

        Any request or decoding problem, an unexpected payload shape, or a
        missing entry yields False. ``KeyboardInterrupt`` is deliberately not
        caught so that the user can abort a long wait.
        """
        try:
            response = self.session.get(f"{BASE_URL}/documents", timeout=10)
            if response.status_code != 200:
                return False
            documents = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            self.log_step(f"Failed to check document list: {e}", "WARNING")
            return False
        if not isinstance(documents, list):
            return False
        return any(
            isinstance(doc, dict)
            and doc.get('id') == self.doc_id
            and self._classify_status(doc) == "done"
            for doc in documents
        )

    def test_search_functionality(self):
        """Post ``TEST_QUERY`` to each candidate search endpoint in turn.

        Returns True for the first endpoint that answers 200 with a non-empty
        JSON list (its first three hits are logged as previews); False when no
        endpoint does.
        """
        self.log_step("Testing search functionality...")
        search_payload = {
            "query": TEST_QUERY,
            "top_k": 5,
        }
        search_endpoints = [
            f"{BASE_URL}/search",
            f"{BASE_URL}/query",
            f"{BASE_URL}/documents/search",
        ]
        for endpoint in search_endpoints:
            self.log_step(f"Testing search endpoint: {endpoint}")
            try:
                response = self.session.post(
                    endpoint,
                    json=search_payload,
                    timeout=15,
                )
                if response.status_code != 200:
                    self.log_step(
                        f"Search endpoint {endpoint} returned status {response.status_code}",
                        "WARNING",
                    )
                    continue
                results = response.json()
            except (requests.exceptions.RequestException, ValueError) as e:
                self.log_step(f"Search endpoint {endpoint} failed: {e}", "WARNING")
                continue

            self.log_step("✓ Search request successful")
            # Validate search results structure.
            if isinstance(results, list) and results:
                self.log_step(f"✓ Search returned {len(results)} results")
                for position, result in enumerate(results[:3], start=1):
                    self.log_step(f"Result {position}: {self._preview(result)}")
                return True
            # Empty or non-list payload: fall through to the next endpoint.
            self.log_step("✗ Search returned no results", "WARNING")

        self.log_step("✗ All search endpoints failed", "ERROR")
        return False

    def run_complete_validation(self):
        """Run all workflow steps in order and print a summary.

        Execution stops at the first failing step; the remaining steps are
        listed as skipped and count against the success rate. An unexpected
        exception inside a step is logged and treated as a failure so that the
        summary is always printed.

        Returns True only when every step succeeded.
        """
        self.log_step("Starting OCR PDF Upload and Search Validation")
        self.log_step("=" * 50)
        steps = [
            ("Server Status Check", self.check_server_status),
            ("OCR PDF Verification", self.verify_ocr_pdf_exists),
            ("PDF Upload", self.upload_ocr_pdf),
            ("Indexing Wait", self.wait_for_indexing),
            ("Search Test", self.test_search_functionality),
        ]
        results = []
        for step_name, step_func in steps:
            self.log_step(f"Executing: {step_name}")
            try:
                success = bool(step_func())
            except Exception as e:
                self.log_step(f"✗ {step_name} raised an unexpected error: {e}", "ERROR")
                success = False
            results.append((step_name, success))
            if not success:
                self.log_step(f"✗ Workflow failed at: {step_name}", "ERROR")
                break
        skipped = [step_name for step_name, _ in steps[len(results):]]

        # Generate final report.
        self.log_step("=" * 50)
        self.log_step("VALIDATION RESULTS SUMMARY")
        self.log_step("=" * 50)
        for step_name, success in results:
            status = "✓ PASS" if success else "✗ FAIL"
            self.log_step(f"{step_name}: {status}")
        for step_name in skipped:
            self.log_step(f"{step_name}: - SKIPPED")

        passed = sum(1 for _, success in results if success)
        # Rate is over ALL steps so that skipped steps are not hidden.
        total = len(steps)
        success_rate = (passed / total) * 100
        self.log_step(f"Success Rate: {passed}/{total} ({success_rate:.1f}%)")
        if passed == total:
            self.log_step("🎉 COMPLETE WORKFLOW VALIDATION SUCCESSFUL!", "SUCCESS")
            return True
        self.log_step("❌ WORKFLOW VALIDATION FAILED", "ERROR")
        return False
def main():
    """Run the validation workflow and terminate the process with its result.

    The exit status is 0 when the whole workflow succeeded and 1 when it
    failed, was interrupted with Ctrl-C, or raised an unexpected error.
    """
    validator = OCRWorkflowValidator()
    try:
        exit_code = 0 if validator.run_complete_validation() else 1
    except KeyboardInterrupt:
        validator.log_step("Validation interrupted by user", "WARNING")
        exit_code = 1
    except Exception as e:
        # Top-level boundary: report anything unforeseen instead of a traceback.
        validator.log_step(f"Unexpected error: {e}", "ERROR")
        exit_code = 1
    sys.exit(exit_code)


if __name__ == "__main__":
    main()