# File: railseek6/final_ocr_test_validation.py
# (243 lines, 9.1 KiB, Python)

import requests
import json
import base64
import time
import logging
import os
from pathlib import Path
# Root-logger setup: emit INFO and above, each record prefixed with a
# bracketed timestamp and a bracketed level name.
logging.basicConfig(
    format='[%(asctime)s] [%(levelname)s] %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.INFO,
)
class OCRWorkflowTester:
    """Drive an upload -> index -> search smoke test against a document server.

    Every check logs its outcome through the root logger and reports
    success or failure through its return value instead of raising, so a
    caller can chain the steps with plain ``if`` statements.
    """

    def __init__(self, base_url="http://localhost:3015", username="jleu3482",
                 password="jleu1212", timeout=30):
        """Create the HTTP session and install the authentication headers.

        Args:
            base_url: Root URL of the server under test (no trailing slash).
            username: Account name used for HTTP Basic authentication.
            password: Password used for HTTP Basic authentication.
            timeout: Seconds to wait on each HTTP request before giving up,
                so an unresponsive server cannot hang the run indefinitely.
        """
        self.base_url = base_url
        self.username = username
        self.password = password
        self.timeout = timeout
        self.session = requests.Session()
        self._setup_auth()

    def _setup_auth(self):
        """Install Basic-auth and JSON content-type headers on the session."""
        credentials = f"{self.username}:{self.password}"
        encoded_credentials = base64.b64encode(credentials.encode()).decode()
        self.session.headers.update({
            "Authorization": f"Basic {encoded_credentials}",
            "Content-Type": "application/json",
        })

    def check_server_status(self):
        """Return True when ``GET /health`` answers with HTTP 200."""
        logging.info("🔍 Checking server status...")
        try:
            response = self.session.get(
                f"{self.base_url}/health", timeout=self.timeout
            )
        except Exception as e:
            logging.error(f"❌ Server connection failed: {e}")
            return False

        if response.status_code == 200:
            logging.info("✅ Server is running and accessible")
            return True
        logging.error(f"❌ Server returned status: {response.status_code}")
        return False

    def upload_ocr_pdf(self, file_path="ocr.pdf"):
        """Upload a PDF to ``POST /documents/upload`` as multipart form data.

        Args:
            file_path: Path of the PDF to send.

        Returns:
            True when the server answers with HTTP 200; False when the file
            is missing, the server rejects the upload, or the request fails.
        """
        logging.info(f"📤 Uploading OCR PDF: {file_path}")
        path = Path(file_path)
        if not path.exists():
            logging.error(f"❌ File not found: {file_path}")
            return False

        try:
            with open(path, 'rb') as f:
                # Only the base name is sent as the multipart filename; the
                # local directory layout is of no use to the server.
                files = {'file': (path.name, f, 'application/pdf')}
                # Deliberately bypasses self.session and authenticates with
                # auth=... instead.
                # NOTE(review): presumably because the session's default JSON
                # Content-Type header would take precedence over the multipart
                # one that requests generates for files= - confirm.
                response = requests.post(
                    f"{self.base_url}/documents/upload",
                    files=files,
                    auth=(self.username, self.password),
                    timeout=self.timeout,
                )
            if response.status_code == 200:
                logging.info("✅ Upload successful")
                result = response.json()
                logging.info(f"📊 Upload result: {json.dumps(result, indent=2)}")
                return True
            logging.error(f"❌ Upload failed: {response.status_code} - {response.text}")
            return False
        except Exception as e:
            logging.error(f"❌ Upload error: {e}")
            return False

    def monitor_indexing(self, max_wait=60, poll_interval=5):
        """Poll ``GET /documents`` until the first document finishes indexing.

        Only the first entry of the returned collection is inspected.
        NOTE(review): assumes the endpoint returns a JSON list of objects
        carrying a ``status`` field - confirm against the server API.

        Args:
            max_wait: Approximate overall polling budget in seconds.
            poll_interval: Seconds to sleep between polls; must be positive.

        Returns:
            True once the status is ``'completed'``; False when it is
            ``'failed'`` or the polling budget is exhausted.

        Raises:
            ValueError: If ``poll_interval`` is not positive.
        """
        if poll_interval <= 0:
            raise ValueError("poll_interval must be positive")

        logging.info("⏳ Monitoring indexing progress...")
        # Always poll at least once, even when max_wait < poll_interval.
        attempts = max(1, int(max_wait // poll_interval))
        for attempt in range(attempts):
            try:
                response = self.session.get(
                    f"{self.base_url}/documents", timeout=self.timeout
                )
                if response.status_code == 200:
                    documents = response.json()
                    if documents:
                        doc = documents[0]
                        status = doc.get('status', 'unknown')
                        logging.info(f"📄 Document status: {status}")
                        if status == 'completed':
                            logging.info("✅ Indexing completed successfully")
                            return True
                        if status == 'failed':
                            logging.error("❌ Indexing failed")
                            return False
                    else:
                        logging.info("📭 No documents found yet")
                else:
                    logging.error(f"❌ Status check failed: {response.status_code}")
            except Exception as e:
                # Best effort: a transient error must not abort the polling.
                logging.error(f"❌ Monitoring error: {e}")
            # No point sleeping after the final attempt.
            if attempt < attempts - 1:
                time.sleep(poll_interval)

        logging.warning("⚠️ Indexing timeout reached")
        return False

    def test_search(self, queries):
        """Run each query against ``POST /search`` and collect the outcomes.

        Args:
            queries: Iterable of query strings.

        Returns:
            Dict keyed by query. Successful entries hold ``success=True``,
            ``results_count`` and ``sample_content`` (first 200 characters
            of the first hit, or ``'No results'``); failed entries hold
            ``success=False`` and an ``error`` message.
        """
        logging.info("🔍 Testing search functionality...")
        results = {}
        for query in queries:
            logging.info(f"🔎 Searching for: '{query}'")
            try:
                response = self.session.post(
                    f"{self.base_url}/search",
                    json={"query": query, "top_k": 3},
                    timeout=self.timeout,
                )
                if response.status_code == 200:
                    # A null 'results' or 'content' counts as empty, not as
                    # a failed search.
                    hits = response.json().get('results') or []
                    if hits:
                        sample = (hits[0].get('content') or '')[:200]
                    else:
                        sample = 'No results'
                    results[query] = {
                        'success': True,
                        'results_count': len(hits),
                        'sample_content': sample,
                    }
                    logging.info(f"✅ Search successful - {len(hits)} results")
                    logging.info(f"📝 Sample: {sample}...")
                else:
                    results[query] = {
                        'success': False,
                        'error': f"Status {response.status_code}: {response.text}",
                    }
                    logging.error(f"❌ Search failed: {response.status_code} - {response.text}")
            except Exception as e:
                results[query] = {
                    'success': False,
                    'error': str(e),
                }
                logging.error(f"❌ Search error: {e}")
        return results

    def check_database_status(self):
        """Probe the document listing and the search endpoint.

        Returns:
            True when ``GET /documents`` answers with HTTP 200. A failing
            search probe is only logged as a warning. False when the
            listing fails or any request raises.
        """
        logging.info("🗄️ Checking database connections...")
        try:
            response = self.session.get(
                f"{self.base_url}/documents", timeout=self.timeout
            )
            documents_ok = response.status_code == 200
            if documents_ok:
                documents = response.json()
                logging.info(f"📊 Documents in system: {len(documents)}")
            else:
                logging.error(f"❌ Status check failed: {response.status_code}")

            # The search probe is independent of the listing result.
            test_response = self.session.post(
                f"{self.base_url}/search",
                json={"query": "test", "top_k": 1},
                timeout=self.timeout,
            )
            if test_response.status_code == 200:
                logging.info("✅ Search index is operational")
            else:
                logging.warning("⚠️ Search index may have issues")
            return documents_ok
        except Exception as e:
            logging.error(f"❌ Database check failed: {e}")
            return False
def main():
    """Run the OCR workflow check end to end and log a summary.

    Steps: verify the server is reachable, upload ``ocr.pdf``, wait for
    indexing, run a fixed set of search queries, probe the database, then
    log per-query results. Returns early (without the closing footer) when
    the server is not reachable; every other path ends with the footer.
    """
    logging.info("🚀 STARTING OCR PDF WORKFLOW VALIDATION")
    logging.info("=" * 70)

    tester = OCRWorkflowTester()

    if not tester.check_server_status():
        logging.error("❌ Cannot proceed - server not accessible")
        return

    if tester.upload_ocr_pdf():
        logging.info("⏳ Waiting for indexing to complete...")
        # Give the server a head start before polling begins.
        time.sleep(10)

        if tester.monitor_indexing():
            search_queries = [
                "OCR",
                "text extraction",
                "document processing",
                "optical character recognition",
                "PDF conversion",
            ]
            search_results = tester.test_search(search_queries)

            tester.check_database_status()

            logging.info("=" * 70)
            logging.info("📋 TEST RESULTS SUMMARY")
            logging.info("=" * 70)

            successful_searches = sum(
                1 for result in search_results.values() if result['success']
            )
            logging.info("✅ Upload: SUCCESS")
            logging.info("✅ Indexing: SUCCESS")
            logging.info(f"🔍 Search: {successful_searches}/{len(search_queries)} queries successful")

            for query, result in search_results.items():
                # Both branches were empty strings before, which left the
                # per-query marker blank regardless of outcome.
                status = "✅" if result['success'] else "❌"
                logging.info(f" {status} '{query}': {result.get('results_count', 'N/A')} results")
        else:
            logging.error("❌ Indexing failed or timed out")
    else:
        logging.error("❌ Upload failed - cannot proceed with testing")

    logging.info("=" * 70)
    # Report what the tester actually used rather than repeating literals,
    # and keep the password out of the log output.
    logging.info(f"🌐 Web UI: {tester.base_url}/webui/")
    logging.info(f"👤 Username: {tester.username}")
    logging.info("🔑 Password: [not logged]")
    logging.info("📁 Test file: ocr.pdf")
# Run the validation workflow only when executed as a script, not on import.
if __name__ == "__main__":
    main()