# OCR PDF workflow validation script.
# Listing metadata from the source viewer: 243 lines, 9.1 KiB, Python.
import requests
|
|
import json
|
|
import base64
|
|
import time
|
|
import logging
|
|
import os
|
|
from pathlib import Path
|
|
|
|
# Root logger setup: INFO and above, each record rendered as
# "[timestamp] [LEVEL] message" with a second-resolution timestamp.
logging.basicConfig(
    format='[%(asctime)s] [%(levelname)s] %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.INFO,
)
|
|
|
|
class OCRWorkflowTester:
    """Smoke-test client for an OCR document server (upload, index, search).

    JSON endpoints are called through a shared ``requests.Session`` that
    carries a Basic ``Authorization`` header and a JSON ``Content-Type``.
    The multipart upload uses a standalone request instead (see
    ``upload_ocr_pdf``).

    The check/upload/monitor/search methods log their outcome and report
    failure through their return value; request errors are caught and
    logged rather than propagated.
    """

    # Seconds to wait between two status polls in monitor_indexing.
    POLL_INTERVAL = 5

    def __init__(self, base_url="http://localhost:3015", username="jleu3482",
                 password="jleu1212", timeout=60):
        """Create the session and install the authentication headers.

        Args:
            base_url: Root URL of the server; a trailing slash is removed so
                that endpoint paths can be appended without producing "//".
            username: Account name used for HTTP Basic authentication.
            password: Password used for HTTP Basic authentication.
            timeout: Seconds passed as ``timeout=`` to every HTTP request.
                Pass ``None`` to wait indefinitely.
        """
        self.base_url = base_url.rstrip("/")
        self.username = username
        self.password = password
        self.timeout = timeout
        self.session = requests.Session()
        self._setup_auth()

    def _setup_auth(self):
        """Install Basic-auth and JSON content-type headers on the session."""
        raw_credentials = f"{self.username}:{self.password}"
        encoded_credentials = base64.b64encode(raw_credentials.encode()).decode()
        self.session.headers.update({
            "Authorization": f"Basic {encoded_credentials}",
            "Content-Type": "application/json",
        })

    def check_server_status(self):
        """Return True when GET ``/health`` answers with HTTP 200.

        Any other status code, or any exception raised by the request, is
        logged and reported as False.
        """
        logging.info("🔍 Checking server status...")
        try:
            response = self.session.get(
                f"{self.base_url}/health", timeout=self.timeout
            )
        except Exception as e:
            logging.error(f"❌ Server connection failed: {e}")
            return False

        if response.status_code == 200:
            logging.info("✅ Server is running and accessible")
            return True

        logging.error(f"❌ Server returned status: {response.status_code}")
        return False

    def upload_ocr_pdf(self, file_path="ocr.pdf"):
        """Upload a PDF to ``/documents/upload`` as a multipart form.

        Args:
            file_path: Path of the PDF to send.

        Returns:
            True when the server answers HTTP 200; False when the path is
            not an existing file, the server answers anything else, or the
            request/response handling raises.
        """
        logging.info(f"📤 Uploading OCR PDF: {file_path}")

        pdf_path = Path(file_path)
        # is_file() also rejects directories, which would otherwise only
        # fail later inside open().
        if not pdf_path.is_file():
            logging.error(f"❌ File not found: {file_path}")
            return False

        try:
            with pdf_path.open('rb') as f:
                # Send only the base name as the multipart filename; the
                # directory part of a local path is meaningless to the server.
                files = {'file': (pdf_path.name, f, 'application/pdf')}
                # The shared session is bypassed on purpose: its default
                # "Content-Type: application/json" header would take the
                # place of the multipart header requests generates for
                # `files`. Credentials are supplied through `auth` instead.
                response = requests.post(
                    f"{self.base_url}/documents/upload",
                    files=files,
                    auth=(self.username, self.password),
                    timeout=self.timeout,
                )

            if response.status_code == 200:
                logging.info("✅ Upload successful")
                result = response.json()
                logging.info(f"📊 Upload result: {json.dumps(result, indent=2)}")
                return True

            logging.error(f"❌ Upload failed: {response.status_code} - {response.text}")
            return False

        except Exception as e:
            logging.error(f"❌ Upload error: {e}")
            return False

    def monitor_indexing(self, max_wait=60):
        """Poll GET ``/documents`` until the first document finishes indexing.

        The endpoint is polled ``max_wait // POLL_INTERVAL`` times, sleeping
        ``POLL_INTERVAL`` seconds between polls (not after the last one).

        NOTE(review): only the first entry of the returned collection is
        inspected, so the status read may belong to a document other than
        the one just uploaded - confirm against the server's ordering.

        Args:
            max_wait: Polling budget in seconds.

        Returns:
            True when the inspected document reports status ``completed``;
            False when it reports ``failed`` or the budget is exhausted.
        """
        logging.info("⏳ Monitoring indexing progress...")

        attempts = max_wait // self.POLL_INTERVAL
        for attempt in range(attempts):
            try:
                response = self.session.get(
                    f"{self.base_url}/documents", timeout=self.timeout
                )
                if response.status_code == 200:
                    documents = response.json()
                    if documents:
                        status = documents[0].get('status', 'unknown')
                        logging.info(f"📄 Document status: {status}")

                        if status == 'completed':
                            logging.info("✅ Indexing completed successfully")
                            return True
                        if status == 'failed':
                            logging.error("❌ Indexing failed")
                            return False
                    else:
                        logging.info("📭 No documents found yet")
                else:
                    logging.error(f"❌ Status check failed: {response.status_code}")
            except Exception as e:
                # Best effort: a failed poll is logged and the loop retries.
                logging.error(f"❌ Monitoring error: {e}")

            # Sleeping after the final poll would only delay the timeout.
            if attempt < attempts - 1:
                time.sleep(self.POLL_INTERVAL)

        logging.warning("⚠️ Indexing timeout reached")
        return False

    def test_search(self, queries):
        """POST each query to ``/search`` and collect a per-query outcome.

        Args:
            queries: Iterable of query strings.

        Returns:
            Dict keyed by query. Successful entries hold ``success`` (True),
            ``results_count`` and ``sample_content`` (first 200 characters
            of the first hit's ``content``, or ``'No results'``). Failed
            entries hold ``success`` (False) and ``error``.
        """
        logging.info("🔍 Testing search functionality...")

        results = {}
        for query in queries:
            logging.info(f"🔎 Searching for: '{query}'")
            try:
                response = self.session.post(
                    f"{self.base_url}/search",
                    json={"query": query, "top_k": 3},
                    timeout=self.timeout,
                )

                if response.status_code == 200:
                    # `or []` / `or ''` tolerate explicit nulls in the payload.
                    hits = response.json().get('results') or []
                    if hits:
                        sample = (hits[0].get('content') or '')[:200]
                    else:
                        sample = 'No results'

                    results[query] = {
                        'success': True,
                        'results_count': len(hits),
                        'sample_content': sample,
                    }
                    logging.info(f"✅ Search successful - {len(hits)} results")
                    logging.info(f"📝 Sample: {sample}...")
                else:
                    results[query] = {
                        'success': False,
                        'error': f"Status {response.status_code}: {response.text}",
                    }
                    logging.error(f"❌ Search failed: {response.status_code} - {response.text}")

            except Exception as e:
                results[query] = {
                    'success': False,
                    'error': str(e),
                }
                logging.error(f"❌ Search error: {e}")

        return results

    def check_database_status(self):
        """Probe the document listing and the search endpoint.

        Returns:
            True only when both GET ``/documents`` and a one-result POST to
            ``/search`` answer HTTP 200; False otherwise, including when a
            request raises.
        """
        logging.info("🗄️ Checking database connections...")

        try:
            # Document listing probe.
            response = self.session.get(
                f"{self.base_url}/documents", timeout=self.timeout
            )
            documents_ok = response.status_code == 200
            if documents_ok:
                documents = response.json()
                logging.info(f"📊 Documents in system: {len(documents)}")
            else:
                logging.error(f"❌ Document listing failed: {response.status_code}")

            # Search index probe.
            test_response = self.session.post(
                f"{self.base_url}/search",
                json={"query": "test", "top_k": 1},
                timeout=self.timeout,
            )
            search_ok = test_response.status_code == 200
            if search_ok:
                logging.info("✅ Search index is operational")
            else:
                logging.warning("⚠️ Search index may have issues")

            return documents_ok and search_ok

        except Exception as e:
            logging.error(f"❌ Database check failed: {e}")
            return False
|
|
|
|
def main():
    """Run the OCR PDF validation workflow end to end and log a summary.

    Steps: check the server, upload the test PDF, wait for indexing, run a
    fixed set of search queries, probe the database, then log the results.
    If the server is unreachable the function returns immediately; an
    upload or indexing failure skips the remaining steps but still logs
    the connection details at the end.
    """
    test_file = "ocr.pdf"

    logging.info("🚀 STARTING OCR PDF WORKFLOW VALIDATION")
    logging.info("=" * 70)

    tester = OCRWorkflowTester()

    if not tester.check_server_status():
        logging.error("❌ Cannot proceed - server not accessible")
        return

    if not tester.upload_ocr_pdf(test_file):
        logging.error("❌ Upload failed - cannot proceed with testing")
    else:
        logging.info("⏳ Waiting for indexing to complete...")
        # Give the server a head start before polling begins.
        time.sleep(10)

        if not tester.monitor_indexing():
            logging.error("❌ Indexing failed or timed out")
        else:
            # OCR-specific queries expected to match the test document.
            search_queries = [
                "OCR",
                "text extraction",
                "document processing",
                "optical character recognition",
                "PDF conversion",
            ]
            search_results = tester.test_search(search_queries)
            tester.check_database_status()
            _log_search_summary(search_queries, search_results)

    # Connection details are read from the tester so they always match the
    # configuration that was actually used. The password is deliberately
    # not written to the log.
    logging.info("=" * 70)
    logging.info(f"🌐 Web UI: {tester.base_url}/webui/")
    logging.info(f"👤 Username: {tester.username}")
    logging.info("🔑 Password: ********")
    logging.info(f"📁 Test file: {test_file}")


def _log_search_summary(search_queries, search_results):
    """Log the summary banner and one pass/fail line per search query."""
    logging.info("=" * 70)
    logging.info("📋 TEST RESULTS SUMMARY")
    logging.info("=" * 70)

    successful_searches = sum(
        1 for result in search_results.values() if result['success']
    )
    logging.info("✅ Upload: SUCCESS")
    logging.info("✅ Indexing: SUCCESS")
    logging.info(f"🔍 Search: {successful_searches}/{len(search_queries)} queries successful")

    for query, result in search_results.items():
        status = "✅" if result['success'] else "❌"
        logging.info(f" {status} '{query}': {result.get('results_count', 'N/A')} results")
|
|
|
# Script entry point: run the workflow only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()