Files
railseek6/test_production_ocr_workflow.py

226 lines
8.0 KiB
Python

#!/usr/bin/env python3
"""
Production OCR Workflow Test
Tests the complete OCR PDF processing with PaddleOCR and GPU acceleration
"""
import os
import sys
import time
import requests
import json
from pathlib import Path
def test_production_ocr_workflow():
    """Upload the sample OCR PDF to the local server and verify the outcome.

    The check runs against ``http://localhost:3015`` in three stages:

    1. ``GET /`` must answer with HTTP 200.
    2. ``inputs/ocr.pdf`` must exist and ``POST /documents/upload`` must
       answer with HTTP 200.
    3. After a fixed 30-second wait, ``GET /documents`` is inspected.

    Returns:
        bool: True only when the status listing contains a non-empty
        ``success`` entry and does not list the uploaded file under
        ``failed``. False on any connection/HTTP error, a missing input
        file, a rejected upload, a reported failure, or when the outcome
        cannot be confirmed from the status listing.
    """
    base_url = "http://localhost:3015"

    print("🚀 Testing Production OCR Workflow with PaddleOCR and GPU Acceleration")
    print("=" * 60)

    # Stage 1: server accessibility
    print("\n🔍 Testing server accessibility...")
    try:
        response = requests.get(f"{base_url}/", timeout=10)
    except Exception as e:
        print(f"❌ Cannot connect to server: {e}")
        return False
    if response.status_code != 200:
        print(f"❌ Server returned status: {response.status_code}")
        return False
    print("✅ Server is accessible")

    # Stage 2: upload the OCR PDF
    print("\n📁 Testing OCR PDF upload...")
    pdf_file = "inputs/ocr.pdf"
    if not os.path.exists(pdf_file):
        print(f"❌ OCR PDF file not found: {pdf_file}")
        return False
    print(f"📄 Using OCR PDF: {pdf_file}")

    # The upload is sent under the file's base name, so the same name is
    # used below when looking for it in the failed list.
    pdf_name = os.path.basename(pdf_file)
    try:
        with open(pdf_file, 'rb') as f:
            files = {'file': (pdf_name, f, 'application/pdf')}
            response = requests.post(f"{base_url}/documents/upload", files=files, timeout=30)
        if response.status_code != 200:
            print(f"❌ Upload failed: {response.status_code} - {response.text}")
            return False
        upload_result = response.json()
        print(f"✅ Upload successful: {json.dumps(upload_result, indent=2)}")
    except Exception as e:
        print(f"❌ Upload error: {e}")
        return False

    # Stage 3: give the server time to process, then inspect the status
    print("\n⏳ Waiting for OCR processing (30 seconds)...")
    time.sleep(30)

    print("\n📋 Checking document status...")
    try:
        doc_response = requests.get(f"{base_url}/documents", timeout=10)
        if doc_response.status_code != 200:
            print(f"❌ Failed to get document status: {doc_response.status_code}")
            return False

        doc_status = doc_response.json()
        print(f"📄 Documents in system: {json.dumps(doc_status, indent=2)}")

        statuses = doc_status.get('statuses') if isinstance(doc_status, dict) else None
        if not statuses:
            print("⚠️ No document status available yet")
            return False

        # Look for a failure of *our* file first: an unrelated successful
        # document must not hide the fact that this upload failed.
        for doc in statuses.get('failed') or []:
            if doc.get('file_path') == pdf_name:
                print(f"❌ OCR processing failed: {doc.get('error_msg', 'Unknown error')}")
                return False

        if statuses.get('success'):
            print("🎉 OCR processing successful! Documents indexed successfully.")
            return True
    except Exception as e:
        print(f"❌ Error checking document status: {e}")
        return False

    # Neither success nor a failure of our file was reported; do not claim
    # a pass for an outcome that could not be verified.
    print("❌ Could not confirm that OCR processing completed")
    return False
def test_paddleocr_gpu():
    """Check that PaddleOCR can be constructed with GPU support enabled.

    Returns:
        bool: True when Paddle reports that it was compiled with CUDA and
        ``PaddleOCR(..., use_gpu=True)`` is created without raising.
        False when the packages cannot be imported, Paddle is not a CUDA
        build, or any step raises.
    """
    print("\n🔬 Testing PaddleOCR GPU Acceleration...")
    try:
        import paddleocr
        import paddle

        # Bail out early when this Paddle build has no CUDA support.
        if not paddle.device.is_compiled_with_cuda():
            print("❌ CUDA not available - using CPU fallback")
            return False

        print("✅ CUDA is available")
        device_total = paddle.device.cuda.device_count()
        print(f"✅ GPU devices: {device_total}")

        # Initialisation failures are reported separately from the
        # generic GPU errors handled by the outer handlers.
        try:
            engine = paddleocr.PaddleOCR(use_angle_cls=True, lang='en', use_gpu=True)
            print("✅ PaddleOCR initialized with GPU support")

            # Optional smoke run, only when a sample image is present.
            sample_image = "test_ocr_image.png"
            if not os.path.exists(sample_image):
                print("⚠️ No test image available, but PaddleOCR initialized successfully")
            else:
                detections = engine.ocr(sample_image, cls=True)
                print(f"✅ PaddleOCR test successful: {len(detections)} results")
            return True
        except Exception as init_error:
            print(f"❌ PaddleOCR GPU initialization failed: {init_error}")
            return False
    except ImportError as import_error:
        print(f"❌ PaddleOCR not installed: {import_error}")
        return False
    except Exception as gpu_error:
        print(f"❌ GPU test error: {gpu_error}")
        return False
def test_database_connections():
    """Probe the Redis, Neo4j, Qdrant and PostgreSQL services on localhost.

    Each backend is checked independently inside its own ``try`` block, so
    one failing (or uninstalled) client library does not prevent the
    remaining checks from running. A line is printed per backend.

    Returns:
        bool: True only if all four checks succeeded, False otherwise.
    """
    print("\n🗄️ Testing Database Connections...")
    all_connected = True

    # NOTE(review): the credentials below are hard-coded in the source;
    # consider loading them from the environment or a config file.

    # Test Redis
    try:
        import redis
        r = redis.Redis(host='localhost', port=6379, db=1)
        r.ping()
        print("✅ Redis connection successful")
    except Exception as e:
        print(f"❌ Redis connection failed: {e}")
        all_connected = False

    # Test Neo4j
    try:
        from neo4j import GraphDatabase
        driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "jleu1212"))
        try:
            with driver.session() as session:
                session.run("RETURN 1")
        finally:
            # Release the driver's connections whether or not the query worked.
            driver.close()
        print("✅ Neo4j connection successful")
    except Exception as e:
        print(f"❌ Neo4j connection failed: {e}")
        all_connected = False

    # Test Qdrant (simple HTTP check)
    try:
        response = requests.get("http://localhost:6333", timeout=5)
        if response.status_code == 200:
            print("✅ Qdrant connection successful")
        else:
            print(f"❌ Qdrant connection failed: {response.status_code}")
            all_connected = False
    except Exception as e:
        print(f"❌ Qdrant connection failed: {e}")
        all_connected = False

    # Test PostgreSQL
    try:
        import psycopg2
        conn = psycopg2.connect(
            host="localhost",
            port=5432,
            user="jleu3482",
            password="jleu1212",
            database="rag_anything"
        )
        conn.close()
        print("✅ PostgreSQL connection successful")
    except Exception as e:
        print(f"❌ PostgreSQL connection failed: {e}")
        all_connected = False

    return all_connected
def main():
    """Run every check in sequence and print a summary.

    The database check runs first, then the PaddleOCR GPU check, then the
    end-to-end OCR workflow. The GPU result is shown in the summary but
    does not influence the overall verdict.

    Returns:
        bool: True when both the database check and the OCR workflow
        check passed, False otherwise.
    """
    rule = "=" * 60

    print("🏭 Production OCR Workflow Test")
    print(rule)

    # Order matters for the console output: databases, GPU, then workflow.
    db_success = test_database_connections()
    ocr_success = test_paddleocr_gpu()
    workflow_success = test_production_ocr_workflow()

    # Summary
    print("\n" + rule)
    print("📊 TEST SUMMARY")
    print(rule)

    db_label = '✅ PASS' if db_success else '❌ FAIL'
    ocr_label = '✅ PASS' if ocr_success else '⚠️ PARTIAL (CPU fallback)'
    workflow_label = '✅ PASS' if workflow_success else '❌ FAIL'
    print(f"🗄️ Database Connections: {db_label}")
    print(f"🔬 PaddleOCR GPU: {ocr_label}")
    print(f"📄 OCR Workflow: {workflow_label}")

    if not (db_success and workflow_success):
        print("\n💥 PRODUCTION OCR WORKFLOW TEST FAILED!")
        return False

    print("\n🎉 PRODUCTION OCR WORKFLOW TEST COMPLETED SUCCESSFULLY!")
    return True
if __name__ == "__main__":
    # Exit status 0 when the overall run passed, 1 otherwise.
    sys.exit(0 if main() else 1)