226 lines
8.0 KiB
Python
226 lines
8.0 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Production OCR Workflow Test
|
|
Tests the complete OCR PDF processing with PaddleOCR and GPU acceleration
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import time
|
|
import requests
|
|
import json
|
|
from pathlib import Path
|
|
|
|
def _ocr_document_outcome(doc_status, file_name):
    """Turn the server's document listing into a pass/fail verdict.

    Args:
        doc_status: Decoded JSON body of the ``/documents`` response.
        file_name: Base name of the uploaded file, compared with the
            ``file_path`` field of each failed entry.

    Returns:
        True only when the listing reports at least one successful document
        and the uploaded file is not among the failed ones; False otherwise.
    """
    if 'statuses' not in doc_status:
        print("⚠️ No document status available yet")
        return False

    statuses = doc_status['statuses']

    # Look at failures first so that a failure of *our* file is not masked by
    # unrelated documents that were indexed successfully earlier.
    for doc in statuses.get('failed') or []:
        if doc.get('file_path') == file_name:
            print(f"❌ OCR processing failed: {doc.get('error_msg', 'Unknown error')}")
            return False

    if statuses.get('success'):
        print("🎉 OCR processing successful! Documents indexed successfully.")
        return True

    print("⚠️ OCR processing has not produced a successful document yet")
    return False


def test_production_ocr_workflow(
    base_url="http://localhost:3015",
    pdf_file="inputs/ocr.pdf",
    wait_seconds=30,
):
    """Upload a PDF to the server and verify that it was processed.

    The check runs in three stages: the server root must answer with HTTP 200,
    the PDF must upload with HTTP 200, and after a fixed wait the document
    listing must report a successful document.

    Args:
        base_url: Root URL of the server under test.
        pdf_file: Path of the PDF to upload.
        wait_seconds: Seconds to sleep between the upload and the status check.

    Returns:
        True only when processing was positively confirmed. Every other
        outcome (unreachable server, missing file, failed upload, status
        endpoint error, failed or still-pending processing) returns False,
        so an unverified run is never reported as a pass.
    """
    base_url = base_url.rstrip("/")

    print("🚀 Testing Production OCR Workflow with PaddleOCR and GPU Acceleration")
    print("=" * 60)

    # Stage 1: server accessibility
    print("\n🔍 Testing server accessibility...")
    try:
        response = requests.get(f"{base_url}/", timeout=10)
    except Exception as e:
        print(f"❌ Cannot connect to server: {e}")
        return False
    if response.status_code != 200:
        print(f"❌ Server returned status: {response.status_code}")
        return False
    print("✅ Server is accessible")

    # Stage 2: upload
    print("\n📁 Testing OCR PDF upload...")
    if not os.path.exists(pdf_file):
        print(f"❌ OCR PDF file not found: {pdf_file}")
        return False
    print(f"📄 Using OCR PDF: {pdf_file}")

    uploaded_name = os.path.basename(pdf_file)
    try:
        with open(pdf_file, 'rb') as f:
            files = {'file': (uploaded_name, f, 'application/pdf')}
            response = requests.post(f"{base_url}/documents/upload", files=files, timeout=30)
        if response.status_code != 200:
            print(f"❌ Upload failed: {response.status_code} - {response.text}")
            return False
        upload_result = response.json()
        print(f"✅ Upload successful: {json.dumps(upload_result, indent=2)}")
    except Exception as e:
        print(f"❌ Upload error: {e}")
        return False

    # Stage 3: give the server time to process, then inspect the listing
    print(f"\n⏳ Waiting for OCR processing ({wait_seconds} seconds)...")
    time.sleep(wait_seconds)

    print("\n📋 Checking document status...")
    try:
        doc_response = requests.get(f"{base_url}/documents", timeout=10)
        if doc_response.status_code != 200:
            print(f"❌ Failed to get document status: {doc_response.status_code}")
            return False
        doc_status = doc_response.json()
        print(f"📄 Documents in system: {json.dumps(doc_status, indent=2)}")
        return _ocr_document_outcome(doc_status, uploaded_name)
    except Exception as e:
        # Covers transport errors, invalid JSON and an unexpected payload shape.
        print(f"❌ Error checking document status: {e}")
        return False
|
|
|
|
def test_paddleocr_gpu():
    """Check that PaddleOCR can be initialised with GPU support.

    Steps: import ``paddleocr`` and ``paddle``, confirm that the Paddle build
    has CUDA support *and* that at least one GPU device is visible, create a
    ``PaddleOCR`` instance with ``use_gpu=True`` and, when the file
    ``test_ocr_image.png`` exists in the working directory, run OCR on it.

    Returns:
        True when PaddleOCR was initialised with GPU support (and the optional
        image run did not raise); False when the packages are missing, no
        usable GPU is present, or any step raised.
    """
    print("\n🔬 Testing PaddleOCR GPU Acceleration...")
    try:
        import paddleocr
        import paddle

        # is_compiled_with_cuda() only describes the installed build; a CUDA
        # build on a machine without a visible GPU must not count as available.
        cuda_build = paddle.device.is_compiled_with_cuda()
        gpu_count = paddle.device.cuda.device_count() if cuda_build else 0
        if gpu_count < 1:
            print("❌ CUDA not available - using CPU fallback")
            return False

        print("✅ CUDA is available")
        print(f"✅ GPU devices: {gpu_count}")

        # Test PaddleOCR initialization
        try:
            ocr = paddleocr.PaddleOCR(use_angle_cls=True, lang='en', use_gpu=True)
            print("✅ PaddleOCR initialized with GPU support")

            # Test with a simple image if available
            test_image = "test_ocr_image.png"
            if os.path.exists(test_image):
                result = ocr.ocr(test_image, cls=True)
                print(f"✅ PaddleOCR test successful: {len(result)} results")
            else:
                print("⚠️ No test image available, but PaddleOCR initialized successfully")

            return True
        except Exception as e:
            print(f"❌ PaddleOCR GPU initialization failed: {e}")
            return False

    except ImportError as e:
        print(f"❌ PaddleOCR not installed: {e}")
        return False
    except Exception as e:
        print(f"❌ GPU test error: {e}")
        return False
|
|
|
|
def test_database_connections():
    """Probe the Redis, Neo4j, Qdrant and PostgreSQL services on localhost.

    Each backend is checked independently, so one failure does not stop the
    remaining checks. Client libraries are imported inside each check; a
    missing library is reported as a failed connection for that backend.

    Returns:
        True when all four checks succeeded, False if any of them failed.
    """
    # NOTE(review): hosts and credentials below are hard-coded in the source;
    # consider loading them from the environment or a config file.
    print("\n🗄️ Testing Database Connections...")

    all_connected = True

    # Test Redis
    try:
        import redis
        r = redis.Redis(host='localhost', port=6379, db=1)
        r.ping()
        print("✅ Redis connection successful")
    except Exception as e:
        print(f"❌ Redis connection failed: {e}")
        all_connected = False

    # Test Neo4j
    try:
        from neo4j import GraphDatabase
        driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "jleu1212"))
        try:
            with driver.session() as session:
                session.run("RETURN 1")
        finally:
            # Always release the driver, even when the query raised.
            driver.close()
        print("✅ Neo4j connection successful")
    except Exception as e:
        print(f"❌ Neo4j connection failed: {e}")
        all_connected = False

    # Test Qdrant (simple HTTP check)
    try:
        response = requests.get("http://localhost:6333", timeout=5)
        if response.status_code == 200:
            print("✅ Qdrant connection successful")
        else:
            print(f"❌ Qdrant connection failed: {response.status_code}")
            all_connected = False
    except Exception as e:
        print(f"❌ Qdrant connection failed: {e}")
        all_connected = False

    # Test PostgreSQL
    try:
        import psycopg2
        conn = psycopg2.connect(
            host="localhost",
            port=5432,
            user="jleu3482",
            password="jleu1212",
            database="rag_anything"
        )
        conn.close()
        print("✅ PostgreSQL connection successful")
    except Exception as e:
        print(f"❌ PostgreSQL connection failed: {e}")
        all_connected = False

    return all_connected
|
|
|
|
def main():
    """Run every check in sequence, print a summary and return the verdict.

    The database and workflow checks decide the overall result; the PaddleOCR
    GPU check is reported in the summary but does not affect the verdict.

    Returns:
        True when both the database check and the OCR workflow check passed,
        False otherwise.
    """
    divider = "=" * 60

    print("🏭 Production OCR Workflow Test")
    print(divider)

    # Order matters for the console output: databases, then GPU, then workflow.
    db_success = test_database_connections()
    ocr_success = test_paddleocr_gpu()
    workflow_success = test_production_ocr_workflow()

    db_label = '✅ PASS' if db_success else '❌ FAIL'
    ocr_label = '✅ PASS' if ocr_success else '⚠️ PARTIAL (CPU fallback)'
    workflow_label = '✅ PASS' if workflow_success else '❌ FAIL'

    print("\n" + divider)
    print("📊 TEST SUMMARY")
    print(divider)
    print(f"🗄️ Database Connections: {db_label}")
    print(f"🔬 PaddleOCR GPU: {ocr_label}")
    print(f"📄 OCR Workflow: {workflow_label}")

    if not (db_success and workflow_success):
        print("\n💥 PRODUCTION OCR WORKFLOW TEST FAILED!")
        return False

    print("\n🎉 PRODUCTION OCR WORKFLOW TEST COMPLETED SUCCESSFULLY!")
    return True
|
|
|
|
if __name__ == "__main__":
    # Map the boolean verdict onto a process exit status: 0 = pass, 1 = fail.
    exit_code = 0 if main() else 1
    sys.exit(exit_code)