# File: railseek6/test_complete_ocr_workflow.py
# Listing metadata: 261 lines, 9.6 KiB, Python

"""
Complete OCR Workflow Test with GPU Mode
Tests upload, indexing, and search functionality for scanned PDF tables
"""
import os
import sys
import requests
import time
import json
from pathlib import Path
# Configure GPU environment.
# The toolkit location is a Windows install path for CUDA 11.8 and is forced
# unconditionally, replacing any CUDA_PATH already present in the environment.
os.environ['CUDA_PATH'] = r'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8'
os.environ['CUDA_HOME'] = os.environ['CUDA_PATH']
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
# Prepend the CUDA bin directory to PATH. os.pathsep is ';' on Windows, so the
# result is unchanged there, while other platforms no longer get the first
# existing PATH entry glued onto the CUDA directory. A missing PATH is treated
# as empty instead of raising KeyError.
os.environ['PATH'] = os.pathsep.join(
    part
    for part in (os.environ['CUDA_PATH'] + '\\bin', os.environ.get('PATH', ''))
    if part
)

# Server configuration.
# Every value can be overridden through an environment variable so that
# credentials do not have to live in source control; the literals below are
# only fallbacks kept for backward compatibility.
# Trailing slashes are dropped because request URLs are built as f'{BASE_URL}/...'.
BASE_URL = os.environ.get('OCR_TEST_BASE_URL', 'http://localhost:3015').rstrip('/')
AUTH_CREDENTIALS = {
    'username': os.environ.get('OCR_TEST_USERNAME', 'jleu3482'),
    'password': os.environ.get('OCR_TEST_PASSWORD', 'jleu1212'),
}
OCR_PDF_PATH = os.environ.get('OCR_TEST_PDF_PATH', 'ocr.pdf')
def verify_gpu_environment():
    """Verify that PaddlePaddle and PaddleOCR can run on a GPU.

    Imports PaddlePaddle, reports its version, whether it was compiled with
    CUDA and how many GPU devices it sees, then constructs a PaddleOCR engine
    with ``use_gpu=True``.

    Returns:
        bool: True only when PaddlePaddle reports a CUDA build, at least one
        GPU device is visible, and PaddleOCR initialises without raising.
        False otherwise, including when either package cannot be imported.
    """
    print("🔍 Verifying GPU Environment...")
    try:
        import paddle
        print(f"✅ PaddlePaddle version: {paddle.__version__}")

        cuda_compiled = paddle.is_compiled_with_cuda()
        print(f"{'✅' if cuda_compiled else '❌'} CUDA compiled: {cuda_compiled}")
        gpu_count = paddle.device.cuda.device_count()
        print(f"{'✅' if gpu_count > 0 else '❌'} GPU devices: {gpu_count}")

        # Without a CUDA build or a visible device there is no GPU to verify,
        # so stop here instead of reporting success.
        if not cuda_compiled or gpu_count < 1:
            print("❌ GPU environment verification failed: "
                  "PaddlePaddle has no CUDA support or no GPU device is visible")
            return False

        # Test PaddleOCR GPU initialization; only successful construction
        # matters, the engine itself is not used afterwards.
        from paddleocr import PaddleOCR
        PaddleOCR(use_gpu=True, lang='en', show_log=False)
        print("✅ PaddleOCR GPU initialization successful")
        return True
    except Exception as e:
        # Deliberately broad: a missing package or any initialisation error
        # means the environment is not usable for this test run.
        print(f"❌ GPU environment verification failed: {e}")
        return False
def test_server_connectivity():
    """Check that the server responds and that the configured login works.

    Sends a GET to ``{BASE_URL}/`` and then posts ``AUTH_CREDENTIALS`` as form
    data to ``{BASE_URL}/login``.

    Returns:
        The value of ``access_token`` from the login response JSON, or None
        when the login status is not 200, the response carries no token, or
        any exception occurs while talking to the server.
    """
    print("\n🌐 Testing Server Connectivity...")
    try:
        # Test basic connectivity (any HTTP status counts as "running").
        response = requests.get(f'{BASE_URL}/', timeout=5)
        print(f"✅ Server is running (status: {response.status_code})")

        # Test authentication
        login_response = requests.post(f'{BASE_URL}/login', data=AUTH_CREDENTIALS, timeout=10)
        if login_response.status_code != 200:
            print(f"❌ Authentication failed: {login_response.status_code} - {login_response.text}")
            return None

        token = login_response.json().get('access_token')
        if not token:
            # A 200 response without a token is unusable for later requests,
            # so report it as a failure rather than as a success.
            print("❌ Authentication failed: login response did not contain an access token")
            return None

        print("✅ Authentication successful")
        return token
    except Exception as e:
        # Covers connection errors, timeouts and undecodable JSON bodies.
        print(f"❌ Server connectivity test failed: {e}")
        return None
def clear_existing_documents(token):
    """Ask the server to delete all documents (best effort).

    Sends a DELETE to ``{BASE_URL}/documents`` with a bearer token header.

    Args:
        token: Access token placed in the ``Authorization`` header.

    Returns:
        bool: Always True. A non-200 status or an exception is only reported
        as a warning so the rest of the workflow can continue.
    """
    print("\n🗑️ Clearing existing documents...")
    try:
        auth_header = {'Authorization': f'Bearer {token}'}
        reply = requests.delete(
            f'{BASE_URL}/documents',
            headers=auth_header,
            timeout=30,
        )
        if reply.status_code != 200:
            print(f"⚠️ Clear documents response: {reply.status_code}")
        else:
            print("✅ Documents cleared successfully")
    except Exception as err:
        print(f"⚠️ Clear documents failed: {err}")
    # Every path continues the workflow, so the outcome is always True.
    return True
def upload_ocr_pdf(token):
    """Upload the PDF at ``OCR_PDF_PATH`` to the server.

    Posts the file as multipart form field ``file`` to
    ``{BASE_URL}/documents/upload`` with a bearer token header.

    Args:
        token: Access token placed in the ``Authorization`` header.

    Returns:
        The decoded JSON body of the upload response when the status is 200;
        False when the file does not exist, the status is not 200, or an
        exception occurs.
    """
    print(f"\n📤 Uploading {OCR_PDF_PATH}...")
    try:
        auth_header = {'Authorization': f'Bearer {token}'}

        if not os.path.exists(OCR_PDF_PATH):
            print(f"❌ OCR PDF not found: {OCR_PDF_PATH}")
            return False

        # The upload gets a longer timeout than the other requests.
        with open(OCR_PDF_PATH, 'rb') as pdf_file:
            reply = requests.post(
                f'{BASE_URL}/documents/upload',
                files={'file': (OCR_PDF_PATH, pdf_file, 'application/pdf')},
                headers=auth_header,
                timeout=60,
            )

        if reply.status_code != 200:
            print(f"❌ Upload failed: {reply.status_code} - {reply.text}")
            return False

        payload = reply.json()
        print(f"✅ Upload successful: {payload}")
        return payload
    except Exception as err:
        print(f"❌ Upload failed: {err}")
        return False
def monitor_processing(token, max_wait=120, poll_interval=5):
    """Poll the document list until a document completes, fails, or time runs out.

    Sleeps, then requests ``{BASE_URL}/documents`` and inspects the
    ``statuses`` mapping of the JSON response (keys ``completed``,
    ``processing`` and ``failed``). Completed documents are checked before
    failed ones.

    Args:
        token: Access token placed in the ``Authorization`` header.
        max_wait: Upper bound, in seconds, on the total time spent waiting.
            The bound is enforced against a monotonic clock, so slow status
            requests count towards it. A value of zero or less performs no
            polls.
        poll_interval: Seconds to sleep before each status check. The last
            sleep is shortened so that a ``max_wait`` smaller than the
            interval still gets one check.

    Returns:
        bool: True as soon as at least one completed document is listed;
        False when a failed document is listed, the deadline passes, or an
        exception occurs while polling.

    Raises:
        ValueError: If ``poll_interval`` is not positive.
    """
    if poll_interval <= 0:
        raise ValueError("poll_interval must be positive")

    print(f"\n🔄 Monitoring OCR processing (max {max_wait}s)...")
    try:
        headers = {'Authorization': f'Bearer {token}'}
        started = time.monotonic()
        deadline = started + max_wait

        while True:
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                break

            # Wait first: processing needs time before the first status check.
            time.sleep(min(poll_interval, remaining))

            docs_response = requests.get(f'{BASE_URL}/documents', headers=headers, timeout=10)
            elapsed = round(time.monotonic() - started)
            if docs_response.status_code != 200:
                # Keep polling, but make the failed check visible.
                print(f"⚠️ Status check after {elapsed}s returned HTTP {docs_response.status_code}")
                continue

            statuses = docs_response.json().get('statuses', {})
            completed_docs = statuses.get('completed', [])
            processing_docs = statuses.get('processing', [])
            failed_docs = statuses.get('failed', [])
            print(f"⏰ Progress after {elapsed}s: Processing={len(processing_docs)}, "
                  f"Completed={len(completed_docs)}, Failed={len(failed_docs)}")

            # Check for completed documents
            if len(completed_docs) > 0:
                for doc in completed_docs:
                    print(f"🎉 Completed: {doc.get('file_path')}")
                    print(f" Content length: {doc.get('content_length', 0)}")
                    print(f" Chunks: {doc.get('chunks_count', 0)}")
                return True

            # Check for failed documents
            if len(failed_docs) > 0:
                for doc in failed_docs:
                    print(f"❌ Failed: {doc.get('file_path')}")
                    print(f" Error: {doc.get('error_msg', 'Unknown error')}")
                return False

        print("⏰ Processing timeout - check server logs for details")
        return False
    except Exception as e:
        print(f"❌ Monitoring failed: {e}")
        return False
def test_search_functionality(token):
    """Run a fixed set of search queries against the server.

    Posts each query as JSON ``{'query': ...}`` to ``{BASE_URL}/api/search``
    and prints the number of entries under ``results`` plus the first 100
    characters of the first result's ``content``.

    Args:
        token: Access token placed in the ``Authorization`` header.

    Returns:
        bool: True when at least one query got an HTTP 200 response, False
        otherwise or when an unexpected exception occurs.
    """
    print("\n🔍 Testing Search Functionality...")
    try:
        auth_header = {'Authorization': f'Bearer {token}'}

        # Phrases expected to appear in the OCR-extracted content.
        queries = (
            "safety precautions",
            "minimum safe distance",
            "high voltage work",
            "traction voltage",
            "conductive tools",
            "live parts",
        )

        passed = 0
        for phrase in queries:
            try:
                reply = requests.post(
                    f'{BASE_URL}/api/search',
                    json={'query': phrase},
                    headers=auth_header,
                    timeout=15,
                )
                if reply.status_code != 200:
                    print(f"❌ Search '{phrase}' failed: {reply.status_code}")
                    continue

                hits = reply.json().get('results', [])
                print(f"✅ Search '{phrase}': Found {len(hits)} results")

                # Show a snippet of the first result when there is one.
                if hits:
                    preview = hits[0].get('content', '')[:100] + '...'
                    print(f" 📄 First result: {preview}")

                # A 200 response counts as a success even with zero results.
                passed += 1
            except Exception as err:
                # One failing query must not stop the remaining ones.
                print(f"❌ Search '{phrase}' error: {err}")

        print(f"\n📊 Search test: {passed}/{len(queries)} queries successful")
        return passed > 0
    except Exception as err:
        print(f"❌ Search functionality test failed: {err}")
        return False
def main():
    """Run the complete OCR workflow test and print a summary.

    Steps, in order: verify the GPU environment, log in to the server, clear
    existing documents (best effort), upload the OCR PDF, wait for processing
    and run the search queries. The first failing mandatory step aborts the
    run.

    Returns:
        int: 0 when processing and search both succeeded, 1 when any step
        failed. The value is used as the process exit status.
    """
    print("🚀 Complete OCR Workflow Test with GPU Mode")
    print("=" * 60)

    # Step 1: Verify GPU environment
    if not verify_gpu_environment():
        print("❌ Cannot proceed - GPU environment not ready")
        return 1

    # Step 2: Test server connectivity
    token = test_server_connectivity()
    if not token:
        print("❌ Cannot proceed - server connectivity failed")
        return 1

    # Step 3: Clear existing documents (a failure here is not fatal)
    if not clear_existing_documents(token):
        print("⚠️ Clear documents failed, but continuing...")

    # Step 4: Upload OCR PDF
    upload_result = upload_ocr_pdf(token)
    if not upload_result:
        print("❌ OCR PDF upload failed")
        return 1

    # Step 5: Monitor processing
    processing_ok = monitor_processing(token)
    if not processing_ok:
        print("❌ OCR processing failed")
        return 1

    # Step 6: Test search functionality
    search_ok = test_search_functionality(token)

    # Final results. The first three steps must have passed to get this far.
    print("\n" + "=" * 60)
    print("📊 FINAL OCR WORKFLOW RESULTS:")
    print(" GPU Environment: ✅")
    print(" Server Connectivity: ✅")
    print(" OCR PDF Upload: ✅")
    print(f" Processing: {'✅' if processing_ok else '❌'}")
    print(f" Search: {'✅' if search_ok else '❌'}")

    if processing_ok and search_ok:
        print("\n🎉 SUCCESS: OCR PDF upload, indexing, and search working with GPU mode!")
        print(" The scanned table document has been successfully processed and is searchable.")
        return 0

    print("\n⚠️ PARTIAL SUCCESS: Some workflow steps completed, but issues remain.")
    print(" Check server logs for detailed error information.")
    return 1


if __name__ == "__main__":
    # Propagate the workflow result so shells and CI can detect a failure.
    sys.exit(main())