261 lines
9.6 KiB
Python
261 lines
9.6 KiB
Python
"""
Complete OCR Workflow Test with GPU Mode

Tests upload, indexing, and search functionality for scanned PDF tables.
"""
|
|
|
|
import os
|
|
import sys
|
|
import requests
|
|
import time
|
|
import json
|
|
from pathlib import Path
|
|
|
|
# Configure GPU environment.
# The CUDA 11.8 toolkit location is pinned and mirrored into CUDA_HOME, only
# GPU 0 is made visible, and the toolkit's bin directory is placed at the
# front of PATH (Windows-style separator, as in the pinned path itself).
os.environ['CUDA_PATH'] = r'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8'
os.environ.update(
    CUDA_HOME=os.environ['CUDA_PATH'],
    CUDA_VISIBLE_DEVICES='0',
    PATH=os.environ['CUDA_PATH'] + '\\bin;' + os.environ['PATH'],
)
|
|
|
|
# Server configuration.
# Every setting can be overridden through an environment variable so the
# script can target another server, account or PDF without editing the
# source; the fallbacks are the values that used to be hard-coded.
# NOTE(review): the fallback credentials are committed in plain text --
# confirm they are throwaway test credentials or move them out of the repo.
BASE_URL = os.environ.get('OCR_TEST_BASE_URL', 'http://localhost:3015').rstrip('/')
AUTH_CREDENTIALS = {
    'username': os.environ.get('OCR_TEST_USERNAME', 'jleu3482'),
    'password': os.environ.get('OCR_TEST_PASSWORD', 'jleu1212'),
}
OCR_PDF_PATH = os.environ.get('OCR_TEST_PDF_PATH', 'ocr.pdf')
|
|
|
|
def verify_gpu_environment():
    """Check that PaddlePaddle and PaddleOCR can actually use a GPU.

    Imports ``paddle``, reports its version, its CUDA build flag and the
    number of GPU devices it sees, then constructs a ``PaddleOCR`` engine
    with ``use_gpu=True``.

    Returns:
        bool: ``True`` only when PaddlePaddle is compiled with CUDA, reports
        at least one GPU device, and the PaddleOCR engine can be constructed.
        ``False`` otherwise, including when a package is missing or any step
        raises.
    """
    print("🔍 Verifying GPU Environment...")

    try:
        # Imported lazily so a missing package is reported as a failed check
        # instead of crashing the whole script at import time.
        import paddle

        print(f"✅ PaddlePaddle version: {paddle.__version__}")

        cuda_compiled = paddle.is_compiled_with_cuda()
        print(f"{'✅' if cuda_compiled else '❌'} CUDA compiled: {cuda_compiled}")

        gpu_count = paddle.device.cuda.device_count()
        print(f"{'✅' if gpu_count > 0 else '❌'} GPU devices: {gpu_count}")

        # A CPU-only build or an empty device list means the rest of the
        # workflow would not be exercising GPU mode at all, so fail here
        # rather than report a misleading success.
        if not cuda_compiled or gpu_count < 1:
            print("❌ GPU environment verification failed: "
                  "PaddlePaddle has no usable CUDA device")
            return False

        # Test PaddleOCR GPU initialization. Constructing the engine is the
        # check itself; the instance is not needed afterwards.
        from paddleocr import PaddleOCR
        PaddleOCR(use_gpu=True, lang='en', show_log=False)
        print("✅ PaddleOCR GPU initialization successful")

        return True
    except Exception as e:
        print(f"❌ GPU environment verification failed: {e}")
        return False
|
|
|
|
def test_server_connectivity():
    """Check that the server responds and that the configured account can log in.

    Sends a GET to the server root, then POSTs ``AUTH_CREDENTIALS`` as form
    data to ``/login``.

    Returns:
        The ``access_token`` value from the login response, or ``None`` when
        the server cannot be reached, the login is rejected, the response
        carries no token, or any step raises.
    """
    print("\n🌐 Testing Server Connectivity...")

    try:
        # Test basic connectivity. Any HTTP status counts as "running";
        # only a transport-level error lands in the except branch.
        response = requests.get(f'{BASE_URL}/', timeout=5)
        print(f"✅ Server is running (status: {response.status_code})")

        # Test authentication
        login_response = requests.post(f'{BASE_URL}/login', data=AUTH_CREDENTIALS, timeout=10)
        if login_response.status_code != 200:
            print(f"❌ Authentication failed: {login_response.status_code} - {login_response.text}")
            return None

        token = login_response.json().get('access_token')
        if not token:
            # A 200 reply without a token cannot authorise the later steps,
            # so report it as a failure instead of announcing success.
            print("❌ Authentication failed: login response contained no access_token")
            return None

        print("✅ Authentication successful")
        return token

    except Exception as e:
        print(f"❌ Server connectivity test failed: {e}")
        return None
|
|
|
|
def clear_existing_documents(token):
    """Ask the server to delete its stored documents (best effort).

    Args:
        token: Bearer token placed in the ``Authorization`` header.

    Returns:
        bool: Always ``True``. A non-200 reply or a raised exception is only
        reported, so the rest of the workflow can continue.
    """
    print("\n🗑️ Clearing existing documents...")

    try:
        reply = requests.delete(
            f'{BASE_URL}/documents',
            headers={'Authorization': f'Bearer {token}'},
            timeout=30,
        )
        if reply.status_code == 200:
            print("✅ Documents cleared successfully")
        else:
            print(f"⚠️ Clear documents response: {reply.status_code}")
    except Exception as err:
        print(f"⚠️ Clear documents failed: {err}")

    # Deliberately never blocks the workflow, whatever happened above.
    return True
|
|
|
|
def upload_ocr_pdf(token):
    """Send the PDF at ``OCR_PDF_PATH`` to the server's upload endpoint.

    Args:
        token: Bearer token placed in the ``Authorization`` header.

    Returns:
        The decoded JSON body of the upload response when the server answers
        200; ``False`` when the file is missing, the server answers with any
        other status, or an exception is raised.
    """
    print(f"\n📤 Uploading {OCR_PDF_PATH}...")

    try:
        auth_headers = {'Authorization': f'Bearer {token}'}

        pdf_present = os.path.exists(OCR_PDF_PATH)
        if not pdf_present:
            print(f"❌ OCR PDF not found: {OCR_PDF_PATH}")
            return False

        # The upload gets a longer timeout than the other requests.
        with open(OCR_PDF_PATH, 'rb') as pdf_file:
            reply = requests.post(
                f'{BASE_URL}/documents/upload',
                files={'file': (OCR_PDF_PATH, pdf_file, 'application/pdf')},
                headers=auth_headers,
                timeout=60,
            )

        if reply.status_code != 200:
            print(f"❌ Upload failed: {reply.status_code} - {reply.text}")
            return False

        payload = reply.json()
        print(f"✅ Upload successful: {payload}")
        return payload

    except Exception as err:
        print(f"❌ Upload failed: {err}")
        return False
|
|
|
|
def monitor_processing(token, max_wait=120, poll_interval=5):
    """Poll the document list until a document completes, fails, or time runs out.

    Args:
        token: Bearer token placed in the ``Authorization`` header.
        max_wait: Approximate upper bound, in seconds, on the total sleeping
            time; ``max_wait // poll_interval`` polls are performed.
        poll_interval: Seconds to sleep before each poll. Defaults to 5, the
            interval that used to be hard-coded. Must be positive.

    Returns:
        bool: ``True`` as soon as a poll lists at least one completed
        document. ``False`` when a poll lists a failed document (and no
        completed one), when all polls are used up, or when polling raises.
    """
    print(f"\n🔄 Monitoring OCR processing (max {max_wait}s)...")

    try:
        if poll_interval <= 0:
            raise ValueError("poll_interval must be positive")

        headers = {'Authorization': f'Bearer {token}'}
        poll_count = int(max_wait // poll_interval)

        for attempt in range(1, poll_count + 1):
            time.sleep(poll_interval)
            elapsed = attempt * poll_interval

            docs_response = requests.get(f'{BASE_URL}/documents', headers=headers, timeout=10)
            if docs_response.status_code != 200:
                # Surface unexpected replies instead of skipping them
                # silently, then keep polling until the budget is used up.
                print(f"⚠️ Status check after {elapsed}s returned {docs_response.status_code}")
                continue

            statuses = docs_response.json().get('statuses', {})
            completed_docs = statuses.get('completed', [])
            processing = len(statuses.get('processing', []))
            failed_docs = statuses.get('failed', [])

            print(f"⏰ Progress after {elapsed}s: Processing={processing}, "
                  f"Completed={len(completed_docs)}, Failed={len(failed_docs)}")

            # Completed documents take precedence over failed ones.
            if completed_docs:
                for doc in completed_docs:
                    print(f"🎉 Completed: {doc.get('file_path')}")
                    print(f" Content length: {doc.get('content_length', 0)}")
                    print(f" Chunks: {doc.get('chunks_count', 0)}")
                return True

            if failed_docs:
                for doc in failed_docs:
                    print(f"❌ Failed: {doc.get('file_path')}")
                    print(f" Error: {doc.get('error_msg', 'Unknown error')}")
                return False

        print("⏰ Processing timeout - check server logs for details")
        return False

    except Exception as e:
        print(f"❌ Monitoring failed: {e}")
        return False
|
|
|
|
def test_search_functionality(token):
    """Run a fixed set of search queries against the search endpoint.

    Each query is POSTed as JSON to ``/api/search``. A query counts as
    successful when the server answers 200, regardless of how many results
    it returns; the first result's content is previewed when present.

    Args:
        token: Bearer token placed in the ``Authorization`` header.

    Returns:
        bool: ``True`` when at least one query succeeded, ``False`` when
        none did or when an unexpected exception escaped the query loop.
    """
    print("\n🔍 Testing Search Functionality...")

    try:
        auth_headers = {'Authorization': f'Bearer {token}'}

        # Phrases the test expects to find in the OCR-extracted content.
        queries = [
            "safety precautions",
            "minimum safe distance",
            "high voltage work",
            "traction voltage",
            "conductive tools",
            "live parts",
        ]

        passed = 0
        for query in queries:
            try:
                reply = requests.post(
                    f'{BASE_URL}/api/search',
                    json={'query': query},
                    headers=auth_headers,
                    timeout=15,
                )

                if reply.status_code != 200:
                    print(f"❌ Search '{query}' failed: {reply.status_code}")
                    continue

                hits = reply.json().get('results', [])
                print(f"✅ Search '{query}': Found {len(hits)} results")

                # Preview the first 100 characters of the top hit, if any.
                if hits:
                    preview = hits[0].get('content', '')[:100] + '...'
                    print(f" 📄 First result: {preview}")

                passed += 1

            except Exception as err:
                # One failing query must not stop the remaining ones.
                print(f"❌ Search '{query}' error: {err}")

        print(f"\n📊 Search test: {passed}/{len(queries)} queries successful")
        return passed > 0

    except Exception as err:
        print(f"❌ Search functionality test failed: {err}")
        return False
|
|
|
|
def main():
    """Run the OCR workflow steps in order and print a summary.

    Steps: GPU check, server login, document clean-up, PDF upload,
    processing monitor, search test. A failure in the GPU check, login,
    upload or processing step prints a message and returns early; only the
    search step can fail and still reach the final summary.
    """
    rule = "=" * 60

    def mark(ok):
        # Summary marker for a step outcome.
        return '✅' if ok else '❌'

    print("🚀 Complete OCR Workflow Test with GPU Mode")
    print(rule)

    # Step 1: Verify GPU environment
    gpu_ready = verify_gpu_environment()
    if not gpu_ready:
        print("❌ Cannot proceed - GPU environment not ready")
        return

    # Step 2: Test server connectivity
    token = test_server_connectivity()
    if not token:
        print("❌ Cannot proceed - server connectivity failed")
        return

    # Step 3: Clear existing documents (never aborts the run)
    cleared = clear_existing_documents(token)
    if not cleared:
        print("⚠️ Clear documents failed, but continuing...")

    # Step 4: Upload OCR PDF
    uploaded = upload_ocr_pdf(token)
    if not uploaded:
        print("❌ OCR PDF upload failed")
        return

    # Step 5: Monitor processing
    processing_ok = monitor_processing(token)
    if not processing_ok:
        print("❌ OCR processing failed")
        return

    # Step 6: Test search functionality
    search_ok = test_search_functionality(token)

    # Final results. The first three steps are known to have passed,
    # otherwise the function would already have returned.
    print("\n" + rule)
    print("📊 FINAL OCR WORKFLOW RESULTS:")
    print(" GPU Environment: ✅")
    print(" Server Connectivity: ✅")
    print(" OCR PDF Upload: ✅")
    print(f" Processing: {mark(processing_ok)}")
    print(f" Search: {mark(search_ok)}")

    if not (processing_ok and search_ok):
        print("\n⚠️ PARTIAL SUCCESS: Some workflow steps completed, but issues remain.")
        print(" Check server logs for detailed error information.")
    else:
        print("\n🎉 SUCCESS: OCR PDF upload, indexing, and search working with GPU mode!")
        print(" The scanned table document has been successfully processed and is searchable.")
|
|
|
# Run the workflow only when this file is executed as a script.
if __name__ == '__main__':
    main()