Files
railseek6/test_server_ocr_fix.py

165 lines
5.9 KiB
Python

#!/usr/bin/env python3
"""
Test script to verify the OCR fix is working with the server
"""
import asyncio
import requests
import json
import sys
from pathlib import Path
# Test configuration
# Base URL of the running server; the tests below request /docs,
# /documents/upload and /documents/track_status/<track_id> beneath it.
SERVER_URL = "http://localhost:3015"
# PDF used by both tests. It is a relative path, so it is resolved against
# the current working directory of the process running this script.
TEST_PDF = "ocr.pdf"
def _report_tracked_documents(documents):
    """Print each tracked document and return the first terminal outcome.

    Returns True for the first document whose status is ``PROCESSED``,
    False for the first whose status is ``FAILED``, and None when no
    document has reached either status yet.
    """
    for doc in documents:
        print(f"📄 Document: {doc.get('id')}")
        print(f" Status: {doc.get('status')}")
        print(f" Content Length: {doc.get('content_length')}")
        print(f" Error: {doc.get('error_msg', 'None')}")
        # Compare case-insensitively so "processed" and "PROCESSED" both match.
        # NOTE(review): confirm the exact casing the status endpoint returns.
        status = str(doc.get('status') or '').upper()
        if status == 'PROCESSED':
            print("🎉 OCR processing successful!")
            return True
        if status == 'FAILED':
            print(f"❌ OCR processing failed: {doc.get('error_msg')}")
            return False
    return None


async def test_server_ocr():
    """Upload the test PDF to the running server and verify OCR processing.

    Steps: check that the server answers on ``/docs``, confirm ``TEST_PDF``
    exists, POST it to ``/documents/upload``, then poll
    ``/documents/track_status/{track_id}`` until a document reports a
    terminal status or the polling budget is used up.

    Returns:
        bool: True if a tracked document reaches the ``PROCESSED`` status.
        False on any failure: server unreachable, missing file, upload or
        status request error, a ``FAILED`` document, or no terminal status
        before the polling budget is exhausted.
    """
    request_timeout = 30  # seconds; without a timeout requests can block forever
    poll_interval = 2  # seconds to wait before each status check
    max_polls = 30  # bounds the total wait to roughly one minute

    print("🧪 Testing Server OCR Fix")
    print("=" * 50)

    # Check if server is running
    try:
        health_response = requests.get(f"{SERVER_URL}/docs", timeout=request_timeout)
        if health_response.status_code != 200:
            print("❌ Server is not responding properly")
            return False
        print("✅ Server is running")
    except Exception as e:
        print(f"❌ Cannot connect to server: {e}")
        return False

    # Check if PDF file exists
    if not Path(TEST_PDF).exists():
        print(f"❌ Test PDF file not found: {TEST_PDF}")
        return False
    print(f"✅ Test PDF found: {TEST_PDF}")

    # Test document upload
    try:
        print(f"📤 Uploading {TEST_PDF} to server...")
        with open(TEST_PDF, 'rb') as f:
            files = {'file': (TEST_PDF, f, 'application/pdf')}
            response = requests.post(
                f"{SERVER_URL}/documents/upload",
                files=files,
                timeout=request_timeout,
            )

        if response.status_code != 200:
            print(f"❌ Upload failed: {response.status_code} - {response.text}")
            return False

        result = response.json()
        print(f"✅ Upload successful: {result}")

        # Check processing status
        track_id = result.get('track_id')
        if not track_id:
            print("❌ No track_id returned from upload")
            return False

        print(f"📊 Checking processing status for track_id: {track_id}")
        status_url = f"{SERVER_URL}/documents/track_status/{track_id}"
        documents = []
        # Poll instead of checking once: a single check after a fixed delay
        # can see a document that is still being processed and would then
        # finish without any verdict.
        for _ in range(max_polls):
            await asyncio.sleep(poll_interval)
            status_response = requests.get(status_url, timeout=request_timeout)
            if status_response.status_code != 200:
                print(f"❌ Failed to get processing status: {status_response.status_code}")
                return False

            status_data = status_response.json()
            print(f"📈 Processing status: {status_data}")

            # Check if any documents were processed successfully
            documents = status_data.get('documents') or []
            outcome = _report_tracked_documents(documents)
            if outcome is not None:
                return outcome

        # Polling budget exhausted without a PROCESSED/FAILED document.
        if not documents:
            print("❌ No documents found in processing status")
        else:
            waited = poll_interval * max_polls
            print(f"❌ Timed out after {waited}s waiting for OCR processing to finish")
        return False
    except Exception as e:
        # Top-level boundary of this check: report any error and fail the test.
        print(f"❌ Error during upload test: {e}")
        return False
async def test_direct_processor():
    """Run the document processor directly on ``TEST_PDF``, bypassing the server.

    Makes the ``LightRAG-main`` directory importable, obtains the processor
    via ``get_document_processor()``, checks that its OCR processor reports
    itself available, then awaits ``process_document(TEST_PDF)`` and prints
    the result fields.

    Returns:
        bool: True only if the result reports success and has non-empty
        content. False if OCR is unavailable, processing fails, or any
        exception is raised (the traceback is printed).
    """
    print("\n🔧 Testing Document Processor Directly")
    print("=" * 50)
    try:
        # The path is relative to the current working directory. Skip the
        # append when already present so repeated calls do not pile up
        # duplicate sys.path entries.
        if 'LightRAG-main' not in sys.path:
            sys.path.append('LightRAG-main')
        from lightrag.document_processor import get_document_processor

        processor = get_document_processor()
        print("✅ Document processor loaded successfully")

        # Test OCR initialization
        if processor.ocr_processor.ocr_available:
            print("✅ OCR processor is available")
            print(f" GPU enabled: {processor.ocr_processor.use_gpu}")
        else:
            print("❌ OCR processor not available")
            return False

        # Test processing the PDF
        print(f"📄 Processing {TEST_PDF} directly...")
        result = await processor.process_document(TEST_PDF)

        # Presumably content can be None when processing fails (TODO confirm);
        # normalising it keeps len() from raising before the error is printed.
        content = result.content or ""
        print("✅ Direct processing result:")
        print(f" Success: {result.success}")
        print(f" Content length: {len(content)}")
        print(f" Error: {result.error}")
        print(f" Metadata: {result.metadata}")

        if result.success and len(content) > 0:
            print("🎉 Direct OCR processing successful!")
            return True
        print("❌ Direct OCR processing failed")
        return False
    except Exception as e:
        # Top-level boundary of this check: report with a traceback and fail.
        print(f"❌ Error in direct processor test: {e}")
        import traceback
        traceback.print_exc()
        return False
async def main():
    """Run the direct-processor check, then the server check, and summarise.

    Returns:
        bool: True when both checks return a truthy result, otherwise False.
    """
    divider = "=" * 50

    def label(passed):
        # Text shown next to each test name in the summary.
        return '✅ PASS' if passed else '❌ FAIL'

    print("🚀 Starting OCR Fix Verification Tests")
    print(divider)

    # The checks run one after the other: direct processor first, server second.
    direct_ok = await test_direct_processor()
    server_ok = await test_server_ocr()

    print("\n" + divider)
    print("📊 TEST RESULTS SUMMARY")
    print(divider)
    print(f"Direct Processor Test: {label(direct_ok)}")
    print(f"Server Upload Test: {label(server_ok)}")

    if not (direct_ok and server_ok):
        print("\n💥 SOME TESTS FAILED - OCR fix needs more work")
        return False

    print("\n🎉 ALL TESTS PASSED - OCR fix is working!")
    return True
if __name__ == "__main__":
    # The process exit code mirrors the overall result: 0 when every test
    # passed, 1 otherwise.
    exit_code = 0 if asyncio.run(main()) else 1
    sys.exit(exit_code)