165 lines
5.9 KiB
Python
165 lines
5.9 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Test script to verify the OCR fix is working with the server
|
|
"""
|
|
|
|
import asyncio
|
|
import requests
|
|
import json
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# Test configuration
# Base URL of the server under test. Endpoint paths such as "/docs" are
# appended directly to it, so it carries no trailing slash.
SERVER_URL = "http://localhost:3015"
# PDF used by both tests. It is a relative path, so it is resolved against
# the directory the script is launched from.
TEST_PDF = "ocr.pdf"
|
|
|
|
async def test_server_ocr():
    """Upload the test PDF to the running server and report the OCR outcome.

    Steps performed:
      1. GET ``{SERVER_URL}/docs`` and require a 200 response.
      2. Require that ``TEST_PDF`` exists on disk.
      3. POST the file to ``{SERVER_URL}/documents/upload`` and read
         ``track_id`` from the JSON reply.
      4. Sleep two seconds, then GET
         ``{SERVER_URL}/documents/track_status/{track_id}`` once and inspect
         the ``documents`` list in the reply.

    Returns:
        bool: True when a tracked document reports the PROCESSED status.
        False in every other case: server unreachable, file missing, upload
        or status request rejected, no ``track_id``, no documents, a document
        reporting FAILED, no document having reached a final status yet, or
        any exception raised along the way.
    """
    # Bound every HTTP call so an unresponsive server fails the test instead
    # of hanging it forever (requests applies no timeout by default).
    health_timeout = 10
    upload_timeout = 300
    status_timeout = 30

    print("🧪 Testing Server OCR Fix")
    print("=" * 50)

    # Check if server is running
    try:
        health_response = requests.get(f"{SERVER_URL}/docs", timeout=health_timeout)
    except Exception as e:
        print(f"❌ Cannot connect to server: {e}")
        return False
    if health_response.status_code != 200:
        print("❌ Server is not responding properly")
        return False
    print("✅ Server is running")

    # Check if PDF file exists
    if not Path(TEST_PDF).exists():
        print(f"❌ Test PDF file not found: {TEST_PDF}")
        return False
    print(f"✅ Test PDF found: {TEST_PDF}")

    # Test document upload. The broad ``except`` below is deliberate: this is
    # the outermost boundary of the test, and any failure means "test failed".
    try:
        print(f"📤 Uploading {TEST_PDF} to server...")

        with open(TEST_PDF, 'rb') as f:
            files = {'file': (TEST_PDF, f, 'application/pdf')}
            response = requests.post(
                f"{SERVER_URL}/documents/upload",
                files=files,
                timeout=upload_timeout,
            )

        if response.status_code != 200:
            print(f"❌ Upload failed: {response.status_code} - {response.text}")
            return False

        result = response.json()
        print(f"✅ Upload successful: {result}")

        # Check processing status
        track_id = result.get('track_id')
        if not track_id:
            print("❌ No track_id returned from upload")
            return False

        print(f"📊 Checking processing status for track_id: {track_id}")

        # Wait a bit for processing
        await asyncio.sleep(2)

        status_response = requests.get(
            f"{SERVER_URL}/documents/track_status/{track_id}",
            timeout=status_timeout,
        )
        if status_response.status_code != 200:
            print(f"❌ Failed to get processing status: {status_response.status_code}")
            return False

        status_data = status_response.json()
        print(f"📈 Processing status: {status_data}")

        # Check if any documents were processed successfully
        documents = status_data.get('documents', [])
        if not documents:
            print("❌ No documents found in processing status")
            return False

        for doc in documents:
            print(f"📄 Document: {doc.get('id')}")
            print(f" Status: {doc.get('status')}")
            print(f" Content Length: {doc.get('content_length')}")
            print(f" Error: {doc.get('error_msg', 'None')}")

            # NOTE(review): compared case-insensitively because the exact
            # casing the server uses for status values is not visible in this
            # file — confirm against the server's response schema.
            status = str(doc.get('status') or '').upper()
            if status == 'PROCESSED':
                print("🎉 OCR processing successful!")
                return True
            if status == 'FAILED':
                print(f"❌ OCR processing failed: {doc.get('error_msg')}")
                return False

        # No document was in a final state at the single status check.
        # Report it explicitly rather than silently returning None.
        print("❌ No document reached a final status yet (still processing?)")
        return False

    except Exception as e:
        print(f"❌ Error during upload test: {e}")
        return False
|
|
|
|
async def test_direct_processor():
    """Run the document processor in-process on the test PDF.

    Appends ``LightRAG-main`` to ``sys.path``, imports
    ``get_document_processor`` from ``lightrag.document_processor``, checks
    that the processor reports OCR as available, then awaits
    ``process_document(TEST_PDF)`` and prints the result fields.

    Returns:
        bool: True when the result reports success and carries non-empty
        content. False when OCR is unavailable, when processing did not
        succeed or produced no content, or when any exception is raised
        (the traceback is printed in that case).
    """
    print("\n🔧 Testing Document Processor Directly")
    print("=" * 50)

    try:
        # Make the bundled LightRAG checkout importable before importing it.
        sys.path.append('LightRAG-main')
        from lightrag.document_processor import get_document_processor

        doc_processor = get_document_processor()
        print("✅ Document processor loaded successfully")

        # Test OCR initialization
        if not doc_processor.ocr_processor.ocr_available:
            print("❌ OCR processor not available")
            return False
        print("✅ OCR processor is available")
        print(f" GPU enabled: {doc_processor.ocr_processor.use_gpu}")

        # Test processing the PDF
        print(f"📄 Processing {TEST_PDF} directly...")
        outcome = await doc_processor.process_document(TEST_PDF)

        print("✅ Direct processing result:")
        print(f" Success: {outcome.success}")
        print(f" Content length: {len(outcome.content)}")
        print(f" Error: {outcome.error}")
        print(f" Metadata: {outcome.metadata}")

        produced_text = outcome.success and len(outcome.content) > 0
        if not produced_text:
            print("❌ Direct OCR processing failed")
            return False

        print("🎉 Direct OCR processing successful!")
        return True

    except Exception as exc:
        print(f"❌ Error in direct processor test: {exc}")
        import traceback
        traceback.print_exc()
        return False
|
|
|
|
async def main():
    """Run the direct-processor test, then the server test, and summarise.

    Both tests always run, in that order; the second is not skipped when the
    first fails. A PASS/FAIL line is printed for each.

    Returns:
        bool: True only when both tests returned a truthy result.
    """
    rule = "=" * 50

    def verdict(passed):
        # Label shown next to each test in the summary table.
        return '✅ PASS' if passed else '❌ FAIL'

    print("🚀 Starting OCR Fix Verification Tests")
    print(rule)

    # Test 1: Direct processor test
    direct_result = await test_direct_processor()

    # Test 2: Server upload test
    server_result = await test_server_ocr()

    print("\n" + rule)
    print("📊 TEST RESULTS SUMMARY")
    print(rule)
    print(f"Direct Processor Test: {verdict(direct_result)}")
    print(f"Server Upload Test: {verdict(server_result)}")

    if not (direct_result and server_result):
        print("\n💥 SOME TESTS FAILED - OCR fix needs more work")
        return False

    print("\n🎉 ALL TESTS PASSED - OCR fix is working!")
    return True
|
|
|
|
if __name__ == "__main__":
    # Run the async test suite to completion and map its boolean result onto
    # the process exit status: 0 when everything passed, 1 otherwise.
    all_passed = asyncio.run(main())
    exit_status = 0 if all_passed else 1
    sys.exit(exit_status)