60 lines
1.8 KiB
Python
60 lines
1.8 KiB
Python
#!/usr/bin/env python3
|
|
"""
Direct test of the OCR fix with GPU mode.
"""
|
|
|
|
import sys
|
|
import os
|
|
sys.path.append('LightRAG-main')
|
|
|
|
from lightrag.document_processor import OCRProcessor
|
|
|
|
def test_ocr_fix():
    """Run a GPU-mode OCR smoke test against ``ocr.pdf`` and report the outcome.

    Builds an ``OCRProcessor`` with ``use_gpu=True``, prints its
    ``ocr_available`` and ``use_gpu`` attributes, then calls
    ``extract_text_from_image('ocr.pdf')`` and prints the text length,
    confidence and line count read from the result.

    Returns:
        bool: ``True`` only when the extracted text is longer than 100
        characters. ``False`` when OCR is unavailable, when the text is
        empty or at most 100 characters long, or when any exception is
        raised (the error and its traceback are printed, not propagated).
    """
    print("🧪 Testing OCR Fix with GPU Mode")
    print("=" * 50)

    try:
        # Bring up the processor in GPU mode and echo what it reports.
        print("🔧 Initializing OCR processor with GPU...")
        processor = OCRProcessor(use_gpu=True)
        print(f"✅ OCR available: {processor.ocr_available}")
        print(f"✅ Using GPU: {processor.use_gpu}")

        # Nothing to test without a working OCR backend.
        if not processor.ocr_available:
            print("❌ OCR not available, cannot proceed")
            return False

        # Run the extraction on the sample document.
        print("\n📄 Testing OCR extraction on ocr.pdf...")
        extraction = processor.extract_text_from_image('ocr.pdf')

        text = extraction['text']
        print(f"✅ Extracted text length: {len(text)}")
        print(f"✅ Confidence: {extraction['confidence']}")
        print(f"✅ Line count: {extraction['line_count']}")

        if not text:
            print("❌ FAILED: No text extracted from PDF")
            return False

        # Show a preview so a human can judge the quality.
        print("\n📝 First 500 characters:")
        print("-" * 50)
        print(text[:500])
        print("-" * 50)

        # More than 100 characters is treated as "meaningful" content.
        if len(text) <= 100:
            print("⚠️ WARNING: Text extracted but seems too short")
            return False

        print("🎉 SUCCESS: OCR extracted meaningful text from scanned PDF!")
        return True

    except Exception as exc:
        # Top-level boundary of a diagnostic script: report and signal failure.
        print(f"❌ ERROR: {exc}")
        import traceback
        traceback.print_exc()
        return False
|
|
|
|
if __name__ == "__main__":
    # Translate the boolean test outcome into a process exit status:
    # 0 when the OCR check passed, 1 otherwise.
    if test_ocr_fix():
        exit_code = 0
    else:
        exit_code = 1
    sys.exit(exit_code)