Files
railseek6/test_ocr_fix_direct.py

60 lines
1.8 KiB
Python

#!/usr/bin/env python3
"""
Direct test of the OCR fix: run OCRProcessor in GPU mode on ocr.pdf and report the result.
"""
import sys
import os
sys.path.append('LightRAG-main')
from lightrag.document_processor import OCRProcessor
def test_ocr_fix(pdf_path: str = 'ocr.pdf', min_chars: int = 100) -> bool:
    """Run the GPU OCR processor on a document and report whether it found text.

    Progress and results are printed to stdout. Any exception raised while
    creating the processor or extracting text is caught, printed together
    with its traceback, and reported as a failure.

    Args:
        pdf_path: Path handed to ``OCRProcessor.extract_text_from_image``.
            Defaults to ``'ocr.pdf'``.
        min_chars: The extracted text must be strictly longer than this many
            characters to count as meaningful. Defaults to ``100``.

    Returns:
        True when more than ``min_chars`` characters were extracted. False
        when OCR is unavailable, when no text or too little text came back,
        or when an exception occurred.
    """
    print("🧪 Testing OCR Fix with GPU Mode")
    print("=" * 50)

    try:
        # Initialize OCR with GPU
        print("🔧 Initializing OCR processor with GPU...")
        ocr = OCRProcessor(use_gpu=True)
        print(f"✅ OCR available: {ocr.ocr_available}")
        print(f"✅ Using GPU: {ocr.use_gpu}")

        if not ocr.ocr_available:
            print("❌ OCR not available, cannot proceed")
            return False

        # Test extraction on the PDF
        print(f"\n📄 Testing OCR extraction on {pdf_path}...")
        result = ocr.extract_text_from_image(pdf_path)
        text = result['text']
        print(f"✅ Extracted text length: {len(text)}")
        print(f"✅ Confidence: {result['confidence']}")
        print(f"✅ Line count: {result['line_count']}")

        if not text:
            print("❌ FAILED: No text extracted from PDF")
            return False

        print("\n📝 First 500 characters:")
        print("-" * 50)
        print(text[:500])
        print("-" * 50)

        # Check if we got meaningful content
        if len(text) > min_chars:
            print("🎉 SUCCESS: OCR extracted meaningful text from scanned PDF!")
            return True

        print("⚠️ WARNING: Text extracted but seems too short")
        return False

    except Exception as e:
        # Top-level boundary of a diagnostic script: show the full traceback
        # and turn the failure into a False result instead of propagating it.
        print(f"❌ ERROR: {e}")
        import traceback
        traceback.print_exc()
        return False
if __name__ == "__main__":
    # Script entry point: map the boolean outcome onto the process exit
    # status (0 when the OCR check passed, 1 otherwise).
    exit_status = 0 if test_ocr_fix() else 1
    sys.exit(exit_status)