# railseek6/test_ocr_fix_direct.py

#!/usr/bin/env python3
"""
Direct test of OCR fix with GPU mode
"""

import sys
import os
sys.path.append('LightRAG-main')

from lightrag.document_processor import OCRProcessor

def test_ocr_fix(pdf_path: str = 'ocr.pdf', min_chars: int = 100) -> bool:
    """Run a GPU OCR pass over a document and report whether it yielded text.

    Builds an ``OCRProcessor`` with ``use_gpu=True``, calls
    ``extract_text_from_image`` on ``pdf_path`` and prints the extracted
    length, confidence, line count and a preview of the first 500 characters.

    Args:
        pdf_path: Document to run OCR on. Defaults to ``'ocr.pdf'``; a
            relative path is resolved against the current working directory.
        min_chars: The extracted text must be strictly longer than this many
            characters to count as meaningful. Defaults to 100.

    Returns:
        True when more than ``min_chars`` characters were extracted. False
        when the input file is missing, OCR reports itself unavailable, the
        text is empty or too short, or any exception is raised on the way.
    """
    print("🧪 Testing OCR Fix with GPU Mode")
    print("=" * 50)

    # Fail fast with a clear message instead of paying for OCR/GPU start-up
    # only to hit an opaque backend error on a missing input.
    if not os.path.isfile(pdf_path):
        print(f"❌ FAILED: Input file not found: {pdf_path}")
        return False

    try:
        # Initialize OCR with GPU
        print("🔧 Initializing OCR processor with GPU...")
        ocr = OCRProcessor(use_gpu=True)
        print(f"✅ OCR available: {ocr.ocr_available}")
        print(f"✅ Using GPU: {ocr.use_gpu}")

        if not ocr.ocr_available:
            print("❌ OCR not available, cannot proceed")
            return False

        # Test extraction on the document
        print(f"\n📄 Testing OCR extraction on {pdf_path}...")
        result = ocr.extract_text_from_image(pdf_path)
        text = result['text']

        print(f"✅ Extracted text length: {len(text)}")
        print(f"✅ Confidence: {result['confidence']}")
        print(f"✅ Line count: {result['line_count']}")

        if not text:
            print("❌ FAILED: No text extracted from PDF")
            return False

        print("\n📝 First 500 characters:")
        print("-" * 50)
        print(text[:500])
        print("-" * 50)

        # Check if we got meaningful content
        if len(text) <= min_chars:
            print("⚠️  WARNING: Text extracted but seems too short")
            return False

        print("🎉 SUCCESS: OCR extracted meaningful text from scanned PDF!")
        return True

    except Exception as e:
        # Script-level boundary: report the failure with a full traceback and
        # turn it into a False result so the caller can map it to an exit code.
        print(f"❌ ERROR: {e}")
        import traceback
        traceback.print_exc()
        return False

if __name__ == "__main__":
    # Map the boolean test outcome onto a conventional process exit status:
    # 0 when the OCR check passed, 1 otherwise.
    exit_code = 0 if test_ocr_fix() else 1
    sys.exit(exit_code)