# railseek6/test_direct_ocr.py

import os
import sys
import asyncio
from pathlib import Path

# Add LightRAG to path
sys.path.append('LightRAG-main')

async def test_direct_ocr():
    """Run the document processor on ``ocr.pdf`` and report the outcome.

    The processor is obtained from ``lightrag.document_processor`` and asked
    to process ``ocr.pdf``, resolved against the current working directory.
    Progress and results are printed to stdout.

    Returns:
        bool: ``True`` when the processor reports success. ``False`` when the
        PDF is missing, the processor reports failure, or any exception
        (including a failed ``lightrag`` import) is raised; exceptions are
        printed rather than propagated.
    """
    print("Testing direct OCR with GPU...")
    print("=" * 50)

    try:
        # Imported inside the try so the ImportError handler below can report
        # a missing LightRAG install as a failed test.
        from lightrag.document_processor import get_document_processor

        print("Initializing document processor...")
        processor = get_document_processor()

        # Test file
        ocr_pdf_path = "ocr.pdf"

        if not Path(ocr_pdf_path).exists():
            print(f"Error: OCR PDF file not found at {ocr_pdf_path}")
            return False

        print(f"Processing OCR PDF: {ocr_pdf_path}")

        # Process the document directly
        result = await processor.process_document(ocr_pdf_path)

        # A run may come back without any content; normalise to "" so the
        # report below shows the processor's own error instead of failing on
        # len(None) and being misreported as an unexpected error.
        content = result.content or ""

        print(f"Processing result: {result.success}")
        print(f"Error message: {result.error}")
        print(f"Content length: {len(content)}")

        if not result.success:
            print(f"❌ OCR processing failed: {result.error}")
            return False

        print("✅ OCR processing successful!")
        print(f"Extracted content preview: {content[:500]}...")

        # Check if GPU was used (only reported when the processor exposes it)
        if hasattr(processor, 'use_gpu'):
            print(f"GPU usage: {processor.use_gpu}")

        # Check for any fallback messages in the extracted text
        if "fallback" in content.lower():
            print("⚠️  WARNING: Fallback detected in content")

        return True

    except ImportError as e:
        print(f"❌ Import error: {e}")
        return False
    except Exception as e:
        # Top-level boundary of the test: report everything and fail the run.
        print(f"❌ Unexpected error: {e}")
        import traceback
        traceback.print_exc()
        return False

async def check_paddleocr_gpu():
    """Print PaddlePaddle's version, CUDA build flag, device and visible GPUs.

    Purely diagnostic: everything is written to stdout and nothing is
    returned. A missing PaddlePaddle install, or any other error raised while
    querying it, is printed as a message instead of being propagated.
    """
    print("\nChecking PaddleOCR GPU status...")
    print("=" * 30)

    try:
        import paddle

        print(f"Paddle version: {paddle.__version__}")
        print(f"Paddle is compiled with CUDA: {paddle.is_compiled_with_cuda()}")
        print(f"Paddle device: {paddle.get_device()}")

        cuda = paddle.device.cuda
        gpu_available = paddle.is_compiled_with_cuda() and cuda.device_count() > 0
        print(f"GPU available: {gpu_available}")

        gpu_total = cuda.device_count()
        if gpu_total <= 0:
            print("No GPUs detected by PaddlePaddle")
        else:
            print(f"Number of GPUs: {gpu_total}")
            for index in range(gpu_total):
                print(f"GPU {index}: {cuda.get_device_name(index)}")

    except ImportError:
        print("❌ PaddlePaddle not installed")
    except Exception as exc:
        print(f"❌ Error checking PaddleOCR: {exc}")

async def main():
    """Run the GPU diagnostics, then the direct OCR test, and print a verdict.

    Returns:
        The value returned by ``test_direct_ocr`` (truthy on success).
    """
    divider = "=" * 50

    print("Direct OCR Test Suite")
    print(divider)

    # Report the PaddlePaddle/GPU environment before exercising OCR.
    await check_paddleocr_gpu()

    print("\n" + divider)

    # Exercise the document processor itself.
    passed = await test_direct_ocr()

    print("\n" + divider)
    verdict = (
        "🎉 Direct OCR test completed successfully!"
        if passed
        else "💥 Direct OCR test failed!"
    )
    print(verdict)

    return passed

if __name__ == "__main__":
    # Run the async tests and surface the outcome as the process exit status
    # (0 on success, 1 on failure) so shells and CI can detect a failed run
    # instead of always seeing a clean exit.
    sys.exit(0 if asyncio.run(main()) else 1)