# railseek6/test_ocr_batch.py

#!/usr/bin/env python3
"""
Test OCR batch processing and initialization improvements.
"""
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'LightRAG-main'))

from lightrag.optimized_ocr_processor import OptimizedOCRProcessor
from lightrag.document_processor import DocumentProcessor
import asyncio
import time

def test_ocr_initialization():
    """Check that the OCR processor initializes and report how long it took.

    Constructs an ``OptimizedOCRProcessor`` with ``use_gpu=True``,
    ``batch_size=4`` and ``max_workers=2``, calls
    ``_ensure_ocr_initialized(timeout=30.0)`` and prints the returned
    readiness flag together with the elapsed wait. When the flag is truthy
    the processor's metrics are printed as well.

    The processor is always closed, even if one of the steps raises, so a
    failing run does not leave it open.
    """
    print("Testing OCR initialization improvements...")
    processor = OptimizedOCRProcessor(use_gpu=True, batch_size=4, max_workers=2)

    try:
        # Wait for initialization (should be quick if pre-warmed).
        # perf_counter() is monotonic, so the measured interval cannot be
        # skewed by system clock adjustments during the wait.
        start = time.perf_counter()
        ready = processor._ensure_ocr_initialized(timeout=30.0)
        elapsed = time.perf_counter() - start
        print(f"OCR ready: {ready}, elapsed: {elapsed:.2f}s")

        if ready:
            print("OCR metrics:", processor.get_metrics())
        else:
            print("OCR not available")
    finally:
        # Release the processor regardless of how the checks above ended.
        processor.close()

async def test_document_processor():
    """Run ``DocumentProcessor`` on the sample PDF and print a short report.

    A ``DocumentProcessor`` is always constructed. If ``test_meaningful.pdf``
    does not exist (the relative path is resolved against the current working
    directory) a skip message is printed and nothing else happens. Otherwise
    the file is passed to ``process_document`` and the elapsed time, success
    flag, page count, OCR flag and content length are printed.
    """
    print("\nTesting DocumentProcessor with batch OCR...")
    doc_processor = DocumentProcessor()

    # Without the sample PDF only the construction above is exercised.
    pdf_path = "test_meaningful.pdf"
    if not os.path.exists(pdf_path):
        print(f"Test PDF not found at {pdf_path}, skipping processing test.")
        return

    print(f"Processing {pdf_path}...")
    began = time.time()
    outcome = await doc_processor.process_document(pdf_path)
    duration = time.time() - began

    # Report line by line so each attribute is read only when it is printed.
    print(f"Processing completed in {duration:.2f}s")
    print(f"Success: {outcome.success}")
    print(f"Pages: {outcome.metadata.get('pages', 'N/A')}")
    print(f"Processed with OCR: {outcome.metadata.get('processed_with_ocr', False)}")
    print(f"Content length: {len(outcome.content)}")

if __name__ == "__main__":
    # Script entry point: run the synchronous OCR initialization check first,
    # then drive the async document-processing check on a fresh event loop.
    test_ocr_initialization()
    asyncio.run(test_document_processor())
    print("\nAll tests completed.")