#!/usr/bin/env python3
"""
Test OCR batch processing and initialization improvements.
"""
import sys
import os

# Make the LightRAG-main directory that sits next to this script importable.
# This must run before the lightrag imports below.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'LightRAG-main'))

from lightrag.optimized_ocr_processor import OptimizedOCRProcessor
from lightrag.document_processor import DocumentProcessor
import asyncio
import time


def test_ocr_initialization():
    """Create an OCR processor, wait for it to become ready, and report timing.

    Prints whether the processor reported ready within the 30 second timeout
    and how long the wait took. When ready, the processor's metrics are
    printed as well. The processor is closed even if a step raises.
    """
    print("Testing OCR initialization improvements...")
    processor = OptimizedOCRProcessor(use_gpu=True, batch_size=4, max_workers=2)
    try:
        # Wait for initialization (should be quick if pre-warmed)
        # perf_counter is monotonic, so the interval cannot be skewed by
        # wall-clock adjustments.
        start = time.perf_counter()
        ready = processor._ensure_ocr_initialized(timeout=30.0)
        elapsed = time.perf_counter() - start
        print(f"OCR ready: {ready}, elapsed: {elapsed:.2f}s")
        if ready:
            print("OCR metrics:", processor.get_metrics())
        else:
            print("OCR not available")
    finally:
        # Release the processor on every path, not only on success.
        processor.close()


async def test_document_processor():
    """Process a sample PDF with DocumentProcessor and print a summary.

    If ``test_meaningful.pdf`` is not present in the current working
    directory, only the processor construction is exercised and the
    processing step is skipped with a message.
    """
    print("\nTesting DocumentProcessor with batch OCR...")
    processor = DocumentProcessor()
    # Use a dummy PDF file (if exists) or just test initialization
    test_pdf = "test_meaningful.pdf"
    if os.path.exists(test_pdf):
        print(f"Processing {test_pdf}...")
        start = time.perf_counter()
        result = await processor.process_document(test_pdf)
        elapsed = time.perf_counter() - start
        print(f"Processing completed in {elapsed:.2f}s")
        print(f"Success: {result.success}")
        print(f"Pages: {result.metadata.get('pages', 'N/A')}")
        print(f"Processed with OCR: {result.metadata.get('processed_with_ocr', False)}")
        print(f"Content length: {len(result.content)}")
    else:
        print(f"Test PDF not found at {test_pdf}, skipping processing test.")


if __name__ == "__main__":
    test_ocr_initialization()
    asyncio.run(test_document_processor())
    print("\nAll tests completed.")