Files
railseek6/test_ocr_batch.py
2026-01-13 19:10:24 +08:00

52 lines
1.8 KiB
Python

#!/usr/bin/env python3
"""
Test OCR batch processing and initialization improvements.
"""
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'LightRAG-main'))
from lightrag.optimized_ocr_processor import OptimizedOCRProcessor
from lightrag.document_processor import DocumentProcessor
import asyncio
import time
def test_ocr_initialization():
    """Check that the OCR backend reports ready and print timing details.

    Builds an ``OptimizedOCRProcessor`` (GPU requested, batch size 4, two
    workers), calls ``_ensure_ocr_initialized`` with a 30 second timeout and
    prints its result together with the elapsed wall-clock time. When the
    result is truthy the processor's metrics are printed as well.

    ``processor.close()`` is always called, including when the readiness
    check or the metrics call raises, so a failing run does not leave the
    processor open.
    """
    print("Testing OCR initialization improvements...")
    processor = OptimizedOCRProcessor(use_gpu=True, batch_size=4, max_workers=2)
    try:
        # Wait for initialization (should be quick if pre-warmed)
        start = time.time()
        ready = processor._ensure_ocr_initialized(timeout=30.0)
        elapsed = time.time() - start
        print(f"OCR ready: {ready}, elapsed: {elapsed:.2f}s")
        if ready:
            print("OCR metrics:", processor.get_metrics())
        else:
            print("OCR not available")
    finally:
        # Runs on both the normal and the exceptional path.
        processor.close()
async def test_document_processor():
    """Run a sample PDF through ``DocumentProcessor`` and print a report.

    A ``DocumentProcessor`` is always constructed. If ``test_meaningful.pdf``
    exists (the relative path is resolved against the current working
    directory), it is processed and the duration, success flag, the
    ``pages`` and ``processed_with_ocr`` metadata entries and the content
    length are printed. Otherwise a skip message is printed.
    """
    print("\nTesting DocumentProcessor with batch OCR...")
    doc_processor = DocumentProcessor()

    # The sample document is optional; without it only construction is exercised.
    test_pdf = "test_meaningful.pdf"
    if not os.path.exists(test_pdf):
        print(f"Test PDF not found at {test_pdf}, skipping processing test.")
        return

    print(f"Processing {test_pdf}...")
    began = time.time()
    outcome = await doc_processor.process_document(test_pdf)
    duration = time.time() - began
    print(f"Processing completed in {duration:.2f}s")

    # Each value is looked up right before it is printed, one line per field.
    succeeded = outcome.success
    print(f"Success: {succeeded}")
    page_count = outcome.metadata.get('pages', 'N/A')
    print(f"Pages: {page_count}")
    used_ocr = outcome.metadata.get('processed_with_ocr', False)
    print(f"Processed with OCR: {used_ocr}")
    content_size = len(outcome.content)
    print(f"Content length: {content_size}")
if __name__ == "__main__":
    # Synchronous OCR readiness check runs first.
    test_ocr_initialization()
    # Creating the coroutine does not start it; asyncio.run drives it to completion.
    document_check = test_document_processor()
    asyncio.run(document_check)
    print("\nAll tests completed.")