52 lines
1.8 KiB
Python
52 lines
1.8 KiB
Python
#!/usr/bin/env python3
|
|
"""
Test OCR batch processing and initialization improvements.
"""
|
|
import sys
|
|
import os
|
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'LightRAG-main'))
|
|
|
|
from lightrag.optimized_ocr_processor import OptimizedOCRProcessor
|
|
from lightrag.document_processor import DocumentProcessor
|
|
import asyncio
|
|
import time
|
|
|
|
def test_ocr_initialization():
    """Smoke-test OCR engine start-up and report how long it takes.

    Builds an ``OptimizedOCRProcessor`` (GPU requested, batch size 4, two
    workers), waits up to 30 seconds for ``_ensure_ocr_initialized`` to
    report readiness, and prints the result together with the elapsed time.
    ``get_metrics()`` is printed only when the engine reports ready.

    The processor is closed in a ``finally`` clause so that ``close()`` runs
    even if the readiness check or the metrics call raises.
    """
    print("Testing OCR initialization improvements...")
    processor = OptimizedOCRProcessor(use_gpu=True, batch_size=4, max_workers=2)

    try:
        # Wait for initialization (should be quick if pre-warmed).
        # perf_counter() is monotonic, so the measured interval cannot be
        # skewed by system clock adjustments the way time.time() can.
        start = time.perf_counter()
        ready = processor._ensure_ocr_initialized(timeout=30.0)
        elapsed = time.perf_counter() - start
        print(f"OCR ready: {ready}, elapsed: {elapsed:.2f}s")

        if ready:
            print("OCR metrics:", processor.get_metrics())
        else:
            print("OCR not available")
    finally:
        # Always release the processor, including on failure paths.
        processor.close()
|
|
|
|
async def test_document_processor():
    """Run ``DocumentProcessor`` against a sample PDF when one is available.

    A ``DocumentProcessor`` is always constructed. If ``test_meaningful.pdf``
    exists relative to the current working directory, it is passed to
    ``process_document`` and a short summary of the result is printed
    (success flag, page count, OCR flag, content length) along with the
    elapsed wall-clock time. Otherwise a skip message is printed.
    """
    print("\nTesting DocumentProcessor with batch OCR...")
    doc_processor = DocumentProcessor()

    # Use a dummy PDF file (if it exists); otherwise only construction of the
    # processor is exercised.
    pdf_path = "test_meaningful.pdf"
    if not os.path.exists(pdf_path):
        print(f"Test PDF not found at {pdf_path}, skipping processing test.")
        return

    print(f"Processing {pdf_path}...")
    started_at = time.time()
    outcome = await doc_processor.process_document(pdf_path)
    duration = time.time() - started_at
    print(f"Processing completed in {duration:.2f}s")

    # Report the fields in the same order as they are read from the result.
    print(f"Success: {outcome.success}")
    page_count = outcome.metadata.get('pages', 'N/A')
    print(f"Pages: {page_count}")
    used_ocr = outcome.metadata.get('processed_with_ocr', False)
    print(f"Processed with OCR: {used_ocr}")
    content_length = len(outcome.content)
    print(f"Content length: {content_length}")
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run the synchronous OCR start-up check first, then
    # drive the async document-processing check with asyncio.run().
    test_ocr_initialization()
    document_check = test_document_processor()
    asyncio.run(document_check)
    print("\nAll tests completed.")