ocr speed improved
This commit is contained in:
52
test_ocr_batch.py
Normal file
52
test_ocr_batch.py
Normal file
@@ -0,0 +1,52 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Test OCR batch processing and initialization improvements.
|
||||
"""
|
||||
import sys
|
||||
import os
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'LightRAG-main'))
|
||||
|
||||
from lightrag.optimized_ocr_processor import OptimizedOCRProcessor
|
||||
from lightrag.document_processor import DocumentProcessor
|
||||
import asyncio
|
||||
import time
|
||||
|
||||
def test_ocr_initialization():
    """Smoke-test how long OptimizedOCRProcessor takes to become ready.

    Builds a processor with ``use_gpu=True``, ``batch_size=4`` and
    ``max_workers=2``, waits up to 30 seconds for
    ``_ensure_ocr_initialized`` to report readiness, and prints the outcome
    together with the elapsed time.  When the processor reports ready, its
    metrics are printed as well; otherwise "OCR not available" is printed.

    The processor is closed in a ``finally`` clause so it is released even
    if the readiness check or the metrics call raises.
    """
    print("Testing OCR initialization improvements...")
    processor = OptimizedOCRProcessor(use_gpu=True, batch_size=4, max_workers=2)

    try:
        # Wait for initialization (should be quick if pre-warmed).
        # perf_counter() is monotonic, so the measured interval cannot be
        # skewed by wall-clock adjustments the way time.time() can.
        start = time.perf_counter()
        ready = processor._ensure_ocr_initialized(timeout=30.0)
        elapsed = time.perf_counter() - start
        print(f"OCR ready: {ready}, elapsed: {elapsed:.2f}s")

        if ready:
            print("OCR metrics:", processor.get_metrics())
        else:
            print("OCR not available")
    finally:
        # Always release the processor, including on failure paths.
        processor.close()
|
||||
|
||||
async def test_document_processor():
    """Exercise DocumentProcessor on a sample PDF when one is available.

    A DocumentProcessor is always constructed.  If ``test_meaningful.pdf``
    exists (path resolved against the current working directory), it is
    passed to ``process_document`` and the elapsed time, success flag,
    ``pages`` and ``processed_with_ocr`` metadata entries, and content
    length are printed.  Otherwise a skip message is printed.
    """
    print("\nTesting DocumentProcessor with batch OCR...")
    doc_processor = DocumentProcessor()

    # The sample input is optional; without it only construction is exercised.
    test_pdf = "test_meaningful.pdf"
    if not os.path.exists(test_pdf):
        print(f"Test PDF not found at {test_pdf}, skipping processing test.")
        return

    print(f"Processing {test_pdf}...")
    started_at = time.time()
    result = await doc_processor.process_document(test_pdf)
    elapsed = time.time() - started_at

    # Report the outcome field by field, in the same order as before.
    print(f"Processing completed in {elapsed:.2f}s")
    print(f"Success: {result.success}")
    print(f"Pages: {result.metadata.get('pages', 'N/A')}")
    print(f"Processed with OCR: {result.metadata.get('processed_with_ocr', False)}")
    print(f"Content length: {len(result.content)}")
|
||||
|
||||
def main():
    """Run both checks in order, then print a completion banner.

    The synchronous OCR initialization check runs first; the coroutine-based
    document processor check is then driven to completion with
    ``asyncio.run``.
    """
    test_ocr_initialization()
    asyncio.run(test_document_processor())
    print("\nAll tests completed.")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user