ocr speed improved
This commit is contained in:
@@ -7,6 +7,7 @@ import os
|
||||
import logging
|
||||
import asyncio
|
||||
import concurrent.futures
|
||||
import threading
|
||||
from typing import Dict, List, Any, Optional, Tuple
|
||||
from dataclasses import dataclass
|
||||
import tempfile
|
||||
@@ -35,7 +36,7 @@ class OptimizedOCRProcessor:
|
||||
Optimized OCR processor with batch processing, shared model instance, and async support
|
||||
"""
|
||||
|
||||
def __init__(self, use_gpu: bool = True, languages: List[str] = None,
|
||||
def __init__(self, use_gpu: bool = True, languages: List[str] = None,
|
||||
batch_size: int = 4, max_workers: int = 2):
|
||||
"""
|
||||
Initialize optimized OCR processor
|
||||
@@ -55,6 +56,9 @@ class OptimizedOCRProcessor:
|
||||
self._model_loaded = False
|
||||
self._temp_dir = None
|
||||
self._executor = None
|
||||
self._initialization_lock = threading.Lock()
|
||||
self._initialization_thread = None
|
||||
self._initialization_started = False
|
||||
|
||||
# Performance metrics
|
||||
self.metrics = {
|
||||
@@ -64,7 +68,38 @@ class OptimizedOCRProcessor:
|
||||
"errors": []
|
||||
}
|
||||
|
||||
self._initialize_ocr()
|
||||
# Start lazy initialization in background thread
|
||||
self._start_lazy_initialization()
|
||||
|
||||
def _start_lazy_initialization(self):
    """Start OCR initialization in a background thread.

    The first call spawns a daemon thread named ``OCRInitializer`` whose
    target is ``self._initialize_ocr``. Later calls return immediately
    because ``self._initialization_started`` is already set. The check and
    the spawn both happen while holding ``self._initialization_lock``, so
    concurrent callers cannot start two threads.

    Raises:
        RuntimeError: If the thread cannot be started. The started flag and
            the thread reference are rolled back before re-raising, so a
            later call can retry instead of finding a never-started thread.
    """
    with self._initialization_lock:
        if self._initialization_started:
            return

        # Publish the flag and the thread object before start(), so both are
        # already set by the time the worker begins running.
        self._initialization_started = True
        self._initialization_thread = threading.Thread(
            target=self._initialize_ocr,
            name="OCRInitializer",
            daemon=True,
        )
        try:
            self._initialization_thread.start()
        except RuntimeError:
            # Leaving the flag set would block every retry, and joining a
            # thread that was never started raises RuntimeError.
            self._initialization_thread = None
            self._initialization_started = False
            raise
        logger.info("Started lazy OCR initialization in background thread")
|
||||
|
||||
def _ensure_ocr_initialized(self, timeout: Optional[float] = None) -> bool:
    """
    Block until OCR initialization is complete.

    Starts the background initialization if it has not been started yet,
    then waits for the initialization thread to finish.

    Args:
        timeout: Maximum number of seconds to wait for the initialization
            thread. If None, wait indefinitely.

    Returns:
        True if OCR is available, False otherwise (including when the
        timeout expires before initialization finishes).
    """
    if self.ocr_available:
        return True

    if not self._initialization_started:
        self._start_lazy_initialization()

    # Read the thread reference under the lock: a concurrent starter sets the
    # started flag before it assigns the thread, so an unlocked read could
    # see "started" with no thread yet and return without waiting.
    with self._initialization_lock:
        worker = self._initialization_thread

    # join() raises RuntimeError for a thread that was never started and for
    # the calling thread itself; skip the wait in both cases. Skipping an
    # already-finished thread is equivalent to joining it.
    if (
        worker is not None
        and worker.is_alive()
        and worker is not threading.current_thread()
    ):
        worker.join(timeout=timeout)

    # After waiting, report whatever the initializer left behind.
    return self.ocr_available
|
||||
|
||||
def _initialize_ocr(self):
|
||||
"""Initialize PaddleOCR with shared model instance"""
|
||||
@@ -138,7 +173,9 @@ class OptimizedOCRProcessor:
|
||||
"""
|
||||
start_time = time.time()
|
||||
|
||||
if not self.ocr_available:
|
||||
# Ensure OCR is initialized (wait up to 30 seconds)
|
||||
if not self._ensure_ocr_initialized(timeout=30.0):
|
||||
logger.warning("OCR not available after waiting")
|
||||
return {"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0}
|
||||
|
||||
try:
|
||||
@@ -182,7 +219,9 @@ class OptimizedOCRProcessor:
|
||||
|
||||
batch_start_time = time.time()
|
||||
|
||||
if not self.ocr_available:
|
||||
# Ensure OCR is initialized (wait up to 30 seconds)
|
||||
if not self._ensure_ocr_initialized(timeout=30.0):
|
||||
logger.warning("OCR not available for batch processing")
|
||||
return [BatchOCRResult(
|
||||
image_path=path,
|
||||
text="",
|
||||
|
||||
Reference in New Issue
Block a user