OCR speed improved

This commit is contained in:
2026-01-13 19:10:24 +08:00
parent a5eb381384
commit e7256a10ea
7 changed files with 283 additions and 22 deletions

View File

@@ -244,6 +244,23 @@ def create_app(args):
task.add_done_callback(app.state.background_tasks.discard) task.add_done_callback(app.state.background_tasks.discard)
logger.info(f"Process {os.getpid()} auto scan task started at startup for workspace '{args.workspace}'.") logger.info(f"Process {os.getpid()} auto scan task started at startup for workspace '{args.workspace}'.")
# Warm up the OCR processor in the background to avoid a cold-start delay on the first upload
async def warm_up_ocr_processor():
    """Pre-load the document processor so the first upload avoids a cold start.

    Both the import and the ``get_document_processor()`` call are synchronous,
    so they are handed to the event loop's default executor rather than being
    executed on the loop itself. Warmup is best-effort: any failure is logged
    as a warning and otherwise ignored.
    """

    def _load_processor():
        # Import inside the worker so the OCR stack is only required when
        # warmup actually runs.
        from lightrag.document_processor import get_document_processor

        # NOTE(review): the original comment states this initializes
        # OptimizedOCRProcessor (about 9 seconds) - confirm against the factory.
        return get_document_processor()

    try:
        logger.info("Starting OCR processor warmup...")
        # Run the blocking work off the event loop thread.
        await asyncio.get_running_loop().run_in_executor(None, _load_processor)
        logger.info("OCR processor warmed up successfully")
    except Exception as e:
        logger.warning(f"OCR warmup failed (noncritical): {e}")
# Schedule the warmup as a background task (non-blocking)
warm_up_task = asyncio.create_task(warm_up_ocr_processor())
app.state.background_tasks.add(warm_up_task)
warm_up_task.add_done_callback(app.state.background_tasks.discard)
ASCIIColors.green("\nServer is ready to accept connections! 🚀\n") ASCIIColors.green("\nServer is ready to accept connections! 🚀\n")
yield yield

View File

@@ -586,7 +586,7 @@ class DocumentProcessor:
return score return score
async def _process_pdf(self, file_path: Path) -> ProcessingResult: async def _process_pdf(self, file_path: Path) -> ProcessingResult:
"""Process PDF files with text extraction and OCR fallback""" """Process PDF files with text extraction and OCR fallback using batch processing"""
pdf_document = None pdf_document = None
try: try:
content_parts = [] content_parts = []
@@ -598,6 +598,10 @@ class DocumentProcessor:
pdf_document = fitz.open(str(file_path)) pdf_document = fitz.open(str(file_path))
total_pages = len(pdf_document) total_pages = len(pdf_document)
# Collect pages that need OCR
ocr_pages = [] # list of (page_num, temp_path)
page_texts = {} # page_num -> text (if usable)
for page_num in range(total_pages): for page_num in range(total_pages):
page = pdf_document[page_num] page = pdf_document[page_num]
@@ -607,7 +611,7 @@ class DocumentProcessor:
# Determine if text is usable (not garbled) # Determine if text is usable (not garbled)
# Threshold 0.5 means at least half of characters are printable ASCII and not replacement # Threshold 0.5 means at least half of characters are printable ASCII and not replacement
if text.strip() and text_score >= 0.5: if text.strip() and text_score >= 0.5:
content_parts.append(f"Page {page_num + 1}:\n{text}") page_texts[page_num] = text
else: else:
# Text is empty, garbled, or low quality -> use OCR # Text is empty, garbled, or low quality -> use OCR
logger.info(f"Page {page_num + 1} has no usable text (score {text_score:.3f}), using high-resolution OCR") logger.info(f"Page {page_num + 1} has no usable text (score {text_score:.3f}), using high-resolution OCR")
@@ -621,34 +625,94 @@ class DocumentProcessor:
temp_file.write(img_data) temp_file.write(img_data)
temp_path = temp_file.name temp_path = temp_file.name
try: ocr_pages.append((page_num, temp_path))
if self.ocr_processor.ocr_available:
logger.info(f"Running OCR on page {page_num + 1} with high resolution") # Process OCR pages in batch if any
if ocr_pages and self.ocr_processor.ocr_available:
try:
temp_paths = [temp_path for _, temp_path in ocr_pages]
logger.info(f"Running batch OCR on {len(temp_paths)} pages")
batch_results = self.ocr_processor.extract_text_from_images_batch(temp_paths)
logger.info(f"Batch OCR completed for {len(batch_results)} pages")
# Map results back to pages
for idx, (page_num, temp_path) in enumerate(ocr_pages):
ocr_result = None
if idx < len(batch_results):
batch_result = batch_results[idx]
ocr_result = {
"text": batch_result.text,
"confidence": batch_result.confidence,
"bboxes": batch_result.bboxes,
"line_count": batch_result.line_count
}
else:
# Fallback to individual OCR
ocr_result = self.ocr_processor.extract_text_from_image(temp_path)
if ocr_result["text"].strip():
logger.info(f"OCR extracted {len(ocr_result['text'])} characters from page {page_num + 1}")
content_parts.append(f"Page {page_num + 1} (OCR):\n{str(ocr_result['text'])}")
processed_with_ocr = True
else:
logger.warning(f"OCR returned empty text for page {page_num + 1}")
# Don't add empty content, just mark as processed
content_parts.append(f"Page {page_num + 1}: [Scanned content - no text detected by OCR]")
# Extract tables from OCR
ocr_tables = self.ocr_processor.extract_tables_from_image(temp_path)
if ocr_tables:
logger.info(f"Found {len(ocr_tables)} tables on page {page_num + 1}")
tables.extend(ocr_tables)
# Clean up temporary file
if temp_path and os.path.exists(temp_path):
os.unlink(temp_path)
except Exception as batch_error:
logger.error(f"Batch OCR processing failed: {batch_error}")
# Fall back to individual processing for each page
for page_num, temp_path in ocr_pages:
try:
ocr_result = self.ocr_processor.extract_text_from_image(temp_path) ocr_result = self.ocr_processor.extract_text_from_image(temp_path)
if ocr_result["text"].strip(): if ocr_result["text"].strip():
logger.info(f"OCR extracted {len(ocr_result['text'])} characters from page {page_num + 1}")
content_parts.append(f"Page {page_num + 1} (OCR):\n{str(ocr_result['text'])}") content_parts.append(f"Page {page_num + 1} (OCR):\n{str(ocr_result['text'])}")
processed_with_ocr = True processed_with_ocr = True
else: else:
logger.warning(f"OCR returned empty text for page {page_num + 1}")
# Don't add empty content, just mark as processed
content_parts.append(f"Page {page_num + 1}: [Scanned content - no text detected by OCR]") content_parts.append(f"Page {page_num + 1}: [Scanned content - no text detected by OCR]")
# Extract tables from OCR # Extract tables
ocr_tables = self.ocr_processor.extract_tables_from_image(temp_path) ocr_tables = self.ocr_processor.extract_tables_from_image(temp_path)
if ocr_tables: if ocr_tables:
logger.info(f"Found {len(ocr_tables)} tables on page {page_num + 1}")
tables.extend(ocr_tables) tables.extend(ocr_tables)
else: except Exception as ocr_error:
logger.warning("OCR not available, skipping OCR processing") logger.error(f"OCR processing failed for page {page_num + 1}: {ocr_error}")
content_parts.append(f"Page {page_num + 1}: [Image content - OCR not available]") content_parts.append(f"Page {page_num + 1}: [Image content - OCR failed: {str(ocr_error)}]")
except Exception as ocr_error: finally:
logger.error(f"OCR processing failed for page {page_num + 1}: {ocr_error}") if temp_path and os.path.exists(temp_path):
content_parts.append(f"Page {page_num + 1}: [Image content - OCR failed: {str(ocr_error)}]") os.unlink(temp_path)
finally: elif ocr_pages and not self.ocr_processor.ocr_available:
logger.warning("OCR not available, skipping OCR processing")
for page_num, temp_path in ocr_pages:
content_parts.append(f"Page {page_num + 1}: [Image content - OCR not available]")
if temp_path and os.path.exists(temp_path):
os.unlink(temp_path) os.unlink(temp_path)
# Add text pages content
for page_num, text in page_texts.items():
content_parts.append(f"Page {page_num + 1}:\n{text}")
# Sort content parts by page number
def extract_page_num(part):
    """Sort key for a content part.

    Returns the integer that follows the first ``Page`` + whitespace marker
    found anywhere in *part*, or 0 when no such marker exists.
    """
    import re

    found = re.search(r'Page\s+(\d+)', part)
    return int(found.group(1)) if found else 0
content_parts.sort(key=extract_page_num)
full_content = "\n\n".join(content_parts) full_content = "\n\n".join(content_parts)
return ProcessingResult( return ProcessingResult(

View File

@@ -7,6 +7,7 @@ import os
import logging import logging
import asyncio import asyncio
import concurrent.futures import concurrent.futures
import threading
from typing import Dict, List, Any, Optional, Tuple from typing import Dict, List, Any, Optional, Tuple
from dataclasses import dataclass from dataclasses import dataclass
import tempfile import tempfile
@@ -35,7 +36,7 @@ class OptimizedOCRProcessor:
Optimized OCR processor with batch processing, shared model instance, and async support Optimized OCR processor with batch processing, shared model instance, and async support
""" """
def __init__(self, use_gpu: bool = True, languages: List[str] = None, def __init__(self, use_gpu: bool = True, languages: List[str] = None,
batch_size: int = 4, max_workers: int = 2): batch_size: int = 4, max_workers: int = 2):
""" """
Initialize optimized OCR processor Initialize optimized OCR processor
@@ -55,6 +56,9 @@ class OptimizedOCRProcessor:
self._model_loaded = False self._model_loaded = False
self._temp_dir = None self._temp_dir = None
self._executor = None self._executor = None
self._initialization_lock = threading.Lock()
self._initialization_thread = None
self._initialization_started = False
# Performance metrics # Performance metrics
self.metrics = { self.metrics = {
@@ -64,7 +68,38 @@ class OptimizedOCRProcessor:
"errors": [] "errors": []
} }
self._initialize_ocr() # Start lazy initialization in background thread
self._start_lazy_initialization()
def _start_lazy_initialization(self):
"""
Start OCR initialization in a background thread.

Creates a daemon thread named "OCRInitializer" whose target is
self._initialize_ocr, stores it in self._initialization_thread and starts it.
If self._initialization_started is already set, returns without doing anything.
"""
# The started flag is checked and set while holding the lock.
# NOTE(review): indentation is lost in this view, so whether the thread is
# created inside or after the locked section cannot be confirmed from here.
with self._initialization_lock:
if self._initialization_started:
return
self._initialization_started = True
# Start thread
self._initialization_thread = threading.Thread(
target=self._initialize_ocr,
name="OCRInitializer",
daemon=True
)
self._initialization_thread.start()
logger.info("Started lazy OCR initialization in background thread")
def _ensure_ocr_initialized(self, timeout: float = None):
"""
Block until OCR initialization is complete.
If timeout is None, wait indefinitely.
Returns True if OCR is available, False otherwise.

The wait is a join() on the initialization thread with the given timeout
(in seconds). The return value is self.ocr_available as read after the join.
"""
# Fast path: nothing to wait for when OCR is already flagged available.
if self.ocr_available:
return True
# Start initialization if nobody has started it yet.
if not self._initialization_started:
self._start_lazy_initialization()
if self._initialization_thread is not None:
# join() also returns when the timeout expires, so it does not by itself
# indicate that initialization finished.
self._initialization_thread.join(timeout=timeout)
# After join, check if OCR is now available
return self.ocr_available
def _initialize_ocr(self): def _initialize_ocr(self):
"""Initialize PaddleOCR with shared model instance""" """Initialize PaddleOCR with shared model instance"""
@@ -138,7 +173,9 @@ class OptimizedOCRProcessor:
""" """
start_time = time.time() start_time = time.time()
if not self.ocr_available: # Ensure OCR is initialized (wait up to 30 seconds)
if not self._ensure_ocr_initialized(timeout=30.0):
logger.warning("OCR not available after waiting")
return {"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0} return {"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0}
try: try:
@@ -182,7 +219,9 @@ class OptimizedOCRProcessor:
batch_start_time = time.time() batch_start_time = time.time()
if not self.ocr_available: # Ensure OCR is initialized (wait up to 30 seconds)
if not self._ensure_ocr_initialized(timeout=30.0):
logger.warning("OCR not available for batch processing")
return [BatchOCRResult( return [BatchOCRResult(
image_path=path, image_path=path,
text="", text="",

41
measure_ocr_init.py Normal file
View File

@@ -0,0 +1,41 @@
#!/usr/bin/env python3
"""
Measure OCR initialization time.
"""
import time
import sys
import os
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
def measure_optimized_ocr_init():
    """Time how long OptimizedOCRProcessor takes to become usable.

    The processor's constructor only starts initialization on a background
    thread, so the clock is stopped after ``_ensure_ocr_initialized()`` has
    waited for that thread. Stopping it right after construction would time
    thread start-up only and print ``ocr_available`` before it can be set.

    Returns:
        The constructed processor.
    """
    from LightRAG_main.lightrag.optimized_ocr_processor import OptimizedOCRProcessor

    # perf_counter is monotonic, so wall-clock adjustments cannot skew the interval.
    start = time.perf_counter()
    processor = OptimizedOCRProcessor(use_gpu=True, batch_size=4, max_workers=2)
    # Wait (without a timeout) for the background initializer to finish.
    processor._ensure_ocr_initialized()
    elapsed = time.perf_counter() - start
    print(f"OptimizedOCRProcessor initialization time: {elapsed:.2f} seconds")
    print(f"OCR available: {processor.ocr_available}")
    return processor
def measure_simple_ocr_init():
    """Construct a SimpleOCRProcessor, report the construction time, return it.

    Prints the elapsed wall-clock seconds and the instance's ``available``
    attribute.
    """
    from simple_ocr_processor import SimpleOCRProcessor

    began = time.time()
    ocr = SimpleOCRProcessor()
    took = time.time() - began

    print(f"SimpleOCRProcessor initialization time: {took:.2f} seconds")
    print(f"OCR available: {ocr.available}")
    return ocr
if __name__ == "__main__":
    print("Measuring OCR initialization times...")
    # Each measurement is guarded separately so a failure in one processor
    # does not prevent the other from being measured.
    for label, measure in (
        ("OptimizedOCRProcessor", measure_optimized_ocr_init),
        ("SimpleOCRProcessor", measure_simple_ocr_init),
    ):
        try:
            measure()
        except Exception as exc:
            print(f"Failed to measure {label}: {exc}")

48
measure_ocr_init2.py Normal file
View File

@@ -0,0 +1,48 @@
#!/usr/bin/env python3
"""
Measure OCR initialization time.
"""
import time
import sys
import os
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
def measure_optimized_ocr_init():
    """Time how long OptimizedOCRProcessor takes to become usable.

    The import is first attempted from the current ``sys.path``; on
    ImportError the ``LightRAG-main`` directory next to this script is
    prepended and the import is retried.

    The processor's constructor only starts initialization on a background
    thread, so the clock is stopped after ``_ensure_ocr_initialized()`` has
    waited for that thread. Stopping it right after construction would time
    thread start-up only and print ``ocr_available`` before it can be set.

    Returns:
        The constructed processor.
    """
    try:
        from lightrag.optimized_ocr_processor import OptimizedOCRProcessor
    except ImportError:
        # try alternative path
        sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'LightRAG-main'))
        from lightrag.optimized_ocr_processor import OptimizedOCRProcessor

    # perf_counter is monotonic, so wall-clock adjustments cannot skew the interval.
    start = time.perf_counter()
    processor = OptimizedOCRProcessor(use_gpu=True, batch_size=4, max_workers=2)
    # Wait (without a timeout) for the background initializer to finish.
    processor._ensure_ocr_initialized()
    elapsed = time.perf_counter() - start
    print(f"OptimizedOCRProcessor initialization time: {elapsed:.2f} seconds")
    print(f"OCR available: {processor.ocr_available}")
    return processor
def measure_simple_ocr_init():
    """Construct a SimpleOCRProcessor, report the construction time, return it.

    Prints the elapsed wall-clock seconds and the instance's ``available``
    attribute.
    """
    from simple_ocr_processor import SimpleOCRProcessor

    began = time.time()
    ocr = SimpleOCRProcessor()
    took = time.time() - began

    print(f"SimpleOCRProcessor initialization time: {took:.2f} seconds")
    print(f"OCR available: {ocr.available}")
    return ocr
if __name__ == "__main__":
    print("Measuring OCR initialization times...")

    # Optimized processor: a failure here is reported together with its traceback.
    try:
        measure_optimized_ocr_init()
    except Exception as optimized_error:
        print(f"Failed to measure OptimizedOCRProcessor: {optimized_error}")
        import traceback
        traceback.print_exc()

    # Simple processor: measured regardless of the outcome above.
    try:
        measure_simple_ocr_init()
    except Exception as simple_error:
        print(f"Failed to measure SimpleOCRProcessor: {simple_error}")

52
test_ocr_batch.py Normal file
View File

@@ -0,0 +1,52 @@
#!/usr/bin/env python3
"""
Test OCR batch processing and initialization improvements.
"""
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'LightRAG-main'))
from lightrag.optimized_ocr_processor import OptimizedOCRProcessor
from lightrag.document_processor import DocumentProcessor
import asyncio
import time
def test_ocr_initialization():
    """Construct an OptimizedOCRProcessor and report how long it takes to be ready.

    Waits up to 30 seconds via ``_ensure_ocr_initialized`` and prints the
    outcome, plus the processor metrics when OCR became available. The
    processor is closed in a ``finally`` block so it is released even if the
    wait or the metrics call raises.
    """
    print("Testing OCR initialization improvements...")
    processor = OptimizedOCRProcessor(use_gpu=True, batch_size=4, max_workers=2)
    try:
        # The processor was constructed just above, so this wait covers
        # whatever initialization is still outstanding (capped at 30 seconds).
        start = time.perf_counter()
        ready = processor._ensure_ocr_initialized(timeout=30.0)
        elapsed = time.perf_counter() - start
        print(f"OCR ready: {ready}, elapsed: {elapsed:.2f}s")
        if ready:
            print("OCR metrics:", processor.get_metrics())
        else:
            print("OCR not available")
    finally:
        processor.close()
async def test_document_processor():
    """Run DocumentProcessor on ``test_meaningful.pdf`` when that file exists.

    Prints the processing time, the success flag, the page count and OCR
    flag from the result metadata, and the content length. When the PDF is
    missing, only a skip message is printed after the processor is constructed.
    """
    print("\nTesting DocumentProcessor with batch OCR...")
    processor = DocumentProcessor()

    # Sample document looked up relative to the current working directory.
    test_pdf = "test_meaningful.pdf"
    if not os.path.exists(test_pdf):
        print(f"Test PDF not found at {test_pdf}, skipping processing test.")
        return

    print(f"Processing {test_pdf}...")
    began = time.time()
    result = await processor.process_document(test_pdf)
    duration = time.time() - began

    metadata = result.metadata
    print(f"Processing completed in {duration:.2f}s")
    print(f"Success: {result.success}")
    print(f"Pages: {metadata.get('pages', 'N/A')}")
    print(f"Processed with OCR: {metadata.get('processed_with_ocr', False)}")
    print(f"Content length: {len(result.content)}")
# Script entry point: run the synchronous initialization check first, then the
# document-processing check.
if __name__ == "__main__":
test_ocr_initialization()
# test_document_processor is a coroutine function, so it is driven by asyncio.run.
asyncio.run(test_document_processor())
print("\nAll tests completed.")