OCR speed improved
This commit is contained in:
@@ -244,6 +244,23 @@ def create_app(args):
|
|||||||
task.add_done_callback(app.state.background_tasks.discard)
|
task.add_done_callback(app.state.background_tasks.discard)
|
||||||
logger.info(f"Process {os.getpid()} auto scan task started at startup for workspace '{args.workspace}'.")
|
logger.info(f"Process {os.getpid()} auto scan task started at startup for workspace '{args.workspace}'.")
|
||||||
|
|
||||||
|
# Warm up OCR processor in background to avoid cold‑start delay on first upload
|
||||||
|
async def warm_up_ocr_processor():
    """Obtain the document processor once at startup so the first upload does not pay for it.

    Best-effort: any exception is logged as a warning and swallowed, so a
    broken or missing OCR stack never prevents the server from starting.
    """
    try:
        logger.info("Starting OCR processor warm‑up...")
        # Imported here rather than at module level so the OCR stack is only
        # required when the warm-up actually runs.
        from lightrag.document_processor import get_document_processor

        # Called only for its effect; the returned processor is not used here.
        # NOTE(review): this is a plain synchronous call inside a coroutine,
        # so the event loop is held for however long it takes - confirm that
        # is acceptable, or move it to an executor.
        get_document_processor()
        logger.info("OCR processor warmed up successfully")
    except Exception as e:
        logger.warning(f"OCR warm‑up failed (non‑critical): {e}")
|
||||||
|
|
||||||
|
# Schedule warm‑up as a background task (non‑blocking)
|
||||||
|
warm_up_task = asyncio.create_task(warm_up_ocr_processor())
|
||||||
|
app.state.background_tasks.add(warm_up_task)
|
||||||
|
warm_up_task.add_done_callback(app.state.background_tasks.discard)
|
||||||
|
|
||||||
ASCIIColors.green("\nServer is ready to accept connections! 🚀\n")
|
ASCIIColors.green("\nServer is ready to accept connections! 🚀\n")
|
||||||
|
|
||||||
yield
|
yield
|
||||||
|
|||||||
@@ -586,7 +586,7 @@ class DocumentProcessor:
|
|||||||
return score
|
return score
|
||||||
|
|
||||||
async def _process_pdf(self, file_path: Path) -> ProcessingResult:
|
async def _process_pdf(self, file_path: Path) -> ProcessingResult:
|
||||||
"""Process PDF files with text extraction and OCR fallback"""
|
"""Process PDF files with text extraction and OCR fallback using batch processing"""
|
||||||
pdf_document = None
|
pdf_document = None
|
||||||
try:
|
try:
|
||||||
content_parts = []
|
content_parts = []
|
||||||
@@ -598,6 +598,10 @@ class DocumentProcessor:
|
|||||||
pdf_document = fitz.open(str(file_path))
|
pdf_document = fitz.open(str(file_path))
|
||||||
total_pages = len(pdf_document)
|
total_pages = len(pdf_document)
|
||||||
|
|
||||||
|
# Collect pages that need OCR
|
||||||
|
ocr_pages = [] # list of (page_num, temp_path)
|
||||||
|
page_texts = {} # page_num -> text (if usable)
|
||||||
|
|
||||||
for page_num in range(total_pages):
|
for page_num in range(total_pages):
|
||||||
page = pdf_document[page_num]
|
page = pdf_document[page_num]
|
||||||
|
|
||||||
@@ -607,7 +611,7 @@ class DocumentProcessor:
|
|||||||
# Determine if text is usable (not garbled)
|
# Determine if text is usable (not garbled)
|
||||||
# Threshold 0.5 means at least half of characters are printable ASCII and not replacement
|
# Threshold 0.5 means at least half of characters are printable ASCII and not replacement
|
||||||
if text.strip() and text_score >= 0.5:
|
if text.strip() and text_score >= 0.5:
|
||||||
content_parts.append(f"Page {page_num + 1}:\n{text}")
|
page_texts[page_num] = text
|
||||||
else:
|
else:
|
||||||
# Text is empty, garbled, or low quality -> use OCR
|
# Text is empty, garbled, or low quality -> use OCR
|
||||||
logger.info(f"Page {page_num + 1} has no usable text (score {text_score:.3f}), using high-resolution OCR")
|
logger.info(f"Page {page_num + 1} has no usable text (score {text_score:.3f}), using high-resolution OCR")
|
||||||
@@ -621,34 +625,94 @@ class DocumentProcessor:
|
|||||||
temp_file.write(img_data)
|
temp_file.write(img_data)
|
||||||
temp_path = temp_file.name
|
temp_path = temp_file.name
|
||||||
|
|
||||||
try:
|
ocr_pages.append((page_num, temp_path))
|
||||||
if self.ocr_processor.ocr_available:
|
|
||||||
logger.info(f"Running OCR on page {page_num + 1} with high resolution")
|
# Process OCR pages in batch if any
|
||||||
|
if ocr_pages and self.ocr_processor.ocr_available:
|
||||||
|
try:
|
||||||
|
temp_paths = [temp_path for _, temp_path in ocr_pages]
|
||||||
|
logger.info(f"Running batch OCR on {len(temp_paths)} pages")
|
||||||
|
batch_results = self.ocr_processor.extract_text_from_images_batch(temp_paths)
|
||||||
|
logger.info(f"Batch OCR completed for {len(batch_results)} pages")
|
||||||
|
|
||||||
|
# Map results back to pages
|
||||||
|
for idx, (page_num, temp_path) in enumerate(ocr_pages):
|
||||||
|
ocr_result = None
|
||||||
|
if idx < len(batch_results):
|
||||||
|
batch_result = batch_results[idx]
|
||||||
|
ocr_result = {
|
||||||
|
"text": batch_result.text,
|
||||||
|
"confidence": batch_result.confidence,
|
||||||
|
"bboxes": batch_result.bboxes,
|
||||||
|
"line_count": batch_result.line_count
|
||||||
|
}
|
||||||
|
else:
|
||||||
|
# Fallback to individual OCR
|
||||||
|
ocr_result = self.ocr_processor.extract_text_from_image(temp_path)
|
||||||
|
|
||||||
|
if ocr_result["text"].strip():
|
||||||
|
logger.info(f"OCR extracted {len(ocr_result['text'])} characters from page {page_num + 1}")
|
||||||
|
content_parts.append(f"Page {page_num + 1} (OCR):\n{str(ocr_result['text'])}")
|
||||||
|
processed_with_ocr = True
|
||||||
|
else:
|
||||||
|
logger.warning(f"OCR returned empty text for page {page_num + 1}")
|
||||||
|
# Don't add empty content, just mark as processed
|
||||||
|
content_parts.append(f"Page {page_num + 1}: [Scanned content - no text detected by OCR]")
|
||||||
|
|
||||||
|
# Extract tables from OCR
|
||||||
|
ocr_tables = self.ocr_processor.extract_tables_from_image(temp_path)
|
||||||
|
if ocr_tables:
|
||||||
|
logger.info(f"Found {len(ocr_tables)} tables on page {page_num + 1}")
|
||||||
|
tables.extend(ocr_tables)
|
||||||
|
|
||||||
|
# Clean up temporary file
|
||||||
|
if temp_path and os.path.exists(temp_path):
|
||||||
|
os.unlink(temp_path)
|
||||||
|
|
||||||
|
except Exception as batch_error:
|
||||||
|
logger.error(f"Batch OCR processing failed: {batch_error}")
|
||||||
|
# Fall back to individual processing for each page
|
||||||
|
for page_num, temp_path in ocr_pages:
|
||||||
|
try:
|
||||||
ocr_result = self.ocr_processor.extract_text_from_image(temp_path)
|
ocr_result = self.ocr_processor.extract_text_from_image(temp_path)
|
||||||
|
|
||||||
if ocr_result["text"].strip():
|
if ocr_result["text"].strip():
|
||||||
logger.info(f"OCR extracted {len(ocr_result['text'])} characters from page {page_num + 1}")
|
|
||||||
content_parts.append(f"Page {page_num + 1} (OCR):\n{str(ocr_result['text'])}")
|
content_parts.append(f"Page {page_num + 1} (OCR):\n{str(ocr_result['text'])}")
|
||||||
processed_with_ocr = True
|
processed_with_ocr = True
|
||||||
else:
|
else:
|
||||||
logger.warning(f"OCR returned empty text for page {page_num + 1}")
|
|
||||||
# Don't add empty content, just mark as processed
|
|
||||||
content_parts.append(f"Page {page_num + 1}: [Scanned content - no text detected by OCR]")
|
content_parts.append(f"Page {page_num + 1}: [Scanned content - no text detected by OCR]")
|
||||||
|
|
||||||
# Extract tables from OCR
|
# Extract tables
|
||||||
ocr_tables = self.ocr_processor.extract_tables_from_image(temp_path)
|
ocr_tables = self.ocr_processor.extract_tables_from_image(temp_path)
|
||||||
if ocr_tables:
|
if ocr_tables:
|
||||||
logger.info(f"Found {len(ocr_tables)} tables on page {page_num + 1}")
|
|
||||||
tables.extend(ocr_tables)
|
tables.extend(ocr_tables)
|
||||||
else:
|
except Exception as ocr_error:
|
||||||
logger.warning("OCR not available, skipping OCR processing")
|
logger.error(f"OCR processing failed for page {page_num + 1}: {ocr_error}")
|
||||||
content_parts.append(f"Page {page_num + 1}: [Image content - OCR not available]")
|
content_parts.append(f"Page {page_num + 1}: [Image content - OCR failed: {str(ocr_error)}]")
|
||||||
except Exception as ocr_error:
|
finally:
|
||||||
logger.error(f"OCR processing failed for page {page_num + 1}: {ocr_error}")
|
if temp_path and os.path.exists(temp_path):
|
||||||
content_parts.append(f"Page {page_num + 1}: [Image content - OCR failed: {str(ocr_error)}]")
|
os.unlink(temp_path)
|
||||||
finally:
|
elif ocr_pages and not self.ocr_processor.ocr_available:
|
||||||
|
logger.warning("OCR not available, skipping OCR processing")
|
||||||
|
for page_num, temp_path in ocr_pages:
|
||||||
|
content_parts.append(f"Page {page_num + 1}: [Image content - OCR not available]")
|
||||||
|
if temp_path and os.path.exists(temp_path):
|
||||||
os.unlink(temp_path)
|
os.unlink(temp_path)
|
||||||
|
|
||||||
|
# Add text pages content
|
||||||
|
for page_num, text in page_texts.items():
|
||||||
|
content_parts.append(f"Page {page_num + 1}:\n{text}")
|
||||||
|
|
||||||
|
# Sort content parts by page number
|
||||||
|
def extract_page_num(part):
    """Return the first number that follows "Page" in *part*, or 0 when there is none."""
    import re

    found = re.search(r'Page\s+(\d+)', part)
    return int(found.group(1)) if found else 0
|
||||||
|
|
||||||
|
content_parts.sort(key=extract_page_num)
|
||||||
|
|
||||||
full_content = "\n\n".join(content_parts)
|
full_content = "\n\n".join(content_parts)
|
||||||
|
|
||||||
return ProcessingResult(
|
return ProcessingResult(
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ import os
|
|||||||
import logging
|
import logging
|
||||||
import asyncio
|
import asyncio
|
||||||
import concurrent.futures
|
import concurrent.futures
|
||||||
|
import threading
|
||||||
from typing import Dict, List, Any, Optional, Tuple
|
from typing import Dict, List, Any, Optional, Tuple
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
import tempfile
|
import tempfile
|
||||||
@@ -35,7 +36,7 @@ class OptimizedOCRProcessor:
|
|||||||
Optimized OCR processor with batch processing, shared model instance, and async support
|
Optimized OCR processor with batch processing, shared model instance, and async support
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, use_gpu: bool = True, languages: List[str] = None,
|
def __init__(self, use_gpu: bool = True, languages: List[str] = None,
|
||||||
batch_size: int = 4, max_workers: int = 2):
|
batch_size: int = 4, max_workers: int = 2):
|
||||||
"""
|
"""
|
||||||
Initialize optimized OCR processor
|
Initialize optimized OCR processor
|
||||||
@@ -55,6 +56,9 @@ class OptimizedOCRProcessor:
|
|||||||
self._model_loaded = False
|
self._model_loaded = False
|
||||||
self._temp_dir = None
|
self._temp_dir = None
|
||||||
self._executor = None
|
self._executor = None
|
||||||
|
self._initialization_lock = threading.Lock()
|
||||||
|
self._initialization_thread = None
|
||||||
|
self._initialization_started = False
|
||||||
|
|
||||||
# Performance metrics
|
# Performance metrics
|
||||||
self.metrics = {
|
self.metrics = {
|
||||||
@@ -64,7 +68,38 @@ class OptimizedOCRProcessor:
|
|||||||
"errors": []
|
"errors": []
|
||||||
}
|
}
|
||||||
|
|
||||||
self._initialize_ocr()
|
# Start lazy initialization in background thread
|
||||||
|
self._start_lazy_initialization()
|
||||||
|
|
||||||
|
def _start_lazy_initialization(self):
    """Launch ``_initialize_ocr`` on a daemon thread, at most once per instance."""
    with self._initialization_lock:
        # The flag is tested and set while holding the lock so that two
        # concurrent callers cannot both spawn an initializer thread.
        if not self._initialization_started:
            self._initialization_started = True
            worker = threading.Thread(
                target=self._initialize_ocr,
                name="OCRInitializer",
                daemon=True,
            )
            self._initialization_thread = worker
            worker.start()
            logger.info("Started lazy OCR initialization in background thread")
|
||||||
|
|
||||||
|
def _ensure_ocr_initialized(self, timeout: float = None):
|
||||||
|
"""
|
||||||
|
Block until OCR initialization is complete.
|
||||||
|
If timeout is None, wait indefinitely.
|
||||||
|
Returns True if OCR is available, False otherwise.
|
||||||
|
"""
|
||||||
|
if self.ocr_available:
|
||||||
|
return True
|
||||||
|
if not self._initialization_started:
|
||||||
|
self._start_lazy_initialization()
|
||||||
|
if self._initialization_thread is not None:
|
||||||
|
self._initialization_thread.join(timeout=timeout)
|
||||||
|
# After join, check if OCR is now available
|
||||||
|
return self.ocr_available
|
||||||
|
|
||||||
def _initialize_ocr(self):
|
def _initialize_ocr(self):
|
||||||
"""Initialize PaddleOCR with shared model instance"""
|
"""Initialize PaddleOCR with shared model instance"""
|
||||||
@@ -138,7 +173,9 @@ class OptimizedOCRProcessor:
|
|||||||
"""
|
"""
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
|
|
||||||
if not self.ocr_available:
|
# Ensure OCR is initialized (wait up to 30 seconds)
|
||||||
|
if not self._ensure_ocr_initialized(timeout=30.0):
|
||||||
|
logger.warning("OCR not available after waiting")
|
||||||
return {"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0}
|
return {"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0}
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@@ -182,7 +219,9 @@ class OptimizedOCRProcessor:
|
|||||||
|
|
||||||
batch_start_time = time.time()
|
batch_start_time = time.time()
|
||||||
|
|
||||||
if not self.ocr_available:
|
# Ensure OCR is initialized (wait up to 30 seconds)
|
||||||
|
if not self._ensure_ocr_initialized(timeout=30.0):
|
||||||
|
logger.warning("OCR not available for batch processing")
|
||||||
return [BatchOCRResult(
|
return [BatchOCRResult(
|
||||||
image_path=path,
|
image_path=path,
|
||||||
text="",
|
text="",
|
||||||
|
|||||||
BIN
inputs/safedistance/safedistance/__enqueued__/ocr_001.pdf
Normal file
BIN
inputs/safedistance/safedistance/__enqueued__/ocr_001.pdf
Normal file
Binary file not shown.
41
measure_ocr_init.py
Normal file
41
measure_ocr_init.py
Normal file
@@ -0,0 +1,41 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Measure OCR initialization time.
|
||||||
|
"""
|
||||||
|
import time
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
|
||||||
|
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||||
|
|
||||||
|
def measure_optimized_ocr_init():
    """Time how long an OptimizedOCRProcessor takes to become ready.

    Prints the elapsed time and the processor's ``ocr_available`` flag.

    Returns:
        The constructed processor.
    """
    from LightRAG_main.lightrag.optimized_ocr_processor import OptimizedOCRProcessor

    start = time.perf_counter()
    processor = OptimizedOCRProcessor(use_gpu=True, batch_size=4, max_workers=2)
    # When the constructor only kicks off initialization on a background
    # thread, stopping the timer here would measure thread start-up and read
    # ocr_available before it can have been set. Wait for completion if the
    # processor offers a way to do so; older versions without the hook
    # initialize synchronously and need no wait.
    wait_until_ready = getattr(processor, "_ensure_ocr_initialized", None)
    if callable(wait_until_ready):
        wait_until_ready()
    elapsed = time.perf_counter() - start
    print(f"OptimizedOCRProcessor initialization time: {elapsed:.2f} seconds")
    print(f"OCR available: {processor.ocr_available}")
    return processor
|
||||||
|
|
||||||
|
def measure_simple_ocr_init():
    """Construct a SimpleOCRProcessor, print how long construction took, and return it."""
    from simple_ocr_processor import SimpleOCRProcessor

    t0 = time.time()
    processor = SimpleOCRProcessor()
    elapsed = time.time() - t0

    print(f"SimpleOCRProcessor initialization time: {elapsed:.2f} seconds")
    print(f"OCR available: {processor.available}")
    return processor
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Each measurement runs in its own try block so that a failure in one
    # is reported and does not stop the other from running.
    print("Measuring OCR initialization times...")

    try:
        measure_optimized_ocr_init()
    except Exception as e:
        print(f"Failed to measure OptimizedOCRProcessor: {e}")

    try:
        measure_simple_ocr_init()
    except Exception as e:
        print(f"Failed to measure SimpleOCRProcessor: {e}")
|
||||||
48
measure_ocr_init2.py
Normal file
48
measure_ocr_init2.py
Normal file
@@ -0,0 +1,48 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Measure OCR initialization time.
|
||||||
|
"""
|
||||||
|
import time
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
|
||||||
|
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||||
|
|
||||||
|
def measure_optimized_ocr_init():
    """Time how long an OptimizedOCRProcessor takes to become ready.

    The class is imported from ``lightrag``; if that fails, the sibling
    ``LightRAG-main`` directory is put on ``sys.path`` and the import is
    retried. Prints the elapsed time and the ``ocr_available`` flag.

    Returns:
        The constructed processor.
    """
    try:
        from lightrag.optimized_ocr_processor import OptimizedOCRProcessor
    except ImportError:
        # try alternative path
        sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'LightRAG-main'))
        from lightrag.optimized_ocr_processor import OptimizedOCRProcessor

    start = time.perf_counter()
    processor = OptimizedOCRProcessor(use_gpu=True, batch_size=4, max_workers=2)
    # When the constructor only kicks off initialization on a background
    # thread, stopping the timer here would measure thread start-up and read
    # ocr_available before it can have been set. Wait for completion if the
    # processor offers a way to do so; older versions without the hook
    # initialize synchronously and need no wait.
    wait_until_ready = getattr(processor, "_ensure_ocr_initialized", None)
    if callable(wait_until_ready):
        wait_until_ready()
    elapsed = time.perf_counter() - start
    print(f"OptimizedOCRProcessor initialization time: {elapsed:.2f} seconds")
    print(f"OCR available: {processor.ocr_available}")
    return processor
|
||||||
|
|
||||||
|
def measure_simple_ocr_init():
    """Construct a SimpleOCRProcessor, print how long construction took, and return it."""
    from simple_ocr_processor import SimpleOCRProcessor

    t0 = time.time()
    processor = SimpleOCRProcessor()
    elapsed = time.time() - t0

    print(f"SimpleOCRProcessor initialization time: {elapsed:.2f} seconds")
    print(f"OCR available: {processor.available}")
    return processor
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Each measurement runs in its own try block so that a failure in one
    # is reported and does not stop the other from running.
    print("Measuring OCR initialization times...")

    try:
        measure_optimized_ocr_init()
    except Exception as e:
        print(f"Failed to measure OptimizedOCRProcessor: {e}")
        # Full stack trace for this case only; the import fallback makes
        # failures here harder to diagnose from the message alone.
        import traceback
        traceback.print_exc()

    try:
        measure_simple_ocr_init()
    except Exception as e:
        print(f"Failed to measure SimpleOCRProcessor: {e}")
|
||||||
52
test_ocr_batch.py
Normal file
52
test_ocr_batch.py
Normal file
@@ -0,0 +1,52 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Test OCR batch processing and initialization improvements.
|
||||||
|
"""
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'LightRAG-main'))
|
||||||
|
|
||||||
|
from lightrag.optimized_ocr_processor import OptimizedOCRProcessor
|
||||||
|
from lightrag.document_processor import DocumentProcessor
|
||||||
|
import asyncio
|
||||||
|
import time
|
||||||
|
|
||||||
|
def test_ocr_initialization():
    """Construct an OptimizedOCRProcessor and report how long it takes to become ready.

    Waits up to 30 seconds for initialization, prints the outcome (and the
    processor metrics when ready), and always closes the processor.
    """
    print("Testing OCR initialization improvements...")
    processor = OptimizedOCRProcessor(use_gpu=True, batch_size=4, max_workers=2)

    try:
        # Wait for initialization (should be quick if pre-warmed)
        start = time.time()
        ready = processor._ensure_ocr_initialized(timeout=30.0)
        elapsed = time.time() - start
        print(f"OCR ready: {ready}, elapsed: {elapsed:.2f}s")

        if ready:
            print("OCR metrics:", processor.get_metrics())
        else:
            print("OCR not available")
    finally:
        # Close even when the wait or the metrics call raises, so the
        # processor is not left open by a failed run.
        processor.close()
|
||||||
|
|
||||||
|
async def test_document_processor():
    """Run DocumentProcessor on ``test_meaningful.pdf`` if present and print a summary.

    When the file does not exist, only the processor is constructed and a
    skip message is printed.
    """
    print("\nTesting DocumentProcessor with batch OCR...")
    processor = DocumentProcessor()

    # Use a dummy PDF file (if exists) or just test initialization
    test_pdf = "test_meaningful.pdf"
    if not os.path.exists(test_pdf):
        print(f"Test PDF not found at {test_pdf}, skipping processing test.")
        return

    print(f"Processing {test_pdf}...")
    start = time.time()
    result = await processor.process_document(test_pdf)
    elapsed = time.time() - start

    print(f"Processing completed in {elapsed:.2f}s")
    print(f"Success: {result.success}")
    print(f"Pages: {result.metadata.get('pages', 'N/A')}")
    print(f"Processed with OCR: {result.metadata.get('processed_with_ocr', False)}")
    print(f"Content length: {len(result.content)}")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Synchronous initialization check first, then the async document test
    # on its own event loop.
    test_ocr_initialization()
    asyncio.run(test_document_processor())

    print("\nAll tests completed.")
|
||||||
Reference in New Issue
Block a user