ocr speed improved

2026-01-13 19:10:24 +08:00
parent a5eb381384
commit e7256a10ea
7 changed files with 283 additions and 22 deletions
--- a/LightRAG-main/lightrag/api/lightrag_server.py
+++ b/LightRAG-main/lightrag/api/lightrag_server.py
@@ -244,6 +244,23 @@ def create_app(args):
                task.add_done_callback(app.state.background_tasks.discard)
                logger.info(f"Process {os.getpid()} auto scan task started at startup for workspace '{args.workspace}'.")
            # Warm up OCR processor in background to avoid cold‑start delay on first upload
            async def warm_up_ocr_processor():
                """Warm up the document processor without stalling the event loop.

                The import of ``lightrag.document_processor`` and the call to
                ``get_document_processor()`` are synchronous (the original note
                estimated roughly nine seconds for OCR initialization), so they are
                executed on the loop's default executor. Any failure is logged as a
                warning and otherwise ignored: warm-up is best-effort only.
                """
                def _load_processor():
                    # Import inside the worker so the OCR stack is only pulled in
                    # when warm-up actually runs.
                    from lightrag.document_processor import get_document_processor
                    return get_document_processor()

                try:
                    logger.info("Starting OCR processor warm‑up...")
                    # Running the blocking work in a thread keeps the server able to
                    # accept connections while the processor is being created.
                    await asyncio.get_running_loop().run_in_executor(None, _load_processor)
                    logger.info("OCR processor warmed up successfully")
                except Exception as e:
                    logger.warning(f"OCR warm‑up failed (non‑critical): {e}")
            # Schedule warm‑up as a background task (non‑blocking)
            warm_up_task = asyncio.create_task(warm_up_ocr_processor())
            app.state.background_tasks.add(warm_up_task)
            warm_up_task.add_done_callback(app.state.background_tasks.discard)
            ASCIIColors.green("\nServer is ready to accept connections! 🚀\n")
            yield
--- a/LightRAG-main/lightrag/document_processor.py
+++ b/LightRAG-main/lightrag/document_processor.py
@@ -586,7 +586,7 @@ class DocumentProcessor:
        return score
    async def _process_pdf(self, file_path: Path) -> ProcessingResult:
-        """Process PDF files with text extraction and OCR fallback"""
+        """Process PDF files with text extraction and OCR fallback using batch processing"""
        pdf_document = None
        try:
            content_parts = []
@@ -598,6 +598,10 @@ class DocumentProcessor:
            pdf_document = fitz.open(str(file_path))
            total_pages = len(pdf_document)
            # Collect pages that need OCR
            ocr_pages = []  # list of (page_num, temp_path)
            page_texts = {}  # page_num -> text (if usable)
            for page_num in range(total_pages):
                page = pdf_document[page_num]
@@ -607,7 +611,7 @@ class DocumentProcessor:
                # Determine if text is usable (not garbled)
                # Threshold 0.5 means at least half of characters are printable ASCII and not replacement
                if text.strip() and text_score >= 0.5:
-                    content_parts.append(f"Page {page_num + 1}:\n{text}")
+                    page_texts[page_num] = text
                else:
                    # Text is empty, garbled, or low quality -> use OCR
                    logger.info(f"Page {page_num + 1} has no usable text (score {text_score:.3f}), using high-resolution OCR")
@@ -621,34 +625,94 @@ class DocumentProcessor:
                        temp_file.write(img_data)
                        temp_path = temp_file.name
-                    try:
+                    ocr_pages.append((page_num, temp_path))
-                        if self.ocr_processor.ocr_available:
+            
-                            logger.info(f"Running OCR on page {page_num + 1} with high resolution")
+            # Process OCR pages in batch if any
            if ocr_pages and self.ocr_processor.ocr_available:
                try:
                    temp_paths = [temp_path for _, temp_path in ocr_pages]
                    logger.info(f"Running batch OCR on {len(temp_paths)} pages")
                    batch_results = self.ocr_processor.extract_text_from_images_batch(temp_paths)
                    logger.info(f"Batch OCR completed for {len(batch_results)} pages")
                    # Map results back to pages
                    for idx, (page_num, temp_path) in enumerate(ocr_pages):
                        ocr_result = None
                        if idx < len(batch_results):
                            batch_result = batch_results[idx]
                            ocr_result = {
                                "text": batch_result.text,
                                "confidence": batch_result.confidence,
                                "bboxes": batch_result.bboxes,
                                "line_count": batch_result.line_count
                            }
                        else:
                            # Fallback to individual OCR
                            ocr_result = self.ocr_processor.extract_text_from_image(temp_path)
                        if ocr_result["text"].strip():
                            logger.info(f"OCR extracted {len(ocr_result['text'])} characters from page {page_num + 1}")
                            content_parts.append(f"Page {page_num + 1} (OCR):\n{str(ocr_result['text'])}")
                            processed_with_ocr = True
                        else:
                            logger.warning(f"OCR returned empty text for page {page_num + 1}")
                            # Don't add empty content, just mark as processed
                            content_parts.append(f"Page {page_num + 1}: [Scanned content - no text detected by OCR]")
                        # Extract tables from OCR
                        ocr_tables = self.ocr_processor.extract_tables_from_image(temp_path)
                        if ocr_tables:
                            logger.info(f"Found {len(ocr_tables)} tables on page {page_num + 1}")
                            tables.extend(ocr_tables)
                        # Clean up temporary file
                        if temp_path and os.path.exists(temp_path):
                            os.unlink(temp_path)
                except Exception as batch_error:
                    logger.error(f"Batch OCR processing failed: {batch_error}")
                    # Fall back to individual processing for each page
                    for page_num, temp_path in ocr_pages:
                        try:
                            ocr_result = self.ocr_processor.extract_text_from_image(temp_path)
                            if ocr_result["text"].strip():
                                logger.info(f"OCR extracted {len(ocr_result['text'])} characters from page {page_num + 1}")
                                content_parts.append(f"Page {page_num + 1} (OCR):\n{str(ocr_result['text'])}")
                                processed_with_ocr = True
                            else:
                                logger.warning(f"OCR returned empty text for page {page_num + 1}")
                                # Don't add empty content, just mark as processed
                                content_parts.append(f"Page {page_num + 1}: [Scanned content - no text detected by OCR]")
-                            # Extract tables from OCR
+                            # Extract tables
                            ocr_tables = self.ocr_processor.extract_tables_from_image(temp_path)
                            if ocr_tables:
                                logger.info(f"Found {len(ocr_tables)} tables on page {page_num + 1}")
                                tables.extend(ocr_tables)
-                        else:
+                        except Exception as ocr_error:
-                            logger.warning("OCR not available, skipping OCR processing")
+                            logger.error(f"OCR processing failed for page {page_num + 1}: {ocr_error}")
-                            content_parts.append(f"Page {page_num + 1}: [Image content - OCR not available]")
+                            content_parts.append(f"Page {page_num + 1}: [Image content - OCR failed: {str(ocr_error)}]")
-                    except Exception as ocr_error:
+                        finally:
-                        logger.error(f"OCR processing failed for page {page_num + 1}: {ocr_error}")
+                            if temp_path and os.path.exists(temp_path):
-                        content_parts.append(f"Page {page_num + 1}: [Image content - OCR failed: {str(ocr_error)}]")
+                                os.unlink(temp_path)
-                    finally:
+            elif ocr_pages and not self.ocr_processor.ocr_available:
                logger.warning("OCR not available, skipping OCR processing")
                for page_num, temp_path in ocr_pages:
                    content_parts.append(f"Page {page_num + 1}: [Image content - OCR not available]")
                    if temp_path and os.path.exists(temp_path):
                        os.unlink(temp_path)
            # Add text pages content
            for page_num, text in page_texts.items():
                content_parts.append(f"Page {page_num + 1}:\n{text}")
            # Sort content parts by page number
            def extract_page_num(part):
                """Return the first number that follows "Page " in *part*, or 0 if none is found."""
                import re
                found = re.search(r'Page\s+(\d+)', part)
                return int(found.group(1)) if found else 0
            content_parts.sort(key=extract_page_num)
            full_content = "\n\n".join(content_parts)
            return ProcessingResult(
--- a/LightRAG-main/lightrag/optimized_ocr_processor.py
+++ b/LightRAG-main/lightrag/optimized_ocr_processor.py
@@ -7,6 +7,7 @@ import os
 import logging
 import asyncio
 import concurrent.futures
 import threading
 from typing import Dict, List, Any, Optional, Tuple
 from dataclasses import dataclass
 import tempfile
@@ -35,7 +36,7 @@ class OptimizedOCRProcessor:
    Optimized OCR processor with batch processing, shared model instance, and async support
    """
-    def __init__(self, use_gpu: bool = True, languages: List[str] = None, 
+    def __init__(self, use_gpu: bool = True, languages: List[str] = None,
                 batch_size: int = 4, max_workers: int = 2):
        """
        Initialize optimized OCR processor
@@ -55,6 +56,9 @@ class OptimizedOCRProcessor:
        self._model_loaded = False
        self._temp_dir = None
        self._executor = None
        self._initialization_lock = threading.Lock()
        self._initialization_thread = None
        self._initialization_started = False
        # Performance metrics
        self.metrics = {
@@ -64,7 +68,38 @@ class OptimizedOCRProcessor:
            "errors": []
        }
-        self._initialize_ocr()
+        # Start lazy initialization in background thread
        self._start_lazy_initialization()
    def _start_lazy_initialization(self):
        """Start OCR initialization in a background thread, at most once.

        The thread is created, started and stored on ``_initialization_thread``
        *before* ``_initialization_started`` is set, all while holding
        ``_initialization_lock``. A caller that observes the flag as True can
        therefore rely on the thread attribute being set and joinable. If
        starting the thread raises, the flag stays False so a later call can
        retry.
        """
        with self._initialization_lock:
            if self._initialization_started:
                return
            init_thread = threading.Thread(
                target=self._initialize_ocr,
                name="OCRInitializer",
                daemon=True
            )
            init_thread.start()
            # Publish the thread first, then flip the flag, so readers that check
            # the flag without taking the lock never see it True with no thread.
            self._initialization_thread = init_thread
            self._initialization_started = True
        logger.info("Started lazy OCR initialization in background thread")
    def _ensure_ocr_initialized(self, timeout: float = None):
        """
        Wait for the background OCR initialization to finish.

        Starts the initialization if it has not been started yet, then joins
        the initialization thread for at most ``timeout`` seconds (``None``
        waits indefinitely).

        Returns True when OCR is already available on entry; otherwise returns
        the value of ``self.ocr_available`` after the wait.
        """
        if not self.ocr_available:
            if not self._initialization_started:
                self._start_lazy_initialization()
            worker = self._initialization_thread
            if worker is not None:
                worker.join(timeout=timeout)
            # Re-read the flag: the join may have timed out or initialization may have failed.
            return self.ocr_available
        return True
    def _initialize_ocr(self):
        """Initialize PaddleOCR with shared model instance"""
@@ -138,7 +173,9 @@ class OptimizedOCRProcessor:
        """
        start_time = time.time()
-        if not self.ocr_available:
+        # Ensure OCR is initialized (wait up to 30 seconds)
        if not self._ensure_ocr_initialized(timeout=30.0):
            logger.warning("OCR not available after waiting")
            return {"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0}
        try:
@@ -182,7 +219,9 @@ class OptimizedOCRProcessor:
        batch_start_time = time.time()
-        if not self.ocr_available:
+        # Ensure OCR is initialized (wait up to 30 seconds)
        if not self._ensure_ocr_initialized(timeout=30.0):
            logger.warning("OCR not available for batch processing")
            return [BatchOCRResult(
                image_path=path,
                text="",
--- a/inputs/safedistance/safedistance/enqueued/ocr_001.pdf
+++ b/inputs/safedistance/safedistance/enqueued/ocr_001.pdf
--- a/measure_ocr_init.py
+++ b/measure_ocr_init.py
@@ -0,0 +1,41 @@
 #!/usr/bin/env python3
 """
 Measure OCR initialization time.
 """
 import time
 import sys
 import os
 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
 def measure_optimized_ocr_init():
    """Time how long OptimizedOCRProcessor takes to become ready and print the result.

    The constructor only starts a background initialization thread, so the
    timer also covers waiting for that thread via ``_ensure_ocr_initialized()``.

    Returns the processor instance that was created.
    """
    try:
        from lightrag.optimized_ocr_processor import OptimizedOCRProcessor
    except ImportError:
        # The package lives under the hyphenated "LightRAG-main" directory, which
        # cannot be imported as a module name; add it to sys.path instead.
        sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), 'LightRAG-main'))
        from lightrag.optimized_ocr_processor import OptimizedOCRProcessor
    start = time.perf_counter()
    processor = OptimizedOCRProcessor(use_gpu=True, batch_size=4, max_workers=2)
    # Without this wait only the thread spawn would be measured and
    # ocr_available would still be False when printed.
    processor._ensure_ocr_initialized()
    elapsed = time.perf_counter() - start
    print(f"OptimizedOCRProcessor initialization time: {elapsed:.2f} seconds")
    print(f"OCR available: {processor.ocr_available}")
    return processor
 def measure_simple_ocr_init():
    """Time the SimpleOCRProcessor constructor, print the result, and return the instance."""
    # Imported here so a missing module only fails this measurement (the caller catches it).
    from simple_ocr_processor import SimpleOCRProcessor
    start = time.time()
    processor = SimpleOCRProcessor()
    elapsed = time.time() - start
    print(f"SimpleOCRProcessor initialization time: {elapsed:.2f} seconds")
    print(f"OCR available: {processor.available}")
    return processor
 if __name__ == "__main__":
    # Script entry point: run both measurements independently so that a failure
    # in one (e.g. a missing module) does not prevent the other from running.
    print("Measuring OCR initialization times...")
    try:
        measure_optimized_ocr_init()
    except Exception as e:
        print(f"Failed to measure OptimizedOCRProcessor: {e}")
    try:
        measure_simple_ocr_init()
    except Exception as e:
        print(f"Failed to measure SimpleOCRProcessor: {e}")
--- a/measure_ocr_init2.py
+++ b/measure_ocr_init2.py
@@ -0,0 +1,48 @@
 #!/usr/bin/env python3
 """
 Measure OCR initialization time.
 """
 import time
 import sys
 import os
 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
 def measure_optimized_ocr_init():
    """Time how long OptimizedOCRProcessor takes to become ready and print the result.

    The constructor only starts a background initialization thread, so the
    timer also covers waiting for that thread via ``_ensure_ocr_initialized()``.

    Returns the processor instance that was created.
    """
    try:
        from lightrag.optimized_ocr_processor import OptimizedOCRProcessor
    except ImportError:
        # try alternative path
        sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'LightRAG-main'))
        from lightrag.optimized_ocr_processor import OptimizedOCRProcessor
    start = time.perf_counter()
    processor = OptimizedOCRProcessor(use_gpu=True, batch_size=4, max_workers=2)
    # Without this wait only the thread spawn would be measured and
    # ocr_available would still be False when printed.
    processor._ensure_ocr_initialized()
    elapsed = time.perf_counter() - start
    print(f"OptimizedOCRProcessor initialization time: {elapsed:.2f} seconds")
    print(f"OCR available: {processor.ocr_available}")
    return processor
 def measure_simple_ocr_init():
    """Time the SimpleOCRProcessor constructor, print the result, and return the instance."""
    # Imported here so a missing module only fails this measurement (the caller catches it).
    from simple_ocr_processor import SimpleOCRProcessor
    start = time.time()
    processor = SimpleOCRProcessor()
    elapsed = time.time() - start
    print(f"SimpleOCRProcessor initialization time: {elapsed:.2f} seconds")
    print(f"OCR available: {processor.available}")
    return processor
 if __name__ == "__main__":
    # Script entry point: run both measurements independently so that a failure
    # in one does not prevent the other from running.
    print("Measuring OCR initialization times...")
    try:
        measure_optimized_ocr_init()
    except Exception as e:
        print(f"Failed to measure OptimizedOCRProcessor: {e}")
        # Only this measurement prints a full traceback on failure.
        import traceback
        traceback.print_exc()
    try:
        measure_simple_ocr_init()
    except Exception as e:
        print(f"Failed to measure SimpleOCRProcessor: {e}")
--- a/test_ocr_batch.py
+++ b/test_ocr_batch.py
@@ -0,0 +1,52 @@
 #!/usr/bin/env python3
 """
 Test OCR batch processing and initialization improvements.
 """
 import sys
 import os
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'LightRAG-main'))
 from lightrag.optimized_ocr_processor import OptimizedOCRProcessor
 from lightrag.document_processor import DocumentProcessor
 import asyncio
 import time
 def test_ocr_initialization():
    """Create an OptimizedOCRProcessor, wait for it to become ready, and report timing.

    The processor is always closed, even if waiting or reading the metrics raises.
    """
    print("Testing OCR initialization improvements...")
    processor = OptimizedOCRProcessor(use_gpu=True, batch_size=4, max_workers=2)
    try:
        # Wait (up to 30 s) for the background initialization started by the constructor.
        start = time.time()
        ready = processor._ensure_ocr_initialized(timeout=30.0)
        elapsed = time.time() - start
        print(f"OCR ready: {ready}, elapsed: {elapsed:.2f}s")
        if ready:
            print("OCR metrics:", processor.get_metrics())
        else:
            print("OCR not available")
    finally:
        # Release the processor's resources regardless of how the checks above ended.
        processor.close()
 async def test_document_processor():
    """Run DocumentProcessor on ``test_meaningful.pdf`` (if present) and print a summary.

    When the file does not exist in the current working directory, only the
    processor construction is exercised and a skip message is printed.
    """
    print("\nTesting DocumentProcessor with batch OCR...")
    doc_processor = DocumentProcessor()
    # Use a dummy PDF file (if exists) or just test initialization
    test_pdf = "test_meaningful.pdf"
    if not os.path.exists(test_pdf):
        print(f"Test PDF not found at {test_pdf}, skipping processing test.")
        return

    print(f"Processing {test_pdf}...")
    started_at = time.time()
    result = await doc_processor.process_document(test_pdf)
    elapsed = time.time() - started_at
    print(f"Processing completed in {elapsed:.2f}s")
    print(f"Success: {result.success}")
    print(f"Pages: {result.metadata.get('pages', 'N/A')}")
    print(f"Processed with OCR: {result.metadata.get('processed_with_ocr', False)}")
    print(f"Content length: {len(result.content)}")
 if __name__ == "__main__":
    # Script entry point: run the synchronous initialization check first, then
    # the async document-processing check on a fresh event loop.
    test_ocr_initialization()
    asyncio.run(test_document_processor())
    print("\nAll tests completed.")