# Warm up the OCR processor in the background so the first upload does not
# pay the cold-start cost.
async def warm_up_ocr_processor():
    """Build the shared document processor without stalling the event loop.

    The import and the call to ``get_document_processor()`` are synchronous,
    so they are pushed to the default executor; awaiting them directly inside
    this coroutine would freeze every other request until they returned.
    Any failure is logged and swallowed because warm-up is only an
    optimisation.
    """

    def _load_processor():
        # Imported here so the server still starts when OCR is not used.
        from lightrag.document_processor import get_document_processor

        # Initializes OptimizedOCRProcessor (about 9 seconds when measured).
        # NOTE(review): assumes get_document_processor() tolerates its first
        # call coming from a worker thread - confirm it guards its singleton.
        return get_document_processor()

    try:
        logger.info("Starting OCR processor warm‑up...")
        await asyncio.get_running_loop().run_in_executor(None, _load_processor)
        logger.info("OCR processor warmed up successfully")
    except Exception as e:
        logger.warning(f"OCR warm‑up failed (non‑critical): {e}")


# Schedule the warm-up as a background task. The task set keeps a strong
# reference until completion; the done-callback then drops it.
warm_up_task = asyncio.create_task(warm_up_ocr_processor())
app.state.background_tasks.add(warm_up_task)
warm_up_task.add_done_callback(app.state.background_tasks.discard)
🚀\n") yield diff --git a/LightRAG-main/lightrag/document_processor.py b/LightRAG-main/lightrag/document_processor.py index 28100a7e..f5a7f790 100644 --- a/LightRAG-main/lightrag/document_processor.py +++ b/LightRAG-main/lightrag/document_processor.py @@ -586,7 +586,7 @@ class DocumentProcessor: return score async def _process_pdf(self, file_path: Path) -> ProcessingResult: - """Process PDF files with text extraction and OCR fallback""" + """Process PDF files with text extraction and OCR fallback using batch processing""" pdf_document = None try: content_parts = [] @@ -598,6 +598,10 @@ class DocumentProcessor: pdf_document = fitz.open(str(file_path)) total_pages = len(pdf_document) + # Collect pages that need OCR + ocr_pages = [] # list of (page_num, temp_path) + page_texts = {} # page_num -> text (if usable) + for page_num in range(total_pages): page = pdf_document[page_num] @@ -607,7 +611,7 @@ class DocumentProcessor: # Determine if text is usable (not garbled) # Threshold 0.5 means at least half of characters are printable ASCII and not replacement if text.strip() and text_score >= 0.5: - content_parts.append(f"Page {page_num + 1}:\n{text}") + page_texts[page_num] = text else: # Text is empty, garbled, or low quality -> use OCR logger.info(f"Page {page_num + 1} has no usable text (score {text_score:.3f}), using high-resolution OCR") @@ -621,34 +625,94 @@ class DocumentProcessor: temp_file.write(img_data) temp_path = temp_file.name - try: - if self.ocr_processor.ocr_available: - logger.info(f"Running OCR on page {page_num + 1} with high resolution") + ocr_pages.append((page_num, temp_path)) + + # Process OCR pages in batch if any + if ocr_pages and self.ocr_processor.ocr_available: + try: + temp_paths = [temp_path for _, temp_path in ocr_pages] + logger.info(f"Running batch OCR on {len(temp_paths)} pages") + batch_results = self.ocr_processor.extract_text_from_images_batch(temp_paths) + logger.info(f"Batch OCR completed for {len(batch_results)} pages") + + # Map 
results back to pages + for idx, (page_num, temp_path) in enumerate(ocr_pages): + ocr_result = None + if idx < len(batch_results): + batch_result = batch_results[idx] + ocr_result = { + "text": batch_result.text, + "confidence": batch_result.confidence, + "bboxes": batch_result.bboxes, + "line_count": batch_result.line_count + } + else: + # Fallback to individual OCR + ocr_result = self.ocr_processor.extract_text_from_image(temp_path) + + if ocr_result["text"].strip(): + logger.info(f"OCR extracted {len(ocr_result['text'])} characters from page {page_num + 1}") + content_parts.append(f"Page {page_num + 1} (OCR):\n{str(ocr_result['text'])}") + processed_with_ocr = True + else: + logger.warning(f"OCR returned empty text for page {page_num + 1}") + # Don't add empty content, just mark as processed + content_parts.append(f"Page {page_num + 1}: [Scanned content - no text detected by OCR]") + + # Extract tables from OCR + ocr_tables = self.ocr_processor.extract_tables_from_image(temp_path) + if ocr_tables: + logger.info(f"Found {len(ocr_tables)} tables on page {page_num + 1}") + tables.extend(ocr_tables) + + # Clean up temporary file + if temp_path and os.path.exists(temp_path): + os.unlink(temp_path) + + except Exception as batch_error: + logger.error(f"Batch OCR processing failed: {batch_error}") + # Fall back to individual processing for each page + for page_num, temp_path in ocr_pages: + try: ocr_result = self.ocr_processor.extract_text_from_image(temp_path) - if ocr_result["text"].strip(): - logger.info(f"OCR extracted {len(ocr_result['text'])} characters from page {page_num + 1}") content_parts.append(f"Page {page_num + 1} (OCR):\n{str(ocr_result['text'])}") processed_with_ocr = True else: - logger.warning(f"OCR returned empty text for page {page_num + 1}") - # Don't add empty content, just mark as processed content_parts.append(f"Page {page_num + 1}: [Scanned content - no text detected by OCR]") - # Extract tables from OCR + # Extract tables ocr_tables = 
self.ocr_processor.extract_tables_from_image(temp_path) if ocr_tables: - logger.info(f"Found {len(ocr_tables)} tables on page {page_num + 1}") tables.extend(ocr_tables) - else: - logger.warning("OCR not available, skipping OCR processing") - content_parts.append(f"Page {page_num + 1}: [Image content - OCR not available]") - except Exception as ocr_error: - logger.error(f"OCR processing failed for page {page_num + 1}: {ocr_error}") - content_parts.append(f"Page {page_num + 1}: [Image content - OCR failed: {str(ocr_error)}]") - finally: + except Exception as ocr_error: + logger.error(f"OCR processing failed for page {page_num + 1}: {ocr_error}") + content_parts.append(f"Page {page_num + 1}: [Image content - OCR failed: {str(ocr_error)}]") + finally: + if temp_path and os.path.exists(temp_path): + os.unlink(temp_path) + elif ocr_pages and not self.ocr_processor.ocr_available: + logger.warning("OCR not available, skipping OCR processing") + for page_num, temp_path in ocr_pages: + content_parts.append(f"Page {page_num + 1}: [Image content - OCR not available]") + if temp_path and os.path.exists(temp_path): os.unlink(temp_path) + # Add text pages content + for page_num, text in page_texts.items(): + content_parts.append(f"Page {page_num + 1}:\n{text}") + + # Sort content parts by page number + def extract_page_num(part): + # Find the first number after "Page " + import re + match = re.search(r'Page\s+(\d+)', part) + if match: + return int(match.group(1)) + return 0 + + content_parts.sort(key=extract_page_num) + full_content = "\n\n".join(content_parts) return ProcessingResult( diff --git a/LightRAG-main/lightrag/optimized_ocr_processor.py b/LightRAG-main/lightrag/optimized_ocr_processor.py index 7128f293..fe49006e 100644 --- a/LightRAG-main/lightrag/optimized_ocr_processor.py +++ b/LightRAG-main/lightrag/optimized_ocr_processor.py @@ -7,6 +7,7 @@ import os import logging import asyncio import concurrent.futures +import threading from typing import Dict, List, Any, 
def _start_lazy_initialization(self):
    """Start OCR initialization on a background daemon thread, at most once.

    The started-flag check, thread creation and thread start all happen
    under ``_initialization_lock`` so concurrent callers launch one worker.
    The flag is only set after the thread has actually started, so a failed
    ``start()`` does not leave the object claiming initialization is running.
    """
    with self._initialization_lock:
        if self._initialization_started:
            return
        worker = threading.Thread(
            target=self._initialize_ocr,
            name="OCRInitializer",
            daemon=True,
        )
        worker.start()
        # Publish the handle before the flag: whoever observes the flag can
        # always find the thread to wait on.
        self._initialization_thread = worker
        self._initialization_started = True
    logger.info("Started lazy OCR initialization in background thread")


def _ensure_ocr_initialized(self, timeout: Optional[float] = None):
    """Block until OCR initialization completes or *timeout* expires.

    Args:
        timeout: Maximum number of seconds to wait; ``None`` waits
            indefinitely.

    Returns:
        ``self.ocr_available`` after waiting. It is falsy both when
        initialization failed and when it is still running at timeout.
    """
    if self.ocr_available:
        return True

    # Idempotent: returns immediately when the worker was already launched.
    self._start_lazy_initialization()

    # Read the handle under the lock so a launch in progress on another
    # thread is fully visible before deciding whether to wait.
    with self._initialization_lock:
        worker = self._initialization_thread

    # Thread.join() raises RuntimeError when asked to join the current
    # thread, which would happen if initialization code re-entered here.
    if worker is not None and worker is not threading.current_thread():
        worker.join(timeout=timeout)

    return self.ocr_available
"""Measure how long each OCR processor takes to become usable."""
import os
import sys
import time

REPO_ROOT = os.path.dirname(os.path.abspath(__file__))
# The package lives in the hyphenated "LightRAG-main" directory, which cannot
# be imported as "LightRAG_main"; expose it on sys.path and import "lightrag".
LIGHTRAG_ROOT = os.path.join(REPO_ROOT, "LightRAG-main")
for _path in (LIGHTRAG_ROOT, REPO_ROOT):
    if _path not in sys.path:
        sys.path.insert(0, _path)

# Upper bound on how long to wait for background model loading.
INIT_TIMEOUT_SECONDS = 120.0


def measure_init(label, factory, available_attr, timeout=INIT_TIMEOUT_SECONDS):
    """Time ``factory()`` until the processor is usable and print a report.

    Args:
        label: Name printed in the report lines.
        factory: Zero-argument callable that builds the processor.
        available_attr: Name of the attribute holding the availability flag.
        timeout: Seconds to wait for deferred initialization to finish.

    Returns:
        The processor instance that ``factory`` produced.
    """
    start = time.perf_counter()
    processor = factory()
    # A processor that loads its model on a background thread returns from
    # its constructor almost immediately; wait for it so the figure covers
    # the real initialization rather than thread start-up.
    wait_until_ready = getattr(processor, "_ensure_ocr_initialized", None)
    if callable(wait_until_ready):
        wait_until_ready(timeout=timeout)
    elapsed = time.perf_counter() - start
    print(f"{label} initialization time: {elapsed:.2f} seconds")
    print(f"OCR available: {getattr(processor, available_attr)}")
    return processor


def measure_optimized_ocr_init():
    """Measure OptimizedOCRProcessor start-up and return the instance."""
    from lightrag.optimized_ocr_processor import OptimizedOCRProcessor

    return measure_init(
        "OptimizedOCRProcessor",
        lambda: OptimizedOCRProcessor(use_gpu=True, batch_size=4, max_workers=2),
        "ocr_available",
    )


def measure_simple_ocr_init():
    """Measure SimpleOCRProcessor start-up and return the instance."""
    from simple_ocr_processor import SimpleOCRProcessor

    return measure_init("SimpleOCRProcessor", SimpleOCRProcessor, "available")


if __name__ == "__main__":
    print("Measuring OCR initialization times...")
    # Each measurement is best-effort: one missing backend must not stop the
    # other from being measured.
    for _label, _measure in (
        ("OptimizedOCRProcessor", measure_optimized_ocr_init),
        ("SimpleOCRProcessor", measure_simple_ocr_init),
    ):
        try:
            _measure()
        except Exception as e:
            print(f"Failed to measure {_label}: {e}")
"""Measure OCR initialization time, with an import fallback for LightRAG-main."""
import os
import sys
import time
import traceback

# Absolute, so the fallback path stays valid if the working directory changes.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, SCRIPT_DIR)

# Upper bound on how long to wait for background model loading.
INIT_TIMEOUT_SECONDS = 120.0


def time_until_ready(factory):
    """Build a processor and wait for any deferred initialization.

    Args:
        factory: Zero-argument callable that builds the processor.

    Returns:
        A ``(processor, elapsed_seconds)`` tuple. The elapsed time includes
        the wait for background initialization when the processor exposes
        ``_ensure_ocr_initialized``.
    """
    began = time.perf_counter()
    instance = factory()
    # Without this wait only the constructor is timed, which for a processor
    # that initializes on a background thread is just thread start-up.
    waiter = getattr(instance, "_ensure_ocr_initialized", None)
    if callable(waiter):
        waiter(timeout=INIT_TIMEOUT_SECONDS)
    return instance, time.perf_counter() - began


def measure_optimized_ocr_init():
    """Report OptimizedOCRProcessor initialization time and return it."""
    try:
        from lightrag.optimized_ocr_processor import OptimizedOCRProcessor
    except ImportError:
        # Fall back to the source tree that sits next to this script.
        sys.path.insert(0, os.path.join(SCRIPT_DIR, 'LightRAG-main'))
        from lightrag.optimized_ocr_processor import OptimizedOCRProcessor

    processor, elapsed = time_until_ready(
        lambda: OptimizedOCRProcessor(use_gpu=True, batch_size=4, max_workers=2)
    )
    print(f"OptimizedOCRProcessor initialization time: {elapsed:.2f} seconds")
    print(f"OCR available: {processor.ocr_available}")
    return processor


def measure_simple_ocr_init():
    """Report SimpleOCRProcessor initialization time and return it."""
    from simple_ocr_processor import SimpleOCRProcessor

    processor, elapsed = time_until_ready(SimpleOCRProcessor)
    print(f"SimpleOCRProcessor initialization time: {elapsed:.2f} seconds")
    print(f"OCR available: {processor.available}")
    return processor


if __name__ == "__main__":
    print("Measuring OCR initialization times...")
    try:
        measure_optimized_ocr_init()
    except Exception as e:
        print(f"Failed to measure OptimizedOCRProcessor: {e}")
        traceback.print_exc()
    try:
        measure_simple_ocr_init()
    except Exception as e:
        print(f"Failed to measure SimpleOCRProcessor: {e}")
"""Exercise OCR batch processing and the background initialization path."""
import asyncio
import os
import sys
import time

# Make the bundled package importable before the project imports below.
sys.path.insert(
    0, os.path.join(os.path.dirname(os.path.abspath(__file__)), 'LightRAG-main')
)

from lightrag.optimized_ocr_processor import OptimizedOCRProcessor  # noqa: E402
from lightrag.document_processor import DocumentProcessor  # noqa: E402


def test_ocr_initialization():
    """Build a processor, wait for it to become ready, and print its metrics."""
    print("Testing OCR initialization improvements...")
    processor = OptimizedOCRProcessor(use_gpu=True, batch_size=4, max_workers=2)
    try:
        # This is a fresh instance, so the wait covers the full model load;
        # it only returns early if initialization has already finished.
        start = time.time()
        ready = processor._ensure_ocr_initialized(timeout=30.0)
        elapsed = time.time() - start
        print(f"OCR ready: {ready}, elapsed: {elapsed:.2f}s")

        if ready:
            print("OCR metrics:", processor.get_metrics())
        else:
            print("OCR not available")
    finally:
        # Always release the processor, even when a step above raised.
        processor.close()


async def test_document_processor():
    """Run one PDF through DocumentProcessor when the sample file exists."""
    print("\nTesting DocumentProcessor with batch OCR...")
    processor = DocumentProcessor()
    # Resolved against the current working directory; the run is skipped
    # when the sample file is absent.
    test_pdf = "test_meaningful.pdf"
    if not os.path.exists(test_pdf):
        print(f"Test PDF not found at {test_pdf}, skipping processing test.")
        return

    print(f"Processing {test_pdf}...")
    start = time.time()
    result = await processor.process_document(test_pdf)
    elapsed = time.time() - start
    print(f"Processing completed in {elapsed:.2f}s")
    print(f"Success: {result.success}")
    print(f"Pages: {result.metadata.get('pages', 'N/A')}")
    print(f"Processed with OCR: {result.metadata.get('processed_with_ocr', False)}")
    print(f"Content length: {len(result.content)}")


if __name__ == "__main__":
    test_ocr_initialization()
    asyncio.run(test_document_processor())
    print("\nAll tests completed.")