Files
railseek6/LightRAG-main/lightrag/optimized_ocr_processor.py
2026-01-14 15:15:01 +08:00

696 lines
26 KiB
Python

"""
Optimized OCR Processor with Batch Processing and Async Support
Replaces the inefficient process-per-request approach with shared model instance
"""
import os
import logging
import asyncio
import concurrent.futures
import threading
from typing import Dict, List, Any, Optional, Tuple
from dataclasses import dataclass
import tempfile
from pathlib import Path
import json
import time
# Configure logging
logger = logging.getLogger(__name__)
@dataclass
class BatchOCRResult:
    """Batch OCR processing result for a single image."""
    image_path: str              # path of the source image this result belongs to
    text: str                    # recognized text, one OCR line per '\n'-joined entry
    confidence: float            # average per-line confidence (0.0 when no lines found)
    bboxes: List                 # bounding boxes, one per recognized text line
    line_count: int              # number of recognized text lines
    processing_time: float       # wall-clock seconds spent processing this image
    error: Optional[str] = None  # error message when processing failed, else None
class OptimizedOCRProcessor:
    """
    Optimized OCR processor with batch processing, a shared model instance,
    and async support.

    A single PaddleOCR engine is created lazily in a daemon thread and then
    shared by every request, replacing the inefficient process-per-request
    approach.  Public entry points block in ``_ensure_ocr_initialized`` until
    the engine is ready (or a timeout expires) and degrade gracefully to
    empty results when OCR is unavailable.
    """

    def __init__(self, use_gpu: bool = True, languages: List[str] = None,
                 batch_size: int = 4, max_workers: int = 2):
        """
        Initialize optimized OCR processor.

        Args:
            use_gpu: Whether to use GPU acceleration
            languages: List of languages for OCR; the first entry is handed to
                PaddleOCR (defaults to ['en', 'ch'])
            batch_size: Number of images to process in each batch
            max_workers: Maximum number of parallel workers
        """
        self.use_gpu = use_gpu
        self.languages = languages or ['en', 'ch']
        self.batch_size = batch_size
        self.max_workers = max_workers
        self.ocr_available = False
        self._ocr_engine = None
        self._model_loaded = False
        self._temp_dir = None
        self._executor = None
        self._initialization_lock = threading.Lock()
        self._initialization_thread = None
        self._initialization_started = False
        # Performance metrics.
        # NOTE(review): these counters are updated without a lock from both
        # the caller's thread and worker threads; values may drift slightly
        # under concurrent use -- confirm that is acceptable before relying
        # on them for billing/accounting.
        self.metrics = {
            "total_images_processed": 0,
            "total_processing_time": 0.0,
            "batch_processing_times": [],
            "errors": []
        }
        # Start lazy initialization in background thread so construction
        # never blocks on model loading.
        self._start_lazy_initialization()

    def _start_lazy_initialization(self):
        """Start OCR initialization in a background thread (idempotent)."""
        with self._initialization_lock:
            if self._initialization_started:
                return
            self._initialization_started = True
            # Daemon thread: a hung model load must never block interpreter exit.
            self._initialization_thread = threading.Thread(
                target=self._initialize_ocr,
                name="OCRInitializer",
                daemon=True
            )
            self._initialization_thread.start()
            logger.info("Started lazy OCR initialization in background thread")

    def _ensure_ocr_initialized(self, timeout: float = None):
        """
        Block until OCR initialization is complete.

        Args:
            timeout: Maximum seconds to wait for the initializer thread;
                ``None`` waits indefinitely.

        Returns:
            True if OCR is available, False otherwise.
        """
        if self.ocr_available:
            return True
        if not self._initialization_started:
            self._start_lazy_initialization()
        if self._initialization_thread is not None:
            self._initialization_thread.join(timeout=timeout)
        # After join (or timeout), report whatever state the initializer reached.
        return self.ocr_available

    def _initialize_ocr(self):
        """Initialize PaddleOCR with a shared model instance.

        Runs in the background initializer thread; sets ``ocr_available``
        and builds the temp dir and worker pool on success.
        """
        try:
            logger.info("Initializing optimized OCR processor with batch support")
            # Test if PaddleOCR can be imported
            try:
                from paddleocr import PaddleOCR
                # BUG FIX: the configured ``languages`` list was previously
                # ignored and 'en' was always hard-coded.  Honour the first
                # configured language (the default list still yields 'en',
                # so default behavior is unchanged).
                self._ocr_engine = PaddleOCR(
                    use_gpu=self.use_gpu,
                    use_angle_cls=True,
                    lang=self.languages[0] if self.languages else 'en',
                    show_log=False,
                    gpu_mem=2000
                )
                self._model_loaded = True
                self.ocr_available = True
                logger.info("PaddleOCR initialized successfully with shared model instance")
                # Warm up the model with a dummy image
                self._warm_up_model()
            except ImportError as e:
                logger.error(f"Failed to import PaddleOCR: {e}")
                self.ocr_available = False
                return
            except Exception as e:
                logger.error(f"Failed to initialize PaddleOCR: {e}")
                self.ocr_available = False
                return
            # Create temporary directory for batch processing
            self._temp_dir = tempfile.mkdtemp(prefix="optimized_ocr_")
            # Create thread pool executor for async operations
            self._executor = concurrent.futures.ThreadPoolExecutor(
                max_workers=self.max_workers,
                thread_name_prefix="ocr_worker"
            )
            logger.info(f"Optimized OCR processor ready (GPU: {self.use_gpu}, "
                        f"batch_size: {self.batch_size}, workers: {self.max_workers})")
        except Exception as e:
            logger.error(f"Failed to initialize optimized OCR processor: {e}")
            self.ocr_available = False

    def _warm_up_model(self):
        """Warm up the OCR model with a dummy image to reduce first-call latency."""
        try:
            # A plain white 100x100 RGB image is enough to trigger model load.
            import numpy as np
            test_image = np.ones((100, 100, 3), dtype=np.uint8) * 255
            self._ocr_engine.ocr(test_image, cls=True)
            logger.info("OCR model warmed up successfully")
        except Exception as e:
            # Warm-up is best-effort; a failure here must not disable OCR.
            logger.warning(f"Model warm-up failed: {e}")

    def extract_text_from_image(self, image_path: str) -> Dict[str, Any]:
        """
        Extract text from a single image (backward compatibility).

        Args:
            image_path: Path to image file

        Returns:
            OCR result dictionary with keys ``text``, ``confidence``,
            ``bboxes``, ``line_count`` and ``processing_time``.
        """
        start_time = time.time()
        # Ensure OCR is initialized (wait up to 30 seconds)
        if not self._ensure_ocr_initialized(timeout=30.0):
            logger.warning("OCR not available after waiting")
            # BUG FIX: failure paths now also carry "processing_time" so
            # callers can index the key unconditionally.
            return {"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0,
                    "processing_time": time.time() - start_time}
        try:
            # Process single image using batch method for consistency
            batch_result = self._process_batch_internal([image_path])
            if batch_result and len(batch_result) > 0:
                result = batch_result[0]
                processing_time = time.time() - start_time
                # Update metrics
                self.metrics["total_images_processed"] += 1
                self.metrics["total_processing_time"] += processing_time
                return {
                    "text": result.text,
                    "confidence": result.confidence,
                    "bboxes": result.bboxes,
                    "line_count": result.line_count,
                    "processing_time": processing_time
                }
            else:
                return {"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0,
                        "processing_time": time.time() - start_time}
        except Exception as e:
            logger.error(f"Error processing image {image_path}: {e}")
            self.metrics["errors"].append(str(e))
            return {"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0,
                    "processing_time": time.time() - start_time}

    def extract_text_from_images_batch(self, image_paths: List[str]) -> List[BatchOCRResult]:
        """
        Extract text from multiple images in a batch.

        Args:
            image_paths: List of image file paths

        Returns:
            List of BatchOCRResult objects, one per input path (in order)
        """
        if not image_paths:
            return []
        batch_start_time = time.time()
        # Ensure OCR is initialized (wait up to 30 seconds)
        if not self._ensure_ocr_initialized(timeout=30.0):
            logger.warning("OCR not available for batch processing")
            return [BatchOCRResult(
                image_path=path,
                text="",
                confidence=0.0,
                bboxes=[],
                line_count=0,
                processing_time=0.0,
                error="OCR not available"
            ) for path in image_paths]
        try:
            # Process images in batches of self.batch_size
            all_results = []
            for i in range(0, len(image_paths), self.batch_size):
                batch = image_paths[i:i + self.batch_size]
                batch_results = self._process_batch_internal(batch)
                all_results.extend(batch_results)
            batch_time = time.time() - batch_start_time
            # Update metrics
            self.metrics["total_images_processed"] += len(image_paths)
            self.metrics["total_processing_time"] += batch_time
            self.metrics["batch_processing_times"].append({
                "batch_size": len(image_paths),
                "processing_time": batch_time,
                "avg_time_per_image": batch_time / len(image_paths) if image_paths else 0
            })
            logger.info(f"Processed {len(image_paths)} images in {batch_time:.2f}s "
                        f"({batch_time/len(image_paths):.2f}s per image)")
            return all_results
        except Exception as e:
            logger.error(f"Error in batch processing: {e}")
            self.metrics["errors"].append(str(e))
            # Fall back to individual processing so one bad image cannot
            # sink the whole batch.
            results = []
            for path in image_paths:
                try:
                    single_result = self.extract_text_from_image(path)
                    results.append(BatchOCRResult(
                        image_path=path,
                        text=single_result["text"],
                        confidence=single_result["confidence"],
                        bboxes=single_result["bboxes"],
                        line_count=single_result["line_count"],
                        processing_time=single_result.get("processing_time", 0.0)
                    ))
                except Exception as img_error:
                    results.append(BatchOCRResult(
                        image_path=path,
                        text="",
                        confidence=0.0,
                        bboxes=[],
                        line_count=0,
                        processing_time=0.0,
                        error=str(img_error)
                    ))
            return results

    def _process_batch_internal(self, image_paths: List[str]) -> List[BatchOCRResult]:
        """
        Internal batch processing using the shared OCR engine.

        Args:
            image_paths: List of image file paths

        Returns:
            List of BatchOCRResult objects (per-image errors are captured in
            the ``error`` field rather than raised)
        """
        results = []
        try:
            for image_path in image_paths:
                image_start_time = time.time()
                try:
                    # Use the shared OCR engine
                    ocr_result = self._ocr_engine.ocr(image_path, cls=True)
                    # Parse OCR result: PaddleOCR returns a list per image,
                    # each line shaped as [bbox, (text, confidence)].
                    text_lines = []
                    bboxes = []
                    total_confidence = 0.0
                    line_count = 0
                    if ocr_result and ocr_result[0]:
                        for line in ocr_result[0]:
                            if line and len(line) >= 2:
                                bbox, (text, confidence) = line
                                text_lines.append(str(text))
                                bboxes.append(bbox)
                                total_confidence += float(confidence) if confidence else 0.0
                                line_count += 1
                    avg_confidence = total_confidence / line_count if line_count > 0 else 0.0
                    full_text = "\n".join(text_lines)
                    processing_time = time.time() - image_start_time
                    results.append(BatchOCRResult(
                        image_path=image_path,
                        text=full_text,
                        confidence=avg_confidence,
                        bboxes=bboxes,
                        line_count=line_count,
                        processing_time=processing_time
                    ))
                except Exception as e:
                    processing_time = time.time() - image_start_time
                    logger.error(f"Error processing {image_path}: {e}")
                    results.append(BatchOCRResult(
                        image_path=image_path,
                        text="",
                        confidence=0.0,
                        bboxes=[],
                        line_count=0,
                        processing_time=processing_time,
                        error=str(e)
                    ))
            return results
        except Exception as e:
            logger.error(f"Error in internal batch processing: {e}")
            raise

    async def extract_text_from_images_batch_async(self, image_paths: List[str]) -> List[BatchOCRResult]:
        """
        Async version of batch processing.

        Args:
            image_paths: List of image file paths

        Returns:
            List of BatchOCRResult objects
        """
        # BUG FIX: asyncio.get_event_loop() is deprecated inside coroutines;
        # get_running_loop() is the correct call here (we are always inside
        # a running loop when this coroutine executes).
        loop = asyncio.get_running_loop()
        # Run blocking batch processing in the worker thread pool (falls back
        # to the loop's default executor while self._executor is still None).
        future = loop.run_in_executor(
            self._executor,
            self.extract_text_from_images_batch,
            image_paths
        )
        try:
            results = await asyncio.wait_for(future, timeout=300)  # 5 minute timeout
            return results
        except asyncio.TimeoutError:
            logger.error("OCR batch processing timeout")
            return [BatchOCRResult(
                image_path=path,
                text="",
                confidence=0.0,
                bboxes=[],
                line_count=0,
                processing_time=0.0,
                error="Processing timeout"
            ) for path in image_paths]

    def extract_tables_from_image(self, image_path: str) -> List[Dict[str, Any]]:
        """
        Extract tables from image using OCR and layout analysis.

        Args:
            image_path: Path to image file

        Returns:
            List of table dictionaries (empty list on failure)
        """
        try:
            # Use OCR to get text with bounding boxes
            ocr_result = self.extract_text_from_image(image_path)
            # Simple table detection based on text alignment
            tables = self._detect_tables_from_bboxes(ocr_result["bboxes"], ocr_result["text"])
            return tables
        except Exception as e:
            logger.error(f"Table extraction failed: {e}")
            return []

    def _detect_tables_from_bboxes(self, bboxes: List, text: str) -> List[Dict[str, Any]]:
        """
        Enhanced table detection from OCR bounding boxes with improved accuracy.

        Features:
        1. Adaptive row grouping based on text height
        2. Column alignment detection using common x-coordinates
        3. Header row detection based on formatting patterns
        4. Table boundary validation
        5. Multi-table detection in single image
        """
        tables = []
        if not bboxes or len(bboxes) < 4:  # Need at least 4 text elements for a table
            return tables
        text_lines = text.split('\n') if text else []
        # Step 1: Calculate text height statistics for adaptive row grouping
        text_heights = []
        for bbox in bboxes:
            if not bbox or len(bbox) < 4:
                continue
            try:
                # Get min and max y coordinates
                y_coords = [float(point[1]) for point in bbox if point and len(point) >= 2]
                if y_coords:
                    height = max(y_coords) - min(y_coords)
                    if height > 0:
                        text_heights.append(height)
            except (TypeError, ValueError, IndexError):
                continue
        avg_text_height = sum(text_heights) / len(text_heights) if text_heights else 20.0
        row_tolerance = avg_text_height * 0.8  # 80% of text height for row grouping
        # Step 2: Group text by rows with adaptive tolerance
        rows = {}
        for i, bbox in enumerate(bboxes):
            try:
                if not bbox or len(bbox) < 4:
                    continue
                # Calculate y-center of bounding box (malformed points count as 0.0)
                y_values = []
                for point in bbox:
                    if point and len(point) >= 2:
                        try:
                            y_val = point[1]
                            if isinstance(y_val, (int, float)):
                                y_values.append(float(y_val))
                            elif isinstance(y_val, str):
                                y_values.append(float(y_val))
                            else:
                                y_values.append(0.0)
                        except (TypeError, ValueError):
                            y_values.append(0.0)
                    else:
                        y_values.append(0.0)
                if not y_values:
                    continue
                y_center = sum(y_values) / len(y_values)
                # Find existing row or create new one
                row_found = False
                for row_key in list(rows.keys()):
                    if abs(y_center - row_key) <= row_tolerance:
                        rows[row_key].append((bbox, text_lines[i] if i < len(text_lines) else ""))
                        row_found = True
                        break
                if not row_found:
                    rows[y_center] = [(bbox, text_lines[i] if i < len(text_lines) else "")]
            except Exception as e:
                logger.debug(f"Error processing bbox {i} for table detection: {e}")
                continue
        if len(rows) < 2:  # Need at least 2 rows for a table
            return tables
        # Step 3: Sort rows by y-coordinate and process each row
        sorted_row_keys = sorted(rows.keys())
        sorted_rows = [rows[key] for key in sorted_row_keys]
        # Step 4: Detect column positions using x-coordinate clustering
        all_x_centers = []
        for row in sorted_rows:
            for bbox, _ in row:
                try:
                    if bbox and len(bbox) >= 4:
                        x_coords = [float(point[0]) for point in bbox if point and len(point) >= 1]
                        if x_coords:
                            x_center = sum(x_coords) / len(x_coords)
                            all_x_centers.append(x_center)
                except (TypeError, ValueError, IndexError):
                    continue
        if not all_x_centers:
            return tables
        # Simple column clustering: sort x-centers and group by proximity.
        # NOTE(review): the tolerance uses text *height* as a proxy for
        # character width -- confirm this heuristic on real layouts.
        all_x_centers.sort()
        column_positions = []
        current_cluster = [all_x_centers[0]]
        for x in all_x_centers[1:]:
            if x - current_cluster[-1] <= avg_text_height * 1.5:  # 1.5x text width tolerance
                current_cluster.append(x)
            else:
                column_positions.append(sum(current_cluster) / len(current_cluster))
                current_cluster = [x]
        if current_cluster:
            column_positions.append(sum(current_cluster) / len(current_cluster))
        # Need at least 2 columns for a table
        if len(column_positions) < 2:
            return tables
        # Step 5: Create table structure with proper cell alignment
        column_positions.sort()
        table_data = []
        column_count = len(column_positions)
        for row in sorted_rows:
            # Sort row items by x-coordinate
            def get_x_center(item):
                try:
                    bbox = item[0]
                    if bbox and len(bbox) >= 4:
                        x_coords = [float(point[0]) for point in bbox if point and len(point) >= 1]
                        return sum(x_coords) / len(x_coords) if x_coords else 0.0
                except (TypeError, ValueError, IndexError):
                    pass
                return 0.0
            sorted_row = sorted(row, key=get_x_center)
            # Create row with cells aligned to columns
            row_cells = [""] * column_count
            for bbox, cell_text in sorted_row:
                try:
                    x_center = get_x_center((bbox, cell_text))
                    # Find closest column
                    if column_positions:
                        closest_col = min(range(column_count),
                                          key=lambda i: abs(x_center - column_positions[i]))
                        # Only assign if cell is empty or this text is closer to column center
                        if not row_cells[closest_col] or \
                           abs(x_center - column_positions[closest_col]) < avg_text_height * 0.5:
                            row_cells[closest_col] = cell_text
                except Exception:
                    continue
            # Only add row if it has meaningful content (not all empty)
            if any(cell.strip() for cell in row_cells):
                table_data.append(row_cells)
        # Step 6: Validate table structure
        if len(table_data) >= 2 and column_count >= 2:
            # Calculate table consistency score
            non_empty_cells = sum(1 for row in table_data for cell in row if cell.strip())
            total_cells = len(table_data) * column_count
            fill_ratio = non_empty_cells / total_cells if total_cells > 0 else 0
            # Only accept tables with reasonable fill ratio (20-90%)
            if 0.2 <= fill_ratio <= 0.9:
                # Detect potential header row (first row often has different characteristics)
                has_header = False
                if len(table_data) >= 3:
                    # Check if first row has more text or different formatting
                    first_row_text_len = sum(len(cell) for cell in table_data[0])
                    second_row_text_len = sum(len(cell) for cell in table_data[1])
                    if first_row_text_len > second_row_text_len * 1.5:
                        has_header = True
                tables.append({
                    "data": table_data,
                    "rows": len(table_data),
                    "columns": column_count,
                    "has_header": has_header,
                    "fill_ratio": fill_ratio,
                    "type": "detected_table"
                })
        return tables

    def get_metrics(self) -> Dict[str, Any]:
        """
        Get performance metrics.

        Returns:
            Dictionary of performance metrics plus current processor state
        """
        avg_time_per_image = 0.0
        if self.metrics["total_images_processed"] > 0:
            avg_time_per_image = self.metrics["total_processing_time"] / self.metrics["total_images_processed"]
        return {
            **self.metrics,
            "avg_time_per_image": avg_time_per_image,
            "ocr_available": self.ocr_available,
            "model_loaded": self._model_loaded,
            "batch_size": self.batch_size,
            "max_workers": self.max_workers,
            "use_gpu": self.use_gpu
        }

    def close(self):
        """Clean up resources (idempotent: safe to call more than once)."""
        if self._executor:
            self._executor.shutdown(wait=True)
            self._executor = None
        if self._temp_dir and os.path.exists(self._temp_dir):
            import shutil
            try:
                shutil.rmtree(self._temp_dir)
            except Exception as e:
                logger.warning(f"Failed to delete temporary directory: {e}")
        self._ocr_engine = None
        self._model_loaded = False
        self.ocr_available = False
        logger.info("Optimized OCR processor closed")

    def __del__(self):
        """Destructor: best-effort cleanup.

        BUG FIX: guarded so that a partially-constructed instance (attributes
        missing after an __init__ failure) or interpreter shutdown (module
        globals such as the logger already torn down) cannot raise from the
        destructor.
        """
        try:
            self.close()
        except Exception:
            pass
# Performance test function
def test_optimized_ocr_performance():
    """Test optimized OCR processor performance.

    Returns:
        (single_results, batch_results) tuple, or None when OCR is unavailable.
    """
    print("=== Testing Optimized OCR Processor ===")
    # Create test processor
    processor = OptimizedOCRProcessor(use_gpu=True, batch_size=4, max_workers=2)
    # BUG FIX: initialization runs in a background thread, so checking
    # processor.ocr_available immediately after construction almost always
    # sees False and the test skips spuriously.  Wait for initialization
    # to finish (up to 60s) before deciding, and release resources on skip.
    if not processor._ensure_ocr_initialized(timeout=60.0):
        print("OCR not available, skipping test")
        processor.close()
        return
    # Generate dummy image paths (in real test, these would be actual images)
    test_images = [f"test_image_{i}.png" for i in range(8)]
    # Test single processing
    print("\n1. Testing single image processing:")
    single_start = time.time()
    single_results = []
    for img in test_images[:2]:  # Test with 2 images
        result = processor.extract_text_from_image(img)
        single_results.append(result)
    single_time = time.time() - single_start
    print(f"  Processed {len(test_images[:2])} images in {single_time:.2f}s")
    # Test batch processing
    print("\n2. Testing batch processing (4 images/batch):")
    batch_start = time.time()
    batch_results = processor.extract_text_from_images_batch(test_images)
    batch_time = time.time() - batch_start
    print(f"  Processed {len(test_images)} images in {batch_time:.2f}s")
    print(f"  Average time per image: {batch_time/len(test_images):.2f}s")
    # Show metrics (skip the bulky list-valued entries)
    print("\n3. Performance Metrics:")
    metrics = processor.get_metrics()
    for key, value in metrics.items():
        if key not in ["batch_processing_times", "errors", "bboxes"]:
            print(f"  {key}: {value}")
    processor.close()
    return single_results, batch_results
# Script entry point: run the benchmark when executed directly
# (no effect when the module is imported).
if __name__ == "__main__":
    # Run performance test
    print("Running optimized OCR processor performance test...")
    test_optimized_ocr_performance()