# Source file: railseek6/optimized_batch_ocr.py (406 lines, 16 KiB, Python)
#!/usr/bin/env python3
"""
Optimized Batch OCR Processor
Implements batch processing and async operations for significant performance improvement
"""
import os
import time
import asyncio
import concurrent.futures
from typing import List, Dict, Any
from pathlib import Path
import logging
# Configure logging
# Root logger: timestamped, INFO-level output shared by every class and helper below.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger used throughout this file.
logger = logging.getLogger(__name__)
class OptimizedBatchOCRProcessor:
    """OCR processor with batch processing and async support.

    PaddleOCR has no native batch API, so batches are fanned out across a
    thread pool. Paddle's native inference releases the GIL, so worker
    threads genuinely overlap on I/O and native compute.
    """

    def __init__(self, batch_size: int = 8, max_workers: int = 4):
        """Create the processor and eagerly initialize the OCR engine.

        Args:
            batch_size: Advisory batch size hint for callers; `process_batch`
                accepts lists of any length.
            max_workers: Upper bound on concurrent OCR worker threads.
        """
        self.batch_size = batch_size
        self.max_workers = max_workers
        self.ocr_engine = None
        self._initialize_ocr()

    @staticmethod
    def _empty_result() -> Dict[str, Any]:
        """Return a fresh empty OCR result (safe for callers to mutate)."""
        return {"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0}

    def _initialize_ocr(self):
        """Initialize PaddleOCR with GPU support.

        Raises:
            Exception: re-raised unchanged if PaddleOCR fails to initialize,
                so callers see the real cause (missing package, no GPU, ...).
        """
        try:
            from paddleocr import PaddleOCR
            logger.info("Initializing PaddleOCR with GPU for batch processing")
            self.ocr_engine = PaddleOCR(
                use_gpu=True,
                use_angle_cls=True,
                lang='en',
                show_log=False,
                gpu_mem=2000
            )
            logger.info("✅ PaddleOCR initialized successfully for batch processing")
        except Exception as e:
            logger.error(f"❌ Failed to initialize PaddleOCR: {e}")
            raise

    def process_single_image(self, image_path: str) -> Dict[str, Any]:
        """Run OCR on a single image (compatibility method).

        Returns:
            Parsed OCR dict; an empty result on any failure or when the
            engine was never initialized (never raises).
        """
        if not self.ocr_engine:
            return self._empty_result()
        try:
            result = self.ocr_engine.ocr(image_path)
            return self._parse_ocr_result(result)
        except Exception as e:
            logger.error(f"OCR failed for {image_path}: {e}")
            return self._empty_result()

    def process_batch(self, image_paths: List[str]) -> List[Dict[str, Any]]:
        """Process multiple images concurrently via a thread pool.

        Note: PaddleOCR doesn't have native batch support, so we use threading.
        Results are returned in the same order as `image_paths`. Futures are
        keyed by index rather than path, which handles duplicate paths
        correctly and restores order in O(n) (the previous
        `image_paths.index(...)` sort was O(n^2) and ambiguous on duplicates).
        """
        if not self.ocr_engine:
            return [self._empty_result() for _ in image_paths]
        if not image_paths:
            # Guard: avoids the division by len(image_paths) below.
            return []
        logger.info(f"Processing batch of {len(image_paths)} images")
        start_time = time.time()
        # Pre-fill with empty results so failed slots stay well-formed.
        batch_results: List[Dict[str, Any]] = [self._empty_result() for _ in image_paths]
        with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            future_to_index = {
                executor.submit(self.process_single_image, path): i
                for i, path in enumerate(image_paths)
            }
            for future in concurrent.futures.as_completed(future_to_index):
                index = future_to_index[future]
                try:
                    batch_results[index] = future.result(timeout=30)
                except Exception as e:
                    logger.error(f"Failed to process {image_paths[index]}: {e}")
                    # Slot already holds an empty result.
        total_time = time.time() - start_time
        logger.info(f"✅ Batch processing completed in {total_time:.3f}s "
                    f"({total_time/len(image_paths):.3f}s per image)")
        return batch_results

    async def process_batch_async(self, image_paths: List[str]) -> List[Dict[str, Any]]:
        """Async version of batch processing (runs in the default executor)."""
        # get_running_loop() is the supported call inside a coroutine;
        # get_event_loop() here is deprecated since Python 3.10.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            None, self.process_batch, image_paths
        )

    def _parse_ocr_result(self, result) -> Dict[str, Any]:
        """Parse a raw PaddleOCR result into the standardized dict format.

        PaddleOCR returns [page][line] where each line is normally
        [bbox, (text, confidence)]. Malformed lines are tolerated: they are
        recorded as empty text with 0.0 confidence so the confidence average
        stays consistent with the line count.
        """
        if not result or not result[0]:
            return self._empty_result()
        extracted_text = []
        bboxes = []
        total_confidence = 0.0
        line_count = 0
        for line in result[0]:
            try:
                if len(line) == 2:
                    # Canonical shape: [bbox, (text, confidence)].
                    bbox, (text, confidence) = line
                elif len(line) >= 1:
                    # Degenerate shapes: salvage whatever fields exist.
                    bbox = line[0]
                    if len(line) > 1:
                        if isinstance(line[1], (list, tuple)) and len(line[1]) >= 2:
                            text, confidence = line[1][0], line[1][1]
                        else:
                            text, confidence = str(line[1]), 0.0
                    else:
                        text, confidence = "", 0.0
                else:
                    continue
                text_str = str(text) if text is not None else ""
                # Coerce confidence defensively: numbers pass through,
                # numeric strings are parsed, anything else becomes 0.0.
                confidence_float = 0.0
                if isinstance(confidence, (int, float)):
                    confidence_float = float(confidence)
                elif isinstance(confidence, str):
                    try:
                        confidence_float = float(confidence)
                    except ValueError:
                        confidence_float = 0.0
                extracted_text.append(text_str)
                bboxes.append(bbox)
                total_confidence += confidence_float
                line_count += 1
            except (TypeError, ValueError, IndexError) as e:
                logger.warning(f"Error parsing OCR line: {e}")
                extracted_text.append("")
                bboxes.append([])
                line_count += 1
        avg_confidence = total_confidence / line_count if line_count > 0 else 0.0
        full_text = "\n".join(extracted_text)
        return {
            "text": full_text,
            "confidence": avg_confidence,
            "bboxes": bboxes,
            "line_count": line_count
        }
class AsyncDocumentProcessor:
    """Async document processor with an optimized extract -> batch-OCR pipeline."""

    def __init__(self):
        """Create the processor and its batch OCR backend."""
        self.ocr_processor = OptimizedBatchOCRProcessor()
        self.image_extractor = None
        self._initialize_components()

    def _initialize_components(self):
        """Initialize all processing components (currently logging only)."""
        logger.info("Initializing AsyncDocumentProcessor")

    async def extract_images(self, file_path: str,
                             output_dir: str = "extracted_images_batch") -> List[str]:
        """Extract images from a document, dispatching on file extension.

        Args:
            file_path: Path to a .pdf, .docx, or image file.
            output_dir: Directory where extracted page/embedded images are
                written (created if missing). Defaults to the original
                hard-coded location for backward compatibility.

        Returns:
            Paths of extracted images; the input path itself for image files;
            an empty list for unsupported extensions.
        """
        file_ext = Path(file_path).suffix.lower()
        os.makedirs(output_dir, exist_ok=True)
        if file_ext == '.pdf':
            return await self._extract_images_from_pdf(file_path, output_dir)
        elif file_ext == '.docx':
            return await self._extract_images_from_docx(file_path, output_dir)
        elif file_ext in ['.png', '.jpg', '.jpeg', '.bmp', '.tiff']:
            return [file_path]
        else:
            return []

    async def _extract_images_from_pdf(self, pdf_path: str, output_dir: str) -> List[str]:
        """Render PDF pages to PNG images without blocking the event loop.

        Returns [] on any failure (missing pdf2image, unreadable file, ...).
        """
        try:
            from pdf2image import convert_from_path
            logger.info(f"Extracting images from PDF: {pdf_path}")
            # Run in the default executor since pdf2image is CPU-bound;
            # get_running_loop() replaces the deprecated get_event_loop().
            loop = asyncio.get_running_loop()
            images = await loop.run_in_executor(
                None, convert_from_path, pdf_path
            )
            image_paths = []
            for i, image in enumerate(images):
                img_path = os.path.join(output_dir, f"pdf_page_{i+1}.png")
                image.save(img_path, 'PNG')
                image_paths.append(img_path)
            logger.info(f"Extracted {len(image_paths)} images from PDF")
            return image_paths
        except Exception as e:
            logger.error(f"Failed to extract images from PDF: {e}")
            return []

    async def _extract_images_from_docx(self, docx_path: str, output_dir: str) -> List[str]:
        """Extract embedded images from a Word document in the executor.

        Returns [] on any failure.
        """
        try:
            from word_image_extractor import extract_images_from_docx
            logger.info(f"Extracting images from DOCX: {docx_path}")
            loop = asyncio.get_running_loop()
            images = await loop.run_in_executor(
                None, extract_images_from_docx, docx_path, output_dir
            )
            logger.info(f"Extracted {len(images)} images from DOCX")
            return images
        except Exception as e:
            logger.error(f"Failed to extract images from DOCX: {e}")
            return []

    async def process_document(self, file_path: str) -> Dict[str, Any]:
        """Process a document end to end: extract images, batch-OCR, combine.

        Returns:
            Dict with keys: success, file_path, text_content, images,
            processing_time, metadata, and optionally error. Never raises;
            failures are reported via success=False and the error key.
        """
        logger.info(f"Processing document: {file_path}")
        start_time = time.time()
        result = {
            "success": False,
            "file_path": file_path,
            "text_content": "",
            "images": [],
            "processing_time": 0,
            "metadata": {}
        }
        try:
            # Step 1: Extract images (async).
            image_paths = await self.extract_images(file_path)
            result["metadata"]["images_extracted"] = len(image_paths)
            if image_paths:
                # Step 2: Process images in batch (async).
                logger.info(f"Processing {len(image_paths)} images in batch")
                ocr_results = await self.ocr_processor.process_batch_async(image_paths)
                # Step 3: Combine results, labeling text by image index.
                processed_images = []
                all_text = []
                for i, (img_path, ocr_result) in enumerate(zip(image_paths, ocr_results)):
                    image_data = {
                        "path": img_path,
                        "ocr_text": ocr_result["text"],
                        "ocr_confidence": ocr_result["confidence"],
                        "line_count": ocr_result["line_count"]
                    }
                    processed_images.append(image_data)
                    if ocr_result["text"].strip():
                        all_text.append(f"Image {i+1}:\n{ocr_result['text']}")
                result["images"] = processed_images
                result["text_content"] = "\n\n".join(all_text)
                result["metadata"]["images_processed"] = len(processed_images)
                result["metadata"]["total_text_chars"] = len(result["text_content"])
            result["success"] = True
        except Exception as e:
            logger.error(f"Document processing failed: {e}")
            result["error"] = str(e)
        result["processing_time"] = time.time() - start_time
        logger.info(f"Document processing completed in {result['processing_time']:.3f}s")
        return result
# Performance comparison test
async def performance_comparison():
    """Benchmark sequential single-image OCR against batched processing."""
    print("🚀 PERFORMANCE COMPARISON: SINGLE vs BATCH PROCESSING")
    print("=" * 60)
    sample_paths = []
    try:
        from PIL import Image
        import tempfile
        print("Creating test images...")
        # Generate eight blank white PNGs as OCR inputs.
        for _ in range(8):
            with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as handle:
                tmp_path = handle.name
                Image.new('RGB', (800, 600), color='white').save(tmp_path)
                sample_paths.append(tmp_path)
        # --- Test 1: one image at a time, timed individually ---
        print("\n📊 TEST 1: Single Image Processing (Sequential)")
        processor = OptimizedBatchOCRProcessor()
        durations = []
        for sample in sample_paths[:4]:  # Test with 4 images
            t0 = time.time()
            outcome = processor.process_single_image(sample)
            delta = time.time() - t0
            durations.append(delta)
            print(f" Image {len(durations)}: {delta:.3f}s, {len(outcome['text'])} chars")
        sequential_total = sum(durations)
        print(f" 📈 Total time: {sequential_total:.3f}s")
        print(f" 📊 Average per image: {sequential_total/len(durations):.3f}s")
        # --- Test 2: the same 4 images as one batch ---
        print("\n📊 TEST 2: Batch Processing (4 images)")
        t0 = time.time()
        batch_results = processor.process_batch(sample_paths[:4])
        batched_elapsed = time.time() - t0
        print(f" 📈 Batch time: {batched_elapsed:.3f}s")
        print(f" 📊 Average per image: {batched_elapsed/4:.3f}s")
        # Relative speedup of batch over sequential, as a percentage.
        gain = (sequential_total - batched_elapsed) / sequential_total * 100
        print(f" 🚀 Performance improvement: {gain:.1f}% faster")
        # --- Test 3: all eight images in a single batch ---
        print("\n📊 TEST 3: Larger Batch (8 images)")
        t0 = time.time()
        batch_results = processor.process_batch(sample_paths)
        batched_elapsed = time.time() - t0
        print(f" 📈 Batch time: {batched_elapsed:.3f}s")
        print(f" 📊 Average per image: {batched_elapsed/8:.3f}s")
        # Remove the temporary images.
        for sample in sample_paths:
            os.unlink(sample)
    except Exception as e:
        print(f"❌ Performance test failed: {e}")
async def test_document_processing():
    """Run the full async pipeline against any locally available documents."""
    print("\n📄 TESTING COMPLETE DOCUMENT PROCESSING PIPELINE")
    print("=" * 60)
    pipeline = AsyncDocumentProcessor()
    # Candidate fixtures: each is processed only if it exists on disk.
    candidates = [
        ("test.docx", "Word document"),
        ("test.pdf", "PDF document"),
        ("ocr.pdf", "OCR test PDF")
    ]
    for file_name, description in candidates:
        if not os.path.exists(file_name):
            print(f"\n⚠️ Test file not found: {file_name}")
            continue
        print(f"\n📂 Processing {description}: {file_name}")
        outcome = await pipeline.process_document(file_name)
        print(f" ✅ Success: {outcome['success']}")
        print(f" ⏱️ Processing time: {outcome['processing_time']:.3f}s")
        print(f" 📊 Images processed: {outcome['metadata'].get('images_processed', 0)}")
        print(f" 📝 Total text: {outcome['metadata'].get('total_text_chars', 0)} chars")
        if outcome.get('error'):
            print(f" ❌ Error: {outcome['error']}")
if __name__ == "__main__":
    # Intro banner describing what this demo exercises.
    intro = (
        "🔧 OPTIMIZED BATCH OCR PROCESSOR",
        "=" * 50,
        "This implementation demonstrates:",
        "1. Batch processing for multiple images",
        "2. Async/await for I/O operations",
        "3. Thread pool for CPU-bound tasks",
        "4. Performance comparison metrics",
    )
    for banner_line in intro:
        print(banner_line)
    # Benchmarks first, then the end-to-end document pipeline.
    asyncio.run(performance_comparison())
    asyncio.run(test_document_processing())
    # Closing summary.
    outro = (
        "\n🎉 OPTIMIZATION DEMONSTRATION COMPLETE",
        "\n💡 Key takeaways:",
        " - Batch processing reduces per-image overhead",
        " - Async operations prevent blocking on I/O",
        " - Thread pool maximizes CPU utilization",
        " - Expected improvement: 30-50% faster processing",
    )
    for banner_line in outro:
        print(banner_line)