#!/usr/bin/env python3
"""
Optimized Batch OCR Processor
Implements batch processing and async operations for significant performance improvement
"""
import os
import time
import asyncio
import concurrent.futures
from typing import List, Dict, Any
from pathlib import Path
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


def _empty_ocr_result() -> Dict[str, Any]:
    """Canonical "nothing recognized" OCR result shared by every failure path."""
    return {"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0}


class OptimizedBatchOCRProcessor:
    """OCR processor with batch processing and async support."""

    def __init__(self, batch_size: int = 8, max_workers: int = 4):
        # batch_size is advisory only: PaddleOCR has no native batch API,
        # so concurrency is actually governed by max_workers (thread pool size).
        self.batch_size = batch_size
        self.max_workers = max_workers
        self.ocr_engine = None
        self._initialize_ocr()

    def _initialize_ocr(self) -> None:
        """Initialize PaddleOCR with GPU support.

        Raises:
            Exception: re-raised when PaddleOCR cannot be imported or constructed.
        """
        try:
            from paddleocr import PaddleOCR
            logger.info("Initializing PaddleOCR with GPU for batch processing")
            self.ocr_engine = PaddleOCR(
                use_gpu=True,
                use_angle_cls=True,
                lang='en',
                show_log=False,
                gpu_mem=2000
            )
            logger.info("āœ… PaddleOCR initialized successfully for batch processing")
        except Exception as e:
            logger.error(f"āŒ Failed to initialize PaddleOCR: {e}")
            raise

    def process_single_image(self, image_path: str) -> Dict[str, Any]:
        """Process a single image (compatibility method).

        Never raises: returns an empty result when the engine is missing
        or OCR fails, so batch callers see a uniform result shape.
        """
        if not self.ocr_engine:
            return _empty_ocr_result()
        try:
            result = self.ocr_engine.ocr(image_path)
            return self._parse_ocr_result(result)
        except Exception as e:
            logger.error(f"OCR failed for {image_path}: {e}")
            return _empty_ocr_result()

    def process_batch(self, image_paths: List[str]) -> List[Dict[str, Any]]:
        """
        Process multiple images in batch for better performance.

        Note: PaddleOCR doesn't have native batch support, so we use threading.
        Results are returned in the same order as ``image_paths``.
        """
        if not self.ocr_engine:
            return [_empty_ocr_result() for _ in image_paths]
        if not image_paths:
            # Guard: the per-image timing log below would divide by zero.
            return []

        logger.info(f"Processing batch of {len(image_paths)} images")
        start_time = time.time()

        # Pre-size the output and key futures by index so original order is
        # preserved even when image_paths contains duplicate entries
        # (sorting by list.index() would mis-order duplicates and is O(n^2)).
        batch_results: List[Dict[str, Any]] = [_empty_ocr_result() for _ in image_paths]

        # Process in parallel using a thread pool
        with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            future_to_index = {
                executor.submit(self.process_single_image, path): i
                for i, path in enumerate(image_paths)
            }
            # Collect results as they complete
            for future in concurrent.futures.as_completed(future_to_index):
                index = future_to_index[future]
                try:
                    batch_results[index] = future.result(timeout=30)
                except Exception as e:
                    logger.error(f"Failed to process {image_paths[index]}: {e}")
                    # Slot already holds an empty result placeholder.

        total_time = time.time() - start_time
        logger.info(f"āœ… Batch processing completed in {total_time:.3f}s "
                    f"({total_time/len(image_paths):.3f}s per image)")
        return batch_results

    async def process_batch_async(self, image_paths: List[str]) -> List[Dict[str, Any]]:
        """Async version of batch processing (runs in the default executor)."""
        # get_event_loop() inside a coroutine is deprecated; use the running loop.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self.process_batch, image_paths)

    def _parse_ocr_result(self, result) -> Dict[str, Any]:
        """Parse a raw PaddleOCR result into the standardized dict format.

        Accepts the nested ``result[0]`` line list PaddleOCR returns; lines
        that fail to parse contribute empty placeholders so line_count still
        reflects the input.
        """
        if not result or not result[0]:
            return _empty_ocr_result()

        extracted_text: List[str] = []
        bboxes: List[Any] = []
        total_confidence = 0.0
        line_count = 0

        for line in result[0]:
            try:
                if len(line) == 2:
                    # Common shape: [bbox, (text, confidence)]
                    bbox, (text, confidence) = line
                elif len(line) >= 1:
                    # Defensive handling of non-standard line shapes.
                    bbox = line[0]
                    if len(line) > 1:
                        if isinstance(line[1], (list, tuple)) and len(line[1]) >= 2:
                            text, confidence = line[1][0], line[1][1]
                        else:
                            text, confidence = str(line[1]), 0.0
                    else:
                        text, confidence = "", 0.0
                else:
                    continue

                text_str = str(text) if text is not None else ""
                confidence_float = self._coerce_confidence(confidence)

                extracted_text.append(text_str)
                bboxes.append(bbox)
                total_confidence += confidence_float
                line_count += 1
            except (TypeError, ValueError, IndexError) as e:
                # Keep a placeholder entry so counts stay consistent.
                logger.warning(f"Error parsing OCR line: {e}")
                extracted_text.append("")
                bboxes.append([])
                line_count += 1

        avg_confidence = total_confidence / line_count if line_count > 0 else 0.0
        return {
            "text": "\n".join(extracted_text),
            "confidence": avg_confidence,
            "bboxes": bboxes,
            "line_count": line_count
        }

    @staticmethod
    def _coerce_confidence(confidence) -> float:
        """Best-effort conversion of a confidence value to float (0.0 on failure)."""
        if confidence is None:
            return 0.0
        if isinstance(confidence, (int, float)):
            return float(confidence)
        if isinstance(confidence, str):
            try:
                return float(confidence)
            except ValueError:
                return 0.0
        return 0.0


class AsyncDocumentProcessor:
    """Async document processor with optimized pipeline."""

    def __init__(self):
        self.ocr_processor = OptimizedBatchOCRProcessor()
        self.image_extractor = None  # reserved for future use; not yet wired up
        self._initialize_components()

    def _initialize_components(self) -> None:
        """Initialize all processing components."""
        logger.info("Initializing AsyncDocumentProcessor")

    async def extract_images(self, file_path: str) -> List[str]:
        """Extract images from a document asynchronously.

        Returns a list of image file paths; [] for unsupported extensions.
        """
        file_ext = Path(file_path).suffix.lower()
        output_dir = "extracted_images_batch"
        os.makedirs(output_dir, exist_ok=True)

        if file_ext == '.pdf':
            return await self._extract_images_from_pdf(file_path, output_dir)
        if file_ext == '.docx':
            return await self._extract_images_from_docx(file_path, output_dir)
        if file_ext in ['.png', '.jpg', '.jpeg', '.bmp', '.tiff']:
            # Already an image: OCR it directly, no extraction needed.
            return [file_path]
        return []

    async def _extract_images_from_pdf(self, pdf_path: str, output_dir: str) -> List[str]:
        """Render each PDF page to a PNG under output_dir; [] on failure."""
        try:
            from pdf2image import convert_from_path
            logger.info(f"Extracting images from PDF: {pdf_path}")

            # Run in thread pool since pdf2image is CPU-bound
            loop = asyncio.get_running_loop()
            images = await loop.run_in_executor(None, convert_from_path, pdf_path)

            image_paths = []
            for i, image in enumerate(images):
                img_path = os.path.join(output_dir, f"pdf_page_{i+1}.png")
                image.save(img_path, 'PNG')
                image_paths.append(img_path)

            logger.info(f"Extracted {len(image_paths)} images from PDF")
            return image_paths
        except Exception as e:
            logger.error(f"Failed to extract images from PDF: {e}")
            return []

    async def _extract_images_from_docx(self, docx_path: str, output_dir: str) -> List[str]:
        """Extract embedded images from a Word document; [] on failure."""
        try:
            from word_image_extractor import extract_images_from_docx
            logger.info(f"Extracting images from DOCX: {docx_path}")

            loop = asyncio.get_running_loop()
            images = await loop.run_in_executor(
                None, extract_images_from_docx, docx_path, output_dir
            )

            logger.info(f"Extracted {len(images)} images from DOCX")
            return images
        except Exception as e:
            logger.error(f"Failed to extract images from DOCX: {e}")
            return []

    async def process_document(self, file_path: str) -> Dict[str, Any]:
        """Process a document with the optimized async pipeline.

        Never raises: failures are captured in the returned dict under
        ``error`` with ``success`` left False.
        """
        logger.info(f"Processing document: {file_path}")
        start_time = time.time()

        result: Dict[str, Any] = {
            "success": False,
            "file_path": file_path,
            "text_content": "",
            "images": [],
            "processing_time": 0,
            "metadata": {}
        }

        try:
            # Step 1: Extract images (async)
            image_paths = await self.extract_images(file_path)
            result["metadata"]["images_extracted"] = len(image_paths)

            if image_paths:
                # Step 2: Process images in batch (async)
                logger.info(f"Processing {len(image_paths)} images in batch")
                ocr_results = await self.ocr_processor.process_batch_async(image_paths)

                # Step 3: Combine results
                processed_images = []
                all_text = []
                for i, (img_path, ocr_result) in enumerate(zip(image_paths, ocr_results)):
                    processed_images.append({
                        "path": img_path,
                        "ocr_text": ocr_result["text"],
                        "ocr_confidence": ocr_result["confidence"],
                        "line_count": ocr_result["line_count"]
                    })
                    if ocr_result["text"].strip():
                        all_text.append(f"Image {i+1}:\n{ocr_result['text']}")

                result["images"] = processed_images
                result["text_content"] = "\n\n".join(all_text)
                result["metadata"]["images_processed"] = len(processed_images)
                result["metadata"]["total_text_chars"] = len(result["text_content"])

            result["success"] = True
        except Exception as e:
            logger.error(f"Document processing failed: {e}")
            result["error"] = str(e)

        result["processing_time"] = time.time() - start_time
        logger.info(f"Document processing completed in {result['processing_time']:.3f}s")
        return result


# Performance comparison test
async def performance_comparison():
    """Compare performance between single and batch processing."""
    print("šŸš€ PERFORMANCE COMPARISON: SINGLE vs BATCH PROCESSING")
    print("=" * 60)

    # Create test images
    test_images: List[str] = []
    try:
        from PIL import Image
        import tempfile

        print("Creating test images...")
        for i in range(8):
            with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as f:
                img_path = f.name
            # Create test image with some text
            img = Image.new('RGB', (800, 600), color='white')
            img.save(img_path)
            test_images.append(img_path)

        # Test single processing
        print("\nšŸ“Š TEST 1: Single Image Processing (Sequential)")
        single_processor = OptimizedBatchOCRProcessor()
        single_times = []
        for img_path in test_images[:4]:  # Test with 4 images
            start = time.time()
            result = single_processor.process_single_image(img_path)
            elapsed = time.time() - start
            single_times.append(elapsed)
            print(f"   Image {len(single_times)}: {elapsed:.3f}s, {len(result['text'])} chars")

        single_total = sum(single_times)
        print(f"   šŸ“ˆ Total time: {single_total:.3f}s")
        print(f"   šŸ“Š Average per image: {single_total/len(single_times):.3f}s")

        # Test batch processing
        print("\nšŸ“Š TEST 2: Batch Processing (4 images)")
        batch_start = time.time()
        batch_results = single_processor.process_batch(test_images[:4])
        batch_time = time.time() - batch_start
        print(f"   šŸ“ˆ Batch time: {batch_time:.3f}s")
        print(f"   šŸ“Š Average per image: {batch_time/4:.3f}s")

        # Calculate improvement
        improvement = (single_total - batch_time) / single_total * 100
        print(f"   šŸš€ Performance improvement: {improvement:.1f}% faster")

        # Test with all 8 images
        print("\nšŸ“Š TEST 3: Larger Batch (8 images)")
        batch_start = time.time()
        batch_results = single_processor.process_batch(test_images)
        batch_time = time.time() - batch_start
        print(f"   šŸ“ˆ Batch time: {batch_time:.3f}s")
        print(f"   šŸ“Š Average per image: {batch_time/8:.3f}s")

    except Exception as e:
        print(f"āŒ Performance test failed: {e}")
    finally:
        # Cleanup must run even if the test aborts early, or temp PNGs leak.
        for img_path in test_images:
            try:
                os.unlink(img_path)
            except OSError:
                pass


async def test_document_processing():
    """Test the complete document processing pipeline."""
    print("\nšŸ“„ TESTING COMPLETE DOCUMENT PROCESSING PIPELINE")
    print("=" * 60)

    processor = AsyncDocumentProcessor()

    # Test with available documents
    test_files = [
        ("test.docx", "Word document"),
        ("test.pdf", "PDF document"),
        ("ocr.pdf", "OCR test PDF")
    ]

    for file_name, description in test_files:
        if os.path.exists(file_name):
            print(f"\nšŸ“‚ Processing {description}: {file_name}")
            result = await processor.process_document(file_name)
            print(f"   āœ… Success: {result['success']}")
            print(f"   ā±ļø  Processing time: {result['processing_time']:.3f}s")
            print(f"   šŸ“Š Images processed: {result['metadata'].get('images_processed', 0)}")
            print(f"   šŸ“ Total text: {result['metadata'].get('total_text_chars', 0)} chars")
            if result.get('error'):
                print(f"   āŒ Error: {result['error']}")
        else:
            print(f"\nāš ļø Test file not found: {file_name}")


if __name__ == "__main__":
    print("šŸ”§ OPTIMIZED BATCH OCR PROCESSOR")
    print("=" * 50)
    print("This implementation demonstrates:")
    print("1. Batch processing for multiple images")
    print("2. Async/await for I/O operations")
    print("3. Thread pool for CPU-bound tasks")
    print("4. Performance comparison metrics")

    # Run performance comparison
    asyncio.run(performance_comparison())

    # Run document processing test
    asyncio.run(test_document_processing())

    print("\nšŸŽ‰ OPTIMIZATION DEMONSTRATION COMPLETE")
    print("\nšŸ’” Key takeaways:")
    print("   - Batch processing reduces per-image overhead")
    print("   - Async operations prevent blocking on I/O")
    print("   - Thread pool maximizes CPU utilization")
    print("   - Expected improvement: 30-50% faster processing")