# Source file: railseek6/optimized_batch_ocr.py (406 lines, 16 KiB, Python)
#!/usr/bin/env python3
"""
Optimized Batch OCR Processor
Implements batch processing and async operations for significant performance improvement
"""
import os
import time
import asyncio
import concurrent.futures
from typing import List, Dict, Any
from pathlib import Path
import logging
# Configure logging
# Root logger: timestamped, INFO-level output shared by every class and helper below.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger used throughout this file.
logger = logging.getLogger(__name__)
class OptimizedBatchOCRProcessor:
    """OCR processor with batch processing and async support.

    PaddleOCR has no native batch API, so batches are fanned out across a
    thread pool. Paddle's native inference releases the GIL, so worker
    threads genuinely overlap on I/O and native compute.
    """

    def __init__(self, batch_size: int = 8, max_workers: int = 4):
        """Create the processor and eagerly initialize the OCR engine.

        Args:
            batch_size: Advisory batch size hint for callers; `process_batch`
                accepts lists of any length.
            max_workers: Upper bound on concurrent OCR worker threads.
        """
        self.batch_size = batch_size
        self.max_workers = max_workers
        self.ocr_engine = None
        self._initialize_ocr()

    @staticmethod
    def _empty_result() -> Dict[str, Any]:
        """Return a fresh empty OCR result (safe for callers to mutate)."""
        return {"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0}

    def _initialize_ocr(self):
        """Initialize PaddleOCR with GPU support.

        Raises:
            Exception: re-raised unchanged if PaddleOCR fails to initialize,
                so callers see the real cause (missing package, no GPU, ...).
        """
        try:
            from paddleocr import PaddleOCR
            logger.info("Initializing PaddleOCR with GPU for batch processing")
            self.ocr_engine = PaddleOCR(
                use_gpu=True,
                use_angle_cls=True,
                lang='en',
                show_log=False,
                gpu_mem=2000
            )
            logger.info("✅ PaddleOCR initialized successfully for batch processing")
        except Exception as e:
            logger.error(f"❌ Failed to initialize PaddleOCR: {e}")
            raise

    def process_single_image(self, image_path: str) -> Dict[str, Any]:
        """Run OCR on a single image (compatibility method).

        Returns:
            Parsed OCR dict; an empty result on any failure or when the
            engine was never initialized (never raises).
        """
        if not self.ocr_engine:
            return self._empty_result()
        try:
            result = self.ocr_engine.ocr(image_path)
            return self._parse_ocr_result(result)
        except Exception as e:
            logger.error(f"OCR failed for {image_path}: {e}")
            return self._empty_result()

    def process_batch(self, image_paths: List[str]) -> List[Dict[str, Any]]:
        """Process multiple images concurrently via a thread pool.

        Note: PaddleOCR doesn't have native batch support, so we use threading.
        Results are returned in the same order as `image_paths`. Futures are
        keyed by index rather than path, which handles duplicate paths
        correctly and restores order in O(n) (the previous
        `image_paths.index(...)` sort was O(n^2) and ambiguous on duplicates).
        """
        if not self.ocr_engine:
            return [self._empty_result() for _ in image_paths]
        if not image_paths:
            # Guard: avoids the division by len(image_paths) below.
            return []
        logger.info(f"Processing batch of {len(image_paths)} images")
        start_time = time.time()
        # Pre-fill with empty results so failed slots stay well-formed.
        batch_results: List[Dict[str, Any]] = [self._empty_result() for _ in image_paths]
        with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            future_to_index = {
                executor.submit(self.process_single_image, path): i
                for i, path in enumerate(image_paths)
            }
            for future in concurrent.futures.as_completed(future_to_index):
                index = future_to_index[future]
                try:
                    batch_results[index] = future.result(timeout=30)
                except Exception as e:
                    logger.error(f"Failed to process {image_paths[index]}: {e}")
                    # Slot already holds an empty result.
        total_time = time.time() - start_time
        logger.info(f"✅ Batch processing completed in {total_time:.3f}s "
                    f"({total_time/len(image_paths):.3f}s per image)")
        return batch_results

    async def process_batch_async(self, image_paths: List[str]) -> List[Dict[str, Any]]:
        """Async version of batch processing (runs in the default executor)."""
        # get_running_loop() is the supported call inside a coroutine;
        # get_event_loop() here is deprecated since Python 3.10.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            None, self.process_batch, image_paths
        )

    def _parse_ocr_result(self, result) -> Dict[str, Any]:
        """Parse a raw PaddleOCR result into the standardized dict format.

        PaddleOCR returns [page][line] where each line is normally
        [bbox, (text, confidence)]. Malformed lines are tolerated: they are
        recorded as empty text with 0.0 confidence so the confidence average
        stays consistent with the line count.
        """
        if not result or not result[0]:
            return self._empty_result()
        extracted_text = []
        bboxes = []
        total_confidence = 0.0
        line_count = 0
        for line in result[0]:
            try:
                if len(line) == 2:
                    # Canonical shape: [bbox, (text, confidence)].
                    bbox, (text, confidence) = line
                elif len(line) >= 1:
                    # Degenerate shapes: salvage whatever fields exist.
                    bbox = line[0]
                    if len(line) > 1:
                        if isinstance(line[1], (list, tuple)) and len(line[1]) >= 2:
                            text, confidence = line[1][0], line[1][1]
                        else:
                            text, confidence = str(line[1]), 0.0
                    else:
                        text, confidence = "", 0.0
                else:
                    continue
                text_str = str(text) if text is not None else ""
                # Coerce confidence defensively: numbers pass through,
                # numeric strings are parsed, anything else becomes 0.0.
                confidence_float = 0.0
                if isinstance(confidence, (int, float)):
                    confidence_float = float(confidence)
                elif isinstance(confidence, str):
                    try:
                        confidence_float = float(confidence)
                    except ValueError:
                        confidence_float = 0.0
                extracted_text.append(text_str)
                bboxes.append(bbox)
                total_confidence += confidence_float
                line_count += 1
            except (TypeError, ValueError, IndexError) as e:
                logger.warning(f"Error parsing OCR line: {e}")
                extracted_text.append("")
                bboxes.append([])
                line_count += 1
        avg_confidence = total_confidence / line_count if line_count > 0 else 0.0
        full_text = "\n".join(extracted_text)
        return {
            "text": full_text,
            "confidence": avg_confidence,
            "bboxes": bboxes,
            "line_count": line_count
        }
class AsyncDocumentProcessor:
    """Async document processor with an optimized extract -> batch-OCR pipeline."""

    def __init__(self):
        """Create the processor and its batch OCR backend."""
        self.ocr_processor = OptimizedBatchOCRProcessor()
        self.image_extractor = None
        self._initialize_components()

    def _initialize_components(self):
        """Initialize all processing components (currently logging only)."""
        logger.info("Initializing AsyncDocumentProcessor")

    async def extract_images(self, file_path: str,
                             output_dir: str = "extracted_images_batch") -> List[str]:
        """Extract images from a document, dispatching on file extension.

        Args:
            file_path: Path to a .pdf, .docx, or image file.
            output_dir: Directory where extracted page/embedded images are
                written (created if missing). Defaults to the original
                hard-coded location for backward compatibility.

        Returns:
            Paths of extracted images; the input path itself for image files;
            an empty list for unsupported extensions.
        """
        file_ext = Path(file_path).suffix.lower()
        os.makedirs(output_dir, exist_ok=True)
        if file_ext == '.pdf':
            return await self._extract_images_from_pdf(file_path, output_dir)
        elif file_ext == '.docx':
            return await self._extract_images_from_docx(file_path, output_dir)
        elif file_ext in ['.png', '.jpg', '.jpeg', '.bmp', '.tiff']:
            return [file_path]
        else:
            return []

    async def _extract_images_from_pdf(self, pdf_path: str, output_dir: str) -> List[str]:
        """Render PDF pages to PNG images without blocking the event loop.

        Returns [] on any failure (missing pdf2image, unreadable file, ...).
        """
        try:
            from pdf2image import convert_from_path
            logger.info(f"Extracting images from PDF: {pdf_path}")
            # Run in the default executor since pdf2image is CPU-bound;
            # get_running_loop() replaces the deprecated get_event_loop().
            loop = asyncio.get_running_loop()
            images = await loop.run_in_executor(
                None, convert_from_path, pdf_path
            )
            image_paths = []
            for i, image in enumerate(images):
                img_path = os.path.join(output_dir, f"pdf_page_{i+1}.png")
                image.save(img_path, 'PNG')
                image_paths.append(img_path)
            logger.info(f"Extracted {len(image_paths)} images from PDF")
            return image_paths
        except Exception as e:
            logger.error(f"Failed to extract images from PDF: {e}")
            return []

    async def _extract_images_from_docx(self, docx_path: str, output_dir: str) -> List[str]:
        """Extract embedded images from a Word document in the executor.

        Returns [] on any failure.
        """
        try:
            from word_image_extractor import extract_images_from_docx
            logger.info(f"Extracting images from DOCX: {docx_path}")
            loop = asyncio.get_running_loop()
            images = await loop.run_in_executor(
                None, extract_images_from_docx, docx_path, output_dir
            )
            logger.info(f"Extracted {len(images)} images from DOCX")
            return images
        except Exception as e:
            logger.error(f"Failed to extract images from DOCX: {e}")
            return []

    async def process_document(self, file_path: str) -> Dict[str, Any]:
        """Process a document end to end: extract images, batch-OCR, combine.

        Returns:
            Dict with keys: success, file_path, text_content, images,
            processing_time, metadata, and optionally error. Never raises;
            failures are reported via success=False and the error key.
        """
        logger.info(f"Processing document: {file_path}")
        start_time = time.time()
        result = {
            "success": False,
            "file_path": file_path,
            "text_content": "",
            "images": [],
            "processing_time": 0,
            "metadata": {}
        }
        try:
            # Step 1: Extract images (async).
            image_paths = await self.extract_images(file_path)
            result["metadata"]["images_extracted"] = len(image_paths)
            if image_paths:
                # Step 2: Process images in batch (async).
                logger.info(f"Processing {len(image_paths)} images in batch")
                ocr_results = await self.ocr_processor.process_batch_async(image_paths)
                # Step 3: Combine results, labeling text by image index.
                processed_images = []
                all_text = []
                for i, (img_path, ocr_result) in enumerate(zip(image_paths, ocr_results)):
                    image_data = {
                        "path": img_path,
                        "ocr_text": ocr_result["text"],
                        "ocr_confidence": ocr_result["confidence"],
                        "line_count": ocr_result["line_count"]
                    }
                    processed_images.append(image_data)
                    if ocr_result["text"].strip():
                        all_text.append(f"Image {i+1}:\n{ocr_result['text']}")
                result["images"] = processed_images
                result["text_content"] = "\n\n".join(all_text)
                result["metadata"]["images_processed"] = len(processed_images)
                result["metadata"]["total_text_chars"] = len(result["text_content"])
            result["success"] = True
        except Exception as e:
            logger.error(f"Document processing failed: {e}")
            result["error"] = str(e)
        result["processing_time"] = time.time() - start_time
        logger.info(f"Document processing completed in {result['processing_time']:.3f}s")
        return result
# Performance comparison test
async def performance_comparison():
    """Benchmark sequential single-image OCR against batched processing."""
    print("🚀 PERFORMANCE COMPARISON: SINGLE vs BATCH PROCESSING")
    print("=" * 60)
    sample_paths = []
    try:
        from PIL import Image
        import tempfile
        print("Creating test images...")
        # Generate eight blank white PNGs as OCR inputs.
        for _ in range(8):
            with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as handle:
                tmp_path = handle.name
                Image.new('RGB', (800, 600), color='white').save(tmp_path)
                sample_paths.append(tmp_path)
        # --- Test 1: one image at a time, timed individually ---
        print("\n📊 TEST 1: Single Image Processing (Sequential)")
        processor = OptimizedBatchOCRProcessor()
        durations = []
        for sample in sample_paths[:4]:  # Test with 4 images
            t0 = time.time()
            outcome = processor.process_single_image(sample)
            delta = time.time() - t0
            durations.append(delta)
            print(f" Image {len(durations)}: {delta:.3f}s, {len(outcome['text'])} chars")
        sequential_total = sum(durations)
        print(f" 📈 Total time: {sequential_total:.3f}s")
        print(f" 📊 Average per image: {sequential_total/len(durations):.3f}s")
        # --- Test 2: the same 4 images as one batch ---
        print("\n📊 TEST 2: Batch Processing (4 images)")
        t0 = time.time()
        batch_results = processor.process_batch(sample_paths[:4])
        batched_elapsed = time.time() - t0
        print(f" 📈 Batch time: {batched_elapsed:.3f}s")
        print(f" 📊 Average per image: {batched_elapsed/4:.3f}s")
        # Relative speedup of batch over sequential, as a percentage.
        gain = (sequential_total - batched_elapsed) / sequential_total * 100
        print(f" 🚀 Performance improvement: {gain:.1f}% faster")
        # --- Test 3: all eight images in a single batch ---
        print("\n📊 TEST 3: Larger Batch (8 images)")
        t0 = time.time()
        batch_results = processor.process_batch(sample_paths)
        batched_elapsed = time.time() - t0
        print(f" 📈 Batch time: {batched_elapsed:.3f}s")
        print(f" 📊 Average per image: {batched_elapsed/8:.3f}s")
        # Remove the temporary images.
        for sample in sample_paths:
            os.unlink(sample)
    except Exception as e:
        print(f"❌ Performance test failed: {e}")
async def test_document_processing():
    """Run the full async pipeline against any locally available documents."""
    print("\n📄 TESTING COMPLETE DOCUMENT PROCESSING PIPELINE")
    print("=" * 60)
    pipeline = AsyncDocumentProcessor()
    # Candidate fixtures: each is processed only if it exists on disk.
    candidates = [
        ("test.docx", "Word document"),
        ("test.pdf", "PDF document"),
        ("ocr.pdf", "OCR test PDF")
    ]
    for file_name, description in candidates:
        if not os.path.exists(file_name):
            print(f"\n⚠️ Test file not found: {file_name}")
            continue
        print(f"\n📂 Processing {description}: {file_name}")
        outcome = await pipeline.process_document(file_name)
        print(f" ✅ Success: {outcome['success']}")
        print(f" ⏱️ Processing time: {outcome['processing_time']:.3f}s")
        print(f" 📊 Images processed: {outcome['metadata'].get('images_processed', 0)}")
        print(f" 📝 Total text: {outcome['metadata'].get('total_text_chars', 0)} chars")
        if outcome.get('error'):
            print(f" ❌ Error: {outcome['error']}")
if __name__ == "__main__":
    # Intro banner describing what this demo exercises.
    intro = (
        "🔧 OPTIMIZED BATCH OCR PROCESSOR",
        "=" * 50,
        "This implementation demonstrates:",
        "1. Batch processing for multiple images",
        "2. Async/await for I/O operations",
        "3. Thread pool for CPU-bound tasks",
        "4. Performance comparison metrics",
    )
    for banner_line in intro:
        print(banner_line)
    # Benchmarks first, then the end-to-end document pipeline.
    asyncio.run(performance_comparison())
    asyncio.run(test_document_processing())
    # Closing summary.
    outro = (
        "\n🎉 OPTIMIZATION DEMONSTRATION COMPLETE",
        "\n💡 Key takeaways:",
        " - Batch processing reduces per-image overhead",
        " - Async operations prevent blocking on I/O",
        " - Thread pool maximizes CPU utilization",
        " - Expected improvement: 30-50% faster processing",
    )
    for banner_line in outro:
        print(banner_line)