""" Optimized Document Processor with Async Pipeline and Batch OCR Replaces the sequential processing with parallel pipeline stages """ import os import logging import asyncio import concurrent.futures from typing import Dict, List, Any, Optional, Tuple from dataclasses import dataclass import tempfile from pathlib import Path import time from collections import defaultdict # Import required libraries import fitz # PyMuPDF import docx import openpyxl from pptx import Presentation from bs4 import BeautifulSoup import pandas as pd from .optimized_ocr_processor import OptimizedOCRProcessor, BatchOCRResult from .production_config import get_config # Configure logging logger = logging.getLogger(__name__) @dataclass class ProcessingResult: """Result of document processing""" success: bool content: str metadata: Dict[str, Any] error: Optional[str] = None tables: List[Dict[str, Any]] = None images: List[Dict[str, Any]] = None processing_time: float = 0.0 class AsyncDocumentProcessor: """ Async document processor with parallel pipeline stages and batch OCR """ def __init__(self, batch_size: int = 4, max_workers: int = 2): """ Initialize async document processor Args: batch_size: Number of images to process in each OCR batch max_workers: Maximum number of parallel workers """ self.config = get_config() self.batch_size = batch_size self.max_workers = max_workers # Initialize optimized OCR processor self.ocr_processor = OptimizedOCRProcessor( use_gpu=self.config.performance.USE_GPU, languages=self.config.document_processing.OCR_LANGUAGES, batch_size=batch_size, max_workers=max_workers ) self.supported_extensions = self.config.document_processing.SUPPORTED_EXTENSIONS # Initialize image classifier if available self.image_classifier = None try: # Add the workspace directory to path where fast_image_classifier.py is located import sys workspace_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) if workspace_dir not in sys.path: sys.path.insert(0, workspace_dir) from fast_image_classifier import get_image_classifier self.image_classifier = get_image_classifier() logger.info("Image classifier initialized successfully") except Exception as e: logger.warning(f"Failed to initialize image classifier: {e}") # Performance metrics self.metrics = { "documents_processed": 0, "total_processing_time": 0.0, "pages_processed": 0, "images_processed": 0, "ocr_batches_processed": 0 } logger.info(f"Async document processor initialized (batch_size: {batch_size}, workers: {max_workers})") async def process_document(self, file_path: str) -> ProcessingResult: """ Process document with async pipeline Args: file_path: Path to document file Returns: ProcessingResult object """ start_time = time.time() file_path = Path(file_path) if not file_path.exists(): return ProcessingResult( success=False, content="", metadata={"error": "File not found"}, error="File not found", processing_time=time.time() - start_time ) # Determine file type and process accordingly extension = file_path.suffix.lower() try: if extension in ['.pdf']: result = await self._process_pdf_async(file_path) elif extension in ['.doc', '.docx']: result = await self._process_word_async(file_path) elif extension in ['.xls', '.xlsx']: result = await self._process_excel_async(file_path) elif extension in ['.ppt', '.pptx']: result = await self._process_powerpoint_async(file_path) elif extension in ['.txt', '.csv', '.html']: result = await self._process_text_async(file_path) elif extension in ['.jpg', '.jpeg', '.png', '.tiff', '.bmp', '.gif']: result = await self._process_image_async(file_path) else: result = ProcessingResult( success=False, content="", metadata={"error": f"Unsupported file type: {extension}"}, error=f"Unsupported file type: {extension}", processing_time=time.time() - start_time ) # Update metrics processing_time = time.time() - start_time result.processing_time = processing_time self.metrics["documents_processed"] += 1 self.metrics["total_processing_time"] += processing_time return result except Exception as e: logger.error(f"Error processing {file_path}: {e}") processing_time = time.time() - start_time return ProcessingResult( success=False, content="", metadata={"error": str(e)}, error=str(e), processing_time=processing_time ) async def _process_pdf_async(self, file_path: Path) -> ProcessingResult: """ Process PDF files with async pipeline Args: file_path: Path to PDF file Returns: ProcessingResult object """ pdf_document = None try: # Open PDF pdf_document = fitz.open(str(file_path)) total_pages = len(pdf_document) # Create async tasks for each page page_tasks = [] for page_num in range(total_pages): task = self._process_pdf_page_async(pdf_document[page_num], page_num) page_tasks.append(task) # Process pages in parallel page_results = await asyncio.gather(*page_tasks, return_exceptions=True) # Combine results content_parts = [] tables = [] images = [] processed_with_ocr = False for i, result in enumerate(page_results): if isinstance(result, Exception): logger.error(f"Error processing page {i}: {result}") content_parts.append(f"Page {i + 1}: [Processing error: {str(result)}]") else: page_content, page_tables, page_images, used_ocr = result content_parts.append(page_content) tables.extend(page_tables) images.extend(page_images) if used_ocr: processed_with_ocr = True full_content = "\n\n".join(content_parts) # Update metrics self.metrics["pages_processed"] += total_pages return ProcessingResult( success=True, content=full_content, metadata={ "pages": total_pages, "file_type": "pdf", "processed_with_ocr": processed_with_ocr }, tables=tables, images=images ) except Exception as e: logger.error(f"PDF processing failed: {e}") raise finally: if pdf_document: pdf_document.close() async def _process_pdf_page_async(self, page, page_num: int) -> Tuple[str, List, List, bool]: """ Process a single PDF page asynchronously Args: page: PDF page object page_num: Page number (0-indexed) Returns: Tuple of (content, tables, images, used_ocr) """ try: # Try text extraction first text = page.get_text() if text.strip(): return f"Page {page_num + 1}:\n{text}", [], [], False # Fall back to OCR for scanned pages logger.info(f"Page {page_num + 1} has no text, using high-resolution OCR") # Use higher resolution for better OCR accuracy mat = fitz.Matrix(2, 2) # 2x resolution pix = page.get_pixmap(matrix=mat) img_data = pix.tobytes("png") # Save temporary image for OCR with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as temp_file: temp_file.write(img_data) temp_path = temp_file.name try: if self.ocr_processor.ocr_available: # Use async OCR ocr_results = await self.ocr_processor.extract_text_from_images_batch_async([temp_path]) if ocr_results and ocr_results[0].text.strip(): content = f"Page {page_num + 1} (OCR):\n{ocr_results[0].text}" # Extract tables from OCR ocr_tables = self.ocr_processor.extract_tables_from_image(temp_path) # Create image metadata images = [{ "path": temp_path, "index": page_num, "ocr_text": ocr_results[0].text, "ocr_confidence": ocr_results[0].confidence }] return content, ocr_tables, images, True else: return f"Page {page_num + 1}: [Scanned content - no text detected by OCR]", [], [], True else: return f"Page {page_num + 1}: [Image content - OCR not available]", [], [], False finally: os.unlink(temp_path) except Exception as e: logger.error(f"Error processing PDF page {page_num + 1}: {e}") return f"Page {page_num + 1}: [Processing error: {str(e)}]", [], [], False async def _process_word_async(self, file_path: Path) -> ProcessingResult: """ Process Word documents asynchronously Args: file_path: Path to Word document Returns: ProcessingResult object """ try: doc = docx.Document(str(file_path)) # Extract text from paragraphs content_parts = [] for para in doc.paragraphs: if para.text.strip(): content_parts.append(para.text) # Extract tables tables = [] for table in doc.tables: table_data = [] for row in table.rows: row_data = [cell.text for cell in row.cells] table_data.append(row_data) if table_data: tables.append({ "data": table_data, "rows": len(table_data), "columns": max(len(row) for row in table_data) if table_data else 0 }) # Extract and process images asynchronously images = await self._extract_word_images_async(file_path) # Add image content to text for img in images: if "ocr_text" in img: content_parts.append(f"[Image {img['index'] + 1} OCR Text]: {img['ocr_text']}") elif "primary_classification" in img: content_parts.append(f"[Image {img['index'] + 1} Classification]: {img['primary_classification']}") full_content = "\n".join(content_parts) return ProcessingResult( success=True, content=full_content, metadata={ "file_type": "word", "paragraphs": len([p for p in content_parts if not p.startswith('[')]), "tables_count": len(tables), "images_count": len(images) }, tables=tables, images=images ) except Exception as e: logger.error(f"Word document processing failed: {e}") raise async def _extract_word_images_async(self, file_path: Path) -> List[Dict[str, Any]]: """ Extract and process images from Word document asynchronously Args: file_path: Path to Word document Returns: List of image metadata dictionaries """ images = [] try: import zipfile import os # Create temporary directory for extracted images with tempfile.TemporaryDirectory() as temp_dir: # Extract images from docx using zipfile with zipfile.ZipFile(str(file_path), 'r') as zip_ref: image_files = [] for file_info in zip_ref.filelist: if file_info.filename.startswith('word/media/'): # Extract the image image_filename = os.path.basename(file_info.filename) image_path = os.path.join(temp_dir, image_filename) # Extract and save with zip_ref.open(file_info.filename) as source, open(image_path, 'wb') as target: target.write(source.read()) image_files.append((len(image_files), image_path)) logger.info(f"Extracted image: {image_path}") if image_files: logger.info(f"Found {len(image_files)} images in Word document") # Process images in batches for batch_start in range(0, len(image_files), self.batch_size): batch = image_files[batch_start:batch_start + self.batch_size] # Prepare batch for OCR image_paths = [path for _, path in batch] indices = [idx for idx, _ in batch] # Process batch with OCR if self.ocr_processor.ocr_available: ocr_results = await self.ocr_processor.extract_text_from_images_batch_async(image_paths) for i, (idx, image_path) in enumerate(batch): if i < len(ocr_results): ocr_result = ocr_results[i] image_metadata = { "path": image_path, "index": idx, "ocr_text": ocr_result.text, "ocr_confidence": ocr_result.confidence } # Only classify if OCR found no text if not ocr_result.text.strip() and self.image_classifier: try: classification_results = self.image_classifier.classify_image(image_path, top_k=3) image_metadata["classification"] = classification_results if classification_results: image_metadata["primary_classification"] = classification_results[0]["label"] except Exception as classify_error: logger.error(f"Image classification failed: {classify_error}") images.append(image_metadata) # Update metrics self.metrics["images_processed"] += len(batch) self.metrics["ocr_batches_processed"] += 1 return images except Exception as e: logger.warning(f"Image extraction from Word document failed: {e}") return images async def _process_excel_async(self, file_path: Path) -> ProcessingResult: """ Process Excel files asynchronously Args: file_path: Path to Excel file Returns: ProcessingResult object """ try: workbook = openpyxl.load_workbook(str(file_path)) content_parts = [] tables = [] for sheet_name in workbook.sheetnames: sheet = workbook[sheet_name] content_parts.append(f"Sheet: {sheet_name}") # Extract data from cells sheet_data = [] for row in sheet.iter_rows(values_only=True): if any(cell is not None for cell in row): sheet_data.append([str(cell) if cell is not None else "" for cell in row]) if sheet_data: tables.append({ "data": sheet_data, "sheet": sheet_name, "rows": len(sheet_data), "columns": max(len(row) for row in sheet_data) if sheet_data else 0 }) # Add sample content (first few rows) sample_rows = min(5, len(sheet_data)) for i in range(sample_rows): content_parts.append(" | ".join(sheet_data[i])) workbook.close() full_content = "\n".join(content_parts) return ProcessingResult( success=True, content=full_content, metadata={ "file_type": "excel", "sheets": len(workbook.sheetnames), "tables_count": len(tables) }, tables=tables ) except Exception as e: logger.error(f"Excel processing failed: {e}") raise async def _process_powerpoint_async(self, file_path: Path) -> ProcessingResult: """ Process PowerPoint presentations asynchronously Args: file_path: Path to PowerPoint file Returns: ProcessingResult object """ try: presentation = Presentation(str(file_path)) content_parts = [] for i, slide in enumerate(presentation.slides): content_parts.append(f"Slide {i + 1}:") # Extract text from slide shapes slide_text = [] for shape in slide.shapes: if hasattr(shape, "text") and shape.text.strip(): slide_text.append(shape.text) if slide_text: content_parts.extend(slide_text) content_parts.append("") # Empty line between slides full_content = "\n".join(content_parts) return ProcessingResult( success=True, content=full_content, metadata={ "file_type": "powerpoint", "slides": len(presentation.slides) } ) except Exception as e: logger.error(f"PowerPoint processing failed: {e}") raise async def _process_text_async(self, file_path: Path) -> ProcessingResult: """ Process text-based files asynchronously Args: file_path: Path to text file Returns: ProcessingResult object """ try: extension = file_path.suffix.lower() if extension == '.csv': # Process CSV with pandas df = pd.read_csv(file_path) content = df.to_string(index=False) tables = [{ "data": df.values.tolist(), "columns": df.columns.tolist(), "rows": len(df), "columns_count": len(df.columns) }] return ProcessingResult