""" Multi-format Document Processing Pipeline for LightRAG Supports PDF, images, Office documents, and more with GPU acceleration Enhanced with text-first extraction and isolated image classification """ import os import logging import asyncio from typing import Dict, List, Any, Optional, Union, Tuple from dataclasses import dataclass import tempfile from pathlib import Path # Import required libraries import fitz # PyMuPDF import docx import openpyxl from pptx import Presentation from bs4 import BeautifulSoup import pandas as pd from .production_config import get_config # Import optimized image classifier using subprocess isolation import sys import os # Add the workspace directory to path where fast_image_classifier.py is located workspace_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) if workspace_dir not in sys.path: sys.path.insert(0, workspace_dir) from fast_image_classifier import get_image_classifier # Configure logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @dataclass class ProcessingResult: """Result of document processing""" success: bool content: str metadata: Dict[str, Any] error: Optional[str] = None tables: List[Dict[str, Any]] = None images: List[Dict[str, Any]] = None class OCRProcessor: """GPU-accelerated OCR processing using PaddleOCR with process-per-request isolation""" def __init__(self, use_gpu: bool = True, languages: List[str] = None): self.use_gpu = use_gpu self.languages = languages or ['en', 'ch'] self.ocr_available = False self._temp_dir = None self._initialize_ocr() def _initialize_ocr(self): """Initialize PaddleOCR by testing if it can be loaded""" try: logger.info("Testing PaddleOCR availability with process-per-request approach") # Create a simple test script to verify OCR works test_script = """ import sys import json from paddleocr import PaddleOCR try: # Test OCR initialization ocr = PaddleOCR(use_gpu=True, use_angle_cls=True, lang='en', show_log=False, gpu_mem=2000) print("PaddleOCR test: SUCCESS") sys.exit(0) except Exception as e: print(f"PaddleOCR test: FAILED - {e}") sys.exit(1) """ import tempfile import subprocess # Create temporary directory self._temp_dir = tempfile.mkdtemp(prefix="paddleocr_") script_path = os.path.join(self._temp_dir, "test_ocr.py") with open(script_path, 'w') as f: f.write(test_script) # Run test env = os.environ.copy() result = subprocess.run( [sys.executable, script_path], capture_output=True, text=True, timeout=30, env=env ) if result.returncode == 0: self.ocr_available = True logger.info("PaddleOCR is available for process-per-request OCR") else: logger.error(f"PaddleOCR test failed: {result.stderr}") self.ocr_available = False except Exception as e: logger.error(f"Failed to initialize OCR processor: {e}") self.ocr_available = False def extract_text_from_image(self, image_path: str) -> Dict[str, Any]: """Extract text from image using isolated OCR process per request""" if not self.ocr_available: return {"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0} try: import tempfile import subprocess import json # OCR script that processes one image and returns JSON result ocr_script = """ import sys import json from paddleocr import PaddleOCR def extract_text_from_image(image_path): try: ocr_engine = PaddleOCR( use_gpu=True, use_angle_cls=True, lang='en', show_log=False, gpu_mem=2000 ) result = ocr_engine.ocr(image_path) if not result or not result[0]: return {"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0} extracted_text = [] bboxes = [] total_confidence = 0.0 line_count = 0 for line in result[0]: try: if len(line) == 2: bbox, (text, confidence) = line elif len(line) >= 1: bbox = line[0] if len(line) > 0 else [] if len(line) > 1: if isinstance(line[1], (list, tuple)) and len(line[1]) >= 2: text, confidence = line[1][0], line[1][1] else: text, confidence = str(line[1]) if len(line) > 1 else "", 0.0 else: text, confidence = "", 0.0 else: continue text_str = str(text) if text is not None else "" confidence_float = 0.0 if confidence is not None: if isinstance(confidence, (int, float)): confidence_float = float(confidence) elif isinstance(confidence, str): try: confidence_float = float(confidence) except ValueError: confidence_float = 0.0 else: confidence_float = 0.0 else: confidence_float = 0.0 extracted_text.append(text_str) bboxes.append(bbox) total_confidence += confidence_float line_count += 1 except (TypeError, ValueError, IndexError) as e: extracted_text.append("") bboxes.append([]) total_confidence += 0.0 line_count += 1 avg_confidence = total_confidence / line_count if line_count > 0 else 0.0 full_text = "\\\\n".join(extracted_text) return { "text": full_text, "confidence": avg_confidence, "bboxes": bboxes, "line_count": line_count } except Exception as e: return {"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0} # Main execution if __name__ == "__main__": image_path = sys.argv[1] try: result = extract_text_from_image(image_path) print(json.dumps(result)) except Exception as e: print(json.dumps({"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0, "error": str(e)})) """ # Write OCR script script_path = os.path.join(self._temp_dir, "ocr_single.py") with open(script_path, 'w') as f: f.write(ocr_script) # Run OCR process env = os.environ.copy() result = subprocess.run( [sys.executable, script_path, image_path], capture_output=True, text=True, timeout=60, # 60 second timeout for OCR env=env ) if result.returncode == 0: try: ocr_result = json.loads(result.stdout) return ocr_result except json.JSONDecodeError: logger.error(f"Failed to parse OCR result: {result.stdout}") return {"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0} else: logger.error(f"OCR process failed with return code {result.returncode}: {result.stderr}") return {"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0} except subprocess.TimeoutExpired: logger.error("OCR processing timeout") return {"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0} except Exception as e: logger.error(f"OCR request failed: {e}") return {"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0} def close(self): """Close the OCR process""" if self._process: try: exit_request = {"action": "exit"} self._process.stdin.write(json.dumps(exit_request) + '\n') self._process.stdin.flush() self._process.wait(timeout=5) except: self._process.terminate() finally: self._process = None if self._temp_dir and os.path.exists(self._temp_dir): import shutil try: shutil.rmtree(self._temp_dir) except: pass def __del__(self): """Destructor to ensure cleanup""" self.close() def extract_tables_from_image(self, image_path: str) -> List[Dict[str, Any]]: """Extract tables from image using OCR and layout analysis""" try: # Use OCR to get text with bounding boxes ocr_result = self.extract_text_from_image(image_path) # Simple table detection based on text alignment tables = self._detect_tables_from_bboxes(ocr_result["bboxes"], ocr_result["text"]) return tables except Exception as e: logger.error(f"Table extraction failed: {e}") return [] def _detect_tables_from_bboxes(self, bboxes: List, text: str) -> List[Dict[str, Any]]: """Detect tables from OCR bounding boxes""" tables = [] if not bboxes: return tables # Group text by rows based on y-coordinates rows = {} for i, bbox in enumerate(bboxes): try: # Ensure all points are converted to float with proper error handling y_values = [] for point in bbox: if point and len(point) >= 2: try: # Ensure we convert both coordinates to float with explicit type safety y_val = point[1] if isinstance(y_val, (int, float)): y_values.append(float(y_val)) elif isinstance(y_val, str): y_values.append(float(y_val)) else: logger.warning(f"Unexpected y-coordinate type: {type(y_val)}, value: {y_val}") y_values.append(0.0) except (TypeError, ValueError) as conv_error: logger.warning(f"Type conversion error for y-coordinate {point[1]}: {conv_error}") y_values.append(0.0) else: y_values.append(0.0) # Safe calculation of y_center with explicit float conversion try: if y_values: # Convert all values to float explicitly and handle any remaining type issues float_y_values = [] for val in y_values: try: float_y_values.append(float(val)) except (TypeError, ValueError): float_y_values.append(0.0) y_center = sum(float_y_values) / len(float_y_values) else: y_center = 0.0 except (TypeError, ZeroDivisionError) as calc_error: logger.warning(f"Error calculating y_center: {calc_error}") y_center = 0.0 row_key = round(y_center / 10) # Group by 10-pixel rows if row_key not in rows: rows[row_key] = [] # Safe text extraction with bounds checking text_lines = text.split('\n') row_text = text_lines[i] if i < len(text_lines) else "" rows[row_key].append((bbox, row_text)) except Exception as e: logger.warning(f"Error processing bbox {i}: {e}") continue # Sort rows and create table structure sorted_rows = sorted(rows.keys()) table_data = [] for row_key in sorted_rows: try: # Ensure all x-coordinates are converted to float with proper error handling def get_x_coordinate(item): try: if (item[0] and len(item[0]) > 0 and item[0][0] and len(item[0][0]) > 0): # Explicit float conversion with error handling x_val = item[0][0][0] return float(x_val) if x_val is not None else 0.0 return 0.0 except (TypeError, ValueError, IndexError) as x_error: logger.warning(f"Error getting x-coordinate: {x_error}") return 0.0 row_items = sorted(rows[row_key], key=get_x_coordinate) row_text = [item[1] for item in row_items] table_data.append(row_text) except Exception as e: logger.warning(f"Error sorting row {row_key}: {e}") continue if len(table_data) > 1: # At least 2 rows for a table tables.append({ "data": table_data, "rows": len(table_data), "columns": max(len(row) for row in table_data) if table_data else 0 }) return tables class DocumentProcessor: """Main document processor for multiple file formats""" def __init__(self): self.config = get_config() self.ocr_processor = OCRProcessor( use_gpu=self.config.performance.USE_GPU, languages=self.config.document_processing.OCR_LANGUAGES ) self.supported_extensions = self.config.document_processing.SUPPORTED_EXTENSIONS # Initialize image classifier if available self.image_classifier = None if get_image_classifier: try: self.image_classifier = get_image_classifier() logger.info("Image classifier initialized successfully") except Exception as e: logger.warning(f"Failed to initialize image classifier: {e}") async def process_document(self, file_path: str) -> ProcessingResult: """Process document based on file extension""" file_path = Path(file_path) if not file_path.exists(): return ProcessingResult( success=False, content="", metadata={"error": "File not found"}, error="File not found" ) # Determine file type and process accordingly extension = file_path.suffix.lower() try: if extension in ['.pdf']: return await self._process_pdf(file_path) elif extension in ['.doc', '.docx']: return await self._process_word(file_path) elif extension in ['.xls', '.xlsx']: return await self._process_excel(file_path) elif extension in ['.ppt', '.pptx']: return await self._process_powerpoint(file_path) elif extension in ['.txt', '.csv', '.html']: return await self._process_text(file_path) elif extension in ['.jpg', '.jpeg', '.png', '.tiff', '.bmp', '.gif']: return await self._process_image(file_path) else: return ProcessingResult( success=False, content="", metadata={"error": f"Unsupported file type: {extension}"}, error=f"Unsupported file type: {extension}" ) except Exception as e: logger.error(f"Error processing {file_path}: {e}") return ProcessingResult( success=False, content="", metadata={"error": str(e)}, error=str(e) ) def _extract_and_process_images(self, images: List[Any], file_type: str) -> Tuple[List[Dict[str, Any]], str]: """ Extract and process images from documents Returns processed images metadata and additional content from OCR """ processed_images = [] additional_content = [] for i, image_data in enumerate(images): temp_path = None try: # Save image to temporary file with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as temp_file: if file_type == 'word': # For Word documents, image_data is an inline_shape image_bytes = image_data.image.blob elif file_type == 'pdf': # For PDFs, image_data is a pixmap image_bytes = image_data.tobytes("png") else: image_bytes = image_data temp_file.write(image_bytes) temp_path = temp_file.name # Process image with OCR first, then classify only if no text found image_metadata = {"path": temp_path, "index": i} # Step 1: Always run GPU OCR first if self.ocr_processor.ocr_available: try: logger.info(f"Running GPU OCR on image {i+1}") ocr_result = self.ocr_processor.extract_text_from_image(temp_path) logger.info(f"OCR result for image {i+1}: {len(ocr_result['text'])} characters, confidence: {ocr_result['confidence']}") if ocr_result["text"].strip(): image_metadata["ocr_text"] = ocr_result["text"] image_metadata["ocr_confidence"] = ocr_result["confidence"] additional_content.append(f"[Image {i+1} OCR Text]: {ocr_result['text']}") logger.info(f"Image {i+1} has text content, skipping classification") else: logger.info(f"Image {i+1} has no text, proceeding to classification") # Step 2: Only classify if OCR found no text if self.image_classifier and self.image_classifier.available: try: classification_results = self.image_classifier.classify_image(temp_path, top_k=3) image_metadata["classification"] = classification_results # Add classification to content for indexing top_label = classification_results[0]["label"] if classification_results else "unknown" top_confidence = classification_results[0]["confidence"] if classification_results else 0.0 image_metadata["primary_classification"] = top_label # Add classification with confidence for better searchability classification_text = f"[Image {i+1} Classification]: {top_label} (confidence: {top_confidence:.2f})" additional_content.append(classification_text) logger.info(f"Image {i+1} classified as: {top_label} with confidence {top_confidence:.2f}") # Add bee classification as a special entity for search if "bee" in top_label.lower(): # Add multiple variations to ensure it gets picked up by entity extraction bee_entity_text = f"Bee image classification: {top_label} with confidence {top_confidence:.2f}. This image contains a bee." additional_content.append(bee_entity_text) # Also add as standalone entity markers additional_content.append("Entity: Bee") additional_content.append("Entity: Insect") additional_content.append("Entity: Animal") except Exception as classify_error: logger.error(f"Image classification failed for image {i+1}: {classify_error}") image_metadata["classification_error"] = str(classify_error) except Exception as ocr_error: logger.error(f"OCR processing failed for image {i+1}: {ocr_error}") image_metadata["ocr_error"] = str(ocr_error) processed_images.append(image_metadata) except Exception as e: logger.error(f"Error processing image {i}: {e}") processed_images.append({ "index": i, "error": str(e), "path": temp_path or "unknown" }) finally: # Clean up temporary file if temp_path and os.path.exists(temp_path): try: os.unlink(temp_path) except Exception as e: logger.warning(f"Failed to delete temporary image file {temp_path}: {e}") return processed_images, "\n".join(additional_content) async def _process_pdf(self, file_path: Path) -> ProcessingResult: """Process PDF files with text extraction and OCR fallback""" pdf_document = None try: content_parts = [] tables = [] images = [] processed_with_ocr = False # Open PDF pdf_document = fitz.open(str(file_path)) total_pages = len(pdf_document) for page_num in range(total_pages): page = pdf_document[page_num] # Try text extraction first text = page.get_text() if text.strip(): content_parts.append(f"Page {page_num + 1}:\n{text}") else: # Fall back to OCR for scanned pages with higher resolution logger.info(f"Page {page_num + 1} has no text, using high-resolution OCR") # Use higher resolution for better OCR accuracy on scanned documents mat = fitz.Matrix(2, 2) # 2x resolution for better OCR pix = page.get_pixmap(matrix=mat) img_data = pix.tobytes("png") # Save temporary image for OCR with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as temp_file: temp_file.write(img_data) temp_path = temp_file.name try: if self.ocr_processor.ocr_available: logger.info(f"Running OCR on page {page_num + 1} with high resolution") ocr_result = self.ocr_processor.extract_text_from_image(temp_path) if ocr_result["text"].strip(): logger.info(f"OCR extracted {len(ocr_result['text'])} characters from page {page_num + 1}") content_parts.append(f"Page {page_num + 1} (OCR):\n{str(ocr_result['text'])}") processed_with_ocr = True else: logger.warning(f"OCR returned empty text for page {page_num + 1}") # Don't add empty content, just mark as processed content_parts.append(f"Page {page_num + 1}: [Scanned content - no text detected by OCR]") # Extract tables from OCR ocr_tables = self.ocr_processor.extract_tables_from_image(temp_path) if ocr_tables: logger.info(f"Found {len(ocr_tables)} tables on page {page_num + 1}") tables.extend(ocr_tables) else: logger.warning("OCR not available, skipping OCR processing") content_parts.append(f"Page {page_num + 1}: [Image content - OCR not available]") except Exception as ocr_error: logger.error(f"OCR processing failed for page {page_num + 1}: {ocr_error}") content_parts.append(f"Page {page_num + 1}: [Image content - OCR failed: {str(ocr_error)}]") finally: os.unlink(temp_path) full_content = "\n\n".join(content_parts) return ProcessingResult( success=True, content=full_content, metadata={ "pages": total_pages, "file_type": "pdf", "processed_with_ocr": processed_with_ocr }, tables=tables, images=images ) except Exception as e: logger.error(f"PDF processing failed: {e}") raise finally: if pdf_document: pdf_document.close() async def _process_word(self, file_path: Path) -> ProcessingResult: """Process Word documents with image extraction and classification""" try: doc = docx.Document(str(file_path)) content_parts = [] tables = [] images = [] # Extract text from paragraphs first (primary content) for para in doc.paragraphs: if para.text.strip(): content_parts.append(para.text) # Extract tables for table in doc.tables: table_data = [] for row in table.rows: row_data = [cell.text for cell in row.cells] table_data.append(row_data) if table_data: tables.append({ "data": table_data, "rows": len(table_data), "columns": max(len(row) for row in table_data) if table_data else 0 }) # Extract and process images using zipfile method try: import zipfile import os # Create temporary directory for extracted images with tempfile.TemporaryDirectory() as temp_dir: # Extract images from docx using zipfile with zipfile.ZipFile(str(file_path), 'r') as zip_ref: image_files = [] for file_info in zip_ref.filelist: if file_info.filename.startswith('word/media/'): # Extract the image image_filename = os.path.basename(file_info.filename) image_path = os.path.join(temp_dir, image_filename) # Extract and save with zip_ref.open(file_info.filename) as source, open(image_path, 'wb') as target: target.write(source.read()) image_files.append(image_path) logger.info(f"📸 Extracted image: {image_path}") if image_files: logger.info(f"Found {len(image_files)} images in Word document using zipfile method") # Process each extracted image for i, image_path in enumerate(image_files): try: image_metadata = {"path": image_path, "index": i} # Step 1: Always run GPU OCR first if self.ocr_processor.ocr_available: ocr_result = self.ocr_processor.extract_text_from_image(image_path) if ocr_result["text"].strip(): image_metadata["ocr_text"] = ocr_result["text"] image_metadata["ocr_confidence"] = ocr_result["confidence"] content_parts.append(f"[Image {i+1} OCR Text]: {ocr_result['text']}") logger.info(f"Image {i+1} has text content, skipping classification") else: logger.info(f"Image {i+1} has no text, proceeding to classification") # Step 2: Only classify if OCR found no text if self.image_classifier and self.image_classifier.available: classification_results = self.image_classifier.classify_image(image_path, top_k=3) image_metadata["classification"] = classification_results # Add classification to content for indexing top_label = classification_results[0]["label"] if classification_results else "unknown" top_confidence = classification_results[0]["confidence"] if classification_results else 0.0 image_metadata["primary_classification"] = top_label # Add classification with confidence for better searchability classification_text = f"[Image {i+1} Classification]: {top_label} (confidence: {top_confidence:.2f})" content_parts.append(classification_text) logger.info(f"Image {i+1} classified as: {top_label} with confidence {top_confidence:.2f}") # Add bee classification as a special entity for search if "bee" in top_label.lower(): # Add multiple variations to ensure it gets picked up by entity extraction bee_entity_text = f"Bee image classification: {top_label} with confidence {top_confidence:.2f}. This image contains a bee." content_parts.append(bee_entity_text) # Also add as standalone entity markers content_parts.append("Entity: Bee") content_parts.append("Entity: Insect") content_parts.append("Entity: Animal") images.append(image_metadata) except Exception as img_error: logger.error(f"Error processing image {i}: {img_error}") images.append({ "index": i, "error": str(img_error), "path": image_path }) except Exception as img_error: logger.warning(f"Image extraction from Word document failed: {img_error}") full_content = "\n".join(content_parts) return ProcessingResult( success=True, content=full_content, metadata={ "file_type": "word", "paragraphs": len([p for p in content_parts if not p.startswith('[')]), "tables_count": len(tables), "images_count": len(images) }, tables=tables, images=images ) except Exception as e: logger.error(f"Word document processing failed: {e}") raise async def _process_excel(self, file_path: Path) -> ProcessingResult: """Process Excel files""" try: workbook = openpyxl.load_workbook(str(file_path)) content_parts = [] tables = [] for sheet_name in workbook.sheetnames: sheet = workbook[sheet_name] content_parts.append(f"Sheet: {sheet_name}") # Extract data from cells sheet_data = [] for row in sheet.iter_rows(values_only=True): if any(cell is not None for cell in row): sheet_data.append([str(cell) if cell is not None else "" for cell in row]) if sheet_data: tables.append({ "data": sheet_data, "sheet": sheet_name, "rows": len(sheet_data), "columns": max(len(row) for row in sheet_data) if sheet_data else 0 }) # Add sample content (first few rows) sample_rows = min(5, len(sheet_data)) for i in range(sample_rows): content_parts.append(" | ".join(sheet_data[i])) workbook.close() full_content = "\n".join(content_parts) return ProcessingResult( success=True, content=full_content, metadata={ "file_type": "excel", "sheets": len(workbook.sheetnames), "tables_count": len(tables) }, tables=tables ) except Exception as e: logger.error(f"Excel processing failed: {e}") raise async def _process_powerpoint(self, file_path: Path) -> ProcessingResult: """Process PowerPoint presentations""" try: presentation = Presentation(str(file_path)) content_parts = [] for i, slide in enumerate(presentation.slides): content_parts.append(f"Slide {i + 1}:") # Extract text from slide shapes slide_text = [] for shape in slide.shapes: if hasattr(shape, "text") and shape.text.strip(): slide_text.append(shape.text) if slide_text: content_parts.extend(slide_text) content_parts.append("") # Empty line between slides full_content = "\n".join(content_parts) return ProcessingResult( success=True, content=full_content, metadata={ "file_type": "powerpoint", "slides": len(presentation.slides) } ) except Exception as e: logger.error(f"PowerPoint processing failed: {e}") raise async def _process_text(self, file_path: Path) -> ProcessingResult: """Process text-based files (TXT, CSV, HTML)""" try: extension = file_path.suffix.lower() if extension == '.csv': # Process CSV with pandas df = pd.read_csv(file_path) content = df.to_string(index=False) tables = [{ "data": df.values.tolist(), "columns": df.columns.tolist(), "rows": len(df), "columns_count": len(df.columns) }] return ProcessingResult( success=True, content=content, metadata={"file_type": "csv", "rows": len(df), "columns": len(df.columns)}, tables=tables ) elif extension == '.html': # Process HTML with BeautifulSoup with open(file_path, 'r', encoding='utf-8') as f: html_content = f.read() soup = BeautifulSoup(html_content, 'html.parser') # Remove script and style elements for script in soup(["script", "style"]): script.decompose() text = soup.get_text() lines = (line.strip() for line in text.splitlines()) chunks = (phrase.strip() for line in lines for phrase in line.split(" ")) content = '\n'.join(chunk for chunk in chunks if chunk) return ProcessingResult( success=True, content=content, metadata={"file_type": "html"} ) else: # TXT and other text files with open(file_path, 'r', encoding='utf-8') as f: content = f.read() return ProcessingResult( success=True, content=content, metadata={"file_type": "text"} ) except Exception as e: logger.error(f"Text file processing failed: {e}") raise async def _process_image(self, file_path: Path) -> ProcessingResult: """Process image files with OCR""" try: content_parts = [] tables = [] images = [{"path": str(file_path), "classification": "processed_with_ocr"}] # Always perform OCR on images ocr_result = self.ocr_processor.extract_text_from_image(str(file_path)) if ocr_result["text"].strip(): content_parts.append(ocr_result["text"]) # Extract tables from image ocr_tables = self.ocr_processor.extract_tables_from_image(str(file_path)) tables.extend(ocr_tables) full_content = "\n".join(content_parts) if content_parts else "No text extracted from image" return ProcessingResult( success=True, content=full_content, metadata={ "file_type": "image", "ocr_confidence": ocr_result.get("confidence", 0.0), "line_count": ocr_result.get("line_count", 0) }, tables=tables, images=images ) except Exception as e: logger.error(f"Image processing failed: {e}") raise def get_supported_formats(self) -> List[str]: """Get list of supported file formats""" return list(self.supported_extensions) async def process_batch(self, file_paths: List[str]) -> List[ProcessingResult]: """Process multiple documents in batch""" tasks = [self.process_document(file_path) for file_path in file_paths] results = await asyncio.gather(*tasks, return_exceptions=True) # Handle exceptions in results processed_results = [] for result in results: if isinstance(result, Exception): processed_results.append(ProcessingResult( success=False, content="", metadata={"error": str(result)}, error=str(result) )) else: processed_results.append(result) return processed_results # Singleton instance _processor_instance = None def get_document_processor() -> DocumentProcessor: """Get singleton document processor instance""" global _processor_instance if _processor_instance is None: _processor_instance = DocumentProcessor() return _processor_instance async def test_processor(): """Test function for document processor""" processor = get_document_processor() # Test with a sample file (modify path as needed) test_file = "test_documents/test_document.txt" if os.path.exists(test_file): result = await processor.process_document(test_file) print(f"Success: {result.success}") print(f"Content length: {len(result.content)}") print(f"Metadata: {result.metadata}") else: print("Test file not found") if __name__ == "__main__": # Run test asyncio.run(test_processor())