Files
railseek6/LightRAG-main/lightrag/optimized_document_processor.py

558 lines
21 KiB
Python

"""
Optimized Document Processor with Async Pipeline and Batch OCR
Replaces the sequential processing with parallel pipeline stages
"""
import os
import logging
import asyncio
import concurrent.futures
from typing import Dict, List, Any, Optional, Tuple
from dataclasses import dataclass
import tempfile
from pathlib import Path
import time
from collections import defaultdict
# Import required libraries
import fitz # PyMuPDF
import docx
import openpyxl
from pptx import Presentation
from bs4 import BeautifulSoup
import pandas as pd
from .optimized_ocr_processor import OptimizedOCRProcessor, BatchOCRResult
from .production_config import get_config
# Configure logging
logger = logging.getLogger(__name__)
@dataclass
class ProcessingResult:
    """Outcome of processing a single document.

    Every format handler returns one of these: extracted text content,
    format-specific metadata, and optional table/image payloads.
    """
    success: bool                    # True when extraction completed without a fatal error
    content: str                     # full extracted text ("" on failure)
    metadata: Dict[str, Any]         # format-specific details (page/sheet counts, error info, ...)
    error: Optional[str] = None      # human-readable failure reason; None on success
    # FIX: annotated Optional — these default to None (meaning "not applicable"),
    # which the previous bare List[...] annotations did not admit.
    tables: Optional[List[Dict[str, Any]]] = None
    images: Optional[List[Dict[str, Any]]] = None
    processing_time: float = 0.0     # wall-clock seconds spent processing
class AsyncDocumentProcessor:
"""
Async document processor with parallel pipeline stages and batch OCR
"""
def __init__(self, batch_size: int = 4, max_workers: int = 2):
    """
    Initialize async document processor

    Args:
        batch_size: Number of images to process in each OCR batch
        max_workers: Maximum number of parallel workers
    """
    self.config = get_config()
    self.batch_size = batch_size
    self.max_workers = max_workers

    # Batch-capable OCR backend shared by all format handlers.
    self.ocr_processor = OptimizedOCRProcessor(
        use_gpu=self.config.performance.USE_GPU,
        languages=self.config.document_processing.OCR_LANGUAGES,
        batch_size=batch_size,
        max_workers=max_workers,
    )
    self.supported_extensions = self.config.document_processing.SUPPORTED_EXTENSIONS

    # The image classifier module lives in the workspace root (three levels
    # above this file), outside the package, so it is loaded best-effort
    # after extending sys.path; failure just disables classification.
    self.image_classifier = None
    try:
        import sys
        workspace_dir = os.path.dirname(
            os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        )
        if workspace_dir not in sys.path:
            sys.path.insert(0, workspace_dir)
        from fast_image_classifier import get_image_classifier
        self.image_classifier = get_image_classifier()
        logger.info("Image classifier initialized successfully")
    except Exception as exc:
        logger.warning(f"Failed to initialize image classifier: {exc}")

    # Running totals reported by callers for throughput monitoring.
    self.metrics = {
        "documents_processed": 0,
        "total_processing_time": 0.0,
        "pages_processed": 0,
        "images_processed": 0,
        "ocr_batches_processed": 0,
    }
    logger.info(f"Async document processor initialized (batch_size: {batch_size}, workers: {max_workers})")
async def process_document(self, file_path: str) -> ProcessingResult:
    """
    Process document with async pipeline

    Args:
        file_path: Path to document file
    Returns:
        ProcessingResult object
    """
    started = time.time()
    path = Path(file_path)

    # Guard clause: a missing file short-circuits before any dispatch.
    if not path.exists():
        return ProcessingResult(
            success=False,
            content="",
            metadata={"error": "File not found"},
            error="File not found",
            processing_time=time.time() - started,
        )

    # Route by extension via a dispatch table; each handler returns a
    # ProcessingResult with the same contract.
    handlers = {
        '.pdf': self._process_pdf_async,
        '.doc': self._process_word_async,
        '.docx': self._process_word_async,
        '.xls': self._process_excel_async,
        '.xlsx': self._process_excel_async,
        '.ppt': self._process_powerpoint_async,
        '.pptx': self._process_powerpoint_async,
        '.txt': self._process_text_async,
        '.csv': self._process_text_async,
        '.html': self._process_text_async,
        '.jpg': self._process_image_async,
        '.jpeg': self._process_image_async,
        '.png': self._process_image_async,
        '.tiff': self._process_image_async,
        '.bmp': self._process_image_async,
        '.gif': self._process_image_async,
    }
    extension = path.suffix.lower()

    try:
        handler = handlers.get(extension)
        if handler is not None:
            result = await handler(path)
        else:
            result = ProcessingResult(
                success=False,
                content="",
                metadata={"error": f"Unsupported file type: {extension}"},
                error=f"Unsupported file type: {extension}",
                processing_time=time.time() - started,
            )

        # Stamp elapsed time and roll aggregate metrics for every
        # attempted document (including unsupported types).
        elapsed = time.time() - started
        result.processing_time = elapsed
        self.metrics["documents_processed"] += 1
        self.metrics["total_processing_time"] += elapsed
        return result
    except Exception as e:
        logger.error(f"Error processing {path}: {e}")
        return ProcessingResult(
            success=False,
            content="",
            metadata={"error": str(e)},
            error=str(e),
            processing_time=time.time() - started,
        )
async def _process_pdf_async(self, file_path: Path) -> ProcessingResult:
    """
    Process PDF files with async pipeline.

    Pages are processed concurrently; pages without an embedded text
    layer fall back to OCR inside ``_process_pdf_page_async``.

    Args:
        file_path: Path to PDF file
    Returns:
        ProcessingResult object
    Raises:
        Exception: re-raised after logging so the caller converts it
        into a failed ProcessingResult.
    """
    pdf_document = None
    try:
        pdf_document = fitz.open(str(file_path))
        total_pages = len(pdf_document)

        # Fan out one coroutine per page and run them concurrently;
        # return_exceptions keeps one bad page from sinking the rest.
        page_tasks = [
            self._process_pdf_page_async(pdf_document[page_num], page_num)
            for page_num in range(total_pages)
        ]
        page_results = await asyncio.gather(*page_tasks, return_exceptions=True)

        # Combine per-page output, tolerating individual page failures.
        content_parts = []
        tables = []
        images = []
        processed_with_ocr = False
        for i, result in enumerate(page_results):
            if isinstance(result, Exception):
                logger.error(f"Error processing page {i}: {result}")
                content_parts.append(f"Page {i + 1}: [Processing error: {str(result)}]")
            else:
                page_content, page_tables, page_images, used_ocr = result
                content_parts.append(page_content)
                tables.extend(page_tables)
                images.extend(page_images)
                if used_ocr:
                    processed_with_ocr = True

        full_content = "\n\n".join(content_parts)
        self.metrics["pages_processed"] += total_pages

        return ProcessingResult(
            success=True,
            content=full_content,
            metadata={
                "pages": total_pages,
                "file_type": "pdf",
                "processed_with_ocr": processed_with_ocr
            },
            tables=tables,
            images=images
        )
    except Exception as e:
        logger.error(f"PDF processing failed: {e}")
        raise
    finally:
        # BUG FIX: fitz.Document supports len() (page count), so a
        # zero-page document is falsy under `if pdf_document:` and its
        # handle would never be closed. Compare against None instead so
        # the document is always released.
        if pdf_document is not None:
            pdf_document.close()
async def _process_pdf_page_async(self, page, page_num: int) -> Tuple[str, List, List, bool]:
    """
    Process a single PDF page asynchronously.

    Tries the embedded text layer first; only renders the page to an
    image and runs OCR when no selectable text is present.

    Args:
        page: PDF page object
        page_num: Page number (0-indexed)
    Returns:
        Tuple of (content, tables, images, used_ocr)
    """
    try:
        # Prefer the embedded text layer — far cheaper than OCR.
        text = page.get_text()
        if text.strip():
            return f"Page {page_num + 1}:\n{text}", [], [], False

        logger.info(f"Page {page_num + 1} has no text, using high-resolution OCR")

        # Render at 2x resolution; OCR accuracy improves with DPI.
        pixmap = page.get_pixmap(matrix=fitz.Matrix(2, 2))
        with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as temp_file:
            temp_file.write(pixmap.tobytes("png"))
            temp_path = temp_file.name

        try:
            if not self.ocr_processor.ocr_available:
                return f"Page {page_num + 1}: [Image content - OCR not available]", [], [], False

            ocr_results = await self.ocr_processor.extract_text_from_images_batch_async([temp_path])
            if not (ocr_results and ocr_results[0].text.strip()):
                return f"Page {page_num + 1}: [Scanned content - no text detected by OCR]", [], [], True

            first = ocr_results[0]
            content = f"Page {page_num + 1} (OCR):\n{first.text}"
            ocr_tables = self.ocr_processor.extract_tables_from_image(temp_path)
            # NOTE(review): temp_path is unlinked in the finally block below,
            # so the "path" recorded here is stale by the time callers see
            # it — confirm downstream consumers only use ocr_text/confidence.
            images = [{
                "path": temp_path,
                "index": page_num,
                "ocr_text": first.text,
                "ocr_confidence": first.confidence
            }]
            return content, ocr_tables, images, True
        finally:
            os.unlink(temp_path)
    except Exception as e:
        logger.error(f"Error processing PDF page {page_num + 1}: {e}")
        return f"Page {page_num + 1}: [Processing error: {str(e)}]", [], [], False
async def _process_word_async(self, file_path: Path) -> ProcessingResult:
    """
    Process Word documents asynchronously.

    Extracts paragraph text and tables, then extracts and OCRs embedded
    images via the batch pipeline, folding image text into the content.

    Args:
        file_path: Path to Word document
    Returns:
        ProcessingResult object
    Raises:
        Exception: re-raised after logging so the caller reports failure.
    """
    try:
        doc = docx.Document(str(file_path))

        # Non-empty paragraph text, in document order.
        content_parts = [para.text for para in doc.paragraphs if para.text.strip()]
        # BUG FIX: record the paragraph count directly. The previous code
        # counted content_parts entries not starting with '[' AFTER image
        # annotations were appended, which miscounted any real paragraph
        # that happens to begin with '['.
        paragraph_count = len(content_parts)

        # Tables: one dict per table with a row-major "data" matrix.
        tables = []
        for table in doc.tables:
            table_data = [[cell.text for cell in row.cells] for row in table.rows]
            if table_data:
                tables.append({
                    "data": table_data,
                    "rows": len(table_data),
                    "columns": max(len(row) for row in table_data) if table_data else 0
                })

        # Embedded images: OCR text (or classification fallback) is appended
        # to the document text so it becomes searchable.
        images = await self._extract_word_images_async(file_path)
        for img in images:
            if "ocr_text" in img:
                content_parts.append(f"[Image {img['index'] + 1} OCR Text]: {img['ocr_text']}")
            elif "primary_classification" in img:
                content_parts.append(f"[Image {img['index'] + 1} Classification]: {img['primary_classification']}")

        full_content = "\n".join(content_parts)
        return ProcessingResult(
            success=True,
            content=full_content,
            metadata={
                "file_type": "word",
                "paragraphs": paragraph_count,
                "tables_count": len(tables),
                "images_count": len(images)
            },
            tables=tables,
            images=images
        )
    except Exception as e:
        logger.error(f"Word document processing failed: {e}")
        raise
async def _extract_word_images_async(self, file_path: Path) -> List[Dict[str, Any]]:
    """
    Extract and process images from Word document asynchronously

    A .docx is a zip archive; embedded media lives under word/media/.
    Images are copied to a temporary directory, OCR'd in batches of
    self.batch_size, and classified only when OCR finds no text.

    Args:
        file_path: Path to Word document
    Returns:
        List of image metadata dictionaries
    """
    images = []
    try:
        import zipfile
        import os  # shadows the module-level import; kept as-is
        # NOTE(review): the extracted files live in a TemporaryDirectory,
        # so every "path" recorded below is deleted when this method
        # returns — confirm downstream consumers only rely on the
        # ocr_text / classification fields, not the path.
        with tempfile.TemporaryDirectory() as temp_dir:
            # Extract images from docx using zipfile
            with zipfile.ZipFile(str(file_path), 'r') as zip_ref:
                image_files = []
                for file_info in zip_ref.filelist:
                    if file_info.filename.startswith('word/media/'):
                        # Extract the image into the temp directory.
                        image_filename = os.path.basename(file_info.filename)
                        image_path = os.path.join(temp_dir, image_filename)
                        with zip_ref.open(file_info.filename) as source, open(image_path, 'wb') as target:
                            target.write(source.read())
                        # Pair each path with its running sequence index.
                        image_files.append((len(image_files), image_path))
                        logger.info(f"Extracted image: {image_path}")
            if image_files:
                logger.info(f"Found {len(image_files)} images in Word document")
                # Process images in fixed-size batches (must happen while
                # temp_dir still exists).
                for batch_start in range(0, len(image_files), self.batch_size):
                    batch = image_files[batch_start:batch_start + self.batch_size]
                    # Prepare batch for OCR
                    image_paths = [path for _, path in batch]
                    indices = [idx for idx, _ in batch]
                    # Process batch with OCR
                    if self.ocr_processor.ocr_available:
                        ocr_results = await self.ocr_processor.extract_text_from_images_batch_async(image_paths)
                        for i, (idx, image_path) in enumerate(batch):
                            if i < len(ocr_results):
                                ocr_result = ocr_results[i]
                                image_metadata = {
                                    "path": image_path,
                                    "index": idx,
                                    "ocr_text": ocr_result.text,
                                    "ocr_confidence": ocr_result.confidence
                                }
                                # Only classify if OCR found no text
                                if not ocr_result.text.strip() and self.image_classifier:
                                    try:
                                        classification_results = self.image_classifier.classify_image(image_path, top_k=3)
                                        image_metadata["classification"] = classification_results
                                        if classification_results:
                                            image_metadata["primary_classification"] = classification_results[0]["label"]
                                    except Exception as classify_error:
                                        logger.error(f"Image classification failed: {classify_error}")
                                images.append(image_metadata)
                    # Update metrics (counted per batch even when OCR is
                    # unavailable — presumably intentional; verify).
                    self.metrics["images_processed"] += len(batch)
                    self.metrics["ocr_batches_processed"] += 1
        return images
    except Exception as e:
        logger.warning(f"Image extraction from Word document failed: {e}")
        return images
async def _process_excel_async(self, file_path: Path) -> ProcessingResult:
    """
    Process Excel files asynchronously.

    Args:
        file_path: Path to Excel file
    Returns:
        ProcessingResult object
    Raises:
        Exception: re-raised after logging.
    """
    workbook = None
    try:
        workbook = openpyxl.load_workbook(str(file_path))
        # Snapshot the sheet names once; the previous code read
        # workbook.sheetnames again AFTER closing the workbook.
        sheet_names = workbook.sheetnames

        content_parts = []
        tables = []
        for sheet_name in sheet_names:
            sheet = workbook[sheet_name]
            content_parts.append(f"Sheet: {sheet_name}")
            # Keep only rows with at least one value; stringify cells so
            # table data is uniformly text.
            sheet_data = []
            for row in sheet.iter_rows(values_only=True):
                if any(cell is not None for cell in row):
                    sheet_data.append([str(cell) if cell is not None else "" for cell in row])
            if sheet_data:
                tables.append({
                    "data": sheet_data,
                    "sheet": sheet_name,
                    "rows": len(sheet_data),
                    "columns": max(len(row) for row in sheet_data) if sheet_data else 0
                })
                # Include a small preview (up to 5 rows) in the text content.
                sample_rows = min(5, len(sheet_data))
                for i in range(sample_rows):
                    content_parts.append(" | ".join(sheet_data[i]))

        full_content = "\n".join(content_parts)
        return ProcessingResult(
            success=True,
            content=full_content,
            metadata={
                "file_type": "excel",
                "sheets": len(sheet_names),
                "tables_count": len(tables)
            },
            tables=tables
        )
    except Exception as e:
        logger.error(f"Excel processing failed: {e}")
        raise
    finally:
        # BUG FIX: close in finally so the workbook handle is released
        # even when an exception is raised mid-extraction (the old
        # mid-function close() was skipped on any error).
        if workbook is not None:
            workbook.close()
async def _process_powerpoint_async(self, file_path: Path) -> ProcessingResult:
    """
    Process PowerPoint presentations asynchronously.

    Args:
        file_path: Path to PowerPoint file
    Returns:
        ProcessingResult object
    Raises:
        Exception: re-raised after logging.
    """
    try:
        presentation = Presentation(str(file_path))
        content_parts = []
        for slide_index, slide in enumerate(presentation.slides):
            content_parts.append(f"Slide {slide_index + 1}:")
            # Gather text from every shape that carries a text frame.
            slide_text = [
                shape.text
                for shape in slide.shapes
                if hasattr(shape, "text") and shape.text.strip()
            ]
            if slide_text:
                content_parts.extend(slide_text)
            content_parts.append("")  # Empty line between slides
        full_content = "\n".join(content_parts)
        return ProcessingResult(
            success=True,
            content=full_content,
            metadata={
                "file_type": "powerpoint",
                "slides": len(presentation.slides)
            }
        )
    except Exception as e:
        logger.error(f"PowerPoint processing failed: {e}")
        raise
async def _process_text_async(self, file_path: Path) -> ProcessingResult:
"""
Process text-based files asynchronously
Args:
file_path: Path to text file
Returns:
ProcessingResult object
"""
try:
extension = file_path.suffix.lower()
if extension == '.csv':
# Process CSV with pandas
df = pd.read_csv(file_path)
content = df.to_string(index=False)
tables = [{
"data": df.values.tolist(),
"columns": df.columns.tolist(),
"rows": len(df),
"columns_count": len(df.columns)
}]
return ProcessingResult