# --- Repository-viewer metadata captured with this file (not part of the module) ---
# File: railseek6/LightRAG-main/lightrag/document_processor.py
# Snapshot: 2026-01-13 18:25:49 +08:00 — 1014 lines, 44 KiB, Python
# (Viewer toolbar text: "Raw Blame History")
"""
Multi-format Document Processing Pipeline for LightRAG
Supports PDF, images, Office documents, and more with GPU acceleration
Enhanced with text-first extraction and isolated image classification
"""
import os
import logging
import asyncio
from typing import Dict, List, Any, Optional, Union, Tuple
from dataclasses import dataclass
import tempfile
from pathlib import Path
# Import required libraries
import fitz # PyMuPDF
import docx
import openpyxl
from pptx import Presentation
from bs4 import BeautifulSoup
import pandas as pd
from .production_config import get_config
# Import optimized image classifier using subprocess isolation
import sys
import os
# Add the workspace directory to path where fast_image_classifier.py is located
workspace_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
if workspace_dir not in sys.path:
sys.path.insert(0, workspace_dir)
from fast_image_classifier import get_image_classifier
# Import optimized OCR processor
from .optimized_ocr_processor import OptimizedOCRProcessor
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
@dataclass
class ProcessingResult:
    """Result of document processing.

    Produced by every DocumentProcessor handler. ``tables`` and ``images``
    default to None (not an empty list), so consumers must None-check them.
    """
    success: bool                                   # True when the handler completed without raising
    content: str                                    # extracted plain-text content (may be empty)
    metadata: Dict[str, Any]                        # format-specific details (page/sheet counts, flags, errors)
    error: Optional[str] = None                     # human-readable failure reason when success is False
    tables: Optional[List[Dict[str, Any]]] = None   # detected tables: each has "data"/"rows"/"columns"
    images: Optional[List[Dict[str, Any]]] = None   # per-image metadata (path, OCR text, classification)
class OCRProcessor:
    """GPU-accelerated OCR processing using PaddleOCR with process-per-request isolation.

    PaddleOCR is never imported into the host process. Every operation writes a
    small worker script into a private scratch directory and runs it with the
    current interpreter, so native/GPU state is released when each request's
    subprocess exits.

    Fixes relative to the previous revision:
      * ``close()`` dereferenced ``self._process``, an attribute that was never
        created anywhere in the class, so every ``close()``/``__del__`` raised
        ``AttributeError``. In the process-per-request design there is no
        persistent worker; close() now only removes the scratch directory.
      * The worker script joined OCR lines with a literal backslash-n sequence
        instead of a newline, which broke the per-line lookup in
        ``_detect_tables_from_bboxes`` (it splits on real newlines).
      * The scratch directory is re-created on demand after ``close()``.
    """

    # Worker script executed once per image; prints a JSON result on stdout.
    # NOTE: written out as a file and parsed by a fresh interpreter, so escapes
    # here describe the *generated* source ("\\n" below becomes "\n" in it).
    _OCR_SCRIPT = """\
import sys
import json
from paddleocr import PaddleOCR

def extract_text_from_image(image_path):
    try:
        ocr_engine = PaddleOCR(
            use_gpu=True,
            use_angle_cls=True,
            lang='en',
            show_log=False,
            gpu_mem=2000
        )
        result = ocr_engine.ocr(image_path)
        if not result or not result[0]:
            return {"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0}
        extracted_text = []
        bboxes = []
        total_confidence = 0.0
        line_count = 0
        for line in result[0]:
            try:
                # Expected shape: [bbox, (text, confidence)], but be tolerant
                # of variant result layouts across PaddleOCR versions.
                if len(line) == 2:
                    bbox, (text, confidence) = line
                elif len(line) >= 1:
                    bbox = line[0]
                    if len(line) > 1 and isinstance(line[1], (list, tuple)) and len(line[1]) >= 2:
                        text, confidence = line[1][0], line[1][1]
                    elif len(line) > 1:
                        text, confidence = str(line[1]), 0.0
                    else:
                        text, confidence = "", 0.0
                else:
                    continue
                try:
                    confidence_float = float(confidence) if confidence is not None else 0.0
                except (TypeError, ValueError):
                    confidence_float = 0.0
                extracted_text.append(str(text) if text is not None else "")
                bboxes.append(bbox)
                total_confidence += confidence_float
                line_count += 1
            except (TypeError, ValueError, IndexError):
                # Keep positional alignment between text lines and bboxes.
                extracted_text.append("")
                bboxes.append([])
                line_count += 1
        avg_confidence = total_confidence / line_count if line_count > 0 else 0.0
        return {
            "text": "\\n".join(extracted_text),
            "confidence": avg_confidence,
            "bboxes": bboxes,
            "line_count": line_count
        }
    except Exception:
        return {"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0}

if __name__ == "__main__":
    try:
        print(json.dumps(extract_text_from_image(sys.argv[1])))
    except Exception as e:
        print(json.dumps({"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0, "error": str(e)}))
"""

    def __init__(self, use_gpu: bool = True, languages: Optional[List[str]] = None):
        """
        Args:
            use_gpu: request GPU OCR in worker subprocesses.
            languages: OCR languages; defaults to ['en', 'ch'].
        """
        self.use_gpu = use_gpu
        self.languages = languages or ['en', 'ch']
        self.ocr_available = False
        self._temp_dir = None  # scratch directory holding generated worker scripts
        self._initialize_ocr()

    @staticmethod
    def _empty_result() -> Dict[str, Any]:
        """Canonical 'no text found' payload shared by every failure path."""
        return {"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0}

    def _ensure_temp_dir(self) -> str:
        """Create (or re-create after close()) the scratch directory and return it."""
        import tempfile
        if not self._temp_dir or not os.path.exists(self._temp_dir):
            self._temp_dir = tempfile.mkdtemp(prefix="paddleocr_")
        return self._temp_dir

    def _initialize_ocr(self):
        """Probe whether PaddleOCR can be loaded by running a throwaway subprocess.

        Sets ``self.ocr_available``; never raises.
        """
        try:
            import subprocess
            logger.info("Testing PaddleOCR availability with process-per-request approach")
            test_script = """\
import sys
from paddleocr import PaddleOCR
try:
    ocr = PaddleOCR(use_gpu=True, use_angle_cls=True, lang='en', show_log=False, gpu_mem=2000)
    print("PaddleOCR test: SUCCESS")
    sys.exit(0)
except Exception as e:
    print(f"PaddleOCR test: FAILED - {e}")
    sys.exit(1)
"""
            script_path = os.path.join(self._ensure_temp_dir(), "test_ocr.py")
            with open(script_path, 'w') as f:
                f.write(test_script)
            result = subprocess.run(
                [sys.executable, script_path],
                capture_output=True,
                text=True,
                timeout=30,
                env=os.environ.copy(),
            )
            if result.returncode == 0:
                self.ocr_available = True
                logger.info("PaddleOCR is available for process-per-request OCR")
            else:
                logger.error(f"PaddleOCR test failed: {result.stderr}")
                self.ocr_available = False
        except Exception as e:
            logger.error(f"Failed to initialize OCR processor: {e}")
            self.ocr_available = False

    def extract_text_from_image(self, image_path: str) -> Dict[str, Any]:
        """Run OCR on one image in an isolated subprocess.

        Args:
            image_path: path to an image file readable by PaddleOCR.

        Returns:
            Dict with keys ``text`` (lines joined by newlines), ``confidence``
            (mean per-line confidence), ``bboxes`` (one quad per line) and
            ``line_count``. All failure modes return the empty payload.
        """
        if not self.ocr_available:
            return self._empty_result()
        import subprocess
        import json
        try:
            script_path = os.path.join(self._ensure_temp_dir(), "ocr_single.py")
            with open(script_path, 'w') as f:
                f.write(self._OCR_SCRIPT)
            result = subprocess.run(
                [sys.executable, script_path, image_path],
                capture_output=True,
                text=True,
                timeout=60,  # generous ceiling: each call pays model start-up cost
                env=os.environ.copy(),
            )
            if result.returncode != 0:
                logger.error(f"OCR process failed with return code {result.returncode}: {result.stderr}")
                return self._empty_result()
            try:
                return json.loads(result.stdout)
            except json.JSONDecodeError:
                logger.error(f"Failed to parse OCR result: {result.stdout}")
                return self._empty_result()
        except subprocess.TimeoutExpired:
            logger.error("OCR processing timeout")
            return self._empty_result()
        except Exception as e:
            logger.error(f"OCR request failed: {e}")
            return self._empty_result()

    def close(self):
        """Release scratch resources.

        Bug fix: the previous implementation dereferenced ``self._process``,
        which was never created, raising AttributeError on every close.
        Process-per-request leaves nothing to shut down except the temp dir.
        """
        if self._temp_dir and os.path.exists(self._temp_dir):
            import shutil
            try:
                shutil.rmtree(self._temp_dir)
            except OSError:
                pass  # best-effort cleanup; the OS temp reaper will catch leftovers
            finally:
                self._temp_dir = None

    def __del__(self):
        """Destructor to ensure cleanup (never raises during interpreter teardown)."""
        try:
            self.close()
        except Exception:
            pass

    def extract_tables_from_image(self, image_path: str) -> List[Dict[str, Any]]:
        """Extract tables from an image via OCR plus simple row/column layout analysis."""
        try:
            ocr_result = self.extract_text_from_image(image_path)
            return self._detect_tables_from_bboxes(ocr_result["bboxes"], ocr_result["text"])
        except Exception as e:
            logger.error(f"Table extraction failed: {e}")
            return []

    @staticmethod
    def _coerce_float(value) -> float:
        """Best-effort float conversion; anything unconvertible becomes 0.0."""
        try:
            return float(value) if value is not None else 0.0
        except (TypeError, ValueError):
            return 0.0

    def _detect_tables_from_bboxes(self, bboxes: List, text: str) -> List[Dict[str, Any]]:
        """Group OCR lines into table rows by y-centre (10-pixel bands), columns by x.

        Args:
            bboxes: one quad (list of [x, y] corner points) per OCR text line.
            text: newline-joined OCR text; the i-th line pairs with bboxes[i].

        Returns:
            At most one table dict ("data"/"rows"/"columns"), and only when at
            least two rows were found; otherwise an empty list.
        """
        tables: List[Dict[str, Any]] = []
        if not bboxes:
            return tables
        text_lines = text.split('\n')
        # Group text by rows based on y-coordinates
        rows: Dict[int, List] = {}
        for i, bbox in enumerate(bboxes):
            try:
                y_values = [
                    self._coerce_float(point[1]) if point and len(point) >= 2 else 0.0
                    for point in bbox
                ]
                y_center = sum(y_values) / len(y_values) if y_values else 0.0
                row_key = round(y_center / 10)  # Group by 10-pixel rows
                row_text = text_lines[i] if i < len(text_lines) else ""
                rows.setdefault(row_key, []).append((bbox, row_text))
            except Exception as e:
                logger.warning(f"Error processing bbox {i}: {e}")
                continue
        # Sort rows top-to-bottom, then cells left-to-right within each row.
        table_data = []
        for row_key in sorted(rows):
            try:
                def get_x_coordinate(item):
                    try:
                        if (item[0] and len(item[0]) > 0 and
                                item[0][0] and len(item[0][0]) > 0):
                            return self._coerce_float(item[0][0][0])
                        return 0.0
                    except (TypeError, ValueError, IndexError) as x_error:
                        logger.warning(f"Error getting x-coordinate: {x_error}")
                        return 0.0
                ordered = sorted(rows[row_key], key=get_x_coordinate)
                table_data.append([cell_text for _, cell_text in ordered])
            except Exception as e:
                logger.warning(f"Error sorting row {row_key}: {e}")
                continue
        if len(table_data) > 1:  # At least 2 rows for a table
            tables.append({
                "data": table_data,
                "rows": len(table_data),
                "columns": max(len(row) for row in table_data) if table_data else 0
            })
        return tables
class DocumentProcessor:
    """Main document processor for multiple file formats.

    Dispatches on file extension to a format-specific handler (PDF, Word,
    Excel, PowerPoint, text/CSV/HTML, standalone images). Scanned/low-quality
    PDF pages fall back to subprocess-isolated OCR; images with no OCR text
    are optionally run through an image classifier.
    """

    def __init__(self):
        # Central configuration (GPU usage, OCR languages, supported extensions).
        self.config = get_config()
        self.ocr_processor = OptimizedOCRProcessor(
            use_gpu=self.config.performance.USE_GPU,
            languages=self.config.document_processing.OCR_LANGUAGES,
            batch_size=4,  # Process 4 images at a time for better performance
            max_workers=2  # Use 2 parallel workers for async operations
        )
        self.supported_extensions = self.config.document_processing.SUPPORTED_EXTENSIONS
        # Initialize image classifier if available
        # NOTE(review): truthiness check on the imported `get_image_classifier`
        # presumably guards against it being None when the classifier module is
        # unavailable — confirm against fast_image_classifier's import contract.
        self.image_classifier = None
        if get_image_classifier:
            try:
                self.image_classifier = get_image_classifier()
                logger.info("Image classifier initialized successfully")
            except Exception as e:
                logger.warning(f"Failed to initialize image classifier: {e}")

    async def process_document(self, file_path: str) -> ProcessingResult:
        """Process document based on file extension.

        Args:
            file_path: path to the document on disk.

        Returns:
            ProcessingResult; ``success`` is False for missing files,
            unsupported extensions, or handler failures (the message is
            mirrored into both ``error`` and ``metadata['error']``).
        """
        file_path = Path(file_path)
        if not file_path.exists():
            return ProcessingResult(
                success=False,
                content="",
                metadata={"error": "File not found"},
                error="File not found"
            )
        # Determine file type and process accordingly
        extension = file_path.suffix.lower()
        try:
            if extension in ['.pdf']:
                return await self._process_pdf(file_path)
            elif extension in ['.doc', '.docx']:
                return await self._process_word(file_path)
            elif extension in ['.xls', '.xlsx']:
                return await self._process_excel(file_path)
            elif extension in ['.ppt', '.pptx']:
                return await self._process_powerpoint(file_path)
            elif extension in ['.txt', '.csv', '.html']:
                return await self._process_text(file_path)
            elif extension in ['.jpg', '.jpeg', '.png', '.tiff', '.bmp', '.gif']:
                return await self._process_image(file_path)
            else:
                return ProcessingResult(
                    success=False,
                    content="",
                    metadata={"error": f"Unsupported file type: {extension}"},
                    error=f"Unsupported file type: {extension}"
                )
        except Exception as e:
            # Handlers re-raise on failure; convert to a failed result here so
            # callers never see an exception from this entry point.
            logger.error(f"Error processing {file_path}: {e}")
            return ProcessingResult(
                success=False,
                content="",
                metadata={"error": str(e)},
                error=str(e)
            )

    def _extract_and_process_images(self, images: List[Any], file_type: str) -> Tuple[List[Dict[str, Any]], str]:
        """
        Extract and process images from documents with batch OCR processing.
        Returns processed images metadata and additional content from OCR.

        NOTE(review): not called by any handler visible in this file —
        presumably invoked by code elsewhere; confirm before removing.
        """
        processed_images = []
        additional_content = []
        temp_paths = []
        temp_files = []
        # Step 1: Save all images to temporary files
        for i, image_data in enumerate(images):
            temp_path = None
            try:
                # Save image to temporary file
                with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as temp_file:
                    if file_type == 'word':
                        # For Word documents, image_data is an inline_shape
                        image_bytes = image_data.image.blob
                    elif file_type == 'pdf':
                        # For PDFs, image_data is a pixmap
                        image_bytes = image_data.tobytes("png")
                    else:
                        image_bytes = image_data
                    temp_file.write(image_bytes)
                    temp_path = temp_file.name
                temp_paths.append(temp_path)
                temp_files.append((i, temp_path, image_data))
            except Exception as e:
                logger.error(f"Error saving image {i} to temporary file: {e}")
                processed_images.append({
                    "index": i,
                    "error": str(e),
                    "path": temp_path or "unknown"
                })
        if not temp_paths:
            return processed_images, ""
        # Step 2: Batch OCR processing
        batch_results = []
        if self.ocr_processor.ocr_available:
            try:
                logger.info(f"Running batch OCR on {len(temp_paths)} images")
                batch_results = self.ocr_processor.extract_text_from_images_batch(temp_paths)
                logger.info(f"Batch OCR completed for {len(batch_results)} images")
            except Exception as e:
                logger.error(f"Batch OCR processing failed: {e}")
                # Fall back to individual processing
                batch_results = []
        # Step 3: Process results
        for idx, (i, temp_path, image_data) in enumerate(temp_files):
            image_metadata = {"path": temp_path, "index": i}
            try:
                # Get OCR result for this image
                ocr_result = None
                if batch_results and idx < len(batch_results):
                    # Batch results are positional: idx-th result maps to idx-th temp file.
                    batch_result = batch_results[idx]
                    ocr_result = {
                        "text": batch_result.text,
                        "confidence": batch_result.confidence,
                        "bboxes": batch_result.bboxes,
                        "line_count": batch_result.line_count
                    }
                else:
                    # Fallback to individual OCR
                    if self.ocr_processor.ocr_available:
                        ocr_result = self.ocr_processor.extract_text_from_image(temp_path)
                if ocr_result and ocr_result["text"].strip():
                    image_metadata["ocr_text"] = ocr_result["text"]
                    image_metadata["ocr_confidence"] = ocr_result["confidence"]
                    additional_content.append(f"[Image {i+1} OCR Text]: {ocr_result['text']}")
                    logger.info(f"Image {i+1} has text content, skipping classification")
                else:
                    logger.info(f"Image {i+1} has no text, proceeding to classification")
                    # Step 4: Only classify if OCR found no text
                    if self.image_classifier and self.image_classifier.available:
                        try:
                            classification_results = self.image_classifier.classify_image(temp_path, top_k=3)
                            image_metadata["classification"] = classification_results
                            # Add classification to content for indexing
                            top_label = classification_results[0]["label"] if classification_results else "unknown"
                            top_confidence = classification_results[0]["confidence"] if classification_results else 0.0
                            image_metadata["primary_classification"] = top_label
                            # Add classification with confidence for better searchability
                            classification_text = f"[Image {i+1} Classification]: {top_label} (confidence: {top_confidence:.2f})"
                            additional_content.append(classification_text)
                            logger.info(f"Image {i+1} classified as: {top_label} with confidence {top_confidence:.2f}")
                            # Add bee classification as a special entity for search
                            if "bee" in top_label.lower():
                                # Add multiple variations to ensure it gets picked up by entity extraction
                                bee_entity_text = f"Bee image classification: {top_label} with confidence {top_confidence:.2f}. This image contains a bee."
                                additional_content.append(bee_entity_text)
                                # Also add as standalone entity markers
                                additional_content.append("Entity: Bee")
                                additional_content.append("Entity: Insect")
                                additional_content.append("Entity: Animal")
                        except Exception as classify_error:
                            logger.error(f"Image classification failed for image {i+1}: {classify_error}")
                            image_metadata["classification_error"] = str(classify_error)
                processed_images.append(image_metadata)
            except Exception as e:
                logger.error(f"Error processing image {i}: {e}")
                processed_images.append({
                    "index": i,
                    "error": str(e),
                    "path": temp_path
                })
            finally:
                # Clean up temporary file
                if temp_path and os.path.exists(temp_path):
                    try:
                        os.unlink(temp_path)
                    except Exception as e:
                        logger.warning(f"Failed to delete temporary image file {temp_path}: {e}")
        return processed_images, "\n".join(additional_content)

    def _text_quality_score(self, text: str) -> float:
        """Return a score between 0 and 1 indicating text quality.
        Higher score means more readable English text."""
        if not text:
            return 0.0
        total = len(text)
        # Count printable ASCII letters and spaces
        printable = sum(1 for c in text if 32 <= ord(c) <= 126)
        # Count replacement characters (Unicode U+FFFD) produced by bad decoding
        replacement = text.count('\ufffd')
        # Count other non-ASCII characters
        # NOTE(review): `non_ascii` is computed but never used in the score —
        # confirm whether it was meant to contribute a penalty.
        non_ascii = sum(1 for c in text if ord(c) > 127 and ord(c) != 0xfffd)
        # Score based on printable ratio, penalize replacement chars
        score = (printable / total) * (1 - (replacement / total))
        return score

    async def _process_pdf(self, file_path: Path) -> ProcessingResult:
        """Process PDF files with text extraction and OCR fallback.

        Pages whose embedded text is empty or garbled (quality score < 0.5)
        are rendered at 2x resolution and sent through OCR instead.
        Raises on unrecoverable PDF errors (caught by process_document).
        """
        pdf_document = None
        try:
            content_parts = []
            tables = []
            images = []
            processed_with_ocr = False
            # Open PDF
            pdf_document = fitz.open(str(file_path))
            total_pages = len(pdf_document)
            for page_num in range(total_pages):
                page = pdf_document[page_num]
                # Try text extraction first
                text = page.get_text()
                text_score = self._text_quality_score(text)
                # Determine if text is usable (not garbled)
                # Threshold 0.5 means at least half of characters are printable ASCII and not replacement
                if text.strip() and text_score >= 0.5:
                    content_parts.append(f"Page {page_num + 1}:\n{text}")
                else:
                    # Text is empty, garbled, or low quality -> use OCR
                    logger.info(f"Page {page_num + 1} has no usable text (score {text_score:.3f}), using high-resolution OCR")
                    # Use higher resolution for better OCR accuracy on scanned documents
                    mat = fitz.Matrix(2, 2)  # 2x resolution for better OCR
                    pix = page.get_pixmap(matrix=mat)
                    img_data = pix.tobytes("png")
                    # Save temporary image for OCR
                    with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as temp_file:
                        temp_file.write(img_data)
                        temp_path = temp_file.name
                    try:
                        if self.ocr_processor.ocr_available:
                            logger.info(f"Running OCR on page {page_num + 1} with high resolution")
                            ocr_result = self.ocr_processor.extract_text_from_image(temp_path)
                            if ocr_result["text"].strip():
                                logger.info(f"OCR extracted {len(ocr_result['text'])} characters from page {page_num + 1}")
                                content_parts.append(f"Page {page_num + 1} (OCR):\n{str(ocr_result['text'])}")
                                processed_with_ocr = True
                            else:
                                logger.warning(f"OCR returned empty text for page {page_num + 1}")
                                # Don't add empty content, just mark as processed
                                content_parts.append(f"Page {page_num + 1}: [Scanned content - no text detected by OCR]")
                            # Extract tables from OCR
                            ocr_tables = self.ocr_processor.extract_tables_from_image(temp_path)
                            if ocr_tables:
                                logger.info(f"Found {len(ocr_tables)} tables on page {page_num + 1}")
                                tables.extend(ocr_tables)
                        else:
                            logger.warning("OCR not available, skipping OCR processing")
                            content_parts.append(f"Page {page_num + 1}: [Image content - OCR not available]")
                    except Exception as ocr_error:
                        logger.error(f"OCR processing failed for page {page_num + 1}: {ocr_error}")
                        content_parts.append(f"Page {page_num + 1}: [Image content - OCR failed: {str(ocr_error)}]")
                    finally:
                        # Always remove the rendered page image.
                        os.unlink(temp_path)
            full_content = "\n\n".join(content_parts)
            return ProcessingResult(
                success=True,
                content=full_content,
                metadata={
                    "pages": total_pages,
                    "file_type": "pdf",
                    "processed_with_ocr": processed_with_ocr
                },
                tables=tables,
                images=images
            )
        except Exception as e:
            logger.error(f"PDF processing failed: {e}")
            raise
        finally:
            if pdf_document:
                pdf_document.close()

    async def _process_word(self, file_path: Path) -> ProcessingResult:
        """Process Word documents with image extraction and classification.

        Paragraph text is primary content; tables are collected separately;
        embedded images are pulled straight from the .docx zip container
        (word/media/) and OCR'd / classified. Image failures degrade to a
        warning and never fail the document.
        """
        try:
            doc = docx.Document(str(file_path))
            content_parts = []
            tables = []
            images = []
            # Extract text from paragraphs first (primary content)
            for para in doc.paragraphs:
                if para.text.strip():
                    content_parts.append(para.text)
            # Extract tables
            for table in doc.tables:
                table_data = []
                for row in table.rows:
                    row_data = [cell.text for cell in row.cells]
                    table_data.append(row_data)
                if table_data:
                    tables.append({
                        "data": table_data,
                        "rows": len(table_data),
                        "columns": max(len(row) for row in table_data) if table_data else 0
                    })
            # Extract and process images using zipfile method
            try:
                import zipfile
                import os
                # Create temporary directory for extracted images
                with tempfile.TemporaryDirectory() as temp_dir:
                    # Extract images from docx using zipfile
                    with zipfile.ZipFile(str(file_path), 'r') as zip_ref:
                        image_files = []
                        for file_info in zip_ref.filelist:
                            if file_info.filename.startswith('word/media/'):
                                # Extract the image
                                image_filename = os.path.basename(file_info.filename)
                                image_path = os.path.join(temp_dir, image_filename)
                                # Extract and save
                                with zip_ref.open(file_info.filename) as source, open(image_path, 'wb') as target:
                                    target.write(source.read())
                                image_files.append(image_path)
                                logger.info(f"📸 Extracted image: {image_path}")
                    if image_files:
                        logger.info(f"Found {len(image_files)} images in Word document using zipfile method")
                        # Process each extracted image
                        for i, image_path in enumerate(image_files):
                            try:
                                image_metadata = {"path": image_path, "index": i}
                                # Step 1: Always run GPU OCR first
                                if self.ocr_processor.ocr_available:
                                    ocr_result = self.ocr_processor.extract_text_from_image(image_path)
                                    if ocr_result["text"].strip():
                                        image_metadata["ocr_text"] = ocr_result["text"]
                                        image_metadata["ocr_confidence"] = ocr_result["confidence"]
                                        content_parts.append(f"[Image {i+1} OCR Text]: {ocr_result['text']}")
                                        logger.info(f"Image {i+1} has text content, skipping classification")
                                    else:
                                        logger.info(f"Image {i+1} has no text, proceeding to classification")
                                        # Step 2: Only classify if OCR found no text
                                        if self.image_classifier and self.image_classifier.available:
                                            classification_results = self.image_classifier.classify_image(image_path, top_k=3)
                                            image_metadata["classification"] = classification_results
                                            # Add classification to content for indexing
                                            top_label = classification_results[0]["label"] if classification_results else "unknown"
                                            top_confidence = classification_results[0]["confidence"] if classification_results else 0.0
                                            image_metadata["primary_classification"] = top_label
                                            # Add classification with confidence for better searchability
                                            classification_text = f"[Image {i+1} Classification]: {top_label} (confidence: {top_confidence:.2f})"
                                            content_parts.append(classification_text)
                                            logger.info(f"Image {i+1} classified as: {top_label} with confidence {top_confidence:.2f}")
                                            # Add bee classification as a special entity for search
                                            if "bee" in top_label.lower():
                                                # Add multiple variations to ensure it gets picked up by entity extraction
                                                bee_entity_text = f"Bee image classification: {top_label} with confidence {top_confidence:.2f}. This image contains a bee."
                                                content_parts.append(bee_entity_text)
                                                # Also add as standalone entity markers
                                                content_parts.append("Entity: Bee")
                                                content_parts.append("Entity: Insect")
                                                content_parts.append("Entity: Animal")
                                images.append(image_metadata)
                            except Exception as img_error:
                                logger.error(f"Error processing image {i}: {img_error}")
                                images.append({
                                    "index": i,
                                    "error": str(img_error),
                                    "path": image_path
                                })
            except Exception as img_error:
                # Image handling is best-effort; the text content still succeeds.
                logger.warning(f"Image extraction from Word document failed: {img_error}")
            full_content = "\n".join(content_parts)
            return ProcessingResult(
                success=True,
                content=full_content,
                metadata={
                    "file_type": "word",
                    # Heuristic: generated lines all start with '[' so they are
                    # excluded from the paragraph count.
                    "paragraphs": len([p for p in content_parts if not p.startswith('[')]),
                    "tables_count": len(tables),
                    "images_count": len(images)
                },
                tables=tables,
                images=images
            )
        except Exception as e:
            logger.error(f"Word document processing failed: {e}")
            raise

    async def _process_excel(self, file_path: Path) -> ProcessingResult:
        """Process Excel files: one table per non-empty sheet plus a text preview."""
        try:
            workbook = openpyxl.load_workbook(str(file_path))
            content_parts = []
            tables = []
            for sheet_name in workbook.sheetnames:
                sheet = workbook[sheet_name]
                content_parts.append(f"Sheet: {sheet_name}")
                # Extract data from cells
                sheet_data = []
                for row in sheet.iter_rows(values_only=True):
                    if any(cell is not None for cell in row):
                        sheet_data.append([str(cell) if cell is not None else "" for cell in row])
                if sheet_data:
                    tables.append({
                        "data": sheet_data,
                        "sheet": sheet_name,
                        "rows": len(sheet_data),
                        "columns": max(len(row) for row in sheet_data) if sheet_data else 0
                    })
                    # Add sample content (first few rows)
                    sample_rows = min(5, len(sheet_data))
                    for i in range(sample_rows):
                        content_parts.append(" | ".join(sheet_data[i]))
            workbook.close()
            full_content = "\n".join(content_parts)
            return ProcessingResult(
                success=True,
                content=full_content,
                metadata={
                    "file_type": "excel",
                    # NOTE(review): read after workbook.close(); sheetnames stays
                    # accessible in openpyxl, but confirm this ordering is intended.
                    "sheets": len(workbook.sheetnames),
                    "tables_count": len(tables)
                },
                tables=tables
            )
        except Exception as e:
            logger.error(f"Excel processing failed: {e}")
            raise

    async def _process_powerpoint(self, file_path: Path) -> ProcessingResult:
        """Process PowerPoint presentations: text of every shape, slide by slide."""
        try:
            presentation = Presentation(str(file_path))
            content_parts = []
            for i, slide in enumerate(presentation.slides):
                content_parts.append(f"Slide {i + 1}:")
                # Extract text from slide shapes
                slide_text = []
                for shape in slide.shapes:
                    # Not every shape carries text (pictures, charts), hence hasattr.
                    if hasattr(shape, "text") and shape.text.strip():
                        slide_text.append(shape.text)
                if slide_text:
                    content_parts.extend(slide_text)
                content_parts.append("")  # Empty line between slides
            full_content = "\n".join(content_parts)
            return ProcessingResult(
                success=True,
                content=full_content,
                metadata={
                    "file_type": "powerpoint",
                    "slides": len(presentation.slides)
                }
            )
        except Exception as e:
            logger.error(f"PowerPoint processing failed: {e}")
            raise

    async def _process_text(self, file_path: Path) -> ProcessingResult:
        """Process text-based files (TXT, CSV, HTML); all are read as UTF-8."""
        try:
            extension = file_path.suffix.lower()
            if extension == '.csv':
                # Process CSV with pandas
                df = pd.read_csv(file_path)
                content = df.to_string(index=False)
                tables = [{
                    "data": df.values.tolist(),
                    "columns": df.columns.tolist(),
                    "rows": len(df),
                    "columns_count": len(df.columns)
                }]
                return ProcessingResult(
                    success=True,
                    content=content,
                    metadata={"file_type": "csv", "rows": len(df), "columns": len(df.columns)},
                    tables=tables
                )
            elif extension == '.html':
                # Process HTML with BeautifulSoup
                with open(file_path, 'r', encoding='utf-8') as f:
                    html_content = f.read()
                soup = BeautifulSoup(html_content, 'html.parser')
                # Remove script and style elements
                for script in soup(["script", "style"]):
                    script.decompose()
                text = soup.get_text()
                # Normalize whitespace: strip each line, split on double spaces,
                # and drop empty fragments.
                lines = (line.strip() for line in text.splitlines())
                chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
                content = '\n'.join(chunk for chunk in chunks if chunk)
                return ProcessingResult(
                    success=True,
                    content=content,
                    metadata={"file_type": "html"}
                )
            else:  # TXT and other text files
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
                return ProcessingResult(
                    success=True,
                    content=content,
                    metadata={"file_type": "text"}
                )
        except Exception as e:
            logger.error(f"Text file processing failed: {e}")
            raise

    async def _process_image(self, file_path: Path) -> ProcessingResult:
        """Process image files with OCR (text plus layout-based table detection)."""
        try:
            content_parts = []
            tables = []
            images = [{"path": str(file_path), "classification": "processed_with_ocr"}]
            # Always perform OCR on images
            ocr_result = self.ocr_processor.extract_text_from_image(str(file_path))
            if ocr_result["text"].strip():
                content_parts.append(ocr_result["text"])
            # Extract tables from image
            ocr_tables = self.ocr_processor.extract_tables_from_image(str(file_path))
            tables.extend(ocr_tables)
            full_content = "\n".join(content_parts) if content_parts else "No text extracted from image"
            return ProcessingResult(
                success=True,
                content=full_content,
                metadata={
                    "file_type": "image",
                    "ocr_confidence": ocr_result.get("confidence", 0.0),
                    "line_count": ocr_result.get("line_count", 0)
                },
                tables=tables,
                images=images
            )
        except Exception as e:
            logger.error(f"Image processing failed: {e}")
            raise

    def get_supported_formats(self) -> List[str]:
        """Get list of supported file formats"""
        return list(self.supported_extensions)

    async def process_batch(self, file_paths: List[str]) -> List[ProcessingResult]:
        """Process multiple documents concurrently.

        Results keep the input order; any raised exception is converted into
        a failed ProcessingResult rather than propagating.
        """
        tasks = [self.process_document(file_path) for file_path in file_paths]
        results = await asyncio.gather(*tasks, return_exceptions=True)
        # Handle exceptions in results
        processed_results = []
        for result in results:
            if isinstance(result, Exception):
                processed_results.append(ProcessingResult(
                    success=False,
                    content="",
                    metadata={"error": str(result)},
                    error=str(result)
                ))
            else:
                processed_results.append(result)
        return processed_results
# Module-level singleton slot; populated lazily by get_document_processor().
_processor_instance = None


def get_document_processor() -> DocumentProcessor:
    """Return the shared DocumentProcessor, constructing it on first use."""
    global _processor_instance
    if _processor_instance is not None:
        return _processor_instance
    _processor_instance = DocumentProcessor()
    return _processor_instance
async def test_processor():
    """Manual smoke test: process a local sample file and print the outcome."""
    processor = get_document_processor()
    # Test with a sample file (modify path as needed)
    test_file = "test_documents/test_document.txt"
    if not os.path.exists(test_file):
        print("Test file not found")
        return
    result = await processor.process_document(test_file)
    print(f"Success: {result.success}")
    print(f"Content length: {len(result.content)}")
    print(f"Metadata: {result.metadata}")
if __name__ == "__main__":
# Run test
asyncio.run(test_processor())