954 lines
42 KiB
Python
954 lines
42 KiB
Python
"""
|
|
Multi-format Document Processing Pipeline for LightRAG
|
|
Supports PDF, images, Office documents, and more with GPU acceleration
|
|
Enhanced with text-first extraction and isolated image classification
|
|
"""
|
|
|
|
import os
|
|
import logging
|
|
import asyncio
|
|
from typing import Dict, List, Any, Optional, Union, Tuple
|
|
from dataclasses import dataclass
|
|
import tempfile
|
|
from pathlib import Path
|
|
|
|
# Import required libraries
|
|
import fitz # PyMuPDF
|
|
import docx
|
|
import openpyxl
|
|
from pptx import Presentation
|
|
from bs4 import BeautifulSoup
|
|
import pandas as pd
|
|
from .production_config import get_config
|
|
|
|
# Import optimized image classifier using subprocess isolation
|
|
import sys
|
|
import os
|
|
# Add the workspace directory to path where fast_image_classifier.py is located
|
|
workspace_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
|
if workspace_dir not in sys.path:
|
|
sys.path.insert(0, workspace_dir)
|
|
from fast_image_classifier import get_image_classifier
|
|
|
|
# Configure logging
|
|
logging.basicConfig(level=logging.INFO)
|
|
logger = logging.getLogger(__name__)
|
|
|
|
@dataclass
class ProcessingResult:
    """Result of document processing"""
    success: bool  # True when the document was processed without a fatal error
    content: str  # extracted text content (may include OCR/classification markers)
    metadata: Dict[str, Any]  # file-type specific details (pages, sheets, OCR flags, ...)
    error: Optional[str] = None  # error message when success is False
    # None (not []) defaults avoid shared mutable state across instances
    tables: Optional[List[Dict[str, Any]]] = None  # extracted tables, if any
    images: Optional[List[Dict[str, Any]]] = None  # per-image metadata, if any
|
class OCRProcessor:
    """GPU-accelerated OCR processing using PaddleOCR with process-per-request isolation"""

    def __init__(self, use_gpu: bool = True, languages: Optional[List[str]] = None):
        """Set up OCR state and probe whether PaddleOCR can be launched.

        Args:
            use_gpu: request GPU acceleration for the OCR subprocesses.
            languages: OCR language codes; defaults to ['en', 'ch'].
        """
        self.use_gpu = use_gpu
        self.languages = languages or ['en', 'ch']
        self.ocr_available = False  # flipped to True by _initialize_ocr on success
        self._temp_dir = None  # scratch dir for generated OCR scripts
        self._initialize_ocr()
|
    def _initialize_ocr(self):
        """Initialize PaddleOCR by testing if it can be loaded.

        Spawns a short-lived child interpreter that imports and constructs
        PaddleOCR, then sets ``self.ocr_available`` from its exit code.  The
        heavyweight import therefore never runs (or crashes) in this process.
        """
        try:
            logger.info("Testing PaddleOCR availability with process-per-request approach")

            # Create a simple test script to verify OCR works
            # (executed in a separate process so a native crash cannot take
            # down this one)
            test_script = """
import sys
import json
from paddleocr import PaddleOCR

try:
    # Test OCR initialization
    ocr = PaddleOCR(use_gpu=True, use_angle_cls=True, lang='en', show_log=False, gpu_mem=2000)
    print("PaddleOCR test: SUCCESS")
    sys.exit(0)
except Exception as e:
    print(f"PaddleOCR test: FAILED - {e}")
    sys.exit(1)
"""

            import tempfile
            import subprocess

            # Create temporary directory; reused later by extract_text_from_image
            self._temp_dir = tempfile.mkdtemp(prefix="paddleocr_")
            script_path = os.path.join(self._temp_dir, "test_ocr.py")

            with open(script_path, 'w') as f:
                f.write(test_script)

            # Run test
            env = os.environ.copy()
            result = subprocess.run(
                [sys.executable, script_path],
                capture_output=True,
                text=True,
                timeout=30,
                env=env
            )

            if result.returncode == 0:
                self.ocr_available = True
                logger.info("PaddleOCR is available for process-per-request OCR")
            else:
                logger.error(f"PaddleOCR test failed: {result.stderr}")
                self.ocr_available = False

        except Exception as e:
            # Any failure (missing package, timeout, fs error) just disables OCR.
            logger.error(f"Failed to initialize OCR processor: {e}")
            self.ocr_available = False
|
def extract_text_from_image(self, image_path: str) -> Dict[str, Any]:
|
|
"""Extract text from image using isolated OCR process per request"""
|
|
if not self.ocr_available:
|
|
return {"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0}
|
|
|
|
try:
|
|
import tempfile
|
|
import subprocess
|
|
import json
|
|
|
|
# OCR script that processes one image and returns JSON result
|
|
ocr_script = """
|
|
import sys
|
|
import json
|
|
from paddleocr import PaddleOCR
|
|
|
|
def extract_text_from_image(image_path):
|
|
try:
|
|
ocr_engine = PaddleOCR(
|
|
use_gpu=True,
|
|
use_angle_cls=True,
|
|
lang='en',
|
|
show_log=False,
|
|
gpu_mem=2000
|
|
)
|
|
|
|
result = ocr_engine.ocr(image_path)
|
|
|
|
if not result or not result[0]:
|
|
return {"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0}
|
|
|
|
extracted_text = []
|
|
bboxes = []
|
|
total_confidence = 0.0
|
|
line_count = 0
|
|
|
|
for line in result[0]:
|
|
try:
|
|
if len(line) == 2:
|
|
bbox, (text, confidence) = line
|
|
elif len(line) >= 1:
|
|
bbox = line[0] if len(line) > 0 else []
|
|
if len(line) > 1:
|
|
if isinstance(line[1], (list, tuple)) and len(line[1]) >= 2:
|
|
text, confidence = line[1][0], line[1][1]
|
|
else:
|
|
text, confidence = str(line[1]) if len(line) > 1 else "", 0.0
|
|
else:
|
|
text, confidence = "", 0.0
|
|
else:
|
|
continue
|
|
|
|
text_str = str(text) if text is not None else ""
|
|
confidence_float = 0.0
|
|
if confidence is not None:
|
|
if isinstance(confidence, (int, float)):
|
|
confidence_float = float(confidence)
|
|
elif isinstance(confidence, str):
|
|
try:
|
|
confidence_float = float(confidence)
|
|
except ValueError:
|
|
confidence_float = 0.0
|
|
else:
|
|
confidence_float = 0.0
|
|
else:
|
|
confidence_float = 0.0
|
|
|
|
extracted_text.append(text_str)
|
|
bboxes.append(bbox)
|
|
total_confidence += confidence_float
|
|
line_count += 1
|
|
|
|
except (TypeError, ValueError, IndexError) as e:
|
|
extracted_text.append("")
|
|
bboxes.append([])
|
|
total_confidence += 0.0
|
|
line_count += 1
|
|
|
|
avg_confidence = total_confidence / line_count if line_count > 0 else 0.0
|
|
full_text = "\\\\n".join(extracted_text)
|
|
|
|
return {
|
|
"text": full_text,
|
|
"confidence": avg_confidence,
|
|
"bboxes": bboxes,
|
|
"line_count": line_count
|
|
}
|
|
except Exception as e:
|
|
return {"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0}
|
|
|
|
# Main execution
|
|
if __name__ == "__main__":
|
|
image_path = sys.argv[1]
|
|
try:
|
|
result = extract_text_from_image(image_path)
|
|
print(json.dumps(result))
|
|
except Exception as e:
|
|
print(json.dumps({"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0, "error": str(e)}))
|
|
"""
|
|
|
|
# Write OCR script
|
|
script_path = os.path.join(self._temp_dir, "ocr_single.py")
|
|
with open(script_path, 'w') as f:
|
|
f.write(ocr_script)
|
|
|
|
# Run OCR process
|
|
env = os.environ.copy()
|
|
result = subprocess.run(
|
|
[sys.executable, script_path, image_path],
|
|
capture_output=True,
|
|
text=True,
|
|
timeout=60, # 60 second timeout for OCR
|
|
env=env
|
|
)
|
|
|
|
if result.returncode == 0:
|
|
try:
|
|
ocr_result = json.loads(result.stdout)
|
|
return ocr_result
|
|
except json.JSONDecodeError:
|
|
logger.error(f"Failed to parse OCR result: {result.stdout}")
|
|
return {"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0}
|
|
else:
|
|
logger.error(f"OCR process failed with return code {result.returncode}: {result.stderr}")
|
|
return {"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0}
|
|
|
|
except subprocess.TimeoutExpired:
|
|
logger.error("OCR processing timeout")
|
|
return {"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0}
|
|
except Exception as e:
|
|
logger.error(f"OCR request failed: {e}")
|
|
return {"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0}
|
|
|
|
def close(self):
|
|
"""Close the OCR process"""
|
|
if self._process:
|
|
try:
|
|
exit_request = {"action": "exit"}
|
|
self._process.stdin.write(json.dumps(exit_request) + '\n')
|
|
self._process.stdin.flush()
|
|
self._process.wait(timeout=5)
|
|
except:
|
|
self._process.terminate()
|
|
finally:
|
|
self._process = None
|
|
|
|
if self._temp_dir and os.path.exists(self._temp_dir):
|
|
import shutil
|
|
try:
|
|
shutil.rmtree(self._temp_dir)
|
|
except:
|
|
pass
|
|
|
|
def __del__(self):
|
|
"""Destructor to ensure cleanup"""
|
|
self.close()
|
|
|
|
def extract_tables_from_image(self, image_path: str) -> List[Dict[str, Any]]:
|
|
"""Extract tables from image using OCR and layout analysis"""
|
|
try:
|
|
# Use OCR to get text with bounding boxes
|
|
ocr_result = self.extract_text_from_image(image_path)
|
|
|
|
# Simple table detection based on text alignment
|
|
tables = self._detect_tables_from_bboxes(ocr_result["bboxes"], ocr_result["text"])
|
|
return tables
|
|
except Exception as e:
|
|
logger.error(f"Table extraction failed: {e}")
|
|
return []
|
|
|
|
def _detect_tables_from_bboxes(self, bboxes: List, text: str) -> List[Dict[str, Any]]:
|
|
"""Detect tables from OCR bounding boxes"""
|
|
tables = []
|
|
|
|
if not bboxes:
|
|
return tables
|
|
|
|
# Group text by rows based on y-coordinates
|
|
rows = {}
|
|
for i, bbox in enumerate(bboxes):
|
|
try:
|
|
# Ensure all points are converted to float with proper error handling
|
|
y_values = []
|
|
for point in bbox:
|
|
if point and len(point) >= 2:
|
|
try:
|
|
# Ensure we convert both coordinates to float with explicit type safety
|
|
y_val = point[1]
|
|
if isinstance(y_val, (int, float)):
|
|
y_values.append(float(y_val))
|
|
elif isinstance(y_val, str):
|
|
y_values.append(float(y_val))
|
|
else:
|
|
logger.warning(f"Unexpected y-coordinate type: {type(y_val)}, value: {y_val}")
|
|
y_values.append(0.0)
|
|
except (TypeError, ValueError) as conv_error:
|
|
logger.warning(f"Type conversion error for y-coordinate {point[1]}: {conv_error}")
|
|
y_values.append(0.0)
|
|
else:
|
|
y_values.append(0.0)
|
|
|
|
# Safe calculation of y_center with explicit float conversion
|
|
try:
|
|
if y_values:
|
|
# Convert all values to float explicitly and handle any remaining type issues
|
|
float_y_values = []
|
|
for val in y_values:
|
|
try:
|
|
float_y_values.append(float(val))
|
|
except (TypeError, ValueError):
|
|
float_y_values.append(0.0)
|
|
y_center = sum(float_y_values) / len(float_y_values)
|
|
else:
|
|
y_center = 0.0
|
|
except (TypeError, ZeroDivisionError) as calc_error:
|
|
logger.warning(f"Error calculating y_center: {calc_error}")
|
|
y_center = 0.0
|
|
|
|
row_key = round(y_center / 10) # Group by 10-pixel rows
|
|
|
|
if row_key not in rows:
|
|
rows[row_key] = []
|
|
# Safe text extraction with bounds checking
|
|
text_lines = text.split('\n')
|
|
row_text = text_lines[i] if i < len(text_lines) else ""
|
|
rows[row_key].append((bbox, row_text))
|
|
except Exception as e:
|
|
logger.warning(f"Error processing bbox {i}: {e}")
|
|
continue
|
|
|
|
# Sort rows and create table structure
|
|
sorted_rows = sorted(rows.keys())
|
|
table_data = []
|
|
|
|
for row_key in sorted_rows:
|
|
try:
|
|
# Ensure all x-coordinates are converted to float with proper error handling
|
|
def get_x_coordinate(item):
|
|
try:
|
|
if (item[0] and len(item[0]) > 0 and
|
|
item[0][0] and len(item[0][0]) > 0):
|
|
# Explicit float conversion with error handling
|
|
x_val = item[0][0][0]
|
|
return float(x_val) if x_val is not None else 0.0
|
|
return 0.0
|
|
except (TypeError, ValueError, IndexError) as x_error:
|
|
logger.warning(f"Error getting x-coordinate: {x_error}")
|
|
return 0.0
|
|
|
|
row_items = sorted(rows[row_key], key=get_x_coordinate)
|
|
row_text = [item[1] for item in row_items]
|
|
table_data.append(row_text)
|
|
except Exception as e:
|
|
logger.warning(f"Error sorting row {row_key}: {e}")
|
|
continue
|
|
|
|
if len(table_data) > 1: # At least 2 rows for a table
|
|
tables.append({
|
|
"data": table_data,
|
|
"rows": len(table_data),
|
|
"columns": max(len(row) for row in table_data) if table_data else 0
|
|
})
|
|
|
|
return tables
|
|
|
|
|
|
class DocumentProcessor:
    """Main document processor for multiple file formats"""

    def __init__(self):
        """Wire up configuration, the OCR helper, and (optionally) the image classifier."""
        self.config = get_config()
        # OCR behavior (GPU use, languages) is driven by the production config.
        self.ocr_processor = OCRProcessor(
            use_gpu=self.config.performance.USE_GPU,
            languages=self.config.document_processing.OCR_LANGUAGES
        )
        self.supported_extensions = self.config.document_processing.SUPPORTED_EXTENSIONS

        # Initialize image classifier if available
        # (get_image_classifier may be falsy if its import failed upstream)
        self.image_classifier = None
        if get_image_classifier:
            try:
                self.image_classifier = get_image_classifier()
                logger.info("Image classifier initialized successfully")
            except Exception as e:
                # Classification is optional; processing continues without it.
                logger.warning(f"Failed to initialize image classifier: {e}")
|
async def process_document(self, file_path: str) -> ProcessingResult:
|
|
"""Process document based on file extension"""
|
|
file_path = Path(file_path)
|
|
|
|
if not file_path.exists():
|
|
return ProcessingResult(
|
|
success=False,
|
|
content="",
|
|
metadata={"error": "File not found"},
|
|
error="File not found"
|
|
)
|
|
|
|
# Determine file type and process accordingly
|
|
extension = file_path.suffix.lower()
|
|
|
|
try:
|
|
if extension in ['.pdf']:
|
|
return await self._process_pdf(file_path)
|
|
elif extension in ['.doc', '.docx']:
|
|
return await self._process_word(file_path)
|
|
elif extension in ['.xls', '.xlsx']:
|
|
return await self._process_excel(file_path)
|
|
elif extension in ['.ppt', '.pptx']:
|
|
return await self._process_powerpoint(file_path)
|
|
elif extension in ['.txt', '.csv', '.html']:
|
|
return await self._process_text(file_path)
|
|
elif extension in ['.jpg', '.jpeg', '.png', '.tiff', '.bmp', '.gif']:
|
|
return await self._process_image(file_path)
|
|
else:
|
|
return ProcessingResult(
|
|
success=False,
|
|
content="",
|
|
metadata={"error": f"Unsupported file type: {extension}"},
|
|
error=f"Unsupported file type: {extension}"
|
|
)
|
|
except Exception as e:
|
|
logger.error(f"Error processing {file_path}: {e}")
|
|
return ProcessingResult(
|
|
success=False,
|
|
content="",
|
|
metadata={"error": str(e)},
|
|
error=str(e)
|
|
)
|
|
|
|
    def _extract_and_process_images(self, images: List[Any], file_type: str) -> Tuple[List[Dict[str, Any]], str]:
        """
        Extract and process images from documents
        Returns processed images metadata and additional content from OCR

        Per image: write it to a temporary PNG, run OCR first, and only when
        OCR finds no text fall back to image classification.

        Args:
            images: raw image objects whose type depends on file_type —
                docx inline_shape for 'word', fitz pixmap for 'pdf',
                otherwise assumed to be raw bytes.
            file_type: source document kind ('word', 'pdf', or other).
        """
        processed_images = []
        additional_content = []

        for i, image_data in enumerate(images):
            temp_path = None
            try:
                # Save image to temporary file (delete=False so OCR can reopen
                # it by path; removed explicitly in the finally block)
                with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as temp_file:
                    if file_type == 'word':
                        # For Word documents, image_data is an inline_shape
                        image_bytes = image_data.image.blob
                    elif file_type == 'pdf':
                        # For PDFs, image_data is a pixmap
                        image_bytes = image_data.tobytes("png")
                    else:
                        image_bytes = image_data

                    temp_file.write(image_bytes)
                    temp_path = temp_file.name

                # Process image with OCR first, then classify only if no text found
                image_metadata = {"path": temp_path, "index": i}

                # Step 1: Always run GPU OCR first
                if self.ocr_processor.ocr_available:
                    try:
                        logger.info(f"Running GPU OCR on image {i+1}")
                        ocr_result = self.ocr_processor.extract_text_from_image(temp_path)
                        logger.info(f"OCR result for image {i+1}: {len(ocr_result['text'])} characters, confidence: {ocr_result['confidence']}")

                        if ocr_result["text"].strip():
                            image_metadata["ocr_text"] = ocr_result["text"]
                            image_metadata["ocr_confidence"] = ocr_result["confidence"]
                            additional_content.append(f"[Image {i+1} OCR Text]: {ocr_result['text']}")
                            logger.info(f"Image {i+1} has text content, skipping classification")
                        else:
                            logger.info(f"Image {i+1} has no text, proceeding to classification")
                            # Step 2: Only classify if OCR found no text
                            if self.image_classifier and self.image_classifier.available:
                                try:
                                    classification_results = self.image_classifier.classify_image(temp_path, top_k=3)
                                    image_metadata["classification"] = classification_results
                                    # Add classification to content for indexing
                                    top_label = classification_results[0]["label"] if classification_results else "unknown"
                                    top_confidence = classification_results[0]["confidence"] if classification_results else 0.0
                                    image_metadata["primary_classification"] = top_label
                                    # Add classification with confidence for better searchability
                                    classification_text = f"[Image {i+1} Classification]: {top_label} (confidence: {top_confidence:.2f})"
                                    additional_content.append(classification_text)
                                    logger.info(f"Image {i+1} classified as: {top_label} with confidence {top_confidence:.2f}")

                                    # Add bee classification as a special entity for search
                                    if "bee" in top_label.lower():
                                        # Add multiple variations to ensure it gets picked up by entity extraction
                                        bee_entity_text = f"Bee image classification: {top_label} with confidence {top_confidence:.2f}. This image contains a bee."
                                        additional_content.append(bee_entity_text)
                                        # Also add as standalone entity markers
                                        additional_content.append("Entity: Bee")
                                        additional_content.append("Entity: Insect")
                                        additional_content.append("Entity: Animal")

                                except Exception as classify_error:
                                    logger.error(f"Image classification failed for image {i+1}: {classify_error}")
                                    image_metadata["classification_error"] = str(classify_error)
                    except Exception as ocr_error:
                        logger.error(f"OCR processing failed for image {i+1}: {ocr_error}")
                        image_metadata["ocr_error"] = str(ocr_error)

                processed_images.append(image_metadata)

            except Exception as e:
                logger.error(f"Error processing image {i}: {e}")
                processed_images.append({
                    "index": i,
                    "error": str(e),
                    "path": temp_path or "unknown"
                })
            finally:
                # Clean up temporary file
                if temp_path and os.path.exists(temp_path):
                    try:
                        os.unlink(temp_path)
                    except Exception as e:
                        logger.warning(f"Failed to delete temporary image file {temp_path}: {e}")

        return processed_images, "\n".join(additional_content)
|
    async def _process_pdf(self, file_path: Path) -> ProcessingResult:
        """Process PDF files with text extraction and OCR fallback.

        Pages with an embedded text layer are read directly; pages without
        one are rendered at 2x resolution and sent through the OCR subprocess.
        """
        pdf_document = None
        try:
            content_parts = []
            tables = []
            images = []
            processed_with_ocr = False  # True once any page needed OCR

            # Open PDF
            pdf_document = fitz.open(str(file_path))
            total_pages = len(pdf_document)

            for page_num in range(total_pages):
                page = pdf_document[page_num]

                # Try text extraction first
                text = page.get_text()
                if text.strip():
                    content_parts.append(f"Page {page_num + 1}:\n{text}")
                else:
                    # Fall back to OCR for scanned pages with higher resolution
                    logger.info(f"Page {page_num + 1} has no text, using high-resolution OCR")
                    # Use higher resolution for better OCR accuracy on scanned documents
                    mat = fitz.Matrix(2, 2)  # 2x resolution for better OCR
                    pix = page.get_pixmap(matrix=mat)
                    img_data = pix.tobytes("png")

                    # Save temporary image for OCR (delete=False so the OCR
                    # subprocess can open it by path; removed in finally)
                    with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as temp_file:
                        temp_file.write(img_data)
                        temp_path = temp_file.name

                    try:
                        if self.ocr_processor.ocr_available:
                            logger.info(f"Running OCR on page {page_num + 1} with high resolution")
                            ocr_result = self.ocr_processor.extract_text_from_image(temp_path)

                            if ocr_result["text"].strip():
                                logger.info(f"OCR extracted {len(ocr_result['text'])} characters from page {page_num + 1}")
                                content_parts.append(f"Page {page_num + 1} (OCR):\n{str(ocr_result['text'])}")
                                processed_with_ocr = True
                            else:
                                logger.warning(f"OCR returned empty text for page {page_num + 1}")
                                # Don't add empty content, just mark as processed
                                content_parts.append(f"Page {page_num + 1}: [Scanned content - no text detected by OCR]")

                            # Extract tables from OCR
                            ocr_tables = self.ocr_processor.extract_tables_from_image(temp_path)
                            if ocr_tables:
                                logger.info(f"Found {len(ocr_tables)} tables on page {page_num + 1}")
                                tables.extend(ocr_tables)
                        else:
                            logger.warning("OCR not available, skipping OCR processing")
                            content_parts.append(f"Page {page_num + 1}: [Image content - OCR not available]")
                    except Exception as ocr_error:
                        logger.error(f"OCR processing failed for page {page_num + 1}: {ocr_error}")
                        content_parts.append(f"Page {page_num + 1}: [Image content - OCR failed: {str(ocr_error)}]")
                    finally:
                        os.unlink(temp_path)

            full_content = "\n\n".join(content_parts)

            return ProcessingResult(
                success=True,
                content=full_content,
                metadata={
                    "pages": total_pages,
                    "file_type": "pdf",
                    "processed_with_ocr": processed_with_ocr
                },
                tables=tables,
                images=images
            )
        except Exception as e:
            logger.error(f"PDF processing failed: {e}")
            raise
        finally:
            # Always release the PyMuPDF document handle, even on failure.
            if pdf_document:
                pdf_document.close()
|
    async def _process_word(self, file_path: Path) -> ProcessingResult:
        """Process Word documents with image extraction and classification.

        Text and tables are read via python-docx.  Embedded images are pulled
        directly out of the .docx zip archive (word/media/) and run through
        OCR, with classification only as a fallback when no text is found.
        """
        try:
            doc = docx.Document(str(file_path))
            content_parts = []
            tables = []
            images = []

            # Extract text from paragraphs first (primary content)
            for para in doc.paragraphs:
                if para.text.strip():
                    content_parts.append(para.text)

            # Extract tables
            for table in doc.tables:
                table_data = []
                for row in table.rows:
                    row_data = [cell.text for cell in row.cells]
                    table_data.append(row_data)

                if table_data:
                    tables.append({
                        "data": table_data,
                        "rows": len(table_data),
                        "columns": max(len(row) for row in table_data) if table_data else 0
                    })

            # Extract and process images using zipfile method
            # (a .docx file is a zip; images live under word/media/)
            try:
                import zipfile
                import os

                # Create temporary directory for extracted images
                with tempfile.TemporaryDirectory() as temp_dir:
                    # Extract images from docx using zipfile
                    with zipfile.ZipFile(str(file_path), 'r') as zip_ref:
                        image_files = []
                        for file_info in zip_ref.filelist:
                            if file_info.filename.startswith('word/media/'):
                                # Extract the image
                                image_filename = os.path.basename(file_info.filename)
                                image_path = os.path.join(temp_dir, image_filename)

                                # Extract and save
                                with zip_ref.open(file_info.filename) as source, open(image_path, 'wb') as target:
                                    target.write(source.read())

                                image_files.append(image_path)
                                logger.info(f"📸 Extracted image: {image_path}")

                        if image_files:
                            logger.info(f"Found {len(image_files)} images in Word document using zipfile method")

                            # Process each extracted image
                            for i, image_path in enumerate(image_files):
                                try:
                                    image_metadata = {"path": image_path, "index": i}

                                    # Step 1: Always run GPU OCR first
                                    if self.ocr_processor.ocr_available:
                                        ocr_result = self.ocr_processor.extract_text_from_image(image_path)
                                        if ocr_result["text"].strip():
                                            image_metadata["ocr_text"] = ocr_result["text"]
                                            image_metadata["ocr_confidence"] = ocr_result["confidence"]
                                            content_parts.append(f"[Image {i+1} OCR Text]: {ocr_result['text']}")
                                            logger.info(f"Image {i+1} has text content, skipping classification")
                                        else:
                                            logger.info(f"Image {i+1} has no text, proceeding to classification")
                                            # Step 2: Only classify if OCR found no text
                                            if self.image_classifier and self.image_classifier.available:
                                                classification_results = self.image_classifier.classify_image(image_path, top_k=3)
                                                image_metadata["classification"] = classification_results
                                                # Add classification to content for indexing
                                                top_label = classification_results[0]["label"] if classification_results else "unknown"
                                                top_confidence = classification_results[0]["confidence"] if classification_results else 0.0
                                                image_metadata["primary_classification"] = top_label
                                                # Add classification with confidence for better searchability
                                                classification_text = f"[Image {i+1} Classification]: {top_label} (confidence: {top_confidence:.2f})"
                                                content_parts.append(classification_text)
                                                logger.info(f"Image {i+1} classified as: {top_label} with confidence {top_confidence:.2f}")

                                                # Add bee classification as a special entity for search
                                                if "bee" in top_label.lower():
                                                    # Add multiple variations to ensure it gets picked up by entity extraction
                                                    bee_entity_text = f"Bee image classification: {top_label} with confidence {top_confidence:.2f}. This image contains a bee."
                                                    content_parts.append(bee_entity_text)
                                                    # Also add as standalone entity markers
                                                    content_parts.append("Entity: Bee")
                                                    content_parts.append("Entity: Insect")
                                                    content_parts.append("Entity: Animal")

                                    images.append(image_metadata)

                                except Exception as img_error:
                                    logger.error(f"Error processing image {i}: {img_error}")
                                    images.append({
                                        "index": i,
                                        "error": str(img_error),
                                        "path": image_path
                                    })
            except Exception as img_error:
                # Image handling is best-effort; text/tables are still returned.
                logger.warning(f"Image extraction from Word document failed: {img_error}")

            full_content = "\n".join(content_parts)

            return ProcessingResult(
                success=True,
                content=full_content,
                metadata={
                    "file_type": "word",
                    # Heuristic: entries starting with '[' are image markers,
                    # everything else is a real paragraph.
                    "paragraphs": len([p for p in content_parts if not p.startswith('[')]),
                    "tables_count": len(tables),
                    "images_count": len(images)
                },
                tables=tables,
                images=images
            )
        except Exception as e:
            logger.error(f"Word document processing failed: {e}")
            raise
|
async def _process_excel(self, file_path: Path) -> ProcessingResult:
|
|
"""Process Excel files"""
|
|
try:
|
|
workbook = openpyxl.load_workbook(str(file_path))
|
|
content_parts = []
|
|
tables = []
|
|
|
|
for sheet_name in workbook.sheetnames:
|
|
sheet = workbook[sheet_name]
|
|
content_parts.append(f"Sheet: {sheet_name}")
|
|
|
|
# Extract data from cells
|
|
sheet_data = []
|
|
for row in sheet.iter_rows(values_only=True):
|
|
if any(cell is not None for cell in row):
|
|
sheet_data.append([str(cell) if cell is not None else "" for cell in row])
|
|
|
|
if sheet_data:
|
|
tables.append({
|
|
"data": sheet_data,
|
|
"sheet": sheet_name,
|
|
"rows": len(sheet_data),
|
|
"columns": max(len(row) for row in sheet_data) if sheet_data else 0
|
|
})
|
|
|
|
# Add sample content (first few rows)
|
|
sample_rows = min(5, len(sheet_data))
|
|
for i in range(sample_rows):
|
|
content_parts.append(" | ".join(sheet_data[i]))
|
|
|
|
workbook.close()
|
|
full_content = "\n".join(content_parts)
|
|
|
|
return ProcessingResult(
|
|
success=True,
|
|
content=full_content,
|
|
metadata={
|
|
"file_type": "excel",
|
|
"sheets": len(workbook.sheetnames),
|
|
"tables_count": len(tables)
|
|
},
|
|
tables=tables
|
|
)
|
|
except Exception as e:
|
|
logger.error(f"Excel processing failed: {e}")
|
|
raise
|
|
|
|
async def _process_powerpoint(self, file_path: Path) -> ProcessingResult:
|
|
"""Process PowerPoint presentations"""
|
|
try:
|
|
presentation = Presentation(str(file_path))
|
|
content_parts = []
|
|
|
|
for i, slide in enumerate(presentation.slides):
|
|
content_parts.append(f"Slide {i + 1}:")
|
|
|
|
# Extract text from slide shapes
|
|
slide_text = []
|
|
for shape in slide.shapes:
|
|
if hasattr(shape, "text") and shape.text.strip():
|
|
slide_text.append(shape.text)
|
|
|
|
if slide_text:
|
|
content_parts.extend(slide_text)
|
|
content_parts.append("") # Empty line between slides
|
|
|
|
full_content = "\n".join(content_parts)
|
|
|
|
return ProcessingResult(
|
|
success=True,
|
|
content=full_content,
|
|
metadata={
|
|
"file_type": "powerpoint",
|
|
"slides": len(presentation.slides)
|
|
}
|
|
)
|
|
except Exception as e:
|
|
logger.error(f"PowerPoint processing failed: {e}")
|
|
raise
|
|
|
|
async def _process_text(self, file_path: Path) -> ProcessingResult:
|
|
"""Process text-based files (TXT, CSV, HTML)"""
|
|
try:
|
|
extension = file_path.suffix.lower()
|
|
|
|
if extension == '.csv':
|
|
# Process CSV with pandas
|
|
df = pd.read_csv(file_path)
|
|
content = df.to_string(index=False)
|
|
tables = [{
|
|
"data": df.values.tolist(),
|
|
"columns": df.columns.tolist(),
|
|
"rows": len(df),
|
|
"columns_count": len(df.columns)
|
|
}]
|
|
|
|
return ProcessingResult(
|
|
success=True,
|
|
content=content,
|
|
metadata={"file_type": "csv", "rows": len(df), "columns": len(df.columns)},
|
|
tables=tables
|
|
)
|
|
|
|
elif extension == '.html':
|
|
# Process HTML with BeautifulSoup
|
|
with open(file_path, 'r', encoding='utf-8') as f:
|
|
html_content = f.read()
|
|
|
|
soup = BeautifulSoup(html_content, 'html.parser')
|
|
|
|
# Remove script and style elements
|
|
for script in soup(["script", "style"]):
|
|
script.decompose()
|
|
|
|
text = soup.get_text()
|
|
lines = (line.strip() for line in text.splitlines())
|
|
chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
|
|
content = '\n'.join(chunk for chunk in chunks if chunk)
|
|
|
|
return ProcessingResult(
|
|
success=True,
|
|
content=content,
|
|
metadata={"file_type": "html"}
|
|
)
|
|
|
|
else: # TXT and other text files
|
|
with open(file_path, 'r', encoding='utf-8') as f:
|
|
content = f.read()
|
|
|
|
return ProcessingResult(
|
|
success=True,
|
|
content=content,
|
|
metadata={"file_type": "text"}
|
|
)
|
|
except Exception as e:
|
|
logger.error(f"Text file processing failed: {e}")
|
|
raise
|
|
|
|
async def _process_image(self, file_path: Path) -> ProcessingResult:
|
|
"""Process image files with OCR"""
|
|
try:
|
|
content_parts = []
|
|
tables = []
|
|
images = [{"path": str(file_path), "classification": "processed_with_ocr"}]
|
|
|
|
# Always perform OCR on images
|
|
ocr_result = self.ocr_processor.extract_text_from_image(str(file_path))
|
|
if ocr_result["text"].strip():
|
|
content_parts.append(ocr_result["text"])
|
|
|
|
# Extract tables from image
|
|
ocr_tables = self.ocr_processor.extract_tables_from_image(str(file_path))
|
|
tables.extend(ocr_tables)
|
|
|
|
full_content = "\n".join(content_parts) if content_parts else "No text extracted from image"
|
|
|
|
return ProcessingResult(
|
|
success=True,
|
|
content=full_content,
|
|
metadata={
|
|
"file_type": "image",
|
|
"ocr_confidence": ocr_result.get("confidence", 0.0),
|
|
"line_count": ocr_result.get("line_count", 0)
|
|
},
|
|
tables=tables,
|
|
images=images
|
|
)
|
|
except Exception as e:
|
|
logger.error(f"Image processing failed: {e}")
|
|
raise
|
|
|
|
def get_supported_formats(self) -> List[str]:
|
|
"""Get list of supported file formats"""
|
|
return list(self.supported_extensions)
|
|
|
|
async def process_batch(self, file_paths: List[str]) -> List[ProcessingResult]:
|
|
"""Process multiple documents in batch"""
|
|
tasks = [self.process_document(file_path) for file_path in file_paths]
|
|
results = await asyncio.gather(*tasks, return_exceptions=True)
|
|
|
|
# Handle exceptions in results
|
|
processed_results = []
|
|
for result in results:
|
|
if isinstance(result, Exception):
|
|
processed_results.append(ProcessingResult(
|
|
success=False,
|
|
content="",
|
|
metadata={"error": str(result)},
|
|
error=str(result)
|
|
))
|
|
else:
|
|
processed_results.append(result)
|
|
|
|
return processed_results
|
|
|
|
# Singleton instance
# Lazily created by get_document_processor(); module-global on purpose.
_processor_instance: Optional[DocumentProcessor] = None
|
def get_document_processor() -> DocumentProcessor:
    """Get singleton document processor instance.

    Lazily constructs the DocumentProcessor on first call and reuses it
    afterwards.  No locking is performed around the lazy init.
    """
    global _processor_instance
    if _processor_instance is None:
        _processor_instance = DocumentProcessor()
    return _processor_instance
|
async def test_processor():
    """Test function for document processor"""
    processor = get_document_processor()

    # Test with a sample file (modify path as needed)
    test_file = "test_documents/test_document.txt"
    if not os.path.exists(test_file):
        print("Test file not found")
        return

    result = await processor.process_document(test_file)
    print(f"Success: {result.success}")
    print(f"Content length: {len(result.content)}")
    print(f"Metadata: {result.metadata}")
|
if __name__ == "__main__":
    # Run test
    # Manual smoke test entry point; drives the async pipeline once.
    asyncio.run(test_processor())