# Source: railseek6/optimized_document_processor.py (312 lines, 14 KiB, Python)
"""
Optimized Document Processor with Persistent Classifier
Uses GPU acceleration for both PaddleOCR and OpenCLIP with complete dependency isolation
"""
import os
import sys
import asyncio
import logging
from pathlib import Path
from typing import Dict, Any, List, Optional
import json
# Add paths
sys.path.insert(0, "LightRAG-main")
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
class OptimizedDocumentProcessor:
    """Optimized document processor with GPU acceleration and dependency isolation.

    Pipeline: direct text extraction first (txt/pdf/docx), then image
    extraction, then per-image OCR with OpenCLIP classification used only
    as a fallback when OCR finds no text.
    """

    def __init__(self):
        # Populated by _initialize_components(); classifier_client may
        # legitimately remain None (classification is optional).
        self.ocr_processor = None
        self.classifier_client = None
        self._initialize_components()

    def _initialize_components(self):
        """Initialize OCR and classifier components with dependency isolation.

        Raises:
            RuntimeError: if the GPU-backed PaddleOCR processor cannot be
                created. OCR is mandatory; the classifier is best-effort.
        """
        logger.info("=== INITIALIZING OPTIMIZED DOCUMENT PROCESSOR ===")
        # OCR (PaddleOCR with GPU) is REQUIRED -- no CPU fallback.
        try:
            from simple_ocr_processor import SimpleOCRProcessor
            self.ocr_processor = SimpleOCRProcessor()
            logger.info("✅ OCR processor initialized with GPU")
        except Exception as e:
            logger.error(f"❌ OCR processor GPU initialization failed: {e}")
            raise RuntimeError(f"PaddleOCR GPU is required but failed to initialize: {e}")
        # Persistent classifier (OpenCLIP with GPU) is optional: any failure
        # disables classification instead of aborting initialization.
        try:
            from persistent_classifier_client import PersistentClassifierClient
            self.classifier_client = PersistentClassifierClient()
            if self.classifier_client.available:
                logger.info("✅ Persistent classifier client initialized")
            else:
                logger.warning("⚠️ Persistent classifier not available, image classification disabled")
                self.classifier_client = None
        except Exception as e:
            logger.error(f"❌ Classifier client initialization failed: {e}")
            self.classifier_client = None

    async def process_document(self, file_path: str) -> Dict[str, Any]:
        """Process a document with text-first extraction and image classification.

        Args:
            file_path: Path to the document to process.

        Returns:
            Dict with keys "success", "file_path", "text_content", "images"
            and "metadata" (timings, feature flags, and "error" on failure).
        """
        logger.info(f"📄 Processing document: {file_path}")
        result = {
            "success": False,
            "file_path": file_path,
            "text_content": "",
            "images": [],
            "metadata": {
                "file_type": Path(file_path).suffix.lower(),
                "processing_time": 0,
                # Pre-seeded so consumers never hit a KeyError when an
                # exception aborts processing before these are assigned.
                "text_extracted": False,
                "images_processed": 0,
                "ocr_used": False,
                "classification_used": False,
                "gpu_accelerated": False
            }
        }
        start_time = asyncio.get_event_loop().time()
        try:
            # Step 1: extract text first for all file types.
            text_content = await self._extract_text(file_path)
            result["text_content"] = text_content
            result["metadata"]["text_extracted"] = bool(text_content.strip())
            # Step 2: extract and process embedded images.
            images = await self._extract_images(file_path)
            if images:
                processed_images = await self._process_images(images)
                result["images"] = processed_images
                result["metadata"]["images_processed"] = len(images)
                # Step 2.5: append classification labels to the text so they
                # are picked up by downstream indexing.
                classification_text = self._build_classification_text(processed_images)
                if classification_text:
                    result["text_content"] += "\n\n" + classification_text
                    logger.info("📸 Added image classification metadata to text content")
            # Step 3: record final metadata.
            processing_time = asyncio.get_event_loop().time() - start_time
            result["metadata"]["processing_time"] = processing_time
            result["metadata"]["ocr_used"] = self.ocr_processor is not None
            result["metadata"]["classification_used"] = self.classifier_client is not None
            result["metadata"]["gpu_accelerated"] = True  # Both use GPU when available
            result["success"] = True
            logger.info(f"✅ Document processing completed in {processing_time:.2f}s")
        except Exception as e:
            result["success"] = False
            result["metadata"]["error"] = str(e)
            logger.error(f"❌ Document processing failed: {e}")
        return result

    async def _extract_text(self, file_path: str) -> str:
        """Extract text directly from a document, dispatching on extension.

        Returns:
            The extracted text, or "" when direct extraction is unavailable
            or fails -- the empty string signals the OCR fallback.
        """
        file_ext = Path(file_path).suffix.lower()
        if file_ext in ['.txt']:
            # Plain text file: read as-is.
            with open(file_path, 'r', encoding='utf-8') as f:
                return f.read()
        elif file_ext in ['.pdf']:
            # PDF: extract embedded text layer directly.
            try:
                import PyPDF2
                with open(file_path, 'rb') as f:
                    pdf_reader = PyPDF2.PdfReader(f)
                    text = ""
                    for page in pdf_reader.pages:
                        # extract_text() can return None for image-only pages;
                        # guard so concatenation cannot raise TypeError.
                        text += (page.extract_text() or "") + "\n"
                    return text
            except Exception as e:
                logger.warning(f"PDF text extraction failed, will use OCR: {e}")
                return ""
        elif file_ext in ['.docx']:
            # Word document: read paragraph text directly.
            try:
                from docx import Document
                doc = Document(file_path)
                text = ""
                for paragraph in doc.paragraphs:
                    text += paragraph.text + "\n"
                return text
            except Exception as e:
                logger.warning(f"DOCX text extraction failed, will use OCR: {e}")
                return ""
        else:
            # Unknown file type: defer to OCR.
            logger.info(f"Unknown file type {file_ext}, using OCR")
            return ""

    async def _extract_images(self, file_path: str) -> List[str]:
        """Extract images from a document into ./extracted_images.

        Returns:
            List of image file paths (possibly empty). For PDFs each page is
            rendered to a PNG; a bare image file is returned unchanged.
        """
        file_ext = Path(file_path).suffix.lower()
        output_dir = "extracted_images"
        os.makedirs(output_dir, exist_ok=True)
        if file_ext in ['.pdf']:
            # Render each PDF page to an image for OCR/classification.
            try:
                from pdf2image import convert_from_path
                images = convert_from_path(file_path)
                image_paths = []
                for i, image in enumerate(images):
                    img_path = os.path.join(output_dir, f"pdf_page_{i+1}.png")
                    image.save(img_path, 'PNG')
                    image_paths.append(img_path)
                return image_paths
            except Exception as e:
                logger.warning(f"PDF image extraction failed: {e}")
                return []
        elif file_ext in ['.docx']:
            # Pull embedded pictures out of the Word archive.
            try:
                from word_image_extractor import extract_images_from_docx
                return extract_images_from_docx(file_path, output_dir)
            except Exception as e:
                logger.warning(f"DOCX image extraction failed: {e}")
                return []
        elif file_ext in ['.png', '.jpg', '.jpeg', '.bmp', '.tiff']:
            # The document IS a single image.
            return [file_path]
        else:
            # Other file types carry no extractable images.
            return []

    async def _process_images(self, image_paths: List[str]) -> List[Dict[str, Any]]:
        """Process images OCR-first; classify only those with no detected text.

        Returns:
            One dict per image with keys "path", "ocr_text",
            "ocr_confidence", "classification" and "processing_notes".
        """
        processed_images = []
        for img_path in image_paths:
            image_data = {
                "path": img_path,
                "ocr_text": "",
                "ocr_confidence": 0.0,
                "classification": [],
                "processing_notes": ""
            }
            # Step 1: OCR text extraction (GPU mode). Run in a worker thread
            # so the blocking OCR call does not stall the event loop.
            if self.ocr_processor:
                try:
                    ocr_result = await asyncio.to_thread(self.ocr_processor.extract_text_from_image, img_path)
                    if ocr_result.get("text", "").strip():
                        image_data["ocr_text"] = ocr_result.get("text", "")
                        image_data["ocr_confidence"] = ocr_result.get("confidence", 0.0)
                        image_data["processing_notes"] = "Text extracted via OCR"
                        logger.info(f"📝 OCR extracted {len(image_data['ocr_text'])} chars from {os.path.basename(img_path)}")
                    else:
                        # No text detected - proceed to classification.
                        image_data["processing_notes"] = "No text detected, proceeding to classification"
                        logger.info(f"🖼️ No text in {os.path.basename(img_path)}, using classification")
                except Exception as e:
                    logger.warning(f"OCR failed for {img_path}: {e}")
                    image_data["processing_notes"] = f"OCR failed: {str(e)}"
            # Step 2: classification (GPU mode) - ONLY if no text was found.
            # NOTE(review): classify_image is called synchronously on the
            # event loop; confirm it is fast or move it to a thread too.
            if self.classifier_client and not image_data["ocr_text"].strip():
                try:
                    classification_results = self.classifier_client.classify_image(img_path)
                    image_data["classification"] = classification_results
                    if classification_results:
                        image_data["processing_notes"] += " | Classified via OpenCLIP"
                        logger.info(f"🔍 Classified {os.path.basename(img_path)}: {classification_results[0]['label']}")
                except Exception as e:
                    logger.warning(f"Classification failed for {img_path}: {e}")
                    image_data["processing_notes"] += f" | Classification failed: {str(e)}"
            processed_images.append(image_data)
        return processed_images

    def _build_classification_text(self, processed_images: List[Dict[str, Any]]) -> str:
        """Build a text rendering of classification results for indexing.

        Args:
            processed_images: Output of _process_images.

        Returns:
            A "Image Classifications:" section listing the top label per
            image plus up to three runners-up above 10% confidence, or ""
            when no image was classified.
        """
        classification_lines = []
        for i, img_data in enumerate(processed_images):
            if img_data.get("classification"):
                # Highest-confidence label leads the entry for this image.
                top_classification = img_data["classification"][0]
                label = top_classification["label"]
                confidence = top_classification["confidence"]
                classification_lines.append(f"Image {i+1}: {label} (confidence: {confidence:.1%})")
                # Append up to three runner-up labels above the 10% floor.
                for extra in img_data["classification"][1:4]:
                    if extra["confidence"] > 0.1:
                        classification_lines.append(f"  Also: {extra['label']} (confidence: {extra['confidence']:.1%})")
        if classification_lines:
            return "Image Classifications:\n" + "\n".join(classification_lines)
        return ""
async def test_optimized_processor():
    """Smoke-test the optimized document processor against test.docx.

    Returns:
        The processing result dict, or None when the fixture is missing.
    """
    print("🧪 TESTING OPTIMIZED DOCUMENT PROCESSOR")
    print("=" * 50)
    processor = OptimizedDocumentProcessor()
    # The test fixture must exist next to the script.
    test_file = "test.docx"
    if not os.path.exists(test_file):
        print(f"❌ Test file not found: {test_file}")
        return
    print(f"📄 Processing: {test_file}")
    result = await processor.process_document(test_file)
    print(f"\n📊 PROCESSING RESULTS:")
    # BUGFIX: both ternary branches were empty strings (stripped emoji), so
    # success and failure printed identically. Restore visible markers.
    print(f"   Success: {'✅' if result['success'] else '❌'}")
    print(f"   Processing Time: {result['metadata']['processing_time']:.2f}s")
    # .get: "text_extracted" is absent when processing aborted early.
    print(f"   Text Extracted: {result['metadata'].get('text_extracted', False)}")
    print(f"   Images Processed: {result['metadata'].get('images_processed', 0)}")
    print(f"   OCR Used: {result['metadata']['ocr_used']}")
    print(f"   Classification Used: {result['metadata']['classification_used']}")
    print(f"   GPU Accelerated: {result['metadata']['gpu_accelerated']}")
    if result["text_content"]:
        print(f"\n📝 TEXT CONTENT (first 500 chars):")
        print(result["text_content"][:500] + "..." if len(result["text_content"]) > 500 else result["text_content"])
    if result["images"]:
        print(f"\n🖼️ IMAGE PROCESSING RESULTS:")
        for i, img in enumerate(result["images"]):
            print(f"   Image {i+1}:")
            if img["ocr_text"]:
                print(f"      OCR: {len(img['ocr_text'])} chars, confidence: {img['ocr_confidence']:.3f}")
            if img["classification"]:
                top_result = img["classification"][0]
                print(f"      Classification: {top_result['label']} (confidence: {top_result['confidence']:.3f})")
                # Check for bee detection (expected content of the fixture).
                if "bee" in top_result["label"].lower():
                    print(f"      🎯 BEE DETECTED WITH {top_result['confidence']:.1%} CONFIDENCE!")
    # Performance summary
    print(f"\n⚡ PERFORMANCE SUMMARY:")
    print(f"   Total processing time: {result['metadata']['processing_time']:.2f}s")
    if result["metadata"].get('images_processed', 0) > 0:
        per_image_time = result['metadata']['processing_time'] / result['metadata']['images_processed']
        print(f"   Per image processing time: {per_image_time:.3f}s")
    return result
# Entry point: run the smoke test when executed as a script.
if __name__ == "__main__":
    asyncio.run(test_optimized_processor())