workspace working
This commit is contained in:
@@ -30,6 +30,9 @@ if workspace_dir not in sys.path:
|
||||
sys.path.insert(0, workspace_dir)
|
||||
from fast_image_classifier import get_image_classifier
|
||||
|
||||
# Import optimized OCR processor
|
||||
from .optimized_ocr_processor import OptimizedOCRProcessor
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -377,9 +380,11 @@ class DocumentProcessor:
|
||||
|
||||
def __init__(self):
|
||||
self.config = get_config()
|
||||
self.ocr_processor = OCRProcessor(
|
||||
self.ocr_processor = OptimizedOCRProcessor(
|
||||
use_gpu=self.config.performance.USE_GPU,
|
||||
languages=self.config.document_processing.OCR_LANGUAGES
|
||||
languages=self.config.document_processing.OCR_LANGUAGES,
|
||||
batch_size=4, # Process 4 images at a time for better performance
|
||||
max_workers=2 # Use 2 parallel workers for async operations
|
||||
)
|
||||
self.supported_extensions = self.config.document_processing.SUPPORTED_EXTENSIONS
|
||||
|
||||
@@ -438,12 +443,15 @@ class DocumentProcessor:
|
||||
|
||||
def _extract_and_process_images(self, images: List[Any], file_type: str) -> Tuple[List[Dict[str, Any]], str]:
|
||||
"""
|
||||
Extract and process images from documents
|
||||
Extract and process images from documents with batch OCR processing
|
||||
Returns processed images metadata and additional content from OCR
|
||||
"""
|
||||
processed_images = []
|
||||
additional_content = []
|
||||
temp_paths = []
|
||||
temp_files = []
|
||||
|
||||
# Step 1: Save all images to temporary files
|
||||
for i, image_data in enumerate(images):
|
||||
temp_path = None
|
||||
try:
|
||||
@@ -461,62 +469,95 @@ class DocumentProcessor:
|
||||
temp_file.write(image_bytes)
|
||||
temp_path = temp_file.name
|
||||
|
||||
# Process image with OCR first, then classify only if no text found
|
||||
image_metadata = {"path": temp_path, "index": i}
|
||||
temp_paths.append(temp_path)
|
||||
temp_files.append((i, temp_path, image_data))
|
||||
|
||||
# Step 1: Always run GPU OCR first
|
||||
if self.ocr_processor.ocr_available:
|
||||
try:
|
||||
logger.info(f"Running GPU OCR on image {i+1}")
|
||||
except Exception as e:
|
||||
logger.error(f"Error saving image {i} to temporary file: {e}")
|
||||
processed_images.append({
|
||||
"index": i,
|
||||
"error": str(e),
|
||||
"path": temp_path or "unknown"
|
||||
})
|
||||
|
||||
if not temp_paths:
|
||||
return processed_images, ""
|
||||
|
||||
# Step 2: Batch OCR processing
|
||||
batch_results = []
|
||||
if self.ocr_processor.ocr_available:
|
||||
try:
|
||||
logger.info(f"Running batch OCR on {len(temp_paths)} images")
|
||||
batch_results = self.ocr_processor.extract_text_from_images_batch(temp_paths)
|
||||
logger.info(f"Batch OCR completed for {len(batch_results)} images")
|
||||
except Exception as e:
|
||||
logger.error(f"Batch OCR processing failed: {e}")
|
||||
# Leave batch_results empty so the per-image loop below falls back to individual OCR
|
||||
batch_results = []
|
||||
|
||||
# Step 3: Process results
|
||||
for idx, (i, temp_path, image_data) in enumerate(temp_files):
|
||||
image_metadata = {"path": temp_path, "index": i}
|
||||
|
||||
try:
|
||||
# Get OCR result for this image
|
||||
ocr_result = None
|
||||
if batch_results and idx < len(batch_results):
|
||||
batch_result = batch_results[idx]
|
||||
ocr_result = {
|
||||
"text": batch_result.text,
|
||||
"confidence": batch_result.confidence,
|
||||
"bboxes": batch_result.bboxes,
|
||||
"line_count": batch_result.line_count
|
||||
}
|
||||
else:
|
||||
# Fallback to individual OCR
|
||||
if self.ocr_processor.ocr_available:
|
||||
ocr_result = self.ocr_processor.extract_text_from_image(temp_path)
|
||||
logger.info(f"OCR result for image {i+1}: {len(ocr_result['text'])} characters, confidence: {ocr_result['confidence']}")
|
||||
|
||||
if ocr_result["text"].strip():
|
||||
image_metadata["ocr_text"] = ocr_result["text"]
|
||||
image_metadata["ocr_confidence"] = ocr_result["confidence"]
|
||||
additional_content.append(f"[Image {i+1} OCR Text]: {ocr_result['text']}")
|
||||
logger.info(f"Image {i+1} has text content, skipping classification")
|
||||
else:
|
||||
logger.info(f"Image {i+1} has no text, proceeding to classification")
|
||||
# Step 2: Only classify if OCR found no text
|
||||
if self.image_classifier and self.image_classifier.available:
|
||||
try:
|
||||
classification_results = self.image_classifier.classify_image(temp_path, top_k=3)
|
||||
image_metadata["classification"] = classification_results
|
||||
# Add classification to content for indexing
|
||||
top_label = classification_results[0]["label"] if classification_results else "unknown"
|
||||
top_confidence = classification_results[0]["confidence"] if classification_results else 0.0
|
||||
image_metadata["primary_classification"] = top_label
|
||||
# Add classification with confidence for better searchability
|
||||
classification_text = f"[Image {i+1} Classification]: {top_label} (confidence: {top_confidence:.2f})"
|
||||
additional_content.append(classification_text)
|
||||
logger.info(f"Image {i+1} classified as: {top_label} with confidence {top_confidence:.2f}")
|
||||
|
||||
# Add bee classification as a special entity for search
|
||||
if "bee" in top_label.lower():
|
||||
# Add multiple variations to ensure it gets picked up by entity extraction
|
||||
bee_entity_text = f"Bee image classification: {top_label} with confidence {top_confidence:.2f}. This image contains a bee."
|
||||
additional_content.append(bee_entity_text)
|
||||
# Also add as standalone entity markers
|
||||
additional_content.append("Entity: Bee")
|
||||
additional_content.append("Entity: Insect")
|
||||
additional_content.append("Entity: Animal")
|
||||
|
||||
except Exception as classify_error:
|
||||
logger.error(f"Image classification failed for image {i+1}: {classify_error}")
|
||||
image_metadata["classification_error"] = str(classify_error)
|
||||
except Exception as ocr_error:
|
||||
logger.error(f"OCR processing failed for image {i+1}: {ocr_error}")
|
||||
image_metadata["ocr_error"] = str(ocr_error)
|
||||
|
||||
if ocr_result and ocr_result["text"].strip():
|
||||
image_metadata["ocr_text"] = ocr_result["text"]
|
||||
image_metadata["ocr_confidence"] = ocr_result["confidence"]
|
||||
additional_content.append(f"[Image {i+1} OCR Text]: {ocr_result['text']}")
|
||||
logger.info(f"Image {i+1} has text content, skipping classification")
|
||||
else:
|
||||
logger.info(f"Image {i+1} has no text, proceeding to classification")
|
||||
# Step 4: Only classify if OCR found no text
|
||||
if self.image_classifier and self.image_classifier.available:
|
||||
try:
|
||||
classification_results = self.image_classifier.classify_image(temp_path, top_k=3)
|
||||
image_metadata["classification"] = classification_results
|
||||
# Add classification to content for indexing
|
||||
top_label = classification_results[0]["label"] if classification_results else "unknown"
|
||||
top_confidence = classification_results[0]["confidence"] if classification_results else 0.0
|
||||
image_metadata["primary_classification"] = top_label
|
||||
# Add classification with confidence for better searchability
|
||||
classification_text = f"[Image {i+1} Classification]: {top_label} (confidence: {top_confidence:.2f})"
|
||||
additional_content.append(classification_text)
|
||||
logger.info(f"Image {i+1} classified as: {top_label} with confidence {top_confidence:.2f}")
|
||||
|
||||
# Add bee classification as a special entity for search
|
||||
if "bee" in top_label.lower():
|
||||
# Add multiple variations to ensure it gets picked up by entity extraction
|
||||
bee_entity_text = f"Bee image classification: {top_label} with confidence {top_confidence:.2f}. This image contains a bee."
|
||||
additional_content.append(bee_entity_text)
|
||||
# Also add as standalone entity markers
|
||||
additional_content.append("Entity: Bee")
|
||||
additional_content.append("Entity: Insect")
|
||||
additional_content.append("Entity: Animal")
|
||||
|
||||
except Exception as classify_error:
|
||||
logger.error(f"Image classification failed for image {i+1}: {classify_error}")
|
||||
image_metadata["classification_error"] = str(classify_error)
|
||||
|
||||
processed_images.append(image_metadata)
|
||||
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing image {i}: {e}")
|
||||
processed_images.append({
|
||||
"index": i,
|
||||
"error": str(e),
|
||||
"path": temp_path or "unknown"
|
||||
"path": temp_path
|
||||
})
|
||||
finally:
|
||||
# Clean up temporary file
|
||||
|
||||
Reference in New Issue
Block a user