workspace working

This commit is contained in:
2026-01-12 22:31:11 +08:00
parent 2738a822d1
commit 370fe6368a
149 changed files with 4648 additions and 660 deletions

View File

@@ -30,6 +30,9 @@ if workspace_dir not in sys.path:
sys.path.insert(0, workspace_dir)
from fast_image_classifier import get_image_classifier
# Import optimized OCR processor
from .optimized_ocr_processor import OptimizedOCRProcessor
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
@@ -377,9 +380,11 @@ class DocumentProcessor:
def __init__(self):
self.config = get_config()
self.ocr_processor = OCRProcessor(
self.ocr_processor = OptimizedOCRProcessor(
use_gpu=self.config.performance.USE_GPU,
languages=self.config.document_processing.OCR_LANGUAGES
languages=self.config.document_processing.OCR_LANGUAGES,
batch_size=4, # Process 4 images at a time for better performance
max_workers=2 # Use 2 parallel workers for async operations
)
self.supported_extensions = self.config.document_processing.SUPPORTED_EXTENSIONS
@@ -438,12 +443,15 @@ class DocumentProcessor:
def _extract_and_process_images(self, images: List[Any], file_type: str) -> Tuple[List[Dict[str, Any]], str]:
"""
Extract and process images from documents
Extract and process images from documents with batch OCR processing
Returns processed images metadata and additional content from OCR
"""
processed_images = []
additional_content = []
temp_paths = []
temp_files = []
# Step 1: Save all images to temporary files
for i, image_data in enumerate(images):
temp_path = None
try:
@@ -461,62 +469,95 @@ class DocumentProcessor:
temp_file.write(image_bytes)
temp_path = temp_file.name
# Process image with OCR first, then classify only if no text found
image_metadata = {"path": temp_path, "index": i}
temp_paths.append(temp_path)
temp_files.append((i, temp_path, image_data))
# Step 1: Always run GPU OCR first
if self.ocr_processor.ocr_available:
try:
logger.info(f"Running GPU OCR on image {i+1}")
except Exception as e:
logger.error(f"Error saving image {i} to temporary file: {e}")
processed_images.append({
"index": i,
"error": str(e),
"path": temp_path or "unknown"
})
if not temp_paths:
return processed_images, ""
# Step 2: Batch OCR processing
batch_results = []
if self.ocr_processor.ocr_available:
try:
logger.info(f"Running batch OCR on {len(temp_paths)} images")
batch_results = self.ocr_processor.extract_text_from_images_batch(temp_paths)
logger.info(f"Batch OCR completed for {len(batch_results)} images")
except Exception as e:
logger.error(f"Batch OCR processing failed: {e}")
# Fall back to individual processing
batch_results = []
# Step 3: Process results
for idx, (i, temp_path, image_data) in enumerate(temp_files):
image_metadata = {"path": temp_path, "index": i}
try:
# Get OCR result for this image
ocr_result = None
if batch_results and idx < len(batch_results):
batch_result = batch_results[idx]
ocr_result = {
"text": batch_result.text,
"confidence": batch_result.confidence,
"bboxes": batch_result.bboxes,
"line_count": batch_result.line_count
}
else:
# Fallback to individual OCR
if self.ocr_processor.ocr_available:
ocr_result = self.ocr_processor.extract_text_from_image(temp_path)
logger.info(f"OCR result for image {i+1}: {len(ocr_result['text'])} characters, confidence: {ocr_result['confidence']}")
if ocr_result["text"].strip():
image_metadata["ocr_text"] = ocr_result["text"]
image_metadata["ocr_confidence"] = ocr_result["confidence"]
additional_content.append(f"[Image {i+1} OCR Text]: {ocr_result['text']}")
logger.info(f"Image {i+1} has text content, skipping classification")
else:
logger.info(f"Image {i+1} has no text, proceeding to classification")
# Step 2: Only classify if OCR found no text
if self.image_classifier and self.image_classifier.available:
try:
classification_results = self.image_classifier.classify_image(temp_path, top_k=3)
image_metadata["classification"] = classification_results
# Add classification to content for indexing
top_label = classification_results[0]["label"] if classification_results else "unknown"
top_confidence = classification_results[0]["confidence"] if classification_results else 0.0
image_metadata["primary_classification"] = top_label
# Add classification with confidence for better searchability
classification_text = f"[Image {i+1} Classification]: {top_label} (confidence: {top_confidence:.2f})"
additional_content.append(classification_text)
logger.info(f"Image {i+1} classified as: {top_label} with confidence {top_confidence:.2f}")
# Add bee classification as a special entity for search
if "bee" in top_label.lower():
# Add multiple variations to ensure it gets picked up by entity extraction
bee_entity_text = f"Bee image classification: {top_label} with confidence {top_confidence:.2f}. This image contains a bee."
additional_content.append(bee_entity_text)
# Also add as standalone entity markers
additional_content.append("Entity: Bee")
additional_content.append("Entity: Insect")
additional_content.append("Entity: Animal")
except Exception as classify_error:
logger.error(f"Image classification failed for image {i+1}: {classify_error}")
image_metadata["classification_error"] = str(classify_error)
except Exception as ocr_error:
logger.error(f"OCR processing failed for image {i+1}: {ocr_error}")
image_metadata["ocr_error"] = str(ocr_error)
if ocr_result and ocr_result["text"].strip():
image_metadata["ocr_text"] = ocr_result["text"]
image_metadata["ocr_confidence"] = ocr_result["confidence"]
additional_content.append(f"[Image {i+1} OCR Text]: {ocr_result['text']}")
logger.info(f"Image {i+1} has text content, skipping classification")
else:
logger.info(f"Image {i+1} has no text, proceeding to classification")
# Step 4: Only classify if OCR found no text
if self.image_classifier and self.image_classifier.available:
try:
classification_results = self.image_classifier.classify_image(temp_path, top_k=3)
image_metadata["classification"] = classification_results
# Add classification to content for indexing
top_label = classification_results[0]["label"] if classification_results else "unknown"
top_confidence = classification_results[0]["confidence"] if classification_results else 0.0
image_metadata["primary_classification"] = top_label
# Add classification with confidence for better searchability
classification_text = f"[Image {i+1} Classification]: {top_label} (confidence: {top_confidence:.2f})"
additional_content.append(classification_text)
logger.info(f"Image {i+1} classified as: {top_label} with confidence {top_confidence:.2f}")
# Add bee classification as a special entity for search
if "bee" in top_label.lower():
# Add multiple variations to ensure it gets picked up by entity extraction
bee_entity_text = f"Bee image classification: {top_label} with confidence {top_confidence:.2f}. This image contains a bee."
additional_content.append(bee_entity_text)
# Also add as standalone entity markers
additional_content.append("Entity: Bee")
additional_content.append("Entity: Insect")
additional_content.append("Entity: Animal")
except Exception as classify_error:
logger.error(f"Image classification failed for image {i+1}: {classify_error}")
image_metadata["classification_error"] = str(classify_error)
processed_images.append(image_metadata)
except Exception as e:
logger.error(f"Error processing image {i}: {e}")
processed_images.append({
"index": i,
"error": str(e),
"path": temp_path or "unknown"
"path": temp_path
})
finally:
# Clean up temporary file