workspace working

2026-01-12 22:31:11 +08:00
parent 2738a822d1
commit 370fe6368a
149 changed files with 4648 additions and 660 deletions
--- a/LightRAG-main/lightrag/document_processor.py
+++ b/LightRAG-main/lightrag/document_processor.py
@@ -30,6 +30,9 @@ if workspace_dir not in sys.path:
    sys.path.insert(0, workspace_dir)
 from fast_image_classifier import get_image_classifier

+# Import optimized OCR processor
+from .optimized_ocr_processor import OptimizedOCRProcessor
+
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -377,9 +380,11 @@ class DocumentProcessor:
    
    def __init__(self):
        self.config = get_config()
-        self.ocr_processor = OCRProcessor(
+        self.ocr_processor = OptimizedOCRProcessor(
            use_gpu=self.config.performance.USE_GPU,
-            languages=self.config.document_processing.OCR_LANGUAGES
+            languages=self.config.document_processing.OCR_LANGUAGES,
+            batch_size=4,  # Process 4 images at a time for better performance
+            max_workers=2   # Use 2 parallel workers for async operations
        )
        self.supported_extensions = self.config.document_processing.SUPPORTED_EXTENSIONS
        
@@ -438,12 +443,15 @@ class DocumentProcessor:
    
    def _extract_and_process_images(self, images: List[Any], file_type: str) -> Tuple[List[Dict[str, Any]], str]:
        """
-        Extract and process images from documents
+        Extract and process images from documents with batch OCR processing
        Returns processed images metadata and additional content from OCR
        """
        processed_images = []
        additional_content = []
+        temp_paths = []
+        temp_files = []
        
+        # Step 1: Save all images to temporary files
        for i, image_data in enumerate(images):
            temp_path = None
            try:
@@ -461,62 +469,95 @@ class DocumentProcessor:
                    temp_file.write(image_bytes)
                    temp_path = temp_file.name
                
-                # Process image with OCR first, then classify only if no text found
-                image_metadata = {"path": temp_path, "index": i}
+                temp_paths.append(temp_path)
+                temp_files.append((i, temp_path, image_data))
                
-                # Step 1: Always run GPU OCR first
-                if self.ocr_processor.ocr_available:
-                    try:
-                        logger.info(f"Running GPU OCR on image {i+1}")
+            except Exception as e:
+                logger.error(f"Error saving image {i} to temporary file: {e}")
+                processed_images.append({
+                    "index": i,
+                    "error": str(e),
+                    "path": temp_path or "unknown"
+                })
+        
+        if not temp_paths:
+            return processed_images, ""
+        
+        # Step 2: Batch OCR processing
+        batch_results = []
+        if self.ocr_processor.ocr_available:
+            try:
+                logger.info(f"Running batch OCR on {len(temp_paths)} images")
+                batch_results = self.ocr_processor.extract_text_from_images_batch(temp_paths)
+                logger.info(f"Batch OCR completed for {len(batch_results)} images")
+            except Exception as e:
+                logger.error(f"Batch OCR processing failed: {e}")
+                # Leave batch_results empty so Step 3 falls back to per-image OCR
+                batch_results = []
+        
+        # Step 3: Process results
+        for idx, (i, temp_path, image_data) in enumerate(temp_files):
+            image_metadata = {"path": temp_path, "index": i}
+            
+            try:
+                # Get OCR result for this image
+                ocr_result = None
+                if batch_results and idx < len(batch_results):
+                    batch_result = batch_results[idx]
+                    ocr_result = {
+                        "text": batch_result.text,
+                        "confidence": batch_result.confidence,
+                        "bboxes": batch_result.bboxes,
+                        "line_count": batch_result.line_count
+                    }
+                else:
+                    # Fall back to per-image OCR when no batch result exists for this image
+                    if self.ocr_processor.ocr_available:
                        ocr_result = self.ocr_processor.extract_text_from_image(temp_path)
-                        logger.info(f"OCR result for image {i+1}: {len(ocr_result['text'])} characters, confidence: {ocr_result['confidence']}")
-                        
-                        if ocr_result["text"].strip():
-                            image_metadata["ocr_text"] = ocr_result["text"]
-                            image_metadata["ocr_confidence"] = ocr_result["confidence"]
-                            additional_content.append(f"[Image {i+1} OCR Text]: {ocr_result['text']}")
-                            logger.info(f"Image {i+1} has text content, skipping classification")
-                        else:
-                            logger.info(f"Image {i+1} has no text, proceeding to classification")
-                            # Step 2: Only classify if OCR found no text
-                            if self.image_classifier and self.image_classifier.available:
-                                try:
-                                    classification_results = self.image_classifier.classify_image(temp_path, top_k=3)
-                                    image_metadata["classification"] = classification_results
-                                    # Add classification to content for indexing
-                                    top_label = classification_results[0]["label"] if classification_results else "unknown"
-                                    top_confidence = classification_results[0]["confidence"] if classification_results else 0.0
-                                    image_metadata["primary_classification"] = top_label
-                                    # Add classification with confidence for better searchability
-                                    classification_text = f"[Image {i+1} Classification]: {top_label} (confidence: {top_confidence:.2f})"
-                                    additional_content.append(classification_text)
-                                    logger.info(f"Image {i+1} classified as: {top_label} with confidence {top_confidence:.2f}")
-                                    
-                                    # Add bee classification as a special entity for search
-                                    if "bee" in top_label.lower():
-                                        # Add multiple variations to ensure it gets picked up by entity extraction
-                                        bee_entity_text = f"Bee image classification: {top_label} with confidence {top_confidence:.2f}. This image contains a bee."
-                                        additional_content.append(bee_entity_text)
-                                        # Also add as standalone entity markers
-                                        additional_content.append("Entity: Bee")
-                                        additional_content.append("Entity: Insect")
-                                        additional_content.append("Entity: Animal")
-                                        
-                                except Exception as classify_error:
-                                    logger.error(f"Image classification failed for image {i+1}: {classify_error}")
-                                    image_metadata["classification_error"] = str(classify_error)
-                    except Exception as ocr_error:
-                        logger.error(f"OCR processing failed for image {i+1}: {ocr_error}")
-                        image_metadata["ocr_error"] = str(ocr_error)
+                
+                if ocr_result and ocr_result["text"].strip():
+                    image_metadata["ocr_text"] = ocr_result["text"]
+                    image_metadata["ocr_confidence"] = ocr_result["confidence"]
+                    additional_content.append(f"[Image {i+1} OCR Text]: {ocr_result['text']}")
+                    logger.info(f"Image {i+1} has text content, skipping classification")
+                else:
+                    logger.info(f"Image {i+1} has no text, proceeding to classification")
+                    # Step 4: Only classify if OCR found no text
+                    if self.image_classifier and self.image_classifier.available:
+                        try:
+                            classification_results = self.image_classifier.classify_image(temp_path, top_k=3)
+                            image_metadata["classification"] = classification_results
+                            # Add classification to content for indexing
+                            top_label = classification_results[0]["label"] if classification_results else "unknown"
+                            top_confidence = classification_results[0]["confidence"] if classification_results else 0.0
+                            image_metadata["primary_classification"] = top_label
+                            # Add classification with confidence for better searchability
+                            classification_text = f"[Image {i+1} Classification]: {top_label} (confidence: {top_confidence:.2f})"
+                            additional_content.append(classification_text)
+                            logger.info(f"Image {i+1} classified as: {top_label} with confidence {top_confidence:.2f}")
+                            
+                            # Add bee classification as a special entity for search
+                            if "bee" in top_label.lower():
+                                # Add multiple variations to ensure it gets picked up by entity extraction
+                                bee_entity_text = f"Bee image classification: {top_label} with confidence {top_confidence:.2f}. This image contains a bee."
+                                additional_content.append(bee_entity_text)
+                                # Also add as standalone entity markers
+                                additional_content.append("Entity: Bee")
+                                additional_content.append("Entity: Insect")
+                                additional_content.append("Entity: Animal")
+                                
+                        except Exception as classify_error:
+                            logger.error(f"Image classification failed for image {i+1}: {classify_error}")
+                            image_metadata["classification_error"] = str(classify_error)
                
                processed_images.append(image_metadata)
-                    
+                
            except Exception as e:
                logger.error(f"Error processing image {i}: {e}")
                processed_images.append({
                    "index": i,
                    "error": str(e),
-                    "path": temp_path or "unknown"
+                    "path": temp_path
                })
            finally:
                # Clean up temporary file