"""
Isolated OpenCLIP Image Classifier Module

Designed to work alongside PaddleOCR without dependency conflicts
"""
import os
|
|
import logging
|
|
import tempfile
|
|
from typing import List, Dict, Any, Optional
|
|
from pathlib import Path
|
|
import importlib.util
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
class OpenCLIPClassifier:
    """OpenCLIP-based image classifier with isolated dependencies.

    All heavy imports (``open_clip``, ``torch``, ``PIL``) happen lazily inside
    methods so this module can be imported alongside PaddleOCR even when
    OpenCLIP is not installed. In that case ``self.available`` stays ``False``
    and the classify methods return placeholder results instead of raising.
    """

    # Candidate prompts for zero-shot classification in classify_image().
    _ZERO_SHOT_LABELS = [
        "a photo of a bee", "a photo of a flower", "a photo of a person",
        "a photo of a document", "a photo of a chart", "a photo of a diagram",
        "a photo of a table", "a photo of a graph", "a photo of a logo",
        "a photo of a signature", "a photo of a stamp", "a photo of a barcode",
        "a photo of a QR code", "a photo of a screenshot", "a photo of a landscape",
        "a photo of an animal", "a photo of a building", "a photo of a vehicle",
        "a photo of food", "a photo of clothing", "a photo of electronics",
        "a photo of furniture", "a photo of nature", "a photo of art",
        "a photo of text", "a photo of numbers", "a photo of symbols",
    ]

    def __init__(self, model_name: str = "ViT-B-32", pretrained: str = "laion2b_s34b_b79k"):
        """Create the classifier and eagerly try to load the model.

        Args:
            model_name: OpenCLIP architecture name (e.g. "ViT-B-32").
            pretrained: Pretrained-weight tag understood by open_clip.
        """
        self.model_name = model_name
        self.pretrained = pretrained
        self.model = None        # open_clip model, or None when unavailable
        self.processor = None    # eval preprocessing transform from open_clip
        self.available = False   # True only after successful initialization
        self._initialize_classifier()

    def _initialize_classifier(self):
        """Initialize the OpenCLIP model, leaving self.available False on any failure."""
        try:
            # Probe for the package first so a missing dependency is a logged
            # warning rather than an ImportError, and torch is never imported
            # needlessly.
            if importlib.util.find_spec("open_clip") is None:
                logger.warning("OpenCLIP not available, image classification will be disabled")
                return

            import open_clip
            import torch

            logger.info(f"Initializing OpenCLIP with model: {self.model_name}, pretrained: {self.pretrained}")

            # create_model_and_transforms returns (model, train_transform,
            # eval_transform); the eval transform is the inference preprocessor.
            self.model, _, self.processor = open_clip.create_model_and_transforms(
                model_name=self.model_name,
                pretrained=self.pretrained
            )

            # Move to GPU if available.
            if torch.cuda.is_available():
                self.model = self.model.cuda()
                logger.info("OpenCLIP model moved to GPU")
            else:
                logger.info("OpenCLIP model using CPU")

            self.available = True
            logger.info("OpenCLIP classifier initialized successfully")

        except ImportError as e:
            logger.warning(f"OpenCLIP dependencies not available: {e}")
        except Exception as e:
            logger.error(f"Failed to initialize OpenCLIP classifier: {e}")

    def classify_image(self, image_path: str, top_k: int = 5) -> List[Dict[str, Any]]:
        """Classify an image via zero-shot CLIP over _ZERO_SHOT_LABELS.

        Args:
            image_path: Path to the image file.
            top_k: Number of top predictions to return.

        Returns:
            List of {"label", "confidence"} dicts, best first. A single
            placeholder entry is returned when the model is unavailable or
            classification fails.
        """
        if not self.available or not self.model:
            return [{"label": "classification_unavailable", "confidence": 0.0}]

        try:
            # BUG FIX: open_clip must be imported here as well -- it was
            # previously imported only inside _initialize_classifier, so
            # open_clip.tokenize below raised NameError at runtime.
            import open_clip
            import torch
            from PIL import Image

            # Load and preprocess the image into a (1, C, H, W) batch.
            image = Image.open(image_path).convert("RGB")
            image_tensor = self.processor(image).unsqueeze(0)
            if torch.cuda.is_available():
                image_tensor = image_tensor.cuda()

            text_labels = self._ZERO_SHOT_LABELS

            # Inference only -- disable autograd for the whole forward pass.
            with torch.no_grad():
                image_features = self.model.encode_image(image_tensor)
                image_features /= image_features.norm(dim=-1, keepdim=True)

                text_tokens = open_clip.tokenize(text_labels)
                if torch.cuda.is_available():
                    text_tokens = text_tokens.cuda()
                text_features = self.model.encode_text(text_tokens)
                text_features /= text_features.norm(dim=-1, keepdim=True)

                # Cosine similarity scaled by 100, softmaxed into a distribution.
                similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
                values, indices = similarity[0].topk(top_k)

            return [
                {"label": text_labels[index], "confidence": float(value)}
                for value, index in zip(values, indices)
            ]

        except Exception as e:
            logger.error(f"Image classification failed: {e}")
            return [{"label": "classification_error", "confidence": 0.0}]

    def get_detailed_classification(self, image_path: str) -> Dict[str, Any]:
        """Get a classification summary for an image.

        Returns:
            Dict with "available" plus either the standard top-3 predictions
            and "file_path", or an "error" description with empty predictions.
        """
        if not self.available:
            return {
                "available": False,
                "error": "OpenCLIP classifier not available",
                "predictions": []
            }

        try:
            standard_results = self.classify_image(image_path, top_k=3)
            # NOTE(review): an unused table of specialized label prompts
            # (document types / office objects) was removed here -- it was
            # built on every call but never referenced.
            return {
                "available": True,
                "standard_predictions": standard_results,
                "file_path": image_path
            }

        except Exception as e:
            logger.error(f"Detailed classification failed: {e}")
            return {
                "available": False,
                "error": str(e),
                "predictions": []
            }
# Module-level singleton, created lazily by get_image_classifier().
_classifier_instance = None


def get_image_classifier() -> OpenCLIPClassifier:
    """Return the shared OpenCLIPClassifier, constructing it on first call."""
    global _classifier_instance
    instance = _classifier_instance
    if instance is None:
        instance = OpenCLIPClassifier()
        _classifier_instance = instance
    return instance
def test_classifier():
    """Smoke-test the singleton classifier against local sample images."""
    classifier = get_image_classifier()

    if not classifier.available:
        print("OpenCLIP classifier is not available")
        return

    print("OpenCLIP classifier is available")
    # Classify the first sample image that exists on disk, if any.
    candidates = ["test_bee_image.png", "sample_image.jpg"]
    target = next((name for name in candidates if os.path.exists(name)), None)
    if target is None:
        print("No test images found for classification")
        return

    predictions = classifier.classify_image(target)
    print(f"Classification results for {target}:")
    for prediction in predictions:
        print(f"  {prediction['label']}: {prediction['confidence']:.4f}")


if __name__ == "__main__":
    test_classifier()