# railseek6/fixed_ocr_classifier.py

"""
Fixed OCR and Image Classification with Complete Dependency Isolation
Direct subprocess communication without file-based JSON parsing
"""

import os
import sys
import subprocess
import tempfile
import asyncio
from pathlib import Path

class FixedOCRProcessor:
    """Run PaddleOCR in a child Python process and parse what it prints.

    PaddleOCR is never imported into the calling process. Each operation
    spawns ``sys.executable -c <script>`` and scans the child's stdout for
    marker-prefixed lines; any other output (library logging, progress
    bars) is ignored.

    Instance fields:
        available -- True once the import probe in ``_initialize`` printed
                     the ready marker; False otherwise.
    """

    # Markers of the line-based stdout protocol shared with the child scripts.
    _READY_MARKER = "OCR_READY"
    _ERROR_PREFIX = "OCR_ERROR:"
    _RESULT_PREFIX = "OCR_RESULT_JSON:"

    # Child script: only checks that paddleocr can be imported.
    _PROBE_SCRIPT = """
try:
    from paddleocr import PaddleOCR
    print("OCR_READY")
except Exception as e:
    print(f"OCR_ERROR:{e}")
"""

    # Child script: runs OCR on sys.argv[1]. The path is passed as an
    # argument instead of being pasted into the source, so quotes or
    # backslashes in it cannot break (or inject code into) the script.
    # The result is emitted as ASCII-only JSON on a single line, which keeps
    # it intact regardless of the pipe encoding or characters in the text.
    _OCR_SCRIPT = """
import json
import sys


def one_line_ascii(message):
    flat = " ".join(str(message).split())
    return flat.encode("ascii", "backslashreplace").decode("ascii")


def main():
    from paddleocr import PaddleOCR

    ocr = PaddleOCR(use_gpu=True, cls=True)
    result = ocr.ocr(sys.argv[1])

    if not result or not result[0]:
        print("OCR_RESULT:EMPTY")
        return

    texts = []
    total_confidence = 0.0

    for entry in result[0]:
        text, confidence = "", 0.0
        try:
            if len(entry) == 0:
                continue
            if len(entry) > 1:
                payload = entry[1]
                if isinstance(payload, (list, tuple)) and len(payload) >= 2:
                    text, confidence = payload[0], payload[1]
                else:
                    text = str(payload)
            text = str(text) if text is not None else ""
            if isinstance(confidence, (int, float)):
                confidence = float(confidence)
            else:
                confidence = 0.0
        except Exception:
            # A malformed entry still counts as a (blank) line.
            text, confidence = "", 0.0
        texts.append(text)
        total_confidence += confidence

    line_count = len(texts)
    avg_confidence = total_confidence / line_count if line_count > 0 else 0.0
    print("OCR_RESULT_JSON:" + json.dumps({
        "text": " ".join(texts),
        "confidence": avg_confidence,
        "line_count": line_count,
    }))


try:
    main()
except Exception as e:
    print("OCR_ERROR:" + one_line_ascii(e))
"""

    def __init__(self):
        self.available = False
        self._initialize()

    @staticmethod
    def _empty_result():
        """Return a fresh "nothing recognised" result dict."""
        return {"text": "", "confidence": 0.0, "line_count": 0}

    @staticmethod
    def _last_payload(stdout, prefix):
        """Return the text following *prefix* on the last line of *stdout*
        that starts with it, or None when no line does."""
        found = None
        for line in stdout.splitlines():
            if line.startswith(prefix):
                # Slice rather than str.replace: the payload itself may
                # legitimately contain the marker text.
                found = line[len(prefix):]
        return found

    @classmethod
    def _parse_ocr_output(cls, stdout):
        """Turn the child's stdout into a result dict.

        Returns the dict decoded from the last ``OCR_RESULT_JSON:`` line, or
        the empty result when that line is absent or cannot be decoded.
        """
        import json

        raw = cls._last_payload(stdout, cls._RESULT_PREFIX)
        if raw is None:
            return cls._empty_result()
        try:
            payload = json.loads(raw)
            return {
                "text": str(payload.get("text", "")).strip(),
                "confidence": float(payload.get("confidence", 0.0)),
                "line_count": int(payload.get("line_count", 0)),
            }
        except (ValueError, TypeError, AttributeError):
            return cls._empty_result()

    def _initialize(self):
        """Probe whether ``paddleocr`` can be imported by ``sys.executable``.

        Sets ``self.available`` to True on success. Never raises for probe
        failures; the reason is printed instead.
        """
        try:
            result = subprocess.run(
                [sys.executable, "-c", self._PROBE_SCRIPT],
                capture_output=True, text=True, errors="replace", timeout=10)
            if self._READY_MARKER in result.stdout:
                self.available = True
                print("✅ OCR processor initialized successfully")
            else:
                # The probe reports import errors on stdout; fall back to
                # stderr for crashes that never reached the except clause.
                detail = self._last_payload(result.stdout, self._ERROR_PREFIX)
                print(f"❌ OCR initialization failed: {detail or result.stderr}")
        except Exception as e:
            print(f"❌ OCR initialization failed: {e}")

    def extract_text_from_image(self, image_path, timeout=60):
        """Run OCR on *image_path* in a child process.

        Args:
            image_path: Path of the image file to read.
            timeout: Seconds to wait for the child process before giving up.

        Returns:
            A dict with keys ``text`` (recognised lines joined by spaces),
            ``confidence`` (mean per-line confidence) and ``line_count``.
            All three are empty/zero when OCR is unavailable, the file does
            not exist, nothing was recognised, or the child failed.
        """
        if not self.available or not os.path.exists(image_path):
            return self._empty_result()

        try:
            completed = subprocess.run(
                [sys.executable, "-c", self._OCR_SCRIPT, str(image_path)],
                capture_output=True, text=True, errors="replace",
                timeout=timeout)

            error = self._last_payload(completed.stdout, self._ERROR_PREFIX)
            if error is not None:
                # Surface child-side failures instead of silently
                # returning an empty result.
                print(f"❌ OCR processing failed: {error}")

            return self._parse_ocr_output(completed.stdout)

        except Exception as e:
            # Best-effort boundary: timeouts and spawn errors degrade to an
            # empty result rather than propagating to the caller.
            print(f"❌ OCR processing failed: {e}")
            return self._empty_result()


class FixedImageClassifier:
    """Zero-shot image classifier that runs OpenCLIP in a separate interpreter.

    The interpreter is the Python executable of a dedicated virtual
    environment, so ``open_clip``/``torch`` are never imported into the
    calling process. Results are read from marker-prefixed stdout lines.

    Instance fields:
        available   -- True once the import probe in ``_initialize`` succeeded.
        venv_python -- Interpreter used for the child processes.
    """

    # Default interpreter, relative to the current working directory. On
    # Windows this is the same "openclip_env\\Scripts\\python.exe" as before;
    # elsewhere the conventional POSIX venv layout is used.
    venv_python = (
        os.path.join("openclip_env", "Scripts", "python.exe")
        if os.name == "nt"
        else os.path.join("openclip_env", "bin", "python")
    )

    _READY_MARKER = "CLASSIFIER_READY"
    _INIT_ERROR_PREFIX = "CLASSIFIER_ERROR:"
    _RESULT_PREFIX = "CLASSIFICATION_RESULT:"
    _ERROR_PREFIX = "CLASSIFICATION_ERROR:"

    # Child script: only checks that open_clip can be imported.
    _PROBE_SCRIPT = """
try:
    import open_clip
    print("CLASSIFIER_READY")
except Exception as e:
    print(f"CLASSIFIER_ERROR:{e}")
"""

    # Child script: classifies sys.argv[1] and prints the best sys.argv[2]
    # labels. Both values arrive as arguments rather than being pasted into
    # the source, so unusual paths cannot break or alter the script.
    _CLASSIFY_SCRIPT = """
import sys

try:
    import open_clip
    import torch
    from PIL import Image

    image_path = sys.argv[1]
    requested = int(sys.argv[2])

    # Load model
    model, _, processor = open_clip.create_model_and_transforms(
        model_name="ViT-B-32",
        pretrained="laion2b_s34b_b79k"
    )
    # Inference only: switch off train-mode behaviour.
    model.eval()

    # Load and process image
    image = Image.open(image_path).convert("RGB")
    image_tensor = processor(image).unsqueeze(0)

    # Move to GPU if available
    if torch.cuda.is_available():
        model = model.cuda()
        image_tensor = image_tensor.cuda()

    # Common labels
    text_labels = [
        "a photo of a bee", "a photo of a flower", "a photo of a person",
        "a photo of a document", "a photo of a chart", "a photo of a diagram",
        "a photo of a table", "a photo of a graph", "a photo of a logo",
        "a photo of a signature", "a photo of a stamp", "a photo of a barcode"
    ]
    # topk raises when asked for more entries than exist (or fewer than 0).
    top_k = max(1, min(requested, len(text_labels)))

    with torch.no_grad():
        image_features = model.encode_image(image_tensor)
        image_features /= image_features.norm(dim=-1, keepdim=True)

        # Encode text labels
        text_tokens = open_clip.tokenize(text_labels)
        if torch.cuda.is_available():
            text_tokens = text_tokens.cuda()

        text_features = model.encode_text(text_tokens)
        text_features /= text_features.norm(dim=-1, keepdim=True)

        # Calculate similarity
        similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
        values, indices = similarity[0].topk(top_k)

        for value, index in zip(values, indices):
            label = text_labels[int(index)]
            confidence = float(value)
            print(f"CLASSIFICATION_RESULT:{label}|{confidence}")

except Exception as e:
    flat = " ".join(str(e).split())
    print("CLASSIFICATION_ERROR:" + flat.encode("ascii", "backslashreplace").decode("ascii"))
"""

    def __init__(self, venv_python=None):
        """Create the classifier and probe the virtual environment.

        Args:
            venv_python: Optional path of the interpreter that has
                ``open_clip`` installed. Defaults to the class-level
                ``venv_python`` location.
        """
        if venv_python is not None:
            self.venv_python = venv_python
        self.available = False
        self._initialize()

    @staticmethod
    def _last_payload(stdout, prefix):
        """Return the text following *prefix* on the last line of *stdout*
        that starts with it, or None when no line does."""
        found = None
        for line in stdout.splitlines():
            if line.startswith(prefix):
                found = line[len(prefix):]
        return found

    @classmethod
    def _parse_results(cls, stdout):
        """Collect ``{"label", "confidence"}`` dicts from result lines.

        Lines without the result marker, without a ``|`` separator, or with
        a non-numeric confidence are skipped. Order of appearance is kept.
        """
        results = []
        for line in stdout.splitlines():
            if not line.startswith(cls._RESULT_PREFIX):
                continue
            # Split on the last "|" so the confidence is always the tail.
            label, separator, score = line[len(cls._RESULT_PREFIX):].rpartition("|")
            if not separator:
                continue
            try:
                results.append({"label": label, "confidence": float(score)})
            except ValueError:
                continue
        return results

    def _initialize(self):
        """Check that the venv interpreter exists and can import ``open_clip``.

        Sets ``self.available`` to True on success. Never raises for probe
        failures; the reason is printed instead.
        """
        try:
            # Check if virtual environment exists and works
            if not os.path.exists(self.venv_python):
                print("❌ OpenCLIP virtual environment not found")
                return

            result = subprocess.run(
                [self.venv_python, "-c", self._PROBE_SCRIPT],
                capture_output=True, text=True, errors="replace", timeout=30)

            if self._READY_MARKER in result.stdout:
                self.available = True
                print("✅ Image classifier initialized successfully")
            else:
                # Import errors are reported on stdout by the probe itself.
                detail = self._last_payload(result.stdout, self._INIT_ERROR_PREFIX)
                print(f"❌ Classifier initialization failed: {detail or result.stderr}")

        except Exception as e:
            print(f"❌ Classifier initialization failed: {e}")

    def classify_image(self, image_path, top_k=3, timeout=30):
        """Classify *image_path* against a fixed list of text labels.

        Args:
            image_path: Path of the image file to classify.
            top_k: Number of best labels to return; clamped by the child
                script to between 1 and the number of known labels.
            timeout: Seconds to wait for the child process before giving up.

        Returns:
            A list of ``{"label": str, "confidence": float}`` dicts in the
            order printed by the child. When no real result is available the
            list holds a single placeholder with confidence 0.0 whose label
            is ``classification_unavailable`` (classifier not ready or file
            missing), ``classification_failed`` (child produced no result
            lines) or ``classification_error`` (child could not be run).
        """
        if not self.available or not os.path.exists(image_path):
            return [{"label": "classification_unavailable", "confidence": 0.0}]

        try:
            completed = subprocess.run(
                [self.venv_python, "-c", self._CLASSIFY_SCRIPT,
                 str(image_path), str(int(top_k))],
                capture_output=True, text=True, errors="replace",
                timeout=timeout)

            error = self._last_payload(completed.stdout, self._ERROR_PREFIX)
            if error is not None:
                # Surface child-side failures instead of hiding them behind
                # the generic "classification_failed" placeholder.
                print(f"❌ Classification failed: {error}")

            results = self._parse_results(completed.stdout)
            if results:
                return results
            return [{"label": "classification_failed", "confidence": 0.0}]

        except Exception as e:
            # Best-effort boundary: timeouts and spawn errors degrade to a
            # placeholder result rather than propagating to the caller.
            print(f"❌ Classification failed: {e}")
            return [{"label": "classification_error", "confidence": 0.0}]


class FixedDocumentProcessor:
    """Extract the embedded images of a .docx file and run OCR and
    classification on each of them through the isolated processors."""

    def __init__(self):
        self.ocr_processor = FixedOCRProcessor()
        self.image_classifier = FixedImageClassifier()
        print("🎯 Fixed Document Processor Initialized")
        print(f"   OCR: {'✅ Available' if self.ocr_processor.available else '❌ Not Available'}")
        print(f"   Classifier: {'✅ Available' if self.image_classifier.available else '❌ Not Available'}")

    @staticmethod
    def _extract_media(docx_path, dest_dir):
        """Copy every file stored under ``word/media/`` into *dest_dir*.

        Returns:
            A list of ``(archive_name, extracted_path)`` tuples in archive
            order.
        """
        import zipfile

        extracted = []
        with zipfile.ZipFile(docx_path, 'r') as archive:
            for info in archive.infolist():
                # Explicit directory entries (e.g. "word/media/") have an
                # empty basename; opening that path for writing would fail
                # and abort the whole document.
                if not info.filename.startswith('word/media/') or info.is_dir():
                    continue

                base_name = os.path.basename(info.filename)
                target_path = os.path.join(dest_dir, base_name)
                # Members in different sub-folders may share a basename;
                # pick a fresh name instead of overwriting an earlier file.
                counter = 1
                while os.path.exists(target_path):
                    target_path = os.path.join(dest_dir, f"{counter}_{base_name}")
                    counter += 1

                with archive.open(info) as source, open(target_path, 'wb') as target:
                    target.write(source.read())

                extracted.append((info.filename, target_path))
                print(f"📸 Extracted image: {target_path}")

        print(f"Found {len(extracted)} images in document")
        return extracted

    def _analyze_image(self, index, archive_name, image_path, content_parts):
        """Run OCR and classification on one extracted image.

        Appends human-readable summary lines to *content_parts* and returns
        the metadata dict for the image.
        """
        number = index + 1
        image_metadata = {
            "index": index,
            "path": image_path,
            "archive_name": archive_name,
        }

        # OCR processing
        if self.ocr_processor.available:
            ocr_result = self.ocr_processor.extract_text_from_image(image_path)
            if ocr_result["text"].strip():
                image_metadata["ocr_text"] = ocr_result["text"]
                image_metadata["ocr_confidence"] = ocr_result["confidence"]
                content_parts.append(f"[Image {number} OCR]: {ocr_result['text']}")
                print(f"✅ Image {number} OCR: {len(ocr_result['text'])} chars")

        # Image classification
        if self.image_classifier.available:
            classification_results = self.image_classifier.classify_image(image_path)
            image_metadata["classification"] = classification_results
            # Placeholder results carry confidence 0.0 and are not reported.
            if classification_results and classification_results[0]["confidence"] > 0:
                top_label = classification_results[0]["label"]
                top_confidence = classification_results[0]["confidence"]
                content_parts.append(f"[Image {number} Classification]: {top_label} ({top_confidence:.3f})")
                print(f"✅ Image {number} Classification: {top_label} ({top_confidence:.3f})")

                # Check for bee
                if "bee" in top_label.lower():
                    print(f"🎯 BEE DETECTED in image {number}!")

        return image_metadata

    async def process_document(self, file_path):
        """Process the images embedded in the .docx file at *file_path*.

        Args:
            file_path: Path of a Word document (a zip archive whose images
                live under ``word/media/``).

        Returns:
            On success a dict with ``success`` True, ``content`` (summary
            text, one line per finding), ``metadata`` and ``images``. Each
            image entry has ``index``, ``path`` and ``archive_name`` plus,
            when produced, ``ocr_text``, ``ocr_confidence`` and
            ``classification``. ``path`` points into a temporary directory
            that is already deleted when this method returns; use
            ``archive_name`` to identify the image inside the document.
            On any failure a dict with ``success`` False, empty ``content``
            and ``images``, and the error text under ``metadata["error"]``.

        NOTE(review): the OCR and classification calls run synchronously,
        so this coroutine blocks the event loop while they execute.
        """
        try:
            images = []
            content_parts = []

            # The extracted files must outlive the analysis, so both steps
            # happen inside the temporary directory's lifetime.
            with tempfile.TemporaryDirectory() as temp_dir:
                extracted = self._extract_media(file_path, temp_dir)
                for i, (archive_name, image_path) in enumerate(extracted):
                    images.append(
                        self._analyze_image(i, archive_name, image_path, content_parts))

            # Add some basic content
            content_parts.insert(0, f"Processed document: {os.path.basename(file_path)}")
            content_parts.insert(1, f"Total images: {len(images)}")

            full_content = "\n".join(content_parts)

            return {
                "success": True,
                "content": full_content,
                "metadata": {
                    "file_type": "word",
                    "images_count": len(images),
                    "processed_with_ocr": self.ocr_processor.available,
                    "processed_with_classification": self.image_classifier.available
                },
                "images": images
            }

        except Exception as e:
            # Best-effort boundary: unreadable or missing documents are
            # reported through the result dict instead of raising.
            print(f"❌ Document processing failed: {e}")
            return {
                "success": False,
                "content": "",
                "metadata": {"error": str(e)},
                "images": []
            }


async def test_fixed_solution():
    """Smoke-test the pipeline against ``test.docx`` in the working directory.

    Builds a ``FixedDocumentProcessor``, processes the document and prints a
    summary of which capabilities produced output. Returns the processing
    result dict, or None when the file is missing or processing failed.
    """
    print("🧪 TESTING FIXED SOLUTION")
    print("=" * 50)

    document_processor = FixedDocumentProcessor()

    # Test with test.docx
    docx_name = "test.docx"
    if not os.path.exists(docx_name):
        print(f"❌ Test file not found: {docx_name}")
        return

    print(f"\n📄 PROCESSING: {docx_name}")
    result = await document_processor.process_document(docx_name)

    if not result["success"]:
        failure_reason = result['metadata'].get('error', 'Unknown error')
        print(f"❌ Processing failed: {failure_reason}")
        return

    print("✅ Processing successful")
    print(f"📊 Metadata: {result['metadata']}")

    # Walk every image once and record which capabilities produced output.
    saw_ocr_text = False
    saw_classification = False
    saw_bee = False

    for image_info in result["images"]:
        if image_info.get("ocr_text", "").strip():
            saw_ocr_text = True

        ranked = image_info.get("classification")
        if ranked and ranked[0]["confidence"] > 0:
            saw_classification = True
            if "bee" in ranked[0]["label"].lower():
                saw_bee = True

    # (title, state, text when true, text when false)
    summary_rows = (
        ("OCR", saw_ocr_text, "✅ WORKING", "❌ FAILED"),
        ("Classification", saw_classification, "✅ WORKING", "❌ FAILED"),
        ("Bee Detection", saw_bee, "✅ SUCCESS", "❌ NOT FOUND"),
        ("Dependency Isolation", saw_ocr_text and saw_classification,
         "✅ ACHIEVED", "❌ FAILED"),
    )

    print("\n🎯 FINAL RESULTS:")
    for title, state, when_true, when_false in summary_rows:
        verdict = when_true if state else when_false
        print(f"   {title}: {verdict}")

    return result


if __name__ == "__main__":
    # Script entry point: run the async smoke test to completion on a new
    # event loop. Nothing happens when the module is merely imported.
    asyncio.run(test_fixed_solution())