# Files
# railseek6/final_integrated_solution.py
# 442 lines, 17 KiB, Python
"""
Final Integrated Solution for Document Processing Pipeline
Complete dependency isolation between PaddleOCR and OpenCLIP
"""
import os
import sys
import subprocess
import tempfile
import asyncio
import zipfile
from pathlib import Path
class IsolatedOCRProcessor:
    """Isolated OCR processor using subprocess communication.

    PaddleOCR is only ever imported inside a child interpreter, so its
    heavy, conflict-prone dependencies never load into this process.
    """

    # Script executed by the child interpreter.  The image path arrives in
    # sys.argv[1] (with ``python -c`` argv[0] is "-c"), so Windows paths
    # containing backslashes or quotes never need escaping into the
    # generated source (the old f-string r"{image_path}" broke on such paths).
    _OCR_SCRIPT = """
import sys
from paddleocr import PaddleOCR
try:
    ocr = PaddleOCR(use_gpu=True, cls=True)
    result = ocr.ocr(sys.argv[1])
    if not result or not result[0]:
        print("OCR_RESULT:EMPTY")
        sys.exit(0)
    extracted_text = []
    total_confidence = 0.0
    line_count = 0
    for line in result[0]:
        try:
            if len(line) == 2:
                bbox, (text, confidence) = line
            elif len(line) >= 1:
                if len(line) > 1 and isinstance(line[1], (list, tuple)) and len(line[1]) >= 2:
                    text, confidence = line[1][0], line[1][1]
                elif len(line) > 1:
                    text, confidence = str(line[1]), 0.0
                else:
                    text, confidence = "", 0.0
            else:
                continue
            extracted_text.append(str(text) if text is not None else "")
            total_confidence += float(confidence) if isinstance(confidence, (int, float)) else 0.0
            line_count += 1
        except Exception:
            extracted_text.append("")
            line_count += 1
    avg_confidence = total_confidence / line_count if line_count > 0 else 0.0
    print("OCR_RESULT:TEXT=" + " ".join(extracted_text))
    print("OCR_RESULT:CONFIDENCE=" + str(avg_confidence))
    print("OCR_RESULT:LINES=" + str(line_count))
except Exception as e:
    print("OCR_ERROR:" + str(e))
"""

    def __init__(self):
        # Flipped to True by _initialize() once the child interpreter
        # confirms PaddleOCR is importable.
        self.available = False
        self._initialize()

    def _initialize(self):
        """Probe a child interpreter to see whether PaddleOCR is usable."""
        test_script = (
            "try:\n"
            "    from paddleocr import PaddleOCR\n"
            "    print('OCR_READY')\n"
            "except Exception as e:\n"
            "    print(f'OCR_ERROR:{e}')\n"
        )
        try:
            result = subprocess.run(
                [sys.executable, "-c", test_script],
                capture_output=True, text=True, timeout=10,
            )
            if "OCR_READY" in result.stdout:
                self.available = True
                print("✅ OCR processor initialized successfully")
            else:
                print(f"❌ OCR initialization failed: {result.stderr}")
        except Exception as e:
            # Covers subprocess.TimeoutExpired and OS-level launch failures.
            print(f"❌ OCR initialization failed: {e}")

    def extract_text_from_image(self, image_path):
        """Run PaddleOCR on *image_path* in a child interpreter.

        Returns a dict with ``text`` (space-joined recognized lines),
        ``confidence`` (average per-line confidence, float) and
        ``line_count`` (int).  A zeroed result is returned when OCR is
        unavailable, the file is missing, or the child process fails or
        times out.
        """
        empty = {"text": "", "confidence": 0.0, "line_count": 0}
        if not self.available or not os.path.exists(image_path):
            return empty
        try:
            result = subprocess.run(
                [sys.executable, "-c", self._OCR_SCRIPT, image_path],
                capture_output=True, text=True, timeout=60,
            )
            text = ""
            confidence = 0.0
            line_count = 0
            # Slice the marker off instead of str.replace(), which would
            # also delete the marker if it recurred mid-line.
            for line in result.stdout.split("\n"):
                if line.startswith("OCR_RESULT:TEXT="):
                    text = line[len("OCR_RESULT:TEXT="):].strip()
                elif line.startswith("OCR_RESULT:CONFIDENCE="):
                    try:
                        confidence = float(line.split("=", 1)[1].strip())
                    except ValueError:
                        confidence = 0.0
                elif line.startswith("OCR_RESULT:LINES="):
                    try:
                        line_count = int(line.split("=", 1)[1].strip())
                    except ValueError:
                        line_count = 0
            return {"text": text, "confidence": confidence, "line_count": line_count}
        except Exception as e:
            print(f"❌ OCR processing failed: {e}")
            return empty
class IsolatedImageClassifier:
    """Isolated image classifier using a dedicated virtual environment.

    OpenCLIP runs inside the ``openclip_env`` venv's interpreter so its
    torch stack never conflicts with this process's dependencies.
    """

    # Single definition of the venv interpreter path (previously duplicated
    # in _initialize() and classify_image()).  Windows venv layout.
    VENV_PYTHON = "openclip_env\\Scripts\\python.exe"

    # Script executed by the venv interpreter.  Image path and top_k arrive
    # via sys.argv (argv[1], argv[2]) so they never need escaping into the
    # generated source.
    _CLASSIFY_SCRIPT = """
import sys
import open_clip
import torch
from PIL import Image
try:
    # Load model
    model, _, processor = open_clip.create_model_and_transforms(
        model_name="ViT-B-32",
        pretrained="laion2b_s34b_b79k"
    )
    # Load and process image
    image = Image.open(sys.argv[1]).convert("RGB")
    image_tensor = processor(image).unsqueeze(0)
    # Move to GPU if available
    if torch.cuda.is_available():
        model = model.cuda()
        image_tensor = image_tensor.cuda()
    # Common labels for document processing
    text_labels = [
        "a photo of a bee", "a photo of a flower", "a photo of a person",
        "a photo of a document", "a photo of a chart", "a photo of a diagram",
        "a photo of a table", "a photo of a graph", "a photo of a logo",
        "a photo of a signature", "a photo of a stamp", "a photo of a barcode",
        "a photo of a QR code", "a photo of a screenshot", "a photo of a landscape",
        "a photo of an animal", "a photo of a building", "a photo of a vehicle",
        "a photo of text", "a photo of numbers", "a photo of symbols"
    ]
    # Get predictions
    with torch.no_grad():
        image_features = model.encode_image(image_tensor)
        image_features /= image_features.norm(dim=-1, keepdim=True)
        text_tokens = open_clip.tokenize(text_labels)
        if torch.cuda.is_available():
            text_tokens = text_tokens.cuda()
        text_features = model.encode_text(text_tokens)
        text_features /= text_features.norm(dim=-1, keepdim=True)
        similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
        values, indices = similarity[0].topk(int(sys.argv[2]))
    for value, index in zip(values, indices):
        print("CLASSIFICATION_RESULT:" + text_labels[index] + "|" + str(float(value)))
except Exception as e:
    print("CLASSIFICATION_ERROR:" + str(e))
"""

    def __init__(self):
        # Flipped to True once the venv interpreter confirms open_clip imports.
        self.available = False
        self._initialize()

    def _initialize(self):
        """Check the venv exists and that open_clip imports inside it."""
        try:
            if not os.path.exists(self.VENV_PYTHON):
                print("❌ OpenCLIP virtual environment not found")
                return
            test_script = (
                "try:\n"
                "    import open_clip\n"
                "    print('CLASSIFIER_READY')\n"
                "except Exception as e:\n"
                "    print(f'CLASSIFIER_ERROR:{e}')\n"
            )
            result = subprocess.run(
                [self.VENV_PYTHON, "-c", test_script],
                capture_output=True, text=True, timeout=30,
            )
            if "CLASSIFIER_READY" in result.stdout:
                self.available = True
                print("✅ Image classifier initialized successfully")
            else:
                print(f"❌ Classifier initialization failed: {result.stderr}")
        except Exception as e:
            print(f"❌ Classifier initialization failed: {e}")

    def classify_image(self, image_path, top_k=3):
        """Classify *image_path* with OpenCLIP in the isolated venv.

        Returns a list of up to *top_k* ``{"label", "confidence"}`` dicts
        ordered by descending confidence, or a single sentinel entry when
        classification is unavailable or fails.
        """
        if not self.available or not os.path.exists(image_path):
            return [{"label": "classification_unavailable", "confidence": 0.0}]
        try:
            result = subprocess.run(
                [self.VENV_PYTHON, "-c", self._CLASSIFY_SCRIPT, image_path, str(top_k)],
                capture_output=True, text=True, timeout=30,
            )
            prefix = "CLASSIFICATION_RESULT:"
            results = []
            for line in result.stdout.split("\n"):
                if not line.startswith(prefix):
                    continue
                parts = line[len(prefix):].split("|")
                if len(parts) == 2:
                    try:
                        results.append({"label": parts[0], "confidence": float(parts[1])})
                    except ValueError:
                        # Malformed confidence field; skip this line.
                        continue
            if results:
                return results
            return [{"label": "classification_failed", "confidence": 0.0}]
        except Exception as e:
            print(f"❌ Classification failed: {e}")
            return [{"label": "classification_error", "confidence": 0.0}]
class DocumentProcessor:
    """Main document processor with complete dependency isolation.

    Extracts the document's text first (python-docx), then pulls embedded
    images out of the .docx zip archive and routes them through the
    subprocess-isolated OCR and classification workers.
    """

    def __init__(self):
        self.ocr_processor = IsolatedOCRProcessor()
        self.image_classifier = IsolatedImageClassifier()
        print("🎯 Document Processor Initialized")
        print(f" OCR: {'✅ Available' if self.ocr_processor.available else '❌ Not Available'}")
        print(f" Classifier: {'✅ Available' if self.image_classifier.available else '❌ Not Available'}")

    async def process_document(self, file_path):
        """Process a .docx document: text first, then embedded images.

        Returns a dict with keys ``success``, ``content``, ``metadata`` and
        ``images``.  Never raises: failures are reported via
        ``success=False`` and ``metadata["error"]``.
        """
        try:
            text_content = await self._extract_text_content(file_path)
            images_metadata, image_content = await self._extract_and_process_images(file_path)
            # Parenthesized for readability; the conditional already bound
            # around the whole concatenation.
            full_content = (text_content + "\n\n" + image_content) if image_content else text_content
            return {
                "success": True,
                "content": full_content,
                "metadata": {
                    "file_type": "word",
                    "images_count": len(images_metadata),
                    "processed_with_ocr": self.ocr_processor.available,
                    "processed_with_classification": self.image_classifier.available
                },
                "images": images_metadata
            }
        except Exception as e:
            print(f"❌ Document processing failed: {e}")
            return {
                "success": False,
                "content": "",
                "metadata": {"error": str(e)},
                "images": []
            }

    async def _extract_text_content(self, file_path):
        """Extract paragraph and table text from the Word document.

        Returns the joined text, or a fixed failure message when python-docx
        is missing or the file cannot be parsed.
        """
        try:
            # Third-party import kept lazy so its absence degrades gracefully.
            import docx
            doc = docx.Document(file_path)
            content_parts = []
            for para in doc.paragraphs:
                if para.text.strip():
                    content_parts.append(para.text)
            for table in doc.tables:
                table_data = [[cell.text for cell in row.cells] for row in table.rows]
                if table_data:
                    # Only the row count is recorded, not cell contents.
                    content_parts.append(f"[Table]: {len(table_data)} rows")
            return "\n".join(content_parts)
        except Exception as e:
            print(f"⚠️ Text extraction failed: {e}")
            return "Text content extraction failed"

    async def _extract_and_process_images(self, file_path):
        """Extract images from the .docx archive and OCR/classify each one.

        Returns ``(images_metadata, joined_image_content)``; ``([], "")``
        when the archive cannot be read or processing fails.
        """
        images_metadata = []
        image_content_parts = []
        try:
            with tempfile.TemporaryDirectory() as temp_dir:
                with zipfile.ZipFile(file_path, 'r') as zip_ref:
                    image_files = []
                    for file_info in zip_ref.filelist:
                        if not file_info.filename.startswith('word/media/'):
                            continue
                        image_filename = os.path.basename(file_info.filename)
                        # Skip directory entries ('word/media/' itself) whose
                        # basename is empty -- writing one would raise and
                        # abort the entire extraction.
                        if not image_filename:
                            continue
                        image_path = os.path.join(temp_dir, image_filename)
                        with zip_ref.open(file_info.filename) as source, open(image_path, 'wb') as target:
                            target.write(source.read())
                        image_files.append(image_path)
                        print(f"📸 Extracted image: {image_path}")
                print(f"Found {len(image_files)} images in document")
                # Process while temp_dir is still alive (files live there).
                for i, image_path in enumerate(image_files):
                    image_metadata = {"index": i, "path": image_path}
                    if self.ocr_processor.available:
                        ocr_result = self.ocr_processor.extract_text_from_image(image_path)
                        if ocr_result["text"].strip():
                            image_metadata["ocr_text"] = ocr_result["text"]
                            image_metadata["ocr_confidence"] = ocr_result["confidence"]
                            image_content_parts.append(f"[Image {i+1} OCR]: {ocr_result['text']}")
                            print(f"✅ Image {i+1} OCR: {len(ocr_result['text'])} chars")
                    if self.image_classifier.available:
                        classification_results = self.image_classifier.classify_image(image_path)
                        image_metadata["classification"] = classification_results
                        if classification_results and classification_results[0]["confidence"] > 0:
                            top_label = classification_results[0]["label"]
                            top_confidence = classification_results[0]["confidence"]
                            image_content_parts.append(f"[Image {i+1} Classification]: {top_label} ({top_confidence:.3f})")
                            print(f"✅ Image {i+1} Classification: {top_label} ({top_confidence:.3f})")
                            # Domain-specific success marker used by the test driver.
                            if "bee" in top_label.lower():
                                print(f"🎯 BEE DETECTED in image {i+1}!")
                    images_metadata.append(image_metadata)
            return images_metadata, "\n".join(image_content_parts)
        except Exception as e:
            print(f"❌ Image processing failed: {e}")
            return [], ""
async def test_final_solution():
    """Run the end-to-end smoke test against ``test.docx``.

    Prints a pass/fail summary for OCR, classification, bee detection and
    dependency isolation; returns the processing result, or None when the
    test file is missing or processing fails.
    """
    print("🧪 FINAL INTEGRATED SOLUTION TEST")
    print("=" * 50)
    processor = DocumentProcessor()
    test_file = "test.docx"
    if not os.path.exists(test_file):
        print(f"❌ Test file not found: {test_file}")
        return
    print(f"\n📄 PROCESSING: {test_file}")
    result = await processor.process_document(test_file)
    if not result["success"]:
        print(f"❌ Processing failed: {result['metadata'].get('error', 'Unknown error')}")
        return
    print(f"✅ Processing successful")
    print(f"📊 Metadata: {result['metadata']}")
    # Summarize per-image outcomes.
    images = result["images"]
    ocr_working = any(img.get("ocr_text", "").strip() for img in images)
    top_hits = [
        img["classification"][0]
        for img in images
        if img.get("classification") and img["classification"][0]["confidence"] > 0
    ]
    classification_working = bool(top_hits)
    bee_found = any("bee" in hit["label"].lower() for hit in top_hits)
    print(f"\n🎯 FINAL RESULTS:")
    print(f" OCR: {'✅ WORKING' if ocr_working else '❌ FAILED'}")
    print(f" Classification: {'✅ WORKING' if classification_working else '❌ FAILED'}")
    print(f" Bee Detection: {'✅ SUCCESS' if bee_found else '❌ NOT FOUND'}")
    print(f" Dependency Isolation: {'✅ ACHIEVED' if ocr_working and classification_working else '❌ FAILED'}")
    if result["content"]:
        print(f"\n📝 SAMPLE CONTENT (first 500 chars):")
        print(result["content"][:500] + "...")
    return result
# Script entry point: run the end-to-end smoke test when executed directly.
if __name__ == "__main__":
    asyncio.run(test_final_solution())