# railseek6/fix_ocr_issues.py

"""
Fix OCR issues by completely isolating PaddleOCR from OpenCLIP dependencies
"""

import os
import sys
import subprocess
import tempfile
import json
from pathlib import Path

def test_paddleocr_alone():
    """Run a PaddleOCR smoke test in a separate Python process.

    The probe script below is written to a temporary file and executed with
    the current interpreter (``sys.executable``), so modules already imported
    into this process cannot influence it. The child's stdout, and its stderr
    when non-empty, are echoed to this process's stdout. The temporary script
    is always deleted afterwards.

    The probe contains non-ASCII status markers, so the script file is written
    as UTF-8 and the child's streams are forced to / decoded as UTF-8 rather
    than relying on the locale encoding. A child that runs longer than
    60 seconds is reported as a timeout instead of raising
    ``subprocess.TimeoutExpired`` into the caller.

    Returns:
        None. All results are reported via ``print``.
    """
    print("🔍 Testing PaddleOCR in Isolation")
    print("=" * 50)

    # Test PaddleOCR directly without importing OpenCLIP
    test_code = '''
import sys
import os

# Remove any OpenCLIP paths from sys.path to ensure isolation
original_path = sys.path.copy()
filtered_path = [p for p in sys.path if 'openclip' not in p.lower()]
sys.path = filtered_path

try:
    print("🧪 Testing PaddleOCR without OpenCLIP interference...")

    # Test basic imports
    import torch
    print(f"✅ PyTorch: {torch.__version__}")
    print(f"✅ CUDA available: {torch.cuda.is_available()}")

    # Test PaddleOCR
    from paddleocr import PaddleOCR
    print("✅ PaddleOCR imported successfully")

    # Initialize OCR
    ocr = PaddleOCR(use_gpu=True)
    print("✅ PaddleOCR GPU initialization successful")

    # Test OCR on an image
    test_image = "extracted_images/image1.png"
    if os.path.exists(test_image):
        print(f"📸 Testing OCR on: {test_image}")
        result = ocr.ocr(test_image, cls=True)

        if result and result[0]:
            print(f"✅ OCR successful - found {len(result[0])} text lines")
            for i, line in enumerate(result[0][:3]):  # Show first 3 lines
                text = line[1][0] if len(line) > 1 and len(line[1]) > 0 else "No text"
                confidence = line[1][1] if len(line) > 1 and len(line[1]) > 1 else 0.0
                print(f"   Line {i+1}: '{text}' (confidence: {confidence:.3f})")
        else:
            print("❌ OCR returned no results")
    else:
        print(f"❌ Test image not found: {test_image}")

except Exception as e:
    print(f"❌ PaddleOCR test failed: {e}")
    import traceback
    traceback.print_exc()
finally:
    # Restore original path
    sys.path = original_path
'''

    # Python source files are UTF-8 by default, so write the script as UTF-8
    # explicitly; the locale default may be unable to encode the emoji.
    with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False,
                                     encoding='utf-8') as f:
        f.write(test_code)
        script_path = f.name

    # Make the child emit UTF-8 on its pipes so its emoji output neither
    # crashes the child nor gets mis-decoded here.
    child_env = dict(os.environ, PYTHONIOENCODING="utf-8")

    # Run the test in a separate process to ensure complete isolation
    try:
        result = subprocess.run([sys.executable, script_path],
                                capture_output=True, encoding="utf-8",
                                errors="replace", env=child_env, timeout=60)
    except subprocess.TimeoutExpired:
        # Report the timeout and let the caller carry on with later steps.
        print("❌ PaddleOCR isolation test timed out after 60 seconds")
    else:
        print(result.stdout)
        if result.stderr:
            print("STDERR:", result.stderr)
    finally:
        os.unlink(script_path)

def create_isolated_ocr_processor():
    """Write ``isolated_ocr_processor.py`` into the current working directory.

    The generated module defines ``IsolatedOCRProcessor``, which temporarily
    drops every ``sys.path`` entry containing "openclip" while it imports and
    calls PaddleOCR, plus a ``get_isolated_ocr_processor()`` accessor that
    lazily creates one shared instance. An existing file of the same name is
    overwritten.

    Returns:
        None. Progress is reported via ``print``.
    """
    print("\n🔧 Creating Isolated OCR Processor")
    print("=" * 50)

    # Source of the generated module. Written verbatim, so "\\n" below ends
    # up as the two-character escape "\n" inside the generated file.
    module_source = '''
"""
Completely isolated OCR processor that avoids any OpenCLIP dependencies
"""

import os
import sys
import json
import tempfile
from pathlib import Path

class IsolatedOCRProcessor:
    """OCR processor that runs in complete isolation from OpenCLIP"""

    def __init__(self):
        self.ocr_engine = None
        self.available = False
        self._initialize_ocr()

    def _initialize_ocr(self):
        """Initialize PaddleOCR without any OpenCLIP interference"""
        try:
            # Clean up sys.path to remove OpenCLIP paths
            original_path = sys.path.copy()
            sys.path = [p for p in sys.path if 'openclip' not in p.lower()]

            print("🚀 Initializing PaddleOCR in isolated environment...")
            from paddleocr import PaddleOCR

            self.ocr_engine = PaddleOCR(use_gpu=True)
            self.available = True
            print("✅ PaddleOCR initialized successfully with GPU")

            # Restore original path
            sys.path = original_path

        except Exception as e:
            print(f"❌ PaddleOCR initialization failed: {e}")
            self.available = False
            # Restore original path even on failure
            sys.path = original_path
            raise

    def extract_text_from_image(self, image_path):
        """Extract text from image using isolated OCR"""
        if not self.available or not self.ocr_engine:
            return {"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0}

        try:
            # Clean up sys.path again for OCR execution
            original_path = sys.path.copy()
            sys.path = [p for p in sys.path if 'openclip' not in p.lower()]

            result = self.ocr_engine.ocr(image_path, cls=True)

            # Restore path
            sys.path = original_path

            if not result or not result[0]:
                return {"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0}

            extracted_text = []
            bboxes = []
            total_confidence = 0.0
            line_count = 0

            for line in result[0]:
                try:
                    if len(line) == 2:
                        bbox, (text, confidence) = line
                    elif len(line) >= 1:
                        bbox = line[0] if len(line) > 0 else []
                        if len(line) > 1:
                            if isinstance(line[1], (list, tuple)) and len(line[1]) >= 2:
                                text, confidence = line[1][0], line[1][1]
                            else:
                                text, confidence = str(line[1]) if len(line) > 1 else "", 0.0
                        else:
                            text, confidence = "", 0.0
                    else:
                        continue

                    text_str = str(text) if text is not None else ""
                    confidence_float = 0.0
                    if confidence is not None:
                        if isinstance(confidence, (int, float)):
                            confidence_float = float(confidence)
                        elif isinstance(confidence, str):
                            try:
                                confidence_float = float(confidence)
                            except ValueError:
                                confidence_float = 0.0
                        else:
                            confidence_float = 0.0

                    extracted_text.append(text_str)
                    bboxes.append(bbox)
                    total_confidence += confidence_float
                    line_count += 1

                except Exception:
                    extracted_text.append("")
                    bboxes.append([])
                    total_confidence += 0.0
                    line_count += 1

            avg_confidence = total_confidence / line_count if line_count > 0 else 0.0
            full_text = "\\n".join(extracted_text)

            return {
                "text": full_text,
                "confidence": avg_confidence,
                "bboxes": bboxes,
                "line_count": line_count
            }

        except Exception as e:
            print(f"❌ OCR processing failed: {e}")
            # Restore path on error
            sys.path = original_path
            return {"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0}

# Singleton instance
_ocr_instance = None

def get_isolated_ocr_processor():
    """Get singleton isolated OCR processor instance"""
    global _ocr_instance
    if _ocr_instance is None:
        _ocr_instance = IsolatedOCRProcessor()
    return _ocr_instance

if __name__ == "__main__":
    # Test the isolated OCR processor
    processor = get_isolated_ocr_processor()
    if processor.available:
        print("✅ Isolated OCR processor is available")
        # Test with an image
        test_image = "extracted_images/image1.png"
        if os.path.exists(test_image):
            result = processor.extract_text_from_image(test_image)
            print(f"OCR Result: {len(result['text'])} characters, confidence: {result['confidence']:.3f}")
            if result['text']:
                print(f"Text preview: {result['text'][:100]}...")
        else:
            print("❌ Test image not found")
    else:
        print("❌ Isolated OCR processor is not available")
'''

    # Emit the module next to wherever this script is being run from.
    Path("isolated_ocr_processor.py").write_text(module_source, encoding="utf-8")

    print("✅ Created isolated OCR processor")

def update_document_processor_for_isolation():
    """Patch ``LightRAG-main/lightrag/document_processor.py`` in place.

    Three exact-text substitutions are applied to the file (path relative to
    the current working directory):

    1. ``OCRProcessor.__init__`` / ``_initialize_ocr`` are replaced by a
       version that obtains its engine from
       ``isolated_ocr_processor.get_isolated_ocr_processor()``.
    2. The start of ``OCRProcessor.extract_text_from_image`` is replaced so
       it delegates to the isolated processor and returns its result dict.
    3. The inline PaddleOCR result-processing block that followed is removed,
       since the isolated processor already does that work.

    The patch is all-or-nothing: if any of the expected source fragments is
    missing, nothing is written and the missing fragments are reported. A
    partially patched file would pair the isolated engine with code that
    still calls ``.ocr`` on it. A file that already imports the isolated
    processor is reported as such and left untouched.

    Returns:
        None. Progress is reported via ``print``.

    Raises:
        OSError: if the target file cannot be read or written.
    """
    print("\n🔄 Updating Document Processor for Complete Isolation")
    print("=" * 50)

    target_path = "LightRAG-main/lightrag/document_processor.py"

    # Read current document processor
    with open(target_path, "r", encoding="utf-8") as f:
        content = f.read()

    # Replace the OCRProcessor class with a version that uses the isolated processor
    old_ocr_class = '''class OCRProcessor:
    """GPU-accelerated OCR processing using PaddleOCR with graceful fallback"""

    def __init__(self, use_gpu: bool = True, languages: List[str] = None):
        self.use_gpu = use_gpu
        self.languages = languages or ['en', 'ch']
        self.ocr_engine = None
        self.ocr_available = False
        self._initialize_ocr()

    def _initialize_ocr(self):
        """Initialize PaddleOCR engine with GPU only - no fallbacks"""
        try:
            logger.info("Initializing PaddleOCR with GPU mode only")
            self.ocr_engine = PaddleOCR(use_gpu=True)
            logger.info("PaddleOCR engine initialized successfully with GPU")
            self.ocr_available = True

        except Exception as e:
            logger.error(f"PaddleOCR GPU initialization failed: {e}")
            self.ocr_engine = None
            self.ocr_available = False
            raise RuntimeError(f"PaddleOCR GPU initialization failed: {e}")'''

    # The generated isolated_ocr_processor.py lives in the directory this
    # script runs from, i.e. the parent of LightRAG-main. The patched code
    # therefore adds that project root to sys.path; the LightRAG-main
    # directory is kept as a search location too.
    new_ocr_class = '''class OCRProcessor:
    """GPU-accelerated OCR processing using isolated PaddleOCR"""

    def __init__(self, use_gpu: bool = True, languages: List[str] = None):
        self.use_gpu = use_gpu
        self.languages = languages or ['en', 'ch']
        self.ocr_engine = None
        self.ocr_available = False
        self._initialize_ocr()

    def _initialize_ocr(self):
        """Initialize isolated PaddleOCR engine with GPU only"""
        try:
            logger.info("Initializing isolated PaddleOCR with GPU mode only")
            # Import the isolated OCR processor
            import sys
            import os
            parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
            project_root = os.path.dirname(parent_dir)
            for search_dir in (project_root, parent_dir):
                if search_dir not in sys.path:
                    sys.path.insert(0, search_dir)
            from isolated_ocr_processor import get_isolated_ocr_processor

            self.ocr_engine = get_isolated_ocr_processor()
            if self.ocr_engine.available:
                logger.info("Isolated PaddleOCR engine initialized successfully with GPU")
                self.ocr_available = True
            else:
                raise RuntimeError("Isolated OCR processor not available")

        except Exception as e:
            logger.error(f"Isolated PaddleOCR GPU initialization failed: {e}")
            self.ocr_engine = None
            self.ocr_available = False
            raise RuntimeError(f"Isolated PaddleOCR GPU initialization failed: {e}")'''

    # Also update the extract_text_from_image method
    old_extract_method = '''    def extract_text_from_image(self, image_path: str) -> Dict[str, Any]:
        """Extract text from image using OCR"""
        if not self.ocr_engine:
            return {"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0}

        try:
            # Perform OCR
            result = self.ocr_engine.ocr(image_path, cls=True)'''

    # The isolated processor already returns the final result dict, and the
    # block holding the old ``return`` is removed below, so the patched
    # method must return here or it would yield None on success.
    new_extract_method = '''    def extract_text_from_image(self, image_path: str) -> Dict[str, Any]:
        """Extract text from image using isolated OCR"""
        if not self.ocr_engine:
            return {"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0}

        try:
            # Perform OCR using isolated processor
            return self.ocr_engine.extract_text_from_image(image_path)'''

    # Remove the rest of the old OCR processing code since it's handled by the isolated processor
    old_ocr_processing = '''
            if not result or not result[0]:
                return {"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0}

            # Process OCR results - handle different PaddleOCR result structures
            extracted_text = []
            bboxes = []
            total_confidence = 0.0
            line_count = 0

            for line in result[0]:
                try:
                    # Handle different PaddleOCR result structures
                    if len(line) == 2:
                        # Standard structure: [[bbox], (text, confidence)]
                        bbox, (text, confidence) = line
                    elif len(line) >= 1:
                        # Handle alternative structures
                        bbox = line[0] if len(line) > 0 else []
                        if len(line) > 1:
                            if isinstance(line[1], (list, tuple)) and len(line[1]) >= 2:
                                text, confidence = line[1][0], line[1][1]
                            else:
                                text, confidence = str(line[1]) if len(line) > 1 else "", 0.0
                        else:
                            text, confidence = "", 0.0
                    else:
                        continue

                    # Ensure text is string and confidence is float
                    text_str = str(text) if text is not None else ""
                    confidence_float = 0.0
                    if confidence is not None:
                        if isinstance(confidence, (int, float)):
                            confidence_float = float(confidence)
                        elif isinstance(confidence, str):
                            try:
                                confidence_float = float(confidence)
                            except ValueError:
                                logger.warning(f"Could not convert confidence string to float: {confidence}")
                                confidence_float = 0.0
                        else:
                            logger.warning(f"Unexpected confidence type: {type(confidence)}, value: {confidence}")
                            confidence_float = 0.0
                    else:
                        confidence_float = 0.0

                    extracted_text.append(text_str)
                    bboxes.append(bbox)
                    total_confidence += confidence_float
                    line_count += 1

                except (TypeError, ValueError, IndexError) as e:
                    logger.warning(f"Type conversion error in OCR line processing: {e}")
                    # Add empty text and continue
                    extracted_text.append("")
                    bboxes.append([])
                    total_confidence += 0.0
                    line_count += 1

            try:
                avg_confidence = total_confidence / line_count if line_count > 0 else 0.0
            except (TypeError, ZeroDivisionError):
                avg_confidence = 0.0

            full_text = "\\n".join(extracted_text)

            return {
                "text": full_text,
                "confidence": avg_confidence,
                "bboxes": bboxes,
                "line_count": line_count
            }'''

    # A previous run leaves this import in the file; do not report that as
    # a failure.
    if "from isolated_ocr_processor import get_isolated_ocr_processor" in content:
        print("ℹ️ Document processor already uses the isolated OCR processor - nothing to do")
        return

    # (label, text to find, replacement) for every substitution to apply.
    replacements = [
        ("OCRProcessor class", old_ocr_class, new_ocr_class),
        ("extract_text_from_image header", old_extract_method, new_extract_method),
        ("inline OCR result processing", old_ocr_processing, ""),
    ]

    # str.replace silently does nothing on a mismatch, so verify every
    # fragment up front and refuse to write a half-patched file.
    missing = [label for label, old_text, _ in replacements if old_text not in content]
    if missing:
        print("❌ Document processor NOT updated - expected code not found: "
              + ", ".join(missing))
        return

    for _, old_text, new_text in replacements:
        content = content.replace(old_text, new_text)

    # Write the updated content
    with open(target_path, "w", encoding="utf-8") as f:
        f.write(content)

    print("✅ Document processor updated for complete isolation")

def create_final_test():
    """Write ``final_isolation_test.py`` into the current working directory.

    The generated script puts ``LightRAG-main`` on ``sys.path``, obtains the
    document processor via ``lightrag.document_processor.get_document_processor``,
    processes ``test.docx`` and prints, per extracted image, whether OCR text
    and a classification were produced (flagging any top label containing
    "bee"). An existing file of the same name is overwritten.

    Returns:
        None. Progress is reported via ``print``.
    """
    print("\n🧪 Creating Final Isolation Test")
    print("=" * 50)

    # Source of the generated script. Written verbatim, so each "\\n" below
    # becomes a regular "\n" escape inside the generated file.
    script_source = '''
"""
Final test to verify complete dependency isolation between PaddleOCR and OpenCLIP
"""

import asyncio
import sys
import os
from pathlib import Path

# Add paths
sys.path.insert(0, "LightRAG-main")

async def test_complete_isolation():
    """Test that PaddleOCR and OpenCLIP are completely isolated"""
    print("🔍 TESTING COMPLETE DEPENDENCY ISOLATION")
    print("=" * 60)

    try:
        from lightrag.document_processor import get_document_processor

        processor = get_document_processor()

        print("🎯 SYSTEM STATUS:")
        print(f"   OCR Processor: {'✅ Available' if processor.ocr_processor.ocr_available else '❌ Not Available'}")
        print(f"   Image Classifier: {'✅ Available' if processor.image_classifier and processor.image_classifier.available else '❌ Not Available'}")

        # Process test document
        test_file = "test.docx"
        if not os.path.exists(test_file):
            print(f"❌ Test file not found: {test_file}")
            return

        print(f"\\n📄 PROCESSING DOCUMENT: {test_file}")
        result = await processor.process_document(test_file)

        print(f"✅ Processing Success: {result.success}")
        print(f"📊 Metadata: {result.metadata}")

        # Check OCR results
        print(f"\\n🔤 OCR RESULTS:")
        ocr_working = False
        for i, img in enumerate(result.images):
            if 'ocr_text' in img and img['ocr_text'].strip():
                ocr_working = True
                print(f"   ✅ Image {i+1}: OCR extracted {len(img['ocr_text'])} characters")
                if img['ocr_text'].strip():
                    print(f"      Text: {img['ocr_text'][:100]}...")
            elif 'ocr_error' in img:
                print(f"   ❌ Image {i+1}: OCR failed - {img['ocr_error']}")
            else:
                print(f"   ⚠️ Image {i+1}: No OCR text extracted")

        # Check classification results
        print(f"\\n🖼️ CLASSIFICATION RESULTS:")
        classification_working = False
        bee_detected = False
        for i, img in enumerate(result.images):
            if 'classification' in img and img['classification']:
                classification_working = True
                top_label = img['classification'][0]['label'] if img['classification'] else 'unknown'
                print(f"   ✅ Image {i+1}: Classified as '{top_label}'")
                if 'bee' in top_label.lower():
                    bee_detected = True
                    print(f"      🎯 BEE DETECTED in image {i+1}!")

        print(f"\\n🎯 FINAL VERIFICATION:")
        if ocr_working:
            print("   ✅ OCR is working with complete dependency isolation")
        else:
            print("   ❌ OCR is not working properly")

        if classification_working:
            print("   ✅ Image classification is working with complete dependency isolation")
        else:
            print("   ❌ Image classification is not working properly")

        if bee_detected:
            print("   ✅ Bee image successfully detected and classified")
        else:
            print("   ❌ Bee image not detected in classifications")

        print(f"\\n🚀 DEPENDENCY ISOLATION STATUS: {'✅ SUCCESS' if ocr_working and classification_working else '❌ FAILED'}")

    except Exception as e:
        print(f"❌ Test failed: {e}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    asyncio.run(test_complete_isolation())
'''

    # Emit the script next to wherever this tool is being run from.
    Path("final_isolation_test.py").write_text(script_source, encoding="utf-8")

    print("✅ Created final isolation test")

def main():
    """Run every isolation step in order and print a summary.

    Steps: smoke-test PaddleOCR in a subprocess, generate the isolated OCR
    processor module, patch the LightRAG document processor to use it, and
    generate the final verification script. All paths used by those steps
    are relative to the current working directory.

    Returns:
        None. Progress is reported via ``print``.
    """
    print("🎯 FIXING OCR ISSUES WITH COMPLETE DEPENDENCY ISOLATION")
    print("=" * 70)

    # Test PaddleOCR alone first
    test_paddleocr_alone()

    # Create isolated OCR processor
    create_isolated_ocr_processor()

    # Update document processor
    update_document_processor_for_isolation()

    # Create final test
    create_final_test()

    # A real "\n" escape here: "\\n" would print a literal backslash-n
    # instead of the intended blank line.
    print("\n✅ COMPLETE ISOLATION SOLUTION IMPLEMENTED:")
    print("   - Created isolated OCR processor that avoids OpenCLIP paths")
    print("   - Updated document processor to use isolated components")
    print("   - Ensured complete dependency separation")
    print("\n🚀 Run the final test: python final_isolation_test.py")

if __name__ == "__main__":
    main()