Files
railseek6/fix_indexing.py

299 lines
13 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Fix indexing to include image classification results in searchable content
"""
import asyncio
import sys
import os
from pathlib import Path
# Add paths
sys.path.insert(0, "LightRAG-main")
def test_current_indexing():
    """Process test.docx and report what content would be indexed.

    Prints content previews plus any image-metadata and 'bee' lines found
    in the extracted text. All failures (including a missing lightrag
    install) are caught and printed rather than raised. Returns None.
    """
    print("🔍 Testing Current Indexing Behavior")
    print("=" * 50)
    try:
        from lightrag.document_processor import get_document_processor
        processor = get_document_processor()
        # Process test document
        test_file = "test.docx"
        if not os.path.exists(test_file):
            print(f"❌ Test file not found: {test_file}")
            return
        print(f"📄 Processing: {test_file}")
        result = asyncio.run(processor.process_document(test_file))
        print(f"✅ Processing Success: {result.success}")
        print(f"📊 Metadata: {result.metadata}")
        print(f"📝 Content Length: {len(result.content)} characters")
        # Show what content is actually being indexed
        print(f"\n📋 CONTENT PREVIEW (first 500 chars):")
        print(result.content[:500])
        print(f"\n📋 CONTENT PREVIEW (last 500 chars):")
        print(result.content[-500:])
        # BUG FIX: 'lines' was previously defined only inside the image
        # branch below, so the bee search raised NameError whenever no
        # image metadata was present. Split once, up front, for both checks.
        lines = result.content.split('\n')
        # Check for image-related content
        print(f"\n🔍 SEARCHING FOR IMAGE CONTENT:")
        if "[Image" in result.content:
            print("✅ Found image metadata in content")
            # Extract all image-related lines
            image_lines = [line for line in lines if '[Image' in line]
            for line in image_lines:
                print(f" {line}")
        else:
            print("❌ No image metadata found in content")
        # Check for bee-related content
        print(f"\n🐝 SEARCHING FOR BEE CONTENT:")
        if 'bee' in result.content.lower():
            print("✅ Found 'bee' in content")
            bee_lines = [line for line in lines if 'bee' in line.lower()]
            for line in bee_lines:
                print(f" {line}")
        else:
            print("❌ No 'bee' found in content")
    except Exception as e:
        print(f"❌ Test failed: {e}")
        import traceback
        traceback.print_exc()
def fix_document_processor():
    """Patch LightRAG-main/lightrag/document_processor.py in place so OCR
    text and all image-classification labels are appended to the main
    searchable content.

    Raises FileNotFoundError/OSError if the processor source is missing.
    """
    print("\n🔧 Fixing Document Processor for Better Indexing")
    print("=" * 50)
    # Read the current document processor
    with open("LightRAG-main/lightrag/document_processor.py", "r", encoding="utf-8") as f:
        content = f.read()
    # NOTE(review): these snippets must match the target file byte-for-byte
    # (including indentation) or the replacements will not fire -- confirm
    # against the current document_processor.py.
    old_method = '''            # OCR processing - ensure it works properly
            if self.ocr_processor.ocr_available:
                try:
                    logger.info(f"Running OCR on image {i+1}")
                    ocr_result = self.ocr_processor.extract_text_from_image(temp_path)
                    logger.info(f"OCR result for image {i+1}: {len(ocr_result['text'])} characters, confidence: {ocr_result['confidence']}")
                    if ocr_result["text"].strip():
                        image_metadata["ocr_text"] = ocr_result["text"]
                        image_metadata["ocr_confidence"] = ocr_result["confidence"]
                        additional_content.append(f"[Image {i+1} OCR Text]: {ocr_result['text']}")
                    else:
                        logger.warning(f"OCR returned empty text for image {i+1}")
                except Exception as ocr_error:
                    logger.error(f"OCR processing failed for image {i+1}: {ocr_error}")
                    image_metadata["ocr_error"] = str(ocr_error)'''
    new_method = '''            # OCR processing - ensure it works properly
            if self.ocr_processor.ocr_available:
                try:
                    logger.info(f"Running OCR on image {i+1}")
                    ocr_result = self.ocr_processor.extract_text_from_image(temp_path)
                    logger.info(f"OCR result for image {i+1}: {len(ocr_result['text'])} characters, confidence: {ocr_result['confidence']}")
                    if ocr_result["text"].strip():
                        image_metadata["ocr_text"] = ocr_result["text"]
                        image_metadata["ocr_confidence"] = ocr_result["confidence"]
                        additional_content.append(f"[Image {i+1} OCR Text]: {ocr_result['text']}")
                        # Add OCR text directly to main content for better searchability
                        additional_content.append(ocr_result["text"])
                    else:
                        logger.warning(f"OCR returned empty text for image {i+1}")
                except Exception as ocr_error:
                    logger.error(f"OCR processing failed for image {i+1}: {ocr_error}")
                    image_metadata["ocr_error"] = str(ocr_error)'''
    # BUG FIX: str.replace() silently no-ops when the snippet is absent,
    # so the script would claim success while patching nothing. Check first.
    if old_method in content:
        content = content.replace(old_method, new_method)
    else:
        print("⚠️ OCR snippet not found in document_processor.py - OCR patch skipped")
    # Also fix the classification part to add more searchable content
    old_classification = '''            # Image classification
            if self.image_classifier and self.image_classifier.available:
                try:
                    classification_results = self.image_classifier.classify_image(temp_path, top_k=3)
                    image_metadata["classification"] = classification_results
                    # Add classification to content for indexing
                    top_label = classification_results[0]["label"] if classification_results else "unknown"
                    image_metadata["primary_classification"] = top_label
                    additional_content.append(f"[Image {i+1} Classification]: {top_label}")
                except Exception as classify_error:
                    logger.error(f"Image classification failed for image {i+1}: {classify_error}")
                    image_metadata["classification_error"] = str(classify_error)'''
    new_classification = '''            # Image classification
            if self.image_classifier and self.image_classifier.available:
                try:
                    classification_results = self.image_classifier.classify_image(temp_path, top_k=3)
                    image_metadata["classification"] = classification_results
                    # Add classification to content for indexing
                    top_label = classification_results[0]["label"] if classification_results else "unknown"
                    image_metadata["primary_classification"] = top_label
                    additional_content.append(f"[Image {i+1} Classification]: {top_label}")
                    # Add all classification labels for better searchability
                    for j, cls in enumerate(classification_results):
                        additional_content.append(f"Image {i+1} classified as: {cls['label']} with confidence {cls['confidence']:.3f}")
                except Exception as classify_error:
                    logger.error(f"Image classification failed for image {i+1}: {classify_error}")
                    image_metadata["classification_error"] = str(classify_error)'''
    if old_classification in content:
        content = content.replace(old_classification, new_classification)
    else:
        print("⚠️ Classification snippet not found in document_processor.py - classification patch skipped")
    # Write the fixed content back
    with open("LightRAG-main/lightrag/document_processor.py", "w", encoding="utf-8") as f:
        f.write(content)
    print("✅ Document processor updated for better indexing")
def create_enhanced_test():
"""Create a test that simulates the full upload and search workflow"""
print("\n🚀 Creating Enhanced Search Test")
print("=" * 50)
test_code = '''
"""
Enhanced test that simulates upload, indexing, and search
"""
import asyncio
import sys
import os
from pathlib import Path
# Add paths
sys.path.insert(0, "LightRAG-main")
async def test_full_workflow():
"""Test the complete workflow including simulated search"""
print("🔍 TESTING COMPLETE WORKFLOW WITH SEARCH")
print("=" * 60)
try:
from lightrag.document_processor import get_document_processor
processor = get_document_processor()
# Process test document
test_file = "test.docx"
if not os.path.exists(test_file):
print(f"❌ Test file not found: {test_file}")
return
print(f"📄 Processing: {test_file}")
result = await processor.process_document(test_file)
if not result.success:
print(f"❌ Processing failed: {result.error}")
return
print(f"✅ Processing Success")
print(f"📊 Metadata: {result.metadata}")
# Simulate indexing and search
print(f"\\n🔍 SIMULATING INDEXING AND SEARCH")
print("=" * 40)
# Extract all searchable content
search_content = result.content.lower()
# Test various search queries
test_queries = [
"bee", "insect", "animal", "clipart", "image",
"docker", "windows", "autologin", "configuration"
]
print("📋 SEARCH RESULTS:")
for query in test_queries:
if query in search_content:
print(f"'{query}': FOUND in indexed content")
# Show context
idx = search_content.find(query)
context = result.content[max(0, idx-50):min(len(result.content), idx+50)]
print(f" Context: ...{context}...")
else:
print(f"'{query}': NOT FOUND in indexed content")
# Specifically check for image classifications
print(f"\\n🖼 IMAGE CLASSIFICATION SEARCH:")
bee_found = False
for i, img in enumerate(result.images):
if 'primary_classification' in img:
classification = img['primary_classification'].lower()
print(f" Image {i+1}: {classification}")
if 'bee' in classification:
bee_found = True
print(f" 🎯 BEE DETECTED in image {i+1}")
else:
print(f" Image {i+1}: No classification available")
if not bee_found:
print(" ❌ No bee detected in any image classifications")
# Check if bee appears in any OCR text
print(f"\\n🔤 OCR TEXT ANALYSIS:")
bee_in_ocr = False
for i, img in enumerate(result.images):
if 'ocr_text' in img and img['ocr_text']:
ocr_text = img['ocr_text'].lower()
if 'bee' in ocr_text:
bee_in_ocr = True
print(f" ✅ Image {i+1} OCR contains 'bee': {ocr_text[:100]}...")
else:
print(f" Image {i+1} OCR: {ocr_text[:50]}..." if ocr_text else " Image {i+1}: No OCR text")
else:
print(f" Image {i+1}: No OCR text available")
print(f"\\n🎯 FINAL BEE DETECTION STATUS:")
if bee_found or bee_in_ocr or 'bee' in search_content:
print(" ✅ BEE CONTENT IS SEARCHABLE AND INDEXED")
else:
print(" ❌ BEE CONTENT IS NOT PROPERLY INDEXED")
print(" 📝 Recommendations:")
print(" - Ensure image classifications are included in main content")
print(" - Add classification labels to searchable text")
print(" - Include OCR text from images in search index")
except Exception as e:
print(f"❌ Test failed: {e}")
import traceback
traceback.print_exc()
if __name__ == "__main__":
asyncio.run(test_full_workflow())
'''
with open("enhanced_search_test.py", "w", encoding="utf-8") as f:
f.write(test_code)
print("✅ Created enhanced search test")
def main():
    """Run all fixes: diagnose current indexing, patch the document
    processor, and generate the follow-up search test script."""
    print("🎯 FIXING INDEXING FOR BEE DETECTION")
    print("=" * 60)
    # Test current state
    test_current_indexing()
    # Fix the document processor
    fix_document_processor()
    # Create enhanced test
    create_enhanced_test()
    # BUG FIX: these two prints used "\\n" (a literal backslash-n, evidently
    # pasted from inside the triple-quoted template above); outside a nested
    # string literal a real newline escape is intended.
    print(f"\n✅ FIXES APPLIED:")
    print(" - Enhanced OCR text inclusion in searchable content")
    print(" - Improved image classification metadata indexing")
    print(" - Created comprehensive search test")
    print(f"\n🚀 Run the test: python enhanced_search_test.py")
# Script entry point: diagnose, patch, and emit the verification script.
if __name__ == "__main__":
    main()