# railseek6/test_complete_solution.py

"""
Complete Test for Document Processing Pipeline with Dependency Isolation
Tests OCR, Image Classification, and Bee Detection
"""

import asyncio
import sys
import os
import tempfile
import zipfile

# Add paths: make "LightRAG-main" the first import search location
# (presumably where the `lightrag` package imported below lives — confirm).
# NOTE: the entry is a relative path, so it is resolved against the current
# working directory at import time, not against this file's location.
sys.path.insert(0, "LightRAG-main")

def _report_ocr(images):
    """Print one OCR status line per image and return True if any image has text.

    Args:
        images: Iterable of dict-like image records. The keys read are
            'ocr_text', 'ocr_confidence' and 'ocr_error'.

    Returns:
        True when at least one record carries non-blank 'ocr_text'.
    """
    any_text = False
    for number, img in enumerate(images, start=1):
        # `or ''` keeps an explicit None under 'ocr_text' from raising on
        # .strip(); such a record is reported like one without OCR text.
        text = img.get('ocr_text') or ''
        if text.strip():
            any_text = True
            confidence = img.get('ocr_confidence', 0)
            print(f"   ✅ Image {number}: {len(text)} chars, confidence: {confidence:.3f}")
            print(f"      Text: {text[:50]}...")
        elif 'ocr_error' in img:
            print(f"   ❌ Image {number}: {img['ocr_error']}")
        else:
            print(f"   ⚠️ Image {number}: No OCR text")
    return any_text


def _report_classification(images):
    """Print one classification status line per image.

    Args:
        images: Iterable of dict-like image records. The keys read are
            'classification' (a sequence whose first item is a dict with
            'label' and 'confidence') and 'classification_error'.

    Returns:
        A ``(any_classified, bee_found)`` tuple of booleans.
    """
    any_classified = False
    bee_found = False
    for number, img in enumerate(images, start=1):
        predictions = img.get('classification')
        if predictions:
            any_classified = True
            # Only the first prediction is inspected.
            top_result = predictions[0]
            label = top_result.get('label', 'unknown')
            score = top_result.get('confidence', 0)
            print(f"   ✅ Image {number}: {label} (score: {score:.3f})")
            # NOTE(review): this is a plain substring match, so labels such
            # as "beer glass" or "frisbee" also count as a bee.
            if 'bee' in label.lower():
                bee_found = True
                print("      🎯 BEE DETECTED!")
        elif 'classification_error' in img:
            print(f"   ❌ Image {number}: {img['classification_error']}")
        else:
            print(f"   ⚠️ Image {number}: No classification")
    return any_classified, bee_found


async def test_complete_solution():
    """Test the complete document processing pipeline.

    Processes ``test.docx`` (looked up relative to the current working
    directory) with the processor returned by ``get_document_processor()``,
    prints per-image OCR and classification results plus a summary, then runs
    the additional OCR and classification checks.

    Nothing is returned and nothing is raised: a missing test file or an
    unsuccessful processing result is printed and ends the run early, and any
    exception is printed together with its traceback.
    """
    print("🧪 COMPLETE SOLUTION TEST")
    print("=" * 50)

    try:
        # Imported lazily so an import failure is reported by the handler below.
        from lightrag.document_processor import get_document_processor

        processor = get_document_processor()

        print("🎯 COMPONENT STATUS:")
        print(f"   OCR: {'✅ Available' if processor.ocr_processor.ocr_available else '❌ Not Available'}")
        print(f"   Image Classifier: {'✅ Available' if processor.image_classifier and processor.image_classifier.available else '❌ Not Available'}")

        # Test 1: Process test.docx with bee image
        test_file = "test.docx"
        if not os.path.exists(test_file):
            print(f"❌ Test file not found: {test_file}")
            return

        print(f"\n📄 PROCESSING: {test_file}")
        result = await processor.process_document(test_file)

        if not result.success:
            print(f"❌ Processing failed: {result.error}")
            return

        print("✅ Processing successful")
        print(f"📊 Metadata: {result.metadata}")

        # Treat a missing image list as "no images" instead of failing the run.
        images = result.images or []

        # Check OCR results
        print("\n🔤 OCR PERFORMANCE:")
        ocr_success = _report_ocr(images)

        # Check classification
        print("\n🖼️ CLASSIFICATION PERFORMANCE:")
        classification_success, bee_found = _report_classification(images)

        print("\n🎯 FINAL RESULTS:")
        print(f"   OCR: {'✅ WORKING' if ocr_success else '❌ FAILED'}")
        print(f"   Classification: {'✅ WORKING' if classification_success else '❌ FAILED'}")
        print(f"   Bee Detection: {'✅ SUCCESS' if bee_found else '❌ NOT FOUND'}")
        # Isolation is reported as achieved only when both components produced output.
        print(f"   Dependency Isolation: {'✅ ACHIEVED' if ocr_success and classification_success else '❌ FAILED'}")

        # Test 2: Test OCR with a simple image
        print("\n🧪 ADDITIONAL OCR TEST:")
        test_simple_ocr()

        # Test 3: Test image classification with virtual environment
        print("\n🧪 ADDITIONAL CLASSIFICATION TEST:")
        await test_simple_classification(processor)

    except Exception as e:
        # Top-level boundary of the script: report the failure with full context.
        print(f"❌ Test failed: {e}")
        import traceback
        traceback.print_exc()

def test_simple_ocr():
    """Test OCR with simple processor.

    Obtains the processor from ``get_simple_ocr_processor()`` and, when it
    reports itself available, calls ``extract_text_from_image`` once. The
    check prints "Working" when the returned value contains ``'text'`` and
    "Failed" otherwise.

    Nothing is returned and nothing is raised: every exception, including a
    failed import of ``simple_ocr_processor``, is printed as a failure line.
    """
    try:
        from simple_ocr_processor import get_simple_ocr_processor

        processor = get_simple_ocr_processor()
        if not processor.available:
            print("   ❌ Simple OCR processor not available")
            return

        # Reserve a scratch .png path. The file is empty and is not handed to
        # the processor yet; it is a placeholder for a real test image.
        with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as f:
            test_image_path = f.name

        try:
            # For now, just test if processor works
            result = processor.extract_text_from_image("test.docx")  # This will fail but test the process
            print(f"   ✅ OCR subprocess execution: {'Working' if 'text' in result else 'Failed'}")
        finally:
            # delete=False makes this function the owner of the scratch file,
            # so remove it even when the OCR call or the check above raises.
            if os.path.exists(test_image_path):
                os.unlink(test_image_path)

    except Exception as e:
        print(f"   ❌ OCR test failed: {e}")

async def test_simple_classification(processor):
    """Test image classification on the first image embedded in ``test.docx``.

    The .docx file is opened as a zip archive, the first file entry under
    ``word/media/`` is copied into a temporary directory, and that copy is
    passed to ``processor.image_classifier.classify_image`` with ``top_k=3``.
    The outcome is printed.

    Args:
        processor: Object exposing ``image_classifier``; the classifier must
            provide ``available`` and ``classify_image(path, top_k=...)``.

    Nothing is returned and nothing is raised: an unavailable classifier or
    an archive without media is printed, and any exception is printed as a
    failure line. The coroutine contains no ``await`` and runs to completion
    without yielding.
    """
    classifier = processor.image_classifier
    if not classifier or not classifier.available:
        print("   ❌ Image classifier not available")
        return

    try:
        # Extract first image from test.docx for classification test
        with tempfile.TemporaryDirectory() as temp_dir:
            test_image = None
            with zipfile.ZipFile("test.docx", 'r') as zip_ref:
                # Directory entries are skipped: their basename is empty, which
                # would make the copy below try to open temp_dir as a file.
                first_media = next(
                    (
                        info
                        for info in zip_ref.infolist()
                        if info.filename.startswith('word/media/') and not info.is_dir()
                    ),
                    None,
                )
                if first_media is not None:
                    # Keep only the base name so the copy lands directly in temp_dir.
                    test_image = os.path.join(temp_dir, os.path.basename(first_media.filename))
                    with zip_ref.open(first_media) as source, open(test_image, 'wb') as target:
                        target.write(source.read())

            if test_image is None:
                print("   ⚠️ No images found in test.docx for classification test")
                return

            print(f"   Testing classification on: {os.path.basename(test_image)}")
            results = classifier.classify_image(test_image, top_k=3)

            # A first item containing 'error' is treated as a failed classification.
            if results and 'error' not in results[0]:
                print("   ✅ Classification working")
                for prediction in results:
                    print(f"      {prediction['label']}: {prediction['confidence']:.4f}")
                    if 'bee' in prediction['label'].lower():
                        print("      🎯 BEE CLASSIFICATION SUCCESS!")
            else:
                print(f"   ❌ Classification failed: {results}")

    except Exception as e:
        print(f"   ❌ Classification test failed: {e}")

if __name__ == "__main__":
    # Script entry point: build the pipeline-test coroutine and run it to
    # completion on a fresh event loop.
    pipeline_test = test_complete_solution()
    asyncio.run(pipeline_test)