"""
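Standalone test script for document processing without dependency conflicts.

Tests the enhanced pipeline with isolated modules: PaddleOCR/OpenCLIP
coexistence, OCR, inline-image detection in ``test.docx``, and end-to-end
document processing. Exits with status 0 only if every test passes.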
"""

import os
import sys
import asyncio
import logging
from pathlib import Path

# Emit INFO-and-above records prefixed with timestamp, logger name and level.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Make this script's directory and the "LightRAG-main" directory next to it
# importable. The LightRAG directory is listed first so it ends up at the very
# front of sys.path, exactly as two successive insert(0, ...) calls leave it.
current_dir = Path(__file__).parent
lightrag_dir = current_dir.joinpath("LightRAG-main")
sys.path[:0] = [str(lightrag_dir), str(current_dir)]

async def test_document_with_images():
    """Process ``test.docx`` with the document processor and report the outcome.

    Prints the OCR and image-classifier availability flags, then (on success)
    the result metadata, every extracted image and a 200-character preview of
    the extracted content.

    Returns:
        False when ``test.docx`` does not exist in the working directory or
        any exception is raised; otherwise the ``success`` attribute of the
        processing result.
    """
    print("🧪 Testing Document Processing with Images")
    print("=" * 50)

    doc_path = "test.docx"
    if not os.path.exists(doc_path):
        print(f"❌ Test file {doc_path} not found")
        return False

    try:
        # Imported inside the try so an import failure is reported as a
        # failed test rather than aborting the whole script.
        from lightrag.document_processor import get_document_processor

        processor = get_document_processor()

        print(f"📄 Processing: {doc_path}")
        print(f"🔧 OCR Available: {processor.ocr_processor.ocr_available}")
        print(f"🖼️ Image Classifier Available: {processor.image_classifier and processor.image_classifier.available}")

        outcome = await processor.process_document(doc_path)

        # Failure path first: report the error and hand back the flag as-is.
        if not outcome.success:
            print(f"❌ Processing failed: {outcome.error}")
            return outcome.success

        print("✅ Document processed successfully")
        print(f"📊 Metadata: {outcome.metadata}")

        if outcome.images:
            print(f"🖼️ Found {len(outcome.images)} images in document")
            for position, image in enumerate(outcome.images, start=1):
                print(f"  Image {position}: {image}")
        else:
            print("❌ No images found in document")

        print(f"📝 Content length: {len(outcome.content)} characters")
        print(f"📋 Content preview: {outcome.content[:200]}...")

        return outcome.success

    except Exception as exc:
        print(f"❌ Test failed: {exc}")
        import traceback
        traceback.print_exc()
        return False

async def test_image_extraction():
    """Count the inline pictures python-docx reports for ``test.docx``.

    Each inline shape is classified by its ``type`` property: embedded and
    linked pictures count as images, anything else (charts, SmartArt,
    unsupported shapes) does not. python-docx ``InlineShape`` objects expose
    no ``image`` attribute, so an attribute probe would never match.

    Returns:
        True when at least one inline picture is found. False when there are
        none, when ``test.docx`` is missing from the working directory, or
        when python-docx cannot be imported or fails to open the file.
    """
    print("\n🔍 Testing Image Extraction from Word Documents")
    print("=" * 50)

    test_file = "test.docx"

    # Report a missing fixture explicitly instead of letting it surface as an
    # opaque error from the document loader.
    if not os.path.exists(test_file):
        print(f"❌ Test file {test_file} not found")
        return False

    try:
        import docx
        from docx.enum.shape import WD_INLINE_SHAPE

        picture_types = (
            WD_INLINE_SHAPE.PICTURE,
            WD_INLINE_SHAPE.LINKED_PICTURE,
        )

        doc = docx.Document(test_file)

        inline_shapes = list(doc.inline_shapes)
        print(f"📊 Found {len(inline_shapes)} inline shapes in document")

        image_count = 0
        for number, shape in enumerate(inline_shapes, start=1):
            if shape.type in picture_types:
                image_count += 1
                print(f"  ✅ Shape {number} is an image")
            else:
                print(f"  ❌ Shape {number} is not an image")

        print(f"🖼️ Total images found: {image_count}")

        return image_count > 0

    except Exception as e:
        print(f"❌ Image extraction test failed: {e}")
        return False

async def test_ocr_functionality():
    """Check that the OCR processor initialises and, if possible, run it once.

    Builds an ``OCRProcessor`` with ``use_gpu=True``. When it reports itself
    available, OCR is run on the first of ``ocr_high_res.png`` /
    ``ocr_page1_preview.png`` that exists in the working directory and the
    extracted text length, confidence and a short preview are printed.

    Returns:
        The processor's ``ocr_available`` flag, or False if any exception is
        raised (including a failed import).
    """
    print("\n🔤 Testing OCR Functionality")
    print("=" * 50)

    try:
        from lightrag.document_processor import OCRProcessor

        engine = OCRProcessor(use_gpu=True)

        if not engine.ocr_available:
            print("❌ OCR processor not available")
            return engine.ocr_available

        print("✅ OCR processor is available")

        # Only the first sample image that exists is exercised.
        candidates = ("ocr_high_res.png", "ocr_page1_preview.png")
        sample = next((path for path in candidates if os.path.exists(path)), None)

        if sample is None:
            print("⚠️ No test images found for OCR testing")
        else:
            print(f"🧪 Testing OCR on: {sample}")
            ocr_output = engine.extract_text_from_image(sample)
            print(f"  Text extracted: {len(ocr_output['text'])} characters")
            print(f"  Confidence: {ocr_output['confidence']:.4f}")
            if ocr_output['text'].strip():
                print(f"  Preview: {ocr_output['text'][:100]}...")

        return engine.ocr_available

    except Exception as exc:
        print(f"❌ OCR test failed: {exc}")
        return False

async def test_dependency_isolation():
    """Verify that PaddleOCR and OpenCLIP can be loaded in the same process.

    PaddleOCR must import. OpenCLIP is optional: an ``ImportError`` raised
    while importing or initialising it is reported as a warning and still
    counts as a pass, whereas any other error during that step is a failure.

    Returns:
        True when PaddleOCR imports and OpenCLIP either initialises or is
        reported as not installed; False otherwise.
    """
    print("\n🛡️ Testing Dependency Isolation")
    print("=" * 50)

    try:
        print("🔧 Importing PaddleOCR...")
        import paddleocr
        from paddleocr import PaddleOCR
        print("✅ PaddleOCR imported successfully")

        print("🔧 Importing OpenCLIP...")
        try:
            import open_clip
            import torch
            print("✅ OpenCLIP imported successfully")

            print("🔄 Initializing OpenCLIP model...")
            # The returned objects are not used; building them is the check.
            clip_model, _unused, clip_preprocess = open_clip.create_model_and_transforms(
                pretrained="laion2b_s34b_b79k",
                model_name="ViT-B-32",
            )
            print("✅ OpenCLIP model initialized successfully")
            verdict = True

        except ImportError:
            print("⚠️ OpenCLIP not available - this is expected if not installed")
            verdict = True
        except Exception as clip_error:
            print(f"⚠️ OpenCLIP initialization failed: {clip_error}")
            print("This might be due to CUDA conflicts with PaddleOCR")
            verdict = False

        return verdict

    except Exception as outer_error:
        print(f"❌ Dependency isolation test failed: {outer_error}")
        return False

async def main():
    """Run the four checks one after another and print a pass/fail summary.

    Returns:
        True only when every check returned a truthy result.
    """
    print("🚀 Starting Standalone Document Processing Tests")
    print("=" * 60)

    # Execution order: dependency isolation, OCR, image extraction, and
    # finally end-to-end document processing.
    checks = (
        test_dependency_isolation,
        test_ocr_functionality,
        test_image_extraction,
        test_document_with_images,
    )

    passed = 0
    for check in checks:
        if await check():
            passed += 1
    total = len(checks)

    print(f"\n📊 Test Summary: {passed}/{total} tests passed")

    all_passed = passed == total
    if all_passed:
        print("🎉 All tests passed! The enhanced pipeline is working correctly.")
    else:
        print("⚠️ Some tests failed. Check the output above for details.")

    return all_passed

if __name__ == "__main__":
    # Exit status mirrors the suite result: 0 when every test passed, else 1.
    success = asyncio.run(main())
    sys.exit(int(not success))