# railseek6/test_standalone.py

"""
Standalone test for document processing without dependency conflicts
Tests the enhanced pipeline with isolated modules
"""

import os
import sys
import asyncio
import logging
from pathlib import Path

# Timestamped, INFO-level logging for the whole run
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Put the LightRAG-main directory next to this script, followed by the
# script's own directory, at the front of the import search path.
current_dir = Path(__file__).parent
lightrag_dir = current_dir / "LightRAG-main"
sys.path[:0] = [str(lightrag_dir), str(current_dir)]

async def test_document_with_images():
    """Run the document processor over ``test.docx`` and report the outcome.

    Prints the processor's OCR / image-classifier availability flags, then,
    for a successful run, the metadata, the list of images and a preview of
    the extracted content.

    Returns:
        False when ``test.docx`` does not exist or when any step raises
        (the traceback is printed); otherwise the ``success`` value of the
        processing result.
    """

    print("🧪 Testing Document Processing with Images")
    print("=" * 50)

    # Path is relative, so it is looked up in the current working directory
    test_file = "test.docx"

    if not os.path.exists(test_file):
        print(f"❌ Test file {test_file} not found")
        return False

    try:
        # Imported inside the try so an import failure is reported as a failed test
        from lightrag.document_processor import get_document_processor

        processor = get_document_processor()

        print(f"📄 Processing: {test_file}")
        print(f"🔧 OCR Available: {processor.ocr_processor.ocr_available}")
        print(f"🖼️ Image Classifier Available: {processor.image_classifier and processor.image_classifier.available}")

        result = await processor.process_document(test_file)

        # Failure path first: nothing else to report for a failed run
        if not result.success:
            print(f"❌ Processing failed: {result.error}")
            return result.success

        print("✅ Document processed successfully")
        print(f"📊 Metadata: {result.metadata}")

        # Report every image the processor returned, numbered from 1
        if result.images:
            print(f"🖼️ Found {len(result.images)} images in document")
            for position, picture in enumerate(result.images, start=1):
                print(f"  Image {position}: {picture}")
        else:
            print("❌ No images found in document")

        # Size and first 200 characters of the extracted content
        print(f"📝 Content length: {len(result.content)} characters")
        print(f"📋 Content preview: {result.content[:200]}...")

        return result.success

    except Exception as exc:
        print(f"❌ Test failed: {exc}")
        import traceback
        traceback.print_exc()
        return False

async def test_image_extraction():
    """Count the pictures among the inline shapes of ``test.docx``.

    Opens the document with python-docx, lists its inline shapes and prints,
    for each one, whether it is a picture.

    Returns:
        bool: True when at least one inline picture is found. False when none
        is found or when any step raises (python-docx not installed, file
        missing or unreadable, ...); the error message is printed.
    """

    print("\n🔍 Testing Image Extraction from Word Documents")
    print("=" * 50)

    try:
        import docx
        from docx.enum.shape import WD_INLINE_SHAPE

        test_file = "test.docx"
        doc = docx.Document(test_file)

        # Inline shapes cover pictures as well as charts, SmartArt, etc.
        inline_shapes = list(doc.inline_shapes)
        print(f"📊 Found {len(inline_shapes)} inline shapes in document")

        # python-docx's InlineShape has no ``image`` attribute (only height,
        # type and width), so a hasattr() probe can never match. The shape's
        # ``type`` is what tells pictures apart from other inline content.
        picture_types = (WD_INLINE_SHAPE.PICTURE, WD_INLINE_SHAPE.LINKED_PICTURE)

        image_count = 0
        for number, shape in enumerate(inline_shapes, start=1):
            if shape.type in picture_types:
                image_count += 1
                print(f"  ✅ Shape {number} is an image")
            else:
                print(f"  ❌ Shape {number} is not an image")

        print(f"🖼️ Total images found: {image_count}")

        return image_count > 0

    except Exception as e:
        print(f"❌ Image extraction test failed: {e}")
        return False

async def test_ocr_functionality():
    """Check that the OCR processor is available and try it on a sample image.

    Builds an ``OCRProcessor`` with ``use_gpu=True``. When it reports itself
    available, the first of the known sample images that exists on disk is
    run through it and the extracted text length, confidence and a short
    preview are printed.

    Returns:
        The processor's ``ocr_available`` value, or False when any step raises.
    """

    print("\n🔤 Testing OCR Functionality")
    print("=" * 50)

    try:
        from lightrag.document_processor import OCRProcessor

        ocr_processor = OCRProcessor(use_gpu=True)

        # Nothing to exercise when the OCR backend did not come up
        if not ocr_processor.ocr_available:
            print("❌ OCR processor not available")
            return ocr_processor.ocr_available

        print("✅ OCR processor is available")

        # Only the first sample image found on disk is processed
        candidates = ["ocr_high_res.png", "ocr_page1_preview.png"]
        sample = next((path for path in candidates if os.path.exists(path)), None)

        if sample is None:
            print("⚠️ No test images found for OCR testing")
        else:
            print(f"🧪 Testing OCR on: {sample}")
            ocr_output = ocr_processor.extract_text_from_image(sample)
            print(f"  Text extracted: {len(ocr_output['text'])} characters")
            print(f"  Confidence: {ocr_output['confidence']:.4f}")
            # Skip the preview when the text is empty or whitespace only
            if ocr_output['text'].strip():
                print(f"  Preview: {ocr_output['text'][:100]}...")

        return ocr_processor.ocr_available

    except Exception as exc:
        print(f"❌ OCR test failed: {exc}")
        return False

def _probe_open_clip():
    """Import OpenCLIP and torch, then build the ViT-B-32 model once.

    Returns:
        True when the model is created, and also when an ImportError is
        raised (OpenCLIP is treated as optional); False for any other error.
    """
    try:
        import open_clip
        import torch
        print("✅ OpenCLIP imported successfully")

        # The returned objects are not used; building them is the check itself
        print("🔄 Initializing OpenCLIP model...")
        clip_model, _, preprocess = open_clip.create_model_and_transforms(
            model_name="ViT-B-32",
            pretrained="laion2b_s34b_b79k"
        )
        print("✅ OpenCLIP model initialized successfully")

        return True

    except ImportError:
        print("⚠️ OpenCLIP not available - this is expected if not installed")
        return True
    except Exception as clip_error:
        print(f"⚠️ OpenCLIP initialization failed: {clip_error}")
        print("This might be due to CUDA conflicts with PaddleOCR")
        return False


async def test_dependency_isolation():
    """Check that PaddleOCR and OpenCLIP can be loaded in the same process.

    PaddleOCR is imported first and is mandatory; OpenCLIP is imported and
    initialised afterwards and is allowed to be missing.

    Returns:
        bool: False when PaddleOCR cannot be imported or OpenCLIP fails to
        initialise; True otherwise.
    """

    print("\n🛡️ Testing Dependency Isolation")
    print("=" * 50)

    try:
        # PaddleOCR goes first so OpenCLIP is loaded on top of it
        print("🔧 Importing PaddleOCR...")
        import paddleocr
        from paddleocr import PaddleOCR
        print("✅ PaddleOCR imported successfully")

        print("🔧 Importing OpenCLIP...")
        return _probe_open_clip()

    except Exception as exc:
        print(f"❌ Dependency isolation test failed: {exc}")
        return False

async def main():
    """Run the four checks in order and print a pass/fail summary.

    Returns:
        bool: True only when every check returned a truthy result.
    """

    print("🚀 Starting Standalone Document Processing Tests")
    print("=" * 60)

    # Execution order: dependency isolation, OCR, image extraction,
    # full document processing.
    checks = (
        test_dependency_isolation,
        test_ocr_functionality,
        test_image_extraction,
        test_document_with_images,
    )
    total_tests = len(checks)

    # Checks run one after another; each truthy result counts as a pass
    tests_passed = 0
    for check in checks:
        if await check():
            tests_passed += 1

    print(f"\n📊 Test Summary: {tests_passed}/{total_tests} tests passed")

    all_passed = tests_passed == total_tests
    if all_passed:
        print("🎉 All tests passed! The enhanced pipeline is working correctly.")
    else:
        print("⚠️ Some tests failed. Check the output above for details.")

    return all_passed

if __name__ == "__main__":
    # Exit status mirrors the overall result: 0 when all checks passed, 1 otherwise
    success = asyncio.run(main())
    exit_status = 0 if success else 1
    sys.exit(exit_status)