# railseek6/check_test_doc.py

"""
Check whether test.docx actually contains images (to debug image extraction),
and report on the OpenCLIP installation and CUDA GPU availability.
"""

import docx
from pathlib import Path

def _find_inline_image_part(doc, shape):
    """Return the package part behind *shape*'s embedded picture, or None.

    An inline picture references its binary data through the ``r:embed``
    relationship id stored on its ``a:blip`` element; that id is a key of
    ``doc.part.related_parts``.  Shapes that are not pictures (no ``pic``
    element), or pictures that are linked rather than embedded (no ``embed``
    id), yield None.
    """
    try:
        pic = shape._inline.graphic.graphicData.pic
        blip = pic.blipFill.blip if pic is not None else None
        rel_id = blip.embed if blip is not None else None
    except AttributeError:
        # Unexpected XML layout: report "no image" instead of aborting.
        return None
    if not rel_id:
        return None
    return doc.part.related_parts.get(rel_id)


def check_docx_images(test_file="test.docx"):
    """Print a diagnostic summary of the images stored in a .docx file.

    Reports the inline shapes (with the content type and byte size of each
    embedded picture), the paragraph and table counts, and every part
    related to the main document part together with its content type.

    Args:
        test_file: Path of the document to inspect.  Defaults to
            ``"test.docx"`` in the current working directory.

    Returns:
        None.  All findings, including errors, are printed to stdout.
    """
    if not Path(test_file).exists():
        print(f"❌ {test_file} not found")
        return

    print(f"🔍 Analyzing {test_file}")

    try:
        doc = docx.Document(str(test_file))

        # Check inline shapes
        print(f"📊 Inline shapes found: {len(doc.inline_shapes)}")

        for i, shape in enumerate(doc.inline_shapes):
            print(f"  Shape {i+1}:")
            print(f"    Type: {type(shape)}")
            # InlineShape has no public ``image`` attribute, so the picture
            # is looked up through its relationship id instead.
            image_part = _find_inline_image_part(doc, shape)
            print(f"    Has image: {image_part is not None}")
            if image_part is not None:
                print(f"    Image type: {type(image_part)}")
                print(f"    Image format: {getattr(image_part, 'content_type', 'unknown')}")
                blob = getattr(image_part, 'blob', None)
                print(f"    Image size: {len(blob) if blob is not None else 'unknown'} bytes")

        # Check other elements that might contain images
        print(f"\n📊 Paragraphs: {len(doc.paragraphs)}")
        print(f"📊 Tables: {len(doc.tables)}")

        # List the parts related to the main document part.  related_parts is
        # a dict keyed by relationship id, so iterate items() to reach the
        # parts themselves rather than only their ids.
        print(f"\n🔍 Checking document parts...")
        image_part_count = 0
        for rel_id, part in doc.part.related_parts.items():
            content_type = getattr(part, 'content_type', 'unknown')
            partname = getattr(part, 'partname', part)
            print(f"  Part: {rel_id} -> {partname} ({content_type})")
            if str(content_type).startswith("image/"):
                image_part_count += 1
        print(f"📊 Image parts found: {image_part_count}")

    except Exception as e:
        # Diagnostic boundary: report any failure with its traceback rather
        # than letting it stop the remaining checks.
        print(f"❌ Error analyzing document: {e}")
        import traceback
        traceback.print_exc()

def check_openclip_installation(timeout=10):
    """Check whether OpenCLIP can be imported by the current interpreter.

    The import is attempted in a child process; this function itself never
    imports ``open_clip``.  The child's output (or its stderr on failure) is
    printed to stdout.

    Args:
        timeout: Seconds to wait for the child process.  Defaults to 10.

    Returns:
        None.  Any error while launching or waiting for the child process,
        including a timeout, is printed rather than raised.
    """
    print(f"\n🔧 Checking OpenCLIP installation...")

    try:
        import os
        import subprocess
        import sys

        # Run the same interpreter that executes this script; a bare
        # 'python' may resolve to a different environment or not exist.
        interpreter = sys.executable or 'python'

        # The child prints an emoji to a pipe.  Force UTF-8 on both ends so
        # a non-UTF-8 locale cannot turn a successful import into an
        # encoding error that looks like an import failure.
        child_env = dict(os.environ, PYTHONIOENCODING="utf-8")

        result = subprocess.run(
            [
                interpreter, '-c',
                'import open_clip; print("✅ OpenCLIP imported successfully"); print(f"Version: {open_clip.__version__}")'
            ],
            capture_output=True,
            text=True,
            encoding="utf-8",
            errors="replace",
            timeout=timeout,
            env=child_env,
        )

        if result.returncode == 0:
            print(result.stdout)
        else:
            print(f"❌ OpenCLIP import failed: {result.stderr}")

    except Exception as e:
        print(f"❌ OpenCLIP check failed: {e}")

def check_gpu_availability():
    """Report whether PyTorch can see a CUDA device and list each GPU by name.

    Prints a notice instead of raising when PyTorch cannot be imported.

    NOTE(review): only PyTorch's view of CUDA is queried here; GPU visibility
    for any other framework is not verified by this function.
    """
    print(f"\n🎮 Checking GPU availability...")

    try:
        import torch

        cuda = torch.cuda
        if not cuda.is_available():
            print(f"❌ CUDA is not available")
            return

        print(f"✅ CUDA is available")
        print(f"   GPU Count: {cuda.device_count()}")
        for gpu_index in range(cuda.device_count()):
            gpu_name = cuda.get_device_name(gpu_index)
            print(f"   GPU {gpu_index}: {gpu_name}")

    except ImportError:
        print(f"❌ PyTorch not installed")

if __name__ == "__main__":
    # Banner first, then run every diagnostic in a fixed order.
    print("🔍 Debugging Document Processing Issues")
    print("=" * 50)

    for run_check in (
        check_docx_images,
        check_openclip_installation,
        check_gpu_availability,
    ):
        run_check()