# Files: railseek6/check_test_doc.py
# 89 lines, 2.9 KiB, Python

"""
Check if test.docx actually contains images and debug the image extraction
"""
import docx
from pathlib import Path
def check_docx_images():
    """Report what python-docx can see inside ``test.docx``.

    Looks for ``test.docx`` in the current working directory and prints:

    * the number of inline shapes, with a per-shape probe for an ``image``
      attribute (and its type, format and byte size when present);
    * the paragraph and table counts;
    * every part related to the main document part (relationship id, part
      name, content type) plus a count of the ``image/*`` parts among them.

    Returns:
        None. All findings are printed. If the file is missing a short
        message is printed instead. Any exception raised while analysing
        the document is caught and printed together with its traceback.
    """
    test_file = "test.docx"
    if not Path(test_file).exists():
        print(f"{test_file} not found")
        return

    print(f"🔍 Analyzing {test_file}")
    try:
        doc = docx.Document(test_file)

        # Check inline shapes
        print(f"📊 Inline shapes found: {len(doc.inline_shapes)}")
        for i, shape in enumerate(doc.inline_shapes, start=1):
            # NOTE(review): python-docx's InlineShape does not appear to
            # expose an `image` attribute, so this probe may always report
            # False -- confirm against the installed version.
            has_image = hasattr(shape, 'image')
            print(f" Shape {i}:")
            print(f" Type: {type(shape)}")
            print(f" Has image: {has_image}")
            if has_image:
                image = shape.image
                size = len(image.blob) if hasattr(image, 'blob') else 'unknown'
                print(f" Image type: {type(image)}")
                print(f" Image format: {getattr(image, 'format', 'unknown')}")
                print(f" Image size: {size} bytes")

        # Check other elements that might contain images
        print(f"\n📊 Paragraphs: {len(doc.paragraphs)}")
        print(f"📊 Tables: {len(doc.tables)}")

        # Check if there are any images among the related parts.
        # `related_parts` maps relationship id -> part; iterating the mapping
        # itself would only yield the ids, which says nothing about whether
        # an image is embedded, so walk the items and show each part's
        # name and content type.
        print("\n🔍 Checking document parts...")
        image_parts = 0
        for rel_id, part in doc.part.related_parts.items():
            partname = getattr(part, 'partname', 'unknown')
            content_type = getattr(part, 'content_type', 'unknown')
            print(f" Part: {rel_id} -> {partname} ({content_type})")
            if str(content_type).startswith("image/"):
                image_parts += 1
        print(f"📊 Image parts found: {image_parts}")
    except Exception as e:
        # Debug script: report the failure instead of crashing so the
        # remaining checks still run.
        print(f"❌ Error analyzing document: {e}")
        import traceback
        traceback.print_exc()
def check_openclip_installation():
    """Check whether ``open_clip`` can be imported by this interpreter.

    The import is attempted in a child process with a 10 second timeout and
    the child's output is relayed to stdout.

    The child is started with ``sys.executable`` so the check targets the
    same interpreter / virtual environment that runs this script, rather
    than whatever ``python`` resolves to on ``PATH`` (which may be missing
    or a different installation).

    Returns:
        None. The outcome is printed: the child's stdout on success, its
        stderr on a non-zero exit, or the exception text if the child could
        not be run (including a timeout).
    """
    import os
    import subprocess
    import sys

    print("\n🔧 Checking OpenCLIP installation...")
    try:
        # The child prints a non-ASCII check mark into a pipe. Force UTF-8 on
        # both ends so a non-UTF-8 locale encoding cannot turn a successful
        # import into a UnicodeEncodeError and a false "import failed".
        child_env = {**os.environ, "PYTHONIOENCODING": "utf-8"}
        result = subprocess.run(
            [
                sys.executable, '-c',
                'import open_clip; print("✅ OpenCLIP imported successfully"); print(f"Version: {open_clip.__version__}")',
            ],
            capture_output=True,
            text=True,
            encoding="utf-8",
            errors="replace",
            timeout=10,
            env=child_env,
        )
        if result.returncode == 0:
            print(result.stdout)
        else:
            print(f"❌ OpenCLIP import failed: {result.stderr}")
    except Exception as e:
        # Covers launch failures and subprocess.TimeoutExpired; this is a
        # best-effort diagnostic, so report and carry on.
        print(f"❌ OpenCLIP check failed: {e}")
def check_gpu_availability():
    """Print whether PyTorch can see a CUDA device.

    Imports ``torch`` lazily. When CUDA is available the device count and
    the name of every device are printed; otherwise a "not available"
    message is printed. If ``torch`` cannot be imported, a "not installed"
    message is printed instead. Only PyTorch's view of CUDA is probed; no
    other framework is queried.

    Returns:
        None. Results are written to stdout.
    """
    print("\n🎮 Checking GPU availability...")
    try:
        import torch

        cuda = torch.cuda
        if not cuda.is_available():
            print("❌ CUDA is not available")
            return

        print("✅ CUDA is available")
        print(f" GPU Count: {cuda.device_count()}")
        # List each visible device by index.
        for device_index in range(cuda.device_count()):
            device_name = cuda.get_device_name(device_index)
            print(f" GPU {device_index}: {device_name}")
    except ImportError:
        print("❌ PyTorch not installed")
if __name__ == "__main__":
    # Script entry point: print a banner, then run each diagnostic in order
    # (document contents, OpenCLIP import, GPU visibility).
    banner_rule = "=" * 50
    print("🔍 Debugging Document Processing Issues")
    print(banner_rule)
    for run_check in (
        check_docx_images,
        check_openclip_installation,
        check_gpu_availability,
    ):
        run_check()