89 lines
2.9 KiB
Python
89 lines
2.9 KiB
Python
"""
|
|
Check if test.docx actually contains images and debug the image extraction
|
|
"""
|
|
|
|
import docx
|
|
from pathlib import Path
|
|
|
|
def _embedded_image_part(shape, related_parts):
    """Return the image part embedded in an inline shape, or None.

    python-docx's InlineShape has no public ``image`` attribute; the picture
    is referenced by the relationship id stored on the shape's
    ``<a:blip r:embed="...">`` element. Shapes without a picture element, or
    whose blip has no ``r:embed`` (e.g. a linked picture), yield None.

    Args:
        shape: An inline shape taken from ``Document.inline_shapes``.
        related_parts: Mapping of relationship id -> package part for the
            document part (``doc.part.related_parts``).
    """
    try:
        r_id = shape._inline.graphic.graphicData.pic.blipFill.blip.embed
    except AttributeError:
        # An intermediate element is missing, so this is not a picture shape.
        return None
    if r_id is None:
        return None
    return related_parts.get(r_id)


def check_docx_images(test_file="test.docx"):
    """Print a diagnostic summary of the images contained in a .docx file.

    Reports the inline shapes (and, for picture shapes, the embedded image
    part behind each one), the paragraph and table counts, and every part
    related to the main document part.

    Args:
        test_file: Path of the document to inspect. Defaults to "test.docx"
            so existing zero-argument callers keep working.

    Returns:
        None. All findings are printed; any error raised while analyzing the
        document is printed together with its traceback instead of
        propagating.
    """
    if not Path(test_file).exists():
        print(f"❌ {test_file} not found")
        return

    print(f"🔍 Analyzing {test_file}")

    try:
        doc = docx.Document(test_file)
        related_parts = doc.part.related_parts

        # Check inline shapes
        inline_shapes = doc.inline_shapes
        print(f"📊 Inline shapes found: {len(inline_shapes)}")

        for index, shape in enumerate(inline_shapes, start=1):
            image_part = _embedded_image_part(shape, related_parts)
            print(f" Shape {index}:")
            print(f" Type: {type(shape)}")
            print(f" Has image: {image_part is not None}")
            if image_part is not None:
                print(f" Image type: {type(image_part)}")
                print(f" Image format: {image_part.content_type}")
                print(f" Image size: {len(image_part.blob)} bytes")

        # Check other elements that might contain images
        print(f"\n📊 Paragraphs: {len(doc.paragraphs)}")
        print(f"📊 Tables: {len(doc.tables)}")

        # List every related part. related_parts is a dict keyed by
        # relationship id, so iterate items() to show the part itself rather
        # than only the rId.
        print("\n🔍 Checking document parts...")
        for r_id, part in related_parts.items():
            marker = " [image]" if part.content_type.startswith("image/") else ""
            print(f" Part: {r_id} -> {part.partname} ({part.content_type}){marker}")

    except Exception as e:
        # Debug-script boundary: report the failure with full context rather
        # than crashing, so the remaining checks can still run.
        print(f"❌ Error analyzing document: {e}")
        import traceback
        traceback.print_exc()
|
|
|
|
def check_openclip_installation():
    """Check whether OpenCLIP can be imported, and print its version.

    The import is attempted in a child Python process with a 10-second
    timeout. The child is started with the interpreter that is running this
    script (``sys.executable``), so the result reflects the active
    environment rather than whichever ``python`` happens to be first on PATH.

    Returns:
        None. The outcome (success output, import error text, or a failure to
        run the probe at all) is printed.
    """
    import subprocess
    import sys

    print("\n🔧 Checking OpenCLIP installation...")

    probe = (
        'import open_clip; print("✅ OpenCLIP imported successfully"); '
        'print(f"Version: {open_clip.__version__}")'
    )
    # sys.executable can be empty when Python cannot determine its own path;
    # fall back to the PATH lookup in that case.
    interpreter = sys.executable or "python"

    try:
        result = subprocess.run(
            [interpreter, "-c", probe],
            capture_output=True,
            text=True,
            timeout=10,
        )
    except Exception as e:
        # Covers a missing interpreter, a timeout, and any other launch error.
        print(f"❌ OpenCLIP check failed: {e}")
        return

    if result.returncode == 0:
        print(result.stdout)
    else:
        print(f"❌ OpenCLIP import failed: {result.stderr}")
|
|
|
|
def check_gpu_availability():
    """Report whether PyTorch can see any CUDA devices.

    Prints the device count and the name of each device when CUDA is
    available, and a failure line when it is not or when PyTorch cannot be
    imported.

    NOTE(review): only PyTorch's CUDA support is probed here; no
    PaddleOCR/Paddle check is performed by this function.
    """
    print("\n🎮 Checking GPU availability...")

    try:
        import torch

        cuda = torch.cuda
        if not cuda.is_available():
            print("❌ CUDA is not available")
            return

        print("✅ CUDA is available")
        print(f" GPU Count: {cuda.device_count()}")
        for device_index in range(cuda.device_count()):
            device_name = cuda.get_device_name(device_index)
            print(f" GPU {device_index}: {device_name}")
    except ImportError:
        print("❌ PyTorch not installed")
|
|
|
|
if __name__ == "__main__":
|
|
print("🔍 Debugging Document Processing Issues")
|
|
print("=" * 50)
|
|
|
|
check_docx_images()
|
|
check_openclip_installation()
|
|
check_gpu_availability() |