""" Check if test.docx actually contains images and debug the image extraction """ import docx from pathlib import Path def check_docx_images(): """Check what's actually in test.docx""" test_file = "test.docx" if not Path(test_file).exists(): print(f"āŒ {test_file} not found") return print(f"šŸ” Analyzing {test_file}") try: doc = docx.Document(test_file) # Check inline shapes print(f"šŸ“Š Inline shapes found: {len(doc.inline_shapes)}") for i, shape in enumerate(doc.inline_shapes): print(f" Shape {i+1}:") print(f" Type: {type(shape)}") print(f" Has image: {hasattr(shape, 'image')}") if hasattr(shape, 'image'): print(f" Image type: {type(shape.image)}") print(f" Image format: {getattr(shape.image, 'format', 'unknown')}") print(f" Image size: {len(shape.image.blob) if hasattr(shape.image, 'blob') else 'unknown'} bytes") # Check other elements that might contain images print(f"\nšŸ“Š Paragraphs: {len(doc.paragraphs)}") print(f"šŸ“Š Tables: {len(doc.tables)}") # Check if there are any images in the XML print(f"\nšŸ” Checking document parts...") for part in doc.part.related_parts: print(f" Part: {part}") except Exception as e: print(f"āŒ Error analyzing document: {e}") import traceback traceback.print_exc() def check_openclip_installation(): """Check if OpenCLIP is properly installed""" print(f"\nšŸ”§ Checking OpenCLIP installation...") try: import subprocess result = subprocess.run([ 'python', '-c', 'import open_clip; print("āœ… OpenCLIP imported successfully"); print(f"Version: {open_clip.__version__}")' ], capture_output=True, text=True, timeout=10) if result.returncode == 0: print(result.stdout) else: print(f"āŒ OpenCLIP import failed: {result.stderr}") except Exception as e: print(f"āŒ OpenCLIP check failed: {e}") def check_gpu_availability(): """Check GPU availability for both PaddleOCR and OpenCLIP""" print(f"\nšŸŽ® Checking GPU availability...") try: import torch if torch.cuda.is_available(): print(f"āœ… CUDA is available") print(f" GPU Count: {torch.cuda.device_count()}") for i in range(torch.cuda.device_count()): print(f" GPU {i}: {torch.cuda.get_device_name(i)}") else: print(f"āŒ CUDA is not available") except ImportError: print(f"āŒ PyTorch not installed") if __name__ == "__main__": print("šŸ” Debugging Document Processing Issues") print("=" * 50) check_docx_images() check_openclip_installation() check_gpu_availability()