"""Extract the images embedded in a Word (.docx) document.

A .docx file is a zip archive; this module copies every file stored under
the archive's ``word/media/`` directory into an output directory.
"""

import os
import shutil
import zipfile
from pathlib import Path

# Archive directory whose members are treated as images.
MEDIA_PREFIX = 'word/media/'


def extract_images_from_docx(docx_path, output_dir):
    """Extract all images from a Word document using the zipfile method.

    Args:
        docx_path: Path to the .docx file (opened as a zip archive).
        output_dir: Directory that receives the extracted files. It is
            created if it does not exist. Files are written flat, using
            only the final component of each archive member name.

    Returns:
        A list with the path of every extracted file, in archive order.
        If any error occurs, the error is printed and an empty list is
        returned; files written before the error remain on disk.
    """
    images = []

    try:
        # Create output directory
        os.makedirs(output_dir, exist_ok=True)

        # Open the docx as a zip file
        with zipfile.ZipFile(docx_path, 'r') as zip_ref:
            for file_info in zip_ref.infolist():
                # Only members of the media directory are images.
                if not file_info.filename.startswith(MEDIA_PREFIX):
                    continue

                # Directory entries (including "word/media/" itself) carry
                # no image data; extracting them would create empty files.
                if file_info.is_dir():
                    continue

                # Keep only the final path component so nothing can be
                # written outside output_dir.
                image_filename = Path(file_info.filename).name
                if image_filename in ('', '.', '..'):
                    continue

                image_path = os.path.join(output_dir, image_filename)

                # Stream the member to disk instead of loading the whole
                # image into memory.
                with zip_ref.open(file_info) as source, \
                        open(image_path, 'wb') as target:
                    shutil.copyfileobj(source, target)

                images.append(image_path)
                print(f"📸 Extracted image: {image_path}")

        return images

    except Exception as e:
        # Deliberately broad: callers rely on this function reporting the
        # problem and returning an empty list rather than raising.
        print(f"❌ Error extracting images: {e}")
        return []


def main():
    """Run a manual extraction test against ``test.docx`` if it exists."""
    test_doc = "test.docx"
    if os.path.exists(test_doc):
        images = extract_images_from_docx(test_doc, "extracted_images")
        print(f"📊 Found {len(images)} images")
        for img in images:
            print(f" - {img}")
    else:
        print(f"❌ Test document not found: {test_doc}")


if __name__ == "__main__":
    main()