Files
railseek6/word_image_extractor.py

49 lines
1.6 KiB
Python

import os
import shutil
import zipfile
from pathlib import Path
def extract_images_from_docx(docx_path, output_dir):
    """Extract the embedded media files from a Word document.

    The document is opened as a ZIP archive and every file entry stored
    under ``word/media/`` is copied into ``output_dir``. Only the base name
    of each entry is kept, so entries in nested sub-folders are flattened
    into ``output_dir`` (entries sharing a base name overwrite each other).

    Args:
        docx_path: Path of the ``.docx`` file to read.
        output_dir: Directory to write the extracted files into; it is
            created when it does not exist yet.

    Returns:
        A list with the path of every file written, in archive order.
        An empty list is returned when the archive holds no media files or
        when any error occurs; in the error case the problem is printed
        rather than raised.
    """
    images = []
    try:
        # Create output directory
        os.makedirs(output_dir, exist_ok=True)
        # Open the docx as a zip file
        with zipfile.ZipFile(docx_path, 'r') as zip_ref:
            for file_info in zip_ref.infolist():
                # Only entries in the media directory hold images
                if not file_info.filename.startswith('word/media/'):
                    continue
                # Explicit directory entries (e.g. "word/media/") carry no
                # data; writing them out would create a bogus empty file.
                if file_info.is_dir():
                    continue
                image_filename = Path(file_info.filename).name
                image_path = os.path.join(output_dir, image_filename)
                # Stream the entry to disk instead of loading it whole
                with zip_ref.open(file_info) as source, open(image_path, 'wb') as target:
                    shutil.copyfileobj(source, target)
                images.append(image_path)
                print(f"📸 Extracted image: {image_path}")
        return images
    except Exception as e:
        # Best-effort contract: report the failure and return no results
        print(f"❌ Error extracting images: {e}")
        return []
if __name__ == "__main__":
    # Manual smoke test: extract from a local "test.docx" when one exists
    # in the current working directory, otherwise report that it is missing.
    sample_doc = "test.docx"
    if not os.path.exists(sample_doc):
        print(f"❌ Test document not found: {sample_doc}")
    else:
        extracted = extract_images_from_docx(sample_doc, "extracted_images")
        print(f"📊 Found {len(extracted)} images")
        for extracted_path in extracted:
            print(f" - {extracted_path}")