49 lines
1.6 KiB
Python
49 lines
1.6 KiB
Python
|
|
import zipfile
|
|
import os
|
|
from pathlib import Path
|
|
|
|
def extract_images_from_docx(docx_path, output_dir):
    """
    Extract all images from a Word document using the zipfile method.

    The document is opened as a ZIP archive and every file entry whose
    archive name starts with ``word/media/`` is written into ``output_dir``
    under its base name. Because only the base name is kept, two entries
    that share a base name end up at the same output path and the later
    one overwrites the earlier one.

    Args:
        docx_path: Path of the .docx file to read.
        output_dir: Directory the images are written to; it is created
            if it does not exist yet.

    Returns:
        A list with the output path of every extracted file, in archive
        order. If any error occurs, the error is printed and an empty
        list is returned instead (nothing is raised to the caller).
    """
    images = []

    try:
        # Create output directory
        os.makedirs(output_dir, exist_ok=True)

        # Open the docx as a zip file
        with zipfile.ZipFile(docx_path, 'r') as zip_ref:
            for file_info in zip_ref.infolist():
                # Only entries below the media directory are of interest.
                if not file_info.filename.startswith('word/media/'):
                    continue

                # Archives may contain explicit directory entries such as
                # "word/media/" itself. They carry no image data; without
                # this check an empty file named "media" would be written
                # and reported as an extracted image.
                if file_info.is_dir():
                    continue

                # Keep only the base name so the file lands directly in
                # output_dir regardless of the path stored in the archive.
                image_filename = Path(file_info.filename).name
                image_path = os.path.join(output_dir, image_filename)

                # Extract and save
                with zip_ref.open(file_info) as source, open(image_path, 'wb') as target:
                    target.write(source.read())

                images.append(image_path)
                print(f"📸 Extracted image: {image_path}")

        return images

    except Exception as e:
        # Deliberately broad: the contract of this function is best-effort
        # extraction that reports the problem and returns an empty list
        # rather than propagating the error.
        print(f"❌ Error extracting images: {e}")
        return []
|
|
|
|
if __name__ == "__main__":
    # Manual smoke test: runs the extraction against a document in the
    # current working directory and lists whatever was written.
    test_doc = "test.docx"

    if not os.path.exists(test_doc):
        # Nothing to extract from; report and fall through to exit.
        print(f"❌ Test document not found: {test_doc}")
    else:
        images = extract_images_from_docx(test_doc, "extracted_images")
        print(f"📊 Found {len(images)} images")
        for image_file in images:
            print(f" - {image_file}")
|