# railseek6/word_image_extractor.py


import os
import shutil
import zipfile
from pathlib import Path

def extract_images_from_docx(docx_path, output_dir):
    """
    Extract all media files embedded in a Word document.

    The .docx is opened as a zip archive and every file entry stored under
    ``word/media/`` is copied into ``output_dir`` under the entry's base
    name. Entries that share a base name therefore overwrite each other.

    Args:
        docx_path: Path to the .docx file to read.
        output_dir: Directory to write the extracted files into; it is
            created if it does not exist.

    Returns:
        A list with the path of every file written, in archive order.
        An empty list is returned when the document has no media entries
        or when any error occurs (the error is printed, not raised).
    """
    images = []

    try:
        # Create output directory
        os.makedirs(output_dir, exist_ok=True)

        # Open the docx as a zip file
        with zipfile.ZipFile(docx_path, 'r') as zip_ref:
            for file_info in zip_ref.infolist():
                # Only entries in the media directory are of interest
                if not file_info.filename.startswith('word/media/'):
                    continue

                # Archives may carry explicit directory entries such as
                # 'word/media/' itself; they hold no data and would
                # otherwise be written out as an empty file named 'media'.
                if file_info.is_dir():
                    continue

                # Flatten to the base name so nothing is written outside
                # of output_dir, whatever the archive path looks like.
                image_filename = Path(file_info.filename).name
                image_path = os.path.join(output_dir, image_filename)

                # Stream the entry to disk instead of loading it whole
                with zip_ref.open(file_info) as source, open(image_path, 'wb') as target:
                    shutil.copyfileobj(source, target)

                images.append(image_path)
                print(f"📸 Extracted image: {image_path}")

        return images

    except Exception as e:
        # Deliberate best-effort: report the problem and hand back an
        # empty result rather than propagating the error to the caller.
        print(f"❌ Error extracting images: {e}")
        return []

if __name__ == "__main__":
    # Manual smoke test: extract the images of a local "test.docx", if one
    # exists in the current working directory, and list what was written.
    test_doc = "test.docx"
    if not os.path.exists(test_doc):
        # Nothing to extract from; report and fall through to exit.
        print(f"❌ Test document not found: {test_doc}")
    else:
        images = extract_images_from_docx(test_doc, "extracted_images")
        print(f"📊 Found {len(images)} images")
        # One line per extracted file path
        for img in images:
            print(f"  - {img}")