Files
railseek6/check_ocr_pdf_images.py

60 lines
2.2 KiB
Python

import fitz # PyMuPDF
from PIL import Image
import io
import os
def check_pdf_images(pdf_path):
    """Report the images embedded in a PDF and dump them as PNG files.

    For every page the number of image entries is printed. Each image with
    fewer than four colour components (GRAY or RGB) is encoded as PNG,
    written to ``ocr_page<P>_img<I>.png`` in the current working directory
    and re-opened with Pillow to print its format and mode. Images with
    four or more colour components (CMYK) are reported and skipped.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        None. All results are printed to stdout. A missing file or an error
        raised while analysing the document is reported, not propagated.
    """
    if not os.path.exists(pdf_path):
        print(f"{pdf_path} not found")
        return

    print(f"🖼️ Checking {pdf_path} for images...")
    try:
        # The context manager closes the document even when a page or an
        # image raises part-way through, so the handle is never leaked.
        with fitz.open(pdf_path) as doc:
            print(f" Pages: {len(doc)}")
            for page_no, page in enumerate(doc, start=1):
                print(f"\n Page {page_no}:")
                # Get image list
                image_list = page.get_images()
                print(f" Images found: {len(image_list)}")
                if not image_list:
                    print(" No images found on this page")
                    continue

                for img_no, img in enumerate(image_list, start=1):
                    xref = img[0]  # first item of an image entry is its xref
                    pix = fitz.Pixmap(doc, xref)
                    if pix.n - pix.alpha < 4:  # this is GRAY or RGB
                        img_data = pix.tobytes("png")
                        img_size = len(img_data)
                        print(f" Image {img_no}: {pix.width}x{pix.height}, {img_size} bytes")
                        # Save the image for inspection
                        img_filename = f"ocr_page{page_no}_img{img_no}.png"
                        with open(img_filename, "wb") as f:
                            f.write(img_data)
                        print(f" Saved as: {img_filename}")
                        # Best-effort sanity check: a failure to decode is
                        # reported but must not stop the remaining images.
                        try:
                            pil_img = Image.open(io.BytesIO(img_data))
                            print(f" Format: {pil_img.format}, Mode: {pil_img.mode}")
                        except Exception as e:
                            print(f" Error opening image: {e}")
                    else:
                        print(f" Image {img_no}: CMYK (skipped)")
                    pix = None  # free pixmap resources
    except Exception as e:
        # Top-level boundary of this diagnostic script: report and carry on.
        print(f"❌ Error analyzing PDF: {e}")
if __name__ == "__main__":
    # Script entry point: inspect the PDF named by the first command-line
    # argument, falling back to "ocr.pdf" when none is given.
    import sys

    check_pdf_images(sys.argv[1] if len(sys.argv) > 1 else "ocr.pdf")