"""Diagnostic script: list the images embedded in a PDF and dump them as PNGs."""

import io
import os

import fitz  # PyMuPDF
from PIL import Image


def _dump_image(doc, xref: int, page_no: int, img_no: int) -> None:
    """Report on one embedded image and save it as a PNG in the working directory.

    Args:
        doc: The open PyMuPDF document that owns the image.
        xref: Cross-reference number of the image object inside ``doc``.
        page_no: 1-based page number, used for messages and the file name.
        img_no: 1-based index of the image on its page, used the same way.

    Images with four or more colour components (CMYK) are reported and
    skipped rather than written out.
    """
    pix = fitz.Pixmap(doc, xref)
    try:
        # Subtracting the alpha flag leaves the colour component count;
        # fewer than 4 means GRAY or RGB, which is encoded to PNG below.
        if pix.n - pix.alpha >= 4:
            print(f" Image {img_no}: CMYK (skipped)")
            return
        width, height = pix.width, pix.height
        img_data = pix.tobytes("png")
    finally:
        # Drop the reference on every path (including errors) so the
        # pixmap's resources are released promptly.
        pix = None

    print(f" Image {img_no}: {width}x{height}, {len(img_data)} bytes")

    # Save the image for inspection.
    img_filename = f"ocr_page{page_no}_img{img_no}.png"
    with open(img_filename, "wb") as f:
        f.write(img_data)
    print(f" Saved as: {img_filename}")

    # Re-open the encoded bytes with Pillow as a sanity check.  A failure
    # here is reported but does not abort the rest of the analysis.
    try:
        with Image.open(io.BytesIO(img_data)) as pil_img:
            print(f" Format: {pil_img.format}, Mode: {pil_img.mode}")
    except Exception as e:
        print(f" Error opening image: {e}")


def check_pdf_images(pdf_path: str) -> None:
    """Print a per-page summary of the images in a PDF and extract them.

    Every GRAY/RGB image is written to the current working directory as
    ``ocr_page<P>_img<I>.png``; CMYK images are only reported.  Progress
    and errors are printed to stdout; nothing is returned or raised.

    Args:
        pdf_path: Path of the PDF file to inspect.  If it does not exist a
            message is printed and the function returns immediately.
    """
    if not os.path.exists(pdf_path):
        print(f"❌ {pdf_path} not found")
        return

    print(f"🖼️ Checking {pdf_path} for images...")

    try:
        # The context manager closes the document even when a page or
        # image fails part-way through.
        with fitz.open(pdf_path) as doc:
            print(f" Pages: {len(doc)}")

            for page_no, page in enumerate(doc, start=1):
                print(f"\n Page {page_no}:")

                image_list = page.get_images()
                print(f" Images found: {len(image_list)}")

                if not image_list:
                    print(" No images found on this page")
                    continue

                for img_no, img in enumerate(image_list, start=1):
                    # The first element of each entry is the image's xref.
                    _dump_image(doc, img[0], page_no, img_no)
    except Exception as e:
        # Top-level boundary of a diagnostic script: report and carry on.
        print(f"❌ Error analyzing PDF: {e}")


if __name__ == "__main__":
    check_pdf_images("ocr.pdf")