60 lines
2.2 KiB
Python
60 lines
2.2 KiB
Python
import fitz # PyMuPDF
|
|
from PIL import Image
|
|
import io
|
|
import os
|
|
|
|
def _dump_image(doc, xref, page_no, img_no):
    """Report one embedded image and, if it is GRAY/RGB, save it as a PNG.

    Args:
        doc: The open PyMuPDF document that owns the image.
        xref: Cross-reference number of the image object inside ``doc``.
        page_no: 1-based page number, used in the output file name.
        img_no: 1-based index of the image on its page, used in messages
            and in the output file name.
    """
    pix = fitz.Pixmap(doc, xref)

    # pix.n counts all components per pixel including alpha; with alpha
    # removed, fewer than 4 components means GRAY or RGB. Anything else
    # (CMYK) is reported and skipped.
    if pix.n - pix.alpha >= 4:
        print(f" Image {img_no}: CMYK (skipped)")
        return

    img_data = pix.tobytes("png")
    print(f" Image {img_no}: {pix.width}x{pix.height}, {len(img_data)} bytes")

    # Save the image for inspection.
    img_filename = f"ocr_page{page_no}_img{img_no}.png"
    with open(img_filename, "wb") as f:
        f.write(img_data)
    print(f" Saved as: {img_filename}")

    # Re-open the PNG bytes with Pillow purely to report format and mode;
    # a decoding failure is reported but must not stop the scan.
    try:
        pil_img = Image.open(io.BytesIO(img_data))
        print(f" Format: {pil_img.format}, Mode: {pil_img.mode}")
    except Exception as e:
        print(f" Error opening image: {e}")


def check_pdf_images(pdf_path):
    """Print a per-page report of the images embedded in a PDF and dump them.

    For every page the number of images is printed. Each GRAY/RGB image is
    encoded as PNG, written to ``ocr_page<P>_img<I>.png`` in the current
    working directory, and re-opened with Pillow to print its format and
    mode. CMYK images are reported and skipped.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        None. A missing file, or any error raised while analyzing the
        document, is reported on stdout instead of being raised.
    """
    if not os.path.exists(pdf_path):
        print(f"❌ {pdf_path} not found")
        return

    print(f"🖼️ Checking {pdf_path} for images...")

    try:
        # The context manager closes the document even when a page or an
        # image raises part-way through the scan.
        with fitz.open(pdf_path) as doc:
            print(f" Pages: {len(doc)}")

            for page_no, page in enumerate(doc, start=1):
                print(f"\n Page {page_no}:")

                # Get image list
                image_list = page.get_images()
                print(f" Images found: {len(image_list)}")

                if not image_list:
                    print(" No images found on this page")
                    continue

                for img_no, img in enumerate(image_list, start=1):
                    # The first item of each entry is the image's xref.
                    _dump_image(doc, img[0], page_no, img_no)

    except Exception as e:
        print(f"❌ Error analyzing PDF: {e}")
|
|
|
|
if __name__ == "__main__":
    import sys

    # Inspect the PDF named by the first command-line argument; fall back to
    # the sample file "ocr.pdf" in the current directory when none is given.
    check_pdf_images(sys.argv[1] if len(sys.argv) > 1 else "ocr.pdf")