import fitz  # PyMuPDF
import os
import json
from PIL import Image
import io

# Grayscale values strictly above this are counted as "white".
WHITE_THRESHOLD = 240
# A page whose white share exceeds this percentage is reported as likely blank.
BLANK_PAGE_PERCENT = 95


def _white_percentage(image):
    """Return the percentage of pixels in *image* brighter than WHITE_THRESHOLD.

    The image is converted to 8-bit grayscale and its 256-bin histogram is
    summed, which gives the same count as a per-pixel comparison without
    needing numpy.
    """
    histogram = image.convert("L").histogram()
    total_pixels = sum(histogram)
    if not total_pixels:
        # A zero-area image has no content at all; treat it as fully blank.
        return 100.0
    white_pixels = sum(histogram[WHITE_THRESHOLD + 1:])
    return white_pixels / total_pixels * 100


def _report_text(page):
    """Print what each text-extraction mode finds on *page*.

    Returns the number of non-whitespace-trimmed characters found by the
    default extraction, so the caller can tell whether a text layer exists.
    """
    print(" Text extraction methods:")

    # Method 1: default plain-text extraction.
    text_default = page.get_text()
    print(f" Default: {len(text_default)} chars")
    stripped = text_default.strip()
    if stripped:
        print(f" Content: {repr(text_default[:200])}")
    else:
        print(" No text found")

    # Method 2: word-level and block-level extraction.
    text_words = page.get_text("words")
    print(f" Words: {len(text_words)} words found")
    if text_words:
        print(f" Sample words: {text_words[:5]}")

    text_blocks = page.get_text("blocks")
    print(f" Blocks: {len(text_blocks)} blocks found")
    # Index 4 of a block tuple holds the block's text content.
    for i, block in enumerate(text_blocks[:3], start=1):
        print(f" Block {i}: {repr(block[4][:100])}")

    return len(stripped)


def _report_render(page, page_number):
    """Render *page* to a PNG file and report how much of it is white.

    Rendering is best-effort: any failure is printed and swallowed so that
    the remaining pages are still analysed.
    """
    print(" Rendering page as image...")
    try:
        zoom = fitz.Matrix(2, 2)  # 2x zoom for a higher-resolution render
        img_data = page.get_pixmap(matrix=zoom).tobytes("png")
        print(f" Rendered image size: {len(img_data)} bytes")

        img_filename = f"ocr_page{page_number}_rendered.png"
        with open(img_filename, "wb") as f:
            f.write(img_data)
        print(f" Saved rendered image: {img_filename}")

        pil_img = Image.open(io.BytesIO(img_data))
        print(f" Image size: {pil_img.size}, mode: {pil_img.mode}")

        # Only RGB renders are analysed for blankness.
        if pil_img.mode == 'RGB':
            white_percentage = _white_percentage(pil_img)
            print(f" White pixels: {white_percentage:.1f}%")
            if white_percentage > BLANK_PAGE_PERCENT:
                print(" ⚠️ Image appears mostly white - may be blank or corrupted")
            else:
                print(" ✅ Image has significant content")
    except Exception as e:
        print(f" Error rendering image: {e}")


def debug_pdf_extraction(pdf_path):
    """Print a diagnostic report on how extractable the PDF at *pdf_path* is.

    For every page this prints the results of several text-extraction modes,
    renders the page to ``ocr_page<N>_rendered.png`` in the current working
    directory, and estimates whether the render is mostly blank. A final
    summary reports the file size and whether any embedded text or embedded
    images were actually found.

    Args:
        pdf_path: Path of the PDF file to analyse.

    Returns:
        None. All results are printed; errors are reported, not raised.
    """
    if not os.path.exists(pdf_path):
        print(f"❌ {pdf_path} not found")
        return

    print(f"🔍 Debugging {pdf_path} extraction...")

    try:
        total_text_chars = 0
        total_images = 0

        # The context manager closes the document even if a page fails.
        with fitz.open(pdf_path) as doc:
            print(f" Pages: {len(doc)}")

            for page_number, page in enumerate(doc, start=1):
                print(f"\n Page {page_number}:")
                total_text_chars += _report_text(page)
                total_images += len(page.get_images(full=True))
                _report_render(page, page_number)

        print(f"\n📊 Summary for {pdf_path}:")
        print(f" File size: {os.path.getsize(pdf_path)} bytes")
        if total_text_chars:
            print(f" Embedded text found: {total_text_chars} chars")
        else:
            print(" No embedded text found")
        if total_images:
            print(f" Embedded images found: {total_images}")
        else:
            print(" No embedded images found")
        if total_text_chars:
            print(" This PDF has an extractable text layer")
        else:
            print(" This appears to be a scanned PDF or corrupted file")

    except Exception as e:
        # Top-level boundary of a debugging tool: report and carry on.
        print(f"❌ Error analyzing PDF: {e}")


if __name__ == "__main__":
    debug_pdf_extraction("ocr.pdf")