# railseek6/debug_ocr_pdf_extraction.py

import fitz  # PyMuPDF
import os
import json
from PIL import Image
import io

def _report_text_extraction(page):
    """Print what each PyMuPDF text-extraction mode returns for ``page``.

    Args:
        page: A PyMuPDF page object.

    Returns:
        True when the default ``get_text()`` output contains any
        non-whitespace characters, False otherwise.
    """
    print("  Text extraction methods:")

    # Method 1: default (plain text) extraction.
    text_default = page.get_text()
    has_text = bool(text_default.strip())
    print(f"    Default: {len(text_default)} chars")
    if has_text:
        print(f"      Content: {repr(text_default[:200])}")
    else:
        print("      No text found")

    # Method 2: structured extraction ("words" and "blocks" modes).
    text_words = page.get_text("words")
    print(f"    Words: {len(text_words)} words found")
    if text_words:
        print(f"      Sample words: {text_words[:5]}")

    text_blocks = page.get_text("blocks")
    print(f"    Blocks: {len(text_blocks)} blocks found")
    # Only the first three blocks are shown; index 4 of a block tuple is
    # printed as its content, truncated to 100 characters.
    for block_number, block in enumerate(text_blocks[:3], start=1):
        print(f"      Block {block_number}: {repr(block[4][:100])}")

    return has_text


def _report_page_render(page, page_number):
    """Render ``page`` to a PNG, save it, and print how white the image is.

    The PNG is written to ``ocr_page<page_number>_rendered.png`` in the
    current working directory. Any failure (rendering, file write, missing
    NumPy, ...) is reported on stdout and swallowed so that the remaining
    pages are still analysed.

    Args:
        page: A PyMuPDF page object.
        page_number: 1-based page number, used in the output file name.
    """
    print("  Rendering page as image...")
    try:
        zoom = fitz.Matrix(2, 2)  # 2x scale in both directions
        img_data = page.get_pixmap(matrix=zoom).tobytes("png")
        print(f"    Rendered image size: {len(img_data)} bytes")

        img_filename = f"ocr_page{page_number}_rendered.png"
        with open(img_filename, "wb") as f:
            f.write(img_data)
        print(f"    Saved rendered image: {img_filename}")

        pil_img = Image.open(io.BytesIO(img_data))
        print(f"    Image size: {pil_img.size}, mode: {pil_img.mode}")

        # The blank-page heuristic is only applied to RGB renders.
        if pil_img.mode == 'RGB':
            # Imported here so that a missing NumPy is reported by the
            # handler below instead of aborting the whole analysis.
            import numpy as np

            img_array = np.array(pil_img.convert('L'))
            # Grayscale values above 240 are counted as "white".
            white_pixels = np.sum(img_array > 240)
            white_percentage = (white_pixels / img_array.size) * 100
            print(f"    White pixels: {white_percentage:.1f}%")

            if white_percentage > 95:
                print("    ⚠️  Image appears mostly white - may be blank or corrupted")
            else:
                print("    ✅ Image has significant content")

    except Exception as e:
        print(f"    Error rendering image: {e}")


def debug_pdf_extraction(pdf_path):
    """Print a diagnostic report on what can be extracted from a PDF.

    For every page this prints the result of several text-extraction modes,
    renders the page to ``ocr_page<N>_rendered.png`` in the current working
    directory, and reports how much of the render is white. A summary based
    on what was actually found (pages with text, embedded image count) is
    printed at the end.

    Args:
        pdf_path: Path of the PDF file to analyse.

    Returns:
        None. All results are written to stdout. A missing file or any
        error while analysing the PDF is reported on stdout rather than
        raised.
    """
    if not os.path.exists(pdf_path):
        print(f"❌ {pdf_path} not found")
        return

    print(f"🔍 Debugging {pdf_path} extraction...")

    try:
        pages_with_text = 0
        embedded_images = 0

        # The context manager closes the document even if a page fails.
        with fitz.open(pdf_path) as doc:
            page_count = len(doc)
            print(f"  Pages: {page_count}")

            for page_number, page in enumerate(doc, start=1):
                print(f"\n  Page {page_number}:")

                if _report_text_extraction(page):
                    pages_with_text += 1
                embedded_images += len(page.get_images(full=True))

                _report_page_render(page, page_number)

        print(f"\n📊 Summary for {pdf_path}:")
        print(f"  File size: {os.path.getsize(pdf_path)} bytes")

        if pages_with_text:
            print(f"  Embedded text found on {pages_with_text} of {page_count} page(s)")
        else:
            print("  No embedded text found")

        if embedded_images:
            print(f"  Embedded images found: {embedded_images}")
        else:
            print("  No embedded images found")

        if pages_with_text:
            print("  This PDF contains extractable text")
        else:
            print("  This appears to be a scanned PDF or corrupted file")

    except Exception as e:
        print(f"❌ Error analyzing PDF: {e}")

if __name__ == "__main__":
    import sys

    # Usage: python debug_ocr_pdf_extraction.py [PDF_PATH]
    # Falls back to "ocr.pdf" in the current directory when no path is given.
    debug_pdf_extraction(sys.argv[1] if len(sys.argv) > 1 else "ocr.pdf")