Files
railseek6/debug_ocr_pdf_extraction.py

96 lines
3.8 KiB
Python

import fitz # PyMuPDF
import os
import json
from PIL import Image
import io
def debug_pdf_extraction(pdf_path):
    """Print a diagnostic report on what can be extracted from a PDF.

    For every page the report shows the result of three text-extraction
    modes (plain text, words, blocks), the number of embedded images, and a
    2x-zoom rendering that is saved to the current directory as
    ``ocr_page<N>_rendered.png`` and checked for being mostly white.
    A closing summary states whether any embedded text or images were
    actually found.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        None. All results are printed; a missing file or an error while
        analyzing the PDF is reported on stdout rather than raised.
    """
    if not os.path.exists(pdf_path):
        print(f"{pdf_path} not found")
        return
    print(f"🔍 Debugging {pdf_path} extraction...")

    total_text_chars = 0
    total_images = 0
    try:
        # The context manager closes the document even if a page fails.
        with fitz.open(pdf_path) as doc:
            print(f" Pages: {len(doc)}")
            for page_num, page in enumerate(doc, start=1):
                print(f"\n Page {page_num}:")
                total_text_chars += _report_page_text(page)

                # Image references are summed per page, so an image placed
                # on several pages is counted once for each page.
                page_images = page.get_images()
                print(f" Embedded images: {len(page_images)}")
                total_images += len(page_images)

                _render_and_inspect_page(page, page_num)

        print(f"\n📊 Summary for {pdf_path}:")
        print(f" File size: {os.path.getsize(pdf_path)} bytes")
        # Report what was really observed instead of a fixed verdict.
        if total_text_chars:
            print(f" Embedded text found: {total_text_chars} chars")
        else:
            print(" No embedded text found")
        if total_images:
            print(f" Embedded images found: {total_images}")
        else:
            print(" No embedded images found")
        if total_text_chars:
            print(" This PDF contains extractable text")
        else:
            print(" This appears to be a scanned PDF or corrupted file")
    except Exception as e:
        # Top-level boundary of a debug tool: report and carry on.
        print(f"❌ Error analyzing PDF: {e}")


def _report_page_text(page):
    """Print the text-extraction results for one page.

    Returns the number of characters of plain text on the page after
    stripping surrounding whitespace (0 when the page has no text).
    """
    print(" Text extraction methods:")

    # Method 1: default plain-text extraction.
    text_default = page.get_text()
    print(f" Default: {len(text_default)} chars")
    if text_default.strip():
        print(f" Content: {repr(text_default[:200])}")
    else:
        print(" No text found")

    # Method 2: word-level and block-level extraction.
    text_words = page.get_text("words")
    print(f" Words: {len(text_words)} words found")
    if text_words:
        print(f" Sample words: {text_words[:5]}")

    text_blocks = page.get_text("blocks")
    print(f" Blocks: {len(text_blocks)} blocks found")
    for i, block in enumerate(text_blocks[:3], start=1):
        # Index 4 of a block tuple holds the block's text.
        print(f" Block {i}: {repr(block[4][:100])}")

    return len(text_default.strip())


def _render_and_inspect_page(page, page_num, white_threshold=240,
                             blank_percentage=95):
    """Render one page to PNG, save it, and report how white it is.

    Any failure (rendering, writing the file, missing numpy, ...) is
    printed and swallowed so that the remaining pages are still analyzed.
    """
    print(" Rendering page as image...")
    try:
        mat = fitz.Matrix(2, 2)  # 2x zoom for a higher-resolution render
        pix = page.get_pixmap(matrix=mat)
        img_data = pix.tobytes("png")
        print(f" Rendered image size: {len(img_data)} bytes")

        # Save the rendered image for manual inspection.
        img_filename = f"ocr_page{page_num}_rendered.png"
        with open(img_filename, "wb") as f:
            f.write(img_data)
        print(f" Saved rendered image: {img_filename}")

        pil_img = Image.open(io.BytesIO(img_data))
        print(f" Image size: {pil_img.size}, mode: {pil_img.mode}")

        # Estimate whether the page is blank from the share of near-white
        # pixels in a grayscale copy.
        if pil_img.mode == 'RGB':
            import numpy as np  # local: only needed for this analysis

            img_array = np.array(pil_img.convert('L'))
            white_pixels = np.sum(img_array > white_threshold)
            white_percentage = (white_pixels / img_array.size) * 100
            print(f" White pixels: {white_percentage:.1f}%")
            if white_percentage > blank_percentage:
                print(" ⚠️ Image appears mostly white - may be blank or corrupted")
            else:
                print(" ✅ Image has significant content")
    except Exception as e:
        print(f" Error rendering image: {e}")
if __name__ == "__main__":
    # Allow the PDF to be chosen on the command line; with no argument the
    # script still analyzes "ocr.pdf" in the current directory.
    import sys

    debug_pdf_extraction(sys.argv[1] if len(sys.argv) > 1 else "ocr.pdf")