96 lines
3.8 KiB
Python
96 lines
3.8 KiB
Python
import fitz # PyMuPDF
|
|
import os
|
|
import json
|
|
from PIL import Image
|
|
import io
|
|
|
|
# Grayscale levels strictly above this value are counted as "white".
_WHITE_LEVEL_THRESHOLD = 240
# A rendered page with more than this share of white pixels is flagged as blank.
_BLANK_PAGE_WHITE_PERCENT = 95


def _white_percentage(gray_img):
    """Return the percentage of pixels in an 'L'-mode image above the white threshold."""
    histogram = gray_img.histogram()
    total_pixels = sum(histogram)
    if not total_pixels:
        # A zero-area image has nothing but "blank" to report.
        return 100.0
    white_pixels = sum(histogram[_WHITE_LEVEL_THRESHOLD + 1:])
    return (white_pixels / total_pixels) * 100


def _report_text_extraction(page):
    """Print what each text extraction mode finds on ``page``; return True if it has text."""
    print(" Text extraction methods:")

    # Method 1: default (plain text) extraction
    text_default = page.get_text()
    print(f" Default: {len(text_default)} chars")
    has_text = bool(text_default.strip())
    if has_text:
        print(f" Content: {repr(text_default[:200])}")
    else:
        print(" No text found")

    # Method 2: word-level and block-level extraction
    text_words = page.get_text("words")
    print(f" Words: {len(text_words)} words found")
    if text_words:
        print(f" Sample words: {text_words[:5]}")

    text_blocks = page.get_text("blocks")
    print(f" Blocks: {len(text_blocks)} blocks found")
    for block_number, block in enumerate(text_blocks[:3], start=1):
        # Index 4 of a block tuple is the value the original code previewed.
        print(f" Block {block_number}: {repr(block[4][:100])}")

    return has_text


def _render_and_inspect_page(page, page_number):
    """Render ``page`` to a PNG file in the working directory and report how blank it looks."""
    print(" Rendering page as image...")
    try:
        mat = fitz.Matrix(2, 2)  # 2x zoom on both axes for a higher resolution
        pix = page.get_pixmap(matrix=mat)
        img_data = pix.tobytes("png")
        print(f" Rendered image size: {len(img_data)} bytes")

        # Save the rendered image so it can be inspected by eye
        img_filename = f"ocr_page{page_number}_rendered.png"
        with open(img_filename, "wb") as f:
            f.write(img_data)
        print(f" Saved rendered image: {img_filename}")

        # Check if the image has content by analyzing pixel data
        pil_img = Image.open(io.BytesIO(img_data))
        print(f" Image size: {pil_img.size}, mode: {pil_img.mode}")

        if pil_img.mode == 'RGB':
            # Convert to grayscale so a single brightness threshold applies
            white_percentage = _white_percentage(pil_img.convert('L'))
            print(f" White pixels: {white_percentage:.1f}%")

            if white_percentage > _BLANK_PAGE_WHITE_PERCENT:
                print(" ⚠️ Image appears mostly white - may be blank or corrupted")
            else:
                print(" ✅ Image has significant content")
    except Exception as e:
        # Best effort: a rendering failure on one page must not stop the
        # analysis of the remaining pages.
        print(f" Error rendering image: {e}")


def debug_pdf_extraction(pdf_path):
    """Print a diagnostic report about what can be extracted from a PDF.

    For every page this prints the results of plain, word-level and
    block-level text extraction, renders the page to
    ``ocr_page<N>_rendered.png`` (relative to the current working directory)
    and reports the share of near-white pixels in the rendering. A summary
    based on what was actually found is printed at the end.

    Args:
        pdf_path: Path of the PDF file to analyze.

    Returns:
        None. All results are printed. A missing file or a failure while
        analyzing the document is reported on stdout instead of raising.
    """
    if not os.path.exists(pdf_path):
        print(f"❌ {pdf_path} not found")
        return

    print(f"🔍 Debugging {pdf_path} extraction...")

    try:
        doc = fitz.open(pdf_path)
        try:
            page_count = len(doc)
            print(f" Pages: {page_count}")

            pages_with_text = 0
            embedded_images = 0
            for page_number, page in enumerate(doc, start=1):
                print(f"\n Page {page_number}:")

                # Try different text extraction methods
                if _report_text_extraction(page):
                    pages_with_text += 1
                embedded_images += len(page.get_images())

                # Check if it's a scanned document by rendering to image
                _render_and_inspect_page(page, page_number)
        finally:
            # Close the document even when a page fails part-way through.
            doc.close()

        print(f"\n📊 Summary for {pdf_path}:")
        print(f" File size: {os.path.getsize(pdf_path)} bytes")
        if pages_with_text:
            print(f" Embedded text found on {pages_with_text} of {page_count} page(s)")
        else:
            print(" No embedded text found")
        if embedded_images:
            print(f" Embedded images found: {embedded_images}")
        else:
            print(" No embedded images found")
        if not pages_with_text:
            print(" This appears to be a scanned PDF or corrupted file")

    except Exception as e:
        print(f"❌ Error analyzing PDF: {e}")
|
|
|
|
if __name__ == "__main__":
    import sys

    # Analyze the PDF named as the first command-line argument; fall back to
    # "ocr.pdf" when no argument is given.
    debug_pdf_extraction(sys.argv[1] if len(sys.argv) > 1 else "ocr.pdf")