Files
railseek6/debug_ocr_extraction.py

96 lines
3.5 KiB
Python

import os
import sys
import fitz # PyMuPDF
import tempfile
import paddleocr
from pathlib import Path
def debug_ocr_on_pdf(pdf_path):
    """Print a page-by-page trace of what PaddleOCR extracts from a PDF.

    Each page is rendered to a temporary PNG with PyMuPDF and passed to
    PaddleOCR. The raw OCR result, every recognised text fragment and its
    confidence are printed, followed by the combined text of all pages.
    Failures are reported on stdout instead of being raised.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        None. All output goes to stdout.
    """
    print(f"Testing OCR extraction on: {pdf_path}")
    file_exists = os.path.exists(pdf_path)
    print(f"File exists: {file_exists}")
    if not file_exists:
        # os.path.getsize() raises on a missing path, so report and stop
        # here rather than crash before any OCR diagnostics are printed.
        print(f"Error processing PDF: file not found: {pdf_path}")
        return
    print(f"File size: {os.path.getsize(pdf_path)} bytes")

    # Initialize PaddleOCR with current configuration
    try:
        ocr = paddleocr.PaddleOCR(use_angle_cls=True, lang='en')
        print("PaddleOCR initialized successfully")
    except Exception as e:
        print(f"Error initializing PaddleOCR: {e}")
        return

    # Open PDF and process each page
    try:
        doc = fitz.open(pdf_path)
        try:
            print(f"PDF has {len(doc)} pages")
            page_texts = []

            for page_num in range(len(doc)):
                print(f"\n--- Processing Page {page_num + 1} ---")
                page = doc.load_page(page_num)
                pix = page.get_pixmap()
                img_data = pix.tobytes("png")

                # delete=False: the file is closed when the with-block ends
                # and is then handed to the OCR call by path, so it must
                # outlive the handle; it is removed in the finally below.
                with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as temp_img:
                    temp_img.write(img_data)
                    temp_img_path = temp_img.name

                try:
                    # Run OCR on the temporary image
                    result = ocr.ocr(temp_img_path)
                    print(f"OCR result structure: {type(result)}")

                    if result:
                        # NOTE(review): ocr.ocr() on a single image appears to
                        # return one entry per image in PaddleOCR 2.x, so this
                        # count is likely images rather than text lines --
                        # confirm against the installed PaddleOCR version.
                        print(f"Number of lines detected: {len(result)}")
                        fragments = []

                        for line_num, line in enumerate(result):
                            print(f"Line {line_num}: {line}")
                            if not line:
                                # Empty/None entry: nothing recognised here.
                                continue
                            for word_info in line:
                                if len(word_info) < 2:
                                    continue
                                # word_info[1] is indexed as (text, confidence).
                                text = word_info[1][0]
                                confidence = word_info[1][1] if len(word_info[1]) > 1 else "N/A"
                                print(f" Word: '{text}' (confidence: {confidence})")
                                fragments.append(text)

                        page_text = " ".join(fragments).strip()
                        print(f"Extracted text from page {page_num + 1}: '{page_text}'")
                        page_texts.append(page_text)
                    else:
                        print(f"No OCR results for page {page_num + 1}")

                except Exception as e:
                    # A failure on one page must not abort the remaining pages.
                    print(f"Error during OCR processing page {page_num + 1}: {e}")
                finally:
                    # Clean up temp file (best effort; only filesystem errors
                    # are tolerated so Ctrl-C and real bugs still surface).
                    try:
                        os.unlink(temp_img_path)
                    except OSError as cleanup_error:
                        print(f"Warning: could not remove {temp_img_path}: {cleanup_error}")

            # One line per successfully processed page, each newline-terminated.
            all_extracted_text = "".join(f"{text}\n" for text in page_texts)
            print("\n--- FINAL EXTRACTED TEXT ---")
            print(all_extracted_text)
            print(f"Total characters extracted: {len(all_extracted_text)}")
        finally:
            # Always release the document, even when a page raised.
            doc.close()

    except Exception as e:
        print(f"Error processing PDF: {e}")
if __name__ == "__main__":
    # Test on the ocr.pdf file; when it is absent, fall back to the first PDF
    # in the working directory whose name contains "ocr".
    pdf_path = "ocr.pdf"

    if not os.path.exists(pdf_path):
        print(f"File {pdf_path} not found. Looking for similar files...")
        fallback = None
        # os.listdir() order is arbitrary; sorting makes the pick reproducible.
        for candidate in sorted(os.listdir(".")):
            if not candidate.lower().endswith(".pdf"):
                continue
            print(f"Found PDF: {candidate}")
            if "ocr" in candidate.lower():
                fallback = candidate
                break

        if fallback is None:
            # Nothing usable: stop with a clear message rather than running
            # the OCR debug on a path that does not exist.
            print(f"No PDF with 'ocr' in its name found in {os.getcwd()}")
            sys.exit(1)
        pdf_path = fallback

    debug_ocr_on_pdf(pdf_path)