# Source listing: railseek6/check_ocr_pdf_content.py
# (Python; 28 lines, 914 B as originally listed)

import PyPDF2
import os
def check_pdf_content(pdf_path) -> None:
    """Print a per-page report of the text PyPDF2 extracts from a PDF.

    For every page the report shows the length of the extracted text, a
    preview of its first 100 characters, and whether the page contains any
    non-whitespace text.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        None. All results, including errors, are reported via ``print``;
        no exception from opening or reading the PDF propagates to the
        caller.
    """
    if not os.path.exists(pdf_path):
        print(f"{pdf_path} not found")
        return

    try:
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            num_pages = len(reader.pages)
            print(f"📄 {pdf_path} has {num_pages} pages")

            for page_number, page in enumerate(reader.pages, start=1):
                print(f"Page {page_number}:")
                try:
                    # Defensive: treat a missing (None) result as empty text
                    # so len() and slicing below cannot fail.
                    text = page.extract_text() or ""
                except Exception as e:
                    # One unreadable page must not hide the results for the
                    # pages that follow it.
                    print(f" ❌ Error extracting text: {e}")
                    continue

                print(f" Text length: {len(text)}")
                print(f" Text preview: {repr(text[:100])}")
                if text.strip():
                    print(" ✅ Contains text")
                else:
                    print(" ❌ No text or only whitespace")
    except Exception as e:
        # Broad catch kept from the original: this is a diagnostic script
        # that reports any open/parse failure instead of crashing.
        print(f"❌ Error reading PDF: {e}")
if __name__ == "__main__":
    import sys

    # Inspect the PDF named by the first command-line argument; when no
    # argument is given, fall back to "ocr.pdf" in the working directory.
    check_pdf_content(sys.argv[1] if len(sys.argv) > 1 else "ocr.pdf")