28 lines
914 B
Python
28 lines
914 B
Python
import PyPDF2
|
|
import os
|
|
|
|
def check_pdf_content(pdf_path):
    """Print a per-page report of the text PyPDF2 extracts from a PDF.

    For every page the report shows the length of the extracted text, a
    ``repr`` preview of its first 100 characters, and whether the page
    holds any non-whitespace text.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        None. Everything is written to standard output; a missing file or
        a failure while opening/parsing is reported there, not raised.
    """
    if not os.path.exists(pdf_path):
        print(f"❌ {pdf_path} not found")
        return

    try:
        # The reader pulls page data from the open handle, so all page
        # access has to happen inside this ``with`` block.
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            num_pages = len(reader.pages)
            print(f"📄 {pdf_path} has {num_pages} pages")

            for page_number, page in enumerate(reader.pages, start=1):
                # Normalise a missing/None extraction result to "" so that
                # len() and slicing below cannot abort the whole report.
                text = page.extract_text() or ""
                print(f"Page {page_number}:")
                print(f" Text length: {len(text)}")
                print(f" Text preview: {repr(text[:100])}")
                if text.strip():
                    print(" ✅ Contains text")
                else:
                    print(" ❌ No text or only whitespace")
    except Exception as e:
        # Kept broad on purpose: this is a diagnostic report, so any failure
        # while opening or parsing the file is printed instead of raised.
        print(f"❌ Error reading PDF: {e}")
|
|
|
|
if __name__ == "__main__":
    # Local import: only the script entry point needs the argument list.
    import sys

    # Inspect every path given on the command line; with no arguments,
    # fall back to the original default file "ocr.pdf".
    for target in sys.argv[1:] or ["ocr.pdf"]:
        check_pdf_content(target)