Files
railseek6/check_pdf_content.py

29 lines
869 B
Python

import fitz
def check_pdf_content(pdf_path):
    """Report how much extractable text a PDF contains.

    Prints the page count, the file size in bytes, the length of the text
    extracted from each page, the total text length and, when any text was
    found, its first 500 characters.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        True if at least one character of text was extracted, False if no
        text was extracted or if any error occurred while reading the file
        (the error is printed, not raised).
    """
    # Local import keeps this function self-contained without touching the
    # file-level import block.
    import os

    try:
        # The context manager closes the document even if extraction fails
        # part-way through.
        with fitz.open(pdf_path) as doc:
            print(f"Total pages: {len(doc)}")
            # Ask the filesystem for the size instead of reading the whole
            # file into memory through an unclosed handle.
            print(f"File size: {os.path.getsize(pdf_path)} bytes")

            page_texts = []
            for page_num, page in enumerate(doc):
                page_text = page.get_text()
                page_texts.append(page_text)
                print(f"Page {page_num + 1} text length: {len(page_text)}")

        text = "".join(page_texts)
        print(f"Total text length: {len(text)}")
        if text:
            print("First 500 characters:")
            print(text[:500])
        else:
            print("NO TEXT EXTRACTED - This is likely a scanned/image PDF")
        return bool(text)
    except Exception as e:
        # Best-effort diagnostic: report the problem and signal "no text".
        print(f"Error reading PDF: {e}")
        return False
if __name__ == "__main__":
    import sys

    # Inspect the PDF named on the command line; fall back to 'ocr.pdf'
    # so running the script with no arguments behaves as before.
    target_pdf = sys.argv[1] if len(sys.argv) > 1 else 'ocr.pdf'
    check_pdf_content(target_pdf)