29 lines
869 B
Python
29 lines
869 B
Python
import os

import fitz
|
|
|
|
def check_pdf_content(pdf_path):
    """Report how much extractable text a PDF contains.

    Opens the file via ``fitz.open`` and prints the page count, the file
    size in bytes, the extracted text length of each page, the total text
    length, and either the first 500 characters of the text or a notice
    that nothing could be extracted.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        True if any text was extracted; False if no text was found or if
        reading the file failed (the error is printed rather than raised).
    """
    try:
        doc = fitz.open(pdf_path)
        try:
            print(f"Total pages: {len(doc)}")
            # Ask the filesystem for the size instead of reading the whole
            # file into memory through a handle that is never closed.
            print(f"File size: {os.path.getsize(pdf_path)} bytes")

            page_texts = []
            for page_num, page in enumerate(doc, start=1):
                page_text = page.get_text()
                page_texts.append(page_text)
                print(f"Page {page_num} text length: {len(page_text)}")
        finally:
            # Release the document even when extraction fails part-way.
            doc.close()

        # Join once at the end rather than growing a string per page.
        text = "".join(page_texts)
        print(f"Total text length: {len(text)}")

        if text:
            print("First 500 characters:")
            print(text[:500])
        else:
            print("NO TEXT EXTRACTED - This is likely a scanned/image PDF")

        return bool(text)
    except Exception as e:
        # Deliberately broad: this is a diagnostic helper whose contract is
        # to report any failure and answer False instead of raising.
        print(f"Error reading PDF: {e}")
        return False
|
|
|
|
if __name__ == "__main__":
    # Script entry point: inspect the hard-coded file "ocr.pdf", resolved
    # relative to the current working directory.
    check_pdf_content("ocr.pdf")