import os

import fitz


def check_pdf_content(pdf_path: str) -> bool:
    """Report whether a PDF contains an extractable text layer.

    Opens the PDF with PyMuPDF, prints the page count, the file size in
    bytes, the extracted text length of every page and the total text
    length. If any text was extracted, the first 500 characters are printed
    as a preview; otherwise a hint is printed that the PDF is probably a
    scanned/image document.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        True if at least one character of text was extracted, False if no
        text was found or if any error occurred while reading the file.
    """
    try:
        # The context manager closes the document even when extraction
        # fails part-way through.
        with fitz.open(pdf_path) as doc:
            print(f"Total pages: {len(doc)}")
            # Ask the filesystem for the size instead of reading the whole
            # file into memory (and leaking the handle) just to count bytes.
            print(f"File size: {os.path.getsize(pdf_path)} bytes")

            page_texts = []
            for page_num, page in enumerate(doc, start=1):
                page_text = page.get_text()
                page_texts.append(page_text)
                print(f"Page {page_num} text length: {len(page_text)}")

        # Join once at the end rather than growing a string page by page.
        text = "".join(page_texts)
        print(f"Total text length: {len(text)}")

        if text:
            print("First 500 characters:")
            print(text[:500])
        else:
            print("NO TEXT EXTRACTED - This is likely a scanned/image PDF")

        return bool(text)
    except Exception as e:
        # Script-level boundary: any failure is reported and mapped to
        # False so callers always get a boolean.
        print(f"Error reading PDF: {e}")
        return False


if __name__ == "__main__":
    check_pdf_content('ocr.pdf')