"""Diagnostic script that prints a structural summary of a PDF file.

The file is inspected three ways: text extraction with PyPDF2, text/table/
image inspection with PyMuPDF, and a simple file-size sanity check.
"""

import io
import os

import fitz  # PyMuPDF
import PyPDF2
from PIL import Image

# Number of characters of extracted text shown in each preview.
_PREVIEW_CHARS = 200
# A line counts as "table-like" when it has more than this many
# whitespace-separated fields.
_TABLE_MIN_FIELDS = 3
# A page is reported as containing a potential table when it has more than
# this many table-like lines.
_TABLE_MIN_LINES = 2
# How many table-like lines are echoed as a sample.
_TABLE_SAMPLE_LINES = 3
# File-size thresholds (bytes) that trigger a warning.
_SMALL_FILE_BYTES = 1000
_LARGE_FILE_BYTES = 10000000


def _report_pypdf2_text(pdf_path):
    """Print the page count and per-page extracted text length via PyPDF2."""
    # Best-effort: any PyPDF2 failure is reported and analysis continues.
    try:
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            num_pages = len(reader.pages)
            print(f" Pages: {num_pages}")
            for page_no, page in enumerate(reader.pages, start=1):
                text = page.extract_text()
                print(f" Page {page_no} - PyPDF2 text length: {len(text)}")
                if text.strip():
                    print(f" Text preview: {text[:_PREVIEW_CHARS]!r}")
                else:
                    print(" No text found with PyPDF2")
    except Exception as e:
        print(f" PyPDF2 error: {e}")


def _report_table_candidates(text):
    """Print a table hint when enough lines of *text* have many fields."""
    table_like_lines = [
        line for line in text.split('\n')
        if len(line.split()) > _TABLE_MIN_FIELDS
    ]
    if len(table_like_lines) > _TABLE_MIN_LINES:
        print(
            f" Potential table detected with {len(table_like_lines)} "
            f"table-like lines"
        )
        print(" Sample table lines:")
        samples = table_like_lines[:_TABLE_SAMPLE_LINES]
        for idx, line in enumerate(samples, start=1):
            print(f" {idx}: {line}")


def _report_pymupdf_layout(pdf_path):
    """Print per-page text, table hints and image counts via PyMuPDF."""
    # Best-effort: any PyMuPDF failure is reported and analysis continues.
    try:
        doc = fitz.open(pdf_path)
        try:
            print(" PyMuPDF analysis:")
            print(f" - Page count: {len(doc)}")
            for page_no, page in enumerate(doc, start=1):
                text = page.get_text()
                print(f" Page {page_no} - PyMuPDF text length: {len(text)}")
                if text.strip():
                    print(f" Text preview: {text[:_PREVIEW_CHARS]!r}")
                    # Check for tables by looking for tabular patterns.
                    _report_table_candidates(text)
                else:
                    print(" No text found with PyMuPDF")

                # Check if page has images (scanned document).
                image_list = page.get_images()
                if image_list:
                    print(f" Images found: {len(image_list)}")
        finally:
            # Close the document even when a page fails to parse, so the
            # underlying file handle is not leaked on the error path.
            doc.close()
    except Exception as e:
        print(f" PyMuPDF error: {e}")


def _report_file_size(pdf_path):
    """Print the file size and warn when it is unusually small or large."""
    file_size = os.path.getsize(pdf_path)
    print(f" File size: {file_size} bytes")
    if file_size < _SMALL_FILE_BYTES:
        print(" ⚠️ Very small file - might be corrupted")
    elif file_size > _LARGE_FILE_BYTES:
        print(" ⚠️ Very large file - might be complex")


def analyze_pdf_structure(pdf_path):
    """Print a structural analysis of the PDF at *pdf_path* to stdout.

    Runs three independent checks in order: PyPDF2 text extraction,
    PyMuPDF text/table/image inspection, and a file-size sanity check.
    Errors raised by either PDF library are caught and printed so the
    remaining checks still run.

    Args:
        pdf_path: Path to the PDF file to analyze.

    Returns:
        None. If the path does not exist, a message is printed and the
        function returns without running any check.
    """
    if not os.path.exists(pdf_path):
        print(f"❌ {pdf_path} not found")
        return

    print(f"📊 Analyzing {pdf_path} structure...")

    # Method 1: Try PyPDF2 for text extraction.
    _report_pypdf2_text(pdf_path)

    # Method 2: Try PyMuPDF for better table/text detection.
    _report_pymupdf_layout(pdf_path)

    # Method 3: Check file properties.
    _report_file_size(pdf_path)


if __name__ == "__main__":
    analyze_pdf_structure("ocr.pdf")