Files
railseek6/analyze_ocr_pdf_table.py

75 lines
2.7 KiB
Python

import PyPDF2
import os
import fitz # PyMuPDF
from PIL import Image
import io
# Heuristic thresholds used by the analysis below.
_PREVIEW_CHARS = 200          # characters of extracted text echoed per page
_TABLE_TOKEN_THRESHOLD = 3    # a line with MORE whitespace-separated tokens than this looks tabular
_TABLE_LINE_THRESHOLD = 2     # MORE such lines than this on a page => "potential table"
_TABLE_SAMPLE_LINES = 3       # number of table-like lines echoed as a sample
_SMALL_FILE_BYTES = 1000      # below this size the file is flagged as possibly corrupted
_LARGE_FILE_BYTES = 10000000  # above this size the file is flagged as possibly complex


def _report_pypdf2_text(pdf_path) -> None:
    """Print the page count and per-page extracted-text statistics using PyPDF2."""
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        print(f" Pages: {len(reader.pages)}")
        for page_num, page in enumerate(reader.pages, start=1):
            # Coerce a missing result to "" so len()/strip() below cannot fail.
            text = page.extract_text() or ""
            print(f" Page {page_num} - PyPDF2 text length: {len(text)}")
            if text.strip():
                print(f" Text preview: {repr(text[:_PREVIEW_CHARS])}")
            else:
                print(" No text found with PyPDF2")


def _report_pymupdf_text(pdf_path) -> None:
    """Print per-page text, table-likeness and embedded-image statistics using PyMuPDF."""
    # The context manager closes the document even when a page fails to
    # process; a trailing doc.close() would be skipped on any exception.
    with fitz.open(pdf_path) as doc:
        print(" PyMuPDF analysis:")
        print(f" - Page count: {len(doc)}")
        for page_num, page in enumerate(doc, start=1):
            text = page.get_text() or ""
            print(f" Page {page_num} - PyMuPDF text length: {len(text)}")
            if text.strip():
                print(f" Text preview: {repr(text[:_PREVIEW_CHARS])}")
                # Rough table heuristic: lines made of several
                # whitespace-separated tokens are treated as table rows.
                table_like_lines = [
                    line for line in text.split('\n')
                    if len(line.split()) > _TABLE_TOKEN_THRESHOLD
                ]
                if len(table_like_lines) > _TABLE_LINE_THRESHOLD:
                    print(f" Potential table detected with {len(table_like_lines)} table-like lines")
                    print(" Sample table lines:")
                    for idx, line in enumerate(table_like_lines[:_TABLE_SAMPLE_LINES], start=1):
                        print(f" {idx}: {line}")
            else:
                print(" No text found with PyMuPDF")
            # Embedded images are reported for every page, with or without text.
            image_list = page.get_images()
            if image_list:
                print(f" Images found: {len(image_list)}")


def _report_file_size(pdf_path) -> None:
    """Print the file size in bytes and warn when it is unusually small or large."""
    file_size = os.path.getsize(pdf_path)
    print(f" File size: {file_size} bytes")
    if file_size < _SMALL_FILE_BYTES:
        print(" ⚠️ Very small file - might be corrupted")
    elif file_size > _LARGE_FILE_BYTES:
        print(" ⚠️ Very large file - might be complex")


def analyze_pdf_structure(pdf_path):
    """Print a diagnostic report about the PDF at ``pdf_path``.

    The report is written to stdout and consists of three independent passes:

    1. PyPDF2: page count plus, per page, the extracted text length and a
       short preview.
    2. PyMuPDF: the same text statistics, a heuristic check for table-like
       lines, and the number of images on each page.
    3. File size, with a warning for very small or very large files.

    Passes 1 and 2 are best-effort: any exception raised inside one of them is
    printed and the remaining passes still run.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        None. If ``pdf_path`` does not exist, a "not found" message is printed
        and nothing else is done.
    """
    if not os.path.exists(pdf_path):
        print(f"{pdf_path} not found")
        return
    print(f"📊 Analyzing {pdf_path} structure...")

    # Method 1: Try PyPDF2 for text extraction
    try:
        _report_pypdf2_text(pdf_path)
    except Exception as e:  # deliberate catch-all: this is a diagnostic tool
        print(f" PyPDF2 error: {e}")

    # Method 2: Try PyMuPDF for better table/text detection
    try:
        _report_pymupdf_text(pdf_path)
    except Exception as e:  # deliberate catch-all: this is a diagnostic tool
        print(f" PyMuPDF error: {e}")

    # Method 3: Check file properties
    _report_file_size(pdf_path)
if __name__ == "__main__":
    import sys

    # Analyze the PDF named as the first command-line argument. With no
    # argument, fall back to "ocr.pdf" in the current working directory,
    # which is what this script analyzed before it accepted arguments.
    analyze_pdf_structure(sys.argv[1] if len(sys.argv) > 1 else "ocr.pdf")