75 lines
2.7 KiB
Python
75 lines
2.7 KiB
Python
import PyPDF2
|
|
import os
|
|
import fitz # PyMuPDF
|
|
from PIL import Image
|
|
import io
|
|
|
|
def analyze_pdf_structure(pdf_path):
    """Print a diagnostic report about the structure of a PDF file.

    Three independent passes are run, each reporting to stdout:

    1. PyPDF2: page count and the length/preview of the text extracted
       from each page.
    2. PyMuPDF (``fitz``): page count, per-page text length/preview, a
       simple "table-like lines" heuristic, and the number of embedded
       images on each page.
    3. File size, with a warning for very small or very large files.

    The PyPDF2 and PyMuPDF passes are best-effort: any exception raised
    inside one of them is printed and the remaining passes still run.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        None. If ``pdf_path`` does not exist, a "not found" message is
        printed and nothing else is done.
    """
    if not os.path.exists(pdf_path):
        print(f"❌ {pdf_path} not found")
        return

    print(f"📊 Analyzing {pdf_path} structure...")

    # Method 1: Try PyPDF2 for text extraction
    try:
        _report_pypdf2_text(pdf_path)
    except Exception as e:  # best-effort: report and continue with the next pass
        print(f" PyPDF2 error: {e}")

    # Method 2: Try PyMuPDF for better table/text detection
    try:
        _report_pymupdf_text(pdf_path)
    except Exception as e:  # best-effort: report and continue with the next pass
        print(f" PyMuPDF error: {e}")

    # Method 3: Check file properties
    _report_file_size(pdf_path)


def _report_pypdf2_text(pdf_path):
    """Print the page count and per-page extracted-text summary via PyPDF2."""
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        print(f" Pages: {len(reader.pages)}")

        for page_number, page in enumerate(reader.pages, start=1):
            # Treat a missing extraction result as empty text so that one
            # page cannot abort the report for the remaining pages.
            text = page.extract_text() or ""
            print(f" Page {page_number} - PyPDF2 text length: {len(text)}")
            if text.strip():
                print(f" Text preview: {repr(text[:200])}")
            else:
                print(" No text found with PyPDF2")


def _report_pymupdf_text(pdf_path):
    """Print per-page text, table-heuristic and image summary via PyMuPDF."""
    # The context manager closes the document even when a page raises,
    # which a trailing doc.close() call would not do.
    with fitz.open(pdf_path) as doc:
        print(" PyMuPDF analysis:")
        print(f" - Page count: {len(doc)}")

        for page_number, page in enumerate(doc, start=1):
            text = page.get_text()
            print(f" Page {page_number} - PyMuPDF text length: {len(text)}")

            if text.strip():
                print(f" Text preview: {repr(text[:200])}")
                _report_table_like_lines(text)
            else:
                print(" No text found with PyMuPDF")

            # Check if page has images (scanned document)
            image_list = page.get_images()
            if image_list:
                print(f" Images found: {len(image_list)}")


def _report_table_like_lines(text):
    """Print a hint when several lines of ``text`` look like table rows."""
    # Heuristic: a line with more than 3 whitespace-separated tokens is
    # "table-like"; more than 2 such lines counts as a potential table.
    table_like_lines = [
        line for line in text.split('\n') if len(line.split()) > 3
    ]
    if len(table_like_lines) > 2:
        print(f" Potential table detected with {len(table_like_lines)} table-like lines")
        print(" Sample table lines:")
        for index, line in enumerate(table_like_lines[:3], start=1):
            print(f" {index}: {line}")


def _report_file_size(pdf_path):
    """Print the file size in bytes and warn about suspicious sizes."""
    small_file_bytes = 1000
    large_file_bytes = 10000000

    file_size = os.path.getsize(pdf_path)
    print(f" File size: {file_size} bytes")

    if file_size < small_file_bytes:
        print(" ⚠️ Very small file - might be corrupted")
    elif file_size > large_file_bytes:
        print(" ⚠️ Very large file - might be complex")
|
|
|
|
if __name__ == "__main__":
    import sys

    # An optional first command-line argument selects the PDF to analyze;
    # with no argument the script inspects "ocr.pdf" in the working directory.
    analyze_pdf_structure(sys.argv[1] if len(sys.argv) > 1 else "ocr.pdf")