# Files
# railseek6/analyze_pdf_fonts.py
# 2026-01-13 18:25:49 +08:00
#
# 75 lines
# 2.5 KiB
# Python

import fitz # PyMuPDF
import sys
def _print_text_block_samples(blocks, max_blocks=5, max_lines=2, max_spans=2):
    """Print font details for a small sample of spans from the leading blocks.

    Args:
        blocks: The "blocks" list from ``page.get_text("dict")``.
        max_blocks: How many leading blocks to inspect.
        max_lines: How many lines to inspect per text block.
        max_spans: How many spans to inspect per line.
    """
    for index, block in enumerate(blocks[:max_blocks]):
        print(f"\nBlock {index}: type={block.get('type')}")
        if block.get('type') != 0:  # only type 0 (text block) has lines/spans
            continue
        lines = block.get('lines', [])
        print(f" Lines: {len(lines)}")
        for line in lines[:max_lines]:
            spans = line.get('spans', [])
            print(f" Spans: {len(spans)}")
            for span in spans[:max_spans]:
                font = span.get('font', '')
                flags = span.get('flags', 0)
                size = span.get('size', 0)
                span_text = span.get('text', '')
                print(f" Font: {font[:30]}...")
                print(f" Flags: {flags}")
                print(f" Size: {size}")
                print(f" Text sample: {repr(span_text[:50])}")


def _print_font_resources(doc, page):
    """Print the page's font resources and whether each has a ToUnicode map.

    Uses the documented xref-level PyMuPDF accessors instead of private
    page-object helpers.
    """
    print(f"Page object keys: {doc.xref_get_keys(page.xref)}")
    # Each entry is (xref, ext, type, basefont, name, encoding).
    page_fonts = doc.get_page_fonts(page.number)
    print(f"Number of fonts: {len(page_fonts)}")
    for entry in page_fonts:
        font_xref, font_name = entry[0], entry[4]
        # Guard against entries without a usable xref so one odd font does
        # not abort the whole report.
        has_xref = font_xref > 0
        font_obj = doc.xref_object(font_xref, compressed=True) if has_xref else ""
        print(f" Font {font_name}: {font_obj}")
        # xref_get_key returns a (type, value) pair; type is "null" when the
        # key is absent from the font dictionary.
        if has_xref and doc.xref_get_key(font_xref, "ToUnicode")[0] != "null":
            print(" Has ToUnicode map")
        else:
            print(" No ToUnicode map")


def analyze_pdf_fonts(pdf_path):
    """Print a diagnostic report about the fonts on the first page of a PDF.

    The report covers a sample of span-level font attributes, the plain-text
    and HTML extractions, the first content stream xref, and each font
    resource together with whether it carries a ToUnicode map.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        None. All results are written to standard output.

    Errors raised by ``fitz.open`` (e.g. for a missing or unreadable file)
    are not caught here and propagate to the caller.
    """
    import re  # local import: the module top does not import re

    # The context manager closes the document even if extraction fails.
    with fitz.open(pdf_path) as doc:
        if doc.page_count == 0:
            print("Document has no pages")
            return
        page = doc[0]

        # Get page text with flags
        page_dict = page.get_text("dict")
        print("Page structure keys:", page_dict.keys())
        blocks = page_dict.get("blocks", [])
        print(f"Number of blocks: {len(blocks)}")
        _print_text_block_samples(blocks)

        print("\n--- Text extraction methods ---")
        # 1. Raw text
        raw = page.get_text()
        print(f"Raw text length: {len(raw)}")
        print(f"First 200 chars: {repr(raw[:200])}")

        # 2. HTML output (includes font info)
        html = page.get_text("html")
        print(f"HTML length: {len(html)}")
        font_families = re.findall(r'font-family:([^;]+);', html)
        if font_families:
            print(f"Font families used: {set(font_families)}")

        # 3. Check if there's a ToUnicode map
        content_xrefs = page.get_contents()
        if content_xrefs:
            print(f"Content stream xref: {content_xrefs[0]}")
        else:
            # A page may have no content stream at all; report it rather
            # than failing on the index.
            print("Content stream xref: none")
        _print_font_resources(doc, page)
if __name__ == "__main__":
    # Take the PDF path from the first command-line argument when one is
    # given; otherwise keep the previously hard-coded default path.
    pdf_path = sys.argv[1] if len(sys.argv) > 1 else "test/safedistance.pdf"
    analyze_pdf_fonts(pdf_path)