75 lines
2.5 KiB
Python
75 lines
2.5 KiB
Python
import fitz # PyMuPDF
|
|
import sys
|
|
|
|
def analyze_pdf_fonts(pdf_path):
    """Print a font / text-extraction diagnostic report for a PDF's first page.

    The report is written to stdout and covers, in order:

    1. the keys of the ``page.get_text("dict")`` result and a short sample of
       its text blocks, lines and spans (font, flags, size, text);
    2. the plain-text and HTML extraction results, including the font
       families named in the HTML;
    3. the page's content stream xref, the keys of the page object, and for
       every font used by the page whether it carries a ``/ToUnicode`` entry.

    Only page 0 is inspected.

    Args:
        pdf_path: Path of the PDF file, passed unchanged to ``fitz.open``.

    Returns:
        None. All results are printed.
    """
    # The context manager closes the document even if a step below raises.
    with fitz.open(pdf_path) as doc:
        if doc.page_count == 0:
            print("Document has no pages")
            return
        page = doc[0]

        # Get page text with flags
        page_dict = page.get_text("dict")
        print("Page structure keys:", page_dict.keys())

        _print_block_summary(page_dict.get("blocks", []))
        _print_extraction_methods(page)
        _print_font_resources(doc, page)


def _print_block_summary(blocks):
    """Print font details for a small sample of the given text-dict blocks."""
    print(f"Number of blocks: {len(blocks)}")

    # Sampling limits (5 blocks, 2 lines, 2 spans) keep the output short.
    for i, block in enumerate(blocks[:5]):
        print(f"\nBlock {i}: type={block.get('type')}")
        if block.get('type') != 0:  # only type 0 (text) blocks have lines
            continue
        lines = block.get('lines', [])
        print(f" Lines: {len(lines)}")
        for line in lines[:2]:
            spans = line.get('spans', [])
            print(f" Spans: {len(spans)}")
            for span in spans[:2]:
                font = span.get('font', '')
                flags = span.get('flags', 0)
                size = span.get('size', 0)
                span_text = span.get('text', '')
                print(f" Font: {font[:30]}...")
                print(f" Flags: {flags}")
                print(f" Size: {size}")
                print(f" Text sample: {repr(span_text[:50])}")


def _print_extraction_methods(page):
    """Print the results of plain-text and HTML extraction for the page."""
    import re  # local import: only this helper needs it

    print("\n--- Text extraction methods ---")

    # 1. Raw text
    raw = page.get_text()
    print(f"Raw text length: {len(raw)}")
    print(f"First 200 chars: {repr(raw[:200])}")

    # 2. HTML output (includes font info)
    html = page.get_text("html")
    print(f"HTML length: {len(html)}")
    font_families = re.findall(r'font-family:([^;]+);', html)
    if font_families:
        print(f"Font families used: {set(font_families)}")


def _print_font_resources(doc, page):
    """Print the page's content xref, object keys and per-font ToUnicode status."""
    # 3. Check if there's a ToUnicode map
    contents = page.get_contents()
    # A page may have no content stream at all; avoid indexing an empty list.
    first_xref = contents[0] if contents else None
    print(f"Content stream xref: {first_xref}")

    # Keys of the page's own PDF dictionary.
    print(f"Page object keys: {doc.xref_get_keys(page.xref)}")

    # Each get_fonts() entry starts with the font's xref; index 4 is the
    # name under which the page's content refers to the font.
    fonts = page.get_fonts()
    print(f"Number of fonts: {len(fonts)}")
    for entry in fonts:
        font_xref, font_name = entry[0], entry[4]
        if font_xref <= 0:
            # NOTE(review): assumes a non-positive xref means the font dict is
            # a direct object that cannot be looked up by xref - confirm.
            print(f" Font {font_name}: <no xref>")
            continue
        print(f" Font {font_name}: {doc.xref_object(font_xref, compressed=True)}")
        # xref_get_key reports type "null" when the key is absent.
        key_type, _value = doc.xref_get_key(font_xref, "ToUnicode")
        if key_type != "null":
            print(" Has ToUnicode map")
        else:
            print(" No ToUnicode map")
|
|
|
|
if __name__ == "__main__":
    # Script entry point: analyze the PDF named on the command line, or fall
    # back to the bundled sample document when no argument is given.
    pdf_path = sys.argv[1] if len(sys.argv) > 1 else "test/safedistance.pdf"
    analyze_pdf_fonts(pdf_path)