import re
import sys

import fitz  # PyMuPDF


def _print_span_samples(page_dict, max_blocks=5, max_lines=2, max_spans=2):
    """Print font, flags, size and a text sample for the first few spans.

    Args:
        page_dict: The result of ``page.get_text("dict")``.
        max_blocks: How many leading blocks to inspect.
        max_lines: How many leading lines to inspect per text block.
        max_spans: How many leading spans to inspect per line.
    """
    blocks = page_dict.get("blocks", [])
    print(f"Number of blocks: {len(blocks)}")

    for i, block in enumerate(blocks[:max_blocks]):
        print(f"\nBlock {i}: type={block.get('type')}")
        if block.get("type") != 0:
            # Type 0 is a text block; other block types are only listed.
            continue

        lines = block.get("lines", [])
        print(f" Lines: {len(lines)}")
        for line in lines[:max_lines]:
            spans = line.get("spans", [])
            print(f" Spans: {len(spans)}")
            for span in spans[:max_spans]:
                font = span.get("font", "")
                flags = span.get("flags", 0)
                size = span.get("size", 0)
                # Named span_text so it cannot shadow the page-level dict.
                span_text = span.get("text", "")
                print(f" Font: {font[:30]}...")
                print(f" Flags: {flags}")
                print(f" Size: {size}")
                print(f" Text sample: {repr(span_text[:50])}")


def _print_extraction_methods(page):
    """Print the page's plain-text and HTML extraction results."""
    print("\n--- Text extraction methods ---")

    # 1. Raw text
    raw = page.get_text()
    print(f"Raw text length: {len(raw)}")
    print(f"First 200 chars: {repr(raw[:200])}")

    # 2. HTML output (includes font info)
    html = page.get_text("html")
    print(f"HTML length: {len(html)}")
    families = re.findall(r'font-family:([^;]+);', html)
    if families:
        print(f"Font families used: {set(families)}")


def _print_font_resources(doc, page):
    """Print the page's content stream, object keys and per-font ToUnicode status."""
    # 3. Check if there's a ToUnicode map
    content_xrefs = page.get_contents()
    if content_xrefs:
        print(f"Content stream xref: {content_xrefs[0]}")
    else:
        # A page may legitimately have no content stream; do not index into
        # an empty list.
        print("Content stream xref: none (page has no content stream)")

    # Public replacement for the private page-object accessor: list the keys
    # of the page's PDF object via its xref.
    print(f"Page object keys: {doc.xref_get_keys(page.xref)}")

    # Each entry is (xref, ext, type, basefont, name, encoding, referencer).
    fonts = page.get_fonts(full=True)
    print(f"Number of fonts: {len(fonts)}")
    for font in fonts:
        font_xref, font_name = font[0], font[4]
        print(f" Font {font_name}: {font}")
        # xref_get_key returns a (type, value) pair; the type is "null" when
        # the key is absent. Skip the lookup for entries without a usable xref.
        has_to_unicode = (
            font_xref > 0
            and doc.xref_get_key(font_xref, "ToUnicode")[0] != "null"
        )
        if has_to_unicode:
            print(" Has ToUnicode map")
        else:
            print(" No ToUnicode map")


def analyze_pdf_fonts(pdf_path):
    """Print diagnostic information about the fonts on a PDF's first page.

    Reports the structure of the page's text dictionary, samples a few
    spans, shows the plain-text and HTML extraction results, and lists the
    fonts referenced by the page together with whether each one has a
    ``ToUnicode`` entry.

    Args:
        pdf_path: Path of the PDF file to open.
    """
    # The context manager closes the document even if a step below raises.
    with fitz.open(pdf_path) as doc:
        if doc.page_count == 0:
            print("Document has no pages")
            return

        page = doc[0]

        # Get page text with flags
        page_dict = page.get_text("dict")
        print("Page structure keys:", page_dict.keys())

        _print_span_samples(page_dict)
        _print_extraction_methods(page)
        _print_font_resources(doc, page)


if __name__ == "__main__":
    # An optional first argument overrides the default sample file.
    pdf_path = sys.argv[1] if len(sys.argv) > 1 else "test/safedistance.pdf"
    analyze_pdf_fonts(pdf_path)