# railseek6/analyze_pdf_fonts.py

import fitz  # PyMuPDF
import sys

def analyze_pdf_fonts(pdf_path):
    """Print a font and text-extraction diagnostic report for a PDF's first page.

    The report is written to stdout and covers, in order:

    1. The keys of the page's ``get_text("dict")`` structure and, for the
       first 5 blocks / 2 lines per block / 2 spans per line, each span's
       font name, flags, size and a text sample.
    2. The length and first 200 characters of the plain-text extraction.
    3. The length of the HTML extraction and the ``font-family`` values
       found in it.
    4. For PDF documents: the first content-stream xref, the keys of the
       page object, and every font referenced by the page together with
       whether its font dictionary carries a ``/ToUnicode`` entry.

    Args:
        pdf_path: Path of the document to open with ``fitz.open``.

    Returns:
        None. All results are printed.

    Raises:
        Whatever ``fitz.open`` raises when the file cannot be opened.
    """
    import re  # local import: the file header does not import ``re``

    # The context manager closes the document even if a step below raises.
    with fitz.open(pdf_path) as doc:
        if doc.page_count == 0:
            print("Document has no pages")
            return
        page = doc[0]

        # Get page text with flags
        page_dict = page.get_text("dict")
        print("Page structure keys:", page_dict.keys())

        # Look for blocks
        blocks = page_dict.get("blocks", [])
        print(f"Number of blocks: {len(blocks)}")

        for i, block in enumerate(blocks[:5]):  # first few blocks
            print(f"\nBlock {i}: type={block.get('type')}")
            if block.get('type') != 0:  # only text blocks carry lines/spans
                continue
            lines = block.get('lines', [])
            print(f"  Lines: {len(lines)}")
            for line in lines[:2]:
                spans = line.get('spans', [])
                print(f"    Spans: {len(spans)}")
                for span in spans[:2]:
                    font = span.get('font', '')
                    flags = span.get('flags', 0)
                    size = span.get('size', 0)
                    # Distinct name so the page dict above is not shadowed.
                    span_text = span.get('text', '')
                    print(f"      Font: {font[:30]}...")
                    print(f"      Flags: {flags}")
                    print(f"      Size: {size}")
                    print(f"      Text sample: {repr(span_text[:50])}")

        # Try to extract text with different encodings
        print("\n--- Text extraction methods ---")
        # 1. Raw text
        raw = page.get_text()
        print(f"Raw text length: {len(raw)}")
        print(f"First 200 chars: {repr(raw[:200])}")

        # 2. HTML output (includes font info)
        html = page.get_text("html")
        print(f"HTML length: {len(html)}")
        # find font-family in html
        font_families = re.findall(r'font-family:([^;]+);', html)
        if font_families:
            print(f"Font families used: {set(font_families)}")

        # 3. Check if there's a ToUnicode map
        # Content streams, xrefs and font dictionaries exist only in PDFs.
        if not doc.is_pdf:
            print("Not a PDF document; skipping content stream and font checks")
            return

        content_xrefs = page.get_contents()
        if content_xrefs:
            print(f"Content stream xref: {content_xrefs[0]}")  # first content stream xref
        else:
            # A blank page may have no /Contents entry at all.
            print("Page has no content stream")

        # Keys of the page object's PDF dictionary
        print(f"Page object keys: {doc.xref_get_keys(page.xref)}")

        # Fonts referenced by the page. Each entry is a tuple
        # (xref, ext, type, basefont, name, encoding, referencer).
        page_fonts = page.get_fonts(full=True)
        print(f"Number of fonts: {len(page_fonts)}")
        for font_entry in page_fonts:
            font_xref = font_entry[0]
            font_name = font_entry[4]
            print(f"  Font {font_name}: {font_entry}")
            # xref_get_key returns ("null", "null") for an absent key; an
            # xref of 0 cannot be looked up, so it is reported as no map.
            has_to_unicode = (
                font_xref > 0
                and doc.xref_get_key(font_xref, "ToUnicode")[0] != "null"
            )
            if has_to_unicode:
                print("    Has ToUnicode map")
            else:
                print("    No ToUnicode map")

if __name__ == "__main__":
    # Analyze the PDF named on the command line; with no argument, fall back
    # to the bundled sample so a bare invocation behaves as before.
    pdf_path = sys.argv[1] if len(sys.argv) > 1 else "test/safedistance.pdf"
    analyze_pdf_fonts(pdf_path)