ocr improved
This commit is contained in:
75
analyze_pdf_fonts.py
Normal file
75
analyze_pdf_fonts.py
Normal file
@@ -0,0 +1,75 @@
|
||||
import fitz # PyMuPDF
|
||||
import sys
|
||||
|
||||
def _print_span_samples(blocks, max_blocks=5, max_lines=2, max_spans=2):
    """Print font, flags, size and a text sample for the leading spans.

    Only the first ``max_blocks`` blocks, the first ``max_lines`` lines of
    each text block and the first ``max_spans`` spans of each line are
    inspected, to keep the diagnostic output short.

    Args:
        blocks: The ``"blocks"`` list taken from ``page.get_text("dict")``.
        max_blocks: How many leading blocks to report on.
        max_lines: How many leading lines per text block to report on.
        max_spans: How many leading spans per line to report on.
    """
    for i, block in enumerate(blocks[:max_blocks]):
        print(f"\nBlock {i}: type={block.get('type')}")
        if block.get('type') != 0:  # only type 0 (text) blocks carry lines/spans
            continue
        lines = block.get('lines', [])
        print(f" Lines: {len(lines)}")
        for line in lines[:max_lines]:
            spans = line.get('spans', [])
            print(f" Spans: {len(spans)}")
            for span in spans[:max_spans]:
                font = span.get('font', '')
                flags = span.get('flags', 0)
                size = span.get('size', 0)
                span_text = span.get('text', '')
                print(f" Font: {font[:30]}...")
                print(f" Flags: {flags}")
                print(f" Size: {size}")
                print(f" Text sample: {repr(span_text[:50])}")


def _print_extraction_methods(page):
    """Print summaries of the plain-text and HTML extractions of ``page``.

    Also prints the xref of the first content stream, or a note when the
    page has no content stream at all.
    """
    import re

    print("\n--- Text extraction methods ---")

    # 1. Raw text
    raw = page.get_text()
    print(f"Raw text length: {len(raw)}")
    print(f"First 200 chars: {repr(raw[:200])}")

    # 2. HTML output (includes font info)
    html = page.get_text("html")
    print(f"HTML length: {len(html)}")
    # Collect every font-family declaration found in the HTML styling.
    families = re.findall(r'font-family:([^;]+);', html)
    if families:
        print(f"Font families used: {set(families)}")

    # 3. First content stream xref. A page without any content stream
    #    yields an empty list, so guard the index access.
    contents = page.get_contents()
    if contents:
        print(f"Content stream xref: {contents[0]}")
    else:
        print("Content stream xref: none (page has no content stream)")


def _print_font_resources(doc, page):
    """Print the page object's keys and, per font, whether it has /ToUnicode.

    Uses the public PyMuPDF xref API (``xref_get_keys``, ``xref_object``)
    and ``page.get_fonts()`` to inspect the fonts referenced by the page.
    """
    print(f"Page object keys: {doc.xref_get_keys(page.xref)}")

    fonts = page.get_fonts()
    print(f"Number of fonts: {len(fonts)}")
    for font in fonts:
        # get_fonts() items start with the font xref; index 4 is the
        # reference name used in the page resources (e.g. "F1").
        font_xref, font_name = font[0], font[4]
        # An xref of 0 means there is no font object to read (built-in
        # font used directly), so treat its source as empty.
        font_obj = doc.xref_object(font_xref, compressed=True) if font_xref > 0 else ""
        print(f" Font {font_name}: {font_obj}")
        # Check if font has ToUnicode
        if '/ToUnicode' in font_obj:
            print(" Has ToUnicode map")
        else:
            print(" No ToUnicode map")


def analyze_pdf_fonts(pdf_path):
    """Print font and text-extraction diagnostics for the first page of a PDF.

    The report covers: the structure returned by ``get_text("dict")`` with
    per-span font samples, the raw-text and HTML extractions, the first
    content stream xref, and whether each font referenced by the page has
    a ``/ToUnicode`` entry in its object definition.

    Args:
        pdf_path: Path of the PDF file to open with ``fitz.open``.

    Returns:
        None. All results are written to standard output.
    """
    # The context manager closes the document even if a step below raises.
    with fitz.open(pdf_path) as doc:
        if len(doc) == 0:
            print("Document has no pages")
            return
        page = doc[0]

        # Get page text with flags
        page_dict = page.get_text("dict")
        print("Page structure keys:", page_dict.keys())

        # Look for blocks
        blocks = page_dict.get("blocks", [])
        print(f"Number of blocks: {len(blocks)}")

        _print_span_samples(blocks)
        _print_extraction_methods(page)
        _print_font_resources(doc, page)
|
||||
|
||||
if __name__ == "__main__":
    # Take the PDF path from the first command-line argument when given;
    # otherwise fall back to the bundled sample so a bare invocation
    # behaves exactly as before.
    pdf_path = sys.argv[1] if len(sys.argv) > 1 else "test/safedistance.pdf"
    analyze_pdf_fonts(pdf_path)
|
||||
Reference in New Issue
Block a user