# railseek6/test_text_quality.py

import fitz

def text_quality_score(text):
    """Return a score between 0 and 1 indicating text quality.

    The score is the fraction of readable characters multiplied by
    ``1 - (fraction of U+FFFD replacement characters)``. A character is
    considered readable when it is printable ASCII (space through ``~``)
    or ordinary layout whitespace (tab, line feed, carriage return), so
    clean multi-line text scores 1.0.

    Any other character (including accented letters and other non-ASCII
    text) lowers the score, so the metric is only meaningful for text
    that is expected to be plain English/ASCII.

    Args:
        text: The extracted text to evaluate. ``None`` or an empty string
            yields 0.0.

    Returns:
        A float in the range [0.0, 1.0]; higher means more readable.
    """
    if not text:
        return 0.0

    total = len(text)
    layout_whitespace = "\t\n\r"
    # Line breaks and tabs are part of normal extracted text and must not
    # be counted as garbage alongside control or binary characters.
    readable = sum(
        1 for ch in text if " " <= ch <= "~" or ch in layout_whitespace
    )
    # U+FFFD marks characters the extractor could not decode. They already
    # count as unreadable above; the second factor penalizes them again.
    replacement = text.count("\ufffd")
    return (readable / total) * (1 - replacement / total)

def analyze_pdf(pdf_path, threshold=0.5):
    """Print a text-quality report for the first page of a PDF.

    Opens the PDF, extracts the text of page 0 only, scores it with
    ``text_quality_score`` and prints the text length, the score, a
    ``repr`` of the first 500 characters and a recommendation on whether
    to fall back to OCR.

    Args:
        pdf_path: Path of the PDF file to inspect.
        threshold: Scores strictly below this value produce the
            "Use OCR" recommendation. The default of 0.5 is arbitrary.

    Returns:
        None. All results are written to stdout.
    """
    # The context manager closes the document even if page access or
    # text extraction raises.
    with fitz.open(pdf_path) as doc:
        text = doc[0].get_text()

    score = text_quality_score(text)
    print(f"Text length: {len(text)}")
    print(f"Quality score: {score:.3f}")
    print(f"Sample (first 500 chars): {repr(text[:500])}")
    # Determine if we should use OCR
    if score < threshold:
        print("Recommendation: Use OCR (text is garbled)")
    else:
        print("Recommendation: Use extracted text")

if __name__ == "__main__":
    # Paths are relative to the current working directory.
    primary_pdf = "test/safedistance.pdf"
    comparison_pdf = "ocr.pdf"

    # Report on the document under investigation first.
    analyze_pdf(primary_pdf)

    # Then run the same report on a PDF regarded as good, as a baseline.
    print("\n--- For comparison, test ocr.pdf ---")
    analyze_pdf(comparison_pdf)