import fitz


def text_quality_score(text: str) -> float:
    """Return a score in [0, 1] estimating how readable *text* is.

    The score is the fraction of characters that are printable ASCII
    (code points 32-126), scaled down by the fraction of characters that
    are U+FFFD replacement characters. Higher means more readable.

    Note that control characters such as newlines and tabs fall outside
    the 32-126 range, so they lower the score just like any other
    non-printable character.

    Args:
        text: The extracted text to evaluate.

    Returns:
        0.0 when *text* is empty (or otherwise falsy); a float between
        0 and 1 in all other cases.
    """
    if not text:
        return 0.0

    total = len(text)
    # Printable ASCII: space (32) through tilde (126), punctuation and
    # digits included.
    printable = sum(1 for c in text if 32 <= ord(c) <= 126)
    # U+FFFD marks characters the extractor could not decode.
    replacement = text.count("\ufffd")

    # Printable ratio, penalized by the share of replacement characters.
    return (printable / total) * (1 - (replacement / total))


def analyze_pdf(pdf_path, *, threshold: float = 0.5) -> None:
    """Print a text-quality report for the first page of a PDF.

    Opens *pdf_path* with PyMuPDF, extracts the text of page 0, scores it
    with ``text_quality_score`` and prints the text length, the score, a
    sample of the text and a recommendation on whether to fall back to OCR.

    Args:
        pdf_path: Path of the PDF file to inspect.
        threshold: Scores strictly below this value produce the "Use OCR"
            recommendation. The default of 0.5 is an arbitrary cut-off.

    Errors raised while opening the document or accessing its first page
    propagate to the caller; the document is closed in either case.
    """
    # The context manager closes the document even when page access or
    # text extraction raises.
    with fitz.open(pdf_path) as doc:
        text = doc[0].get_text()

    score = text_quality_score(text)
    print(f"Text length: {len(text)}")
    print(f"Quality score: {score:.3f}")
    print(f"Sample (first 500 chars): {repr(text[:500])}")

    # Decide whether the embedded text layer is usable or OCR is needed.
    if score < threshold:
        print("Recommendation: Use OCR (text is garbled)")
    else:
        print("Recommendation: Use extracted text")


if __name__ == "__main__":
    analyze_pdf("test/safedistance.pdf")
    # Compare with a good PDF
    print("\n--- For comparison, test ocr.pdf ---")
    analyze_pdf("ocr.pdf")