39 lines
1.3 KiB
Python
39 lines
1.3 KiB
Python
import fitz
|
||
|
||
def text_quality_score(text):
    """Return a score between 0 and 1 indicating text quality.

    A higher score means the text looks more like readable English. The
    score is the fraction of readable characters multiplied by
    ``1 - (fraction of U+FFFD replacement characters)``.

    Readable characters are printable ASCII (code points 32..126) plus the
    layout whitespace newline, carriage return and tab, so that ordinary
    line breaks in extracted text do not lower the score of clean text.

    Args:
        text: The extracted text to score. ``None`` or an empty string
            yields 0.0.

    Returns:
        A float in the range [0.0, 1.0].
    """
    if not text:
        return 0.0

    total = len(text)

    # Printable ASCII plus layout whitespace counts as readable; every other
    # character (control codes, non-ASCII, U+FFFD) does not.
    readable = sum(1 for ch in text if " " <= ch <= "~" or ch in "\n\r\t")

    # U+FFFD is the Unicode replacement character emitted when decoding
    # failed; it is penalized a second time on top of being unreadable.
    replacement = text.count("\ufffd")

    return (readable / total) * (1 - replacement / total)
|
||
|
||
def analyze_pdf(pdf_path, *, threshold=0.5):
    """Print a text-quality report for the first page of a PDF.

    Opens ``pdf_path`` with ``fitz``, extracts the text of page 0, scores it
    with ``text_quality_score`` and prints the text length, the score, a
    ``repr`` of the first 500 characters, and an OCR recommendation.

    Args:
        pdf_path: Path of the PDF file to open.
        threshold: Scores strictly below this value produce the "Use OCR"
            recommendation. The default of 0.5 is an arbitrary cut-off.

    Returns:
        None. All results are printed to stdout.
    """
    doc = fitz.open(pdf_path)
    try:
        # Only the first page is sampled.
        text = doc[0].get_text()
    finally:
        # Close the document even if page access or extraction raises.
        doc.close()

    score = text_quality_score(text)

    print(f"Text length: {len(text)}")
    print(f"Quality score: {score:.3f}")
    print(f"Sample (first 500 chars): {repr(text[:500])}")

    # Determine if we should use OCR
    if score < threshold:
        print("Recommendation: Use OCR (text is garbled)")
    else:
        print("Recommendation: Use extracted text")
|
||
|
||
if __name__ == "__main__":
    # Paths of the two documents analyzed when run as a script.
    primary_pdf = "test/safedistance.pdf"
    comparison_pdf = "ocr.pdf"

    analyze_pdf(primary_pdf)

    # Run the same report on a good PDF so the two outputs can be compared.
    print("\n--- For comparison, test ocr.pdf ---")
    analyze_pdf(comparison_pdf)