ocr improved

This commit is contained in:
2026-01-13 18:25:49 +08:00
parent 9745ca2476
commit a5eb381384
104 changed files with 818 additions and 229 deletions

39
test_text_quality.py Normal file
View File

@@ -0,0 +1,39 @@
import fitz
def text_quality_score(text):
    """Return a score in [0, 1] estimating how readable *text* is.

    The score is the fraction of characters that are printable ASCII
    (code points 32-126), scaled down by the fraction of characters that
    are the Unicode replacement character U+FFFD.

    Args:
        text: The extracted text to evaluate. ``None`` or an empty string
            yields a score of 0.0.

    Returns:
        A float between 0.0 and 1.0; higher means more readable.

    Note:
        Newlines, tabs and other control characters fall outside 32-126,
        so they lower the score just like non-ASCII characters do.
    """
    if not text:
        return 0.0
    total = len(text)
    # Printable ASCII spans space (32) through tilde (126).
    printable = sum(1 for c in text if 32 <= ord(c) <= 126)
    # U+FFFD marks characters the extractor could not decode; each one
    # counts against the printable ratio and is penalized again here.
    replacement = text.count("\ufffd")
    return (printable / total) * (1 - (replacement / total))
def analyze_pdf(pdf_path, threshold=0.5):
    """Print a text-quality report for the first page of a PDF.

    Opens the document with PyMuPDF, extracts the text of page 0, scores
    it with ``text_quality_score`` and prints the text length, the score,
    a sample of the text and a recommendation on whether to use OCR.

    Args:
        pdf_path: Path of the PDF file to inspect.
        threshold: Scores strictly below this value produce the "Use OCR"
            recommendation. The default of 0.5 is an arbitrary cut-off.
    """
    # The context manager closes the document even if page access or
    # text extraction raises.
    with fitz.open(pdf_path) as doc:
        text = doc[0].get_text()

    score = text_quality_score(text)
    print(f"Text length: {len(text)}")
    print(f"Quality score: {score:.3f}")
    print(f"Sample (first 500 chars): {repr(text[:500])}")

    # Determine if we should use OCR
    if score < threshold:
        print("Recommendation: Use OCR (text is garbled)")
    else:
        print("Recommendation: Use extracted text")
if __name__ == "__main__":
    # Each entry pairs an optional banner (printed before the report)
    # with the PDF to analyze; the second file serves as a comparison
    # against a good PDF.
    runs = (
        (None, "test/safedistance.pdf"),
        ("\n--- For comparison, test ocr.pdf ---", "ocr.pdf"),
    )
    for banner, pdf in runs:
        if banner is not None:
            print(banner)
        analyze_pdf(pdf)