ocr improved
This commit is contained in:
39
test_text_quality.py
Normal file
39
test_text_quality.py
Normal file
@@ -0,0 +1,39 @@
|
||||
import fitz
|
||||
|
||||
def text_quality_score(text):
    """Return a score between 0 and 1 indicating text quality.

    Higher score means more readable English text. The score is the
    fraction of characters that are printable ASCII (code points 32-126:
    letters, digits, punctuation and the space character), multiplied by
    one minus the fraction of characters that are the Unicode replacement
    character U+FFFD.

    Args:
        text: The string to evaluate. An empty string or ``None`` is
            accepted.

    Returns:
        A float in the range [0.0, 1.0]; 0.0 when ``text`` is empty or
        ``None``.
    """
    if not text:
        return 0.0

    total = len(text)

    # Printable ASCII is " " (32) through "~" (126). Control characters such
    # as "\n" and "\t" fall outside this range and therefore lower the score.
    printable = sum(" " <= ch <= "~" for ch in text)

    # Replacement characters (U+FFFD) are penalized on top of not counting
    # as printable.
    replacement = text.count("\ufffd")

    return (printable / total) * (1 - (replacement / total))
|
||||
|
||||
def analyze_pdf(pdf_path, threshold=0.5):
    """Print a text-quality report for the first page of a PDF.

    Opens ``pdf_path`` with ``fitz``, extracts the text of page 0, scores it
    with ``text_quality_score`` and prints the text length, the score, the
    ``repr`` of the first 500 characters, and a recommendation on whether to
    fall back to OCR.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.
        threshold: Scores strictly below this value produce the "Use OCR"
            recommendation. The default of 0.5 is an arbitrary cut-off.

    Returns:
        None. All results are written to standard output.
    """
    doc = fitz.open(pdf_path)
    try:
        # Only the first page is sampled.
        text = doc[0].get_text()
    finally:
        # Release the document even if page access or extraction fails.
        doc.close()

    score = text_quality_score(text)
    print(f"Text length: {len(text)}")
    print(f"Quality score: {score:.3f}")
    print(f"Sample (first 500 chars): {repr(text[:500])}")

    # Determine if we should use OCR
    if score < threshold:
        print("Recommendation: Use OCR (text is garbled)")
    else:
        print("Recommendation: Use extracted text")
|
||||
|
||||
if __name__ == "__main__":
    # PDF whose extracted text is being evaluated.
    sample_pdf = "test/safedistance.pdf"
    # PDF used as the comparison case (described as a good PDF).
    reference_pdf = "ocr.pdf"

    analyze_pdf(sample_pdf)
    print("\n--- For comparison, test ocr.pdf ---")
    analyze_pdf(reference_pdf)
|
||||
Reference in New Issue
Block a user