ocr improved

2026-01-13 18:25:49 +08:00
parent 9745ca2476
commit a5eb381384
104 changed files with 818 additions and 229 deletions
--- a/test_text_quality.py
+++ b/test_text_quality.py
@@ -0,0 +1,39 @@
+import fitz
+
+def text_quality_score(text):
+    """Return a score between 0 and 1 indicating text quality.
+
+    The score is the fraction of printable ASCII characters (code points
+    32-126) scaled down by the fraction of Unicode replacement characters
+    (U+FFFD). Higher score means more readable English text.
+
+    Args:
+        text: Extracted text to evaluate; may be empty or None.
+
+    Returns:
+        A float in [0.0, 1.0]; 0.0 for empty or missing text.
+    """
+    if not text:
+        return 0.0
+    total = len(text)
+    # Printable ASCII spans space (32) through tilde (126), so digits and
+    # punctuation count too; newlines and tabs do not.
+    printable = sum(1 for c in text if 32 <= ord(c) <= 126)
+    # U+FFFD is the Unicode replacement character.
+    replacement = text.count('\ufffd')
+    # Other non-ASCII characters get no separate penalty: they already
+    # lower the printable ratio.
+    return (printable / total) * (1 - (replacement / total))
+
+def analyze_pdf(pdf_path, threshold=0.5):
+    """Print a text-quality report for the first page of a PDF.
+
+    Extracts the text of page 0, scores it with text_quality_score, and
+    prints the text length, the score, a sample of the first 500
+    characters and a recommendation on whether to use OCR instead.
+
+    Args:
+        pdf_path: Path of the PDF file to open with fitz.
+        threshold: Scores below this value yield the "Use OCR"
+            recommendation. The default of 0.5 is arbitrary.
+    """
+    # The with-block closes the document even when page access or text
+    # extraction raises.
+    with fitz.open(pdf_path) as doc:
+        text = doc[0].get_text()
+    score = text_quality_score(text)
+    print(f"Text length: {len(text)}")
+    print(f"Quality score: {score:.3f}")
+    print(f"Sample (first 500 chars): {repr(text[:500])}")
+    # Determine if we should use OCR
+    if score < threshold:
+        print("Recommendation: Use OCR (text is garbled)")
+    else:
+        print("Recommendation: Use extracted text")
+
+if __name__ == "__main__":
+    # Report on the sample PDF first, then on a good PDF for comparison;
+    # only the comparison run is preceded by a banner line.
+    comparison_banner = "\n--- For comparison, test ocr.pdf ---"
+    for banner, target in ((None, "test/safedistance.pdf"),
+                           (comparison_banner, "ocr.pdf")):
+        if banner is not None:
+            print(banner)
+        analyze_pdf(target)