ocr improved

2026-01-13 18:25:49 +08:00
parent 9745ca2476
commit a5eb381384
104 changed files with 818 additions and 229 deletions
--- a/LightRAG-main/lightrag/document_processor.py
+++ b/LightRAG-main/lightrag/document_processor.py
@@ -569,6 +569,22 @@ class DocumentProcessor:
        
        return processed_images, "\n".join(additional_content)
    
+    def _text_quality_score(self, text: str) -> float:
+        """Return a score between 0 and 1 indicating text quality.
+        Higher score means more readable English text."""
+        if not text:
+            return 0.0
+        total = len(text)
+        # Count printable ASCII characters (code points 32-126); newlines and tabs fall outside this range and are not counted
+        printable = sum(1 for c in text if 32 <= ord(c) <= 126)
+        # Count Unicode replacement characters (U+FFFD)
+        replacement = text.count('\ufffd')
+        # Count other non-ASCII characters. NOTE(review): non_ascii is never read below, so it does not affect the score
+        non_ascii = sum(1 for c in text if ord(c) > 127 and ord(c) != 0xfffd)
+        # Score = printable ratio multiplied by (1 - replacement ratio); both ratios are taken over len(text)
+        score = (printable / total) * (1 - (replacement / total))
+        return score
+
    async def _process_pdf(self, file_path: Path) -> ProcessingResult:
        """Process PDF files with text extraction and OCR fallback"""
        pdf_document = None
@@ -587,11 +603,14 @@ class DocumentProcessor:
                
                # Try text extraction first
                text = page.get_text()
-                if text.strip():
+                text_score = self._text_quality_score(text)
+                # Determine if text is usable (not garbled)
+                # Threshold 0.5 means at least half of characters are printable ASCII and not replacement
+                if text.strip() and text_score >= 0.5:
                    content_parts.append(f"Page {page_num + 1}:\n{text}")
                else:
-                    # Fall back to OCR for scanned pages with higher resolution
-                    logger.info(f"Page {page_num + 1} has no text, using high-resolution OCR")
+                    # Text is empty, garbled, or low quality -> use OCR
+                    logger.info(f"Page {page_num + 1} has no usable text (score {text_score:.3f}), using high-resolution OCR")
                    # Use higher resolution for better OCR accuracy on scanned documents
                    mat = fitz.Matrix(2, 2)  # 2x resolution for better OCR
                    pix = page.get_pixmap(matrix=mat)