ocr improved

This commit is contained in:
2026-01-13 18:25:49 +08:00
parent 9745ca2476
commit a5eb381384
104 changed files with 818 additions and 229 deletions

View File

@@ -569,6 +569,22 @@ class DocumentProcessor:
return processed_images, "\n".join(additional_content)
def _text_quality_score(self, text: str) -> float:
"""Return a score between 0 and 1 indicating text quality.
Higher score means more readable English text."""
if not text:
return 0.0
total = len(text)
# Count printable ASCII letters and spaces
printable = sum(1 for c in text if 32 <= ord(c) <= 126)
# Count replacement characters (<28>) which is Unicode U+FFFD
replacement = text.count('\ufffd')
# Count other non-ASCII characters
non_ascii = sum(1 for c in text if ord(c) > 127 and ord(c) != 0xfffd)
# Score based on printable ratio, penalize replacement chars
score = (printable / total) * (1 - (replacement / total))
return score
async def _process_pdf(self, file_path: Path) -> ProcessingResult:
"""Process PDF files with text extraction and OCR fallback"""
pdf_document = None
@@ -587,11 +603,14 @@ class DocumentProcessor:
# Try text extraction first
text = page.get_text()
if text.strip():
text_score = self._text_quality_score(text)
# Determine if text is usable (not garbled)
# Threshold 0.5 means at least half of characters are printable ASCII and not replacement
if text.strip() and text_score >= 0.5:
content_parts.append(f"Page {page_num + 1}:\n{text}")
else:
# Fall back to OCR for scanned pages with higher resolution
logger.info(f"Page {page_num + 1} has no text, using high-resolution OCR")
# Text is empty, garbled, or low quality -> use OCR
logger.info(f"Page {page_num + 1} has no usable text (score {text_score:.3f}), using high-resolution OCR")
# Use higher resolution for better OCR accuracy on scanned documents
mat = fitz.Matrix(2, 2) # 2x resolution for better OCR
pix = page.get_pixmap(matrix=mat)