OCR improved
This commit is contained in:
@@ -569,6 +569,22 @@ class DocumentProcessor:
|
||||
|
||||
return processed_images, "\n".join(additional_content)
|
||||
|
||||
def _text_quality_score(self, text: str) -> float:
|
||||
"""Return a score between 0 and 1 indicating text quality.
|
||||
Higher score means more readable English text."""
|
||||
if not text:
|
||||
return 0.0
|
||||
total = len(text)
|
||||
# Count printable ASCII letters and spaces
|
||||
printable = sum(1 for c in text if 32 <= ord(c) <= 126)
|
||||
# Count replacement characters (<28>) which is Unicode U+FFFD
|
||||
replacement = text.count('\ufffd')
|
||||
# Count other non-ASCII characters
|
||||
non_ascii = sum(1 for c in text if ord(c) > 127 and ord(c) != 0xfffd)
|
||||
# Score based on printable ratio, penalize replacement chars
|
||||
score = (printable / total) * (1 - (replacement / total))
|
||||
return score
|
||||
|
||||
async def _process_pdf(self, file_path: Path) -> ProcessingResult:
|
||||
"""Process PDF files with text extraction and OCR fallback"""
|
||||
pdf_document = None
|
||||
@@ -587,11 +603,14 @@ class DocumentProcessor:
|
||||
|
||||
# Try text extraction first
|
||||
text = page.get_text()
|
||||
if text.strip():
|
||||
text_score = self._text_quality_score(text)
|
||||
# Determine if text is usable (not garbled)
|
||||
# Threshold 0.5 means at least half of characters are printable ASCII and not replacement
|
||||
if text.strip() and text_score >= 0.5:
|
||||
content_parts.append(f"Page {page_num + 1}:\n{text}")
|
||||
else:
|
||||
# Fall back to OCR for scanned pages with higher resolution
|
||||
logger.info(f"Page {page_num + 1} has no text, using high-resolution OCR")
|
||||
# Text is empty, garbled, or low quality -> use OCR
|
||||
logger.info(f"Page {page_num + 1} has no usable text (score {text_score:.3f}), using high-resolution OCR")
|
||||
# Use higher resolution for better OCR accuracy on scanned documents
|
||||
mat = fitz.Matrix(2, 2) # 2x resolution for better OCR
|
||||
pix = page.get_pixmap(matrix=mat)
|
||||
|
||||
Reference in New Issue
Block a user