OCR speed improved
This commit is contained in:
@@ -586,7 +586,7 @@ class DocumentProcessor:
|
||||
return score
|
||||
|
||||
async def _process_pdf(self, file_path: Path) -> ProcessingResult:
|
||||
"""Process PDF files with text extraction and OCR fallback"""
|
||||
"""Process PDF files with text extraction and OCR fallback using batch processing"""
|
||||
pdf_document = None
|
||||
try:
|
||||
content_parts = []
|
||||
@@ -598,6 +598,10 @@ class DocumentProcessor:
|
||||
pdf_document = fitz.open(str(file_path))
|
||||
total_pages = len(pdf_document)
|
||||
|
||||
# Collect pages that need OCR
|
||||
ocr_pages = [] # list of (page_num, temp_path)
|
||||
page_texts = {} # page_num -> text (if usable)
|
||||
|
||||
for page_num in range(total_pages):
|
||||
page = pdf_document[page_num]
|
||||
|
||||
@@ -607,7 +611,7 @@ class DocumentProcessor:
|
||||
# Determine if text is usable (not garbled)
|
||||
# A threshold of 0.5 means at least half of the characters are printable ASCII and not replacement characters
|
||||
if text.strip() and text_score >= 0.5:
|
||||
content_parts.append(f"Page {page_num + 1}:\n{text}")
|
||||
page_texts[page_num] = text
|
||||
else:
|
||||
# Text is empty, garbled, or low quality -> use OCR
|
||||
logger.info(f"Page {page_num + 1} has no usable text (score {text_score:.3f}), using high-resolution OCR")
|
||||
@@ -621,34 +625,94 @@ class DocumentProcessor:
|
||||
temp_file.write(img_data)
|
||||
temp_path = temp_file.name
|
||||
|
||||
try:
|
||||
if self.ocr_processor.ocr_available:
|
||||
logger.info(f"Running OCR on page {page_num + 1} with high resolution")
|
||||
ocr_pages.append((page_num, temp_path))
|
||||
|
||||
# Process OCR pages in batch if any
|
||||
if ocr_pages and self.ocr_processor.ocr_available:
|
||||
try:
|
||||
temp_paths = [temp_path for _, temp_path in ocr_pages]
|
||||
logger.info(f"Running batch OCR on {len(temp_paths)} pages")
|
||||
batch_results = self.ocr_processor.extract_text_from_images_batch(temp_paths)
|
||||
logger.info(f"Batch OCR completed for {len(batch_results)} pages")
|
||||
|
||||
# Map results back to pages
|
||||
for idx, (page_num, temp_path) in enumerate(ocr_pages):
|
||||
ocr_result = None
|
||||
if idx < len(batch_results):
|
||||
batch_result = batch_results[idx]
|
||||
ocr_result = {
|
||||
"text": batch_result.text,
|
||||
"confidence": batch_result.confidence,
|
||||
"bboxes": batch_result.bboxes,
|
||||
"line_count": batch_result.line_count
|
||||
}
|
||||
else:
|
||||
# Fallback to individual OCR
|
||||
ocr_result = self.ocr_processor.extract_text_from_image(temp_path)
|
||||
|
||||
if ocr_result["text"].strip():
|
||||
logger.info(f"OCR extracted {len(ocr_result['text'])} characters from page {page_num + 1}")
|
||||
content_parts.append(f"Page {page_num + 1} (OCR):\n{str(ocr_result['text'])}")
|
||||
processed_with_ocr = True
|
||||
else:
|
||||
logger.warning(f"OCR returned empty text for page {page_num + 1}")
|
||||
# OCR found no text: add a placeholder entry for the page instead of empty content
|
||||
content_parts.append(f"Page {page_num + 1}: [Scanned content - no text detected by OCR]")
|
||||
|
||||
# Extract tables from OCR
|
||||
ocr_tables = self.ocr_processor.extract_tables_from_image(temp_path)
|
||||
if ocr_tables:
|
||||
logger.info(f"Found {len(ocr_tables)} tables on page {page_num + 1}")
|
||||
tables.extend(ocr_tables)
|
||||
|
||||
# Clean up temporary file
|
||||
if temp_path and os.path.exists(temp_path):
|
||||
os.unlink(temp_path)
|
||||
|
||||
except Exception as batch_error:
|
||||
logger.error(f"Batch OCR processing failed: {batch_error}")
|
||||
# Fall back to individual processing for each page
|
||||
for page_num, temp_path in ocr_pages:
|
||||
try:
|
||||
ocr_result = self.ocr_processor.extract_text_from_image(temp_path)
|
||||
|
||||
if ocr_result["text"].strip():
|
||||
logger.info(f"OCR extracted {len(ocr_result['text'])} characters from page {page_num + 1}")
|
||||
content_parts.append(f"Page {page_num + 1} (OCR):\n{str(ocr_result['text'])}")
|
||||
processed_with_ocr = True
|
||||
else:
|
||||
logger.warning(f"OCR returned empty text for page {page_num + 1}")
|
||||
# OCR found no text: add a placeholder entry for the page instead of empty content
|
||||
content_parts.append(f"Page {page_num + 1}: [Scanned content - no text detected by OCR]")
|
||||
|
||||
# Extract tables from OCR
|
||||
# Extract tables
|
||||
ocr_tables = self.ocr_processor.extract_tables_from_image(temp_path)
|
||||
if ocr_tables:
|
||||
logger.info(f"Found {len(ocr_tables)} tables on page {page_num + 1}")
|
||||
tables.extend(ocr_tables)
|
||||
else:
|
||||
logger.warning("OCR not available, skipping OCR processing")
|
||||
content_parts.append(f"Page {page_num + 1}: [Image content - OCR not available]")
|
||||
except Exception as ocr_error:
|
||||
logger.error(f"OCR processing failed for page {page_num + 1}: {ocr_error}")
|
||||
content_parts.append(f"Page {page_num + 1}: [Image content - OCR failed: {str(ocr_error)}]")
|
||||
finally:
|
||||
except Exception as ocr_error:
|
||||
logger.error(f"OCR processing failed for page {page_num + 1}: {ocr_error}")
|
||||
content_parts.append(f"Page {page_num + 1}: [Image content - OCR failed: {str(ocr_error)}]")
|
||||
finally:
|
||||
if temp_path and os.path.exists(temp_path):
|
||||
os.unlink(temp_path)
|
||||
elif ocr_pages and not self.ocr_processor.ocr_available:
|
||||
logger.warning("OCR not available, skipping OCR processing")
|
||||
for page_num, temp_path in ocr_pages:
|
||||
content_parts.append(f"Page {page_num + 1}: [Image content - OCR not available]")
|
||||
if temp_path and os.path.exists(temp_path):
|
||||
os.unlink(temp_path)
|
||||
|
||||
# Add text pages content
|
||||
for page_num, text in page_texts.items():
|
||||
content_parts.append(f"Page {page_num + 1}:\n{text}")
|
||||
|
||||
# Sort content parts by page number
|
||||
def extract_page_num(part):
|
||||
# Find the first number after "Page "
|
||||
import re
|
||||
match = re.search(r'Page\s+(\d+)', part)
|
||||
if match:
|
||||
return int(match.group(1))
|
||||
return 0
|
||||
|
||||
content_parts.sort(key=extract_page_num)
|
||||
|
||||
full_content = "\n\n".join(content_parts)
|
||||
|
||||
return ProcessingResult(
|
||||
|
||||
Reference in New Issue
Block a user