ocr speed improved

2026-01-13 19:10:24 +08:00
parent a5eb381384
commit e7256a10ea
7 changed files with 283 additions and 22 deletions
--- a/LightRAG-main/lightrag/document_processor.py
+++ b/LightRAG-main/lightrag/document_processor.py
@@ -586,7 +586,7 @@ class DocumentProcessor:
        return score

    async def _process_pdf(self, file_path: Path) -> ProcessingResult:
-        """Process PDF files with text extraction and OCR fallback"""
+        """Process PDF files with text extraction and OCR fallback using batch processing"""
        pdf_document = None
        try:
            content_parts = []
@@ -598,6 +598,10 @@ class DocumentProcessor:
            pdf_document = fitz.open(str(file_path))
            total_pages = len(pdf_document)
            
+            # Collect pages that need OCR
+            ocr_pages = []  # list of (page_num, temp_path)
+            page_texts = {}  # page_num -> text (if usable)
+            
            for page_num in range(total_pages):
                page = pdf_document[page_num]
                
@@ -607,7 +611,7 @@ class DocumentProcessor:
                # Determine if text is usable (not garbled)
                # Threshold 0.5 means at least half of characters are printable ASCII and not replacement
                if text.strip() and text_score >= 0.5:
-                    content_parts.append(f"Page {page_num + 1}:\n{text}")
+                    page_texts[page_num] = text
                else:
                    # Text is empty, garbled, or low quality -> use OCR
                    logger.info(f"Page {page_num + 1} has no usable text (score {text_score:.3f}), using high-resolution OCR")
@@ -621,34 +625,94 @@ class DocumentProcessor:
                        temp_file.write(img_data)
                        temp_path = temp_file.name
                    
-                    try:
-                        if self.ocr_processor.ocr_available:
-                            logger.info(f"Running OCR on page {page_num + 1} with high resolution")
+                    ocr_pages.append((page_num, temp_path))
+            
+            # Process OCR pages in batch if any
+            if ocr_pages and self.ocr_processor.ocr_available:
+                try:
+                    temp_paths = [temp_path for _, temp_path in ocr_pages]
+                    logger.info(f"Running batch OCR on {len(temp_paths)} pages")
+                    batch_results = self.ocr_processor.extract_text_from_images_batch(temp_paths)
+                    logger.info(f"Batch OCR completed for {len(batch_results)} pages")
+                    
+                    # Map results back to pages
+                    for idx, (page_num, temp_path) in enumerate(ocr_pages):
+                        ocr_result = None
+                        if idx < len(batch_results):
+                            batch_result = batch_results[idx]
+                            ocr_result = {
+                                "text": batch_result.text,
+                                "confidence": batch_result.confidence,
+                                "bboxes": batch_result.bboxes,
+                                "line_count": batch_result.line_count
+                            }
+                        else:
+                            # Fallback to individual OCR
+                            ocr_result = self.ocr_processor.extract_text_from_image(temp_path)
+                        
+                        if ocr_result["text"].strip():
+                            logger.info(f"OCR extracted {len(ocr_result['text'])} characters from page {page_num + 1}")
+                            content_parts.append(f"Page {page_num + 1} (OCR):\n{str(ocr_result['text'])}")
+                            processed_with_ocr = True
+                        else:
+                            logger.warning(f"OCR returned empty text for page {page_num + 1}")
+                            # No text detected: add a placeholder so the page still appears in the output
+                            content_parts.append(f"Page {page_num + 1}: [Scanned content - no text detected by OCR]")
+                        
+                        # Extract tables from OCR
+                        ocr_tables = self.ocr_processor.extract_tables_from_image(temp_path)
+                        if ocr_tables:
+                            logger.info(f"Found {len(ocr_tables)} tables on page {page_num + 1}")
+                            tables.extend(ocr_tables)
+                        
+                        # Clean up temporary file
+                        if temp_path and os.path.exists(temp_path):
+                            os.unlink(temp_path)
+                
+                except Exception as batch_error:
+                    logger.error(f"Batch OCR processing failed: {batch_error}")
+                    # Fall back to per-page OCR for every page (NOTE: pages already handled before the failure are processed again)
+                    for page_num, temp_path in ocr_pages:
+                        try:
                            ocr_result = self.ocr_processor.extract_text_from_image(temp_path)
-                            
                            if ocr_result["text"].strip():
-                                logger.info(f"OCR extracted {len(ocr_result['text'])} characters from page {page_num + 1}")
                                content_parts.append(f"Page {page_num + 1} (OCR):\n{str(ocr_result['text'])}")
                                processed_with_ocr = True
                            else:
-                                logger.warning(f"OCR returned empty text for page {page_num + 1}")
-                                # Don't add empty content, just mark as processed
                                content_parts.append(f"Page {page_num + 1}: [Scanned content - no text detected by OCR]")
                            
-                            # Extract tables from OCR
+                            # Extract tables
                            ocr_tables = self.ocr_processor.extract_tables_from_image(temp_path)
                            if ocr_tables:
-                                logger.info(f"Found {len(ocr_tables)} tables on page {page_num + 1}")
                                tables.extend(ocr_tables)
-                        else:
-                            logger.warning("OCR not available, skipping OCR processing")
-                            content_parts.append(f"Page {page_num + 1}: [Image content - OCR not available]")
-                    except Exception as ocr_error:
-                        logger.error(f"OCR processing failed for page {page_num + 1}: {ocr_error}")
-                        content_parts.append(f"Page {page_num + 1}: [Image content - OCR failed: {str(ocr_error)}]")
-                    finally:
+                        except Exception as ocr_error:
+                            logger.error(f"OCR processing failed for page {page_num + 1}: {ocr_error}")
+                            content_parts.append(f"Page {page_num + 1}: [Image content - OCR failed: {str(ocr_error)}]")
+                        finally:
+                            if temp_path and os.path.exists(temp_path):
+                                os.unlink(temp_path)
+            elif ocr_pages and not self.ocr_processor.ocr_available:
+                logger.warning("OCR not available, skipping OCR processing")
+                for page_num, temp_path in ocr_pages:
+                    content_parts.append(f"Page {page_num + 1}: [Image content - OCR not available]")
+                    if temp_path and os.path.exists(temp_path):
                        os.unlink(temp_path)
            
+            # Add text pages content
+            for page_num, text in page_texts.items():
+                content_parts.append(f"Page {page_num + 1}:\n{text}")
+            
+            # Sort content parts by page number
+            def extract_page_num(part):
+                # Find the first number after "Page "
+                import re
+                match = re.search(r'Page\s+(\d+)', part)
+                if match:
+                    return int(match.group(1))
+                return 0
+            
+            content_parts.sort(key=extract_page_num)
+            
            full_content = "\n\n".join(content_parts)
            
            return ProcessingResult(