table detection enhanced

This commit is contained in:
2026-01-14 15:15:01 +08:00
parent e7256a10ea
commit 1838c37302
14 changed files with 18065490 additions and 71 deletions

View File

@@ -396,6 +396,17 @@ class DocumentProcessor:
logger.info("Image classifier initialized successfully")
except Exception as e:
logger.warning(f"Failed to initialize image classifier: {e}")
# Initialize Tabula for PDF table extraction (optional dependency)
self.tabula_available = False
try:
import tabula
self.tabula_available = True
logger.info("Tabula initialized successfully for PDF table extraction")
except ImportError:
logger.warning("Tabula not available. PDF table extraction will use OCR-based method only.")
except Exception as e:
logger.warning(f"Failed to initialize Tabula: {e}")
async def process_document(self, file_path: str) -> ProcessingResult:
"""Process document based on file extension"""
@@ -569,6 +580,64 @@ class DocumentProcessor:
return processed_images, "\n".join(additional_content)
def _extract_tables_with_tabula(self, pdf_path: str) -> List[Dict[str, Any]]:
"""
Extract tables from PDF using Tabula (for digital PDFs with text layers)
Args:
pdf_path: Path to PDF file
Returns:
List of table dictionaries
"""
if not self.tabula_available:
return []
try:
import tabula
import pandas as pd
# Try to extract tables from all pages
tables = []
# Try lattice mode first (for bordered tables), then fall back to stream mode
# (for borderless tables) if nothing is found; the two modes should not be
# combined in a single call
dfs = tabula.read_pdf(pdf_path, pages='all', multiple_tables=True, lattice=True, guess=False, silent=True)
if not dfs:
    dfs = tabula.read_pdf(pdf_path, pages='all', multiple_tables=True, stream=True, guess=False, silent=True)
for i, df in enumerate(dfs):
if df is not None and not df.empty:
# Convert DataFrame to table structure
table_data = df.values.tolist()
columns = df.columns.tolist()
# Add column headers as first row only if they carry meaningful names
has_header = any(col and str(col).strip() for col in columns)
if has_header:
table_data.insert(0, columns)
if table_data:
tables.append({
"data": table_data,
"rows": len(table_data),
"columns": len(table_data[0]) if table_data else 0,
"source": "tabula",
"table_index": i,
"has_header": has_header
})
logger.info(f"Tabula extracted {len(tables)} tables from {pdf_path}")
return tables
except Exception as e:
logger.warning(f"Tabula table extraction failed for {pdf_path}: {e}")
return []
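Each entry returned above is a plain dict with "data" (list of rows), "rows", "columns", "source", "table_index" and "has_header" keys. A short sketch of how such a dict might be flattened to text before indexing; the helper name is illustrative and not part of this commit:

    from typing import Any, Dict

    def table_to_text(table: Dict[str, Any]) -> str:
        # Render the extracted table as pipe-separated lines, one line per row
        rows = table.get("data", [])
        return "\n".join(" | ".join(str(cell).strip() for cell in row) for row in rows)

    # Example: {"data": [["Name", "Qty"], ["Widget", "3"]], ...} -> "Name | Qty\nWidget | 3"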
def _text_quality_score(self, text: str) -> float:
"""Return a score between 0 and 1 indicating text quality.
Higher score means more readable English text."""
@@ -586,21 +655,30 @@ class DocumentProcessor:
return score
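The body of _text_quality_score is elided by the hunk above. A minimal sketch of one plausible scorer, consistent with the printable-ASCII and replacement-character criterion referenced alongside the 0.5 threshold below; the exact weighting used in this repository is an assumption:

    def text_quality_score(text: str) -> float:
        # Fraction of printable ASCII (plus whitespace), penalised by U+FFFD
        # replacement characters left behind by failed text decoding
        if not text or not text.strip():
            return 0.0
        printable = sum(1 for ch in text if ch in "\n\t" or 32 <= ord(ch) < 127)
        replacement = text.count("\ufffd")
        return max(0.0, (printable - replacement) / len(text))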
async def _process_pdf(self, file_path: Path) -> ProcessingResult:
"""Process PDF files with text extraction and OCR fallback using batch processing"""
"""Process PDF files with hybrid approach: Tabula for digital PDFs, OCR for scanned"""
pdf_document = None
try:
content_parts = []
tables = []
images = []
processed_with_ocr = False
used_tabula = False
# Open PDF
pdf_document = fitz.open(str(file_path))
total_pages = len(pdf_document)
# Collect pages that need OCR
ocr_pages = [] # list of (page_num, temp_path)
page_texts = {} # page_num -> text (if usable)
# Step 1: Try Tabula for digital PDFs with text layers
if self.tabula_available:
tabula_tables = self._extract_tables_with_tabula(str(file_path))
if tabula_tables:
tables.extend(tabula_tables)
used_tabula = True
logger.info(f"Extracted {len(tabula_tables)} tables using Tabula")
# Step 2: Analyze each page for text vs scanned content
ocr_pages = [] # list of (page_num, temp_path) for scanned pages
page_texts = {} # page_num -> text (for digital pages)
for page_num in range(total_pages):
page = pdf_document[page_num]
@@ -608,14 +686,31 @@ class DocumentProcessor:
# Try text extraction first
text = page.get_text()
text_score = self._text_quality_score(text)
# Determine if text is usable (not garbled)
# Threshold 0.5 means at least half of characters are printable ASCII and not replacement
# Determine if page is digital (good text) or scanned (needs OCR)
if text.strip() and text_score >= 0.5:
# Digital page with good text
page_texts[page_num] = text
# If Tabula didn't find tables, try to extract tables from text
if not used_tabula and ("|" in text or "\t" in text):
# Simple table detection from text patterns
lines = text.split('\n')
table_like_lines = [line for line in lines if len(line.split()) > 3]
if len(table_like_lines) > 2:
table_data = [line.split('|') if '|' in line else line.split('\t') for line in table_like_lines]
if table_data and len(table_data) >= 2:
tables.append({
"data": table_data,
"rows": len(table_data),
"columns": max(len(row) for row in table_data) if table_data else 0,
"source": "text_pattern",
"page": page_num + 1
})
else:
# Text is empty, garbled, or low quality -> use OCR
# Scanned page or poor text quality -> use OCR
logger.info(f"Page {page_num + 1} has no usable text (score {text_score:.3f}), using high-resolution OCR")
# Use higher resolution for better OCR accuracy on scanned documents
# Use higher resolution for better OCR accuracy
mat = fitz.Matrix(2, 2) # 2x resolution for better OCR
pix = page.get_pixmap(matrix=mat)
img_data = pix.tobytes("png")
@@ -627,11 +722,11 @@ class DocumentProcessor:
ocr_pages.append((page_num, temp_path))
# Process OCR pages in batch if any
# Step 3: Process scanned pages with OCR if any
if ocr_pages and self.ocr_processor.ocr_available:
try:
temp_paths = [temp_path for _, temp_path in ocr_pages]
logger.info(f"Running batch OCR on {len(temp_paths)} pages")
logger.info(f"Running batch OCR on {len(temp_paths)} scanned pages")
batch_results = self.ocr_processor.extract_text_from_images_batch(temp_paths)
logger.info(f"Batch OCR completed for {len(batch_results)} pages")
@@ -654,24 +749,26 @@ class DocumentProcessor:
logger.info(f"OCR extracted {len(ocr_result['text'])} characters from page {page_num + 1}")
content_parts.append(f"Page {page_num + 1} (OCR):\n{str(ocr_result['text'])}")
processed_with_ocr = True
# Extract tables from OCR using enhanced heuristic method
ocr_tables = self.ocr_processor.extract_tables_from_image(temp_path)
if ocr_tables:
logger.info(f"Found {len(ocr_tables)} tables on page {page_num + 1}")
for table in ocr_tables:
table["source"] = "ocr_enhanced"
table["page"] = page_num + 1
tables.extend(ocr_tables)
else:
logger.warning(f"OCR returned empty text for page {page_num + 1}")
# Don't add empty content, just mark as processed
content_parts.append(f"Page {page_num + 1}: [Scanned content - no text detected by OCR]")
# Extract tables from OCR
ocr_tables = self.ocr_processor.extract_tables_from_image(temp_path)
if ocr_tables:
logger.info(f"Found {len(ocr_tables)} tables on page {page_num + 1}")
tables.extend(ocr_tables)
# Clean up temporary file
if temp_path and os.path.exists(temp_path):
os.unlink(temp_path)
except Exception as batch_error:
logger.error(f"Batch OCR processing failed: {batch_error}")
# Fall back to individual processing for each page
# Fall back to individual processing
for page_num, temp_path in ocr_pages:
try:
ocr_result = self.ocr_processor.extract_text_from_image(temp_path)
@@ -684,6 +781,9 @@ class DocumentProcessor:
# Extract tables
ocr_tables = self.ocr_processor.extract_tables_from_image(temp_path)
if ocr_tables:
for table in ocr_tables:
table["source"] = "ocr_fallback"
table["page"] = page_num + 1
tables.extend(ocr_tables)
except Exception as ocr_error:
logger.error(f"OCR processing failed for page {page_num + 1}: {ocr_error}")
@@ -692,19 +792,18 @@ class DocumentProcessor:
if temp_path and os.path.exists(temp_path):
os.unlink(temp_path)
elif ocr_pages and not self.ocr_processor.ocr_available:
logger.warning("OCR not available, skipping OCR processing")
logger.warning("OCR not available, skipping OCR processing for scanned pages")
for page_num, temp_path in ocr_pages:
content_parts.append(f"Page {page_num + 1}: [Image content - OCR not available]")
if temp_path and os.path.exists(temp_path):
os.unlink(temp_path)
# Add text pages content
# Step 4: Add digital pages content
for page_num, text in page_texts.items():
content_parts.append(f"Page {page_num + 1}:\n{text}")
# Sort content parts by page number
def extract_page_num(part):
# Find the first number after "Page "
import re
match = re.search(r'Page\s+(\d+)', part)
if match:
@@ -721,7 +820,10 @@ class DocumentProcessor:
metadata={
"pages": total_pages,
"file_type": "pdf",
"processed_with_ocr": processed_with_ocr
"processed_with_ocr": processed_with_ocr,
"used_tabula": used_tabula,
"tables_found": len(tables),
"table_sources": list(set(table.get("source", "unknown") for table in tables))
},
tables=tables,
images=images
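For orientation, a sketch of how a caller might read the new metadata fields added above; attribute-style access on ProcessingResult and the surrounding async setup are assumptions, not shown in this diff:

    import asyncio

    async def summarize(processor, path: str) -> None:
        result = await processor.process_document(path)
        meta = result.metadata  # assumed to mirror the dict built above
        print(
            f"pages={meta['pages']} ocr={meta['processed_with_ocr']} "
            f"tabula={meta['used_tabula']} tables={meta['tables_found']} "
            f"sources={meta['table_sources']}"
        )

    # asyncio.run(summarize(DocumentProcessor(), "sample.pdf"))  # constructor arguments assumed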

View File

@@ -412,22 +412,47 @@ class OptimizedOCRProcessor:
def _detect_tables_from_bboxes(self, bboxes: List, text: str) -> List[Dict[str, Any]]:
"""
Detect tables from OCR bounding boxes (compatible with original implementation)
Enhanced table detection from OCR bounding boxes with improved accuracy
Features:
1. Adaptive row grouping based on text height
2. Column alignment detection using common x-coordinates
3. Header row detection based on formatting patterns
4. Table boundary validation
5. Multi-table detection in single image
"""
tables = []
if not bboxes:
if not bboxes or len(bboxes) < 4: # Need at least 4 text elements for a table
return tables
# Group text by rows based on y-coordinates
rows = {}
text_lines = text.split('\n') if text else []
# Step 1: Calculate text height statistics for adaptive row grouping
text_heights = []
for bbox in bboxes:
if not bbox or len(bbox) < 4:
continue
try:
# Get min and max y coordinates
y_coords = [float(point[1]) for point in bbox if point and len(point) >= 2]
if y_coords:
height = max(y_coords) - min(y_coords)
if height > 0:
text_heights.append(height)
except (TypeError, ValueError, IndexError):
continue
avg_text_height = sum(text_heights) / len(text_heights) if text_heights else 20.0
row_tolerance = avg_text_height * 0.8 # 80% of text height for row grouping
# Step 2: Group text by rows with adaptive tolerance
rows = {}
for i, bbox in enumerate(bboxes):
try:
if not bbox:
if not bbox or len(bbox) < 4:
continue
# Calculate y-center of bounding box
y_values = []
for point in bbox:
@@ -445,52 +470,133 @@ class OptimizedOCRProcessor:
else:
y_values.append(0.0)
if y_values:
y_center = sum(y_values) / len(y_values)
else:
y_center = 0.0
if not y_values:
continue
y_center = sum(y_values) / len(y_values)
row_key = round(y_center / 10) # Group by 10-pixel rows
if row_key not in rows:
rows[row_key] = []
row_text = text_lines[i] if i < len(text_lines) else ""
rows[row_key].append((bbox, row_text))
# Find existing row or create new one
row_found = False
for row_key in list(rows.keys()):
if abs(y_center - row_key) <= row_tolerance:
rows[row_key].append((bbox, text_lines[i] if i < len(text_lines) else ""))
row_found = True
break
if not row_found:
rows[y_center] = [(bbox, text_lines[i] if i < len(text_lines) else "")]
except Exception as e:
logger.warning(f"Error processing bbox {i}: {e}")
logger.debug(f"Error processing bbox {i} for table detection: {e}")
continue
# Sort rows and create table structure
sorted_rows = sorted(rows.keys())
if len(rows) < 2: # Need at least 2 rows for a table
return tables
# Step 3: Sort rows by y-coordinate and process each row
sorted_row_keys = sorted(rows.keys())
sorted_rows = [rows[key] for key in sorted_row_keys]
# Step 4: Detect column positions using x-coordinate clustering
all_x_centers = []
for row in sorted_rows:
for bbox, _ in row:
try:
if bbox and len(bbox) >= 4:
x_coords = [float(point[0]) for point in bbox if point and len(point) >= 1]
if x_coords:
x_center = sum(x_coords) / len(x_coords)
all_x_centers.append(x_center)
except (TypeError, ValueError, IndexError):
continue
if not all_x_centers:
return tables
# Simple column clustering: sort x-centers and group by proximity
all_x_centers.sort()
column_positions = []
current_cluster = [all_x_centers[0]]
for x in all_x_centers[1:]:
if x - current_cluster[-1] <= avg_text_height * 1.5: # proximity threshold: 1.5x average text height, used as a proxy for column spacing
current_cluster.append(x)
else:
column_positions.append(sum(current_cluster) / len(current_cluster))
current_cluster = [x]
if current_cluster:
column_positions.append(sum(current_cluster) / len(current_cluster))
# Need at least 2 columns for a table
if len(column_positions) < 2:
return tables
# Step 5: Create table structure with proper cell alignment
column_positions.sort()
table_data = []
column_count = len(column_positions)
for row_key in sorted_rows:
try:
def get_x_coordinate(item):
try:
if (item[0] and len(item[0]) > 0 and
item[0][0] and len(item[0][0]) > 0):
x_val = item[0][0][0]
return float(x_val) if x_val is not None else 0.0
return 0.0
except (TypeError, ValueError, IndexError):
return 0.0
for row in sorted_rows:
# Sort row items by x-coordinate
def get_x_center(item):
try:
bbox = item[0]
if bbox and len(bbox) >= 4:
x_coords = [float(point[0]) for point in bbox if point and len(point) >= 1]
return sum(x_coords) / len(x_coords) if x_coords else 0.0
except (TypeError, ValueError, IndexError):
pass
return 0.0
sorted_row = sorted(row, key=get_x_center)
# Create row with cells aligned to columns
row_cells = [""] * column_count
for bbox, cell_text in sorted_row:
try:
x_center = get_x_center((bbox, cell_text))
# Find closest column
if column_positions:
closest_col = min(range(column_count),
key=lambda i: abs(x_center - column_positions[i]))
# Fill the cell if it is empty; otherwise overwrite only when this text lies within half a text height of the column center
if not row_cells[closest_col] or \
abs(x_center - column_positions[closest_col]) < avg_text_height * 0.5:
row_cells[closest_col] = cell_text
except Exception:
continue
# Only add row if it has meaningful content (not all empty)
if any(cell.strip() for cell in row_cells):
table_data.append(row_cells)
# Step 6: Validate table structure
if len(table_data) >= 2 and column_count >= 2:
# Calculate table consistency score
non_empty_cells = sum(1 for row in table_data for cell in row if cell.strip())
total_cells = len(table_data) * column_count
fill_ratio = non_empty_cells / total_cells if total_cells > 0 else 0
# Only accept tables with reasonable fill ratio (20-90%)
if 0.2 <= fill_ratio <= 0.9:
# Detect potential header row (first row often has different characteristics)
has_header = False
if len(table_data) >= 3:
# Check if first row has more text or different formatting
first_row_text_len = sum(len(cell) for cell in table_data[0])
second_row_text_len = sum(len(cell) for cell in table_data[1])
if first_row_text_len > second_row_text_len * 1.5:
has_header = True
row_items = sorted(rows[row_key], key=get_x_coordinate)
row_text = [item[1] for item in row_items]
table_data.append(row_text)
except Exception as e:
logger.warning(f"Error sorting row {row_key}: {e}")
continue
if len(table_data) > 1: # At least 2 rows for a table
tables.append({
"data": table_data,
"rows": len(table_data),
"columns": max(len(row) for row in table_data) if table_data else 0
})
tables.append({
"data": table_data,
"rows": len(table_data),
"columns": column_count,
"has_header": has_header,
"fill_ratio": fill_ratio,
"type": "detected_table"
})
return tables
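A usage sketch for the detector above, using PaddleOCR-style quadrilateral boxes (four [x, y] corner points per text element, with the OCR text lines aligned to boxes by index); the processor instance and its constructor arguments are assumptions, not shown in this diff:

    def quad(x: float, y: float, w: float = 60, h: float = 20):
        # Axis-aligned box as [top-left, top-right, bottom-right, bottom-left] corners
        return [[x, y], [x + w, y], [x + w, y + h], [x, y + h]]

    # A 3-row, 2-column grid with one empty cell (fill ratio ~0.83, inside the 0.2-0.9 window)
    bboxes = [
        quad(20, 10), quad(200, 10),   # header row
        quad(20, 50), quad(200, 50),   # first data row
        quad(20, 90),                  # second data row, second cell missing
    ]
    text = "\n".join(["Item", "Quantity", "Widget", "3", "Gadget"])

    # ocr = OptimizedOCRProcessor(...)  # constructor arguments not shown in this diff
    # ocr._detect_tables_from_bboxes(bboxes, text) would be expected to return roughly:
    # [{"data": [["Item", "Quantity"], ["Widget", "3"], ["Gadget", ""]],
    #   "rows": 3, "columns": 2, "has_header": True, "fill_ratio": 0.833...,
    #   "type": "detected_table"}]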