table detection enhanced
This commit is contained in:
@@ -396,6 +396,17 @@ class DocumentProcessor:
|
||||
logger.info("Image classifier initialized successfully")
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to initialize image classifier: {e}")
|
||||
|
||||
# Initialize Tabula for PDF table extraction (optional dependency)
|
||||
self.tabula_available = False
|
||||
try:
|
||||
import tabula
|
||||
self.tabula_available = True
|
||||
logger.info("Tabula initialized successfully for PDF table extraction")
|
||||
except ImportError:
|
||||
logger.warning("Tabula not available. PDF table extraction will use OCR-based method only.")
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to initialize Tabula: {e}")
|
||||
|
||||
async def process_document(self, file_path: str) -> ProcessingResult:
|
||||
"""Process document based on file extension"""
|
||||
@@ -569,6 +580,64 @@ class DocumentProcessor:
|
||||
|
||||
return processed_images, "\n".join(additional_content)
|
||||
|
||||
def _extract_tables_with_tabula(self, pdf_path: str) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Extract tables from PDF using Tabula (for digital PDFs with text layers)
|
||||
|
||||
Args:
|
||||
pdf_path: Path to PDF file
|
||||
|
||||
Returns:
|
||||
List of table dictionaries
|
||||
"""
|
||||
if not self.tabula_available:
|
||||
return []
|
||||
|
||||
try:
|
||||
import tabula
|
||||
import pandas as pd
|
||||
|
||||
# Try to extract tables from all pages
|
||||
tables = []
|
||||
|
||||
# Use Tabula to extract tables
|
||||
dfs = tabula.read_pdf(
|
||||
pdf_path,
|
||||
pages='all',
|
||||
multiple_tables=True,
|
||||
lattice=True, # Try lattice mode first (for bordered tables)
|
||||
stream=True, # Fall back to stream mode (for borderless tables)
|
||||
guess=False,
|
||||
silent=True
|
||||
)
|
||||
|
||||
for i, df in enumerate(dfs):
|
||||
if df is not None and not df.empty:
|
||||
# Convert DataFrame to table structure
|
||||
table_data = df.values.tolist()
|
||||
columns = df.columns.tolist()
|
||||
|
||||
# Add column headers as first row if they're meaningful
|
||||
if any(col and str(col).strip() for col in columns):
|
||||
table_data.insert(0, columns)
|
||||
|
||||
if table_data:
|
||||
tables.append({
|
||||
"data": table_data,
|
||||
"rows": len(table_data),
|
||||
"columns": len(table_data[0]) if table_data else 0,
|
||||
"source": "tabula",
|
||||
"table_index": i,
|
||||
"has_header": True if columns else False
|
||||
})
|
||||
|
||||
logger.info(f"Tabula extracted {len(tables)} tables from {pdf_path}")
|
||||
return tables
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Tabula table extraction failed for {pdf_path}: {e}")
|
||||
return []
|
||||
|
||||
def _text_quality_score(self, text: str) -> float:
|
||||
"""Return a score between 0 and 1 indicating text quality.
|
||||
Higher score means more readable English text."""
|
||||
@@ -586,21 +655,30 @@ class DocumentProcessor:
|
||||
return score
|
||||
|
||||
async def _process_pdf(self, file_path: Path) -> ProcessingResult:
|
||||
"""Process PDF files with text extraction and OCR fallback using batch processing"""
|
||||
"""Process PDF files with hybrid approach: Tabula for digital PDFs, OCR for scanned"""
|
||||
pdf_document = None
|
||||
try:
|
||||
content_parts = []
|
||||
tables = []
|
||||
images = []
|
||||
processed_with_ocr = False
|
||||
used_tabula = False
|
||||
|
||||
# Open PDF
|
||||
pdf_document = fitz.open(str(file_path))
|
||||
total_pages = len(pdf_document)
|
||||
|
||||
# Collect pages that need OCR
|
||||
ocr_pages = [] # list of (page_num, temp_path)
|
||||
page_texts = {} # page_num -> text (if usable)
|
||||
# Step 1: Try Tabula for digital PDFs with text layers
|
||||
if self.tabula_available:
|
||||
tabula_tables = self._extract_tables_with_tabula(str(file_path))
|
||||
if tabula_tables:
|
||||
tables.extend(tabula_tables)
|
||||
used_tabula = True
|
||||
logger.info(f"Extracted {len(tabula_tables)} tables using Tabula")
|
||||
|
||||
# Step 2: Analyze each page for text vs scanned content
|
||||
ocr_pages = [] # list of (page_num, temp_path) for scanned pages
|
||||
page_texts = {} # page_num -> text (for digital pages)
|
||||
|
||||
for page_num in range(total_pages):
|
||||
page = pdf_document[page_num]
|
||||
@@ -608,14 +686,31 @@ class DocumentProcessor:
|
||||
# Try text extraction first
|
||||
text = page.get_text()
|
||||
text_score = self._text_quality_score(text)
|
||||
# Determine if text is usable (not garbled)
|
||||
# Threshold 0.5 means at least half of characters are printable ASCII and not replacement
|
||||
|
||||
# Determine if page is digital (good text) or scanned (needs OCR)
|
||||
if text.strip() and text_score >= 0.5:
|
||||
# Digital page with good text
|
||||
page_texts[page_num] = text
|
||||
|
||||
# If Tabula didn't find tables, try to extract tables from text
|
||||
if not used_tabula and "|" in text or "\t" in text:
|
||||
# Simple table detection from text patterns
|
||||
lines = text.split('\n')
|
||||
table_like_lines = [line for line in lines if len(line.split()) > 3]
|
||||
if len(table_like_lines) > 2:
|
||||
table_data = [line.split('|') if '|' in line else line.split('\t') for line in table_like_lines]
|
||||
if table_data and len(table_data) >= 2:
|
||||
tables.append({
|
||||
"data": table_data,
|
||||
"rows": len(table_data),
|
||||
"columns": max(len(row) for row in table_data) if table_data else 0,
|
||||
"source": "text_pattern",
|
||||
"page": page_num + 1
|
||||
})
|
||||
else:
|
||||
# Text is empty, garbled, or low quality -> use OCR
|
||||
# Scanned page or poor text quality -> use OCR
|
||||
logger.info(f"Page {page_num + 1} has no usable text (score {text_score:.3f}), using high-resolution OCR")
|
||||
# Use higher resolution for better OCR accuracy on scanned documents
|
||||
# Use higher resolution for better OCR accuracy
|
||||
mat = fitz.Matrix(2, 2) # 2x resolution for better OCR
|
||||
pix = page.get_pixmap(matrix=mat)
|
||||
img_data = pix.tobytes("png")
|
||||
@@ -627,11 +722,11 @@ class DocumentProcessor:
|
||||
|
||||
ocr_pages.append((page_num, temp_path))
|
||||
|
||||
# Process OCR pages in batch if any
|
||||
# Step 3: Process scanned pages with OCR if any
|
||||
if ocr_pages and self.ocr_processor.ocr_available:
|
||||
try:
|
||||
temp_paths = [temp_path for _, temp_path in ocr_pages]
|
||||
logger.info(f"Running batch OCR on {len(temp_paths)} pages")
|
||||
logger.info(f"Running batch OCR on {len(temp_paths)} scanned pages")
|
||||
batch_results = self.ocr_processor.extract_text_from_images_batch(temp_paths)
|
||||
logger.info(f"Batch OCR completed for {len(batch_results)} pages")
|
||||
|
||||
@@ -654,24 +749,26 @@ class DocumentProcessor:
|
||||
logger.info(f"OCR extracted {len(ocr_result['text'])} characters from page {page_num + 1}")
|
||||
content_parts.append(f"Page {page_num + 1} (OCR):\n{str(ocr_result['text'])}")
|
||||
processed_with_ocr = True
|
||||
|
||||
# Extract tables from OCR using enhanced heuristic method
|
||||
ocr_tables = self.ocr_processor.extract_tables_from_image(temp_path)
|
||||
if ocr_tables:
|
||||
logger.info(f"Found {len(ocr_tables)} tables on page {page_num + 1}")
|
||||
for table in ocr_tables:
|
||||
table["source"] = "ocr_enhanced"
|
||||
table["page"] = page_num + 1
|
||||
tables.extend(ocr_tables)
|
||||
else:
|
||||
logger.warning(f"OCR returned empty text for page {page_num + 1}")
|
||||
# Don't add empty content, just mark as processed
|
||||
content_parts.append(f"Page {page_num + 1}: [Scanned content - no text detected by OCR]")
|
||||
|
||||
# Extract tables from OCR
|
||||
ocr_tables = self.ocr_processor.extract_tables_from_image(temp_path)
|
||||
if ocr_tables:
|
||||
logger.info(f"Found {len(ocr_tables)} tables on page {page_num + 1}")
|
||||
tables.extend(ocr_tables)
|
||||
|
||||
# Clean up temporary file
|
||||
if temp_path and os.path.exists(temp_path):
|
||||
os.unlink(temp_path)
|
||||
|
||||
except Exception as batch_error:
|
||||
logger.error(f"Batch OCR processing failed: {batch_error}")
|
||||
# Fall back to individual processing for each page
|
||||
# Fall back to individual processing
|
||||
for page_num, temp_path in ocr_pages:
|
||||
try:
|
||||
ocr_result = self.ocr_processor.extract_text_from_image(temp_path)
|
||||
@@ -684,6 +781,9 @@ class DocumentProcessor:
|
||||
# Extract tables
|
||||
ocr_tables = self.ocr_processor.extract_tables_from_image(temp_path)
|
||||
if ocr_tables:
|
||||
for table in ocr_tables:
|
||||
table["source"] = "ocr_fallback"
|
||||
table["page"] = page_num + 1
|
||||
tables.extend(ocr_tables)
|
||||
except Exception as ocr_error:
|
||||
logger.error(f"OCR processing failed for page {page_num + 1}: {ocr_error}")
|
||||
@@ -692,19 +792,18 @@ class DocumentProcessor:
|
||||
if temp_path and os.path.exists(temp_path):
|
||||
os.unlink(temp_path)
|
||||
elif ocr_pages and not self.ocr_processor.ocr_available:
|
||||
logger.warning("OCR not available, skipping OCR processing")
|
||||
logger.warning("OCR not available, skipping OCR processing for scanned pages")
|
||||
for page_num, temp_path in ocr_pages:
|
||||
content_parts.append(f"Page {page_num + 1}: [Image content - OCR not available]")
|
||||
if temp_path and os.path.exists(temp_path):
|
||||
os.unlink(temp_path)
|
||||
|
||||
# Add text pages content
|
||||
# Step 4: Add digital pages content
|
||||
for page_num, text in page_texts.items():
|
||||
content_parts.append(f"Page {page_num + 1}:\n{text}")
|
||||
|
||||
# Sort content parts by page number
|
||||
def extract_page_num(part):
|
||||
# Find the first number after "Page "
|
||||
import re
|
||||
match = re.search(r'Page\s+(\d+)', part)
|
||||
if match:
|
||||
@@ -721,7 +820,10 @@ class DocumentProcessor:
|
||||
metadata={
|
||||
"pages": total_pages,
|
||||
"file_type": "pdf",
|
||||
"processed_with_ocr": processed_with_ocr
|
||||
"processed_with_ocr": processed_with_ocr,
|
||||
"used_tabula": used_tabula,
|
||||
"tables_found": len(tables),
|
||||
"table_sources": list(set(table.get("source", "unknown") for table in tables))
|
||||
},
|
||||
tables=tables,
|
||||
images=images
|
||||
|
||||
@@ -412,22 +412,47 @@ class OptimizedOCRProcessor:
|
||||
|
||||
def _detect_tables_from_bboxes(self, bboxes: List, text: str) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Detect tables from OCR bounding boxes (compatible with original implementation)
|
||||
Enhanced table detection from OCR bounding boxes with improved accuracy
|
||||
|
||||
Features:
|
||||
1. Adaptive row grouping based on text height
|
||||
2. Column alignment detection using common x-coordinates
|
||||
3. Header row detection based on formatting patterns
|
||||
4. Table boundary validation
|
||||
5. Multi-table detection in single image
|
||||
"""
|
||||
tables = []
|
||||
|
||||
if not bboxes:
|
||||
if not bboxes or len(bboxes) < 4: # Need at least 4 text elements for a table
|
||||
return tables
|
||||
|
||||
# Group text by rows based on y-coordinates
|
||||
rows = {}
|
||||
text_lines = text.split('\n') if text else []
|
||||
|
||||
# Step 1: Calculate text height statistics for adaptive row grouping
|
||||
text_heights = []
|
||||
for bbox in bboxes:
|
||||
if not bbox or len(bbox) < 4:
|
||||
continue
|
||||
try:
|
||||
# Get min and max y coordinates
|
||||
y_coords = [float(point[1]) for point in bbox if point and len(point) >= 2]
|
||||
if y_coords:
|
||||
height = max(y_coords) - min(y_coords)
|
||||
if height > 0:
|
||||
text_heights.append(height)
|
||||
except (TypeError, ValueError, IndexError):
|
||||
continue
|
||||
|
||||
avg_text_height = sum(text_heights) / len(text_heights) if text_heights else 20.0
|
||||
row_tolerance = avg_text_height * 0.8 # 80% of text height for row grouping
|
||||
|
||||
# Step 2: Group text by rows with adaptive tolerance
|
||||
rows = {}
|
||||
for i, bbox in enumerate(bboxes):
|
||||
try:
|
||||
if not bbox:
|
||||
if not bbox or len(bbox) < 4:
|
||||
continue
|
||||
|
||||
|
||||
# Calculate y-center of bounding box
|
||||
y_values = []
|
||||
for point in bbox:
|
||||
@@ -445,52 +470,133 @@ class OptimizedOCRProcessor:
|
||||
else:
|
||||
y_values.append(0.0)
|
||||
|
||||
if y_values:
|
||||
y_center = sum(y_values) / len(y_values)
|
||||
else:
|
||||
y_center = 0.0
|
||||
if not y_values:
|
||||
continue
|
||||
|
||||
y_center = sum(y_values) / len(y_values)
|
||||
|
||||
row_key = round(y_center / 10) # Group by 10-pixel rows
|
||||
|
||||
if row_key not in rows:
|
||||
rows[row_key] = []
|
||||
|
||||
row_text = text_lines[i] if i < len(text_lines) else ""
|
||||
rows[row_key].append((bbox, row_text))
|
||||
# Find existing row or create new one
|
||||
row_found = False
|
||||
for row_key in list(rows.keys()):
|
||||
if abs(y_center - row_key) <= row_tolerance:
|
||||
rows[row_key].append((bbox, text_lines[i] if i < len(text_lines) else ""))
|
||||
row_found = True
|
||||
break
|
||||
|
||||
if not row_found:
|
||||
rows[y_center] = [(bbox, text_lines[i] if i < len(text_lines) else "")]
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Error processing bbox {i}: {e}")
|
||||
logger.debug(f"Error processing bbox {i} for table detection: {e}")
|
||||
continue
|
||||
|
||||
# Sort rows and create table structure
|
||||
sorted_rows = sorted(rows.keys())
|
||||
if len(rows) < 2: # Need at least 2 rows for a table
|
||||
return tables
|
||||
|
||||
# Step 3: Sort rows by y-coordinate and process each row
|
||||
sorted_row_keys = sorted(rows.keys())
|
||||
sorted_rows = [rows[key] for key in sorted_row_keys]
|
||||
|
||||
# Step 4: Detect column positions using x-coordinate clustering
|
||||
all_x_centers = []
|
||||
for row in sorted_rows:
|
||||
for bbox, _ in row:
|
||||
try:
|
||||
if bbox and len(bbox) >= 4:
|
||||
x_coords = [float(point[0]) for point in bbox if point and len(point) >= 1]
|
||||
if x_coords:
|
||||
x_center = sum(x_coords) / len(x_coords)
|
||||
all_x_centers.append(x_center)
|
||||
except (TypeError, ValueError, IndexError):
|
||||
continue
|
||||
|
||||
if not all_x_centers:
|
||||
return tables
|
||||
|
||||
# Simple column clustering: sort x-centers and group by proximity
|
||||
all_x_centers.sort()
|
||||
column_positions = []
|
||||
current_cluster = [all_x_centers[0]]
|
||||
|
||||
for x in all_x_centers[1:]:
|
||||
if x - current_cluster[-1] <= avg_text_height * 1.5: # 1.5x text width tolerance
|
||||
current_cluster.append(x)
|
||||
else:
|
||||
column_positions.append(sum(current_cluster) / len(current_cluster))
|
||||
current_cluster = [x]
|
||||
|
||||
if current_cluster:
|
||||
column_positions.append(sum(current_cluster) / len(current_cluster))
|
||||
|
||||
# Need at least 2 columns for a table
|
||||
if len(column_positions) < 2:
|
||||
return tables
|
||||
|
||||
# Step 5: Create table structure with proper cell alignment
|
||||
column_positions.sort()
|
||||
table_data = []
|
||||
column_count = len(column_positions)
|
||||
|
||||
for row_key in sorted_rows:
|
||||
try:
|
||||
def get_x_coordinate(item):
|
||||
try:
|
||||
if (item[0] and len(item[0]) > 0 and
|
||||
item[0][0] and len(item[0][0]) > 0):
|
||||
x_val = item[0][0][0]
|
||||
return float(x_val) if x_val is not None else 0.0
|
||||
return 0.0
|
||||
except (TypeError, ValueError, IndexError):
|
||||
return 0.0
|
||||
for row in sorted_rows:
|
||||
# Sort row items by x-coordinate
|
||||
def get_x_center(item):
|
||||
try:
|
||||
bbox = item[0]
|
||||
if bbox and len(bbox) >= 4:
|
||||
x_coords = [float(point[0]) for point in bbox if point and len(point) >= 1]
|
||||
return sum(x_coords) / len(x_coords) if x_coords else 0.0
|
||||
except (TypeError, ValueError, IndexError):
|
||||
pass
|
||||
return 0.0
|
||||
|
||||
sorted_row = sorted(row, key=get_x_center)
|
||||
|
||||
# Create row with cells aligned to columns
|
||||
row_cells = [""] * column_count
|
||||
for bbox, cell_text in sorted_row:
|
||||
try:
|
||||
x_center = get_x_center((bbox, cell_text))
|
||||
# Find closest column
|
||||
if column_positions:
|
||||
closest_col = min(range(column_count),
|
||||
key=lambda i: abs(x_center - column_positions[i]))
|
||||
# Only assign if cell is empty or this text is closer to column center
|
||||
if not row_cells[closest_col] or \
|
||||
abs(x_center - column_positions[closest_col]) < avg_text_height * 0.5:
|
||||
row_cells[closest_col] = cell_text
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
# Only add row if it has meaningful content (not all empty)
|
||||
if any(cell.strip() for cell in row_cells):
|
||||
table_data.append(row_cells)
|
||||
|
||||
# Step 6: Validate table structure
|
||||
if len(table_data) >= 2 and column_count >= 2:
|
||||
# Calculate table consistency score
|
||||
non_empty_cells = sum(1 for row in table_data for cell in row if cell.strip())
|
||||
total_cells = len(table_data) * column_count
|
||||
fill_ratio = non_empty_cells / total_cells if total_cells > 0 else 0
|
||||
|
||||
# Only accept tables with reasonable fill ratio (20-90%)
|
||||
if 0.2 <= fill_ratio <= 0.9:
|
||||
# Detect potential header row (first row often has different characteristics)
|
||||
has_header = False
|
||||
if len(table_data) >= 3:
|
||||
# Check if first row has more text or different formatting
|
||||
first_row_text_len = sum(len(cell) for cell in table_data[0])
|
||||
second_row_text_len = sum(len(cell) for cell in table_data[1])
|
||||
if first_row_text_len > second_row_text_len * 1.5:
|
||||
has_header = True
|
||||
|
||||
row_items = sorted(rows[row_key], key=get_x_coordinate)
|
||||
row_text = [item[1] for item in row_items]
|
||||
table_data.append(row_text)
|
||||
except Exception as e:
|
||||
logger.warning(f"Error sorting row {row_key}: {e}")
|
||||
continue
|
||||
|
||||
if len(table_data) > 1: # At least 2 rows for a table
|
||||
tables.append({
|
||||
"data": table_data,
|
||||
"rows": len(table_data),
|
||||
"columns": max(len(row) for row in table_data) if table_data else 0
|
||||
})
|
||||
tables.append({
|
||||
"data": table_data,
|
||||
"rows": len(table_data),
|
||||
"columns": column_count,
|
||||
"has_header": has_header,
|
||||
"fill_ratio": fill_ratio,
|
||||
"type": "detected_table"
|
||||
})
|
||||
|
||||
return tables
|
||||
|
||||
|
||||
Reference in New Issue
Block a user