table detection enhanced

This commit is contained in:
2026-01-14 15:15:01 +08:00
parent e7256a10ea
commit 1838c37302
14 changed files with 18065490 additions and 71 deletions

View File

@@ -396,6 +396,17 @@ class DocumentProcessor:
logger.info("Image classifier initialized successfully")
except Exception as e:
logger.warning(f"Failed to initialize image classifier: {e}")
# Initialize Tabula for PDF table extraction (optional dependency)
self.tabula_available = False
try:
import tabula
self.tabula_available = True
logger.info("Tabula initialized successfully for PDF table extraction")
except ImportError:
logger.warning("Tabula not available. PDF table extraction will use OCR-based method only.")
except Exception as e:
logger.warning(f"Failed to initialize Tabula: {e}")
async def process_document(self, file_path: str) -> ProcessingResult:
"""Process document based on file extension"""
@@ -569,6 +580,64 @@ class DocumentProcessor:
return processed_images, "\n".join(additional_content)
def _extract_tables_with_tabula(self, pdf_path: str) -> List[Dict[str, Any]]:
    """
    Extract tables from PDF using Tabula (for digital PDFs with text layers)

    Lattice mode (tables drawn with ruling lines) is tried first. Stream
    mode (tables aligned by whitespace) is tried only when lattice mode
    yields no non-empty table.

    Args:
        pdf_path: Path to PDF file

    Returns:
        List of table dictionaries with the keys "data" (list of rows),
        "rows", "columns", "source" (always "tabula"), "table_index" and
        "has_header". Returns an empty list when Tabula is unavailable,
        finds no tables, or raises.
    """
    if not self.tabula_available:
        return []

    try:
        import tabula

        # Tabula applies a single extraction mode per run, so passing
        # lattice=True and stream=True in one call never falls back to
        # stream. Run the modes one after another instead.
        dfs = []
        for mode in ({"lattice": True}, {"stream": True}):
            dfs = tabula.read_pdf(
                pdf_path,
                pages='all',
                multiple_tables=True,
                guess=False,
                silent=True,
                **mode
            ) or []
            if any(df is not None and not df.empty for df in dfs):
                break

        tables = []
        for i, df in enumerate(dfs):
            if df is None or df.empty:
                continue

            # Convert DataFrame to plain rows. Empty cells arrive as float
            # NaN; map them to None so rows compare and serialize cleanly.
            table_data = [
                [None if isinstance(cell, float) and cell != cell else cell
                 for cell in row]
                for row in df.values.tolist()
            ]
            columns = df.columns.tolist()

            # Add column headers as first row if they're meaningful
            has_header = any(col and str(col).strip() for col in columns)
            if has_header:
                table_data.insert(0, columns)

            if table_data:
                tables.append({
                    "data": table_data,
                    "rows": len(table_data),
                    "columns": len(table_data[0]),
                    "source": "tabula",
                    "table_index": i,
                    # True only when a header row was actually prepended
                    "has_header": has_header
                })

        logger.info(f"Tabula extracted {len(tables)} tables from {pdf_path}")
        return tables

    except Exception as e:
        # Best-effort: any Tabula/Java failure falls through to the
        # caller's other extraction paths.
        logger.warning(f"Tabula table extraction failed for {pdf_path}: {e}")
        return []
def _text_quality_score(self, text: str) -> float:
"""Return a score between 0 and 1 indicating text quality.
Higher score means more readable English text."""
@@ -586,21 +655,30 @@ class DocumentProcessor:
return score
async def _process_pdf(self, file_path: Path) -> ProcessingResult:
"""Process PDF files with text extraction and OCR fallback using batch processing"""
"""Process PDF files with hybrid approach: Tabula for digital PDFs, OCR for scanned"""
pdf_document = None
try:
content_parts = []
tables = []
images = []
processed_with_ocr = False
used_tabula = False
# Open PDF
pdf_document = fitz.open(str(file_path))
total_pages = len(pdf_document)
# Collect pages that need OCR
ocr_pages = [] # list of (page_num, temp_path)
page_texts = {} # page_num -> text (if usable)
# Step 1: Try Tabula for digital PDFs with text layers
if self.tabula_available:
tabula_tables = self._extract_tables_with_tabula(str(file_path))
if tabula_tables:
tables.extend(tabula_tables)
used_tabula = True
logger.info(f"Extracted {len(tabula_tables)} tables using Tabula")
# Step 2: Analyze each page for text vs scanned content
ocr_pages = [] # list of (page_num, temp_path) for scanned pages
page_texts = {} # page_num -> text (for digital pages)
for page_num in range(total_pages):
page = pdf_document[page_num]
@@ -608,14 +686,31 @@ class DocumentProcessor:
# Try text extraction first
text = page.get_text()
text_score = self._text_quality_score(text)
# Determine if text is usable (not garbled)
# Threshold 0.5 means at least half of characters are printable ASCII and not replacement
# Determine if page is digital (good text) or scanned (needs OCR)
if text.strip() and text_score >= 0.5:
# Digital page with good text
page_texts[page_num] = text
# If Tabula didn't find tables, try to extract tables from text
if not used_tabula and "|" in text or "\t" in text:
# Simple table detection from text patterns
lines = text.split('\n')
table_like_lines = [line for line in lines if len(line.split()) > 3]
if len(table_like_lines) > 2:
table_data = [line.split('|') if '|' in line else line.split('\t') for line in table_like_lines]
if table_data and len(table_data) >= 2:
tables.append({
"data": table_data,
"rows": len(table_data),
"columns": max(len(row) for row in table_data) if table_data else 0,
"source": "text_pattern",
"page": page_num + 1
})
else:
# Text is empty, garbled, or low quality -> use OCR
# Scanned page or poor text quality -> use OCR
logger.info(f"Page {page_num + 1} has no usable text (score {text_score:.3f}), using high-resolution OCR")
# Use higher resolution for better OCR accuracy on scanned documents
# Use higher resolution for better OCR accuracy
mat = fitz.Matrix(2, 2) # 2x resolution for better OCR
pix = page.get_pixmap(matrix=mat)
img_data = pix.tobytes("png")
@@ -627,11 +722,11 @@ class DocumentProcessor:
ocr_pages.append((page_num, temp_path))
# Process OCR pages in batch if any
# Step 3: Process scanned pages with OCR if any
if ocr_pages and self.ocr_processor.ocr_available:
try:
temp_paths = [temp_path for _, temp_path in ocr_pages]
logger.info(f"Running batch OCR on {len(temp_paths)} pages")
logger.info(f"Running batch OCR on {len(temp_paths)} scanned pages")
batch_results = self.ocr_processor.extract_text_from_images_batch(temp_paths)
logger.info(f"Batch OCR completed for {len(batch_results)} pages")
@@ -654,24 +749,26 @@ class DocumentProcessor:
logger.info(f"OCR extracted {len(ocr_result['text'])} characters from page {page_num + 1}")
content_parts.append(f"Page {page_num + 1} (OCR):\n{str(ocr_result['text'])}")
processed_with_ocr = True
# Extract tables from OCR using enhanced heuristic method
ocr_tables = self.ocr_processor.extract_tables_from_image(temp_path)
if ocr_tables:
logger.info(f"Found {len(ocr_tables)} tables on page {page_num + 1}")
for table in ocr_tables:
table["source"] = "ocr_enhanced"
table["page"] = page_num + 1
tables.extend(ocr_tables)
else:
logger.warning(f"OCR returned empty text for page {page_num + 1}")
# Don't add empty content, just mark as processed
content_parts.append(f"Page {page_num + 1}: [Scanned content - no text detected by OCR]")
# Extract tables from OCR
ocr_tables = self.ocr_processor.extract_tables_from_image(temp_path)
if ocr_tables:
logger.info(f"Found {len(ocr_tables)} tables on page {page_num + 1}")
tables.extend(ocr_tables)
# Clean up temporary file
if temp_path and os.path.exists(temp_path):
os.unlink(temp_path)
except Exception as batch_error:
logger.error(f"Batch OCR processing failed: {batch_error}")
# Fall back to individual processing for each page
# Fall back to individual processing
for page_num, temp_path in ocr_pages:
try:
ocr_result = self.ocr_processor.extract_text_from_image(temp_path)
@@ -684,6 +781,9 @@ class DocumentProcessor:
# Extract tables
ocr_tables = self.ocr_processor.extract_tables_from_image(temp_path)
if ocr_tables:
for table in ocr_tables:
table["source"] = "ocr_fallback"
table["page"] = page_num + 1
tables.extend(ocr_tables)
except Exception as ocr_error:
logger.error(f"OCR processing failed for page {page_num + 1}: {ocr_error}")
@@ -692,19 +792,18 @@ class DocumentProcessor:
if temp_path and os.path.exists(temp_path):
os.unlink(temp_path)
elif ocr_pages and not self.ocr_processor.ocr_available:
logger.warning("OCR not available, skipping OCR processing")
logger.warning("OCR not available, skipping OCR processing for scanned pages")
for page_num, temp_path in ocr_pages:
content_parts.append(f"Page {page_num + 1}: [Image content - OCR not available]")
if temp_path and os.path.exists(temp_path):
os.unlink(temp_path)
# Add text pages content
# Step 4: Add digital pages content
for page_num, text in page_texts.items():
content_parts.append(f"Page {page_num + 1}:\n{text}")
# Sort content parts by page number
def extract_page_num(part):
# Find the first number after "Page "
import re
match = re.search(r'Page\s+(\d+)', part)
if match:
@@ -721,7 +820,10 @@ class DocumentProcessor:
metadata={
"pages": total_pages,
"file_type": "pdf",
"processed_with_ocr": processed_with_ocr
"processed_with_ocr": processed_with_ocr,
"used_tabula": used_tabula,
"tables_found": len(tables),
"table_sources": list(set(table.get("source", "unknown") for table in tables))
},
tables=tables,
images=images