# Source: railseek6/LightRAG-main/lightrag/document_processor.py
# (snapshot captured 2026-01-14 15:15:01 +08:00)
"""
Multi-format Document Processing Pipeline for LightRAG
Supports PDF, images, Office documents, and more with GPU acceleration
Enhanced with text-first extraction and isolated image classification
"""
import os
import logging
import asyncio
from typing import Dict, List, Any, Optional, Union, Tuple
from dataclasses import dataclass
import tempfile
from pathlib import Path
# Import required libraries
import fitz # PyMuPDF
import docx
import openpyxl
from pptx import Presentation
from bs4 import BeautifulSoup
import pandas as pd
from .production_config import get_config
# Import optimized image classifier using subprocess isolation
import sys
import os
# Add the workspace directory to path where fast_image_classifier.py is located
workspace_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
if workspace_dir not in sys.path:
sys.path.insert(0, workspace_dir)
from fast_image_classifier import get_image_classifier
# Import optimized OCR processor
from .optimized_ocr_processor import OptimizedOCRProcessor
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
@dataclass
class ProcessingResult:
    """Result of document processing.

    Produced by the per-format handlers in DocumentProcessor; `tables` and
    `images` stay None for formats that extract neither.
    """
    # Whether processing completed without a fatal error
    success: bool
    # Extracted plain-text content (may be empty)
    content: str
    # Format-specific metadata (page/sheet/slide counts, OCR flags, errors)
    metadata: Dict[str, Any]
    # Human-readable error message when success is False
    error: Optional[str] = None
    # Extracted table structures, or None when not applicable
    tables: Optional[List[Dict[str, Any]]] = None
    # Per-image metadata (OCR text, classification), or None when not applicable
    images: Optional[List[Dict[str, Any]]] = None
class OCRProcessor:
    """GPU-accelerated OCR processing using PaddleOCR with process-per-request isolation"""

    def __init__(self, use_gpu: bool = True, languages: Optional[List[str]] = None):
        """Probe PaddleOCR availability via a throwaway subprocess.

        Args:
            use_gpu: request GPU inference in the spawned OCR processes.
            languages: OCR language codes; defaults to ['en', 'ch'].
        """
        self.use_gpu = use_gpu
        self.languages = languages or ['en', 'ch']
        self.ocr_available = False
        # close()/__del__ check self._process, but nothing in this class ever
        # spawns a long-lived worker; initialize it explicitly to avoid an
        # AttributeError during cleanup/destruction.
        self._process = None
        # Temp directory holding the generated OCR scripts; removed by close().
        self._temp_dir = None
        self._initialize_ocr()
def _initialize_ocr(self):
"""Initialize PaddleOCR by testing if it can be loaded"""
try:
logger.info("Testing PaddleOCR availability with process-per-request approach")
# Create a simple test script to verify OCR works
test_script = """
import sys
import json
from paddleocr import PaddleOCR
try:
# Test OCR initialization
ocr = PaddleOCR(use_gpu=True, use_angle_cls=True, lang='en', show_log=False, gpu_mem=2000)
print("PaddleOCR test: SUCCESS")
sys.exit(0)
except Exception as e:
print(f"PaddleOCR test: FAILED - {e}")
sys.exit(1)
"""
import tempfile
import subprocess
# Create temporary directory
self._temp_dir = tempfile.mkdtemp(prefix="paddleocr_")
script_path = os.path.join(self._temp_dir, "test_ocr.py")
with open(script_path, 'w') as f:
f.write(test_script)
# Run test
env = os.environ.copy()
result = subprocess.run(
[sys.executable, script_path],
capture_output=True,
text=True,
timeout=30,
env=env
)
if result.returncode == 0:
self.ocr_available = True
logger.info("PaddleOCR is available for process-per-request OCR")
else:
logger.error(f"PaddleOCR test failed: {result.stderr}")
self.ocr_available = False
except Exception as e:
logger.error(f"Failed to initialize OCR processor: {e}")
self.ocr_available = False
def extract_text_from_image(self, image_path: str) -> Dict[str, Any]:
"""Extract text from image using isolated OCR process per request"""
if not self.ocr_available:
return {"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0}
try:
import tempfile
import subprocess
import json
# OCR script that processes one image and returns JSON result
ocr_script = """
import sys
import json
from paddleocr import PaddleOCR
def extract_text_from_image(image_path):
try:
ocr_engine = PaddleOCR(
use_gpu=True,
use_angle_cls=True,
lang='en',
show_log=False,
gpu_mem=2000
)
result = ocr_engine.ocr(image_path)
if not result or not result[0]:
return {"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0}
extracted_text = []
bboxes = []
total_confidence = 0.0
line_count = 0
for line in result[0]:
try:
if len(line) == 2:
bbox, (text, confidence) = line
elif len(line) >= 1:
bbox = line[0] if len(line) > 0 else []
if len(line) > 1:
if isinstance(line[1], (list, tuple)) and len(line[1]) >= 2:
text, confidence = line[1][0], line[1][1]
else:
text, confidence = str(line[1]) if len(line) > 1 else "", 0.0
else:
text, confidence = "", 0.0
else:
continue
text_str = str(text) if text is not None else ""
confidence_float = 0.0
if confidence is not None:
if isinstance(confidence, (int, float)):
confidence_float = float(confidence)
elif isinstance(confidence, str):
try:
confidence_float = float(confidence)
except ValueError:
confidence_float = 0.0
else:
confidence_float = 0.0
else:
confidence_float = 0.0
extracted_text.append(text_str)
bboxes.append(bbox)
total_confidence += confidence_float
line_count += 1
except (TypeError, ValueError, IndexError) as e:
extracted_text.append("")
bboxes.append([])
total_confidence += 0.0
line_count += 1
avg_confidence = total_confidence / line_count if line_count > 0 else 0.0
full_text = "\\\\n".join(extracted_text)
return {
"text": full_text,
"confidence": avg_confidence,
"bboxes": bboxes,
"line_count": line_count
}
except Exception as e:
return {"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0}
# Main execution
if __name__ == "__main__":
image_path = sys.argv[1]
try:
result = extract_text_from_image(image_path)
print(json.dumps(result))
except Exception as e:
print(json.dumps({"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0, "error": str(e)}))
"""
# Write OCR script
script_path = os.path.join(self._temp_dir, "ocr_single.py")
with open(script_path, 'w') as f:
f.write(ocr_script)
# Run OCR process
env = os.environ.copy()
result = subprocess.run(
[sys.executable, script_path, image_path],
capture_output=True,
text=True,
timeout=60, # 60 second timeout for OCR
env=env
)
if result.returncode == 0:
try:
ocr_result = json.loads(result.stdout)
return ocr_result
except json.JSONDecodeError:
logger.error(f"Failed to parse OCR result: {result.stdout}")
return {"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0}
else:
logger.error(f"OCR process failed with return code {result.returncode}: {result.stderr}")
return {"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0}
except subprocess.TimeoutExpired:
logger.error("OCR processing timeout")
return {"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0}
except Exception as e:
logger.error(f"OCR request failed: {e}")
return {"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0}
def close(self):
"""Close the OCR process"""
if self._process:
try:
exit_request = {"action": "exit"}
self._process.stdin.write(json.dumps(exit_request) + '\n')
self._process.stdin.flush()
self._process.wait(timeout=5)
except:
self._process.terminate()
finally:
self._process = None
if self._temp_dir and os.path.exists(self._temp_dir):
import shutil
try:
shutil.rmtree(self._temp_dir)
except:
pass
def __del__(self):
"""Destructor to ensure cleanup"""
self.close()
def extract_tables_from_image(self, image_path: str) -> List[Dict[str, Any]]:
"""Extract tables from image using OCR and layout analysis"""
try:
# Use OCR to get text with bounding boxes
ocr_result = self.extract_text_from_image(image_path)
# Simple table detection based on text alignment
tables = self._detect_tables_from_bboxes(ocr_result["bboxes"], ocr_result["text"])
return tables
except Exception as e:
logger.error(f"Table extraction failed: {e}")
return []
def _detect_tables_from_bboxes(self, bboxes: List, text: str) -> List[Dict[str, Any]]:
"""Detect tables from OCR bounding boxes"""
tables = []
if not bboxes:
return tables
# Group text by rows based on y-coordinates
rows = {}
for i, bbox in enumerate(bboxes):
try:
# Ensure all points are converted to float with proper error handling
y_values = []
for point in bbox:
if point and len(point) >= 2:
try:
# Ensure we convert both coordinates to float with explicit type safety
y_val = point[1]
if isinstance(y_val, (int, float)):
y_values.append(float(y_val))
elif isinstance(y_val, str):
y_values.append(float(y_val))
else:
logger.warning(f"Unexpected y-coordinate type: {type(y_val)}, value: {y_val}")
y_values.append(0.0)
except (TypeError, ValueError) as conv_error:
logger.warning(f"Type conversion error for y-coordinate {point[1]}: {conv_error}")
y_values.append(0.0)
else:
y_values.append(0.0)
# Safe calculation of y_center with explicit float conversion
try:
if y_values:
# Convert all values to float explicitly and handle any remaining type issues
float_y_values = []
for val in y_values:
try:
float_y_values.append(float(val))
except (TypeError, ValueError):
float_y_values.append(0.0)
y_center = sum(float_y_values) / len(float_y_values)
else:
y_center = 0.0
except (TypeError, ZeroDivisionError) as calc_error:
logger.warning(f"Error calculating y_center: {calc_error}")
y_center = 0.0
row_key = round(y_center / 10) # Group by 10-pixel rows
if row_key not in rows:
rows[row_key] = []
# Safe text extraction with bounds checking
text_lines = text.split('\n')
row_text = text_lines[i] if i < len(text_lines) else ""
rows[row_key].append((bbox, row_text))
except Exception as e:
logger.warning(f"Error processing bbox {i}: {e}")
continue
# Sort rows and create table structure
sorted_rows = sorted(rows.keys())
table_data = []
for row_key in sorted_rows:
try:
# Ensure all x-coordinates are converted to float with proper error handling
def get_x_coordinate(item):
try:
if (item[0] and len(item[0]) > 0 and
item[0][0] and len(item[0][0]) > 0):
# Explicit float conversion with error handling
x_val = item[0][0][0]
return float(x_val) if x_val is not None else 0.0
return 0.0
except (TypeError, ValueError, IndexError) as x_error:
logger.warning(f"Error getting x-coordinate: {x_error}")
return 0.0
row_items = sorted(rows[row_key], key=get_x_coordinate)
row_text = [item[1] for item in row_items]
table_data.append(row_text)
except Exception as e:
logger.warning(f"Error sorting row {row_key}: {e}")
continue
if len(table_data) > 1: # At least 2 rows for a table
tables.append({
"data": table_data,
"rows": len(table_data),
"columns": max(len(row) for row in table_data) if table_data else 0
})
return tables
class DocumentProcessor:
    """Main document processor for multiple file formats"""

    def __init__(self):
        # Central config object drives GPU usage, languages and extensions
        self.config = get_config()
        self.ocr_processor = OptimizedOCRProcessor(
            use_gpu=self.config.performance.USE_GPU,
            languages=self.config.document_processing.OCR_LANGUAGES,
            batch_size=4,  # Process 4 images at a time for better performance
            max_workers=2  # Use 2 parallel workers for async operations
        )
        self.supported_extensions = self.config.document_processing.SUPPORTED_EXTENSIONS
        # Initialize image classifier if available
        # NOTE(review): get_image_classifier is imported unconditionally at
        # module level, so this truthiness guard only matters if that import
        # is made optional — confirm intent.
        self.image_classifier = None
        if get_image_classifier:
            try:
                self.image_classifier = get_image_classifier()
                logger.info("Image classifier initialized successfully")
            except Exception as e:
                logger.warning(f"Failed to initialize image classifier: {e}")
        # Initialize Tabula for PDF table extraction (optional dependency);
        # only the import is probed here — actual extraction happens later.
        self.tabula_available = False
        try:
            import tabula
            self.tabula_available = True
            logger.info("Tabula initialized successfully for PDF table extraction")
        except ImportError:
            logger.warning("Tabula not available. PDF table extraction will use OCR-based method only.")
        except Exception as e:
            logger.warning(f"Failed to initialize Tabula: {e}")
async def process_document(self, file_path: str) -> ProcessingResult:
"""Process document based on file extension"""
file_path = Path(file_path)
if not file_path.exists():
return ProcessingResult(
success=False,
content="",
metadata={"error": "File not found"},
error="File not found"
)
# Determine file type and process accordingly
extension = file_path.suffix.lower()
try:
if extension in ['.pdf']:
return await self._process_pdf(file_path)
elif extension in ['.doc', '.docx']:
return await self._process_word(file_path)
elif extension in ['.xls', '.xlsx']:
return await self._process_excel(file_path)
elif extension in ['.ppt', '.pptx']:
return await self._process_powerpoint(file_path)
elif extension in ['.txt', '.csv', '.html']:
return await self._process_text(file_path)
elif extension in ['.jpg', '.jpeg', '.png', '.tiff', '.bmp', '.gif']:
return await self._process_image(file_path)
else:
return ProcessingResult(
success=False,
content="",
metadata={"error": f"Unsupported file type: {extension}"},
error=f"Unsupported file type: {extension}"
)
except Exception as e:
logger.error(f"Error processing {file_path}: {e}")
return ProcessingResult(
success=False,
content="",
metadata={"error": str(e)},
error=str(e)
)
    def _extract_and_process_images(self, images: List[Any], file_type: str) -> Tuple[List[Dict[str, Any]], str]:
        """
        Extract and process images from documents with batch OCR processing
        Returns processed images metadata and additional content from OCR

        Args:
            images: raw image objects; interpretation depends on file_type
                ('word' -> python-docx inline_shape, 'pdf' -> PyMuPDF pixmap,
                anything else -> raw image bytes).
            file_type: source document kind, selects how bytes are obtained.

        Returns:
            Tuple of (per-image metadata dicts, newline-joined OCR /
            classification text suitable for appending to document content).
        """
        processed_images = []
        additional_content = []
        temp_paths = []
        temp_files = []
        # Step 1: Save all images to temporary files
        for i, image_data in enumerate(images):
            temp_path = None
            try:
                # Save image to temporary file (delete=False: the path must
                # outlive this `with` block for the batch OCR step below)
                with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as temp_file:
                    if file_type == 'word':
                        # For Word documents, image_data is an inline_shape
                        image_bytes = image_data.image.blob
                    elif file_type == 'pdf':
                        # For PDFs, image_data is a pixmap
                        image_bytes = image_data.tobytes("png")
                    else:
                        image_bytes = image_data
                    temp_file.write(image_bytes)
                    temp_path = temp_file.name
                temp_paths.append(temp_path)
                temp_files.append((i, temp_path, image_data))
            except Exception as e:
                logger.error(f"Error saving image {i} to temporary file: {e}")
                processed_images.append({
                    "index": i,
                    "error": str(e),
                    "path": temp_path or "unknown"
                })
        if not temp_paths:
            return processed_images, ""
        # Step 2: Batch OCR processing (single pass over all saved images)
        batch_results = []
        if self.ocr_processor.ocr_available:
            try:
                logger.info(f"Running batch OCR on {len(temp_paths)} images")
                batch_results = self.ocr_processor.extract_text_from_images_batch(temp_paths)
                logger.info(f"Batch OCR completed for {len(batch_results)} images")
            except Exception as e:
                logger.error(f"Batch OCR processing failed: {e}")
                # Fall back to individual processing
                batch_results = []
        # Step 3: Process results
        for idx, (i, temp_path, image_data) in enumerate(temp_files):
            image_metadata = {"path": temp_path, "index": i}
            try:
                # Get OCR result for this image; prefer the batch result,
                # fall back to a one-off OCR call when the batch came up short
                ocr_result = None
                if batch_results and idx < len(batch_results):
                    batch_result = batch_results[idx]
                    ocr_result = {
                        "text": batch_result.text,
                        "confidence": batch_result.confidence,
                        "bboxes": batch_result.bboxes,
                        "line_count": batch_result.line_count
                    }
                else:
                    # Fallback to individual OCR
                    if self.ocr_processor.ocr_available:
                        ocr_result = self.ocr_processor.extract_text_from_image(temp_path)
                if ocr_result and ocr_result["text"].strip():
                    # Text-first policy: when OCR finds text, index it and
                    # skip the (more expensive) classifier entirely
                    image_metadata["ocr_text"] = ocr_result["text"]
                    image_metadata["ocr_confidence"] = ocr_result["confidence"]
                    additional_content.append(f"[Image {i+1} OCR Text]: {ocr_result['text']}")
                    logger.info(f"Image {i+1} has text content, skipping classification")
                else:
                    logger.info(f"Image {i+1} has no text, proceeding to classification")
                    # Step 4: Only classify if OCR found no text
                    if self.image_classifier and self.image_classifier.available:
                        try:
                            classification_results = self.image_classifier.classify_image(temp_path, top_k=3)
                            image_metadata["classification"] = classification_results
                            # Add classification to content for indexing
                            top_label = classification_results[0]["label"] if classification_results else "unknown"
                            top_confidence = classification_results[0]["confidence"] if classification_results else 0.0
                            image_metadata["primary_classification"] = top_label
                            # Add classification with confidence for better searchability
                            classification_text = f"[Image {i+1} Classification]: {top_label} (confidence: {top_confidence:.2f})"
                            additional_content.append(classification_text)
                            logger.info(f"Image {i+1} classified as: {top_label} with confidence {top_confidence:.2f}")
                            # Add bee classification as a special entity for search
                            if "bee" in top_label.lower():
                                # Add multiple variations to ensure it gets picked up by entity extraction
                                bee_entity_text = f"Bee image classification: {top_label} with confidence {top_confidence:.2f}. This image contains a bee."
                                additional_content.append(bee_entity_text)
                                # Also add as standalone entity markers
                                additional_content.append("Entity: Bee")
                                additional_content.append("Entity: Insect")
                                additional_content.append("Entity: Animal")
                        except Exception as classify_error:
                            logger.error(f"Image classification failed for image {i+1}: {classify_error}")
                            image_metadata["classification_error"] = str(classify_error)
                processed_images.append(image_metadata)
            except Exception as e:
                logger.error(f"Error processing image {i}: {e}")
                processed_images.append({
                    "index": i,
                    "error": str(e),
                    "path": temp_path
                })
            finally:
                # Clean up temporary file
                if temp_path and os.path.exists(temp_path):
                    try:
                        os.unlink(temp_path)
                    except Exception as e:
                        logger.warning(f"Failed to delete temporary image file {temp_path}: {e}")
        return processed_images, "\n".join(additional_content)
    def _extract_tables_with_tabula(self, pdf_path: str) -> List[Dict[str, Any]]:
        """
        Extract tables from PDF using Tabula (for digital PDFs with text layers)
        Args:
            pdf_path: Path to PDF file
        Returns:
            List of table dictionaries
        """
        if not self.tabula_available:
            return []
        try:
            import tabula
            import pandas as pd
            # Try to extract tables from all pages
            tables = []
            # Use Tabula to extract tables
            # NOTE(review): tabula-py documents `lattice` and `stream` as
            # mutually exclusive extraction modes; passing both True relies on
            # whichever one the installed version honors — confirm against the
            # pinned tabula-py version's docs.
            dfs = tabula.read_pdf(
                pdf_path,
                pages='all',
                multiple_tables=True,
                lattice=True,  # Try lattice mode first (for bordered tables)
                stream=True,  # Fall back to stream mode (for borderless tables)
                guess=False,
                silent=True
            )
            for i, df in enumerate(dfs):
                if df is not None and not df.empty:
                    # Convert DataFrame to table structure
                    table_data = df.values.tolist()
                    columns = df.columns.tolist()
                    # Add column headers as first row if they're meaningful
                    if any(col and str(col).strip() for col in columns):
                        table_data.insert(0, columns)
                    if table_data:
                        tables.append({
                            "data": table_data,
                            "rows": len(table_data),
                            "columns": len(table_data[0]) if table_data else 0,
                            "source": "tabula",
                            "table_index": i,
                            "has_header": True if columns else False
                        })
            logger.info(f"Tabula extracted {len(tables)} tables from {pdf_path}")
            return tables
        except Exception as e:
            logger.warning(f"Tabula table extraction failed for {pdf_path}: {e}")
            return []
def _text_quality_score(self, text: str) -> float:
"""Return a score between 0 and 1 indicating text quality.
Higher score means more readable English text."""
if not text:
return 0.0
total = len(text)
# Count printable ASCII letters and spaces
printable = sum(1 for c in text if 32 <= ord(c) <= 126)
# Count replacement characters (<28>) which is Unicode U+FFFD
replacement = text.count('\ufffd')
# Count other non-ASCII characters
non_ascii = sum(1 for c in text if ord(c) > 127 and ord(c) != 0xfffd)
# Score based on printable ratio, penalize replacement chars
score = (printable / total) * (1 - (replacement / total))
return score
async def _process_pdf(self, file_path: Path) -> ProcessingResult:
"""Process PDF files with hybrid approach: Tabula for digital PDFs, OCR for scanned"""
pdf_document = None
try:
content_parts = []
tables = []
images = []
processed_with_ocr = False
used_tabula = False
# Open PDF
pdf_document = fitz.open(str(file_path))
total_pages = len(pdf_document)
# Step 1: Try Tabula for digital PDFs with text layers
if self.tabula_available:
tabula_tables = self._extract_tables_with_tabula(str(file_path))
if tabula_tables:
tables.extend(tabula_tables)
used_tabula = True
logger.info(f"Extracted {len(tabula_tables)} tables using Tabula")
# Step 2: Analyze each page for text vs scanned content
ocr_pages = [] # list of (page_num, temp_path) for scanned pages
page_texts = {} # page_num -> text (for digital pages)
for page_num in range(total_pages):
page = pdf_document[page_num]
# Try text extraction first
text = page.get_text()
text_score = self._text_quality_score(text)
# Determine if page is digital (good text) or scanned (needs OCR)
if text.strip() and text_score >= 0.5:
# Digital page with good text
page_texts[page_num] = text
# If Tabula didn't find tables, try to extract tables from text
if not used_tabula and "|" in text or "\t" in text:
# Simple table detection from text patterns
lines = text.split('\n')
table_like_lines = [line for line in lines if len(line.split()) > 3]
if len(table_like_lines) > 2:
table_data = [line.split('|') if '|' in line else line.split('\t') for line in table_like_lines]
if table_data and len(table_data) >= 2:
tables.append({
"data": table_data,
"rows": len(table_data),
"columns": max(len(row) for row in table_data) if table_data else 0,
"source": "text_pattern",
"page": page_num + 1
})
else:
# Scanned page or poor text quality -> use OCR
logger.info(f"Page {page_num + 1} has no usable text (score {text_score:.3f}), using high-resolution OCR")
# Use higher resolution for better OCR accuracy
mat = fitz.Matrix(2, 2) # 2x resolution for better OCR
pix = page.get_pixmap(matrix=mat)
img_data = pix.tobytes("png")
# Save temporary image for OCR
with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as temp_file:
temp_file.write(img_data)
temp_path = temp_file.name
ocr_pages.append((page_num, temp_path))
# Step 3: Process scanned pages with OCR if any
if ocr_pages and self.ocr_processor.ocr_available:
try:
temp_paths = [temp_path for _, temp_path in ocr_pages]
logger.info(f"Running batch OCR on {len(temp_paths)} scanned pages")
batch_results = self.ocr_processor.extract_text_from_images_batch(temp_paths)
logger.info(f"Batch OCR completed for {len(batch_results)} pages")
# Map results back to pages
for idx, (page_num, temp_path) in enumerate(ocr_pages):
ocr_result = None
if idx < len(batch_results):
batch_result = batch_results[idx]
ocr_result = {
"text": batch_result.text,
"confidence": batch_result.confidence,
"bboxes": batch_result.bboxes,
"line_count": batch_result.line_count
}
else:
# Fallback to individual OCR
ocr_result = self.ocr_processor.extract_text_from_image(temp_path)
if ocr_result["text"].strip():
logger.info(f"OCR extracted {len(ocr_result['text'])} characters from page {page_num + 1}")
content_parts.append(f"Page {page_num + 1} (OCR):\n{str(ocr_result['text'])}")
processed_with_ocr = True
# Extract tables from OCR using enhanced heuristic method
ocr_tables = self.ocr_processor.extract_tables_from_image(temp_path)
if ocr_tables:
logger.info(f"Found {len(ocr_tables)} tables on page {page_num + 1}")
for table in ocr_tables:
table["source"] = "ocr_enhanced"
table["page"] = page_num + 1
tables.extend(ocr_tables)
else:
logger.warning(f"OCR returned empty text for page {page_num + 1}")
content_parts.append(f"Page {page_num + 1}: [Scanned content - no text detected by OCR]")
# Clean up temporary file
if temp_path and os.path.exists(temp_path):
os.unlink(temp_path)
except Exception as batch_error:
logger.error(f"Batch OCR processing failed: {batch_error}")
# Fall back to individual processing
for page_num, temp_path in ocr_pages:
try:
ocr_result = self.ocr_processor.extract_text_from_image(temp_path)
if ocr_result["text"].strip():
content_parts.append(f"Page {page_num + 1} (OCR):\n{str(ocr_result['text'])}")
processed_with_ocr = True
else:
content_parts.append(f"Page {page_num + 1}: [Scanned content - no text detected by OCR]")
# Extract tables
ocr_tables = self.ocr_processor.extract_tables_from_image(temp_path)
if ocr_tables:
for table in ocr_tables:
table["source"] = "ocr_fallback"
table["page"] = page_num + 1
tables.extend(ocr_tables)
except Exception as ocr_error:
logger.error(f"OCR processing failed for page {page_num + 1}: {ocr_error}")
content_parts.append(f"Page {page_num + 1}: [Image content - OCR failed: {str(ocr_error)}]")
finally:
if temp_path and os.path.exists(temp_path):
os.unlink(temp_path)
elif ocr_pages and not self.ocr_processor.ocr_available:
logger.warning("OCR not available, skipping OCR processing for scanned pages")
for page_num, temp_path in ocr_pages:
content_parts.append(f"Page {page_num + 1}: [Image content - OCR not available]")
if temp_path and os.path.exists(temp_path):
os.unlink(temp_path)
# Step 4: Add digital pages content
for page_num, text in page_texts.items():
content_parts.append(f"Page {page_num + 1}:\n{text}")
# Sort content parts by page number
def extract_page_num(part):
import re
match = re.search(r'Page\s+(\d+)', part)
if match:
return int(match.group(1))
return 0
content_parts.sort(key=extract_page_num)
full_content = "\n\n".join(content_parts)
return ProcessingResult(
success=True,
content=full_content,
metadata={
"pages": total_pages,
"file_type": "pdf",
"processed_with_ocr": processed_with_ocr,
"used_tabula": used_tabula,
"tables_found": len(tables),
"table_sources": list(set(table.get("source", "unknown") for table in tables))
},
tables=tables,
images=images
)
except Exception as e:
logger.error(f"PDF processing failed: {e}")
raise
finally:
if pdf_document:
pdf_document.close()
    async def _process_word(self, file_path: Path) -> ProcessingResult:
        """Process Word documents with image extraction and classification.

        Text and tables come from python-docx; embedded media files are read
        straight out of the .docx zip archive (word/media/*), OCR'd, and —
        only when OCR finds no text — run through the image classifier.
        """
        try:
            doc = docx.Document(str(file_path))
            content_parts = []
            tables = []
            images = []
            # Extract text from paragraphs first (primary content)
            for para in doc.paragraphs:
                if para.text.strip():
                    content_parts.append(para.text)
            # Extract tables
            for table in doc.tables:
                table_data = []
                for row in table.rows:
                    row_data = [cell.text for cell in row.cells]
                    table_data.append(row_data)
                if table_data:
                    tables.append({
                        "data": table_data,
                        "rows": len(table_data),
                        "columns": max(len(row) for row in table_data) if table_data else 0
                    })
            # Extract and process images using zipfile method
            try:
                import zipfile
                import os
                # Create temporary directory for extracted images
                with tempfile.TemporaryDirectory() as temp_dir:
                    # Extract images from docx using zipfile
                    with zipfile.ZipFile(str(file_path), 'r') as zip_ref:
                        image_files = []
                        for file_info in zip_ref.filelist:
                            if file_info.filename.startswith('word/media/'):
                                # Extract the image
                                image_filename = os.path.basename(file_info.filename)
                                image_path = os.path.join(temp_dir, image_filename)
                                # Extract and save
                                with zip_ref.open(file_info.filename) as source, open(image_path, 'wb') as target:
                                    target.write(source.read())
                                image_files.append(image_path)
                                logger.info(f"📸 Extracted image: {image_path}")
                        if image_files:
                            logger.info(f"Found {len(image_files)} images in Word document using zipfile method")
                            # Process each extracted image
                            for i, image_path in enumerate(image_files):
                                try:
                                    image_metadata = {"path": image_path, "index": i}
                                    # Step 1: Always run GPU OCR first
                                    if self.ocr_processor.ocr_available:
                                        ocr_result = self.ocr_processor.extract_text_from_image(image_path)
                                        if ocr_result["text"].strip():
                                            image_metadata["ocr_text"] = ocr_result["text"]
                                            image_metadata["ocr_confidence"] = ocr_result["confidence"]
                                            content_parts.append(f"[Image {i+1} OCR Text]: {ocr_result['text']}")
                                            logger.info(f"Image {i+1} has text content, skipping classification")
                                        else:
                                            logger.info(f"Image {i+1} has no text, proceeding to classification")
                                            # Step 2: Only classify if OCR found no text
                                            if self.image_classifier and self.image_classifier.available:
                                                classification_results = self.image_classifier.classify_image(image_path, top_k=3)
                                                image_metadata["classification"] = classification_results
                                                # Add classification to content for indexing
                                                top_label = classification_results[0]["label"] if classification_results else "unknown"
                                                top_confidence = classification_results[0]["confidence"] if classification_results else 0.0
                                                image_metadata["primary_classification"] = top_label
                                                # Add classification with confidence for better searchability
                                                classification_text = f"[Image {i+1} Classification]: {top_label} (confidence: {top_confidence:.2f})"
                                                content_parts.append(classification_text)
                                                logger.info(f"Image {i+1} classified as: {top_label} with confidence {top_confidence:.2f}")
                                                # Add bee classification as a special entity for search
                                                if "bee" in top_label.lower():
                                                    # Add multiple variations to ensure it gets picked up by entity extraction
                                                    bee_entity_text = f"Bee image classification: {top_label} with confidence {top_confidence:.2f}. This image contains a bee."
                                                    content_parts.append(bee_entity_text)
                                                    # Also add as standalone entity markers
                                                    content_parts.append("Entity: Bee")
                                                    content_parts.append("Entity: Insect")
                                                    content_parts.append("Entity: Animal")
                                    images.append(image_metadata)
                                except Exception as img_error:
                                    logger.error(f"Error processing image {i}: {img_error}")
                                    images.append({
                                        "index": i,
                                        "error": str(img_error),
                                        "path": image_path
                                    })
            except Exception as img_error:
                logger.warning(f"Image extraction from Word document failed: {img_error}")
            full_content = "\n".join(content_parts)
            return ProcessingResult(
                success=True,
                content=full_content,
                metadata={
                    "file_type": "word",
                    # Rough paragraph count: lines not starting with '[' (i.e.
                    # not the bracketed image annotations added above)
                    "paragraphs": len([p for p in content_parts if not p.startswith('[')]),
                    "tables_count": len(tables),
                    "images_count": len(images)
                },
                tables=tables,
                images=images
            )
        except Exception as e:
            logger.error(f"Word document processing failed: {e}")
            raise
async def _process_excel(self, file_path: Path) -> ProcessingResult:
"""Process Excel files"""
try:
workbook = openpyxl.load_workbook(str(file_path))
content_parts = []
tables = []
for sheet_name in workbook.sheetnames:
sheet = workbook[sheet_name]
content_parts.append(f"Sheet: {sheet_name}")
# Extract data from cells
sheet_data = []
for row in sheet.iter_rows(values_only=True):
if any(cell is not None for cell in row):
sheet_data.append([str(cell) if cell is not None else "" for cell in row])
if sheet_data:
tables.append({
"data": sheet_data,
"sheet": sheet_name,
"rows": len(sheet_data),
"columns": max(len(row) for row in sheet_data) if sheet_data else 0
})
# Add sample content (first few rows)
sample_rows = min(5, len(sheet_data))
for i in range(sample_rows):
content_parts.append(" | ".join(sheet_data[i]))
workbook.close()
full_content = "\n".join(content_parts)
return ProcessingResult(
success=True,
content=full_content,
metadata={
"file_type": "excel",
"sheets": len(workbook.sheetnames),
"tables_count": len(tables)
},
tables=tables
)
except Exception as e:
logger.error(f"Excel processing failed: {e}")
raise
async def _process_powerpoint(self, file_path: Path) -> ProcessingResult:
"""Process PowerPoint presentations"""
try:
presentation = Presentation(str(file_path))
content_parts = []
for i, slide in enumerate(presentation.slides):
content_parts.append(f"Slide {i + 1}:")
# Extract text from slide shapes
slide_text = []
for shape in slide.shapes:
if hasattr(shape, "text") and shape.text.strip():
slide_text.append(shape.text)
if slide_text:
content_parts.extend(slide_text)
content_parts.append("") # Empty line between slides
full_content = "\n".join(content_parts)
return ProcessingResult(
success=True,
content=full_content,
metadata={
"file_type": "powerpoint",
"slides": len(presentation.slides)
}
)
except Exception as e:
logger.error(f"PowerPoint processing failed: {e}")
raise
async def _process_text(self, file_path: Path) -> ProcessingResult:
"""Process text-based files (TXT, CSV, HTML)"""
try:
extension = file_path.suffix.lower()
if extension == '.csv':
# Process CSV with pandas
df = pd.read_csv(file_path)
content = df.to_string(index=False)
tables = [{
"data": df.values.tolist(),
"columns": df.columns.tolist(),
"rows": len(df),
"columns_count": len(df.columns)
}]
return ProcessingResult(
success=True,
content=content,
metadata={"file_type": "csv", "rows": len(df), "columns": len(df.columns)},
tables=tables
)
elif extension == '.html':
# Process HTML with BeautifulSoup
with open(file_path, 'r', encoding='utf-8') as f:
html_content = f.read()
soup = BeautifulSoup(html_content, 'html.parser')
# Remove script and style elements
for script in soup(["script", "style"]):
script.decompose()
text = soup.get_text()
lines = (line.strip() for line in text.splitlines())
chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
content = '\n'.join(chunk for chunk in chunks if chunk)
return ProcessingResult(
success=True,
content=content,
metadata={"file_type": "html"}
)
else: # TXT and other text files
with open(file_path, 'r', encoding='utf-8') as f:
content = f.read()
return ProcessingResult(
success=True,
content=content,
metadata={"file_type": "text"}
)
except Exception as e:
logger.error(f"Text file processing failed: {e}")
raise
async def _process_image(self, file_path: Path) -> ProcessingResult:
"""Process image files with OCR"""
try:
content_parts = []
tables = []
images = [{"path": str(file_path), "classification": "processed_with_ocr"}]
# Always perform OCR on images
ocr_result = self.ocr_processor.extract_text_from_image(str(file_path))
if ocr_result["text"].strip():
content_parts.append(ocr_result["text"])
# Extract tables from image
ocr_tables = self.ocr_processor.extract_tables_from_image(str(file_path))
tables.extend(ocr_tables)
full_content = "\n".join(content_parts) if content_parts else "No text extracted from image"
return ProcessingResult(
success=True,
content=full_content,
metadata={
"file_type": "image",
"ocr_confidence": ocr_result.get("confidence", 0.0),
"line_count": ocr_result.get("line_count", 0)
},
tables=tables,
images=images
)
except Exception as e:
logger.error(f"Image processing failed: {e}")
raise
def get_supported_formats(self) -> List[str]:
"""Get list of supported file formats"""
return list(self.supported_extensions)
async def process_batch(self, file_paths: List[str]) -> List[ProcessingResult]:
"""Process multiple documents in batch"""
tasks = [self.process_document(file_path) for file_path in file_paths]
results = await asyncio.gather(*tasks, return_exceptions=True)
# Handle exceptions in results
processed_results = []
for result in results:
if isinstance(result, Exception):
processed_results.append(ProcessingResult(
success=False,
content="",
metadata={"error": str(result)},
error=str(result)
))
else:
processed_results.append(result)
return processed_results
# Singleton instance
_processor_instance = None


def get_document_processor() -> DocumentProcessor:
    """Get singleton document processor instance.

    Lazily constructs the DocumentProcessor on first call and reuses it
    afterwards. NOTE(review): not thread-safe — concurrent first calls could
    construct two instances; confirm callers serialize the first call.
    """
    global _processor_instance
    if _processor_instance is None:
        _processor_instance = DocumentProcessor()
    return _processor_instance
async def test_processor():
    """Test function for document processor"""
    processor = get_document_processor()
    # Test with a sample file (modify path as needed)
    test_file = "test_documents/test_document.txt"
    if not os.path.exists(test_file):
        print("Test file not found")
        return
    result = await processor.process_document(test_file)
    print(f"Success: {result.success}")
    print(f"Content length: {len(result.content)}")
    print(f"Metadata: {result.metadata}")


if __name__ == "__main__":
    # Run test
    asyncio.run(test_processor())