"""
spaCy Entity Extractor for LightRAG

Replaces LLM-based entity extraction to dramatically reduce indexing time
"""
|
|
|
|
import os
|
|
import time
|
|
import asyncio
|
|
from typing import List, Dict, Any, Tuple
|
|
from collections import defaultdict
|
|
|
|
try:
|
|
import spacy
|
|
HAS_SPACY = True
|
|
except ImportError:
|
|
HAS_SPACY = False
|
|
|
|
class SpacyEntityExtractor:
    """Local entity extraction using spaCy to replace LLM calls."""

    # spaCy NER label -> LightRAG entity type. Defined once at class level
    # so map_entity_type does not rebuild the dict on every call.
    _LABEL_MAP = {
        'PERSON': 'Person',
        'ORG': 'Organization',
        'GPE': 'Location',
        'LOC': 'Location',
        'EVENT': 'Event',
        'WORK_OF_ART': 'Artifact',
        'LAW': 'Concept',
        'LANGUAGE': 'Concept',
        'DATE': 'Concept',
        'TIME': 'Concept',
        'PERCENT': 'Data',
        'MONEY': 'Data',
        'QUANTITY': 'Data',
        'ORDINAL': 'Data',
        'CARDINAL': 'Data',
        'PRODUCT': 'Artifact',
        'FAC': 'Location',
        'NORP': 'Organization',  # Nationalities, religious, political groups
    }

    def __init__(self):
        self.setup_spacy_model()
        # Wall-clock timings (seconds) of the most recent operations,
        # keyed by operation name; overwritten on each call.
        self.performance_stats = {}

    def setup_spacy_model(self):
        """Initialize the spaCy model used for entity extraction.

        Raises:
            ImportError: If spaCy is not installed.
            OSError: If the en_core_web_sm model is not downloaded.
        """
        if not HAS_SPACY:
            raise ImportError(
                "spaCy is required. Install with: "
                "pip install spacy && python -m spacy download en_core_web_sm"
            )

        try:
            self.nlp = spacy.load("en_core_web_sm")
            print("✅ spaCy model loaded successfully")
        except OSError:
            print("❌ spaCy model not found. Download with: python -m spacy download en_core_web_sm")
            raise

    def map_entity_type(self, spacy_label: str) -> str:
        """Map a spaCy entity label to a LightRAG entity type.

        Unknown labels fall back to 'Concept'.
        """
        return self._LABEL_MAP.get(spacy_label, 'Concept')

    async def extract_entities_and_relations(self, text: str, chunk_key: str, file_path: str = "unknown_source") -> Tuple[Dict, Dict]:
        """
        Extract entities from text using spaCy and format for LightRAG.

        Args:
            text: Raw chunk text to run NER over.
            chunk_key: Identifier of the source chunk (stored as source_id).
            file_path: Originating file path recorded on each entity.

        Returns:
            Tuple of (entities_dict, relations_dict) in LightRAG format.
            relations_dict is always empty here: spaCy NER yields entity
            spans only, no relations.
        """
        start_time = time.time()

        # Run the full spaCy pipeline; entities come from doc.ents.
        doc = self.nlp(text)

        maybe_nodes = defaultdict(list)
        maybe_edges = defaultdict(list)  # kept for interface parity; never populated

        for ent in doc.ents:
            entity_type = self.map_entity_type(ent.label_)

            # Generic description: spaCy provides only span text + label,
            # not an LLM-style summary.
            entity_description = f"{ent.text} is a {entity_type.lower()} mentioned in the text."

            entity_data = {
                "entity_name": ent.text,
                "entity_type": entity_type,
                "description": entity_description,
                "source_id": chunk_key,
                "file_path": file_path,
                "timestamp": int(time.time()),
            }

            # Mentions with identical surface text are grouped under one key.
            maybe_nodes[ent.text].append(entity_data)

        extraction_time = time.time() - start_time
        self.performance_stats['spacy_extraction'] = extraction_time

        print(f"🔍 Extracted {len(maybe_nodes)} entities in {extraction_time:.3f}s")

        return dict(maybe_nodes), dict(maybe_edges)

    async def batch_extract_entities(self, chunks_data: List[Tuple[str, str, str]]) -> List[Tuple[Dict, Dict]]:
        """
        Extract entities from multiple chunks sequentially.

        Args:
            chunks_data: List of tuples (text, chunk_key, file_path).

        Returns:
            List of (entities_dict, relations_dict) for each chunk, in
            input order.
        """
        start_time = time.time()

        results = [
            await self.extract_entities_and_relations(text, chunk_key, file_path)
            for text, chunk_key, file_path in chunks_data
        ]

        batch_time = time.time() - start_time
        self.performance_stats['batch_extraction'] = batch_time

        print(f"📚 Batch extracted entities from {len(chunks_data)} chunks in {batch_time:.2f}s")
        return results
|
|
|
|
# Module-level singleton, created lazily on first use
_spacy_extractor = None


def get_spacy_extractor() -> SpacyEntityExtractor:
    """Return the shared SpacyEntityExtractor, creating it on first call."""
    global _spacy_extractor
    extractor = _spacy_extractor
    if extractor is None:
        extractor = _spacy_extractor = SpacyEntityExtractor()
    return extractor
|
|
|
|
async def extract_entities_spacy(
    chunks: Dict[str, Any],
    global_config: Dict[str, str],
    pipeline_status: Dict = None,
    pipeline_status_lock = None,
    text_chunks_storage: Any = None,
) -> List:
    """
    spaCy-based entity extraction for LightRAG.

    This function replaces the LLM-based entity extraction to dramatically
    reduce indexing time.

    Args:
        chunks: Mapping of chunk_key -> chunk data; each value must contain
            "content" and may contain "file_path".
        global_config: Global configuration dictionary (unused here; kept
            for signature compatibility with the LLM-based extractor).
        pipeline_status: Optional mutable status dict; when provided along
            with the lock, "latest_message" is set and the message is
            appended to its "history_messages" list (assumed to exist —
            TODO confirm callers always pre-populate it).
        pipeline_status_lock: Async lock guarding pipeline_status updates.
        text_chunks_storage: Unused; kept for signature compatibility.

    Returns:
        List of (entities_dict, relations_dict) tuples, one per chunk, for
        processing in merge_nodes_and_edges.
    """
    print("🚀 Using spaCy for entity extraction (faster than LLM)")

    ordered_chunks = list(chunks.items())
    total_chunks = len(ordered_chunks)

    # Shared extractor instance (spaCy model is loaded once per process)
    spacy_extractor = get_spacy_extractor()

    # (text, chunk_key, file_path) triples for batch processing
    chunks_data = [
        (chunk_dp["content"], chunk_key, chunk_dp.get("file_path", "unknown_source"))
        for chunk_key, chunk_dp in ordered_chunks
    ]

    chunk_results = await spacy_extractor.batch_extract_entities(chunks_data)

    # The whole batch completes in one call, so report completion directly
    processed_chunks = total_chunks
    log_message = f"Chunk {processed_chunks} of {total_chunks} extracted entities with spaCy"
    print(log_message)

    if pipeline_status is not None and pipeline_status_lock is not None:
        async with pipeline_status_lock:
            pipeline_status["latest_message"] = log_message
            pipeline_status["history_messages"].append(log_message)

    return chunk_results