""" spaCy Entity Extractor for LightRAG Replaces LLM-based entity extraction to dramatically reduce indexing time """ import os import time import asyncio from typing import List, Dict, Any, Tuple from collections import defaultdict try: import spacy HAS_SPACY = True except ImportError: HAS_SPACY = False class SpacyEntityExtractor: """Local entity extraction using spaCy to replace LLM calls""" def __init__(self): self.setup_spacy_model() self.performance_stats = {} def setup_spacy_model(self): """Initialize spaCy model for entity extraction""" if not HAS_SPACY: raise ImportError( "spaCy is required. Install with: " "pip install spacy && python -m spacy download en_core_web_sm" ) try: self.nlp = spacy.load("en_core_web_sm") print("✅ spaCy model loaded successfully") except OSError: print("❌ spaCy model not found. Download with: python -m spacy download en_core_web_sm") raise def map_entity_type(self, spacy_label: str) -> str: """Map spaCy entity labels to LightRAG entity types""" mapping = { 'PERSON': 'Person', 'ORG': 'Organization', 'GPE': 'Location', 'LOC': 'Location', 'EVENT': 'Event', 'WORK_OF_ART': 'Artifact', 'LAW': 'Concept', 'LANGUAGE': 'Concept', 'DATE': 'Concept', 'TIME': 'Concept', 'PERCENT': 'Data', 'MONEY': 'Data', 'QUANTITY': 'Data', 'ORDINAL': 'Data', 'CARDINAL': 'Data', 'PRODUCT': 'Artifact', 'FAC': 'Location', 'NORP': 'Organization', # Nationalities, religious, political groups } return mapping.get(spacy_label, 'Concept') async def extract_entities_and_relations(self, text: str, chunk_key: str, file_path: str = "unknown_source") -> Tuple[Dict, Dict]: """ Extract entities from text using spaCy and format for LightRAG Returns: Tuple of (entities_dict, relations_dict) in LightRAG format """ start_time = time.time() # Process text with spaCy doc = self.nlp(text) # Extract entities maybe_nodes = defaultdict(list) maybe_edges = defaultdict(list) for ent in doc.ents: # Map spaCy entity type to LightRAG entity type entity_type = self.map_entity_type(ent.label_) # Create entity description based on context entity_description = f"{ent.text} is a {entity_type.lower()} mentioned in the text." # Format entity data for LightRAG entity_data = { "entity_name": ent.text, "entity_type": entity_type, "description": entity_description, "source_id": chunk_key, "file_path": file_path, "timestamp": int(time.time()), } maybe_nodes[ent.text].append(entity_data) extraction_time = time.time() - start_time self.performance_stats['spacy_extraction'] = extraction_time print(f"🔍 Extracted {len(maybe_nodes)} entities in {extraction_time:.3f}s") return dict(maybe_nodes), dict(maybe_edges) async def batch_extract_entities(self, chunks_data: List[Tuple[str, str, str]]) -> List[Tuple[Dict, Dict]]: """ Extract entities from multiple chunks in batch Args: chunks_data: List of tuples (text, chunk_key, file_path) Returns: List of (entities_dict, relations_dict) for each chunk """ start_time = time.time() results = [] for text, chunk_key, file_path in chunks_data: entities, relations = await self.extract_entities_and_relations(text, chunk_key, file_path) results.append((entities, relations)) batch_time = time.time() - start_time self.performance_stats['batch_extraction'] = batch_time print(f"📚 Batch extracted entities from {len(chunks_data)} chunks in {batch_time:.2f}s") return results # Global instance for reuse _spacy_extractor = None def get_spacy_extractor() -> SpacyEntityExtractor: """Get or create the global spaCy extractor instance""" global _spacy_extractor if _spacy_extractor is None: _spacy_extractor = SpacyEntityExtractor() return _spacy_extractor async def extract_entities_spacy( chunks: Dict[str, Any], global_config: Dict[str, str], pipeline_status: Dict = None, pipeline_status_lock = None, text_chunks_storage: Any = None, ) -> List: """ spaCy-based entity extraction for LightRAG This function replaces the LLM-based entity extraction to dramatically reduce indexing time. Args: chunks: Dictionary of text chunks to process global_config: Global configuration dictionary pipeline_status: Pipeline status dictionary pipeline_status_lock: Lock for pipeline status text_chunks_storage: Text chunks storage Returns: List of chunk results for processing in merge_nodes_and_edges """ print("🚀 Using spaCy for entity extraction (faster than LLM)") ordered_chunks = list(chunks.items()) total_chunks = len(ordered_chunks) processed_chunks = 0 # Get spaCy extractor spacy_extractor = get_spacy_extractor() # Prepare chunk data for batch processing chunks_data = [] for chunk_key, chunk_dp in ordered_chunks: content = chunk_dp["content"] file_path = chunk_dp.get("file_path", "unknown_source") chunks_data.append((content, chunk_key, file_path)) # Process chunks in batch chunk_results = await spacy_extractor.batch_extract_entities(chunks_data) # Update progress for all chunks processed_chunks = total_chunks log_message = f"Chunk {processed_chunks} of {total_chunks} extracted entities with spaCy" print(log_message) if pipeline_status is not None and pipeline_status_lock is not None: async with pipeline_status_lock: pipeline_status["latest_message"] = log_message pipeline_status["history_messages"].append(log_message) return chunk_results