# File: railseek6/LightRAG-main/lightrag/spacy_entity_extractor.py
# (190 lines, 6.4 KiB, Python)
"""
spaCy Entity Extractor for LightRAG
Replaces LLM-based entity extraction to dramatically reduce indexing time
"""
import asyncio
import os
import time
from collections import defaultdict
from typing import Any, Dict, List, Optional, Tuple

try:
    import spacy
    HAS_SPACY = True
except ImportError:
    HAS_SPACY = False
class SpacyEntityExtractor:
    """Local entity extraction using spaCy to replace LLM calls.

    Runs a small spaCy NER pipeline over text chunks and formats the
    recognized entities as LightRAG node dicts. No relation extraction is
    performed: the edges dict returned alongside the nodes is always empty.
    """

    # spaCy NER label -> LightRAG entity type.
    # Hoisted to a class-level constant so map_entity_type() does not
    # rebuild the dict on every call (it was previously a per-call local).
    _LABEL_TO_TYPE = {
        'PERSON': 'Person',
        'ORG': 'Organization',
        'GPE': 'Location',
        'LOC': 'Location',
        'EVENT': 'Event',
        'WORK_OF_ART': 'Artifact',
        'LAW': 'Concept',
        'LANGUAGE': 'Concept',
        'DATE': 'Concept',
        'TIME': 'Concept',
        'PERCENT': 'Data',
        'MONEY': 'Data',
        'QUANTITY': 'Data',
        'ORDINAL': 'Data',
        'CARDINAL': 'Data',
        'PRODUCT': 'Artifact',
        'FAC': 'Location',
        'NORP': 'Organization',  # Nationalities, religious, political groups
    }

    def __init__(self):
        self.setup_spacy_model()
        # Wall-clock timings (seconds) recorded by the extraction methods.
        self.performance_stats = {}

    def setup_spacy_model(self):
        """Load the ``en_core_web_sm`` spaCy pipeline into ``self.nlp``.

        Raises:
            ImportError: if spaCy is not installed.
            OSError: if the ``en_core_web_sm`` model is not downloaded.
        """
        if not HAS_SPACY:
            raise ImportError(
                "spaCy is required. Install with: "
                "pip install spacy && python -m spacy download en_core_web_sm"
            )
        try:
            self.nlp = spacy.load("en_core_web_sm")
            print("✅ spaCy model loaded successfully")
        except OSError:
            print("❌ spaCy model not found. Download with: python -m spacy download en_core_web_sm")
            raise

    def map_entity_type(self, spacy_label: str) -> str:
        """Map a spaCy entity label to a LightRAG entity type.

        Unknown labels fall back to ``'Concept'``.
        """
        return self._LABEL_TO_TYPE.get(spacy_label, 'Concept')

    async def extract_entities_and_relations(self, text: str, chunk_key: str, file_path: str = "unknown_source") -> Tuple[Dict, Dict]:
        """Extract entities from ``text`` using spaCy and format for LightRAG.

        Args:
            text: Raw chunk text to analyze.
            chunk_key: Identifier of the source chunk, stored as ``source_id``.
            file_path: Origin file of the chunk, stored on each entity.

        Returns:
            Tuple of (entities_dict, relations_dict) in LightRAG format.
            The relations dict is always empty (spaCy NER finds no edges).
        """
        start_time = time.time()
        # Process text with spaCy
        doc = self.nlp(text)
        # Entities grouped by surface form; edges stay empty by design.
        maybe_nodes = defaultdict(list)
        maybe_edges = defaultdict(list)
        for ent in doc.ents:
            # Map spaCy entity type to LightRAG entity type
            entity_type = self.map_entity_type(ent.label_)
            # Create entity description based on context
            entity_description = f"{ent.text} is a {entity_type.lower()} mentioned in the text."
            # Format entity data for LightRAG
            entity_data = {
                "entity_name": ent.text,
                "entity_type": entity_type,
                "description": entity_description,
                "source_id": chunk_key,
                "file_path": file_path,
                "timestamp": int(time.time()),
            }
            maybe_nodes[ent.text].append(entity_data)
        extraction_time = time.time() - start_time
        self.performance_stats['spacy_extraction'] = extraction_time
        print(f"🔍 Extracted {len(maybe_nodes)} entities in {extraction_time:.3f}s")
        return dict(maybe_nodes), dict(maybe_edges)

    async def batch_extract_entities(self, chunks_data: List[Tuple[str, str, str]]) -> List[Tuple[Dict, Dict]]:
        """Extract entities from multiple chunks sequentially.

        Args:
            chunks_data: List of tuples (text, chunk_key, file_path).

        Returns:
            List of (entities_dict, relations_dict) for each chunk, in order.
        """
        start_time = time.time()
        results = []
        for text, chunk_key, file_path in chunks_data:
            entities, relations = await self.extract_entities_and_relations(text, chunk_key, file_path)
            results.append((entities, relations))
        batch_time = time.time() - start_time
        self.performance_stats['batch_extraction'] = batch_time
        print(f"📚 Batch extracted entities from {len(chunks_data)} chunks in {batch_time:.2f}s")
        return results
async def batch_extract_entities(self, chunks_data: List[Tuple[str, str, str]]) -> List[Tuple[Dict, Dict]]:
"""
Extract entities from multiple chunks in batch
Args:
chunks_data: List of tuples (text, chunk_key, file_path)
Returns:
List of (entities_dict, relations_dict) for each chunk
"""
start_time = time.time()
results = []
for text, chunk_key, file_path in chunks_data:
entities, relations = await self.extract_entities_and_relations(text, chunk_key, file_path)
results.append((entities, relations))
batch_time = time.time() - start_time
self.performance_stats['batch_extraction'] = batch_time
print(f"📚 Batch extracted entities from {len(chunks_data)} chunks in {batch_time:.2f}s")
return results
# Module-level singleton: the spaCy pipeline is expensive to load, so one
# extractor instance is shared across the whole process.
_spacy_extractor = None


def get_spacy_extractor() -> SpacyEntityExtractor:
    """Return the shared SpacyEntityExtractor, creating it on first use."""
    global _spacy_extractor
    if _spacy_extractor is not None:
        return _spacy_extractor
    _spacy_extractor = SpacyEntityExtractor()
    return _spacy_extractor
async def extract_entities_spacy(
    chunks: Dict[str, Any],
    global_config: Dict[str, str],
    pipeline_status: Optional[Dict] = None,
    pipeline_status_lock=None,
    text_chunks_storage: Any = None,
) -> List:
    """spaCy-based entity extraction for LightRAG.

    Drop-in replacement for the LLM-based entity extraction step that runs
    local spaCy NER instead, dramatically reducing indexing time.

    Args:
        chunks: Dictionary of text chunks to process; each value must have a
            ``"content"`` key and may have a ``"file_path"`` key.
        global_config: Global configuration dictionary (unused here, kept for
            interface compatibility with the LLM-based extractor).
        pipeline_status: Optional pipeline status dictionary to update.
        pipeline_status_lock: Async lock guarding ``pipeline_status``; status
            is only updated when both it and ``pipeline_status`` are given.
        text_chunks_storage: Text chunks storage (unused, interface compat).

    Returns:
        List of (entities_dict, relations_dict) chunk results for processing
        in ``merge_nodes_and_edges``.
    """
    print("🚀 Using spaCy for entity extraction (faster than LLM)")
    ordered_chunks = list(chunks.items())
    total_chunks = len(ordered_chunks)
    # Get the shared spaCy extractor (loads the model once per process).
    spacy_extractor = get_spacy_extractor()
    # (text, chunk_key, file_path) triples for batch processing; chunks with
    # no recorded origin fall back to "unknown_source".
    chunks_data = [
        (chunk_dp["content"], chunk_key, chunk_dp.get("file_path", "unknown_source"))
        for chunk_key, chunk_dp in ordered_chunks
    ]
    # Process all chunks in one batch call.
    chunk_results = await spacy_extractor.batch_extract_entities(chunks_data)
    # All chunks are processed at once, so progress jumps straight to total.
    processed_chunks = total_chunks
    log_message = f"Chunk {processed_chunks} of {total_chunks} extracted entities with spaCy"
    print(log_message)
    if pipeline_status is not None and pipeline_status_lock is not None:
        async with pipeline_status_lock:
            pipeline_status["latest_message"] = log_message
            pipeline_status["history_messages"].append(log_message)
    return chunk_results