"""
spaCy Entity Extractor for LightRAG

Replaces LLM-based entity extraction to dramatically reduce indexing time
"""
|
|
|
|
import os
|
|
import time
|
|
import asyncio
|
|
from typing import List, Dict, Any, Tuple
|
|
from collections import defaultdict
|
|
|
|
try:
|
|
import spacy
|
|
HAS_SPACY = True
|
|
except ImportError:
|
|
HAS_SPACY = False
|
|
|
|
class SpacyEntityExtractor:
    """Local entity extraction using spaCy to replace LLM calls."""

    # spaCy NER label -> LightRAG entity type. Defined once at class level
    # so map_entity_type does not rebuild the dict on every call.
    _LABEL_MAP = {
        'PERSON': 'Person',
        'ORG': 'Organization',
        'GPE': 'Location',
        'LOC': 'Location',
        'EVENT': 'Event',
        'WORK_OF_ART': 'Artifact',
        'LAW': 'Concept',
        'LANGUAGE': 'Concept',
        'DATE': 'Concept',
        'TIME': 'Concept',
        'PERCENT': 'Data',
        'MONEY': 'Data',
        'QUANTITY': 'Data',
        'ORDINAL': 'Data',
        'CARDINAL': 'Data',
        'PRODUCT': 'Artifact',
        'FAC': 'Location',
        'NORP': 'Organization',  # Nationalities, religious, political groups
    }

    def __init__(self):
        self.setup_spacy_model()
        # Wall-clock timings (seconds) of the most recent operations,
        # keyed by operation name; overwritten on each call.
        self.performance_stats = {}

    def setup_spacy_model(self):
        """Initialize the spaCy model used for entity extraction.

        Raises:
            ImportError: If spaCy is not installed.
            OSError: If the en_core_web_sm model is not downloaded.
        """
        if not HAS_SPACY:
            raise ImportError(
                "spaCy is required. Install with: "
                "pip install spacy && python -m spacy download en_core_web_sm"
            )

        try:
            self.nlp = spacy.load("en_core_web_sm")
            print("✅ spaCy model loaded successfully")
        except OSError:
            print("❌ spaCy model not found. Download with: python -m spacy download en_core_web_sm")
            raise

    def map_entity_type(self, spacy_label: str) -> str:
        """Map a spaCy entity label to a LightRAG entity type.

        Unknown labels fall back to 'Concept'.
        """
        return self._LABEL_MAP.get(spacy_label, 'Concept')

    async def extract_entities_and_relations(self, text: str, chunk_key: str, file_path: str = "unknown_source") -> Tuple[Dict, Dict]:
        """
        Extract entities from text using spaCy and format for LightRAG.

        Args:
            text: Raw chunk text to run NER over.
            chunk_key: Identifier of the source chunk (stored as source_id).
            file_path: Originating file path recorded on each entity.

        Returns:
            Tuple of (entities_dict, relations_dict) in LightRAG format.
            relations_dict is always empty here: spaCy NER yields entity
            spans only, no relations.
        """
        start_time = time.time()

        # Run the full spaCy pipeline; entities come from doc.ents.
        doc = self.nlp(text)

        maybe_nodes = defaultdict(list)
        maybe_edges = defaultdict(list)  # kept for interface parity; never populated

        for ent in doc.ents:
            entity_type = self.map_entity_type(ent.label_)

            # Generic description: spaCy provides only span text + label,
            # not an LLM-style summary.
            entity_description = f"{ent.text} is a {entity_type.lower()} mentioned in the text."

            entity_data = {
                "entity_name": ent.text,
                "entity_type": entity_type,
                "description": entity_description,
                "source_id": chunk_key,
                "file_path": file_path,
                "timestamp": int(time.time()),
            }

            # Mentions with identical surface text are grouped under one key.
            maybe_nodes[ent.text].append(entity_data)

        extraction_time = time.time() - start_time
        self.performance_stats['spacy_extraction'] = extraction_time

        print(f"🔍 Extracted {len(maybe_nodes)} entities in {extraction_time:.3f}s")

        return dict(maybe_nodes), dict(maybe_edges)

    async def batch_extract_entities(self, chunks_data: List[Tuple[str, str, str]]) -> List[Tuple[Dict, Dict]]:
        """
        Extract entities from multiple chunks sequentially.

        Args:
            chunks_data: List of tuples (text, chunk_key, file_path).

        Returns:
            List of (entities_dict, relations_dict) for each chunk, in
            input order.
        """
        start_time = time.time()

        results = [
            await self.extract_entities_and_relations(text, chunk_key, file_path)
            for text, chunk_key, file_path in chunks_data
        ]

        batch_time = time.time() - start_time
        self.performance_stats['batch_extraction'] = batch_time

        print(f"📚 Batch extracted entities from {len(chunks_data)} chunks in {batch_time:.2f}s")
        return results
|
|
|
|
# Module-level singleton, created lazily on first use
_spacy_extractor = None


def get_spacy_extractor() -> SpacyEntityExtractor:
    """Return the shared SpacyEntityExtractor, creating it on first call."""
    global _spacy_extractor
    extractor = _spacy_extractor
    if extractor is None:
        extractor = _spacy_extractor = SpacyEntityExtractor()
    return extractor
|
|
|
|
async def extract_entities_spacy(
    chunks: Dict[str, Any],
    global_config: Dict[str, str],
    pipeline_status: Dict = None,
    pipeline_status_lock = None,
    text_chunks_storage: Any = None,
) -> List:
    """
    spaCy-based entity extraction for LightRAG.

    This function replaces the LLM-based entity extraction to dramatically
    reduce indexing time.

    Args:
        chunks: Mapping of chunk_key -> chunk data; each value must contain
            "content" and may contain "file_path".
        global_config: Global configuration dictionary (unused here; kept
            for signature compatibility with the LLM-based extractor).
        pipeline_status: Optional mutable status dict; when provided along
            with the lock, "latest_message" is set and the message is
            appended to its "history_messages" list (assumed to exist —
            TODO confirm callers always pre-populate it).
        pipeline_status_lock: Async lock guarding pipeline_status updates.
        text_chunks_storage: Unused; kept for signature compatibility.

    Returns:
        List of (entities_dict, relations_dict) tuples, one per chunk, for
        processing in merge_nodes_and_edges.
    """
    print("🚀 Using spaCy for entity extraction (faster than LLM)")

    ordered_chunks = list(chunks.items())
    total_chunks = len(ordered_chunks)

    # Shared extractor instance (spaCy model is loaded once per process)
    spacy_extractor = get_spacy_extractor()

    # (text, chunk_key, file_path) triples for batch processing
    chunks_data = [
        (chunk_dp["content"], chunk_key, chunk_dp.get("file_path", "unknown_source"))
        for chunk_key, chunk_dp in ordered_chunks
    ]

    chunk_results = await spacy_extractor.batch_extract_entities(chunks_data)

    # The whole batch completes in one call, so report completion directly
    processed_chunks = total_chunks
    log_message = f"Chunk {processed_chunks} of {total_chunks} extracted entities with spaCy"
    print(log_message)

    if pipeline_status is not None and pipeline_status_lock is not None:
        async with pipeline_status_lock:
            pipeline_status["latest_message"] = log_message
            pipeline_status["history_messages"].append(log_message)

    return chunk_results