"""
Optimization 3: Performance monitoring for the merging stage.

This optimization adds detailed performance monitoring and resource tracking
to identify bottlenecks and optimize resource usage.
"""
import time
|
|
import asyncio
|
|
import psutil
|
|
import logging
|
|
from typing import Dict, Any, Optional, List
|
|
from dataclasses import dataclass
|
|
from datetime import datetime
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
@dataclass
|
|
class PerformanceMetrics:
|
|
"""Performance metrics for merging stage phase 2"""
|
|
doc_id: str
|
|
start_time: float
|
|
end_time: Optional[float] = None
|
|
duration: Optional[float] = None
|
|
memory_usage_mb: float = 0
|
|
cpu_percent: float = 0
|
|
entities_processed: int = 0
|
|
relationships_processed: int = 0
|
|
graph_nodes_before: int = 0
|
|
graph_nodes_after: int = 0
|
|
graph_edges_before: int = 0
|
|
graph_edges_after: int = 0
|
|
success: bool = False
|
|
error_message: Optional[str] = None
|
|
current_file_number: int = 1
|
|
total_files: int = 1
|
|
file_path: str = ""
|
|
|
|
class PerformanceMonitor:
|
|
"""Monitor and optimize performance for merging stage operations"""
|
|
|
|
def __init__(self):
|
|
self.enabled = True
|
|
self.metrics_history: List[PerformanceMetrics] = []
|
|
self.process = psutil.Process()
|
|
|
|
async def start_merging_stage_phase2(
|
|
self,
|
|
doc_id: str,
|
|
current_file_number: int = 1,
|
|
total_files: int = 1,
|
|
file_path: str = ""
|
|
) -> PerformanceMetrics:
|
|
"""Start performance monitoring for merging stage phase 2"""
|
|
if not self.enabled:
|
|
return None
|
|
|
|
metrics = PerformanceMetrics(
|
|
doc_id=doc_id,
|
|
start_time=time.time(),
|
|
current_file_number=current_file_number,
|
|
total_files=total_files,
|
|
file_path=file_path
|
|
)
|
|
|
|
# Record initial resource usage
|
|
metrics.memory_usage_mb = self.process.memory_info().rss / 1024 / 1024
|
|
metrics.cpu_percent = self.process.cpu_percent()
|
|
|
|
logger.info(f"Performance monitoring started for doc {doc_id}")
|
|
return metrics
|
|
|
|
async def record_success(self, metrics: PerformanceMetrics) -> None:
|
|
"""Record successful completion of merging stage"""
|
|
if not self.enabled or metrics is None:
|
|
return
|
|
|
|
metrics.end_time = time.time()
|
|
metrics.duration = metrics.end_time - metrics.start_time
|
|
metrics.success = True
|
|
|
|
# Record final resource usage
|
|
metrics.memory_usage_mb = self.process.memory_info().rss / 1024 / 1024
|
|
metrics.cpu_percent = self.process.cpu_percent()
|
|
|
|
self.metrics_history.append(metrics)
|
|
|
|
logger.info(
|
|
f"Performance metrics for doc {metrics.doc_id}: "
|
|
f"Duration: {metrics.duration:.2f}s, "
|
|
f"Memory: {metrics.memory_usage_mb:.1f}MB, "
|
|
f"CPU: {metrics.cpu_percent:.1f}%, "
|
|
f"Entities: {metrics.entities_processed}, "
|
|
f"Relationships: {metrics.relationships_processed}"
|
|
)
|
|
|
|
async def record_failure(self, metrics: PerformanceMetrics, error_message: str) -> None:
|
|
"""Record failed merging stage"""
|
|
if not self.enabled or metrics is None:
|
|
return
|
|
|
|
metrics.end_time = time.time()
|
|
metrics.duration = metrics.end_time - metrics.start_time
|
|
metrics.success = False
|
|
metrics.error_message = error_message
|
|
|
|
self.metrics_history.append(metrics)
|
|
|
|
logger.error(
|
|
f"Performance failure for doc {metrics.doc_id}: "
|
|
f"Duration: {metrics.duration:.2f}s, "
|
|
f"Error: {error_message}"
|
|
)
|
|
|
|
def update_graph_stats(
|
|
self,
|
|
metrics: PerformanceMetrics,
|
|
nodes_before: int,
|
|
edges_before: int,
|
|
nodes_after: int,
|
|
edges_after: int
|
|
) -> None:
|
|
"""Update graph statistics in performance metrics"""
|
|
if metrics is None:
|
|
return
|
|
|
|
metrics.graph_nodes_before = nodes_before
|
|
metrics.graph_nodes_after = nodes_after
|
|
metrics.graph_edges_before = edges_before
|
|
metrics.graph_edges_after = edges_after
|
|
|
|
def update_entity_relationship_counts(
|
|
self,
|
|
metrics: PerformanceMetrics,
|
|
entities_processed: int,
|
|
relationships_processed: int
|
|
) -> None:
|
|
"""Update entity and relationship counts"""
|
|
if metrics is None:
|
|
return
|
|
|
|
metrics.entities_processed = entities_processed
|
|
metrics.relationships_processed = relationships_processed
|
|
|
|
def get_performance_summary(self) -> Dict[str, Any]:
|
|
"""Get performance summary across all recorded metrics"""
|
|
if not self.metrics_history:
|
|
return {}
|
|
|
|
successful_runs = [m for m in self.metrics_history if m.success]
|
|
failed_runs = [m for m in self.metrics_history if not m.success]
|
|
|
|
if successful_runs:
|
|
avg_duration = sum(m.duration for m in successful_runs) / len(successful_runs)
|
|
avg_memory = sum(m.memory_usage_mb for m in successful_runs) / len(successful_runs)
|
|
avg_cpu = sum(m.cpu_percent for m in successful_runs) / len(successful_runs)
|
|
avg_entities = sum(m.entities_processed for m in successful_runs) / len(successful_runs)
|
|
avg_relationships = sum(m.relationships_processed for m in successful_runs) / len(successful_runs)
|
|
else:
|
|
avg_duration = avg_memory = avg_cpu = avg_entities = avg_relationships = 0
|
|
|
|
return {
|
|
"total_runs": len(self.metrics_history),
|
|
"successful_runs": len(successful_runs),
|
|
"failed_runs": len(failed_runs),
|
|
"success_rate": len(successful_runs) / len(self.metrics_history) if self.metrics_history else 0,
|
|
"average_duration_seconds": avg_duration,
|
|
"average_memory_mb": avg_memory,
|
|
"average_cpu_percent": avg_cpu,
|
|
"average_entities_processed": avg_entities,
|
|
"average_relationships_processed": avg_relationships,
|
|
"last_run": self.metrics_history[-1].__dict__ if self.metrics_history else None
|
|
}
|
|
|
|
def enable_monitoring(self) -> None:
|
|
"""Enable performance monitoring"""
|
|
self.enabled = True
|
|
logger.info("Performance monitoring enabled")
|
|
|
|
def disable_monitoring(self) -> None:
|
|
"""Disable performance monitoring"""
|
|
self.enabled = False
|
|
logger.info("Performance monitoring disabled")
|
|
|
|
def clear_history(self) -> None:
|
|
"""Clear performance metrics history"""
|
|
self.metrics_history.clear()
|
|
logger.info("Performance metrics history cleared")
|
|
|
|
|
|
# Global performance monitor instance
|
|
performance_monitor = PerformanceMonitor()
|
|
|
|
|
|
async def monitor_merging_stage_phase2(
|
|
doc_id: str,
|
|
current_file_number: int = 1,
|
|
total_files: int = 1,
|
|
file_path: str = ""
|
|
) -> PerformanceMetrics:
|
|
"""Convenience function to start monitoring merging stage phase 2"""
|
|
return await performance_monitor.start_merging_stage_phase2(
|
|
doc_id, current_file_number, total_files, file_path
|
|
)
|
|
|
|
|
|
async def record_merging_success(metrics: PerformanceMetrics) -> None:
|
|
"""Convenience function to record successful merging"""
|
|
await performance_monitor.record_success(metrics)
|
|
|
|
|
|
async def record_merging_failure(metrics: PerformanceMetrics, error_message: str) -> None:
|
|
"""Convenience function to record failed merging"""
|
|
await performance_monitor.record_failure(metrics, error_message)
|
|
|
|
|
|
def get_performance_report() -> Dict[str, Any]:
|
|
"""Get comprehensive performance report"""
|
|
return performance_monitor.get_performance_summary() |