""" Optimization 3: Performance monitoring for merging stage This optimization adds detailed performance monitoring and resource tracking to identify bottlenecks and optimize resource usage. """ import time import asyncio import psutil import logging from typing import Dict, Any, Optional, List from dataclasses import dataclass from datetime import datetime logger = logging.getLogger(__name__) @dataclass class PerformanceMetrics: """Performance metrics for merging stage phase 2""" doc_id: str start_time: float end_time: Optional[float] = None duration: Optional[float] = None memory_usage_mb: float = 0 cpu_percent: float = 0 entities_processed: int = 0 relationships_processed: int = 0 graph_nodes_before: int = 0 graph_nodes_after: int = 0 graph_edges_before: int = 0 graph_edges_after: int = 0 success: bool = False error_message: Optional[str] = None current_file_number: int = 1 total_files: int = 1 file_path: str = "" class PerformanceMonitor: """Monitor and optimize performance for merging stage operations""" def __init__(self): self.enabled = True self.metrics_history: List[PerformanceMetrics] = [] self.process = psutil.Process() async def start_merging_stage_phase2( self, doc_id: str, current_file_number: int = 1, total_files: int = 1, file_path: str = "" ) -> PerformanceMetrics: """Start performance monitoring for merging stage phase 2""" if not self.enabled: return None metrics = PerformanceMetrics( doc_id=doc_id, start_time=time.time(), current_file_number=current_file_number, total_files=total_files, file_path=file_path ) # Record initial resource usage metrics.memory_usage_mb = self.process.memory_info().rss / 1024 / 1024 metrics.cpu_percent = self.process.cpu_percent() logger.info(f"Performance monitoring started for doc {doc_id}") return metrics async def record_success(self, metrics: PerformanceMetrics) -> None: """Record successful completion of merging stage""" if not self.enabled or metrics is None: return metrics.end_time = time.time() metrics.duration = metrics.end_time - metrics.start_time metrics.success = True # Record final resource usage metrics.memory_usage_mb = self.process.memory_info().rss / 1024 / 1024 metrics.cpu_percent = self.process.cpu_percent() self.metrics_history.append(metrics) logger.info( f"Performance metrics for doc {metrics.doc_id}: " f"Duration: {metrics.duration:.2f}s, " f"Memory: {metrics.memory_usage_mb:.1f}MB, " f"CPU: {metrics.cpu_percent:.1f}%, " f"Entities: {metrics.entities_processed}, " f"Relationships: {metrics.relationships_processed}" ) async def record_failure(self, metrics: PerformanceMetrics, error_message: str) -> None: """Record failed merging stage""" if not self.enabled or metrics is None: return metrics.end_time = time.time() metrics.duration = metrics.end_time - metrics.start_time metrics.success = False metrics.error_message = error_message self.metrics_history.append(metrics) logger.error( f"Performance failure for doc {metrics.doc_id}: " f"Duration: {metrics.duration:.2f}s, " f"Error: {error_message}" ) def update_graph_stats( self, metrics: PerformanceMetrics, nodes_before: int, edges_before: int, nodes_after: int, edges_after: int ) -> None: """Update graph statistics in performance metrics""" if metrics is None: return metrics.graph_nodes_before = nodes_before metrics.graph_nodes_after = nodes_after metrics.graph_edges_before = edges_before metrics.graph_edges_after = edges_after def update_entity_relationship_counts( self, metrics: PerformanceMetrics, entities_processed: int, relationships_processed: int ) -> None: """Update entity and relationship counts""" if metrics is None: return metrics.entities_processed = entities_processed metrics.relationships_processed = relationships_processed def get_performance_summary(self) -> Dict[str, Any]: """Get performance summary across all recorded metrics""" if not self.metrics_history: return {} successful_runs = [m for m in self.metrics_history if m.success] failed_runs = [m for m in self.metrics_history if not m.success] if successful_runs: avg_duration = sum(m.duration for m in successful_runs) / len(successful_runs) avg_memory = sum(m.memory_usage_mb for m in successful_runs) / len(successful_runs) avg_cpu = sum(m.cpu_percent for m in successful_runs) / len(successful_runs) avg_entities = sum(m.entities_processed for m in successful_runs) / len(successful_runs) avg_relationships = sum(m.relationships_processed for m in successful_runs) / len(successful_runs) else: avg_duration = avg_memory = avg_cpu = avg_entities = avg_relationships = 0 return { "total_runs": len(self.metrics_history), "successful_runs": len(successful_runs), "failed_runs": len(failed_runs), "success_rate": len(successful_runs) / len(self.metrics_history) if self.metrics_history else 0, "average_duration_seconds": avg_duration, "average_memory_mb": avg_memory, "average_cpu_percent": avg_cpu, "average_entities_processed": avg_entities, "average_relationships_processed": avg_relationships, "last_run": self.metrics_history[-1].__dict__ if self.metrics_history else None } def enable_monitoring(self) -> None: """Enable performance monitoring""" self.enabled = True logger.info("Performance monitoring enabled") def disable_monitoring(self) -> None: """Disable performance monitoring""" self.enabled = False logger.info("Performance monitoring disabled") def clear_history(self) -> None: """Clear performance metrics history""" self.metrics_history.clear() logger.info("Performance metrics history cleared") # Global performance monitor instance performance_monitor = PerformanceMonitor() async def monitor_merging_stage_phase2( doc_id: str, current_file_number: int = 1, total_files: int = 1, file_path: str = "" ) -> PerformanceMetrics: """Convenience function to start monitoring merging stage phase 2""" return await performance_monitor.start_merging_stage_phase2( doc_id, current_file_number, total_files, file_path ) async def record_merging_success(metrics: PerformanceMetrics) -> None: """Convenience function to record successful merging""" await performance_monitor.record_success(metrics) async def record_merging_failure(metrics: PerformanceMetrics, error_message: str) -> None: """Convenience function to record failed merging""" await performance_monitor.record_failure(metrics, error_message) def get_performance_report() -> Dict[str, Any]: """Get comprehensive performance report""" return performance_monitor.get_performance_summary()