"""
Performance Root Cause Analysis for Image Classification
Identifies bottlenecks and proposes solutions
"""
import os
import time
import subprocess
import json
import psutil
import GPUtil
from typing import Dict, Any, List

class PerformanceAnalyzer:
    """Analyze performance bottlenecks in image classification.

    System metrics are read in-process through ``psutil`` and ``GPUtil``.
    OpenCLIP measurements are taken by launching a separate interpreter
    (``venv_python``) with ``subprocess``. Every ``analyze_*`` method prints
    a short report and returns the same numbers as a dict.
    """

    # Default interpreter of the OpenCLIP virtual environment (Windows venv
    # layout), resolved relative to the current working directory.
    DEFAULT_VENV_PYTHON = "openclip_gpu_env\\Scripts\\python.exe"

    # Program executed by the venv interpreter for the OpenCLIP benchmark.
    # It prints a single JSON object as its last line of output.
    _BENCHMARK_SCRIPT = """
import time
import torch
import open_clip
from PIL import Image
import json

def sync():
    # CUDA kernels are launched asynchronously; wait for them to finish so
    # each timer measures the work itself and not just the kernel launch.
    if torch.cuda.is_available():
        torch.cuda.synchronize()

def benchmark_model():
    results = {}
    use_cuda = torch.cuda.is_available()

    # Test 1: Model loading time
    start_time = time.perf_counter()
    model, _, processor = open_clip.create_model_and_transforms(
        model_name="ViT-B-32",
        pretrained="laion2b_s34b_b79k"
    )
    results['model_loading_time'] = time.perf_counter() - start_time

    # Test 2: GPU transfer time
    start_time = time.perf_counter()
    if use_cuda:
        model = model.half().cuda()
    sync()
    results['gpu_transfer_time'] = time.perf_counter() - start_time

    # Test 3: Text encoding time
    text_labels = ["a photo of a bee", "a photo of a document"]
    start_time = time.perf_counter()
    with torch.no_grad():
        text_tokens = open_clip.tokenize(text_labels)
        if use_cuda:
            text_tokens = text_tokens.cuda()
        text_features = model.encode_text(text_tokens)
        text_features /= text_features.norm(dim=-1, keepdim=True)
    sync()
    results['text_encoding_time'] = time.perf_counter() - start_time

    # Test 4: Single image inference
    # Create a dummy image
    dummy_image = Image.new('RGB', (224, 224), color='red')
    start_time = time.perf_counter()
    image_tensor = processor(dummy_image).unsqueeze(0)
    if use_cuda:
        image_tensor = image_tensor.half().cuda()

    with torch.no_grad():
        image_features = model.encode_image(image_tensor)
        image_features /= image_features.norm(dim=-1, keepdim=True)
        similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)

    sync()
    results['single_inference_time'] = time.perf_counter() - start_time

    # Test 5: 8 images, processed one after another (not a stacked batch)
    start_time = time.perf_counter()
    batch_results = []
    for i in range(8):
        image_tensor = processor(dummy_image).unsqueeze(0)
        if use_cuda:
            image_tensor = image_tensor.half().cuda()

        with torch.no_grad():
            image_features = model.encode_image(image_tensor)
            image_features /= image_features.norm(dim=-1, keepdim=True)
            similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
            batch_results.append(similarity)

    sync()
    results['batch_8_inference_time'] = time.perf_counter() - start_time
    results['per_image_time_batch'] = results['batch_8_inference_time'] / 8

    # System info
    results['cuda_available'] = use_cuda
    if use_cuda:
        results['gpu_name'] = torch.cuda.get_device_name(0)
        results['gpu_memory_allocated'] = torch.cuda.memory_allocated() / (1024**3)
        results['gpu_memory_reserved'] = torch.cuda.memory_reserved() / (1024**3)

    print(json.dumps(results))

benchmark_model()
"""

    def __init__(self, venv_python: str = DEFAULT_VENV_PYTHON):
        """Create an analyzer.

        Args:
            venv_python: Path of the Python interpreter that has torch and
                open_clip installed. Defaults to ``DEFAULT_VENV_PYTHON``.
        """
        self.venv_python = venv_python

    def analyze_system_resources(self) -> Dict[str, Any]:
        """Sample CPU, memory and GPU usage and print a short report.

        Returns:
            Dict with ``cpu_usage`` and ``memory_usage`` (percent),
            ``memory_available_gb``, and the GPU keys ``gpu_usage``,
            ``gpu_memory_usage`` (percent) and ``gpu_memory_available``
            (printed as MB). The GPU keys are 0 when no GPU is found or the
            GPU query fails.
        """
        print("🔍 ANALYZING SYSTEM RESOURCES")
        print("-" * 40)

        resources: Dict[str, Any] = {}

        # CPU usage, sampled over a one-second window
        cpu_percent = psutil.cpu_percent(interval=1)
        resources['cpu_usage'] = cpu_percent
        print(f"CPU Usage: {cpu_percent}%")

        # Memory usage
        memory = psutil.virtual_memory()
        available_gb = memory.available / (1024**3)
        resources['memory_usage'] = memory.percent
        resources['memory_available_gb'] = available_gb
        print(f"Memory Usage: {memory.percent}% ({available_gb:.1f} GB available)")

        # GPU usage. The same three keys are written on every path so
        # callers can rely on a stable result schema.
        no_gpu = {'gpu_usage': 0, 'gpu_memory_usage': 0, 'gpu_memory_available': 0}
        try:
            gpus = GPUtil.getGPUs()
            if gpus:
                gpu = gpus[0]  # only the first GPU is reported
                resources['gpu_usage'] = gpu.load * 100
                resources['gpu_memory_usage'] = gpu.memoryUtil * 100
                resources['gpu_memory_available'] = gpu.memoryFree
                print(f"GPU Usage: {gpu.load * 100:.1f}%")
                print(f"GPU Memory: {gpu.memoryUtil * 100:.1f}% ({gpu.memoryFree} MB free)")
            else:
                resources.update(no_gpu)
                print("No GPU detected")
        except Exception as e:
            # Best effort: a broken GPU query must not abort the analysis.
            print(f"GPU analysis failed: {e}")
            resources.update(no_gpu)

        return resources

    def analyze_openclip_performance(self) -> Dict[str, Any]:
        """Run the OpenCLIP benchmark script in the virtual environment.

        Returns:
            On success, the timings reported by the script plus
            ``total_execution_time`` and ``available=True``. On failure,
            ``{"available": False}``, with an ``error`` entry unless the
            interpreter was simply not found.
        """
        print("\n🔍 ANALYZING OPENCLIP PERFORMANCE")
        print("-" * 40)

        if not os.path.exists(self.venv_python):
            print("❌ OpenCLIP virtual environment not found")
            return {"available": False}

        try:
            start_time = time.perf_counter()
            result = subprocess.run(
                [self.venv_python, "-c", self._BENCHMARK_SCRIPT],
                capture_output=True, text=True, timeout=120,
            )
            total_time = time.perf_counter() - start_time

            if result.returncode != 0:
                print(f"❌ OpenCLIP benchmark failed: {result.stderr}")
                return {"available": False, "error": result.stderr}

            # The script prints its JSON last; anything a library printed
            # before it on stdout is skipped instead of breaking the parse.
            output_lines = [line for line in result.stdout.splitlines() if line.strip()]
            if not output_lines:
                raise ValueError("benchmark produced no output")
            performance_data = json.loads(output_lines[-1])
            performance_data['total_execution_time'] = total_time

            print(f"✅ OpenCLIP Performance Analysis:")
            print(f"   Model Loading: {performance_data['model_loading_time']:.3f}s")
            print(f"   GPU Transfer: {performance_data['gpu_transfer_time']:.3f}s")
            print(f"   Text Encoding: {performance_data['text_encoding_time']:.3f}s")
            print(f"   Single Inference: {performance_data['single_inference_time']:.3f}s")
            print(f"   Batch (8) Inference: {performance_data['batch_8_inference_time']:.3f}s")
            print(f"   Per Image (Batch): {performance_data['per_image_time_batch']:.3f}s")
            print(f"   Total Execution: {total_time:.3f}s")

            if performance_data.get('cuda_available'):
                print(f"   GPU: {performance_data.get('gpu_name', 'Unknown')}")
                print(f"   GPU Memory Allocated: {performance_data.get('gpu_memory_allocated', 0):.2f} GB")

            performance_data['available'] = True
            return performance_data

        except subprocess.TimeoutExpired:
            print("❌ OpenCLIP benchmark timed out (120s)")
            return {"available": False, "error": "Timeout"}
        except Exception as e:
            # Best effort: launch errors and malformed output are reported
            # to the caller instead of aborting the whole analysis.
            print(f"❌ OpenCLIP benchmark failed: {e}")
            return {"available": False, "error": str(e)}

    def _time_subprocess(self, code: str, runs: int, timeout: float) -> float:
        """Return the mean wall-clock seconds needed to run ``code``.

        ``code`` is executed ``runs`` times with ``venv_python -c``.

        Raises:
            subprocess.TimeoutExpired: A run exceeded ``timeout`` seconds.
            OSError: The interpreter could not be started.
            RuntimeError: A run exited with a non-zero status, which would
                make the measured time meaningless.
        """
        started = time.perf_counter()
        for _ in range(runs):
            completed = subprocess.run(
                [self.venv_python, "-c", code],
                capture_output=True, text=True, timeout=timeout,
            )
            if completed.returncode != 0:
                raise RuntimeError(
                    completed.stderr.strip() or f"exit code {completed.returncode}"
                )
        return (time.perf_counter() - started) / runs

    def analyze_subprocess_overhead(self) -> Dict[str, Any]:
        """Measure how long it takes to start the venv interpreter.

        Three programs are timed: a bare ``print``, ``import torch``, and
        creating the ViT-B-32 model.

        Returns:
            On success, ``simple_subprocess``, ``import_torch`` and
            ``full_model_load`` (mean seconds per run) plus
            ``available=True``. On failure, ``{"available": False}``, with
            an ``error`` entry unless the interpreter was not found.
        """
        print("\n🔍 ANALYZING SUBPROCESS OVERHEAD")
        print("-" * 40)

        # Without this guard a missing interpreter raises FileNotFoundError.
        if not os.path.exists(self.venv_python):
            print("❌ OpenCLIP virtual environment not found")
            return {"available": False}

        # (result key, report label, code, repetitions, per-run timeout in s)
        tests = [
            ("simple_subprocess", "Simple subprocess", "print('test')", 5, 30),
            ("import_torch", "Import torch", "import torch; print('loaded')", 3, 30),
            (
                "full_model_load",
                "Full model load",
                "import torch, open_clip; model, _, _ = open_clip.create_model_and_transforms('ViT-B-32'); print('loaded')",
                2,
                60,
            ),
        ]

        overhead: Dict[str, Any] = {}
        try:
            for key, _label, code, runs, timeout in tests:
                overhead[key] = self._time_subprocess(code, runs, timeout)
        except subprocess.TimeoutExpired:
            print("❌ Subprocess overhead test timed out")
            return {"available": False, "error": "Timeout"}
        except (OSError, RuntimeError) as e:
            print(f"❌ Subprocess overhead test failed: {e}")
            return {"available": False, "error": str(e)}

        print("Subprocess Overhead Analysis:")
        for key, label, _code, _runs, _timeout in tests:
            print(f"   {label}: {overhead[key]:.3f}s")

        overhead["available"] = True
        return overhead

    def identify_bottlenecks(self, system_resources: Dict, openclip_perf: Dict, subprocess_overhead: Dict) -> List[str]:
        """Compare the measurements with fixed thresholds and print the findings.

        Args:
            system_resources: Result of ``analyze_system_resources``.
            openclip_perf: Result of ``analyze_openclip_performance``; its
                timings are only checked when ``available`` is truthy.
            subprocess_overhead: Result of ``analyze_subprocess_overhead``.

        Returns:
            One message per exceeded threshold, or a single "no obvious
            bottlenecks" message when nothing was exceeded. Missing keys
            count as 0.
        """
        print("\n🔍 IDENTIFYING BOTTLENECKS")
        print("-" * 40)

        # (key, threshold in percent, message)
        resource_checks = [
            ('cpu_usage', 80, "High CPU usage may be slowing down processing"),
            ('memory_usage', 80, "High memory usage may cause swapping"),
            ('gpu_usage', 90, "GPU is heavily utilized"),
        ]
        bottlenecks = [
            message
            for key, limit, message in resource_checks
            if system_resources.get(key, 0) > limit
        ]

        # (key, threshold in seconds, message)
        if openclip_perf.get('available'):
            timing_checks = [
                ('model_loading_time', 10, "Slow model loading (>10s)"),
                ('single_inference_time', 1, "Slow single image inference (>1s)"),
                ('per_image_time_batch', 0.5, "Slow batch inference per image (>0.5s)"),
            ]
            bottlenecks.extend(
                message
                for key, limit, message in timing_checks
                if openclip_perf.get(key, 0) > limit
            )

        if subprocess_overhead.get('full_model_load', 0) > 15:
            bottlenecks.append("High subprocess overhead for model loading")

        if not bottlenecks:
            bottlenecks.append("No obvious bottlenecks detected - may be I/O or network related")

        print("Identified Bottlenecks:")
        for i, bottleneck in enumerate(bottlenecks, 1):
            print(f"   {i}. {bottleneck}")

        return bottlenecks

    def propose_solutions(self, bottlenecks: List[str]) -> List[str]:
        """Map each bottleneck to a recommendation and print the list.

        Args:
            bottlenecks: Messages as produced by ``identify_bottlenecks``.

        Returns:
            One recommendation per bottleneck (a generic one when no known
            pattern matches), followed by six general recommendations.
        """
        print("\n💡 PROPOSED SOLUTIONS")
        print("-" * 40)

        # Matched case-insensitively as a substring of the bottleneck
        # message; the first matching entry wins.
        solution_map = {
            "High CPU usage": "Optimize CPU-intensive operations, consider threading",
            "High memory usage": "Reduce batch size, clear cache between operations",
            "GPU is heavily utilized": "Monitor GPU usage, consider model quantization",
            "Slow model loading": "Implement model caching or persistent process",
            "Slow single image inference": "Use smaller model, optimize preprocessing",
            "Slow batch inference": "Increase batch size, optimize GPU memory usage",
            "High subprocess overhead": "Use persistent classifier process, reduce subprocess calls"
        }

        solutions = []
        for bottleneck in bottlenecks:
            lowered = bottleneck.lower()
            matches = (fix for key, fix in solution_map.items() if key.lower() in lowered)
            solutions.append(next(matches, "Investigate specific performance issue"))

        # Additional general solutions
        solutions.extend([
            "Implement model warm-up to pre-load models",
            # ViT-B-32 is the lighter of the two: a 32-pixel patch yields
            # fewer tokens per image than a 16-pixel patch.
            "Prefer the lighter OpenCLIP ViT-B-32 over ViT-B-16 (larger patches mean fewer tokens per image)",
            "Implement async processing for image classification",
            "Cache classification results for repeated images",
            "Use image downscaling for faster processing",
            "Implement progressive loading - classify while extracting"
        ])

        print("Recommended Solutions:")
        for i, solution in enumerate(solutions, 1):
            print(f"   {i}. {solution}")

        return solutions

def main():
    """Run every analysis step in order, then print a closing summary."""
    print("🚀 PERFORMANCE ROOT CAUSE ANALYSIS")
    print("=" * 50)

    analyzer = PerformanceAnalyzer()

    # Measurements
    resources = analyzer.analyze_system_resources()
    clip_stats = analyzer.analyze_openclip_performance()
    overhead = analyzer.analyze_subprocess_overhead()

    # Diagnosis and recommendations derived from the measurements
    found = analyzer.identify_bottlenecks(resources, clip_stats, overhead)
    fixes = analyzer.propose_solutions(found)

    cpu_pct = resources.get('cpu_usage', 0)
    mem_pct = resources.get('memory_usage', 0)
    clip_ok = clip_stats.get('available', False)

    print("\n📊 PERFORMANCE SUMMARY")
    print("=" * 50)
    print(f"System Resources: CPU {cpu_pct}%, Memory {mem_pct}%")
    print(f"OpenCLIP Available: {clip_ok}")
    if clip_ok:
        # Inference timings only exist when the benchmark actually ran.
        single_s = clip_stats.get('single_inference_time', 0)
        per_image_s = clip_stats.get('per_image_time_batch', 0)
        print(f"Single Inference Time: {single_s:.3f}s")
        print(f"Batch Per Image Time: {per_image_s:.3f}s")
    print(f"Bottlenecks Identified: {len(found)}")
    print(f"Solutions Proposed: {len(fixes)}")

# Run the full analysis only when executed as a script, not when imported.
if __name__ == "__main__":
    main()