"""
Performance Root Cause Analysis for Image Classification

Identifies bottlenecks and proposes solutions.

The analysis has three measurement stages (host resources, an OpenCLIP
benchmark executed inside a dedicated virtual environment, and subprocess
start-up overhead) followed by a rule-based bottleneck report.
"""

import json
import os
import subprocess
import time
from typing import Any, Dict, List

# psutil and GPUtil only feed the host-resource snapshot.  They are treated as
# optional so that the remaining stages still run when they are not installed.
try:
    import psutil
except ImportError:
    psutil = None

try:
    import GPUtil
except ImportError:
    GPUtil = None

# NOTE(review): the leading glyphs in several banner strings below (for
# example the one in front of "ANALYZING SYSTEM RESOURCES") look like emoji
# that were decoded with the wrong code page.  They are runtime output and are
# deliberately left untouched here - confirm against the file on disk.

# Interpreter of the virtual environment that has torch/open_clip installed.
DEFAULT_VENV_PYTHON = "openclip_gpu_env\\Scripts\\python.exe"

# Script executed with ``<venv python> -c``.  It prints exactly one JSON object
# (the timing results) as its last line of stdout.
_BENCHMARK_SCRIPT = """
import time
import torch
import open_clip
from PIL import Image
import json

def benchmark_model():
    results = {}
    use_cuda = torch.cuda.is_available()

    def elapsed_since(start_time):
        # CUDA kernels are launched asynchronously: wait for them to finish
        # before reading the clock, otherwise only the launch is measured.
        if use_cuda:
            torch.cuda.synchronize()
        return time.perf_counter() - start_time

    # Test 1: Model loading time
    start_time = time.perf_counter()
    model, _, processor = open_clip.create_model_and_transforms(
        model_name="ViT-B-32",
        pretrained="laion2b_s34b_b79k"
    )
    model.eval()
    results['model_loading_time'] = time.perf_counter() - start_time

    # Test 2: GPU transfer time
    start_time = time.perf_counter()
    if use_cuda:
        model = model.half().cuda()
    results['gpu_transfer_time'] = elapsed_since(start_time)

    # Test 3: Text encoding time
    text_labels = ["a photo of a bee", "a photo of a document"]
    start_time = time.perf_counter()
    with torch.no_grad():
        text_tokens = open_clip.tokenize(text_labels)
        if use_cuda:
            text_tokens = text_tokens.cuda()
        text_features = model.encode_text(text_tokens)
        text_features /= text_features.norm(dim=-1, keepdim=True)
    results['text_encoding_time'] = elapsed_since(start_time)

    # Test 4: Single image inference
    # Create a dummy image
    dummy_image = Image.new('RGB', (224, 224), color='red')

    start_time = time.perf_counter()
    image_tensor = processor(dummy_image).unsqueeze(0)
    if use_cuda:
        image_tensor = image_tensor.half().cuda()
    with torch.no_grad():
        image_features = model.encode_image(image_tensor)
        image_features /= image_features.norm(dim=-1, keepdim=True)
        similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
    results['single_inference_time'] = elapsed_since(start_time)

    # Test 5: Batch inference (8 images in ONE forward pass)
    start_time = time.perf_counter()
    batch_tensor = torch.stack([processor(dummy_image) for _ in range(8)])
    if use_cuda:
        batch_tensor = batch_tensor.half().cuda()
    with torch.no_grad():
        batch_features = model.encode_image(batch_tensor)
        batch_features /= batch_features.norm(dim=-1, keepdim=True)
        batch_similarity = (100.0 * batch_features @ text_features.T).softmax(dim=-1)
    results['batch_8_inference_time'] = elapsed_since(start_time)
    results['per_image_time_batch'] = results['batch_8_inference_time'] / 8

    # System info
    results['cuda_available'] = use_cuda
    if use_cuda:
        results['gpu_name'] = torch.cuda.get_device_name(0)
        results['gpu_memory_allocated'] = torch.cuda.memory_allocated() / (1024**3)
        results['gpu_memory_reserved'] = torch.cuda.memory_reserved() / (1024**3)

    print(json.dumps(results))

benchmark_model()
"""


class PerformanceAnalyzer:
    """Analyze performance bottlenecks in image classification"""

    def __init__(self, venv_python: str = DEFAULT_VENV_PYTHON):
        """Create an analyzer.

        Args:
            venv_python: Path of the Python interpreter that has torch and
                open_clip installed.  Defaults to the Windows virtual
                environment layout used by this project.
        """
        self.venv_python = venv_python

    def analyze_system_resources(self) -> Dict[str, Any]:
        """Analyze system resource usage.

        Returns:
            A dict with ``cpu_usage``, ``memory_usage`` and
            ``memory_available_gb`` (only when psutil is importable) plus
            ``gpu_usage``, ``gpu_memory_usage`` and ``gpu_memory_available``
            (zero when no GPU could be queried).
        """
        print("šŸ” ANALYZING SYSTEM RESOURCES")
        print("-" * 40)

        resources: Dict[str, Any] = {}

        if psutil is None:
            print("psutil not installed - skipping CPU/memory analysis")
        else:
            # CPU usage (blocks for the one second sampling interval)
            cpu_percent = psutil.cpu_percent(interval=1)
            resources['cpu_usage'] = cpu_percent
            print(f"CPU Usage: {cpu_percent}%")

            # Memory usage
            memory = psutil.virtual_memory()
            available_gb = memory.available / (1024**3)
            resources['memory_usage'] = memory.percent
            resources['memory_available_gb'] = available_gb
            print(f"Memory Usage: {memory.percent}% ({available_gb:.1f} GB available)")

        # GPU usage: start from the "nothing measured" values so every exit
        # path reports the same set of keys.
        resources['gpu_usage'] = 0
        resources['gpu_memory_usage'] = 0
        resources['gpu_memory_available'] = 0
        if GPUtil is None:
            print("GPUtil not installed - skipping GPU analysis")
            return resources

        try:
            gpus = GPUtil.getGPUs()
        except Exception as e:
            # Best effort: GPUtil shells out to nvidia-smi and can fail in
            # several ways; a missing GPU reading must not abort the analysis.
            print(f"GPU analysis failed: {e}")
            return resources

        if not gpus:
            print("No GPU detected")
            return resources

        gpu = gpus[0]  # only the first device is inspected
        resources['gpu_usage'] = gpu.load * 100
        resources['gpu_memory_usage'] = gpu.memoryUtil * 100
        resources['gpu_memory_available'] = gpu.memoryFree
        print(f"GPU Usage: {gpu.load * 100:.1f}%")
        print(f"GPU Memory: {gpu.memoryUtil * 100:.1f}% ({gpu.memoryFree} MB free)")
        return resources

    def analyze_openclip_performance(self) -> Dict[str, Any]:
        """Analyze OpenCLIP performance in virtual environment.

        Runs the embedded benchmark script with ``self.venv_python`` and
        parses the JSON object it prints.

        Returns:
            The benchmark timings plus ``total_execution_time`` and
            ``available=True`` on success; otherwise ``{"available": False}``
            with an ``error`` entry when a reason is known.
        """
        print("\nšŸ” ANALYZING OPENCLIP PERFORMANCE")
        print("-" * 40)

        if not os.path.exists(self.venv_python):
            print("āŒ OpenCLIP virtual environment not found")
            return {"available": False}

        try:
            start_time = time.perf_counter()
            result = subprocess.run(
                [self.venv_python, "-c", _BENCHMARK_SCRIPT],
                capture_output=True,
                text=True,
                timeout=120,
            )
            total_time = time.perf_counter() - start_time

            if result.returncode != 0:
                print(f"āŒ OpenCLIP benchmark failed: {result.stderr}")
                return {"available": False, "error": result.stderr}

            # Libraries may write progress/warnings to stdout before the
            # benchmark prints its JSON, so only the last line is parsed.
            output_lines = [line for line in result.stdout.splitlines() if line.strip()]
            if not output_lines:
                raise ValueError("benchmark produced no output")
            performance_data = json.loads(output_lines[-1])
            performance_data['total_execution_time'] = total_time

            print("āœ… OpenCLIP Performance Analysis:")
            print(f" Model Loading: {performance_data['model_loading_time']:.3f}s")
            print(f" GPU Transfer: {performance_data['gpu_transfer_time']:.3f}s")
            print(f" Text Encoding: {performance_data['text_encoding_time']:.3f}s")
            print(f" Single Inference: {performance_data['single_inference_time']:.3f}s")
            print(f" Batch (8) Inference: {performance_data['batch_8_inference_time']:.3f}s")
            print(f" Per Image (Batch): {performance_data['per_image_time_batch']:.3f}s")
            print(f" Total Execution: {total_time:.3f}s")

            if performance_data.get('cuda_available'):
                print(f" GPU: {performance_data.get('gpu_name', 'Unknown')}")
                print(f" GPU Memory Allocated: {performance_data.get('gpu_memory_allocated', 0):.2f} GB")

            performance_data['available'] = True
            return performance_data

        except subprocess.TimeoutExpired:
            print("āŒ OpenCLIP benchmark timed out (120s)")
            return {"available": False, "error": "Timeout"}
        except (OSError, ValueError, KeyError, TypeError) as e:
            # OSError: interpreter could not be launched.
            # ValueError: undecodable or non-JSON output.
            # KeyError/TypeError: JSON without the expected timing fields.
            print(f"āŒ OpenCLIP benchmark failed: {e}")
            return {"available": False, "error": str(e)}

    def _average_subprocess_time(self, code: str, runs: int, timeout: float) -> float:
        """Return the mean wall-clock seconds of ``runs`` ``python -c code`` calls."""
        started = time.perf_counter()
        for _ in range(runs):
            completed = subprocess.run(
                [self.venv_python, "-c", code],
                capture_output=True,
                text=True,
                timeout=timeout,
            )
            if completed.returncode != 0:
                # A failing child still costs time, but the number no longer
                # means what its label says - make that visible.
                print(f" Warning: probe exited with code {completed.returncode}: {completed.stderr.strip()}")
        return (time.perf_counter() - started) / runs

    def analyze_subprocess_overhead(self) -> Dict[str, Any]:
        """Analyze subprocess communication overhead.

        Times three probes of increasing weight in the virtual environment
        (bare interpreter start, ``import torch``, full model construction).

        Returns:
            ``simple_subprocess``, ``import_torch`` and ``full_model_load``
            (average seconds per call) on success.  When the interpreter is
            missing, cannot be launched or a probe times out, returns
            ``{"available": False, ...}`` instead of raising.
        """
        print("\nšŸ” ANALYZING SUBPROCESS OVERHEAD")
        print("-" * 40)

        if not os.path.exists(self.venv_python):
            print("OpenCLIP virtual environment not found - skipping subprocess overhead analysis")
            return {"available": False}

        # (label, result key, code, repetitions, per-call timeout in seconds)
        probes = [
            ("Simple subprocess", "simple_subprocess", "print('test')", 5, 30),
            ("Import torch", "import_torch", "import torch; print('loaded')", 3, 30),
            (
                "Full model load",
                "full_model_load",
                "import torch, open_clip; model, _, _ = open_clip.create_model_and_transforms('ViT-B-32'); print('loaded')",
                2,
                60,
            ),
        ]

        overhead: Dict[str, Any] = {}
        overhead_tests = []
        try:
            for test_name, key, code, runs, timeout in probes:
                average = self._average_subprocess_time(code, runs, timeout)
                overhead[key] = average
                overhead_tests.append((test_name, average))
        except (subprocess.TimeoutExpired, OSError) as e:
            print(f"Subprocess overhead analysis failed: {e}")
            return {"available": False, "error": str(e)}

        print("Subprocess Overhead Analysis:")
        for test_name, average in overhead_tests:
            print(f" {test_name}: {average:.3f}s")

        return overhead

    def identify_bottlenecks(self, system_resources: Dict, openclip_perf: Dict, subprocess_overhead: Dict) -> List[str]:
        """Identify performance bottlenecks.

        Args:
            system_resources: Result of ``analyze_system_resources``.
            openclip_perf: Result of ``analyze_openclip_performance``.
            subprocess_overhead: Result of ``analyze_subprocess_overhead``.

        Returns:
            One message per threshold that was exceeded, or a single
            "no obvious bottlenecks" message.  Missing keys count as 0.
        """
        print("\nšŸ” IDENTIFYING BOTTLENECKS")
        print("-" * 40)

        bottlenecks = []

        # Check system resources
        if system_resources.get('cpu_usage', 0) > 80:
            bottlenecks.append("High CPU usage may be slowing down processing")
        if system_resources.get('memory_usage', 0) > 80:
            bottlenecks.append("High memory usage may cause swapping")
        if system_resources.get('gpu_usage', 0) > 90:
            bottlenecks.append("GPU is heavily utilized")

        # Check OpenCLIP performance (only meaningful when the benchmark ran)
        if openclip_perf.get('available'):
            if openclip_perf.get('model_loading_time', 0) > 10:
                bottlenecks.append("Slow model loading (>10s)")
            if openclip_perf.get('single_inference_time', 0) > 1:
                bottlenecks.append("Slow single image inference (>1s)")
            if openclip_perf.get('per_image_time_batch', 0) > 0.5:
                bottlenecks.append("Slow batch inference per image (>0.5s)")

        # Check subprocess overhead
        if subprocess_overhead.get('full_model_load', 0) > 15:
            bottlenecks.append("High subprocess overhead for model loading")

        if not bottlenecks:
            bottlenecks.append("No obvious bottlenecks detected - may be I/O or network related")

        print("Identified Bottlenecks:")
        for i, bottleneck in enumerate(bottlenecks, 1):
            print(f" {i}. {bottleneck}")

        return bottlenecks

    def propose_solutions(self, bottlenecks: List[str]) -> List[str]:
        """Propose solutions for identified bottlenecks.

        Each bottleneck message is matched case-insensitively against the
        prefixes in ``solution_map``; the first hit wins and unmatched
        messages get a generic suggestion.  Six general recommendations are
        always appended.

        Returns:
            The list of recommendations, in the order they are printed.
        """
        print("\nšŸ’” PROPOSED SOLUTIONS")
        print("-" * 40)

        solution_map = {
            "High CPU usage": "Optimize CPU-intensive operations, consider threading",
            "High memory usage": "Reduce batch size, clear cache between operations",
            "GPU is heavily utilized": "Monitor GPU usage, consider model quantization",
            "Slow model loading": "Implement model caching or persistent process",
            "Slow single image inference": "Use smaller model, optimize preprocessing",
            "Slow batch inference": "Increase batch size, optimize GPU memory usage",
            "High subprocess overhead": "Use persistent classifier process, reduce subprocess calls",
        }

        solutions = []
        for bottleneck in bottlenecks:
            lowered = bottleneck.lower()
            matched = next(
                (solution for key, solution in solution_map.items() if key.lower() in lowered),
                "Investigate specific performance issue",
            )
            solutions.append(matched)

        # Additional general solutions
        # NOTE(review): ViT-B-16 uses smaller patches than ViT-B-32 and is
        # therefore the more expensive model; the second suggestion looks
        # backwards - confirm the intended wording before changing the text.
        solutions.extend([
            "Implement model warm-up to pre-load models",
            "Use smaller OpenCLIP model (ViT-B-16 instead of ViT-B-32)",
            "Implement async processing for image classification",
            "Cache classification results for repeated images",
            "Use image downscaling for faster processing",
            "Implement progressive loading - classify while extracting",
        ])

        print("Recommended Solutions:")
        for i, solution in enumerate(solutions, 1):
            print(f" {i}. {solution}")

        return solutions


def main():
    """Run comprehensive performance analysis"""
    print("šŸš€ PERFORMANCE ROOT CAUSE ANALYSIS")
    print("=" * 50)

    analyzer = PerformanceAnalyzer()

    # Run all analyses
    system_resources = analyzer.analyze_system_resources()
    openclip_perf = analyzer.analyze_openclip_performance()
    subprocess_overhead = analyzer.analyze_subprocess_overhead()

    # Identify bottlenecks and propose solutions
    bottlenecks = analyzer.identify_bottlenecks(system_resources, openclip_perf, subprocess_overhead)
    solutions = analyzer.propose_solutions(bottlenecks)

    print("\nšŸ“Š PERFORMANCE SUMMARY")
    print("=" * 50)
    print(f"System Resources: CPU {system_resources.get('cpu_usage', 0)}%, Memory {system_resources.get('memory_usage', 0)}%")
    print(f"OpenCLIP Available: {openclip_perf.get('available', False)}")
    if openclip_perf.get('available'):
        print(f"Single Inference Time: {openclip_perf.get('single_inference_time', 0):.3f}s")
        print(f"Batch Per Image Time: {openclip_perf.get('per_image_time_batch', 0):.3f}s")
    print(f"Bottlenecks Identified: {len(bottlenecks)}")
    print(f"Solutions Proposed: {len(solutions)}")


if __name__ == "__main__":
    main()