# File: railseek6/performance_root_cause_analysis.py
"""
Performance Root Cause Analysis for Image Classification
Identifies bottlenecks and proposes solutions
"""
import os
import time
import subprocess
import json
import psutil
import GPUtil
from typing import Dict, Any, List
class PerformanceAnalyzer:
    """Analyze performance bottlenecks in image classification.

    Measures three things independently and then cross-references them:
    host resource pressure (CPU/RAM/GPU), OpenCLIP inference timings run
    inside a dedicated virtual environment, and the fixed overhead of
    launching that environment via subprocess.
    """

    def __init__(self):
        # Interpreter inside the dedicated OpenCLIP virtual environment.
        # os.path.join yields the same backslash path on Windows, the only
        # platform where this "Scripts" venv layout exists.
        self.venv_python = os.path.join("openclip_gpu_env", "Scripts", "python.exe")

    def analyze_system_resources(self) -> Dict[str, Any]:
        """Analyze system resource usage.

        Returns:
            Dict with 'cpu_usage', 'memory_usage', 'memory_available_gb'
            and GPU keys ('gpu_usage', 'gpu_memory_usage', and
            'gpu_memory_available' when a GPU is detected).
        """
        print("🔍 ANALYZING SYSTEM RESOURCES")
        print("-" * 40)
        resources: Dict[str, Any] = {}

        # CPU usage, sampled over a 1-second window for a stable reading.
        cpu_percent = psutil.cpu_percent(interval=1)
        resources['cpu_usage'] = cpu_percent
        print(f"CPU Usage: {cpu_percent}%")

        # Memory usage
        memory = psutil.virtual_memory()
        resources['memory_usage'] = memory.percent
        resources['memory_available_gb'] = memory.available / (1024**3)
        print(f"Memory Usage: {memory.percent}% ({memory.available / (1024**3):.1f} GB available)")

        # GPU usage — GPUtil can raise on machines without NVIDIA drivers,
        # so any failure is treated as "no GPU" instead of crashing.
        try:
            gpus = GPUtil.getGPUs()
            if gpus:
                gpu = gpus[0]  # only the first GPU is reported
                resources['gpu_usage'] = gpu.load * 100
                resources['gpu_memory_usage'] = gpu.memoryUtil * 100
                resources['gpu_memory_available'] = gpu.memoryFree
                print(f"GPU Usage: {gpu.load * 100:.1f}%")
                print(f"GPU Memory: {gpu.memoryUtil * 100:.1f}% ({gpu.memoryFree} MB free)")
            else:
                resources['gpu_usage'] = 0
                resources['gpu_memory_usage'] = 0
                print("No GPU detected")
        except Exception as e:
            print(f"GPU analysis failed: {e}")
            resources['gpu_usage'] = 0
            resources['gpu_memory_usage'] = 0
        return resources

    def analyze_openclip_performance(self) -> Dict[str, Any]:
        """Analyze OpenCLIP performance in the virtual environment.

        Runs a self-contained benchmark script inside the venv interpreter
        and parses its JSON output.

        Returns:
            Benchmark timings with 'available': True on success, or
            {'available': False, ...} when the venv is missing or the
            benchmark fails/times out.
        """
        print("\n🔍 ANALYZING OPENCLIP PERFORMANCE")
        print("-" * 40)
        if not os.path.exists(self.venv_python):
            print("❌ OpenCLIP virtual environment not found")
            return {"available": False}

        # The benchmark runs in the venv process; its only stdout contract
        # with us is a single JSON line printed at the end.
        performance_script = """
import time
import torch
import open_clip
from PIL import Image
import json

def benchmark_model():
    results = {}

    # Test 1: Model loading time
    start_time = time.time()
    model, _, processor = open_clip.create_model_and_transforms(
        model_name="ViT-B-32",
        pretrained="laion2b_s34b_b79k"
    )
    results['model_loading_time'] = time.time() - start_time

    # Test 2: GPU transfer time (times a no-op when CUDA is absent)
    start_time = time.time()
    if torch.cuda.is_available():
        model = model.half().cuda()
    results['gpu_transfer_time'] = time.time() - start_time

    # Test 3: Text encoding time
    text_labels = ["a photo of a bee", "a photo of a document"]
    start_time = time.time()
    with torch.no_grad():
        text_tokens = open_clip.tokenize(text_labels)
        if torch.cuda.is_available():
            text_tokens = text_tokens.cuda()
        text_features = model.encode_text(text_tokens)
        text_features /= text_features.norm(dim=-1, keepdim=True)
    results['text_encoding_time'] = time.time() - start_time

    # Test 4: Single image inference on a synthetic image
    dummy_image = Image.new('RGB', (224, 224), color='red')
    start_time = time.time()
    image_tensor = processor(dummy_image).unsqueeze(0)
    if torch.cuda.is_available():
        image_tensor = image_tensor.half().cuda()
    with torch.no_grad():
        image_features = model.encode_image(image_tensor)
        image_features /= image_features.norm(dim=-1, keepdim=True)
        similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
    results['single_inference_time'] = time.time() - start_time

    # Test 5: Batch inference (8 sequential single-image passes)
    start_time = time.time()
    batch_results = []
    for i in range(8):
        image_tensor = processor(dummy_image).unsqueeze(0)
        if torch.cuda.is_available():
            image_tensor = image_tensor.half().cuda()
        with torch.no_grad():
            image_features = model.encode_image(image_tensor)
            image_features /= image_features.norm(dim=-1, keepdim=True)
            similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
            batch_results.append(similarity)
    results['batch_8_inference_time'] = time.time() - start_time
    results['per_image_time_batch'] = results['batch_8_inference_time'] / 8

    # System info
    results['cuda_available'] = torch.cuda.is_available()
    if torch.cuda.is_available():
        results['gpu_name'] = torch.cuda.get_device_name(0)
        results['gpu_memory_allocated'] = torch.cuda.memory_allocated() / (1024**3)
        results['gpu_memory_reserved'] = torch.cuda.memory_reserved() / (1024**3)

    print(json.dumps(results))

benchmark_model()
"""
        try:
            start_time = time.time()
            result = subprocess.run([
                self.venv_python, "-c", performance_script
            ], capture_output=True, text=True, timeout=120)
            if result.returncode == 0:
                # Libraries occasionally emit warnings on stdout before the
                # JSON payload; the JSON is always the last non-empty line.
                payload = result.stdout.strip().splitlines()[-1]
                performance_data = json.loads(payload)
                total_time = time.time() - start_time
                performance_data['total_execution_time'] = total_time

                print(f"✅ OpenCLIP Performance Analysis:")
                print(f" Model Loading: {performance_data['model_loading_time']:.3f}s")
                print(f" GPU Transfer: {performance_data['gpu_transfer_time']:.3f}s")
                print(f" Text Encoding: {performance_data['text_encoding_time']:.3f}s")
                print(f" Single Inference: {performance_data['single_inference_time']:.3f}s")
                print(f" Batch (8) Inference: {performance_data['batch_8_inference_time']:.3f}s")
                print(f" Per Image (Batch): {performance_data['per_image_time_batch']:.3f}s")
                print(f" Total Execution: {total_time:.3f}s")
                if performance_data.get('cuda_available'):
                    print(f" GPU: {performance_data.get('gpu_name', 'Unknown')}")
                    print(f" GPU Memory Allocated: {performance_data.get('gpu_memory_allocated', 0):.2f} GB")
                performance_data['available'] = True
                return performance_data
            else:
                print(f"❌ OpenCLIP benchmark failed: {result.stderr}")
                return {"available": False, "error": result.stderr}
        except subprocess.TimeoutExpired:
            print("❌ OpenCLIP benchmark timed out (120s)")
            return {"available": False, "error": "Timeout"}
        except Exception as e:
            print(f"❌ OpenCLIP benchmark failed: {e}")
            return {"available": False, "error": str(e)}

    def analyze_subprocess_overhead(self) -> Dict[str, Any]:
        """Analyze subprocess communication overhead.

        Times three escalating venv invocations (bare interpreter, torch
        import, full model construction), averaged over a few runs each.

        Returns:
            Dict with 'simple_subprocess', 'import_torch' and
            'full_model_load' average times in seconds.
        """
        print("\n🔍 ANALYZING SUBPROCESS OVERHEAD")
        print("-" * 40)
        overhead_tests = []

        # Test 1: Simple subprocess call — bare interpreter startup cost.
        # The timeout mirrors the other tests so a wedged venv cannot hang
        # the analysis forever.
        start_time = time.time()
        for i in range(5):
            result = subprocess.run([self.venv_python, "-c", "print('test')"],
                                    capture_output=True, text=True, timeout=30)
        simple_overhead = (time.time() - start_time) / 5
        overhead_tests.append(("Simple subprocess", simple_overhead))

        # Test 2: Import overhead (torch import dominates cold starts)
        start_time = time.time()
        for i in range(3):
            result = subprocess.run([self.venv_python, "-c", "import torch; print('loaded')"],
                                    capture_output=True, text=True, timeout=30)
        import_overhead = (time.time() - start_time) / 3
        overhead_tests.append(("Import torch", import_overhead))

        # Test 3: Full model load overhead (random weights; no download)
        start_time = time.time()
        for i in range(2):
            result = subprocess.run([
                self.venv_python, "-c",
                "import torch, open_clip; model, _, _ = open_clip.create_model_and_transforms('ViT-B-32'); print('loaded')"
            ], capture_output=True, text=True, timeout=60)
        model_overhead = (time.time() - start_time) / 2
        overhead_tests.append(("Full model load", model_overhead))

        print("Subprocess Overhead Analysis:")
        for test_name, overhead in overhead_tests:
            print(f" {test_name}: {overhead:.3f}s")
        return {
            "simple_subprocess": simple_overhead,
            "import_torch": import_overhead,
            "full_model_load": model_overhead
        }

    def identify_bottlenecks(self, system_resources: Dict, openclip_perf: Dict, subprocess_overhead: Dict) -> List[str]:
        """Identify performance bottlenecks from the collected metrics.

        Args:
            system_resources: output of analyze_system_resources().
            openclip_perf: output of analyze_openclip_performance().
            subprocess_overhead: output of analyze_subprocess_overhead().

        Returns:
            Human-readable bottleneck descriptions; never empty (a
            fallback message is returned when no threshold is exceeded).
        """
        print("\n🔍 IDENTIFYING BOTTLENECKS")
        print("-" * 40)
        bottlenecks = []

        # Thresholds below are heuristics, not hard limits.
        if system_resources.get('cpu_usage', 0) > 80:
            bottlenecks.append("High CPU usage may be slowing down processing")
        if system_resources.get('memory_usage', 0) > 80:
            bottlenecks.append("High memory usage may cause swapping")
        if system_resources.get('gpu_usage', 0) > 90:
            bottlenecks.append("GPU is heavily utilized")

        # OpenCLIP timings are only meaningful when the benchmark ran.
        if openclip_perf.get('available'):
            if openclip_perf.get('model_loading_time', 0) > 10:
                bottlenecks.append("Slow model loading (>10s)")
            if openclip_perf.get('single_inference_time', 0) > 1:
                bottlenecks.append("Slow single image inference (>1s)")
            if openclip_perf.get('per_image_time_batch', 0) > 0.5:
                bottlenecks.append("Slow batch inference per image (>0.5s)")

        if subprocess_overhead.get('full_model_load', 0) > 15:
            bottlenecks.append("High subprocess overhead for model loading")

        if not bottlenecks:
            bottlenecks.append("No obvious bottlenecks detected - may be I/O or network related")

        print("Identified Bottlenecks:")
        for i, bottleneck in enumerate(bottlenecks, 1):
            print(f" {i}. {bottleneck}")
        return bottlenecks

    def propose_solutions(self, bottlenecks: List[str]) -> List[str]:
        """Propose solutions for identified bottlenecks.

        Maps each bottleneck to a targeted remedy by case-insensitive
        substring match, then appends a fixed list of general
        optimizations.

        Args:
            bottlenecks: descriptions from identify_bottlenecks().

        Returns:
            One targeted solution per bottleneck plus six general ones.
        """
        print("\n💡 PROPOSED SOLUTIONS")
        print("-" * 40)
        solutions = []
        solution_map = {
            "High CPU usage": "Optimize CPU-intensive operations, consider threading",
            "High memory usage": "Reduce batch size, clear cache between operations",
            "GPU is heavily utilized": "Monitor GPU usage, consider model quantization",
            "Slow model loading": "Implement model caching or persistent process",
            "Slow single image inference": "Use smaller model, optimize preprocessing",
            "Slow batch inference": "Increase batch size, optimize GPU memory usage",
            "High subprocess overhead": "Use persistent classifier process, reduce subprocess calls"
        }
        for bottleneck in bottlenecks:
            # for/else: the else branch runs only when no key matched.
            for key, solution in solution_map.items():
                if key.lower() in bottleneck.lower():
                    solutions.append(solution)
                    break
            else:
                solutions.append("Investigate specific performance issue")

        # General optimizations that apply regardless of the findings.
        solutions.extend([
            "Implement model warm-up to pre-load models",
            "Use smaller OpenCLIP model (ViT-B-16 instead of ViT-B-32)",
            "Implement async processing for image classification",
            "Cache classification results for repeated images",
            "Use image downscaling for faster processing",
            "Implement progressive loading - classify while extracting"
        ])

        print("Recommended Solutions:")
        for i, solution in enumerate(solutions, 1):
            print(f" {i}. {solution}")
        return solutions
def main():
    """Run comprehensive performance analysis"""
    print("🚀 PERFORMANCE ROOT CAUSE ANALYSIS")
    print("=" * 50)

    analyzer = PerformanceAnalyzer()

    # Collect measurements from each analysis stage.
    sys_res = analyzer.analyze_system_resources()
    clip_stats = analyzer.analyze_openclip_performance()
    proc_overhead = analyzer.analyze_subprocess_overhead()

    # Turn raw measurements into findings and recommendations.
    found = analyzer.identify_bottlenecks(sys_res, clip_stats, proc_overhead)
    fixes = analyzer.propose_solutions(found)

    print("\n📊 PERFORMANCE SUMMARY")
    print("=" * 50)
    print(f"System Resources: CPU {sys_res.get('cpu_usage', 0)}%, Memory {sys_res.get('memory_usage', 0)}%")
    print(f"OpenCLIP Available: {clip_stats.get('available', False)}")
    if clip_stats.get('available'):
        print(f"Single Inference Time: {clip_stats.get('single_inference_time', 0):.3f}s")
        print(f"Batch Per Image Time: {clip_stats.get('per_image_time_batch', 0):.3f}s")
    print(f"Bottlenecks Identified: {len(found)}")
    print(f"Solutions Proposed: {len(fixes)}")


if __name__ == "__main__":
    main()