"""
|
|
Performance Root Cause Analysis for Image Classification
|
|
Identifies bottlenecks and proposes solutions
|
|
"""
import os
import time
import subprocess
import json

import psutil
import GPUtil

from typing import Dict, Any, List


class PerformanceAnalyzer:
    """Analyze performance bottlenecks in image classification.

    Runs three probes and feeds their results into a simple rule-based
    diagnosis:

    1. Host resource usage (CPU / RAM / GPU via psutil and GPUtil).
    2. An OpenCLIP micro-benchmark executed inside a dedicated virtual
       environment via ``subprocess``.
    3. The raw overhead of spawning that environment as a subprocess.
    """

    def __init__(self):
        # Interpreter of the dedicated OpenCLIP venv (Windows path layout).
        self.venv_python = "openclip_gpu_env\\Scripts\\python.exe"

    def analyze_system_resources(self) -> Dict[str, Any]:
        """Analyze system resource usage.

        Returns:
            Dict with ``cpu_usage``, ``memory_usage``, ``memory_available_gb``,
            ``gpu_usage``, ``gpu_memory_usage`` and, when a GPU is present,
            ``gpu_memory_available``.  GPU probing failures are non-fatal and
            reported as zero usage.
        """
        print("🔍 ANALYZING SYSTEM RESOURCES")
        print("-" * 40)

        resources = {}

        # CPU usage, sampled over a 1 s interval for a meaningful reading.
        cpu_percent = psutil.cpu_percent(interval=1)
        resources['cpu_usage'] = cpu_percent
        print(f"CPU Usage: {cpu_percent}%")

        # Memory usage
        memory = psutil.virtual_memory()
        resources['memory_usage'] = memory.percent
        resources['memory_available_gb'] = memory.available / (1024**3)
        print(f"Memory Usage: {memory.percent}% ({memory.available / (1024**3):.1f} GB available)")

        # GPU usage — GPUtil can raise (missing driver, no NVML); treat any
        # failure the same as "no GPU" so the analysis keeps going.
        try:
            gpus = GPUtil.getGPUs()
            if gpus:
                gpu = gpus[0]  # only the first GPU is reported
                resources['gpu_usage'] = gpu.load * 100
                resources['gpu_memory_usage'] = gpu.memoryUtil * 100
                resources['gpu_memory_available'] = gpu.memoryFree
                print(f"GPU Usage: {gpu.load * 100:.1f}%")
                print(f"GPU Memory: {gpu.memoryUtil * 100:.1f}% ({gpu.memoryFree} MB free)")
            else:
                resources['gpu_usage'] = 0
                resources['gpu_memory_usage'] = 0
                print("No GPU detected")
        except Exception as e:
            print(f"GPU analysis failed: {e}")
            resources['gpu_usage'] = 0
            resources['gpu_memory_usage'] = 0

        return resources

    def analyze_openclip_performance(self) -> Dict[str, Any]:
        """Analyze OpenCLIP performance in the virtual environment.

        Spawns ``self.venv_python`` with an inline benchmark script that
        loads ViT-B-32, then times model load, GPU transfer, text encoding
        and single/batch image inference.  The child prints its results as
        a single JSON line on stdout.

        Returns:
            The parsed benchmark dict with ``available=True`` on success,
            otherwise ``{"available": False, "error": ...}``.
        """
        print("\n🔍 ANALYZING OPENCLIP PERFORMANCE")
        print("-" * 40)

        if not os.path.exists(self.venv_python):
            print("❌ OpenCLIP virtual environment not found")
            return {"available": False}

        performance_script = """
import time
import torch
import open_clip
from PIL import Image
import json

def benchmark_model():
    results = {}

    # Test 1: Model loading time
    start_time = time.time()
    model, _, processor = open_clip.create_model_and_transforms(
        model_name="ViT-B-32",
        pretrained="laion2b_s34b_b79k"
    )
    results['model_loading_time'] = time.time() - start_time

    # Test 2: GPU transfer time.
    # Recorded even on CPU-only hosts so the key always exists and the
    # parent process never KeyErrors when printing it.
    start_time = time.time()
    if torch.cuda.is_available():
        model = model.half().cuda()
    results['gpu_transfer_time'] = time.time() - start_time

    # Test 3: Text encoding time
    text_labels = ["a photo of a bee", "a photo of a document"]
    start_time = time.time()
    with torch.no_grad():
        text_tokens = open_clip.tokenize(text_labels)
        if torch.cuda.is_available():
            text_tokens = text_tokens.cuda()
        text_features = model.encode_text(text_tokens)
        text_features /= text_features.norm(dim=-1, keepdim=True)
    results['text_encoding_time'] = time.time() - start_time

    # Test 4: Single image inference on a synthetic image
    dummy_image = Image.new('RGB', (224, 224), color='red')
    start_time = time.time()
    image_tensor = processor(dummy_image).unsqueeze(0)
    if torch.cuda.is_available():
        image_tensor = image_tensor.half().cuda()

    with torch.no_grad():
        image_features = model.encode_image(image_tensor)
        image_features /= image_features.norm(dim=-1, keepdim=True)
        similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)

    results['single_inference_time'] = time.time() - start_time

    # Test 5: Batch inference (8 images, processed one at a time)
    start_time = time.time()
    batch_results = []
    for i in range(8):
        image_tensor = processor(dummy_image).unsqueeze(0)
        if torch.cuda.is_available():
            image_tensor = image_tensor.half().cuda()

        with torch.no_grad():
            image_features = model.encode_image(image_tensor)
            image_features /= image_features.norm(dim=-1, keepdim=True)
            similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
            batch_results.append(similarity)

    results['batch_8_inference_time'] = time.time() - start_time
    results['per_image_time_batch'] = results['batch_8_inference_time'] / 8

    # System info
    results['cuda_available'] = torch.cuda.is_available()
    if torch.cuda.is_available():
        results['gpu_name'] = torch.cuda.get_device_name(0)
        results['gpu_memory_allocated'] = torch.cuda.memory_allocated() / (1024**3)
        results['gpu_memory_reserved'] = torch.cuda.memory_reserved() / (1024**3)

    print(json.dumps(results))

benchmark_model()
"""

        try:
            start_time = time.time()
            result = subprocess.run([
                self.venv_python, "-c", performance_script
            ], capture_output=True, text=True, timeout=120)

            if result.returncode == 0:
                # The child may emit warnings or download-progress text before
                # the JSON payload; only the last stdout line is the result.
                performance_data = json.loads(result.stdout.strip().splitlines()[-1])
                total_time = time.time() - start_time
                performance_data['total_execution_time'] = total_time

                print("✅ OpenCLIP Performance Analysis:")
                # .get(..., 0) keeps the report robust if the benchmark ever
                # omits a timing key (e.g. older script versions on CPU).
                print(f" Model Loading: {performance_data.get('model_loading_time', 0):.3f}s")
                print(f" GPU Transfer: {performance_data.get('gpu_transfer_time', 0):.3f}s")
                print(f" Text Encoding: {performance_data.get('text_encoding_time', 0):.3f}s")
                print(f" Single Inference: {performance_data.get('single_inference_time', 0):.3f}s")
                print(f" Batch (8) Inference: {performance_data.get('batch_8_inference_time', 0):.3f}s")
                print(f" Per Image (Batch): {performance_data.get('per_image_time_batch', 0):.3f}s")
                print(f" Total Execution: {total_time:.3f}s")

                if performance_data.get('cuda_available'):
                    print(f" GPU: {performance_data.get('gpu_name', 'Unknown')}")
                    print(f" GPU Memory Allocated: {performance_data.get('gpu_memory_allocated', 0):.2f} GB")

                performance_data['available'] = True
                return performance_data
            else:
                print(f"❌ OpenCLIP benchmark failed: {result.stderr}")
                return {"available": False, "error": result.stderr}

        except subprocess.TimeoutExpired:
            print("❌ OpenCLIP benchmark timed out (120s)")
            return {"available": False, "error": "Timeout"}
        except Exception as e:
            print(f"❌ OpenCLIP benchmark failed: {e}")
            return {"available": False, "error": str(e)}

    def analyze_subprocess_overhead(self) -> Dict[str, Any]:
        """Analyze subprocess communication overhead.

        Measures three averaged costs of talking to the venv interpreter:
        a bare ``print``, importing torch, and a full model load.

        Returns:
            Dict with ``simple_subprocess``, ``import_torch`` and
            ``full_model_load`` average seconds per call.
        """
        print("\n🔍 ANALYZING SUBPROCESS OVERHEAD")
        print("-" * 40)

        overhead_tests = []

        # Test 1: Simple subprocess call (averaged over 5 runs)
        start_time = time.time()
        for i in range(5):
            result = subprocess.run([self.venv_python, "-c", "print('test')"],
                                    capture_output=True, text=True)
        simple_overhead = (time.time() - start_time) / 5
        overhead_tests.append(("Simple subprocess", simple_overhead))

        # Test 2: Import overhead (averaged over 3 runs)
        start_time = time.time()
        for i in range(3):
            result = subprocess.run([self.venv_python, "-c", "import torch; print('loaded')"],
                                    capture_output=True, text=True, timeout=30)
        import_overhead = (time.time() - start_time) / 3
        overhead_tests.append(("Import torch", import_overhead))

        # Test 3: Full model load overhead (averaged over 2 runs)
        start_time = time.time()
        for i in range(2):
            result = subprocess.run([
                self.venv_python, "-c",
                "import torch, open_clip; model, _, _ = open_clip.create_model_and_transforms('ViT-B-32'); print('loaded')"
            ], capture_output=True, text=True, timeout=60)
        model_overhead = (time.time() - start_time) / 2
        overhead_tests.append(("Full model load", model_overhead))

        print("Subprocess Overhead Analysis:")
        for test_name, overhead in overhead_tests:
            print(f" {test_name}: {overhead:.3f}s")

        return {
            "simple_subprocess": simple_overhead,
            "import_torch": import_overhead,
            "full_model_load": model_overhead
        }

    def identify_bottlenecks(self, system_resources: Dict, openclip_perf: Dict, subprocess_overhead: Dict) -> List[str]:
        """Identify performance bottlenecks from the three probe results.

        Pure rule-based thresholds; safe to call with partial dicts since
        every lookup falls back to 0 / False.

        Returns:
            Non-empty list of human-readable bottleneck descriptions.
        """
        print("\n🔍 IDENTIFYING BOTTLENECKS")
        print("-" * 40)

        bottlenecks = []

        # Check system resources
        if system_resources.get('cpu_usage', 0) > 80:
            bottlenecks.append("High CPU usage may be slowing down processing")

        if system_resources.get('memory_usage', 0) > 80:
            bottlenecks.append("High memory usage may cause swapping")

        if system_resources.get('gpu_usage', 0) > 90:
            bottlenecks.append("GPU is heavily utilized")

        # Check OpenCLIP performance (only when the benchmark actually ran)
        if openclip_perf.get('available'):
            if openclip_perf.get('model_loading_time', 0) > 10:
                bottlenecks.append("Slow model loading (>10s)")

            if openclip_perf.get('single_inference_time', 0) > 1:
                bottlenecks.append("Slow single image inference (>1s)")

            if openclip_perf.get('per_image_time_batch', 0) > 0.5:
                bottlenecks.append("Slow batch inference per image (>0.5s)")

        # Check subprocess overhead
        if subprocess_overhead.get('full_model_load', 0) > 15:
            bottlenecks.append("High subprocess overhead for model loading")

        if not bottlenecks:
            bottlenecks.append("No obvious bottlenecks detected - may be I/O or network related")

        print("Identified Bottlenecks:")
        for i, bottleneck in enumerate(bottlenecks, 1):
            print(f" {i}. {bottleneck}")

        return bottlenecks

    def propose_solutions(self, bottlenecks: List[str]) -> List[str]:
        """Propose solutions for identified bottlenecks.

        Maps each bottleneck to a canned remedy via case-insensitive
        substring match, then appends a fixed list of general
        recommendations.

        Returns:
            One targeted solution per bottleneck plus six general ones.
        """
        print("\n💡 PROPOSED SOLUTIONS")
        print("-" * 40)

        solutions = []
        solution_map = {
            "High CPU usage": "Optimize CPU-intensive operations, consider threading",
            "High memory usage": "Reduce batch size, clear cache between operations",
            "GPU is heavily utilized": "Monitor GPU usage, consider model quantization",
            "Slow model loading": "Implement model caching or persistent process",
            "Slow single image inference": "Use smaller model, optimize preprocessing",
            "Slow batch inference": "Increase batch size, optimize GPU memory usage",
            "High subprocess overhead": "Use persistent classifier process, reduce subprocess calls"
        }

        for bottleneck in bottlenecks:
            # for/else: the else runs only when no map key matched.
            for key, solution in solution_map.items():
                if key.lower() in bottleneck.lower():
                    solutions.append(solution)
                    break
            else:
                solutions.append("Investigate specific performance issue")

        # Additional general solutions
        solutions.extend([
            "Implement model warm-up to pre-load models",
            "Use smaller OpenCLIP model (ViT-B-16 instead of ViT-B-32)",
            "Implement async processing for image classification",
            "Cache classification results for repeated images",
            "Use image downscaling for faster processing",
            "Implement progressive loading - classify while extracting"
        ])

        print("Recommended Solutions:")
        for i, solution in enumerate(solutions, 1):
            print(f" {i}. {solution}")

        return solutions
def main():
    """Run comprehensive performance analysis."""
    print("🚀 PERFORMANCE ROOT CAUSE ANALYSIS")
    print("=" * 50)

    analyzer = PerformanceAnalyzer()

    # Probe the host, the OpenCLIP venv, and the subprocess channel in turn.
    resources = analyzer.analyze_system_resources()
    clip_stats = analyzer.analyze_openclip_performance()
    overhead = analyzer.analyze_subprocess_overhead()

    # Diagnose, then recommend.
    found = analyzer.identify_bottlenecks(resources, clip_stats, overhead)
    fixes = analyzer.propose_solutions(found)

    print("\n📊 PERFORMANCE SUMMARY")
    print("=" * 50)
    print(f"System Resources: CPU {resources.get('cpu_usage', 0)}%, Memory {resources.get('memory_usage', 0)}%")
    print(f"OpenCLIP Available: {clip_stats.get('available', False)}")
    if clip_stats.get('available'):
        print(f"Single Inference Time: {clip_stats.get('single_inference_time', 0):.3f}s")
        print(f"Batch Per Image Time: {clip_stats.get('per_image_time_batch', 0):.3f}s")
    print(f"Bottlenecks Identified: {len(found)}")
    print(f"Solutions Proposed: {len(fixes)}")


if __name__ == "__main__":
    main()