# File: railseek6/diagnose_gpu_issue.py
# 210 lines, 7.6 KiB, Python
import os
import sys
import subprocess
import torch
import platform
def check_cuda_installation():
    """Check the local CUDA toolkit installation.

    Reads the CUDA_PATH environment variable, verifies the directory
    exists, and runs ``nvcc --version`` to report the toolkit release
    line.  Prints diagnostic lines only; returns None.
    """
    print("=== CUDA INSTALLATION CHECK ===")
    cuda_path = os.environ.get('CUDA_PATH')
    print(f"CUDA_PATH: {cuda_path}")
    if cuda_path and os.path.exists(cuda_path):
        print("✓ CUDA_PATH exists")
        try:
            nvcc_path = os.path.join(cuda_path, 'bin', 'nvcc.exe')
            if os.path.exists(nvcc_path):
                result = subprocess.run([nvcc_path, '--version'],
                                        capture_output=True, text=True)
                if result.returncode == 0:
                    print("✓ NVIDIA CUDA Compiler found")
                    # Only the "release" line carries the version number.
                    for line in result.stdout.split('\n'):
                        if 'release' in line.lower():
                            print(f" {line.strip()}")
                else:
                    # BUGFIX: a failing nvcc run was previously silently
                    # ignored — report the non-zero exit code instead.
                    print(f"✗ nvcc exited with code {result.returncode}")
            else:
                print("✗ nvcc.exe not found in CUDA_PATH/bin")
        except Exception as e:
            print(f"✗ Error checking nvcc: {e}")
    else:
        print("✗ CUDA_PATH not set or invalid")
def check_cudnn_installation():
    """Check for a cuDNN installation.

    Looks for a cuDNN DLL in CUDA_PATH/bin (and one known archive
    location) and reports whether the DLL's directory is on PATH.
    Prints diagnostic lines only; returns None.
    """
    print("\n=== cuDNN INSTALLATION CHECK ===")
    cuda_path = os.environ.get('CUDA_PATH')
    if not cuda_path:
        print("✗ CUDA_PATH not set, cannot check cuDNN")
        return
    # Candidate DLL locations, newest ABI first.
    cudnn_locations = [
        os.path.join(cuda_path, 'bin', 'cudnn64_8.dll'),
        os.path.join(cuda_path, 'bin', 'cudnn64_7.dll'),
        os.path.join(cuda_path, 'bin', 'cudnn64.dll'),
        r'C:\Program Files\NVIDIA\cudnn-windows-x86_64-8.9.7.29_cuda12-archive\bin\cudnn64_8.dll'
    ]
    found_cudnn = False
    for location in cudnn_locations:
        if os.path.exists(location):
            print(f"✓ cuDNN found: {location}")
            found_cudnn = True
            cudnn_dir = os.path.dirname(location)
            # BUGFIX: compare against individual PATH entries instead of a
            # raw substring test, which could match inside an unrelated
            # longer path entry.
            path_entries = os.environ.get('PATH', '').split(os.pathsep)
            if cudnn_dir in path_entries:
                print(f" ✓ cuDNN directory in PATH")
            else:
                print(f" ✗ cuDNN directory NOT in PATH")
            break
    if not found_cudnn:
        print("✗ No cuDNN DLL found in common locations")
def check_pytorch_gpu():
    """Report whether the installed PyTorch build can see a CUDA GPU."""
    print("\n=== PYTORCH GPU CHECK ===")
    try:
        print(f"PyTorch version: {torch.__version__}")
        cuda_ok = torch.cuda.is_available()
        print(f"CUDA available: {cuda_ok}")
        if not cuda_ok:
            print("✗ CUDA not available in PyTorch")
            return
        print(f"CUDA version: {torch.version.cuda}")
        n_gpus = torch.cuda.device_count()
        print(f"GPU device count: {n_gpus}")
        # One line per device: name plus total memory in GiB.
        for idx in range(n_gpus):
            props = torch.cuda.get_device_properties(idx)
            print(f" GPU {idx}: {torch.cuda.get_device_name(idx)}")
            print(f" Memory: {props.total_memory / 1024**3:.1f} GB")
    except Exception as e:
        print(f"✗ Error checking PyTorch: {e}")
def check_paddle_gpu():
    """Report whether PaddlePaddle is installed and CUDA-enabled."""
    print("\n=== PADDLEPADDLE GPU CHECK ===")
    try:
        import paddle
    except ImportError:
        print("✗ PaddlePaddle not installed")
        return
    try:
        print(f"PaddlePaddle version: {paddle.__version__}")
        print(f"Paddle GPU available: {paddle.is_compiled_with_cuda()}")
        if not paddle.is_compiled_with_cuda():
            print("✗ PaddlePaddle not compiled with CUDA")
            return
        print(f"Paddle CUDA version: {paddle.version.cuda()}")
        # Actually try to select the GPU device — compilation support
        # alone does not guarantee a usable runtime.
        try:
            paddle.device.set_device('gpu')
            print("✓ PaddlePaddle GPU device set successfully")
        except Exception as err:
            print(f"✗ Error setting PaddlePaddle GPU device: {err}")
    except Exception as err:
        print(f"✗ Error checking PaddlePaddle: {err}")
def check_environment_variables():
    """Print the CUDA-related environment variables.

    For PATH, only the entries that mention 'cuda' or 'nvidia' are
    shown.  Prints diagnostic lines only; returns None.
    """
    print("\n=== ENVIRONMENT VARIABLES ===")
    env_vars = [
        'CUDA_PATH', 'CUDA_PATH_V12_9', 'PATH',
        'CUDA_VISIBLE_DEVICES', 'CUDA_CACHE_PATH'
    ]
    for var in env_vars:
        value = os.environ.get(var)
        if not value:
            print(f"{var}: Not set")
            continue
        if var == 'PATH':
            # FIX: split on os.pathsep (';' on Windows, ':' elsewhere)
            # instead of a hard-coded ';' so the diagnostic is portable.
            cuda_paths = [p for p in value.split(os.pathsep)
                          if 'cuda' in p.lower() or 'nvidia' in p.lower()]
            print(f"{var}:")
            for path in cuda_paths:
                print(f" {path}")
        else:
            print(f"{var}: {value}")
def test_cudnn_directly():
    """Attempt to load a cuDNN DLL with ctypes and query its version.

    Windows-only (uses ctypes.WinDLL; on other platforms the outer
    except reports the error).  Returns True when a cuDNN DLL loads and
    reports a version, False otherwise.
    """
    print("\n=== DIRECT cuDNN TEST ===")
    try:
        import ctypes
        cuda_path = os.environ.get('CUDA_PATH')
        if not cuda_path:
            print("✗ CUDA_PATH not set")
            # BUGFIX: previously fell through returning None; the caller
            # branches on this result, so make every path return a bool.
            return False
        cudnn_paths = [
            os.path.join(cuda_path, 'bin', 'cudnn64_8.dll'),
            os.path.join(cuda_path, 'bin', 'cudnn64.dll'),
            r'C:\Program Files\NVIDIA\cudnn-windows-x86_64-8.9.7.29_cuda12-archive\bin\cudnn64_8.dll'
        ]
        for cudnn_path in cudnn_paths:
            if not os.path.exists(cudnn_path):
                continue
            print(f"Testing cuDNN: {cudnn_path}")
            try:
                cudnn = ctypes.WinDLL(cudnn_path)
            except Exception as e:
                print(f"✗ Failed to load {cudnn_path}: {e}")
                continue
            print(f"✓ Successfully loaded {os.path.basename(cudnn_path)}")
            try:
                cudnnGetVersion = cudnn.cudnnGetVersion
                # cudnnGetVersion() returns size_t; default restype
                # (c_int) would truncate it.
                cudnnGetVersion.restype = ctypes.c_size_t
                version = cudnnGetVersion()
                print(f"✓ cuDNN version: {version}")
                return True
            except Exception as e:
                print(f"✗ Cannot get cuDNN version: {e}")
                return False
        print("✗ No cuDNN DLL found to test")
        return False  # BUGFIX: explicit False instead of implicit None
    except Exception as e:
        print(f"✗ Error in direct cuDNN test: {e}")
        return False
def check_system_info():
    """Print basic OS, architecture, and processor information."""
    print("\n=== SYSTEM INFORMATION ===")
    os_name, os_release = platform.system(), platform.release()
    print(f"OS: {os_name} {os_release}")
    print(f"Architecture: {platform.architecture()[0]}")
    print(f"Processor: {platform.processor()}")
if __name__ == "__main__":
    banner = "=" * 50
    print("GPU MODE FAILURE DIAGNOSTIC")
    print(banner)

    # Run every diagnostic in order, from general to specific.
    for diagnostic in (check_system_info, check_environment_variables,
                       check_cuda_installation, check_cudnn_installation,
                       check_pytorch_gpu, check_paddle_gpu):
        diagnostic()
    cudnn_working = test_cudnn_directly()

    print("\n" + banner)
    print("DIAGNOSTIC SUMMARY:")
    if cudnn_working:
        summary = ("✓ cuDNN appears to be working correctly",
                   " The issue might be in PaddleOCR's cuDNN detection")
    else:
        summary = ("✗ cuDNN is NOT working correctly",
                   " This is the root cause of the GPU mode failure")
    for line in summary:
        print(line)

    print("\nRECOMMENDED ACTIONS:")
    for step in ("1. Reinstall cuDNN and ensure the DLLs are in CUDA_PATH/bin",
                 "2. Add cuDNN bin directory to system PATH",
                 "3. Restart the system after cuDNN installation",
                 "4. Verify cuDNN version compatibility with CUDA 12.9"):
        print(step)