# File: railseek6/diagnose_gpu_issue.py
# 210 lines, 7.6 KiB, Python
import os
import sys
import subprocess
import torch
import platform
def check_cuda_installation():
    """Check the local CUDA toolkit installation.

    Reads the CUDA_PATH environment variable, verifies the directory
    exists, and runs ``nvcc --version`` to report the toolkit release
    line.  Prints diagnostic lines only; returns None.
    """
    print("=== CUDA INSTALLATION CHECK ===")
    cuda_path = os.environ.get('CUDA_PATH')
    print(f"CUDA_PATH: {cuda_path}")
    if cuda_path and os.path.exists(cuda_path):
        print("✓ CUDA_PATH exists")
        try:
            nvcc_path = os.path.join(cuda_path, 'bin', 'nvcc.exe')
            if os.path.exists(nvcc_path):
                result = subprocess.run([nvcc_path, '--version'],
                                        capture_output=True, text=True)
                if result.returncode == 0:
                    print("✓ NVIDIA CUDA Compiler found")
                    # Only the "release" line carries the version number.
                    for line in result.stdout.split('\n'):
                        if 'release' in line.lower():
                            print(f" {line.strip()}")
                else:
                    # BUGFIX: a failing nvcc run was previously silently
                    # ignored — report the non-zero exit code instead.
                    print(f"✗ nvcc exited with code {result.returncode}")
            else:
                print("✗ nvcc.exe not found in CUDA_PATH/bin")
        except Exception as e:
            print(f"✗ Error checking nvcc: {e}")
    else:
        print("✗ CUDA_PATH not set or invalid")
def check_cudnn_installation():
    """Check for a cuDNN installation.

    Looks for a cuDNN DLL in CUDA_PATH/bin (and one known archive
    location) and reports whether the DLL's directory is on PATH.
    Prints diagnostic lines only; returns None.
    """
    print("\n=== cuDNN INSTALLATION CHECK ===")
    cuda_path = os.environ.get('CUDA_PATH')
    if not cuda_path:
        print("✗ CUDA_PATH not set, cannot check cuDNN")
        return
    # Candidate DLL locations, newest ABI first.
    cudnn_locations = [
        os.path.join(cuda_path, 'bin', 'cudnn64_8.dll'),
        os.path.join(cuda_path, 'bin', 'cudnn64_7.dll'),
        os.path.join(cuda_path, 'bin', 'cudnn64.dll'),
        r'C:\Program Files\NVIDIA\cudnn-windows-x86_64-8.9.7.29_cuda12-archive\bin\cudnn64_8.dll'
    ]
    found_cudnn = False
    for location in cudnn_locations:
        if os.path.exists(location):
            print(f"✓ cuDNN found: {location}")
            found_cudnn = True
            cudnn_dir = os.path.dirname(location)
            # BUGFIX: compare against individual PATH entries instead of a
            # raw substring test, which could match inside an unrelated
            # longer path entry.
            path_entries = os.environ.get('PATH', '').split(os.pathsep)
            if cudnn_dir in path_entries:
                print(f" ✓ cuDNN directory in PATH")
            else:
                print(f" ✗ cuDNN directory NOT in PATH")
            break
    if not found_cudnn:
        print("✗ No cuDNN DLL found in common locations")
def check_pytorch_gpu():
    """Report whether the installed PyTorch build can see a CUDA GPU."""
    print("\n=== PYTORCH GPU CHECK ===")
    try:
        print(f"PyTorch version: {torch.__version__}")
        cuda_ok = torch.cuda.is_available()
        print(f"CUDA available: {cuda_ok}")
        if not cuda_ok:
            print("✗ CUDA not available in PyTorch")
            return
        print(f"CUDA version: {torch.version.cuda}")
        n_gpus = torch.cuda.device_count()
        print(f"GPU device count: {n_gpus}")
        # One line per device: name plus total memory in GiB.
        for idx in range(n_gpus):
            props = torch.cuda.get_device_properties(idx)
            print(f" GPU {idx}: {torch.cuda.get_device_name(idx)}")
            print(f" Memory: {props.total_memory / 1024**3:.1f} GB")
    except Exception as e:
        print(f"✗ Error checking PyTorch: {e}")
def check_paddle_gpu():
    """Report whether PaddlePaddle is installed and CUDA-enabled."""
    print("\n=== PADDLEPADDLE GPU CHECK ===")
    try:
        import paddle
    except ImportError:
        print("✗ PaddlePaddle not installed")
        return
    try:
        print(f"PaddlePaddle version: {paddle.__version__}")
        print(f"Paddle GPU available: {paddle.is_compiled_with_cuda()}")
        if not paddle.is_compiled_with_cuda():
            print("✗ PaddlePaddle not compiled with CUDA")
            return
        print(f"Paddle CUDA version: {paddle.version.cuda()}")
        # Actually try to select the GPU device — compilation support
        # alone does not guarantee a usable runtime.
        try:
            paddle.device.set_device('gpu')
            print("✓ PaddlePaddle GPU device set successfully")
        except Exception as err:
            print(f"✗ Error setting PaddlePaddle GPU device: {err}")
    except Exception as err:
        print(f"✗ Error checking PaddlePaddle: {err}")
def check_environment_variables():
    """Print the CUDA-related environment variables.

    For PATH, only the entries that mention 'cuda' or 'nvidia' are
    shown.  Prints diagnostic lines only; returns None.
    """
    print("\n=== ENVIRONMENT VARIABLES ===")
    env_vars = [
        'CUDA_PATH', 'CUDA_PATH_V12_9', 'PATH',
        'CUDA_VISIBLE_DEVICES', 'CUDA_CACHE_PATH'
    ]
    for var in env_vars:
        value = os.environ.get(var)
        if not value:
            print(f"{var}: Not set")
            continue
        if var == 'PATH':
            # FIX: split on os.pathsep (';' on Windows, ':' elsewhere)
            # instead of a hard-coded ';' so the diagnostic is portable.
            cuda_paths = [p for p in value.split(os.pathsep)
                          if 'cuda' in p.lower() or 'nvidia' in p.lower()]
            print(f"{var}:")
            for path in cuda_paths:
                print(f" {path}")
        else:
            print(f"{var}: {value}")
def test_cudnn_directly():
    """Attempt to load a cuDNN DLL with ctypes and query its version.

    Windows-only (uses ctypes.WinDLL; on other platforms the outer
    except reports the error).  Returns True when a cuDNN DLL loads and
    reports a version, False otherwise.
    """
    print("\n=== DIRECT cuDNN TEST ===")
    try:
        import ctypes
        cuda_path = os.environ.get('CUDA_PATH')
        if not cuda_path:
            print("✗ CUDA_PATH not set")
            # BUGFIX: previously fell through returning None; the caller
            # branches on this result, so make every path return a bool.
            return False
        cudnn_paths = [
            os.path.join(cuda_path, 'bin', 'cudnn64_8.dll'),
            os.path.join(cuda_path, 'bin', 'cudnn64.dll'),
            r'C:\Program Files\NVIDIA\cudnn-windows-x86_64-8.9.7.29_cuda12-archive\bin\cudnn64_8.dll'
        ]
        for cudnn_path in cudnn_paths:
            if not os.path.exists(cudnn_path):
                continue
            print(f"Testing cuDNN: {cudnn_path}")
            try:
                cudnn = ctypes.WinDLL(cudnn_path)
            except Exception as e:
                print(f"✗ Failed to load {cudnn_path}: {e}")
                continue
            print(f"✓ Successfully loaded {os.path.basename(cudnn_path)}")
            try:
                cudnnGetVersion = cudnn.cudnnGetVersion
                # cudnnGetVersion() returns size_t; default restype
                # (c_int) would truncate it.
                cudnnGetVersion.restype = ctypes.c_size_t
                version = cudnnGetVersion()
                print(f"✓ cuDNN version: {version}")
                return True
            except Exception as e:
                print(f"✗ Cannot get cuDNN version: {e}")
                return False
        print("✗ No cuDNN DLL found to test")
        return False  # BUGFIX: explicit False instead of implicit None
    except Exception as e:
        print(f"✗ Error in direct cuDNN test: {e}")
        return False
def check_system_info():
    """Print basic OS, architecture, and processor information."""
    print("\n=== SYSTEM INFORMATION ===")
    os_name, os_release = platform.system(), platform.release()
    print(f"OS: {os_name} {os_release}")
    print(f"Architecture: {platform.architecture()[0]}")
    print(f"Processor: {platform.processor()}")
if __name__ == "__main__":
    banner = "=" * 50
    print("GPU MODE FAILURE DIAGNOSTIC")
    print(banner)

    # Run every diagnostic in order, from general to specific.
    for diagnostic in (check_system_info, check_environment_variables,
                       check_cuda_installation, check_cudnn_installation,
                       check_pytorch_gpu, check_paddle_gpu):
        diagnostic()
    cudnn_working = test_cudnn_directly()

    print("\n" + banner)
    print("DIAGNOSTIC SUMMARY:")
    if cudnn_working:
        summary = ("✓ cuDNN appears to be working correctly",
                   " The issue might be in PaddleOCR's cuDNN detection")
    else:
        summary = ("✗ cuDNN is NOT working correctly",
                   " This is the root cause of the GPU mode failure")
    for line in summary:
        print(line)

    print("\nRECOMMENDED ACTIONS:")
    for step in ("1. Reinstall cuDNN and ensure the DLLs are in CUDA_PATH/bin",
                 "2. Add cuDNN bin directory to system PATH",
                 "3. Restart the system after cuDNN installation",
                 "4. Verify cuDNN version compatibility with CUDA 12.9"):
        print(step)