# railseek6/check_ollama_health.py

#!/usr/bin/env python3
"""
Robust Ollama health check utility
Checks if Ollama is available and has required models
"""

import requests
import time
import sys

def check_ollama_health(
    host="http://127.0.0.1:11434",
    timeout=10,
    retries=3,
    required_models=None,
    check_models=True
):
    """
    Check Ollama health with retries

    Requests ``<host>/api/tags`` and, when ``check_models`` is true, verifies
    that every required model name occurs in at least one installed model name.
    Failed attempts are retried with exponential backoff; a reachable server
    that is merely missing models is reported immediately without retrying.

    Args:
        host: Ollama host URL (a trailing slash is tolerated)
        timeout: Request timeout in seconds
        retries: Total number of connection attempts; a value below 1 makes
            no request and is reported as a failure
        required_models: List of required model names (substring match);
            when None, the two default models listed below are required
        check_models: Whether to check for required models

    Returns:
        tuple: (success: bool, message: str, models: list)
    """
    if required_models is None:
        required_models = ['snowflake-arctic-embed:latest', 'jina-reranker-v2:latest']

    # With retries < 1 the attempt loop would never run; say so explicitly
    # instead of reporting a connection failure that never happened.
    if retries < 1:
        return (
            False,
            f"No connection attempt made: retries must be at least 1 (got {retries})",
            []
        )

    # Strip trailing slashes so "http://host:11434/" does not yield "//api/tags".
    tags_url = host.rstrip("/") + "/api/tags"

    last_exception = None

    for attempt in range(retries):
        try:
            # Try to connect to Ollama
            response = requests.get(tags_url, timeout=timeout)

            if response.status_code == 200:
                data = response.json()
                # Treat a missing or null "models" value as "no models", and
                # skip entries that carry no usable name.
                models = data.get('models') or []
                model_names = [
                    model['name']
                    for model in models
                    if isinstance(model, dict) and model.get('name')
                ]

                # Check required models if requested
                if check_models:
                    missing_models = [
                        req_model
                        for req_model in required_models
                        if not any(req_model in name for name in model_names)
                    ]

                    if missing_models:
                        return (
                            False,
                            f"Ollama is running but missing models: {', '.join(missing_models)}",
                            model_names
                        )

                return (
                    True,
                    f"Ollama is running with {len(model_names)} models",
                    model_names
                )

            last_exception = f"HTTP {response.status_code}: {response.text}"

        # Timeout must be handled before ConnectionError: requests'
        # ConnectTimeout derives from both, and would otherwise be labelled
        # as a plain connection error.
        except requests.exceptions.Timeout as e:
            last_exception = f"Timeout error: {e}"
        except requests.exceptions.ConnectionError as e:
            last_exception = f"Connection error: {e}"
        except Exception as e:
            # Best-effort health check: any other failure (e.g. a non-JSON
            # body) is recorded and retried rather than raised to the caller.
            last_exception = f"Unexpected error: {e}"

        # Wait before retry (exponential backoff); no wait after the last attempt
        if attempt < retries - 1:
            wait_time = 2 ** attempt  # 1, 2, 4, ... seconds
            print(f"  Attempt {attempt + 1} failed, retrying in {wait_time}s...")
            time.sleep(wait_time)

    return False, f"Failed to connect to Ollama after {retries} attempts: {last_exception}", []

def main():
    """Run the health check from the command line.

    Parses the CLI options, calls check_ollama_health with them, prints the
    outcome, and exits with status 0 on success or 1 on failure.
    """
    import argparse

    cli = argparse.ArgumentParser(description="Check Ollama health")
    cli.add_argument("--host", default="http://127.0.0.1:11434", help="Ollama host URL")
    cli.add_argument("--timeout", type=int, default=10, help="Request timeout")
    cli.add_argument("--retries", type=int, default=3, help="Number of retries")
    cli.add_argument("--skip-models", action="store_true", help="Skip model checking")
    cli.add_argument(
        "--required",
        nargs="+",
        default=['snowflake-arctic-embed:latest', 'jina-reranker-v2:latest'],
        help="Required models (substring match)",
    )
    opts = cli.parse_args()

    print(f"🔍 Checking Ollama at {opts.host}...")
    healthy, summary, installed = check_ollama_health(
        host=opts.host,
        timeout=opts.timeout,
        retries=opts.retries,
        required_models=opts.required,
        check_models=not opts.skip_models,
    )

    # Failure path first: report and stop with a non-zero status.
    if not healthy:
        print(f"❌ {summary}")
        sys.exit(1)

    print(f"✅ {summary}")
    if installed:
        preview_limit = 10  # Only the first ten names (sorted) are listed
        print(f"📦 Available models ({len(installed)}):")
        for name in sorted(installed)[:preview_limit]:
            print(f"   - {name}")
        hidden = len(installed) - preview_limit
        if hidden > 0:
            print(f"   ... and {hidden} more")
    sys.exit(0)

# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()