Skip to main content

Monitoring

Health Check Endpoints


bash

# Server health: liveness endpoint of the tracking server.
curl -f http://mlflow.example.com/health
# API health: experiments/list was removed from the REST API in MLflow 2.0;
# experiments/search is its replacement. max_results is passed explicitly
# so the probe stays cheap and the request is always valid.
curl -f -X POST -H 'Content-Type: application/json' \
  -d '{"max_results": 1}' \
  http://mlflow.example.com/api/2.0/mlflow/experiments/search
# Database connectivity: runs/search is a POST endpoint, so a plain GET
# fails under -f. Experiment "0" is MLflow's default experiment.
curl -f -X POST -H 'Content-Type: application/json' \
  -d '{"experiment_ids": ["0"], "max_results": 1}' \
  http://mlflow.example.com/api/2.0/mlflow/runs/search

Prometheus Metrics

yaml
# prometheus.yml
# NOTE(review): assumes the MLflow server actually serves /metrics on port
# 5000 (e.g. started with --expose-prometheus) — confirm before deploying.
global:
  # Default scrape interval for every job that does not set its own.
  scrape_interval: 15s
scrape_configs:
  - job_name: 'mlflow'
    static_configs:
      - targets: ['mlflow.example.com:5000']
    metrics_path: '/metrics'
    # Job-level override of the global 15s interval.
    scrape_interval: 30s

Custom Metrics Collection

python
# mlflow_metrics.py
from prometheus_client import Counter, Histogram, Gauge, start_http_server
import mlflow
import time
# Prometheus collectors published by this script.
# experiment_runs and model_registrations are only declared here; nothing in
# the visible code increments them.
experiment_runs = Counter(
    name='mlflow_experiment_runs_total',
    documentation='Total experiment runs',
)
model_registrations = Counter(
    name='mlflow_model_registrations_total',
    documentation='Total model registrations',
)
api_response_time = Histogram(
    name='mlflow_api_response_time_seconds',
    documentation='API response time',
)
active_experiments = Gauge(
    name='mlflow_active_experiments',
    documentation='Number of active experiments',
)

# Start the metrics HTTP server on port 8000.
start_http_server(port=8000)
# Example metric collection
def collect_metrics():
    """Refresh the experiment gauge and time one run search against MLflow.

    Sets the ``active_experiments`` gauge to the number of experiments the
    tracking client returns, then records the duration of a single
    ``search_runs`` call in the ``api_response_time`` histogram.

    Any exception raised by the MLflow client propagates to the caller.
    """
    client = mlflow.tracking.MlflowClient()

    # Count active experiments. ``list_experiments`` was removed in
    # MLflow 2.0; ``search_experiments`` is its replacement.
    # NOTE(review): presumably this returns only the first page of active
    # experiments — confirm the count is exact for large deployments.
    experiments = client.search_experiments()
    active_experiments.set(len(experiments))

    # Track API response time. Only the latency is of interest, so the
    # search result itself is discarded.
    # NOTE(review): experiment id "1" is hard-coded — confirm it exists.
    with api_response_time.time():
        client.search_runs(experiment_ids=["1"])

Log Monitoring

bash

# Each command below follows its log until interrupted, so run them in
# separate terminals rather than as one script.

# MLflow server logs (-F follows by name, so it survives log rotation)
tail -F /opt/mlflow/logs/mlflow.log
# System logs
journalctl -u mlflow -f
# Nginx access logs (-F follows by name, so it survives log rotation)
tail -F /var/log/nginx/mlflow_access.log

Performance Monitoring


python
# performance_monitor.py
import psutil
import time
from mlflow.tracking import MlflowClient
def monitor_performance():
    """Print host resource usage and MLflow reachability in an endless loop.

    Each iteration samples CPU, memory and disk usage with psutil, probes
    the tracking server through ``MlflowClient``, prints one summary line
    and then sleeps for 60 seconds. The function never returns.
    """
    client = MlflowClient()
    while True:
        # CPU usage. A blocking 1-second sample is used because the first
        # non-blocking cpu_percent() call has no earlier sample to compare
        # against and returns a meaningless 0.0.
        cpu_percent = psutil.cpu_percent(interval=1)
        # Memory usage
        memory = psutil.virtual_memory()
        # Disk usage
        disk = psutil.disk_usage('/opt/mlflow')
        # Database connections: any client failure is reported as
        # "unhealthy" rather than stopping the monitor.
        # ``list_experiments`` was removed in MLflow 2.0, so the probe uses
        # ``search_experiments`` and asks for a single result to stay cheap.
        try:
            client.search_experiments(max_results=1)
            db_status = "healthy"
        except Exception:
            db_status = "unhealthy"
        print(f"CPU: {cpu_percent}%, Memory: {memory.percent}%, Disk: {disk.percent}%, DB: {db_status}")
        time.sleep(60)


if __name__ == "__main__":
    monitor_performance()