---
# PrometheusRule defining host- and Kubernetes-level alerts for the backend.
# Contains a single rule group ("backend.rules") with three alerts:
# HostHighCpuLoad, KubernetesPodCrashLooping and InstanceDown.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: backend-rules
  namespace: monitoring
  labels:
    app: api-app-backend
    # NOTE(review): presumably this label is what the Prometheus instance's
    # rule selector matches on - confirm against the Prometheus resource.
    release: monitoring
spec:
  groups:
    - name: backend.rules
      rules:
        # Fires when the non-idle CPU percentage of an instance, derived from
        # the 2-minute rate of idle CPU seconds averaged per instance, stays
        # above 50 for 2 minutes.
        - alert: HostHighCpuLoad
          # The aggregation argument must be parenthesised:
          # `avg by(instance) (rate(...))`. Without the inner parentheses the
          # expression is not valid PromQL and the rule cannot be loaded.
          expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 50
          for: 2m
          labels:
            severity: warning
            namespace: monitoring
          annotations:
            summary: "Host CPU load high"
            description: "CPU load on host is over 50% \n Value = {{$value}} \n Instance = {{$labels.instance}}"

        # Fires immediately (for: 0m) when a container's restart counter
        # exceeds 5.
        # NOTE(review): this compares the raw cumulative counter, so the alert
        # keeps firing for as long as the series exists once the threshold is
        # crossed. Consider a windowed form such as
        # `increase(kube_pod_container_status_restarts_total[1h]) > 5` if only
        # recent restarts should alert - confirm intended behavior.
        - alert: KubernetesPodCrashLooping
          expr: kube_pod_container_status_restarts_total > 5
          for: 0m
          labels:
            severity: critical
            namespace: monitoring
          annotations:
            summary: "Kubernetes pod crash looping"
            description: "Pod {{ $labels.pod }} is crash looping \n Value = {{ $value }}"

        # Fires when a scrape target has reported `up == 0` for 1 minute.
        - alert: InstanceDown
          expr: up == 0
          for: 1m
          labels:
            severity: critical
            namespace: monitoring
          annotations:
            summary: "Instance {{ $labels.instance }} down"
            description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute."