Files
monitoring-stack/alertmanager/alertmanager.yml
Greg Hendrickson d310d6ebbe feat: add production-ready Prometheus & Alertmanager configs
- prometheus.yml: Service discovery, alerting, multi-job scraping
- alertmanager.yml: Routing tree, inhibition rules, multi-channel
- node-exporter.yml: 30+ alert rules (CPU, memory, disk, network, system)
- File-based service discovery for dynamic host management
- Updated README with usage docs and alert catalog

Alert categories: availability, resource saturation, disk predictive,
I/O latency, network errors, clock sync, OOM detection, conntrack
2026-02-04 18:02:47 +00:00

154 lines
4.4 KiB
YAML
Raw Permalink Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# Alertmanager Configuration
# Production-ready template with multi-channel routing
# Docs: https://prometheus.io/docs/alerting/latest/configuration/
global:
  # Time after which an alert is declared resolved if it has not been
  # updated and carries no end time of its own.
  resolve_timeout: 5m

  # NOTE: Alertmanager does not expand environment variables or Go templates
  # in this file. Keep secrets out of version control by pointing the
  # `*_file` options below at files mounted from your secret store, or render
  # this file with an external tool (envsubst, Helm, Ansible, ...) at deploy
  # time.

  # Default SMTP settings (uncomment and configure as needed)
  # smtp_smarthost: 'smtp.example.com:587'
  # smtp_from: 'alertmanager@example.com'
  # smtp_auth_username: 'alertmanager'
  # smtp_auth_password_file: '/etc/alertmanager/secrets/smtp_password'
  # smtp_require_tls: true

  # Slack incoming-webhook URL, read from a mounted secret file
  # slack_api_url_file: '/etc/alertmanager/secrets/slack_webhook_url'

  # PagerDuty Events API endpoint (the routing key itself is configured per
  # receiver under `pagerduty_configs`)
  # pagerduty_url: 'https://events.pagerduty.com/v2/enqueue'
# Inhibition rules: mute a target alert while a matching source alert is
# firing and both carry identical values for every label listed in `equal`.
inhibit_rules:
  # A firing critical alert mutes the warning-level alert that shares its
  # alertname and instance, so one problem does not notify twice.
  - source_matchers:
      - severity = critical
    target_matchers:
      - severity = warning
    equal:
      - alertname
      - instance

  # A firing critical, cluster-scoped alert mutes every warning/critical alert
  # of the same alertname that is not cluster-scoped. Alerts without a `scope`
  # label also satisfy `scope != cluster`, because a missing label compares as
  # the empty string.
  - source_matchers:
      - severity = critical
      - scope = cluster
    target_matchers:
      - severity =~ warning|critical
      - scope != cluster
    equal:
      - alertname
# Route tree
route:
  # Fallback receiver for alerts that match none of the child routes
  receiver: 'default-receiver'
  # Alerts sharing these label values are batched into one notification
  group_by: ['alertname', 'severity', 'instance']
  # Wait before sending the first notification for a new group
  group_wait: 30s
  # Wait before notifying about new alerts added to an existing group
  group_interval: 5m
  # Minimum wait before re-sending a notification that is still firing
  repeat_interval: 4h

  # Child routes are evaluated top to bottom. Matching stops at the first
  # route that matches unless that route sets `continue: true`, in which case
  # later siblings are evaluated as well.
  routes:
    # Watchdog / dead-man's-switch heartbeat. Listed first so that a Watchdog
    # alert carrying a `severity` label is never paged or posted by the
    # severity routes below. The 'null' receiver discards it; point this route
    # at a webhook receiver if an external dead-man's-switch service should
    # be notified instead.
    - matchers:
        - alertname = Watchdog
      receiver: 'null'
      repeat_interval: 5m

    # Critical alerts -> immediate paging. `continue: true` lets the alert
    # fall through to the next matching sibling (the Slack copy below).
    - matchers:
        - severity = critical
      receiver: 'pagerduty-critical'
      group_wait: 10s
      repeat_interval: 1h
      continue: true

    # Critical alerts -> Slack copy. Without this sibling the `continue: true`
    # above has no effect, because no other route matches severity=critical.
    # Swap in a dedicated receiver if criticals should use their own channel.
    - matchers:
        - severity = critical
      receiver: 'slack-warnings'
      group_wait: 10s
      repeat_interval: 4h

    # Warning alerts -> Slack
    - matchers:
        - severity = warning
      receiver: 'slack-warnings'
      group_wait: 1m
      repeat_interval: 4h

    # Info alerts -> low priority channel
    - matchers:
        - severity = info
      receiver: 'slack-info'
      group_wait: 5m
      repeat_interval: 24h
# Receivers
# Every receiver below is declared without an active integration, so it
# silently drops notifications until one of its commented `*_configs` blocks
# is enabled. Alertmanager does not expand environment variables in this
# file; secrets are therefore referenced through `*_file` options.
receivers:
  # Null receiver (discard alerts)
  - name: 'null'

  # Default fallback
  - name: 'default-receiver'
    # Uncomment and configure your preferred channel:
    # slack_configs:
    #   - channel: '#alerts'
    #     send_resolved: true
    #     title: '{{ template "slack.default.title" . }}'
    #     text: '{{ template "slack.default.text" . }}'

  # Critical alerts -> PagerDuty
  - name: 'pagerduty-critical'
    # pagerduty_configs:
    #   - routing_key_file: '/etc/alertmanager/secrets/pagerduty_routing_key'
    #     severity: critical
    #     description: '{{ .CommonAnnotations.summary }}'
    #     details:
    #       firing: '{{ template "pagerduty.default.instances" .Alerts.Firing }}'
    #       resolved: '{{ template "pagerduty.default.instances" .Alerts.Resolved }}'

  # Warnings -> Slack
  - name: 'slack-warnings'
    # slack_configs:
    #   - channel: '#alerts-warnings'
    #     send_resolved: true
    #     color: '{{ if eq .Status "firing" }}warning{{ else }}good{{ end }}'
    #     title: '{{ template "slack.default.title" . }}'
    #     text: '{{ template "slack.default.text" . }}'
    #     actions:
    #       - type: button
    #         text: 'Runbook :book:'
    #         url: '{{ (index .Alerts 0).Annotations.runbook_url }}'
    #       - type: button
    #         text: 'Silence :mute:'
    #         url: '{{ template "slack.default.silence_url" . }}'

  # Info -> Slack (low priority)
  - name: 'slack-info'
    # slack_configs:
    #   - channel: '#alerts-info'
    #     send_resolved: false
    #     title: ' {{ .CommonLabels.alertname }}'
    #     text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'

  # Email receiver (optional; not referenced by any route in this file)
  - name: 'email-oncall'
    # email_configs:
    #   - to: 'oncall@example.com'
    #     send_resolved: true
    #     headers:
    #       Subject: '[{{ .Status | toUpper }}] {{ .CommonLabels.alertname }}'

  # Webhook receiver (for custom integrations; not referenced by any route
  # in this file)
  - name: 'webhook-custom'
    # webhook_configs:
    #   - url: 'http://alerthandler.internal/webhook'
    #     send_resolved: true
    #     http_config:
    #       authorization:
    #         type: Bearer
    #         credentials_file: '/etc/alertmanager/secrets/webhook_token'
# Templates
# templates:
# - '/etc/alertmanager/templates/*.tmpl'