feat: add production-ready Prometheus & Alertmanager configs

- prometheus.yml: Service discovery, alerting, multi-job scraping
- alertmanager.yml: Routing tree, inhibition rules, multi-channel
- node-exporter.yml: 30+ alert rules (CPU, memory, disk, network, system)
- File-based service discovery for dynamic host management
- Updated README with usage docs and alert catalog

Alert categories: availability, resource saturation, predictive disk-fill,
I/O latency, network errors, clock sync, OOM detection, conntrack.
A sketch of the file-based SD wiring follows.
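
How file-based SD fits together (a sketch; paths and hostnames below are
illustrative, not the committed values): prometheus.yml points a scrape job
at target files on disk,

    scrape_configs:
      - job_name: 'node'
        file_sd_configs:
          - files: ['/etc/prometheus/targets/*.yml']
            refresh_interval: 5m

and hosts are added or removed by editing a targets file such as

    - targets: ['host-a.internal:9100', 'host-b.internal:9100']
      labels:
        env: production

which Prometheus picks up without a restart.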
commit d310d6ebbe (parent c6c841b3a9)
Author: Greg Hendrickson
Date:   2026-02-04 18:02:41 +00:00

5 changed files with 711 additions and 22 deletions

alertmanager.yml

@@ -0,0 +1,153 @@
# Alertmanager Configuration
# Production-ready template with multi-channel routing
# Docs: https://prometheus.io/docs/alerting/latest/configuration/

global:
  # Default SMTP settings (configure as needed)
  # smtp_smarthost: 'smtp.example.com:587'
  # smtp_from: 'alertmanager@example.com'
  # smtp_auth_username: 'alertmanager'
  # smtp_auth_password: '{{ .SMTP_PASSWORD }}'
  # smtp_require_tls: true

  # Slack API URL (set via environment or secrets)
  # slack_api_url: '{{ .SLACK_WEBHOOK_URL }}'

  # PagerDuty routing key
  # pagerduty_url: 'https://events.pagerduty.com/v2/enqueue'

  # Default resolve timeout
  resolve_timeout: 5m
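
# Note: Alertmanager does not expand '{{ .SMTP_PASSWORD }}'-style placeholders
# in this file itself; template expansion only applies to notification fields.
# Render these values at deploy time (envsubst, secrets tooling, or similar).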

# Inhibition rules - suppress lower-severity alerts while a higher-severity one fires
inhibit_rules:
  # If critical fires, suppress warning for the same alertname + instance
  - source_matchers:
      - severity = critical
    target_matchers:
      - severity = warning
    equal: ['alertname', 'instance']
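    # e.g. a critical and a warning alert both named NodeMemoryHigh on
    # host-a:9100 collapse into just the critical notification; other
    # alert names and instances are unaffected (the name is illustrative)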
  # If a cluster-wide alert fires, suppress instance-level alerts
  - source_matchers:
      - severity = critical
      - scope = cluster
    target_matchers:
      - severity =~ "warning|critical"
      - scope != cluster
    equal: ['alertname']

# Route tree
route:
  # Default receiver
  receiver: 'default-receiver'
  # Group alerts by these labels
  group_by: ['alertname', 'severity', 'instance']
  # Wait before sending the initial notification
  group_wait: 30s
  # Wait before sending updated notifications
  group_interval: 5m
  # Minimum wait before resending
  repeat_interval: 4h

  # Child routes (evaluated in order; first match wins unless `continue: true`)
  routes:
    # Critical alerts -> immediate paging
    - matchers:
        - severity = critical
      receiver: 'pagerduty-critical'
      group_wait: 10s
      repeat_interval: 1h
      continue: true # Keep evaluating so the Slack mirror route below also matches
    # Critical alerts (mirror) -> Slack
    - matchers:
        - severity = critical
      receiver: 'slack-warnings'
      group_wait: 10s
      repeat_interval: 1h
    # Warning alerts -> Slack
    - matchers:
        - severity = warning
      receiver: 'slack-warnings'
      group_wait: 1m
      repeat_interval: 4h
    # Info alerts -> low-priority channel
    - matchers:
        - severity = info
      receiver: 'slack-info'
      group_wait: 5m
      repeat_interval: 24h
    # Watchdog/dead man's switch heartbeat -> discarded here; in production,
    # point this at an external heartbeat service instead
    - matchers:
        - alertname = Watchdog
      receiver: 'null'
      repeat_interval: 5m
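
# Tip: the routing tree can be dry-run tested with amtool, e.g.
#   amtool config routes test --config.file=alertmanager.yml severity=critical
# which prints the receiver(s) a given label set would be delivered to.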

# Receivers
receivers:
  # Null receiver (discard alerts)
  - name: 'null'

  # Default fallback
  - name: 'default-receiver'
    # Uncomment and configure your preferred channel:
    # slack_configs:
    #   - channel: '#alerts'
    #     send_resolved: true
    #     title: '{{ template "slack.default.title" . }}'
    #     text: '{{ template "slack.default.text" . }}'

  # Critical alerts -> PagerDuty
  - name: 'pagerduty-critical'
    # pagerduty_configs:
    #   - routing_key: '{{ .PAGERDUTY_ROUTING_KEY }}'
    #     severity: critical
    #     description: '{{ .CommonAnnotations.summary }}'
    #     details:
    #       firing: '{{ template "pagerduty.default.instances" .Alerts.Firing }}'
    #       resolved: '{{ template "pagerduty.default.instances" .Alerts.Resolved }}'

  # Warnings -> Slack
  - name: 'slack-warnings'
    # slack_configs:
    #   - channel: '#alerts-warnings'
    #     send_resolved: true
    #     color: '{{ if eq .Status "firing" }}warning{{ else }}good{{ end }}'
    #     title: '{{ template "slack.default.title" . }}'
    #     text: '{{ template "slack.default.text" . }}'
    #     actions:
    #       - type: button
    #         text: 'Runbook :book:'
    #         url: '{{ (index .Alerts 0).Annotations.runbook_url }}'
    #       - type: button
    #         text: 'Silence :mute:'
    #         url: '{{ template "slack.default.silence_url" . }}'
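    # Note: the Runbook button assumes every alert routed here carries a
    # runbook_url annotation; without one the button URL is empty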

  # Info -> Slack (low priority)
  - name: 'slack-info'
    # slack_configs:
    #   - channel: '#alerts-info'
    #     send_resolved: false
    #     title: '{{ .CommonLabels.alertname }}'
    #     text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'

  # Email receiver (optional)
  - name: 'email-oncall'
    # email_configs:
    #   - to: 'oncall@example.com'
    #     send_resolved: true
    #     headers:
    #       Subject: '[{{ .Status | toUpper }}] {{ .CommonLabels.alertname }}'

  # Webhook receiver (for custom integrations)
  - name: 'webhook-custom'
    # webhook_configs:
    #   - url: 'http://alerthandler.internal/webhook'
    #     send_resolved: true
    #     http_config:
    #       bearer_token: '{{ .WEBHOOK_TOKEN }}'

# Templates
# templates:
#   - '/etc/alertmanager/templates/*.tmpl'
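
# A minimal custom template file for that directory might look like this
# (the template name "slack.custom.title" is illustrative):
#
#   {{ define "slack.custom.title" }}[{{ .Status | toUpper }}] {{ .CommonLabels.alertname }}{{ end }}
#
# Receivers would then reference it as: title: '{{ template "slack.custom.title" . }}'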