mirror of
https://github.com/ghndrx/monitoring-stack.git
synced 2026-02-10 06:45:11 +00:00
- prometheus.yml: Service discovery, alerting, multi-job scraping - alertmanager.yml: Routing tree, inhibition rules, multi-channel - node-exporter.yml: 30+ alert rules (CPU, memory, disk, network, system) - File-based service discovery for dynamic host management - Updated README with usage docs and alert catalog Alert categories: availability, resource saturation, disk predictive, I/O latency, network errors, clock sync, OOM detection, conntrack
154 lines
4.4 KiB
YAML
154 lines
4.4 KiB
YAML
# Alertmanager Configuration
|
||
# Production-ready template with multi-channel routing
|
||
# Docs: https://prometheus.io/docs/alerting/latest/configuration/
|
||
|
||
global:
  # Default resolve timeout (the only setting active in this section).
  resolve_timeout: 5m

  # Everything below is an optional, commented-out default for the
  # notification integrations. Uncomment only what you actually use.
  #
  # NOTE(review): the '{{ .NAME }}' values are placeholders. Alertmanager
  # does not read environment variables when loading this file, so they
  # must be rendered by an external templating step (or replaced with the
  # corresponding `*_file` option, e.g. `smtp_auth_password_file` /
  # `slack_api_url_file`, where your Alertmanager version supports it).

  # Default SMTP settings (configure as needed)
  # smtp_smarthost: 'smtp.example.com:587'
  # smtp_from: 'alertmanager@example.com'
  # smtp_auth_username: 'alertmanager'
  # smtp_auth_password: '{{ .SMTP_PASSWORD }}'
  # smtp_require_tls: true

  # Slack incoming-webhook URL (treat as a secret)
  # slack_api_url: '{{ .SLACK_WEBHOOK_URL }}'

  # PagerDuty Events API endpoint (this is the URL, not the routing key;
  # the routing key belongs in the receiver's pagerduty_configs)
  # pagerduty_url: 'https://events.pagerduty.com/v2/enqueue'
|
||
|
||
# Inhibition rules - mute lower-priority alerts while a higher-priority
# alert describing the same problem is firing.
inhibit_rules:
  # Rule 1: a firing critical alert mutes the warning that has the same
  # alertname and the same instance.
  - equal:
      - alertname
      - instance
    source_matchers:
      - severity = critical
    target_matchers:
      - severity = warning

  # Rule 2: a firing critical alert labelled scope=cluster mutes warning
  # and critical alerts of the same alertname whose scope is anything
  # other than "cluster".
  - equal:
      - alertname
    source_matchers:
      - severity = critical
      - scope = cluster
    target_matchers:
      - severity =~ warning|critical
      - scope != cluster
|
||
|
||
# Route tree
route:
  # Fallback receiver for alerts that no child route claims
  receiver: 'default-receiver'

  # Alerts that share all of these label values are batched into a single
  # notification. Because 'instance' is listed, each instance gets its own
  # notification group.
  group_by: ['alertname', 'severity', 'instance']

  # Wait before sending the first notification for a new group
  group_wait: 30s

  # Wait before notifying about new alerts added to an existing group
  group_interval: 5m

  # Minimum wait before re-sending a notification that was already sent
  repeat_interval: 4h

  # Child routes are evaluated top to bottom. Evaluation stops at the first
  # match unless that route sets `continue: true`, in which case the
  # following siblings are tried as well.
  routes:
    # Watchdog / dead-man's-switch heartbeat. Listed FIRST so it is claimed
    # here regardless of any severity label it carries and can never page.
    # It is routed to 'null', i.e. discarded; point it at a heartbeat
    # webhook receiver instead if you want real dead-man monitoring.
    - matchers:
        - alertname = Watchdog
      receiver: 'null'
      repeat_interval: 5m

    # Critical alerts -> immediate paging
    - matchers:
        - severity = critical
      receiver: 'pagerduty-critical'
      group_wait: 10s
      repeat_interval: 1h
      # Keep evaluating siblings so the next route can mirror to Slack
      continue: true

    # Critical alerts -> also mirrored to Slack. `continue: true` above only
    # has an effect if a later sibling matches critical alerts, so this
    # route provides that match. It reuses the existing Slack receiver;
    # swap in a dedicated critical channel receiver if you define one.
    - matchers:
        - severity = critical
      receiver: 'slack-warnings'
      group_wait: 10s
      repeat_interval: 1h

    # Warning alerts -> Slack
    - matchers:
        - severity = warning
      receiver: 'slack-warnings'
      group_wait: 1m
      repeat_interval: 4h

    # Info alerts -> low priority channel
    - matchers:
        - severity = info
      receiver: 'slack-info'
      group_wait: 5m
      repeat_interval: 24h
|
||
|
||
# Receivers - notification targets referenced by name from the route tree.
# Every integration below is commented out, so until one is enabled each
# receiver is defined by name only.
receivers:
  # Null receiver (discard alerts). The name must stay quoted: a bare
  # null would be parsed as YAML null rather than the string "null".
  - name: 'null'

  # Default fallback (used by the root route)
  - name: 'default-receiver'
    # Uncomment and configure your preferred channel:
    # slack_configs:
    #   - channel: '#alerts'
    #     send_resolved: true
    #     title: '{{ template "slack.default.title" . }}'
    #     text: '{{ template "slack.default.text" . }}'

  # Critical alerts -> PagerDuty
  - name: 'pagerduty-critical'
    # NOTE(review): '{{ .PAGERDUTY_ROUTING_KEY }}' is a placeholder;
    # confirm how it gets substituted before enabling this block.
    # pagerduty_configs:
    #   - routing_key: '{{ .PAGERDUTY_ROUTING_KEY }}'
    #     severity: critical
    #     description: '{{ .CommonAnnotations.summary }}'
    #     details:
    #       firing: '{{ template "pagerduty.default.instances" .Alerts.Firing }}'
    #       resolved: '{{ template "pagerduty.default.instances" .Alerts.Resolved }}'

  # Warnings -> Slack
  - name: 'slack-warnings'
    # slack_configs:
    #   - channel: '#alerts-warnings'
    #     send_resolved: true
    #     color: '{{ if eq .Status "firing" }}warning{{ else }}good{{ end }}'
    #     title: '{{ template "slack.default.title" . }}'
    #     text: '{{ template "slack.default.text" . }}'
    #     actions:
    #       - type: button
    #         text: 'Runbook :book:'
    #         url: '{{ (index .Alerts 0).Annotations.runbook_url }}'
    #       - type: button
    #         text: 'Silence :mute:'
    #         url: '{{ template "slack.default.silence_url" . }}'

  # Info -> Slack (low priority)
  - name: 'slack-info'
    # slack_configs:
    #   - channel: '#alerts-info'
    #     send_resolved: false
    #     title: 'ℹ️ {{ .CommonLabels.alertname }}'
    #     text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'

  # Email receiver (optional; not referenced by any route in this file)
  - name: 'email-oncall'
    # email_configs:
    #   - to: 'oncall@example.com'
    #     send_resolved: true
    #     headers:
    #       Subject: '[{{ .Status | toUpper }}] {{ .CommonLabels.alertname }}'

  # Webhook receiver (for custom integrations; not referenced by any route
  # in this file)
  - name: 'webhook-custom'
    # NOTE(review): recent Alertmanager releases prefer
    # `authorization: { credentials: ... }` over `bearer_token` in
    # http_config - verify against the version you deploy.
    # webhook_configs:
    #   - url: 'http://alerthandler.internal/webhook'
    #     send_resolved: true
    #     http_config:
    #       bearer_token: '{{ .WEBHOOK_TOKEN }}'
# bearer_token: '{{ .WEBHOOK_TOKEN }}'
|
||
|
||
# Templates
|
||
# templates:
|
||
# - '/etc/alertmanager/templates/*.tmpl'
|