feat: add production-ready Prometheus & Alertmanager configs

- prometheus.yml: Service discovery, alerting, multi-job scraping
- alertmanager.yml: Routing tree, inhibition rules, multi-channel
- node-exporter.yml: 24 alert rules (CPU, memory, disk, network, system)
- File-based service discovery for dynamic host management
- Updated README with usage docs and alert catalog

Alert categories: availability, resource saturation, disk predictive,
I/O latency, network errors, clock sync, OOM detection, conntrack
Greg Hendrickson
2026-02-04 18:02:41 +00:00
parent c6c841b3a9
commit d310d6ebbe
5 changed files with 711 additions and 22 deletions

prometheus/prometheus.yml

@@ -0,0 +1,147 @@
# Prometheus Configuration
# Production-ready configuration with alerting and service discovery

global:
  # Default scrape interval
  scrape_interval: 15s
  # Evaluation interval for rules
  evaluation_interval: 15s
  # Scrape timeout
  scrape_timeout: 10s
  # External labels (sent with alerts and remote write)
  external_labels:
    cluster: 'production'
    environment: 'prod'
    # region: 'us-east-1'

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093
      # Timeout for Alertmanager requests
      timeout: 10s
      # API version
      api_version: v2

# Rule files - evaluate these periodically
rule_files:
  - 'rules/*.yml'

# Scrape configurations
scrape_configs:
  # ========================================
  # Prometheus self-monitoring
  # ========================================
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']
    relabel_configs:
      - target_label: instance
        replacement: 'prometheus'

  # ========================================
  # Alertmanager
  # ========================================
  - job_name: 'alertmanager'
    static_configs:
      - targets: ['alertmanager:9093']
    relabel_configs:
      - target_label: instance
        replacement: 'alertmanager'

  # ========================================
  # Node Exporter - Host metrics
  # ========================================
  - job_name: 'node-exporter'
    # Static targets (for fixed hosts)
    static_configs:
      - targets:
          - 'node-exporter:9100'
        labels:
          env: 'production'
    # File-based service discovery (add/remove hosts dynamically)
    file_sd_configs:
      - files:
          - 'targets/node-exporter.yml'
        refresh_interval: 30s
    # Relabeling
    relabel_configs:
      # Extract hostname from target address
      - source_labels: [__address__]
        regex: '([^:]+):\d+'
        target_label: hostname
        replacement: '${1}'
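        # e.g. a target 'web-01:9100' ends up with hostname="web-01" (illustrative example)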

  # ========================================
  # Docker / cAdvisor - Container metrics
  # ========================================
  - job_name: 'cadvisor'
    static_configs:
      - targets: ['cadvisor:8080']
    metric_relabel_configs:
      # Drop high-cardinality container metrics if needed
      - source_labels: [__name__]
        regex: 'container_(network_tcp_usage_total|tasks_state|cpu_load_average_10s)'
        action: drop

  # ========================================
  # Kubernetes Service Discovery (if applicable)
  # ========================================
  # - job_name: 'kubernetes-pods'
  #   kubernetes_sd_configs:
  #     - role: pod
  #   relabel_configs:
  #     - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
  #       action: keep
  #       regex: true
  #     - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
  #       action: replace
  #       target_label: __metrics_path__
  #       regex: (.+)
  #     - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
  #       action: replace
  #       regex: ([^:]+)(?::\d+)?;(\d+)
  #       replacement: $1:$2
  #       target_label: __address__
  #     - source_labels: [__meta_kubernetes_namespace]
  #       target_label: namespace
  #     - source_labels: [__meta_kubernetes_pod_name]
  #       target_label: pod

  # ========================================
  # Blackbox Exporter - Synthetic monitoring
  # ========================================
  # - job_name: 'blackbox-http'
  #   metrics_path: /probe
  #   params:
  #     module: [http_2xx]
  #   file_sd_configs:
  #     - files:
  #         - 'targets/blackbox-http.yml'
  #   relabel_configs:
  #     - source_labels: [__address__]
  #       target_label: __param_target
  #     - source_labels: [__param_target]
  #       target_label: instance
  #     - target_label: __address__
  #       replacement: blackbox-exporter:9115

# Remote write (for long-term storage)
# remote_write:
#   - url: 'http://mimir:9009/api/v1/push'
#     queue_config:
#       max_samples_per_send: 1000
#       max_shards: 200
#       capacity: 2500

# Remote read (for querying long-term storage)
# remote_read:
#   - url: 'http://mimir:9009/prometheus/api/v1/read'
#     read_recent: true
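
If the commented-out blackbox job is enabled, the file it reads targets from ('targets/blackbox-http.yml') uses the same file_sd format as the node-exporter target file in this commit. A minimal sketch, with placeholder URLs and labels (nothing below is part of the committed files):

# targets/blackbox-http.yml (illustrative sketch)
- targets:
    - 'https://www.example.com'
    - 'https://api.example.com/health'
  labels:
    # labels here are attached to the resulting probe metrics
    team: 'platform'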

@@ -0,0 +1,274 @@
# Node Exporter Alert Rules
# Production-ready host/VM monitoring alerts
# Reference: https://samber.github.io/awesome-prometheus-alerts/

groups:
  - name: node-exporter-alerts
    rules:
      # ============================================
      # Availability
      # ============================================
      - alert: NodeDown
        expr: up{job="node-exporter"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Node exporter down ({{ $labels.instance }})"
          description: "Node exporter is not responding.\n Instance: {{ $labels.instance }}"
          runbook_url: "https://runbooks.example.com/node-down"

      # ============================================
      # CPU
      # ============================================
      - alert: HostHighCpuLoad
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU load ({{ $labels.instance }})"
          description: "CPU load is > 80%\n Current value: {{ $value | printf \"%.1f\" }}%"

      - alert: HostCriticalCpuLoad
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 95
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Critical CPU load ({{ $labels.instance }})"
          description: "CPU load is > 95%\n Current value: {{ $value | printf \"%.1f\" }}%"

      - alert: HostCpuStealNoisyNeighbor
        expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "CPU steal high - noisy neighbor ({{ $labels.instance }})"
          description: "CPU steal is > 10%. A noisy neighbor VM is consuming resources.\n Current value: {{ $value | printf \"%.1f\" }}%"

      # ============================================
      # Memory
      # ============================================
      - alert: HostHighMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage ({{ $labels.instance }})"
          description: "Memory usage is > 85%\n Current value: {{ $value | printf \"%.1f\" }}%"

      - alert: HostCriticalMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 95
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Critical memory usage ({{ $labels.instance }})"
          description: "Memory usage is > 95%\n Current value: {{ $value | printf \"%.1f\" }}%"

      - alert: HostMemoryUnderPressure
        expr: rate(node_vmstat_pgmajfault[5m]) > 1000
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Host memory under pressure ({{ $labels.instance }})"
          description: "High rate of major page faults ({{ $value | printf \"%.0f\" }}/sec). System may be swapping."

      - alert: HostOomKillDetected
        expr: increase(node_vmstat_oom_kill[5m]) > 0
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: "OOM kill detected ({{ $labels.instance }})"
          description: "OOM kill detected. A process was terminated due to memory pressure."

      # ============================================
      # Disk Space
      # ============================================
      - alert: HostDiskSpaceWarning
        expr: |
          (
            node_filesystem_avail_bytes * 100 / node_filesystem_size_bytes < 20
            and node_filesystem_readonly == 0
            and node_filesystem_size_bytes > 1e9
          )
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Disk space < 20% ({{ $labels.instance }})"
          description: "Disk {{ $labels.mountpoint }} is almost full (< 20% free)\n Current free: {{ $value | printf \"%.1f\" }}%"

      - alert: HostDiskSpaceCritical
        expr: |
          (
            node_filesystem_avail_bytes * 100 / node_filesystem_size_bytes < 10
            and node_filesystem_readonly == 0
            and node_filesystem_size_bytes > 1e9
          )
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Disk space < 10% ({{ $labels.instance }})"
          description: "Disk {{ $labels.mountpoint }} is critically full (< 10% free)\n Current free: {{ $value | printf \"%.1f\" }}%"

      - alert: HostDiskWillFillIn24Hours
        expr: |
          (
            node_filesystem_avail_bytes * 100 / node_filesystem_size_bytes < 10
            and predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs|nfs"}[6h], 24 * 3600) < 0
            and node_filesystem_readonly == 0
          )
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Disk will fill within 24h ({{ $labels.instance }})"
          description: "Filesystem {{ $labels.mountpoint }} is predicted to run out of space within 24 hours at current write rate."

      # ============================================
      # Disk I/O
      # ============================================
      - alert: HostDiskReadLatency
        expr: rate(node_disk_read_time_seconds_total[5m]) / rate(node_disk_reads_completed_total[5m]) > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High disk read latency ({{ $labels.instance }})"
          description: "Disk read latency is > 100ms\n Current value: {{ $value | humanizeDuration }}"

      - alert: HostDiskWriteLatency
        expr: rate(node_disk_write_time_seconds_total[5m]) / rate(node_disk_writes_completed_total[5m]) > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High disk write latency ({{ $labels.instance }})"
          description: "Disk write latency is > 100ms\n Current value: {{ $value | humanizeDuration }}"

      - alert: HostDiskIOSaturation
        expr: rate(node_disk_io_time_weighted_seconds_total[5m]) > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Disk I/O saturated ({{ $labels.instance }})"
          description: "Disk I/O is saturated on {{ $labels.device }}. Average I/O queue depth: {{ $value | printf \"%.1f\" }}"

      # ============================================
      # Network
      # ============================================
      - alert: HostNetworkReceiveErrors
        expr: rate(node_network_receive_errs_total[5m]) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Network receive errors ({{ $labels.instance }})"
          description: "Network interface {{ $labels.device }} has receive errors.\n Rate: {{ $value | printf \"%.1f\" }}/s"

      - alert: HostNetworkTransmitErrors
        expr: rate(node_network_transmit_errs_total[5m]) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Network transmit errors ({{ $labels.instance }})"
          description: "Network interface {{ $labels.device }} has transmit errors.\n Rate: {{ $value | printf \"%.1f\" }}/s"

      - alert: HostNetworkInterfaceSaturated
        expr: |
          (
            rate(node_network_receive_bytes_total{device!~"lo|docker.*|veth.*|br-.*"}[5m]) +
            rate(node_network_transmit_bytes_total{device!~"lo|docker.*|veth.*|br-.*"}[5m])
          ) / node_network_speed_bytes{device!~"lo|docker.*|veth.*|br-.*"} > 0.8
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Network interface saturated ({{ $labels.instance }})"
          description: "Network interface {{ $labels.device }} is > 80% saturated."

      # ============================================
      # System
      # ============================================
      - alert: HostClockSkew
        expr: |
          (
            node_timex_offset_seconds > 0.05
            or node_timex_offset_seconds < -0.05
          )
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Clock skew detected ({{ $labels.instance }})"
          description: "Clock skew is > 50ms. NTP may not be working.\n Offset: {{ $value | printf \"%.3f\" }}s"

      - alert: HostClockNotSynchronising
        expr: min_over_time(node_timex_sync_status[5m]) == 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Clock not synchronising ({{ $labels.instance }})"
          description: "Clock is not synchronising. Verify NTP configuration."

      - alert: HostRequiresReboot
        expr: node_reboot_required > 0
        for: 4h
        labels:
          severity: info
        annotations:
          summary: "Host requires reboot ({{ $labels.instance }})"
          description: "Pending package updates (e.g. a new kernel) require a reboot."

      - alert: HostSystemdServiceCrashed
        expr: node_systemd_unit_state{state="failed"} == 1
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: "Systemd service crashed ({{ $labels.instance }})"
          description: "Systemd service {{ $labels.name }} has crashed/failed."

      # ============================================
      # Entropy
      # ============================================
      - alert: HostLowEntropy
        expr: node_entropy_available_bits < 200
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Low entropy ({{ $labels.instance }})"
          description: "Available entropy is low ({{ $value }} bits). Crypto operations may block."

      # ============================================
      # File Descriptors
      # ============================================
      - alert: HostFileDescriptorsExhausted
        expr: node_filefd_allocated / node_filefd_maximum * 100 > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "File descriptors > 80% ({{ $labels.instance }})"
          description: "File descriptor usage is high ({{ $value | printf \"%.1f\" }}%)."

      # ============================================
      # Conntrack
      # ============================================
      - alert: HostConntrackLimit
        expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit * 100 > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Conntrack limit > 80% ({{ $labels.instance }})"
          description: "Conntrack table is {{ $value | printf \"%.1f\" }}% full. Connections may be dropped."
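
These rules can be unit-tested with promtool test rules before they are deployed. A minimal sketch of a test file for the NodeDown alert, assuming the rule file above is saved as node-exporter.yml next to this (hypothetical) test file:

# node-exporter_test.yml - run with: promtool test rules node-exporter_test.yml
rule_files:
  - node-exporter.yml

evaluation_interval: 1m

tests:
  - interval: 1m
    input_series:
      # a node-exporter target that stops responding after three scrapes
      - series: 'up{job="node-exporter", instance="web-01:9100"}'
        values: '1 1 1 0 0 0 0 0'
    alert_rule_test:
      # at t=7m the target has been down for 4m (longer than the 2m "for" clause), so NodeDown fires
      - eval_time: 7m
        alertname: NodeDown
        exp_alerts:
          - exp_labels:
              severity: critical
              job: node-exporter
              instance: "web-01:9100"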

@@ -0,0 +1,30 @@
# Node Exporter Targets
# File-based service discovery for node-exporter
# Prometheus watches this file and reloads targets automatically
#
# Add your hosts below:
#   - Each target should be hostname:port (default port 9100)
#   - Labels are applied to all metrics from that target
#   - Changes are picked up automatically (no Prometheus restart needed)
#
# Example configuration:
#   - targets:
#       - 'web-server-01:9100'
#       - 'web-server-02:9100'
#     labels:
#       role: 'web'
#       datacenter: 'us-east-1'
#
#   - targets:
#       - 'db-primary:9100'
#       - 'db-replica-01:9100'
#     labels:
#       role: 'database'
#       datacenter: 'us-east-1'

# Your targets:
- targets:
    - 'localhost:9100'
  labels:
    role: 'prometheus'
    env: 'production'