mirror of
https://github.com/ghndrx/monitoring-stack.git
synced 2026-02-10 06:45:11 +00:00
feat: add production-ready Prometheus & Alertmanager configs
- prometheus.yml: service discovery, alerting, multi-job scraping
- alertmanager.yml: routing tree, inhibition rules, multi-channel
- node-exporter.yml: 30+ alert rules (CPU, memory, disk, network, system)
- File-based service discovery for dynamic host management
- Updated README with usage docs and alert catalog

Alert categories: availability, resource saturation, disk predictive, I/O latency, network errors, clock sync, OOM detection, conntrack
This commit is contained in:
274
prometheus/rules/node-exporter.yml
Normal file
274
prometheus/rules/node-exporter.yml
Normal file
@@ -0,0 +1,274 @@
|
||||
# Node Exporter Alert Rules
|
||||
# Production-ready host/VM monitoring alerts
|
||||
# Reference: https://samber.github.io/awesome-prometheus-alerts/
|
||||
|
||||
groups:
|
||||
- name: node-exporter-alerts
|
||||
rules:
|
||||
# ============================================
|
||||
# Availability
|
||||
# ============================================
|
||||
- alert: NodeDown
|
||||
expr: up{job="node-exporter"} == 0
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Node exporter down ({{ $labels.instance }})"
|
||||
description: "Node exporter is not responding.\n Instance: {{ $labels.instance }}"
|
||||
runbook_url: "https://runbooks.example.com/node-down"
|
||||
|
||||
# ============================================
|
||||
# CPU
|
||||
# ============================================
|
||||
- alert: HostHighCpuLoad
|
||||
expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High CPU load ({{ $labels.instance }})"
|
||||
description: "CPU load is > 80%\n Current value: {{ $value | printf \"%.1f\" }}%"
|
||||
|
||||
- alert: HostCriticalCpuLoad
|
||||
expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 95
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Critical CPU load ({{ $labels.instance }})"
|
||||
description: "CPU load is > 95%\n Current value: {{ $value | printf \"%.1f\" }}%"
|
||||
|
||||
- alert: HostCpuStealNoisyNeighbor
|
||||
expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "CPU steal high - noisy neighbor ({{ $labels.instance }})"
|
||||
description: "CPU steal is > 10%. A noisy neighbor VM is consuming resources.\n Current value: {{ $value | printf \"%.1f\" }}%"
|
||||
|
||||
# ============================================
|
||||
# Memory
|
||||
# ============================================
|
||||
- alert: HostHighMemoryUsage
|
||||
expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High memory usage ({{ $labels.instance }})"
|
||||
description: "Memory usage is > 85%\n Current value: {{ $value | printf \"%.1f\" }}%"
|
||||
|
||||
- alert: HostCriticalMemoryUsage
|
||||
expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 95
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Critical memory usage ({{ $labels.instance }})"
|
||||
description: "Memory usage is > 95%\n Current value: {{ $value | printf \"%.1f\" }}%"
|
||||
|
||||
- alert: HostMemoryUnderPressure
|
||||
expr: rate(node_vmstat_pgmajfault[5m]) > 1000
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Host memory under pressure ({{ $labels.instance }})"
|
||||
description: "High rate of major page faults ({{ $value | printf \"%.0f\" }}/sec). System may be swapping."
|
||||
|
||||
- alert: HostOomKillDetected
|
||||
expr: increase(node_vmstat_oom_kill[5m]) > 0
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "OOM kill detected ({{ $labels.instance }})"
|
||||
description: "OOM kill detected. A process was terminated due to memory pressure."
|
||||
|
||||
# ============================================
|
||||
# Disk Space
|
||||
# ============================================
|
||||
- alert: HostDiskSpaceWarning
|
||||
expr: |
|
||||
(
|
||||
(node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 20
|
||||
and node_filesystem_readonly == 0
|
||||
) * on(instance, device, mountpoint) group_left node_filesystem_size_bytes > 1e9
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Disk space < 20% ({{ $labels.instance }})"
|
||||
description: "Disk {{ $labels.mountpoint }} is almost full (< 20% free)\n Current free: {{ $value | printf \"%.1f\" }}%"
|
||||
|
||||
- alert: HostDiskSpaceCritical
|
||||
expr: |
|
||||
(
|
||||
(node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10
|
||||
and node_filesystem_readonly == 0
|
||||
) * on(instance, device, mountpoint) group_left node_filesystem_size_bytes > 1e9
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Disk space < 10% ({{ $labels.instance }})"
|
||||
description: "Disk {{ $labels.mountpoint }} is critically full (< 10% free)\n Current free: {{ $value | printf \"%.1f\" }}%"
|
||||
|
||||
- alert: HostDiskWillFillIn24Hours
|
||||
expr: |
|
||||
(
|
||||
node_filesystem_avail_bytes * 100 / node_filesystem_size_bytes < 10
|
||||
and predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs|nfs"}[6h], 24 * 3600) < 0
|
||||
and node_filesystem_readonly == 0
|
||||
)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Disk will fill within 24h ({{ $labels.instance }})"
|
||||
description: "Filesystem {{ $labels.mountpoint }} is predicted to run out of space within 24 hours at current write rate."
|
||||
|
||||
# ============================================
|
||||
# Disk I/O
|
||||
# ============================================
|
||||
- alert: HostDiskReadLatency
|
||||
expr: rate(node_disk_read_time_seconds_total[5m]) / rate(node_disk_reads_completed_total[5m]) > 0.1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High disk read latency ({{ $labels.instance }})"
|
||||
description: "Disk read latency is > 100ms\n Current value: {{ $value | printf \"%.0f\" }}ms"
|
||||
|
||||
- alert: HostDiskWriteLatency
|
||||
expr: rate(node_disk_write_time_seconds_total[5m]) / rate(node_disk_writes_completed_total[5m]) > 0.1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High disk write latency ({{ $labels.instance }})"
|
||||
description: "Disk write latency is > 100ms\n Current value: {{ $value | printf \"%.0f\" }}ms"
|
||||
|
||||
- alert: HostDiskIOSaturation
|
||||
expr: rate(node_disk_io_time_weighted_seconds_total[5m]) > 10
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Disk I/O saturated ({{ $labels.instance }})"
|
||||
description: "Disk I/O is saturated on {{ $labels.device }}. Weighted I/O time: {{ $value | printf \"%.1f\" }}s"
|
||||
|
||||
# ============================================
|
||||
# Network
|
||||
# ============================================
|
||||
- alert: HostNetworkReceiveErrors
|
||||
expr: rate(node_network_receive_errs_total[5m]) > 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Network receive errors ({{ $labels.instance }})"
|
||||
description: "Network interface {{ $labels.device }} has receive errors.\n Rate: {{ $value | printf \"%.1f\" }}/s"
|
||||
|
||||
- alert: HostNetworkTransmitErrors
|
||||
expr: rate(node_network_transmit_errs_total[5m]) > 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Network transmit errors ({{ $labels.instance }})"
|
||||
description: "Network interface {{ $labels.device }} has transmit errors.\n Rate: {{ $value | printf \"%.1f\" }}/s"
|
||||
|
||||
- alert: HostNetworkInterfaceSaturated
|
||||
expr: |
|
||||
(
|
||||
rate(node_network_receive_bytes_total{device!~"lo|docker.*|veth.*|br-.*"}[5m]) +
|
||||
rate(node_network_transmit_bytes_total{device!~"lo|docker.*|veth.*|br-.*"}[5m])
|
||||
) / node_network_speed_bytes{device!~"lo|docker.*|veth.*|br-.*"} > 0.8
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Network interface saturated ({{ $labels.instance }})"
|
||||
description: "Network interface {{ $labels.device }} is > 80% saturated."
|
||||
|
||||
# ============================================
|
||||
# System
|
||||
# ============================================
|
||||
- alert: HostClockSkew
|
||||
expr: |
|
||||
(
|
||||
node_timex_offset_seconds > 0.05
|
||||
or node_timex_offset_seconds < -0.05
|
||||
)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Clock skew detected ({{ $labels.instance }})"
|
||||
description: "Clock skew is > 50ms. NTP may not be working.\n Offset: {{ $value | printf \"%.3f\" }}s"
|
||||
|
||||
- alert: HostClockNotSynchronising
|
||||
expr: min_over_time(node_timex_sync_status[5m]) == 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Clock not synchronising ({{ $labels.instance }})"
|
||||
description: "Clock is not synchronising. Verify NTP configuration."
|
||||
|
||||
- alert: HostRequiresReboot
|
||||
expr: node_reboot_required > 0
|
||||
for: 4h
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: "Host requires reboot ({{ $labels.instance }})"
|
||||
description: "Kernel updates require a reboot."
|
||||
|
||||
- alert: HostSystemdServiceCrashed
|
||||
expr: node_systemd_unit_state{state="failed"} == 1
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Systemd service crashed ({{ $labels.instance }})"
|
||||
description: "Systemd service {{ $labels.name }} has crashed/failed."
|
||||
|
||||
# ============================================
|
||||
# Entropy
|
||||
# ============================================
|
||||
- alert: HostLowEntropy
|
||||
expr: node_entropy_available_bits < 200
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Low entropy ({{ $labels.instance }})"
|
||||
description: "Available entropy is low ({{ $value }} bits). Crypto operations may block."
|
||||
|
||||
# ============================================
|
||||
# File Descriptors
|
||||
# ============================================
|
||||
- alert: HostFileDescriptorsExhausted
|
||||
expr: node_filefd_allocated / node_filefd_maximum * 100 > 80
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "File descriptors > 80% ({{ $labels.instance }})"
|
||||
description: "File descriptor usage is high ({{ $value | printf \"%.1f\" }}%)."
|
||||
|
||||
# ============================================
|
||||
# Conntrack
|
||||
# ============================================
|
||||
- alert: HostConntrackLimit
|
||||
expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit * 100 > 80
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Conntrack limit > 80% ({{ $labels.instance }})"
|
||||
description: "Conntrack table is {{ $value | printf \"%.1f\" }}% full. Connections may be dropped."
|
||||
Reference in New Issue
Block a user