feat: add production-ready Prometheus & Alertmanager configs

- prometheus.yml: Service discovery, alerting, multi-job scraping
- alertmanager.yml: Routing tree, inhibition rules, multi-channel
- node-exporter.yml: 24 alert rules (CPU, memory, disk, network, system)
- File-based service discovery for dynamic host management
- Updated README with usage docs and alert catalog

Alert categories: availability, resource saturation, disk predictive,
I/O latency, network errors, clock sync, OOM detection, conntrack
Greg Hendrickson
2026-02-04 18:02:41 +00:00
parent c6c841b3a9
commit d310d6ebbe
5 changed files with 711 additions and 22 deletions

prometheus/prometheus.yml

@@ -0,0 +1,147 @@
# Prometheus Configuration
# Production-ready configuration with alerting and service discovery

global:
  # Default scrape interval
  scrape_interval: 15s
  # Evaluation interval for rules
  evaluation_interval: 15s
  # Scrape timeout
  scrape_timeout: 10s
  # External labels (sent with alerts and remote write)
  external_labels:
    cluster: 'production'
    environment: 'prod'
    # region: 'us-east-1'

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093
      # Timeout for Alertmanager requests
      timeout: 10s
      # API version
      api_version: v2

# Rule files - evaluate these periodically
rule_files:
  - 'rules/*.yml'

# Scrape configurations
scrape_configs:
  # ========================================
  # Prometheus self-monitoring
  # ========================================
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']
    relabel_configs:
      - target_label: instance
        replacement: 'prometheus'

  # ========================================
  # Alertmanager
  # ========================================
  - job_name: 'alertmanager'
    static_configs:
      - targets: ['alertmanager:9093']
    relabel_configs:
      - target_label: instance
        replacement: 'alertmanager'

  # ========================================
  # Node Exporter - Host metrics
  # ========================================
  - job_name: 'node-exporter'
    # Static targets (for fixed hosts)
    static_configs:
      - targets:
          - 'node-exporter:9100'
        labels:
          env: 'production'
    # File-based service discovery (add/remove hosts dynamically)
    file_sd_configs:
      - files:
          - 'targets/node-exporter.yml'
        refresh_interval: 30s
    # Relabeling
    relabel_configs:
      # Extract hostname from target address
      - source_labels: [__address__]
        regex: '([^:]+):\d+'
        target_label: hostname
        replacement: '${1}'
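        # e.g. a target 'web-01:9100' ends up with hostname="web-01" (illustrative example)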

  # ========================================
  # Docker / cAdvisor - Container metrics
  # ========================================
  - job_name: 'cadvisor'
    static_configs:
      - targets: ['cadvisor:8080']
    metric_relabel_configs:
      # Drop high-cardinality container metrics if needed
      - source_labels: [__name__]
        regex: 'container_(network_tcp_usage_total|tasks_state|cpu_load_average_10s)'
        action: drop

  # ========================================
  # Kubernetes Service Discovery (if applicable)
  # ========================================
  # - job_name: 'kubernetes-pods'
  #   kubernetes_sd_configs:
  #     - role: pod
  #   relabel_configs:
  #     - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
  #       action: keep
  #       regex: true
  #     - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
  #       action: replace
  #       target_label: __metrics_path__
  #       regex: (.+)
  #     - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
  #       action: replace
  #       regex: ([^:]+)(?::\d+)?;(\d+)
  #       replacement: $1:$2
  #       target_label: __address__
  #     - source_labels: [__meta_kubernetes_namespace]
  #       target_label: namespace
  #     - source_labels: [__meta_kubernetes_pod_name]
  #       target_label: pod

  # ========================================
  # Blackbox Exporter - Synthetic monitoring
  # ========================================
  # - job_name: 'blackbox-http'
  #   metrics_path: /probe
  #   params:
  #     module: [http_2xx]
  #   file_sd_configs:
  #     - files:
  #         - 'targets/blackbox-http.yml'
  #   relabel_configs:
  #     - source_labels: [__address__]
  #       target_label: __param_target
  #     - source_labels: [__param_target]
  #       target_label: instance
  #     - target_label: __address__
  #       replacement: blackbox-exporter:9115

# Remote write (for long-term storage)
# remote_write:
#   - url: 'http://mimir:9009/api/v1/push'
#     queue_config:
#       max_samples_per_send: 1000
#       max_shards: 200
#       capacity: 2500

# Remote read (for querying long-term storage)
# remote_read:
#   - url: 'http://mimir:9009/prometheus/api/v1/read'
#     read_recent: true
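
If the commented-out blackbox job is enabled, the file it reads targets from ('targets/blackbox-http.yml') uses the same file_sd format as the node-exporter target file in this commit. A minimal sketch, with placeholder URLs and labels (nothing below is part of the committed files):

# targets/blackbox-http.yml (illustrative sketch)
- targets:
    - 'https://www.example.com'
    - 'https://api.example.com/health'
  labels:
    # labels here are attached to the resulting probe metrics
    team: 'platform'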

@@ -0,0 +1,274 @@
# Node Exporter Alert Rules
# Production-ready host/VM monitoring alerts
# Reference: https://samber.github.io/awesome-prometheus-alerts/

groups:
  - name: node-exporter-alerts
    rules:
      # ============================================
      # Availability
      # ============================================
      - alert: NodeDown
        expr: up{job="node-exporter"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Node exporter down ({{ $labels.instance }})"
          description: "Node exporter is not responding.\n Instance: {{ $labels.instance }}"
          runbook_url: "https://runbooks.example.com/node-down"

      # ============================================
      # CPU
      # ============================================
      - alert: HostHighCpuLoad
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU load ({{ $labels.instance }})"
          description: "CPU load is > 80%\n Current value: {{ $value | printf \"%.1f\" }}%"

      - alert: HostCriticalCpuLoad
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 95
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Critical CPU load ({{ $labels.instance }})"
          description: "CPU load is > 95%\n Current value: {{ $value | printf \"%.1f\" }}%"

      - alert: HostCpuStealNoisyNeighbor
        expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "CPU steal high - noisy neighbor ({{ $labels.instance }})"
          description: "CPU steal is > 10%. A noisy neighbor VM is consuming resources.\n Current value: {{ $value | printf \"%.1f\" }}%"

      # ============================================
      # Memory
      # ============================================
      - alert: HostHighMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage ({{ $labels.instance }})"
          description: "Memory usage is > 85%\n Current value: {{ $value | printf \"%.1f\" }}%"

      - alert: HostCriticalMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 95
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Critical memory usage ({{ $labels.instance }})"
          description: "Memory usage is > 95%\n Current value: {{ $value | printf \"%.1f\" }}%"

      - alert: HostMemoryUnderPressure
        expr: rate(node_vmstat_pgmajfault[5m]) > 1000
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Host memory under pressure ({{ $labels.instance }})"
          description: "High rate of major page faults ({{ $value | printf \"%.0f\" }}/sec). System may be swapping."

      - alert: HostOomKillDetected
        expr: increase(node_vmstat_oom_kill[5m]) > 0
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: "OOM kill detected ({{ $labels.instance }})"
          description: "OOM kill detected. A process was terminated due to memory pressure."

      # ============================================
      # Disk Space
      # ============================================
      - alert: HostDiskSpaceWarning
        expr: |
          (
            node_filesystem_avail_bytes * 100 / node_filesystem_size_bytes < 20
            and node_filesystem_readonly == 0
            and node_filesystem_size_bytes > 1e9
          )
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Disk space < 20% ({{ $labels.instance }})"
          description: "Disk {{ $labels.mountpoint }} is almost full (< 20% free)\n Current free: {{ $value | printf \"%.1f\" }}%"

      - alert: HostDiskSpaceCritical
        expr: |
          (
            node_filesystem_avail_bytes * 100 / node_filesystem_size_bytes < 10
            and node_filesystem_readonly == 0
            and node_filesystem_size_bytes > 1e9
          )
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Disk space < 10% ({{ $labels.instance }})"
          description: "Disk {{ $labels.mountpoint }} is critically full (< 10% free)\n Current free: {{ $value | printf \"%.1f\" }}%"

      - alert: HostDiskWillFillIn24Hours
        expr: |
          (
            node_filesystem_avail_bytes * 100 / node_filesystem_size_bytes < 10
            and predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs|nfs"}[6h], 24 * 3600) < 0
            and node_filesystem_readonly == 0
          )
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Disk will fill within 24h ({{ $labels.instance }})"
          description: "Filesystem {{ $labels.mountpoint }} is predicted to run out of space within 24 hours at current write rate."

      # ============================================
      # Disk I/O
      # ============================================
      - alert: HostDiskReadLatency
        expr: rate(node_disk_read_time_seconds_total[5m]) / rate(node_disk_reads_completed_total[5m]) > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High disk read latency ({{ $labels.instance }})"
          description: "Disk read latency is > 100ms\n Current value: {{ $value | humanizeDuration }}"

      - alert: HostDiskWriteLatency
        expr: rate(node_disk_write_time_seconds_total[5m]) / rate(node_disk_writes_completed_total[5m]) > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High disk write latency ({{ $labels.instance }})"
          description: "Disk write latency is > 100ms\n Current value: {{ $value | humanizeDuration }}"

      - alert: HostDiskIOSaturation
        expr: rate(node_disk_io_time_weighted_seconds_total[5m]) > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Disk I/O saturated ({{ $labels.instance }})"
          description: "Disk I/O is saturated on {{ $labels.device }}. Average I/O queue depth: {{ $value | printf \"%.1f\" }}"

      # ============================================
      # Network
      # ============================================
      - alert: HostNetworkReceiveErrors
        expr: rate(node_network_receive_errs_total[5m]) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Network receive errors ({{ $labels.instance }})"
          description: "Network interface {{ $labels.device }} has receive errors.\n Rate: {{ $value | printf \"%.1f\" }}/s"

      - alert: HostNetworkTransmitErrors
        expr: rate(node_network_transmit_errs_total[5m]) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Network transmit errors ({{ $labels.instance }})"
          description: "Network interface {{ $labels.device }} has transmit errors.\n Rate: {{ $value | printf \"%.1f\" }}/s"

      - alert: HostNetworkInterfaceSaturated
        expr: |
          (
            rate(node_network_receive_bytes_total{device!~"lo|docker.*|veth.*|br-.*"}[5m]) +
            rate(node_network_transmit_bytes_total{device!~"lo|docker.*|veth.*|br-.*"}[5m])
          ) / node_network_speed_bytes{device!~"lo|docker.*|veth.*|br-.*"} > 0.8
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Network interface saturated ({{ $labels.instance }})"
          description: "Network interface {{ $labels.device }} is > 80% saturated."

      # ============================================
      # System
      # ============================================
      - alert: HostClockSkew
        expr: |
          (
            node_timex_offset_seconds > 0.05
            or node_timex_offset_seconds < -0.05
          )
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Clock skew detected ({{ $labels.instance }})"
          description: "Clock skew is > 50ms. NTP may not be working.\n Offset: {{ $value | printf \"%.3f\" }}s"

      - alert: HostClockNotSynchronising
        expr: min_over_time(node_timex_sync_status[5m]) == 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Clock not synchronising ({{ $labels.instance }})"
          description: "Clock is not synchronising. Verify NTP configuration."

      - alert: HostRequiresReboot
        expr: node_reboot_required > 0
        for: 4h
        labels:
          severity: info
        annotations:
          summary: "Host requires reboot ({{ $labels.instance }})"
          description: "Pending package updates (e.g. a new kernel) require a reboot."

      - alert: HostSystemdServiceCrashed
        expr: node_systemd_unit_state{state="failed"} == 1
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: "Systemd service crashed ({{ $labels.instance }})"
          description: "Systemd service {{ $labels.name }} has crashed/failed."

      # ============================================
      # Entropy
      # ============================================
      - alert: HostLowEntropy
        expr: node_entropy_available_bits < 200
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Low entropy ({{ $labels.instance }})"
          description: "Available entropy is low ({{ $value }} bits). Crypto operations may block."

      # ============================================
      # File Descriptors
      # ============================================
      - alert: HostFileDescriptorsExhausted
        expr: node_filefd_allocated / node_filefd_maximum * 100 > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "File descriptors > 80% ({{ $labels.instance }})"
          description: "File descriptor usage is high ({{ $value | printf \"%.1f\" }}%)."

      # ============================================
      # Conntrack
      # ============================================
      - alert: HostConntrackLimit
        expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit * 100 > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Conntrack limit > 80% ({{ $labels.instance }})"
          description: "Conntrack table is {{ $value | printf \"%.1f\" }}% full. Connections may be dropped."
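
These rules can be unit-tested with promtool test rules before they are deployed. A minimal sketch of a test file for the NodeDown alert, assuming the rule file above is saved as node-exporter.yml next to this (hypothetical) test file:

# node-exporter_test.yml - run with: promtool test rules node-exporter_test.yml
rule_files:
  - node-exporter.yml

evaluation_interval: 1m

tests:
  - interval: 1m
    input_series:
      # a node-exporter target that stops responding after three scrapes
      - series: 'up{job="node-exporter", instance="web-01:9100"}'
        values: '1 1 1 0 0 0 0 0'
    alert_rule_test:
      # at t=7m the target has been down for 4m (longer than the 2m "for" clause), so NodeDown fires
      - eval_time: 7m
        alertname: NodeDown
        exp_alerts:
          - exp_labels:
              severity: critical
              job: node-exporter
              instance: "web-01:9100"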

@@ -0,0 +1,30 @@
# Node Exporter Targets
# File-based service discovery for node-exporter
# Prometheus watches this file and reloads targets automatically
#
# Add your hosts below:
#   - Each target should be hostname:port (default port 9100)
#   - Labels are applied to all metrics from that target
#   - Changes are picked up automatically (no Prometheus restart needed)
#
# Example configuration:
#   - targets:
#       - 'web-server-01:9100'
#       - 'web-server-02:9100'
#     labels:
#       role: 'web'
#       datacenter: 'us-east-1'
#
#   - targets:
#       - 'db-primary:9100'
#       - 'db-replica-01:9100'
#     labels:
#       role: 'database'
#       datacenter: 'us-east-1'

# Your targets:
- targets:
    - 'localhost:9100'
  labels:
    role: 'prometheus'
    env: 'production'