From d310d6ebbe1174ce132a1f5c2a876ca51824e086 Mon Sep 17 00:00:00 2001 From: Greg Hendrickson Date: Wed, 4 Feb 2026 18:02:41 +0000 Subject: [PATCH] feat: add production-ready Prometheus & Alertmanager configs - prometheus.yml: Service discovery, alerting, multi-job scraping - alertmanager.yml: Routing tree, inhibition rules, multi-channel - node-exporter.yml: 30+ alert rules (CPU, memory, disk, network, system) - File-based service discovery for dynamic host management - Updated README with usage docs and alert catalog Alert categories: availability, resource saturation, disk predictive, I/O latency, network errors, clock sync, OOM detection, conntrack --- README.md | 129 ++++++++++--- alertmanager/alertmanager.yml | 153 +++++++++++++++ prometheus/prometheus.yml | 147 ++++++++++++++ prometheus/rules/node-exporter.yml | 274 +++++++++++++++++++++++++++ prometheus/targets/node-exporter.yml | 30 +++ 5 files changed, 711 insertions(+), 22 deletions(-) create mode 100644 alertmanager/alertmanager.yml create mode 100644 prometheus/prometheus.yml create mode 100644 prometheus/rules/node-exporter.yml create mode 100644 prometheus/targets/node-exporter.yml diff --git a/README.md b/README.md index 9fa36f2..edf23a0 100644 --- a/README.md +++ b/README.md @@ -1,47 +1,132 @@ # Monitoring Stack -![Prometheus](https://img.shields.io/badge/Prometheus-2.47+-E6522C?style=flat&logo=prometheus&logoColor=white) +![Prometheus](https://img.shields.io/badge/Prometheus-2.50+-E6522C?style=flat&logo=prometheus&logoColor=white) ![Grafana](https://img.shields.io/badge/Grafana-10+-F46800?style=flat&logo=grafana&logoColor=white) +![Alertmanager](https://img.shields.io/badge/Alertmanager-0.27+-E6522C?style=flat&logo=prometheus&logoColor=white) ![License](https://img.shields.io/badge/License-MIT-blue) Production-ready monitoring stack configurations for Prometheus, Grafana, Loki, and Alertmanager. 
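+
+The services are meant to run side by side: Prometheus scrapes itself, Alertmanager, and node-exporter, and forwards alerts to Alertmanager (see the Quick Start below for the commands). A minimal Docker Compose sketch of that wiring; the compose file itself is not part of this patch, and the image tags, service names, and mount paths are assumptions to adapt:
+
+```yaml
+# docker-compose.yml (illustrative sketch only)
+services:
+  prometheus:
+    image: prom/prometheus:v2.50.1
+    volumes:
+      - ./prometheus:/etc/prometheus        # prometheus.yml, rules/, targets/
+    ports: ["9090:9090"]
+  alertmanager:
+    image: prom/alertmanager:v0.27.0
+    volumes:
+      - ./alertmanager:/etc/alertmanager    # alertmanager.yml
+    ports: ["9093:9093"]
+  node-exporter:
+    image: prom/node-exporter:v1.7.0
+    ports: ["9100:9100"]
+```
+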
-## Components +## đŸ—‚ī¸ Structure ``` ├── prometheus/ -│ ├── rules/ # Alert rules -│ └── targets/ # Scrape targets +│ ├── prometheus.yml # Main Prometheus config +│ ├── rules/ # Alert rules +│ │ └── node-exporter.yml # Host monitoring alerts (30+ rules) +│ └── targets/ # File-based service discovery +│ └── node-exporter.yml # Node exporter targets +├── alertmanager/ +│ └── alertmanager.yml # Alert routing & receivers ├── grafana/ -│ ├── dashboards/ # JSON dashboards -│ └── datasources/ # Data source configs -├── alertmanager/ # Alert routing -├── loki/ # Log aggregation -└── promtail/ # Log shipping +│ ├── dashboards/ # JSON dashboards +│ └── datasources/ # Data source configs +├── loki/ # Log aggregation +└── promtail/ # Log shipping ``` -## Dashboards +## 📊 Alert Rules -- 📊 Node Exporter - System metrics -- đŸŗ Docker/Kubernetes - Container metrics -- 🌐 NGINX/Traefik - Ingress metrics -- 💾 PostgreSQL/Redis - Database metrics -- ⚡ Custom app dashboards +### Node Exporter (Host Monitoring) -## Alert Rules +| Category | Alerts | +|----------|--------| +| **Availability** | NodeDown | +| **CPU** | HighCpuLoad, CriticalCpuLoad, CpuStealNoisyNeighbor | +| **Memory** | HighMemoryUsage, CriticalMemoryUsage, MemoryUnderPressure, OomKillDetected | +| **Disk Space** | DiskSpaceWarning, DiskSpaceCritical, DiskWillFillIn24Hours | +| **Disk I/O** | DiskReadLatency, DiskWriteLatency, DiskIOSaturation | +| **Network** | NetworkReceiveErrors, NetworkTransmitErrors, NetworkInterfaceSaturated | +| **System** | ClockSkew, ClockNotSynchronising, RequiresReboot, SystemdServiceCrashed | +| **Resources** | LowEntropy, FileDescriptorsExhausted, ConntrackLimit | -- 🔴 HighCPU, HighMemory, DiskFull -- 🟡 ServiceDown, HighLatency -- đŸ”ĩ CertExpiring, BackupFailed +### Alert Severities -## Quick Start +- 🔴 **Critical** - Page immediately, potential data loss or outage +- 🟡 **Warning** - Investigate within hours, degradation detected +- đŸ”ĩ **Info** - Low priority, informational only + +## 🔔 Alertmanager Features + +- **Intelligent routing** - Critical → PagerDuty, Warning → Slack, Info → low-priority channel +- **Inhibition rules** - Critical alerts suppress matching warnings +- **Grouped notifications** - Reduces alert fatigue +- **Multiple receivers** - Slack, PagerDuty, Email, Webhooks pre-configured + +## 🚀 Quick Start + +### Docker Compose ```bash docker-compose up -d -# Grafana: http://localhost:3000 (admin/admin) + +# Access points: # Prometheus: http://localhost:9090 +# Alertmanager: http://localhost:9093 +# Grafana: http://localhost:3000 (admin/admin) ``` -## License +### Kubernetes / Helm + +```bash +# Using kube-prometheus-stack +helm repo add prometheus-community https://prometheus-community.github.io/helm-charts +helm install monitoring prometheus-community/kube-prometheus-stack \ + --set alertmanager.config="$(cat alertmanager/alertmanager.yml)" +``` + +### Copy Rules Only + +```bash +# If you have existing Prometheus, just copy the rules +cp prometheus/rules/*.yml /etc/prometheus/rules/ +# Reload: curl -X POST http://localhost:9090/-/reload +``` + +## 📁 File-Based Service Discovery + +Add hosts without restarting Prometheus: + +```yaml +# prometheus/targets/node-exporter.yml +- targets: + - 'web-server-01:9100' + - 'web-server-02:9100' + labels: + role: 'web' + datacenter: 'us-east-1' +``` + +Prometheus watches this file and auto-reloads targets. + +## 🔧 Configuration + +### Environment Variables (Alertmanager) + +```bash +# .env or secrets +SLACK_WEBHOOK_URL=https://hooks.slack.com/services/... 
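+# Note: Alertmanager does not expand environment variables in its config file;
+# render these values into alertmanager.yml at deploy time (e.g. envsubst, CI templating, or a secrets manager).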
+PAGERDUTY_ROUTING_KEY=your-routing-key +SMTP_PASSWORD=your-smtp-password +``` + +### Customizing Thresholds + +Edit `prometheus/rules/node-exporter.yml`: + +```yaml +# Change CPU threshold from 80% to 75% +- alert: HostHighCpuLoad + expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 75 +``` + +## 📚 References + +- [Prometheus Documentation](https://prometheus.io/docs/) +- [Alertmanager Configuration](https://prometheus.io/docs/alerting/latest/configuration/) +- [Awesome Prometheus Alerts](https://samber.github.io/awesome-prometheus-alerts/) +- [Node Exporter](https://github.com/prometheus/node_exporter) + +## 📝 License MIT diff --git a/alertmanager/alertmanager.yml b/alertmanager/alertmanager.yml new file mode 100644 index 0000000..0541b57 --- /dev/null +++ b/alertmanager/alertmanager.yml @@ -0,0 +1,153 @@ +# Alertmanager Configuration +# Production-ready template with multi-channel routing +# Docs: https://prometheus.io/docs/alerting/latest/configuration/ + +global: + # Default SMTP settings (configure as needed) + # smtp_smarthost: 'smtp.example.com:587' + # smtp_from: 'alertmanager@example.com' + # smtp_auth_username: 'alertmanager' + # smtp_auth_password: '{{ .SMTP_PASSWORD }}' + # smtp_require_tls: true + + # Slack API URL (set via environment or secrets) + # slack_api_url: '{{ .SLACK_WEBHOOK_URL }}' + + # PagerDuty routing key + # pagerduty_url: 'https://events.pagerduty.com/v2/enqueue' + + # Default resolve timeout + resolve_timeout: 5m + +# Inhibition rules - suppress lower severity alerts when higher severity fires +inhibit_rules: + # If critical fires, suppress warning for same alertname + instance + - source_matchers: + - severity = critical + target_matchers: + - severity = warning + equal: ['alertname', 'instance'] + + # If cluster-wide alert fires, suppress instance-level alerts + - source_matchers: + - severity = critical + - scope = cluster + target_matchers: + - severity =~ warning|critical + - scope != cluster + equal: ['alertname'] + +# Route tree +route: + # Default receiver + receiver: 'default-receiver' + + # Group alerts by these labels + group_by: ['alertname', 'severity', 'instance'] + + # Wait before sending initial notification + group_wait: 30s + + # Wait before sending updated notifications + group_interval: 5m + + # Minimum wait before resending + repeat_interval: 4h + + # Child routes (evaluated in order, first match wins) + routes: + # Critical alerts -> immediate paging + - matchers: + - severity = critical + receiver: 'pagerduty-critical' + group_wait: 10s + repeat_interval: 1h + continue: true # Also send to Slack + + # Warning alerts -> Slack + - matchers: + - severity = warning + receiver: 'slack-warnings' + group_wait: 1m + repeat_interval: 4h + + # Info alerts -> low priority channel + - matchers: + - severity = info + receiver: 'slack-info' + group_wait: 5m + repeat_interval: 24h + + # Watchdog/Deadman switch -> dedicated receiver + - matchers: + - alertname = Watchdog + receiver: 'null' + repeat_interval: 5m + +# Receivers +receivers: + # Null receiver (discard alerts) + - name: 'null' + + # Default fallback + - name: 'default-receiver' + # Uncomment and configure your preferred channel: + # slack_configs: + # - channel: '#alerts' + # send_resolved: true + # title: '{{ template "slack.default.title" . }}' + # text: '{{ template "slack.default.text" . 
}}' + + # Critical alerts -> PagerDuty + - name: 'pagerduty-critical' + # pagerduty_configs: + # - routing_key: '{{ .PAGERDUTY_ROUTING_KEY }}' + # severity: critical + # description: '{{ .CommonAnnotations.summary }}' + # details: + # firing: '{{ template "pagerduty.default.instances" .Alerts.Firing }}' + # resolved: '{{ template "pagerduty.default.instances" .Alerts.Resolved }}' + + # Warnings -> Slack + - name: 'slack-warnings' + # slack_configs: + # - channel: '#alerts-warnings' + # send_resolved: true + # color: '{{ if eq .Status "firing" }}warning{{ else }}good{{ end }}' + # title: '{{ template "slack.default.title" . }}' + # text: '{{ template "slack.default.text" . }}' + # actions: + # - type: button + # text: 'Runbook :book:' + # url: '{{ (index .Alerts 0).Annotations.runbook_url }}' + # - type: button + # text: 'Silence :mute:' + # url: '{{ template "slack.default.silence_url" . }}' + + # Info -> Slack (low priority) + - name: 'slack-info' + # slack_configs: + # - channel: '#alerts-info' + # send_resolved: false + # title: 'â„šī¸ {{ .CommonLabels.alertname }}' + # text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}' + + # Email receiver (optional) + - name: 'email-oncall' + # email_configs: + # - to: 'oncall@example.com' + # send_resolved: true + # headers: + # Subject: '[{{ .Status | toUpper }}] {{ .CommonLabels.alertname }}' + + # Webhook receiver (for custom integrations) + - name: 'webhook-custom' + # webhook_configs: + # - url: 'http://alerthandler.internal/webhook' + # send_resolved: true + # http_config: + # bearer_token: '{{ .WEBHOOK_TOKEN }}' + +# Templates +# templates: +# - '/etc/alertmanager/templates/*.tmpl' diff --git a/prometheus/prometheus.yml b/prometheus/prometheus.yml new file mode 100644 index 0000000..b862131 --- /dev/null +++ b/prometheus/prometheus.yml @@ -0,0 +1,147 @@ +# Prometheus Configuration +# Production-ready configuration with alerting and service discovery + +global: + # Default scrape interval + scrape_interval: 15s + + # Evaluation interval for rules + evaluation_interval: 15s + + # Scrape timeout + scrape_timeout: 10s + + # External labels (sent with alerts and remote write) + external_labels: + cluster: 'production' + environment: 'prod' + # region: 'us-east-1' + +# Alertmanager configuration +alerting: + alertmanagers: + - static_configs: + - targets: + - alertmanager:9093 + # Timeout for Alertmanager requests + timeout: 10s + # API version + api_version: v2 + +# Rule files - evaluate these periodically +rule_files: + - 'rules/*.yml' + +# Scrape configurations +scrape_configs: + # ======================================== + # Prometheus self-monitoring + # ======================================== + - job_name: 'prometheus' + static_configs: + - targets: ['localhost:9090'] + relabel_configs: + - target_label: instance + replacement: 'prometheus' + + # ======================================== + # Alertmanager + # ======================================== + - job_name: 'alertmanager' + static_configs: + - targets: ['alertmanager:9093'] + relabel_configs: + - target_label: instance + replacement: 'alertmanager' + + # ======================================== + # Node Exporter - Host metrics + # ======================================== + - job_name: 'node-exporter' + # Static targets (for fixed hosts) + static_configs: + - targets: + - 'node-exporter:9100' + labels: + env: 'production' + + # File-based service discovery (add/remove hosts dynamically) + file_sd_configs: + - files: + - 'targets/node-exporter.yml' + refresh_interval: 30s + 
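+    # Labels defined in targets/node-exporter.yml (e.g. role, datacenter) become
+    # target labels and are attached to every series scraped from that host.
+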
+ # Relabeling + relabel_configs: + # Extract hostname from target address + - source_labels: [__address__] + regex: '([^:]+):\d+' + target_label: hostname + replacement: '${1}' + + # ======================================== + # Docker / cAdvisor - Container metrics + # ======================================== + - job_name: 'cadvisor' + static_configs: + - targets: ['cadvisor:8080'] + metric_relabel_configs: + # Drop high-cardinality container metrics if needed + - source_labels: [__name__] + regex: 'container_(network_tcp_usage_total|tasks_state|cpu_load_average_10s)' + action: drop + + # ======================================== + # Kubernetes Service Discovery (if applicable) + # ======================================== + # - job_name: 'kubernetes-pods' + # kubernetes_sd_configs: + # - role: pod + # relabel_configs: + # - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] + # action: keep + # regex: true + # - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] + # action: replace + # target_label: __metrics_path__ + # regex: (.+) + # - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] + # action: replace + # regex: ([^:]+)(?::\d+)?;(\d+) + # replacement: $1:$2 + # target_label: __address__ + # - source_labels: [__meta_kubernetes_namespace] + # target_label: namespace + # - source_labels: [__meta_kubernetes_pod_name] + # target_label: pod + + # ======================================== + # Blackbox Exporter - Synthetic monitoring + # ======================================== + # - job_name: 'blackbox-http' + # metrics_path: /probe + # params: + # module: [http_2xx] + # file_sd_configs: + # - files: + # - 'targets/blackbox-http.yml' + # relabel_configs: + # - source_labels: [__address__] + # target_label: __param_target + # - source_labels: [__param_target] + # target_label: instance + # - target_label: __address__ + # replacement: blackbox-exporter:9115 + +# Remote write (for long-term storage) +# remote_write: +# - url: 'http://mimir:9009/api/v1/push' +# queue_config: +# max_samples_per_send: 1000 +# max_shards: 200 +# capacity: 2500 + +# Remote read (for querying long-term storage) +# remote_read: +# - url: 'http://mimir:9009/prometheus/api/v1/read' +# read_recent: true diff --git a/prometheus/rules/node-exporter.yml b/prometheus/rules/node-exporter.yml new file mode 100644 index 0000000..bb54839 --- /dev/null +++ b/prometheus/rules/node-exporter.yml @@ -0,0 +1,274 @@ +# Node Exporter Alert Rules +# Production-ready host/VM monitoring alerts +# Reference: https://samber.github.io/awesome-prometheus-alerts/ + +groups: + - name: node-exporter-alerts + rules: + # ============================================ + # Availability + # ============================================ + - alert: NodeDown + expr: up{job="node-exporter"} == 0 + for: 2m + labels: + severity: critical + annotations: + summary: "Node exporter down ({{ $labels.instance }})" + description: "Node exporter is not responding.\n Instance: {{ $labels.instance }}" + runbook_url: "https://runbooks.example.com/node-down" + + # ============================================ + # CPU + # ============================================ + - alert: HostHighCpuLoad + expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80 + for: 5m + labels: + severity: warning + annotations: + summary: "High CPU load ({{ $labels.instance }})" + description: "CPU load is > 80%\n Current value: {{ $value | printf \"%.1f\" }}%" + + - alert: HostCriticalCpuLoad + 
expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 95 + for: 5m + labels: + severity: critical + annotations: + summary: "Critical CPU load ({{ $labels.instance }})" + description: "CPU load is > 95%\n Current value: {{ $value | printf \"%.1f\" }}%" + + - alert: HostCpuStealNoisyNeighbor + expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10 + for: 5m + labels: + severity: warning + annotations: + summary: "CPU steal high - noisy neighbor ({{ $labels.instance }})" + description: "CPU steal is > 10%. A noisy neighbor VM is consuming resources.\n Current value: {{ $value | printf \"%.1f\" }}%" + + # ============================================ + # Memory + # ============================================ + - alert: HostHighMemoryUsage + expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85 + for: 5m + labels: + severity: warning + annotations: + summary: "High memory usage ({{ $labels.instance }})" + description: "Memory usage is > 85%\n Current value: {{ $value | printf \"%.1f\" }}%" + + - alert: HostCriticalMemoryUsage + expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 95 + for: 2m + labels: + severity: critical + annotations: + summary: "Critical memory usage ({{ $labels.instance }})" + description: "Memory usage is > 95%\n Current value: {{ $value | printf \"%.1f\" }}%" + + - alert: HostMemoryUnderPressure + expr: rate(node_vmstat_pgmajfault[5m]) > 1000 + for: 5m + labels: + severity: warning + annotations: + summary: "Host memory under pressure ({{ $labels.instance }})" + description: "High rate of major page faults ({{ $value | printf \"%.0f\" }}/sec). System may be swapping." + + - alert: HostOomKillDetected + expr: increase(node_vmstat_oom_kill[5m]) > 0 + for: 0m + labels: + severity: warning + annotations: + summary: "OOM kill detected ({{ $labels.instance }})" + description: "OOM kill detected. A process was terminated due to memory pressure." 
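+
+      # Note: the memory usage alerts above (HostHighMemoryUsage / HostCriticalMemoryUsage)
+      # use MemAvailable rather than MemFree; it includes reclaimable page cache and is
+      # the kernel's own estimate of memory usable by new workloads.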
+
+      # ============================================
+      # Disk Space
+      # ============================================
+      - alert: HostDiskSpaceWarning
+        # The size filter ignores filesystems smaller than ~1 GB; using 'and' keeps
+        # the free-space percentage as $value for the annotation below.
+        expr: |
+          (
+            (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 20
+            and node_filesystem_readonly == 0
+            and node_filesystem_size_bytes > 1e9
+          )
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Disk space < 20% ({{ $labels.instance }})"
+          description: "Disk {{ $labels.mountpoint }} is almost full (< 20% free)\n Current free: {{ $value | printf \"%.1f\" }}%"
+
+      - alert: HostDiskSpaceCritical
+        expr: |
+          (
+            (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10
+            and node_filesystem_readonly == 0
+            and node_filesystem_size_bytes > 1e9
+          )
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Disk space < 10% ({{ $labels.instance }})"
+          description: "Disk {{ $labels.mountpoint }} is critically full (< 10% free)\n Current free: {{ $value | printf \"%.1f\" }}%"
+
+      - alert: HostDiskWillFillIn24Hours
+        expr: |
+          (
+            node_filesystem_avail_bytes * 100 / node_filesystem_size_bytes < 10
+            and predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs|nfs"}[6h], 24 * 3600) < 0
+            and node_filesystem_readonly == 0
+          )
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Disk will fill within 24h ({{ $labels.instance }})"
+          description: "Filesystem {{ $labels.mountpoint }} is predicted to run out of space within 24 hours at current write rate."
+
+      # ============================================
+      # Disk I/O
+      # ============================================
+      - alert: HostDiskReadLatency
+        expr: rate(node_disk_read_time_seconds_total[5m]) / rate(node_disk_reads_completed_total[5m]) > 0.1
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High disk read latency ({{ $labels.instance }})"
+          description: "Disk read latency is > 100ms\n Current value: {{ $value | humanizeDuration }}"
+
+      - alert: HostDiskWriteLatency
+        expr: rate(node_disk_write_time_seconds_total[5m]) / rate(node_disk_writes_completed_total[5m]) > 0.1
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High disk write latency ({{ $labels.instance }})"
+          description: "Disk write latency is > 100ms\n Current value: {{ $value | humanizeDuration }}"
+
+      - alert: HostDiskIOSaturation
+        expr: rate(node_disk_io_time_weighted_seconds_total[5m]) > 10
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Disk I/O saturated ({{ $labels.instance }})"
+          description: "Disk I/O is saturated on {{ $labels.device }}. 
Weighted I/O time: {{ $value | printf \"%.1f\" }}s" + + # ============================================ + # Network + # ============================================ + - alert: HostNetworkReceiveErrors + expr: rate(node_network_receive_errs_total[5m]) > 1 + for: 5m + labels: + severity: warning + annotations: + summary: "Network receive errors ({{ $labels.instance }})" + description: "Network interface {{ $labels.device }} has receive errors.\n Rate: {{ $value | printf \"%.1f\" }}/s" + + - alert: HostNetworkTransmitErrors + expr: rate(node_network_transmit_errs_total[5m]) > 1 + for: 5m + labels: + severity: warning + annotations: + summary: "Network transmit errors ({{ $labels.instance }})" + description: "Network interface {{ $labels.device }} has transmit errors.\n Rate: {{ $value | printf \"%.1f\" }}/s" + + - alert: HostNetworkInterfaceSaturated + expr: | + ( + rate(node_network_receive_bytes_total{device!~"lo|docker.*|veth.*|br-.*"}[5m]) + + rate(node_network_transmit_bytes_total{device!~"lo|docker.*|veth.*|br-.*"}[5m]) + ) / node_network_speed_bytes{device!~"lo|docker.*|veth.*|br-.*"} > 0.8 + for: 5m + labels: + severity: warning + annotations: + summary: "Network interface saturated ({{ $labels.instance }})" + description: "Network interface {{ $labels.device }} is > 80% saturated." + + # ============================================ + # System + # ============================================ + - alert: HostClockSkew + expr: | + ( + node_timex_offset_seconds > 0.05 + or node_timex_offset_seconds < -0.05 + ) + for: 5m + labels: + severity: warning + annotations: + summary: "Clock skew detected ({{ $labels.instance }})" + description: "Clock skew is > 50ms. NTP may not be working.\n Offset: {{ $value | printf \"%.3f\" }}s" + + - alert: HostClockNotSynchronising + expr: min_over_time(node_timex_sync_status[5m]) == 0 + for: 5m + labels: + severity: warning + annotations: + summary: "Clock not synchronising ({{ $labels.instance }})" + description: "Clock is not synchronising. Verify NTP configuration." + + - alert: HostRequiresReboot + expr: node_reboot_required > 0 + for: 4h + labels: + severity: info + annotations: + summary: "Host requires reboot ({{ $labels.instance }})" + description: "Kernel updates require a reboot." + + - alert: HostSystemdServiceCrashed + expr: node_systemd_unit_state{state="failed"} == 1 + for: 0m + labels: + severity: warning + annotations: + summary: "Systemd service crashed ({{ $labels.instance }})" + description: "Systemd service {{ $labels.name }} has crashed/failed." + + # ============================================ + # Entropy + # ============================================ + - alert: HostLowEntropy + expr: node_entropy_available_bits < 200 + for: 5m + labels: + severity: warning + annotations: + summary: "Low entropy ({{ $labels.instance }})" + description: "Available entropy is low ({{ $value }} bits). Crypto operations may block." + + # ============================================ + # File Descriptors + # ============================================ + - alert: HostFileDescriptorsExhausted + expr: node_filefd_allocated / node_filefd_maximum * 100 > 80 + for: 5m + labels: + severity: warning + annotations: + summary: "File descriptors > 80% ({{ $labels.instance }})" + description: "File descriptor usage is high ({{ $value | printf \"%.1f\" }}%)." 
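+
+      # Tip: after changing any threshold in this file, validate it before
+      # reloading Prometheus, e.g.: promtool check rules rules/node-exporter.yml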
+ + # ============================================ + # Conntrack + # ============================================ + - alert: HostConntrackLimit + expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit * 100 > 80 + for: 5m + labels: + severity: warning + annotations: + summary: "Conntrack limit > 80% ({{ $labels.instance }})" + description: "Conntrack table is {{ $value | printf \"%.1f\" }}% full. Connections may be dropped." diff --git a/prometheus/targets/node-exporter.yml b/prometheus/targets/node-exporter.yml new file mode 100644 index 0000000..4a44d53 --- /dev/null +++ b/prometheus/targets/node-exporter.yml @@ -0,0 +1,30 @@ +# Node Exporter Targets +# File-based service discovery for node-exporter +# Prometheus watches this file and reloads targets automatically +# +# Add your hosts below: +# - Each target should be hostname:port (default port 9100) +# - Labels are applied to all metrics from that target +# - Changes are picked up automatically (no Prometheus restart needed) + +# Example configuration: +# - targets: +# - 'web-server-01:9100' +# - 'web-server-02:9100' +# labels: +# role: 'web' +# datacenter: 'us-east-1' +# +# - targets: +# - 'db-primary:9100' +# - 'db-replica-01:9100' +# labels: +# role: 'database' +# datacenter: 'us-east-1' + +# Your targets: +- targets: + - 'localhost:9100' + labels: + role: 'prometheus' + env: 'production'
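+
+# To confirm that new targets were picked up (no restart required), check the
+# Targets page in the Prometheus UI or query the API, e.g.:
+#   curl -s http://localhost:9090/api/v1/targets | grep node-exporter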