Files
monitoring-stack/prometheus/prometheus.yml
Greg Hendrickson d310d6ebbe feat: add production-ready Prometheus & Alertmanager configs
- prometheus.yml: Service discovery, alerting, multi-job scraping
- alertmanager.yml: Routing tree, inhibition rules, multi-channel
- node-exporter.yml: 30+ alert rules (CPU, memory, disk, network, system)
- File-based service discovery for dynamic host management
- Updated README with usage docs and alert catalog

Alert categories: availability, resource saturation, disk predictive,
I/O latency, network errors, clock sync, OOM detection, conntrack
2026-02-04 18:02:47 +00:00

148 lines
4.2 KiB
YAML

# Prometheus Configuration
# Production-ready configuration with alerting and service discovery

# Defaults applied to every scrape job and rule group unless a job
# overrides them.
global:
  # Default scrape interval
  scrape_interval: 15s
  # Evaluation interval for rules
  evaluation_interval: 15s
  # Scrape timeout (kept below scrape_interval so a slow target cannot
  # overlap the next scrape)
  scrape_timeout: 10s
  # External labels (sent with alerts and remote write)
  external_labels:
    cluster: 'production'
    environment: 'prod'
    # region: 'us-east-1'
# Alertmanager configuration
# Each entry under `alertmanagers` describes one group of Alertmanager
# instances; `timeout` and `api_version` are per-entry settings and sit
# alongside `static_configs`, not under it.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 'alertmanager:9093'
      # Timeout for Alertmanager requests
      timeout: 10s
      # API version
      api_version: v2
# Rule files - glob of rule definitions, evaluated periodically
rule_files:
  - 'rules/*.yml'
# Scrape configurations
# One list entry per job. Every key belonging to a job is indented under
# its `- job_name` item.
scrape_configs:
  # ========================================
  # Prometheus self-monitoring
  # ========================================
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']
    relabel_configs:
      # Replace the address-derived instance label with a fixed name
      - target_label: instance
        replacement: 'prometheus'

  # ========================================
  # Alertmanager
  # ========================================
  - job_name: 'alertmanager'
    static_configs:
      - targets: ['alertmanager:9093']
    relabel_configs:
      # Replace the address-derived instance label with a fixed name
      - target_label: instance
        replacement: 'alertmanager'

  # ========================================
  # Node Exporter - Host metrics
  # ========================================
  - job_name: 'node-exporter'
    # Static targets (for fixed hosts)
    static_configs:
      - targets:
          - 'node-exporter:9100'
        labels:
          env: 'production'
    # File-based service discovery (add/remove hosts dynamically)
    file_sd_configs:
      - files:
          - 'targets/node-exporter.yml'
        refresh_interval: 30s
    # Relabeling
    relabel_configs:
      # Extract hostname from target address (the part before ":<port>")
      - source_labels: [__address__]
        regex: '([^:]+):\d+'
        target_label: hostname
        replacement: '${1}'

  # ========================================
  # Docker / cAdvisor - Container metrics
  # ========================================
  - job_name: 'cadvisor'
    static_configs:
      - targets: ['cadvisor:8080']
    # Applied to scraped samples, after the scrape and before ingestion
    metric_relabel_configs:
      # Drop high-cardinality container metrics if needed
      - source_labels: [__name__]
        regex: 'container_(network_tcp_usage_total|tasks_state|cpu_load_average_10s)'
        action: drop
  # ========================================
  # Kubernetes Service Discovery (if applicable)
  # ========================================
  # - job_name: 'kubernetes-pods'
  #   kubernetes_sd_configs:
  #     - role: pod
  #   relabel_configs:
  #     - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
  #       action: keep
  #       regex: true
  #     - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
  #       action: replace
  #       target_label: __metrics_path__
  #       regex: (.+)
  #     - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
  #       action: replace
  #       regex: ([^:]+)(?::\d+)?;(\d+)
  #       replacement: $1:$2
  #       target_label: __address__
  #     - source_labels: [__meta_kubernetes_namespace]
  #       target_label: namespace
  #     - source_labels: [__meta_kubernetes_pod_name]
  #       target_label: pod
  # ========================================
  # Blackbox Exporter - Synthetic monitoring
  # ========================================
  # - job_name: 'blackbox-http'
  #   metrics_path: /probe
  #   params:
  #     module: [http_2xx]
  #   file_sd_configs:
  #     - files:
  #         - 'targets/blackbox-http.yml'
  #   relabel_configs:
  #     - source_labels: [__address__]
  #       target_label: __param_target
  #     - source_labels: [__param_target]
  #       target_label: instance
  #     - target_label: __address__
  #       replacement: blackbox-exporter:9115
# Remote write (for long-term storage)
# remote_write:
#   - url: 'http://mimir:9009/api/v1/push'
#     queue_config:
#       max_samples_per_send: 1000
#       max_shards: 200
#       capacity: 2500
# Remote read (for querying long-term storage)
# remote_read:
#   - url: 'http://mimir:9009/prometheus/api/v1/read'
#     read_recent: true