mirror of
https://github.com/ghndrx/monitoring-stack.git
synced 2026-02-10 06:45:11 +00:00
- prometheus.yml: Service discovery, alerting, multi-job scraping
- alertmanager.yml: Routing tree, inhibition rules, multi-channel
- node-exporter.yml: 30+ alert rules (CPU, memory, disk, network, system)
- File-based service discovery for dynamic host management
- Updated README with usage docs and alert catalog

Alert categories: availability, resource saturation, disk predictive, I/O latency, network errors, clock sync, OOM detection, conntrack
148 lines
4.2 KiB
YAML
148 lines
4.2 KiB
YAML
# Prometheus Configuration
# Production-ready configuration with alerting and service discovery

# Server-wide defaults. Individual scrape jobs inherit these values unless
# they set their own.
global:
  # Default scrape interval
  scrape_interval: 15s

  # Evaluation interval for rules
  evaluation_interval: 15s

  # Scrape timeout (kept below scrape_interval so a slow target cannot
  # overlap the next scrape)
  scrape_timeout: 10s

  # External labels (sent with alerts and remote write)
  external_labels:
    cluster: 'production'
    environment: 'prod'
    # region: 'us-east-1'

# Alertmanager configuration
# Where Prometheus delivers firing alerts.
alerting:
  alertmanagers:
    # One Alertmanager endpoint group. `timeout` and `api_version` are
    # settings of this entry, so they sit beside `static_configs`, not
    # inside the target list.
    - static_configs:
        - targets:
            - 'alertmanager:9093'
      # Timeout for Alertmanager requests
      timeout: 10s
      # API version
      api_version: v2

# Rule files - evaluate these periodically
# Glob pattern; quoted so the leading-`*` segment is never read as a YAML alias.
rule_files:
  - 'rules/*.yml'

# Scrape configurations
# One list entry per job. Commented-out entries further down are optional
# examples and are indented so they can be enabled by removing the `# ` prefix.
scrape_configs:
  # ========================================
  # Prometheus self-monitoring
  # ========================================
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']
    relabel_configs:
      # No source_labels/regex given: the rule always applies and overwrites
      # `instance` with a fixed, readable name.
      - target_label: instance
        replacement: 'prometheus'

  # ========================================
  # Alertmanager
  # ========================================
  - job_name: 'alertmanager'
    static_configs:
      - targets: ['alertmanager:9093']
    relabel_configs:
      # Same fixed-name rewrite of `instance` as the prometheus job.
      - target_label: instance
        replacement: 'alertmanager'

  # ========================================
  # Node Exporter - Host metrics
  # ========================================
  - job_name: 'node-exporter'
    # Static targets (for fixed hosts)
    static_configs:
      - targets:
          - 'node-exporter:9100'
        # Labels attached to every target in this static group.
        labels:
          env: 'production'

    # File-based service discovery (add/remove hosts dynamically)
    file_sd_configs:
      - files:
          - 'targets/node-exporter.yml'
        refresh_interval: 30s

    # Relabeling
    relabel_configs:
      # Extract hostname from target address
      # (capture group 1 = everything before the `:port` suffix)
      - source_labels: [__address__]
        regex: '([^:]+):\d+'
        target_label: hostname
        replacement: '${1}'

  # ========================================
  # Docker / cAdvisor - Container metrics
  # ========================================
  - job_name: 'cadvisor'
    static_configs:
      - targets: ['cadvisor:8080']
    # Applied to scraped samples (after the scrape), unlike relabel_configs.
    metric_relabel_configs:
      # Drop high-cardinality container metrics if needed
      - source_labels: [__name__]
        regex: 'container_(network_tcp_usage_total|tasks_state|cpu_load_average_10s)'
        action: drop

  # ========================================
  # Kubernetes Service Discovery (if applicable)
  # ========================================
  # - job_name: 'kubernetes-pods'
  #   kubernetes_sd_configs:
  #     - role: pod
  #   relabel_configs:
  #     - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
  #       action: keep
  #       regex: 'true'
  #     - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
  #       action: replace
  #       target_label: __metrics_path__
  #       regex: (.+)
  #     - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
  #       action: replace
  #       regex: ([^:]+)(?::\d+)?;(\d+)
  #       replacement: $1:$2
  #       target_label: __address__
  #     - source_labels: [__meta_kubernetes_namespace]
  #       target_label: namespace
  #     - source_labels: [__meta_kubernetes_pod_name]
  #       target_label: pod

  # ========================================
  # Blackbox Exporter - Synthetic monitoring
  # ========================================
  # - job_name: 'blackbox-http'
  #   metrics_path: /probe
  #   params:
  #     module: [http_2xx]
  #   file_sd_configs:
  #     - files:
  #         - 'targets/blackbox-http.yml'
  #   relabel_configs:
  #     - source_labels: [__address__]
  #       target_label: __param_target
  #     - source_labels: [__param_target]
  #       target_label: instance
  #     - target_label: __address__
  #       replacement: blackbox-exporter:9115

# Remote write (for long-term storage)
# Disabled example; remove the leading `# ` from each line to enable.
# remote_write:
#   - url: 'http://mimir:9009/api/v1/push'
#     queue_config:
#       max_samples_per_send: 1000
#       max_shards: 200
#       capacity: 2500

# Remote read (for querying long-term storage)
# Disabled example; remove the leading `# ` from each line to enable.
# remote_read:
#   - url: 'http://mimir:9009/prometheus/api/v1/read'
#     read_recent: true