From cae319ee5902f80cd46abe4503c26ad98d435bd5 Mon Sep 17 00:00:00 2001 From: Greg Hendrickson Date: Thu, 5 Feb 2026 18:02:47 +0000 Subject: [PATCH] feat(finops): add cost-anomaly-detection module ML-powered anomaly detection using AWS Cost Explorer: - Flexible monitoring (service-level, account-level, or Cost Category) - Dual thresholds (percentage OR absolute impact) - Service-specific monitors with custom thresholds - SNS + direct email alerting - KMS encryption support Complements budget-alerts by catching spending anomalies that don't breach budget thresholds but deviate from patterns. --- .../modules/cost-anomaly-detection/README.md | 188 ++++++++++++++++ .../modules/cost-anomaly-detection/main.tf | 211 ++++++++++++++++++ .../modules/cost-anomaly-detection/outputs.tf | 38 ++++ .../cost-anomaly-detection/variables.tf | 114 ++++++++++ 4 files changed, 551 insertions(+) create mode 100644 terraform/modules/cost-anomaly-detection/README.md create mode 100644 terraform/modules/cost-anomaly-detection/main.tf create mode 100644 terraform/modules/cost-anomaly-detection/outputs.tf create mode 100644 terraform/modules/cost-anomaly-detection/variables.tf diff --git a/terraform/modules/cost-anomaly-detection/README.md b/terraform/modules/cost-anomaly-detection/README.md new file mode 100644 index 0000000..710276b --- /dev/null +++ b/terraform/modules/cost-anomaly-detection/README.md @@ -0,0 +1,188 @@ +# Cost Anomaly Detection Module + +ML-powered cost anomaly detection for AWS using Cost Explorer Anomaly Detection. + +## Overview + +AWS Cost Anomaly Detection uses machine learning to identify unusual spending patterns that might not trigger traditional budget alerts. 
This module complements `budget-alerts` by catching: + +- Unexpected spikes in service usage +- New services being used without authorization +- Gradual cost drift that compounds over time +- Anomalies specific to individual linked accounts + +## Features + +- **Flexible Monitoring**: Account-level, service-level, or custom (Cost Category) monitors +- **Smart Thresholds**: Alert on percentage change OR absolute impact (whichever triggers first) +- **Service-Specific Monitors**: Different thresholds for different services +- **Multi-Channel Alerts**: SNS topics + direct email subscriptions +- **Encryption**: Optional KMS encryption for SNS topic + +## Usage + +### Basic Setup + +```hcl +module "cost_anomaly" { + source = "../modules/cost-anomaly-detection" + + name_prefix = "prod" + alert_emails = ["finops@example.com", "oncall@example.com"] + + # Alert when anomaly exceeds 10% OR $100 + threshold_percentage = 10 + threshold_absolute = 100 +} +``` + +### With Service-Specific Monitors + +```hcl +module "cost_anomaly" { + source = "../modules/cost-anomaly-detection" + + name_prefix = "prod" + alert_emails = ["finops@example.com"] + + threshold_percentage = 10 + threshold_absolute = 100 + + # Additional monitors for critical services with custom thresholds + service_monitors = { + "Amazon Elastic Compute Cloud - Compute" = { + threshold_percentage = 15 + threshold_absolute = 500 + } + "Amazon Relational Database Service" = { + threshold_percentage = 20 + threshold_absolute = 200 + } + "Amazon SageMaker" = { + threshold_percentage = 25 + threshold_absolute = 1000 + } + } +} +``` + +### Multi-Account with Cost Categories + +```hcl +module "cost_anomaly" { + source = "../modules/cost-anomaly-detection" + + name_prefix = "enterprise" + + # Use CUSTOM monitor for Cost Category filtering + monitor_type = "CUSTOM" + cost_category_name = "Environment" + cost_category_values = ["Production"] + + threshold_percentage = 5 + threshold_absolute = 250 + + alert_emails = 
["finops@example.com"] +} +``` + +### Linked Account Monitoring + +```hcl +module "cost_anomaly" { + source = "../modules/cost-anomaly-detection" + + name_prefix = "org" + monitor_dimension = "LINKED_ACCOUNT" + + threshold_percentage = 15 + threshold_absolute = 100 + + alert_frequency = "IMMEDIATE" + + alert_emails = ["finops@example.com"] +} +``` + +## How It Works + +1. **Monitors** continuously analyze your AWS spending patterns using ML +2. **Anomalies** are detected when spending deviates significantly from the baseline +3. **Subscriptions** evaluate anomalies against your thresholds +4. **Alerts** are sent via SNS/email when thresholds are met or exceeded + +### Alert Frequency Options + +| Frequency | Description | +|-----------|-------------| +| `IMMEDIATE` | Alert as soon as an anomaly is detected (may be noisy); AWS delivers these alerts through SNS | +| `DAILY` | Aggregate anomalies and send a daily summary; AWS delivers summaries by email | +| `WEEKLY` | Aggregate anomalies and send a weekly summary; AWS delivers summaries by email | + +### Threshold Logic + +Alerts trigger when EITHER condition is met: +- Impact percentage >= `threshold_percentage` +- Impact amount >= `threshold_absolute` + +This ensures that neither small-percentage, large-dollar anomalies nor large-percentage, small-dollar anomalies are missed. + +## Integration with Budget Alerts + +| Scenario | Budget Alerts | Anomaly Detection | +|----------|--------------|-------------------| +| Spending hits $1000 budget | ✅ Alerts | ❌ No alert | +| Sudden 50% spike ($200→$300) | ❌ Under budget | ✅ Anomaly detected | +| Gradual drift over weeks | ❌ Each day under | ✅ Pattern detected | +| Unexpected use of a new service | ❌ May be under budget | ✅ New baseline alert | + +**Recommendation**: Use both modules together for comprehensive cost monitoring. 
+ +## Requirements + +| Name | Version | +|------|---------| +| terraform | >= 1.5 | +| aws | >= 5.0 | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| name_prefix | Prefix for resource names | `string` | n/a | yes | +| alert_emails | Email addresses for SNS notifications | `list(string)` | `[]` | no | +| direct_email_subscribers | Direct email subscribers (bypasses SNS) | `list(string)` | `[]` | no | +| monitor_type | DIMENSIONAL or CUSTOM | `string` | `"DIMENSIONAL"` | no | +| monitor_dimension | SERVICE or LINKED_ACCOUNT | `string` | `"SERVICE"` | no | +| cost_category_name | Cost Category for CUSTOM monitors | `string` | `null` | no | +| cost_category_values | Values for Cost Category filter | `list(string)` | `[]` | no | +| alert_frequency | DAILY, IMMEDIATE, or WEEKLY | `string` | `"DAILY"` | no | +| threshold_percentage | Impact percentage threshold | `number` | `10` | no | +| threshold_absolute | Impact amount threshold (USD) | `number` | `100` | no | +| service_monitors | Service-specific monitors | `map(object)` | `{}` | no | +| kms_key_id | KMS key for SNS encryption | `string` | `null` | no | +| tags | Resource tags | `map(string)` | `{}` | no | + +## Outputs + +| Name | Description | +|------|-------------| +| monitor_arn | ARN of the main anomaly monitor | +| monitor_id | ID of the main anomaly monitor | +| subscription_arn | ARN of the anomaly subscription | +| subscription_id | ID of the anomaly subscription | +| sns_topic_arn | ARN of the SNS alert topic | +| service_monitor_arns | Map of service monitor ARNs | +| service_subscription_arns | Map of service subscription ARNs | + +## Cost + +AWS Cost Anomaly Detection is **free** to use. 
You only pay for: +- SNS notifications (minimal) +- Any custom monitoring integrations you add + +## References + +- [AWS Cost Anomaly Detection](https://docs.aws.amazon.com/cost-management/latest/userguide/manage-ad.html) +- [Terraform aws_ce_anomaly_monitor](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/ce_anomaly_monitor) +- [AWS FinOps Best Practices](https://aws.amazon.com/aws-cost-management/aws-finops/) diff --git a/terraform/modules/cost-anomaly-detection/main.tf b/terraform/modules/cost-anomaly-detection/main.tf new file mode 100644 index 0000000..9e4ea79 --- /dev/null +++ b/terraform/modules/cost-anomaly-detection/main.tf @@ -0,0 +1,211 @@ +################################################################################ +# Cost Anomaly Detection Module +# +# AWS Cost Anomaly Detection using ML-powered anomaly monitoring: +# - Account-level or service-level monitors +# - Configurable alerting thresholds (% or absolute) +# - SNS and email subscriptions +# - Multi-account support via Cost Category or Linked Account monitors +# +# Complements budget-alerts by catching unexpected spend patterns +# that don't necessarily breach budget thresholds. 
+#
+# Usage:
+#   module "cost_anomaly" {
+#     source = "../modules/cost-anomaly-detection"
+#
+#     name_prefix  = "prod"
+#     alert_emails = ["finops@example.com"]
+#
+#     # Alert when anomaly exceeds 10% OR $100
+#     threshold_percentage = 10
+#     threshold_absolute   = 100
+#   }
+################################################################################
+
+terraform {
+  required_version = ">= 1.5"
+  required_providers {
+    aws = {
+      source  = "hashicorp/aws"
+      version = ">= 5.0"
+    }
+  }
+}
+
+# -----------------------------------------------------------------------------
+# Subscriber routing
+#
+# The Cost Explorer API documents IMMEDIATE alerts as delivered through SNS and
+# DAILY/WEEKLY summaries as delivered by email, so the subscriber type follows
+# var.alert_frequency instead of always attaching the SNS topic.
+# -----------------------------------------------------------------------------
+locals {
+  immediate_alerts = var.alert_frequency == "IMMEDIATE"
+
+  # Every configured address, de-duplicated; used for DAILY/WEEKLY summaries.
+  summary_email_subscribers = distinct(concat(var.alert_emails, var.direct_email_subscribers))
+
+  # With IMMEDIATE the main subscription keeps its previous subscriber set
+  # (SNS topic plus direct_email_subscribers); alert_emails are reached via SNS.
+  main_email_subscribers = local.immediate_alerts ? var.direct_email_subscribers : local.summary_email_subscribers
+
+  # Service subscriptions previously had no EMAIL subscribers at all, so they
+  # only gain them when SNS cannot be used.
+  service_email_subscribers = local.immediate_alerts ? [] : local.summary_email_subscribers
+}
+
+# -----------------------------------------------------------------------------
+# SNS Topic for Anomaly Alerts
+# -----------------------------------------------------------------------------
+resource "aws_sns_topic" "anomaly_alerts" {
+  name = "${var.name_prefix}-cost-anomaly-alerts"
+
+  # NOTE(review): when kms_key_id is set, the key policy presumably also has to
+  # let costalerts.amazonaws.com use the key; that policy is not managed here.
+  kms_master_key_id = var.kms_key_id
+
+  tags = merge(var.tags, {
+    Name    = "${var.name_prefix}-cost-anomaly-alerts"
+    Purpose = "cost-anomaly-detection"
+  })
+}
+
+# Attaches the policy document built below to the topic.
+resource "aws_sns_topic_policy" "anomaly_alerts" {
+  arn    = aws_sns_topic.anomaly_alerts.arn
+  policy = data.aws_iam_policy_document.sns_policy.json
+}
+
+# Allows the costalerts.amazonaws.com service principal to publish to the
+# topic, restricted to requests whose source account is the caller's account.
+data "aws_iam_policy_document" "sns_policy" {
+  statement {
+    sid    = "AllowCostExplorerPublish"
+    effect = "Allow"
+
+    principals {
+      type        = "Service"
+      identifiers = ["costalerts.amazonaws.com"]
+    }
+
+    actions   = ["sns:Publish"]
+    resources = [aws_sns_topic.anomaly_alerts.arn]
+
+    condition {
+      test     = "StringEquals"
+      variable = "aws:SourceAccount"
+      values   = [data.aws_caller_identity.current.account_id]
+    }
+  }
+}
+
+data "aws_caller_identity" "current" {}
+
+# -----------------------------------------------------------------------------
+# Email Subscriptions
+# -----------------------------------------------------------------------------
+# One SNS email subscription per address in var.alert_emails.
+resource "aws_sns_topic_subscription" "email" {
+  for_each = toset(var.alert_emails)
+
+  topic_arn = aws_sns_topic.anomaly_alerts.arn
+  protocol  = "email"
+  endpoint  = each.value
+}
+
+# -----------------------------------------------------------------------------
+# Cost Anomaly Monitor
+# -----------------------------------------------------------------------------
+resource "aws_ce_anomaly_monitor" "main" {
+  name              = "${var.name_prefix}-cost-anomaly-monitor"
+  monitor_type      = var.monitor_type
+  monitor_dimension = var.monitor_type == "DIMENSIONAL" ? var.monitor_dimension : null
+
+  # monitor_specification is a JSON-encoded Cost Explorer Expression string,
+  # not a nested block. Only set for CUSTOM monitors; filters on one Cost
+  # Category key and the given values.
+  monitor_specification = var.monitor_type == "CUSTOM" && var.cost_category_name != null ? jsonencode({
+    And = null
+    CostCategories = {
+      Key          = var.cost_category_name
+      MatchOptions = ["EQUALS"]
+      Values       = var.cost_category_values
+    }
+    Dimensions = null
+    Not        = null
+    Or         = null
+    Tags       = null
+  }) : null
+
+  tags = merge(var.tags, {
+    Name = "${var.name_prefix}-cost-anomaly-monitor"
+  })
+
+  lifecycle {
+    # A CUSTOM monitor without a specification cannot be created; fail at plan
+    # time with a clear message instead of at apply time.
+    precondition {
+      condition     = var.monitor_type != "CUSTOM" || var.cost_category_name != null
+      error_message = "cost_category_name must be set when monitor_type is CUSTOM."
+    }
+  }
+}
+
+# -----------------------------------------------------------------------------
+# Anomaly Subscription (Alert Configuration)
+# -----------------------------------------------------------------------------
+resource "aws_ce_anomaly_subscription" "main" {
+  name      = "${var.name_prefix}-cost-anomaly-subscription"
+  frequency = var.alert_frequency
+
+  monitor_arn_list = [aws_ce_anomaly_monitor.main.arn]
+
+  # SNS delivery, attached only when the frequency is IMMEDIATE.
+  dynamic "subscriber" {
+    for_each = local.immediate_alerts ? [1] : []
+    content {
+      type    = "SNS"
+      address = aws_sns_topic.anomaly_alerts.arn
+    }
+  }
+
+  # Direct email delivery (bypasses SNS). For DAILY/WEEKLY this covers
+  # alert_emails as well as direct_email_subscribers.
+  # NOTE(review): EMAIL subscribers combined with IMMEDIATE are kept as before;
+  # confirm the API accepts that combination.
+  dynamic "subscriber" {
+    for_each = local.main_email_subscribers
+    content {
+      type    = "EMAIL"
+      address = subscriber.value
+    }
+  }
+
+  # Threshold configuration - alert when EITHER condition is met.
+  # Each `or` block takes a single dimension, so there is one block per test.
+  threshold_expression {
+    or {
+      dimension {
+        key           = "ANOMALY_TOTAL_IMPACT_PERCENTAGE"
+        values        = [tostring(var.threshold_percentage)]
+        match_options = ["GREATER_THAN_OR_EQUAL"]
+      }
+    }
+    or {
+      dimension {
+        key           = "ANOMALY_TOTAL_IMPACT_ABSOLUTE"
+        values        = [tostring(var.threshold_absolute)]
+        match_options = ["GREATER_THAN_OR_EQUAL"]
+      }
+    }
+  }
+
+  tags = merge(var.tags, {
+    Name = "${var.name_prefix}-cost-anomaly-subscription"
+  })
+
+  # The topic policy must exist before the subscription targets the topic.
+  depends_on = [aws_sns_topic_policy.anomaly_alerts]
+}
+
+# -----------------------------------------------------------------------------
+# Service-Specific Monitors (Optional)
+# -----------------------------------------------------------------------------
+# NOTE(review): each.key is used only for the monitor name and the Service tag;
+# the monitor itself is a plain DIMENSIONAL/SERVICE monitor with no per-service
+# filter, so only the subscription thresholds differ per entry. AWS may also
+# limit how many SERVICE dimensional monitors an account can hold - TODO confirm.
+resource "aws_ce_anomaly_monitor" "service" {
+  for_each = var.service_monitors
+
+  name              = "${var.name_prefix}-${each.key}-anomaly-monitor"
+  monitor_type      = "DIMENSIONAL"
+  monitor_dimension = "SERVICE"
+
+  tags = merge(var.tags, {
+    Name    = "${var.name_prefix}-${each.key}-anomaly-monitor"
+    Service = each.key
+  })
+}
+
+# One subscription per service_monitors entry, using that entry's thresholds.
+resource "aws_ce_anomaly_subscription" "service" {
+  for_each = var.service_monitors
+
+  name      = "${var.name_prefix}-${each.key}-anomaly-subscription"
+  frequency = var.alert_frequency
+
+  monitor_arn_list = [aws_ce_anomaly_monitor.service[each.key].arn]
+
+  # SNS delivery, attached only when the frequency is IMMEDIATE.
+  dynamic "subscriber" {
+    for_each = local.immediate_alerts ? [1] : []
+    content {
+      type    = "SNS"
+      address = aws_sns_topic.anomaly_alerts.arn
+    }
+  }
+
+  # Email delivery for DAILY/WEEKLY summaries.
+  dynamic "subscriber" {
+    for_each = local.service_email_subscribers
+    content {
+      type    = "EMAIL"
+      address = subscriber.value
+    }
+  }
+
+  # Alert when EITHER threshold is reached; one dimension per `or` block.
+  threshold_expression {
+    or {
+      dimension {
+        key           = "ANOMALY_TOTAL_IMPACT_PERCENTAGE"
+        values        = [tostring(each.value.threshold_percentage)]
+        match_options = ["GREATER_THAN_OR_EQUAL"]
+      }
+    }
+    or {
+      dimension {
+        key           = "ANOMALY_TOTAL_IMPACT_ABSOLUTE"
+        values        = [tostring(each.value.threshold_absolute)]
+        match_options = ["GREATER_THAN_OR_EQUAL"]
+      }
+    }
+  }
+
+  tags = merge(var.tags, {
+    Name    = "${var.name_prefix}-${each.key}-anomaly-subscription"
+    Service = each.key
+  })
+
+  depends_on = [aws_sns_topic_policy.anomaly_alerts]
+}
diff --git a/terraform/modules/cost-anomaly-detection/outputs.tf b/terraform/modules/cost-anomaly-detection/outputs.tf
new file mode 100644
index 0000000..3fe74e4
--- /dev/null
+++ b/terraform/modules/cost-anomaly-detection/outputs.tf
@@ -0,0 +1,38 @@
+################################################################################
+# Outputs
+################################################################################
+
+output "monitor_arn" {
+  description = "ARN of the main cost 
anomaly monitor"
+  value       = aws_ce_anomaly_monitor.main.arn
+}
+
+output "monitor_id" {
+  description = "ID of the main cost anomaly monitor"
+  value       = aws_ce_anomaly_monitor.main.id
+}
+
+output "subscription_arn" {
+  description = "ARN of the cost anomaly subscription"
+  value       = aws_ce_anomaly_subscription.main.arn
+}
+
+output "subscription_id" {
+  description = "ID of the cost anomaly subscription"
+  value       = aws_ce_anomaly_subscription.main.id
+}
+
+output "sns_topic_arn" {
+  description = "ARN of the SNS topic for anomaly alerts"
+  value       = aws_sns_topic.anomaly_alerts.arn
+}
+
+output "service_monitor_arns" {
+  description = "Map of service-specific monitor ARNs"
+  value       = { for k, v in aws_ce_anomaly_monitor.service : k => v.arn }
+}
+
+output "service_subscription_arns" {
+  description = "Map of service-specific subscription ARNs"
+  value       = { for k, v in aws_ce_anomaly_subscription.service : k => v.arn }
+}
diff --git a/terraform/modules/cost-anomaly-detection/variables.tf b/terraform/modules/cost-anomaly-detection/variables.tf
new file mode 100644
index 0000000..e2aadd2
--- /dev/null
+++ b/terraform/modules/cost-anomaly-detection/variables.tf
@@ -0,0 +1,114 @@
+################################################################################
+# Variables
+################################################################################
+
+variable "name_prefix" {
+  type        = string
+  description = "Prefix for resource names (e.g., 'prod', 'dev', 'finops')"
+}
+
+variable "alert_emails" {
+  type        = list(string)
+  description = "Email addresses for SNS notifications"
+  default     = []
+}
+
+variable "direct_email_subscribers" {
+  type        = list(string)
+  description = "Email addresses for direct Cost Explorer alerts (bypasses SNS)"
+  default     = []
+}
+
+variable "monitor_type" {
+  type        = string
+  description = "Type of anomaly monitor: DIMENSIONAL or CUSTOM"
+  default     = "DIMENSIONAL"
+
+  validation {
+    condition     = contains(["DIMENSIONAL", "CUSTOM"], var.monitor_type)
+    error_message = "monitor_type must be DIMENSIONAL or CUSTOM."
+  }
+}
+
+variable "monitor_dimension" {
+  type        = string
+  description = "Dimension for DIMENSIONAL monitors: SERVICE or LINKED_ACCOUNT"
+  default     = "SERVICE"
+
+  validation {
+    condition     = contains(["SERVICE", "LINKED_ACCOUNT"], var.monitor_dimension)
+    error_message = "monitor_dimension must be SERVICE or LINKED_ACCOUNT."
+  }
+}
+
+variable "cost_category_name" {
+  type        = string
+  description = "Cost Category name for CUSTOM monitors"
+  default     = null
+}
+
+variable "cost_category_values" {
+  type        = list(string)
+  description = "Cost Category values to filter for CUSTOM monitors"
+  default     = []
+}
+
+variable "alert_frequency" {
+  type        = string
+  description = "Frequency of anomaly alerts: DAILY, IMMEDIATE, or WEEKLY"
+  default     = "DAILY"
+
+  validation {
+    condition     = contains(["DAILY", "IMMEDIATE", "WEEKLY"], var.alert_frequency)
+    error_message = "alert_frequency must be DAILY, IMMEDIATE, or WEEKLY."
+  }
+}
+
+variable "threshold_percentage" {
+  type        = number
+  description = "Anomaly impact percentage threshold (e.g., 10 = 10%)"
+  default     = 10
+
+  validation {
+    condition     = var.threshold_percentage > 0 && var.threshold_percentage <= 100
+    error_message = "threshold_percentage must be greater than 0 and at most 100."
+  }
+}
+
+variable "threshold_absolute" {
+  type        = number
+  description = "Anomaly impact absolute threshold in USD"
+  default     = 100
+
+  validation {
+    condition     = var.threshold_absolute > 0
+    error_message = "threshold_absolute must be greater than 0."
+  }
+}
+
+variable "service_monitors" {
+  description = "Optional service-specific monitors with custom thresholds"
+  default     = {}
+  type = map(object({
+    threshold_percentage = number
+    threshold_absolute   = number
+  }))
+
+  # Sample value (one entry per monitor, keyed by a label of your choice):
+  # service_monitors = {
+  #   ec2 = { threshold_percentage = 15, threshold_absolute = 200 }
+  #   rds = { threshold_percentage = 20, threshold_absolute = 100 }
+  # }
+}
+
+variable "kms_key_id" {
+  description = "KMS key ID/ARN for SNS topic encryption (optional)"
+  default     = null
+  type        = string
+}
+
+variable "tags" {
+  description = "Tags to apply to all resources"
+  default     = {}
+  type        = map(string)
+}