From 079e5aca51c6f0fb246744255d8f20e8ca5c4cd7 Mon Sep 17 00:00:00 2001 From: chrispsheehan Date: Wed, 4 Feb 2026 17:35:58 +0000 Subject: [PATCH 01/23] chore: lambda-get-code-deploy-alarms --- infra/modules/aws/_shared/lambda/main.tf | 1 + justfile | 17 +++++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/infra/modules/aws/_shared/lambda/main.tf b/infra/modules/aws/_shared/lambda/main.tf index e91b01a..51996f6 100644 --- a/infra/modules/aws/_shared/lambda/main.tf +++ b/infra/modules/aws/_shared/lambda/main.tf @@ -53,6 +53,7 @@ resource "aws_lambda_function" "lambda" { CodeDeployApplication = aws_codedeploy_app.app.name CodeDeployGroup = aws_codedeploy_deployment_group.dg.deployment_group_name DeploymentStrategy = local.deploy_config.type + CodeDeployAlarms = length(var.codedeploy_alarm_names) > 0 ? jsonencode(var.codedeploy_alarm_names) : "[]" } lifecycle { diff --git a/justfile b/justfile index 33e3da3..15215be 100644 --- a/justfile +++ b/justfile @@ -242,6 +242,11 @@ lambda-upload-bundle: lambda-get-function-arn: #!/usr/bin/env bash + if [[ -z "$FUNCTION_NAME" ]]; then + echo "❌ FUNCTION_NAME environment variable is not set." + exit 1 + fi + aws lambda get-function \ --function-name $FUNCTION_NAME \ --query 'Configuration.FunctionArn' \ @@ -265,6 +270,18 @@ lambda-get-code-deploy-group: --query 'Tags.CodeDeployGroup' \ --output text +lambda-get-code-deploy-alarms: + #!/usr/bin/env bash + set -euo pipefail + + FUNCTION_ARN=$(just lambda-get-function-arn) + + aws lambda list-tags \ + --resource "$FUNCTION_ARN" \ + --query 'Tags.CodeDeployAlarms' \ + --output text \ + | jq -c '.' 
+ lambda-deploy: #!/usr/bin/env bash From 30c7f325142a4c22428c2f608dce9d7cc11e6c78 Mon Sep 17 00:00:00 2001 From: chrispsheehan Date: Wed, 4 Feb 2026 17:46:25 +0000 Subject: [PATCH 02/23] chore: wip script --- justfile | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/justfile b/justfile index 15215be..d73c3d1 100644 --- a/justfile +++ b/justfile @@ -270,6 +270,7 @@ lambda-get-code-deploy-group: --query 'Tags.CodeDeployGroup' \ --output text + lambda-get-code-deploy-alarms: #!/usr/bin/env bash set -euo pipefail @@ -283,6 +284,28 @@ lambda-get-code-deploy-alarms: | jq -c '.' +lambda-get-code-deploy-alarms: + #!/usr/bin/env bash + set -euo pipefail + + ALARMS_JSON=$(just lambda-get-code-deploy-alarms-json) + + # Convert JSON array to space-separated list + ALARMS=$(echo "$ALARMS_JSON" | jq -r '.[]') + + # Reset each alarm to OK + for ALARM_NAME in $ALARMS; do + echo "Setting alarm to OK: $ALARM_NAME" + aws cloudwatch set-alarm-state \ + --alarm-name "$ALARM_NAME" \ + --state-value OK \ + --state-reason "Reset by CI/CD" + done + + # Output alarm names so they can be captured + echo $ALARMS + + lambda-deploy: #!/usr/bin/env bash set -euo pipefail From 193d2f9c1388997330419c8ea57378fba0664b4a Mon Sep 17 00:00:00 2001 From: chrispsheehan Date: Thu, 5 Feb 2026 09:43:30 +0000 Subject: [PATCH 03/23] fix: dupe lambda-get-code-deploy-alarms --- justfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/justfile b/justfile index d73c3d1..0caa7ba 100644 --- a/justfile +++ b/justfile @@ -284,7 +284,7 @@ lambda-get-code-deploy-alarms: | jq -c '.' 
-lambda-get-code-deploy-alarms: +lambda-set-code-deploy-alarms: #!/usr/bin/env bash set -euo pipefail From a6a177bf1d0fb6657c408dc66c15646317c46bb6 Mon Sep 17 00:00:00 2001 From: chrispsheehan Date: Thu, 5 Feb 2026 10:33:40 +0000 Subject: [PATCH 04/23] feat: working with reset to OK --- .github/workflows/deploy.yml | 8 ++++++++ infra/modules/aws/_shared/lambda/locals.tf | 5 +++++ infra/modules/aws/_shared/lambda/main.tf | 14 ++++++++------ justfile | 16 +++++++++------- 4 files changed, 30 insertions(+), 13 deletions(-) diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 0599860..2dace32 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -104,6 +104,14 @@ jobs: aws_oidc_role_arn: ${{ env.AWS_OIDC_ROLE_ARN }} just_action: lambda-upload-bundle + - name: Set Alarms to OK for CodeDeploy (if applicable) + uses: chrispsheehan/just-aws-oidc-action@0.3.0 + env: + FUNCTION_NAME: ${{ steps.get_infra_detail.outputs.lambda_function_name }} + with: + aws_oidc_role_arn: ${{ env.AWS_OIDC_ROLE_ARN }} + just_action: lambda-set-code-deploy-alarms + - name: Run CodeDeploy uses: chrispsheehan/just-aws-oidc-action@0.3.0 env: diff --git a/infra/modules/aws/_shared/lambda/locals.tf b/infra/modules/aws/_shared/lambda/locals.tf index fac9ddb..1a7ae80 100644 --- a/infra/modules/aws/_shared/lambda/locals.tf +++ b/infra/modules/aws/_shared/lambda/locals.tf @@ -44,4 +44,9 @@ locals { pc_trigger_percent = try(var.provisioned_config.auto_scale.trigger_percent, 70) / 100 pc_sqs_target_visible_messages = try(var.provisioned_config.sqs_scale.visible_messages, 0) pc_sqs_queue_name = try(var.provisioned_config.sqs_scale.queue_name, "") + + codedeploy_alarm_tags = { + for idx, alarm in var.codedeploy_alarm_names : + "CodeDeployAlarm${idx + 1}" => alarm + } } diff --git a/infra/modules/aws/_shared/lambda/main.tf b/infra/modules/aws/_shared/lambda/main.tf index 51996f6..8f16444 100644 --- a/infra/modules/aws/_shared/lambda/main.tf +++ 
b/infra/modules/aws/_shared/lambda/main.tf @@ -49,12 +49,14 @@ resource "aws_lambda_function" "lambda" { } # tags for identifying the code deploy app and its deployment config. Used in CI/CD pipelines. - tags = { - CodeDeployApplication = aws_codedeploy_app.app.name - CodeDeployGroup = aws_codedeploy_deployment_group.dg.deployment_group_name - DeploymentStrategy = local.deploy_config.type - CodeDeployAlarms = length(var.codedeploy_alarm_names) > 0 ? jsonencode(var.codedeploy_alarm_names) : "[]" - } + tags = merge( + { + CodeDeployApplication = aws_codedeploy_app.app.name + CodeDeployGroup = aws_codedeploy_deployment_group.dg.deployment_group_name + DeploymentStrategy = local.deploy_config.type + }, + local.codedeploy_alarm_tags + ) lifecycle { # Do not update on changes to the initial s3 file version diff --git a/justfile b/justfile index 0caa7ba..0f1ed0a 100644 --- a/justfile +++ b/justfile @@ -279,16 +279,21 @@ lambda-get-code-deploy-alarms: aws lambda list-tags \ --resource "$FUNCTION_ARN" \ - --query 'Tags.CodeDeployAlarms' \ - --output text \ - | jq -c '.' 
+ --query 'Tags' \ + --output json \ + | jq -c ' + to_entries + | map(select(.key | test("^CodeDeployAlarm[0-9]+$"))) + | sort_by(.key | sub("^CodeDeployAlarm"; "") | tonumber) + | map(.value) + ' lambda-set-code-deploy-alarms: #!/usr/bin/env bash set -euo pipefail - ALARMS_JSON=$(just lambda-get-code-deploy-alarms-json) + ALARMS_JSON=$(just lambda-get-code-deploy-alarms) # Convert JSON array to space-separated list ALARMS=$(echo "$ALARMS_JSON" | jq -r '.[]') @@ -302,9 +307,6 @@ lambda-set-code-deploy-alarms: --state-reason "Reset by CI/CD" done - # Output alarm names so they can be captured - echo $ALARMS - lambda-deploy: #!/usr/bin/env bash From 0ddf13dd63f6fb4803cce36326a8f34d8b523a2b Mon Sep 17 00:00:00 2001 From: chrispsheehan Date: Thu, 5 Feb 2026 14:44:05 +0000 Subject: [PATCH 05/23] fix: wait for none-ok alarms --- .github/workflows/deploy.yml | 14 +++++++------- infra/modules/aws/api/local.tf | 9 ++++++--- infra/modules/aws/api/main.tf | 2 +- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 2dace32..db69cdd 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -104,13 +104,13 @@ jobs: aws_oidc_role_arn: ${{ env.AWS_OIDC_ROLE_ARN }} just_action: lambda-upload-bundle - - name: Set Alarms to OK for CodeDeploy (if applicable) - uses: chrispsheehan/just-aws-oidc-action@0.3.0 - env: - FUNCTION_NAME: ${{ steps.get_infra_detail.outputs.lambda_function_name }} - with: - aws_oidc_role_arn: ${{ env.AWS_OIDC_ROLE_ARN }} - just_action: lambda-set-code-deploy-alarms + # - name: Set Alarms to OK for CodeDeploy (if applicable) + # uses: chrispsheehan/just-aws-oidc-action@0.3.0 + # env: + # FUNCTION_NAME: ${{ steps.get_infra_detail.outputs.lambda_function_name }} + # with: + # aws_oidc_role_arn: ${{ env.AWS_OIDC_ROLE_ARN }} + # just_action: lambda-set-code-deploy-alarms - name: Run CodeDeploy uses: chrispsheehan/just-aws-oidc-action@0.3.0 diff --git 
a/infra/modules/aws/api/local.tf b/infra/modules/aws/api/local.tf index aade786..5d7c5b7 100644 --- a/infra/modules/aws/api/local.tf +++ b/infra/modules/aws/api/local.tf @@ -1,5 +1,8 @@ locals { - lambda_name = "${var.environment}-${var.project_name}-api" - apigw_http_5xx_metric = "5xx" - api_5xx_alarm_name = "${local.lambda_name}-api-v2-5xx-rate-critical" + lambda_name = "${var.environment}-${var.project_name}-api" + apigw_http_5xx_metric = "5xx" + api_5xx_alarm_name = "${local.lambda_name}-api-v2-5xx-rate-critical" + alarm_period_seconds = 60 + alarm_window_minutes = ceil((local.alarm_period_seconds * var.api_5xx_alarm_evaluation_periods) / 60) + codedeploy_interval_mins = local.alarm_window_minutes + 2 # add a buffer to ensure we wait long enough for alarms to trigger if they will } \ No newline at end of file diff --git a/infra/modules/aws/api/main.tf b/infra/modules/aws/api/main.tf index 93c91b1..e00e573 100644 --- a/infra/modules/aws/api/main.tf +++ b/infra/modules/aws/api/main.tf @@ -14,7 +14,7 @@ module "lambda_api" { deployment_config = { strategy = "canary" percentage = 10 - interval_minutes = 3 # this is > the alarm evaluation period to ensure we catch the alarm if it triggers + interval_minutes = local.codedeploy_interval_mins } codedeploy_alarm_names = [ From 302c7cac3425b702bdca9c1064b1d6d694112828 Mon Sep 17 00:00:00 2001 From: chrispsheehan Date: Thu, 5 Feb 2026 14:54:53 +0000 Subject: [PATCH 06/23] fix: consumer codedeploy_interval_mins --- infra/modules/aws/api/local.tf | 7 ++++--- infra/modules/aws/consumer/local.tf | 4 ++++ infra/modules/aws/consumer/main.tf | 2 +- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/infra/modules/aws/api/local.tf b/infra/modules/aws/api/local.tf index 5d7c5b7..a4ef650 100644 --- a/infra/modules/aws/api/local.tf +++ b/infra/modules/aws/api/local.tf @@ -1,7 +1,8 @@ locals { - lambda_name = "${var.environment}-${var.project_name}-api" - apigw_http_5xx_metric = "5xx" - api_5xx_alarm_name = 
"${local.lambda_name}-api-v2-5xx-rate-critical" + lambda_name = "${var.environment}-${var.project_name}-api" + apigw_http_5xx_metric = "5xx" + api_5xx_alarm_name = "${local.lambda_name}-api-v2-5xx-rate-critical" + alarm_period_seconds = 60 alarm_window_minutes = ceil((local.alarm_period_seconds * var.api_5xx_alarm_evaluation_periods) / 60) codedeploy_interval_mins = local.alarm_window_minutes + 2 # add a buffer to ensure we wait long enough for alarms to trigger if they will diff --git a/infra/modules/aws/consumer/local.tf b/infra/modules/aws/consumer/local.tf index 57c06fc..4fb0f43 100644 --- a/infra/modules/aws/consumer/local.tf +++ b/infra/modules/aws/consumer/local.tf @@ -3,4 +3,8 @@ locals { lambda_name = "${var.environment}-${var.project_name}-consumer" sqs_queue_name = "${var.project_name}-${var.environment}-consumer-queue" sqs_dlq_name = "${var.project_name}-${var.environment}-consumer-dlq" + + alarm_period_seconds = 60 + alarm_window_minutes = ceil((local.alarm_period_seconds * var.sqs_dlq_alarm_evaluation_periods) / 60) + codedeploy_interval_mins = local.alarm_window_minutes + 2 # add a buffer to ensure we wait long enough for alarms to trigger if they will } \ No newline at end of file diff --git a/infra/modules/aws/consumer/main.tf b/infra/modules/aws/consumer/main.tf index a2c8e3c..8f76976 100644 --- a/infra/modules/aws/consumer/main.tf +++ b/infra/modules/aws/consumer/main.tf @@ -19,7 +19,7 @@ module "lambda_consumer" { deployment_config = { strategy = "canary" percentage = 10 - interval_minutes = 3 # this is > the alarm evaluation period to ensure we catch the alarm if it triggers + interval_minutes = local.codedeploy_interval_mins } codedeploy_alarm_names = [ From 953a2ca8fb4f588fa11c53c78a038c25d1dc70fe Mon Sep 17 00:00:00 2001 From: chrispsheehan Date: Thu, 5 Feb 2026 15:50:33 +0000 Subject: [PATCH 07/23] fix: increase deploy interval minutes --- .github/workflows/deploy.yml | 14 +++++++------- infra/live/dev/aws/api/terragrunt.hcl | 4 ++-- 
infra/live/dev/aws/consumer/terragrunt.hcl | 4 ++-- infra/live/prod/aws/api/terragrunt.hcl | 4 ++-- infra/live/prod/aws/consumer/terragrunt.hcl | 4 ++-- infra/modules/aws/api/local.tf | 4 ---- infra/modules/aws/api/main.tf | 2 +- infra/modules/aws/consumer/local.tf | 4 ---- infra/modules/aws/consumer/main.tf | 2 +- 9 files changed, 17 insertions(+), 25 deletions(-) diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index db69cdd..2dace32 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -104,13 +104,13 @@ jobs: aws_oidc_role_arn: ${{ env.AWS_OIDC_ROLE_ARN }} just_action: lambda-upload-bundle - # - name: Set Alarms to OK for CodeDeploy (if applicable) - # uses: chrispsheehan/just-aws-oidc-action@0.3.0 - # env: - # FUNCTION_NAME: ${{ steps.get_infra_detail.outputs.lambda_function_name }} - # with: - # aws_oidc_role_arn: ${{ env.AWS_OIDC_ROLE_ARN }} - # just_action: lambda-set-code-deploy-alarms + - name: Set Alarms to OK for CodeDeploy (if applicable) + uses: chrispsheehan/just-aws-oidc-action@0.3.0 + env: + FUNCTION_NAME: ${{ steps.get_infra_detail.outputs.lambda_function_name }} + with: + aws_oidc_role_arn: ${{ env.AWS_OIDC_ROLE_ARN }} + just_action: lambda-set-code-deploy-alarms - name: Run CodeDeploy uses: chrispsheehan/just-aws-oidc-action@0.3.0 diff --git a/infra/live/dev/aws/api/terragrunt.hcl b/infra/live/dev/aws/api/terragrunt.hcl index 37e4cc2..45daf1c 100644 --- a/infra/live/dev/aws/api/terragrunt.hcl +++ b/infra/live/dev/aws/api/terragrunt.hcl @@ -4,8 +4,8 @@ include { inputs = { api_5xx_alarm_threshold = 20.0 - api_5xx_alarm_evaluation_periods = 1 - api_5xx_alarm_datapoints_to_alarm = 1 + sqs_dlq_alarm_evaluation_periods = 3 + sqs_dlq_alarm_datapoints_to_alarm = 3 } terraform { diff --git a/infra/live/dev/aws/consumer/terragrunt.hcl b/infra/live/dev/aws/consumer/terragrunt.hcl index 9f0160a..a787f06 100644 --- a/infra/live/dev/aws/consumer/terragrunt.hcl +++ b/infra/live/dev/aws/consumer/terragrunt.hcl 
@@ -4,8 +4,8 @@ include { inputs = { sqs_dlq_alarm_threshold = 5 - sqs_dlq_alarm_evaluation_periods = 1 - sqs_dlq_alarm_datapoints_to_alarm = 1 + sqs_dlq_alarm_evaluation_periods = 3 + sqs_dlq_alarm_datapoints_to_alarm = 3 } terraform { diff --git a/infra/live/prod/aws/api/terragrunt.hcl b/infra/live/prod/aws/api/terragrunt.hcl index 35f660c..4d539da 100644 --- a/infra/live/prod/aws/api/terragrunt.hcl +++ b/infra/live/prod/aws/api/terragrunt.hcl @@ -4,8 +4,8 @@ include { inputs = { api_5xx_alarm_threshold = 5.0 - api_5xx_alarm_evaluation_periods = 1 - api_5xx_alarm_datapoints_to_alarm = 1 + api_5xx_alarm_evaluation_periods = 3 + api_5xx_alarm_datapoints_to_alarm = 3 } terraform { diff --git a/infra/live/prod/aws/consumer/terragrunt.hcl b/infra/live/prod/aws/consumer/terragrunt.hcl index 9f0160a..a787f06 100644 --- a/infra/live/prod/aws/consumer/terragrunt.hcl +++ b/infra/live/prod/aws/consumer/terragrunt.hcl @@ -4,8 +4,8 @@ include { inputs = { sqs_dlq_alarm_threshold = 5 - sqs_dlq_alarm_evaluation_periods = 1 - sqs_dlq_alarm_datapoints_to_alarm = 1 + sqs_dlq_alarm_evaluation_periods = 3 + sqs_dlq_alarm_datapoints_to_alarm = 3 } terraform { diff --git a/infra/modules/aws/api/local.tf b/infra/modules/aws/api/local.tf index a4ef650..aade786 100644 --- a/infra/modules/aws/api/local.tf +++ b/infra/modules/aws/api/local.tf @@ -2,8 +2,4 @@ locals { lambda_name = "${var.environment}-${var.project_name}-api" apigw_http_5xx_metric = "5xx" api_5xx_alarm_name = "${local.lambda_name}-api-v2-5xx-rate-critical" - - alarm_period_seconds = 60 - alarm_window_minutes = ceil((local.alarm_period_seconds * var.api_5xx_alarm_evaluation_periods) / 60) - codedeploy_interval_mins = local.alarm_window_minutes + 2 # add a buffer to ensure we wait long enough for alarms to trigger if they will } \ No newline at end of file diff --git a/infra/modules/aws/api/main.tf b/infra/modules/aws/api/main.tf index e00e573..94298f6 100644 --- a/infra/modules/aws/api/main.tf +++ 
b/infra/modules/aws/api/main.tf @@ -14,7 +14,7 @@ module "lambda_api" { deployment_config = { strategy = "canary" percentage = 10 - interval_minutes = local.codedeploy_interval_mins + interval_minutes = 5 # this should be > the CloudWatch alarm evaluation period to ensure we catch the alarm if it triggers } codedeploy_alarm_names = [ diff --git a/infra/modules/aws/consumer/local.tf b/infra/modules/aws/consumer/local.tf index 4fb0f43..57c06fc 100644 --- a/infra/modules/aws/consumer/local.tf +++ b/infra/modules/aws/consumer/local.tf @@ -3,8 +3,4 @@ locals { lambda_name = "${var.environment}-${var.project_name}-consumer" sqs_queue_name = "${var.project_name}-${var.environment}-consumer-queue" sqs_dlq_name = "${var.project_name}-${var.environment}-consumer-dlq" - - alarm_period_seconds = 60 - alarm_window_minutes = ceil((local.alarm_period_seconds * var.sqs_dlq_alarm_evaluation_periods) / 60) - codedeploy_interval_mins = local.alarm_window_minutes + 2 # add a buffer to ensure we wait long enough for alarms to trigger if they will } \ No newline at end of file diff --git a/infra/modules/aws/consumer/main.tf b/infra/modules/aws/consumer/main.tf index 8f76976..4bee045 100644 --- a/infra/modules/aws/consumer/main.tf +++ b/infra/modules/aws/consumer/main.tf @@ -19,7 +19,7 @@ module "lambda_consumer" { deployment_config = { strategy = "canary" percentage = 10 - interval_minutes = local.codedeploy_interval_mins + interval_minutes = 5 # this should be > the CloudWatch alarm evaluation period to ensure we catch the alarm if it triggers } codedeploy_alarm_names = [ From 5d64998c15931cc7b08a58c1349c0b85eda56745 Mon Sep 17 00:00:00 2001 From: chrispsheehan Date: Thu, 5 Feb 2026 15:54:34 +0000 Subject: [PATCH 08/23] fix: only new dlqs --- infra/modules/aws/consumer/main.tf | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/infra/modules/aws/consumer/main.tf b/infra/modules/aws/consumer/main.tf index 4bee045..f7d0fca 100644 --- 
a/infra/modules/aws/consumer/main.tf +++ b/infra/modules/aws/consumer/main.tf @@ -57,19 +57,20 @@ resource "aws_lambda_event_source_mapping" "sqs" { function_response_types = ["ReportBatchItemFailures"] } -resource "aws_cloudwatch_metric_alarm" "dlq_messages_present" { +resource "aws_cloudwatch_metric_alarm" "dlq_new_messages" { alarm_name = local.sqs_dlq_name - alarm_description = "Messages present in DLQ ${local.sqs_dlq_name}" + alarm_description = "New messages sent to DLQ ${local.sqs_dlq_name}" actions_enabled = true - namespace = "AWS/SQS" - metric_name = "ApproximateNumberOfMessagesVisible" - statistic = "Sum" - period = 60 + namespace = "AWS/SQS" + metric_name = "NumberOfMessagesSent" + statistic = "Sum" + period = 60 + evaluation_periods = var.sqs_dlq_alarm_evaluation_periods datapoints_to_alarm = var.sqs_dlq_alarm_datapoints_to_alarm - comparison_operator = "GreaterThanThreshold" + comparison_operator = "GreaterThanOrEqualToThreshold" threshold = var.sqs_dlq_alarm_threshold treat_missing_data = "notBreaching" @@ -77,3 +78,4 @@ resource "aws_cloudwatch_metric_alarm" "dlq_messages_present" { QueueName = local.sqs_dlq_name } } + From 4ca206daab3f565ec434a345c8023bd373e7d0ce Mon Sep 17 00:00:00 2001 From: chrispsheehan Date: Thu, 5 Feb 2026 15:55:31 +0000 Subject: [PATCH 09/23] fix: correct tg vars --- infra/live/dev/aws/api/terragrunt.hcl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/infra/live/dev/aws/api/terragrunt.hcl b/infra/live/dev/aws/api/terragrunt.hcl index 45daf1c..4ff5588 100644 --- a/infra/live/dev/aws/api/terragrunt.hcl +++ b/infra/live/dev/aws/api/terragrunt.hcl @@ -4,8 +4,8 @@ include { inputs = { api_5xx_alarm_threshold = 20.0 - sqs_dlq_alarm_evaluation_periods = 3 - sqs_dlq_alarm_datapoints_to_alarm = 3 + api_5xx_alarm_evaluation_periods = 3 + api_5xx_alarm_datapoints_to_alarm = 3 } terraform { From 3b9e2419641c4466e57e6d203408d48f2944740e Mon Sep 17 00:00:00 2001 From: chrispsheehan Date: Thu, 5 Feb 2026 16:21:14 
+0000 Subject: [PATCH 10/23] chore: minor --- justfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/justfile b/justfile index 0f1ed0a..52ddce8 100644 --- a/justfile +++ b/justfile @@ -336,7 +336,7 @@ lambda-deploy: --s3-location bucket=$BUCKET_NAME,key=$APP_SPEC_KEY,bundleType=zip \ --query "deploymentId" --output text) - echo "πŸš€ Started deployment: $DEPLOYMENT_ID" + echo "πŸš€ Started deployment: $DEPLOYMENT_ID App Spec File: $BUCKET_NAME/$APP_SPEC_KEY" if [[ -z "$DEPLOYMENT_ID" || "$DEPLOYMENT_ID" == "None" ]]; then echo "❌ Failed to create deployment β€” no deployment ID returned." From a6a8610f778bf1b35c722456c6a4a18ed6c33a6b Mon Sep 17 00:00:00 2001 From: chrispsheehan Date: Thu, 5 Feb 2026 16:59:15 +0000 Subject: [PATCH 11/23] chore: mv deploy config + tighten loop --- infra/live/dev/aws/api/terragrunt.hcl | 9 +++++++-- infra/live/dev/aws/consumer/terragrunt.hcl | 9 +++++++-- infra/live/prod/aws/api/terragrunt.hcl | 5 +++++ infra/live/prod/aws/consumer/terragrunt.hcl | 5 +++++ infra/modules/aws/api/main.tf | 6 +----- infra/modules/aws/api/variables.tf | 9 +++++++++ infra/modules/aws/consumer/main.tf | 6 +----- infra/modules/aws/consumer/variables.tf | 9 +++++++++ 8 files changed, 44 insertions(+), 14 deletions(-) diff --git a/infra/live/dev/aws/api/terragrunt.hcl b/infra/live/dev/aws/api/terragrunt.hcl index 4ff5588..56934c8 100644 --- a/infra/live/dev/aws/api/terragrunt.hcl +++ b/infra/live/dev/aws/api/terragrunt.hcl @@ -4,8 +4,13 @@ include { inputs = { api_5xx_alarm_threshold = 20.0 - api_5xx_alarm_evaluation_periods = 3 - api_5xx_alarm_datapoints_to_alarm = 3 + api_5xx_alarm_evaluation_periods = 1 + api_5xx_alarm_datapoints_to_alarm = 2 + deployment_config = { + strategy = "canary" + percentage = 10 + interval_minutes = 3 # this should be > the CloudWatch alarm evaluation period to ensure we catch the alarm if it triggers + } } terraform { diff --git a/infra/live/dev/aws/consumer/terragrunt.hcl 
b/infra/live/dev/aws/consumer/terragrunt.hcl index a787f06..1de96ee 100644 --- a/infra/live/dev/aws/consumer/terragrunt.hcl +++ b/infra/live/dev/aws/consumer/terragrunt.hcl @@ -4,8 +4,13 @@ include { inputs = { sqs_dlq_alarm_threshold = 5 - sqs_dlq_alarm_evaluation_periods = 3 - sqs_dlq_alarm_datapoints_to_alarm = 3 + sqs_dlq_alarm_evaluation_periods = 1 + sqs_dlq_alarm_datapoints_to_alarm = 2 + deployment_config = { + strategy = "canary" + percentage = 10 + interval_minutes = 3 # this should be > the CloudWatch alarm evaluation period to ensure we catch the alarm if it triggers + } } terraform { diff --git a/infra/live/prod/aws/api/terragrunt.hcl b/infra/live/prod/aws/api/terragrunt.hcl index 4d539da..72e1484 100644 --- a/infra/live/prod/aws/api/terragrunt.hcl +++ b/infra/live/prod/aws/api/terragrunt.hcl @@ -6,6 +6,11 @@ inputs = { api_5xx_alarm_threshold = 5.0 api_5xx_alarm_evaluation_periods = 3 api_5xx_alarm_datapoints_to_alarm = 3 + deployment_config = { + strategy = "canary" + percentage = 10 + interval_minutes = 5 # this should be > the CloudWatch alarm evaluation period to ensure we catch the alarm if it triggers + } } terraform { diff --git a/infra/live/prod/aws/consumer/terragrunt.hcl b/infra/live/prod/aws/consumer/terragrunt.hcl index a787f06..5556936 100644 --- a/infra/live/prod/aws/consumer/terragrunt.hcl +++ b/infra/live/prod/aws/consumer/terragrunt.hcl @@ -6,6 +6,11 @@ inputs = { sqs_dlq_alarm_threshold = 5 sqs_dlq_alarm_evaluation_periods = 3 sqs_dlq_alarm_datapoints_to_alarm = 3 + deployment_config = { + strategy = "canary" + percentage = 10 + interval_minutes = 5 # this should be > the CloudWatch alarm evaluation period to ensure we catch the alarm if it triggers + } } terraform { diff --git a/infra/modules/aws/api/main.tf b/infra/modules/aws/api/main.tf index 94298f6..f11d1bd 100644 --- a/infra/modules/aws/api/main.tf +++ b/infra/modules/aws/api/main.tf @@ -11,11 +11,7 @@ module "lambda_api" { DEBUG_DELAY_MS = 500 } - deployment_config = { - 
strategy = "canary" - percentage = 10 - interval_minutes = 5 # this should be > the CloudWatch alarm evaluation period to ensure we catch the alarm if it triggers - } + deployment_config = var.deployment_config codedeploy_alarm_names = [ local.api_5xx_alarm_name diff --git a/infra/modules/aws/api/variables.tf b/infra/modules/aws/api/variables.tf index a75acdf..b0b5f28 100644 --- a/infra/modules/aws/api/variables.tf +++ b/infra/modules/aws/api/variables.tf @@ -15,6 +15,15 @@ variable "lambda_bucket" { } ### end of static vars set in root.hcl ### +variable "deployment_config" { + description = "Traffic shifting: all_at_once | canary | linear" + type = object({ + strategy = string # all_at_once | canary | linear + percentage = optional(number) # 1..99 (req for canary/linear) + interval_minutes = optional(number) # >=1 (req for canary/linear) + }) +} + variable "api_5xx_alarm_threshold" { type = number description = "The threshold for the API 5xx error rate alarm" diff --git a/infra/modules/aws/consumer/main.tf b/infra/modules/aws/consumer/main.tf index f7d0fca..063a572 100644 --- a/infra/modules/aws/consumer/main.tf +++ b/infra/modules/aws/consumer/main.tf @@ -16,11 +16,7 @@ module "lambda_consumer" { module.sqs_queue.sqs_queue_read_policy_arn ] - deployment_config = { - strategy = "canary" - percentage = 10 - interval_minutes = 5 # this should be > the CloudWatch alarm evaluation period to ensure we catch the alarm if it triggers - } + deployment_config = var.deployment_config codedeploy_alarm_names = [ local.sqs_dlq_name diff --git a/infra/modules/aws/consumer/variables.tf b/infra/modules/aws/consumer/variables.tf index 6968594..205191d 100644 --- a/infra/modules/aws/consumer/variables.tf +++ b/infra/modules/aws/consumer/variables.tf @@ -15,6 +15,15 @@ variable "lambda_bucket" { } ### end of static vars set in root.hcl ### +variable "deployment_config" { + description = "Traffic shifting: all_at_once | canary | linear" + type = object({ + strategy = string # 
all_at_once | canary | linear + percentage = optional(number) # 1..99 (req for canary/linear) + interval_minutes = optional(number) # >=1 (req for canary/linear) + }) +} + variable "sqs_dlq_alarm_threshold" { type = number description = "The threshold for the SQS DLQ alarm" From 44201ed32ee96c70c6f46e484f9e7607b1722f08 Mon Sep 17 00:00:00 2001 From: chrispsheehan Date: Thu, 5 Feb 2026 17:09:29 +0000 Subject: [PATCH 12/23] chore: pass in provisioned config --- infra/live/dev/aws/api/terragrunt.hcl | 13 +++++++++++++ infra/live/prod/aws/api/terragrunt.hcl | 13 +++++++++++++ infra/modules/aws/api/main.tf | 12 +----------- infra/modules/aws/api/variables.tf | 20 ++++++++++++++++++++ 4 files changed, 47 insertions(+), 11 deletions(-) diff --git a/infra/live/dev/aws/api/terragrunt.hcl b/infra/live/dev/aws/api/terragrunt.hcl index 56934c8..1d273b8 100644 --- a/infra/live/dev/aws/api/terragrunt.hcl +++ b/infra/live/dev/aws/api/terragrunt.hcl @@ -6,11 +6,24 @@ inputs = { api_5xx_alarm_threshold = 20.0 api_5xx_alarm_evaluation_periods = 1 api_5xx_alarm_datapoints_to_alarm = 2 + deployment_config = { strategy = "canary" percentage = 10 interval_minutes = 3 # this should be > the CloudWatch alarm evaluation period to ensure we catch the alarm if it triggers } + + provisioned_config = { + auto_scale = { + max = 2 + min = 1 # always have 1 lambda ready to go + trigger_percent = 20 + scale_in_cooldown_seconds = 60 + scale_out_cooldown_seconds = 60 + } + + reserved_concurrency = 10 # limit the amount of concurrent executions to avoid throttling, but allow some bursting + } } terraform { diff --git a/infra/live/prod/aws/api/terragrunt.hcl b/infra/live/prod/aws/api/terragrunt.hcl index 72e1484..456994b 100644 --- a/infra/live/prod/aws/api/terragrunt.hcl +++ b/infra/live/prod/aws/api/terragrunt.hcl @@ -6,11 +6,24 @@ inputs = { api_5xx_alarm_threshold = 5.0 api_5xx_alarm_evaluation_periods = 3 api_5xx_alarm_datapoints_to_alarm = 3 + deployment_config = { strategy = "canary" percentage 
= 10 interval_minutes = 5 # this should be > the CloudWatch alarm evaluation period to ensure we catch the alarm if it triggers } + + provisioned_config = { + auto_scale = { + max = 2 + min = 1 # always have 1 lambda ready to go + trigger_percent = 20 + scale_in_cooldown_seconds = 60 + scale_out_cooldown_seconds = 60 + } + + reserved_concurrency = 10 # limit the amount of concurrent executions to avoid throttling, but allow some bursting + } } terraform { diff --git a/infra/modules/aws/api/main.tf b/infra/modules/aws/api/main.tf index f11d1bd..7252e17 100644 --- a/infra/modules/aws/api/main.tf +++ b/infra/modules/aws/api/main.tf @@ -17,17 +17,7 @@ module "lambda_api" { local.api_5xx_alarm_name ] - provisioned_config = { - auto_scale = { - max = 2 - min = 1 # always have 1 lambda ready to go - trigger_percent = 20 - scale_in_cooldown_seconds = 60 - scale_out_cooldown_seconds = 60 - } - - reserved_concurrency = 10 # limit the amount of concurrent executions to avoid throttling, but allow some bursting - } + provisioned_config = var.provisioned_config } resource "aws_apigatewayv2_api" "http_api" { diff --git a/infra/modules/aws/api/variables.tf b/infra/modules/aws/api/variables.tf index b0b5f28..17d0029 100644 --- a/infra/modules/aws/api/variables.tf +++ b/infra/modules/aws/api/variables.tf @@ -24,6 +24,26 @@ variable "deployment_config" { }) } +variable "provisioned_config" { + description = "Either fixed provisioned concurrency (fixed) or autoscaled (auto_scale); omit/zero = none" + type = object({ + fixed = optional(number) # 0/omit = off, >0 = fixed PC + reserved_concurrency = optional(number) # 0/omit = no concurrency limit, >0 = limited concurrency + + auto_scale = optional(object({ + min = number + max = number + trigger_percent = optional(number) + scale_in_cooldown_seconds = optional(number) + scale_out_cooldown_seconds = optional(number) + })) + }) + default = { + fixed = 0 + reserved_concurrency = 1 + } +} + variable "api_5xx_alarm_threshold" { type = 
number description = "The threshold for the API 5xx error rate alarm" From 90ed8d22d1c35d5d0401c446a93132573a222692 Mon Sep 17 00:00:00 2001 From: chrispsheehan Date: Fri, 6 Feb 2026 09:29:27 +0000 Subject: [PATCH 13/23] chore: pass in sqs provision config --- infra/live/dev/aws/consumer/terragrunt.hcl | 19 +++++++++++++++ infra/live/prod/aws/consumer/terragrunt.hcl | 19 +++++++++++++++ infra/modules/aws/consumer/main.tf | 11 +-------- infra/modules/aws/consumer/variables.tf | 26 +++++++++++++++++++++ 4 files changed, 65 insertions(+), 10 deletions(-) diff --git a/infra/live/dev/aws/consumer/terragrunt.hcl b/infra/live/dev/aws/consumer/terragrunt.hcl index 1de96ee..e3d9735 100644 --- a/infra/live/dev/aws/consumer/terragrunt.hcl +++ b/infra/live/dev/aws/consumer/terragrunt.hcl @@ -2,15 +2,34 @@ include { path = find_in_parent_folders("root.hcl") } +locals { + aws_account_id = get_aws_account_id() + sqs_queue_name = "${local.aws_account_id}-dev-serverless-consumer-queue" +} + inputs = { + sqs_queue_name = local.sqs_queue_name + sqs_dlq_alarm_threshold = 5 sqs_dlq_alarm_evaluation_periods = 1 sqs_dlq_alarm_datapoints_to_alarm = 2 + deployment_config = { strategy = "canary" percentage = 10 interval_minutes = 3 # this should be > the CloudWatch alarm evaluation period to ensure we catch the alarm if it triggers } + + provisioned_config = { + sqs_scale = { + min = 1 + max = 5 + visible_messages = 10 + queue_name = local.sqs_queue_name + scale_in_cooldown_seconds = 60 + scale_out_cooldown_seconds = 60 + } + } } terraform { diff --git a/infra/live/prod/aws/consumer/terragrunt.hcl b/infra/live/prod/aws/consumer/terragrunt.hcl index 5556936..f033653 100644 --- a/infra/live/prod/aws/consumer/terragrunt.hcl +++ b/infra/live/prod/aws/consumer/terragrunt.hcl @@ -2,15 +2,34 @@ include { path = find_in_parent_folders("root.hcl") } +locals { + aws_account_id = get_aws_account_id() + sqs_queue_name = "${local.aws_account_id}-dev-serverless-consumer-queue" +} + inputs = { + 
sqs_queue_name = local.sqs_queue_name + sqs_dlq_alarm_threshold = 5 sqs_dlq_alarm_evaluation_periods = 3 sqs_dlq_alarm_datapoints_to_alarm = 3 + deployment_config = { strategy = "canary" percentage = 10 interval_minutes = 5 # this should be > the CloudWatch alarm evaluation period to ensure we catch the alarm if it triggers } + + provisioned_config = { + sqs_scale = { + min = 1 + max = 5 + visible_messages = 10 + queue_name = local.sqs_queue_name + scale_in_cooldown_seconds = 60 + scale_out_cooldown_seconds = 60 + } + } } terraform { diff --git a/infra/modules/aws/consumer/main.tf b/infra/modules/aws/consumer/main.tf index 063a572..9c57296 100644 --- a/infra/modules/aws/consumer/main.tf +++ b/infra/modules/aws/consumer/main.tf @@ -22,16 +22,7 @@ module "lambda_consumer" { local.sqs_dlq_name ] - provisioned_config = { - sqs_scale = { - min = 1 - max = 5 - visible_messages = 10 - queue_name = module.sqs_queue.sqs_queue_name - scale_in_cooldown_seconds = 60 - scale_out_cooldown_seconds = 60 - } - } + provisioned_config = var.provisioned_config } # configure a deadletter queue (DLQ) for the SQS queue used by the Lambda consumer diff --git a/infra/modules/aws/consumer/variables.tf b/infra/modules/aws/consumer/variables.tf index 205191d..02bd554 100644 --- a/infra/modules/aws/consumer/variables.tf +++ b/infra/modules/aws/consumer/variables.tf @@ -15,6 +15,11 @@ variable "lambda_bucket" { } ### end of static vars set in root.hcl ### +variable "sqs_queue_name" { + type = string + description = "The name of the SQS queue" +} + variable "deployment_config" { description = "Traffic shifting: all_at_once | canary | linear" type = object({ @@ -24,6 +29,27 @@ variable "deployment_config" { }) } +variable "provisioned_config" { + description = "Either fixed provisioned concurrency (fixed) or autoscaled (auto_scale); omit/zero = none" + type = object({ + fixed = optional(number) # 0/omit = off, >0 = fixed PC + reserved_concurrency = optional(number) # 0/omit = no concurrency 
limit, >0 = limited concurrency + + sqs_scale = optional(object({ + min = number + max = number + visible_messages = number + queue_name = optional(string) + scale_in_cooldown_seconds = optional(number) + scale_out_cooldown_seconds = optional(number) + })) + }) + default = { + fixed = 0 + reserved_concurrency = 1 + } +} + variable "sqs_dlq_alarm_threshold" { type = number description = "The threshold for the SQS DLQ alarm" From 7a15ffac4657f960898458d6d815a25df08024b9 Mon Sep 17 00:00:00 2001 From: chrispsheehan Date: Fri, 6 Feb 2026 09:44:29 +0000 Subject: [PATCH 14/23] =?UTF-8?q?fix:=20DatapointsToAlarm=20must=20be=20le?= =?UTF-8?q?ss=20than=20or=20equal=20to=20=E2=94=82=20EvaluationPeriods?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- infra/live/dev/aws/api/terragrunt.hcl | 2 +- infra/live/dev/aws/consumer/terragrunt.hcl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/infra/live/dev/aws/api/terragrunt.hcl b/infra/live/dev/aws/api/terragrunt.hcl index 1d273b8..5ae721d 100644 --- a/infra/live/dev/aws/api/terragrunt.hcl +++ b/infra/live/dev/aws/api/terragrunt.hcl @@ -4,7 +4,7 @@ include { inputs = { api_5xx_alarm_threshold = 20.0 - api_5xx_alarm_evaluation_periods = 1 + api_5xx_alarm_evaluation_periods = 2 api_5xx_alarm_datapoints_to_alarm = 2 deployment_config = { diff --git a/infra/live/dev/aws/consumer/terragrunt.hcl b/infra/live/dev/aws/consumer/terragrunt.hcl index e3d9735..a2f7c84 100644 --- a/infra/live/dev/aws/consumer/terragrunt.hcl +++ b/infra/live/dev/aws/consumer/terragrunt.hcl @@ -11,7 +11,7 @@ inputs = { sqs_queue_name = local.sqs_queue_name sqs_dlq_alarm_threshold = 5 - sqs_dlq_alarm_evaluation_periods = 1 + sqs_dlq_alarm_evaluation_periods = 2 sqs_dlq_alarm_datapoints_to_alarm = 2 deployment_config = { From 9fcd79c752ffe66804a05400146263a7ba508f8c Mon Sep 17 00:00:00 2001 From: chrispsheehan Date: Fri, 6 Feb 2026 10:10:52 +0000 Subject: [PATCH 15/23] fix: fail-fast: false --- 
.github/workflows/infra.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/infra.yml b/.github/workflows/infra.yml index 21f2596..9012188 100644 --- a/.github/workflows/infra.yml +++ b/.github/workflows/infra.yml @@ -48,7 +48,7 @@ jobs: needs: oidc runs-on: ubuntu-latest strategy: - fail-fast: true + fail-fast: false # this is to prevent terraform lock issues matrix: value: ${{ fromJson(inputs.matrix) }} steps: From 799aa3080d3cdf45a4168649871e6ec7c496d137 Mon Sep 17 00:00:00 2001 From: chrispsheehan Date: Fri, 6 Feb 2026 10:30:13 +0000 Subject: [PATCH 16/23] chore: better detail in deploy script --- justfile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/justfile b/justfile index 52ddce8..17b5621 100644 --- a/justfile +++ b/justfile @@ -336,7 +336,10 @@ lambda-deploy: --s3-location bucket=$BUCKET_NAME,key=$APP_SPEC_KEY,bundleType=zip \ --query "deploymentId" --output text) - echo "🚀 Started deployment: $DEPLOYMENT_ID App Spec File: $BUCKET_NAME/$APP_SPEC_KEY" + echo "🚀 Deployment started: $DEPLOYMENT_ID" + echo "🏷️ CodeDeploy App: $CODE_DEPLOY_APP_NAME | Group: $CODE_DEPLOY_GROUP_NAME" + echo "📦 AppSpec artifact: s3://$BUCKET_NAME/$APP_SPEC_KEY" + echo "⏳ Monitoring deployment status…" if [[ -z "$DEPLOYMENT_ID" || "$DEPLOYMENT_ID" == "None" ]]; then echo "❌ Failed to create deployment — no deployment ID returned." 
From 541141316320be1eefd7dac8cfb695ebff936285 Mon Sep 17 00:00:00 2001 From: chrispsheehan Date: Fri, 6 Feb 2026 10:36:00 +0000 Subject: [PATCH 17/23] chore: rm whitespace --- justfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/justfile b/justfile index 17b5621..72da92e 100644 --- a/justfile +++ b/justfile @@ -337,7 +337,7 @@ lambda-deploy: --query "deploymentId" --output text) echo "🚀 Deployment started: $DEPLOYMENT_ID" - echo "🏷️ CodeDeploy App: $CODE_DEPLOY_APP_NAME | Group: $CODE_DEPLOY_GROUP_NAME" + echo "🏷️ CodeDeploy App: $CODE_DEPLOY_APP_NAME | Group: $CODE_DEPLOY_GROUP_NAME" echo "📦 AppSpec artifact: s3://$BUCKET_NAME/$APP_SPEC_KEY" echo "⏳ Monitoring deployment status…" From dd6b8bb251a19c28390e10b86d1298df1f4d4d25 Mon Sep 17 00:00:00 2001 From: chrispsheehan Date: Fri, 6 Feb 2026 11:48:57 +0000 Subject: [PATCH 18/23] chore: alarm iteration --- infra/live/dev/aws/api/terragrunt.hcl | 4 ++-- infra/live/dev/aws/consumer/terragrunt.hcl | 10 ++++------ infra/live/prod/aws/consumer/terragrunt.hcl | 6 ++---- infra/modules/aws/api/main.tf | 2 +- infra/modules/aws/consumer/main.tf | 5 ++--- infra/modules/aws/consumer/variables.tf | 2 +- justfile | 3 ++- 7 files changed, 14 insertions(+), 18 deletions(-) diff --git a/infra/live/dev/aws/api/terragrunt.hcl b/infra/live/dev/aws/api/terragrunt.hcl index 5ae721d..f82c435 100644 --- a/infra/live/dev/aws/api/terragrunt.hcl +++ b/infra/live/dev/aws/api/terragrunt.hcl @@ -4,8 +4,8 @@ include { inputs = { api_5xx_alarm_threshold = 20.0 - api_5xx_alarm_evaluation_periods = 2 - api_5xx_alarm_datapoints_to_alarm = 2 + api_5xx_alarm_evaluation_periods = 1 + api_5xx_alarm_datapoints_to_alarm = 1 deployment_config = { strategy = "canary" diff --git a/infra/live/dev/aws/consumer/terragrunt.hcl b/infra/live/dev/aws/consumer/terragrunt.hcl index a2f7c84..f6d95b2 100644 --- a/infra/live/dev/aws/consumer/terragrunt.hcl +++ b/infra/live/dev/aws/consumer/terragrunt.hcl @@ -10,14 +10,12 @@ locals { inputs = { 
sqs_queue_name = local.sqs_queue_name - sqs_dlq_alarm_threshold = 5 - sqs_dlq_alarm_evaluation_periods = 2 - sqs_dlq_alarm_datapoints_to_alarm = 2 + sqs_dlq_alarm_threshold = 1 # fail when any messages are in the DLQ (quick fail for testing) + sqs_dlq_alarm_evaluation_periods = 1 + sqs_dlq_alarm_datapoints_to_alarm = 1 deployment_config = { - strategy = "canary" - percentage = 10 - interval_minutes = 3 # this should be > the CloudWatch alarm evaluation period to ensure we catch the alarm if it triggers + strategy = "all_at_once" } provisioned_config = { diff --git a/infra/live/prod/aws/consumer/terragrunt.hcl b/infra/live/prod/aws/consumer/terragrunt.hcl index f033653..bc2b1bc 100644 --- a/infra/live/prod/aws/consumer/terragrunt.hcl +++ b/infra/live/prod/aws/consumer/terragrunt.hcl @@ -10,14 +10,12 @@ locals { inputs = { sqs_queue_name = local.sqs_queue_name - sqs_dlq_alarm_threshold = 5 + sqs_dlq_alarm_threshold = 5 # fail when there are 5 messages in the DLQ sqs_dlq_alarm_evaluation_periods = 3 sqs_dlq_alarm_datapoints_to_alarm = 3 deployment_config = { - strategy = "canary" - percentage = 10 - interval_minutes = 5 # this should be > the CloudWatch alarm evaluation period to ensure we catch the alarm if it triggers + strategy = "all_at_once" } provisioned_config = { diff --git a/infra/modules/aws/api/main.tf b/infra/modules/aws/api/main.tf index 7252e17..1991bae 100644 --- a/infra/modules/aws/api/main.tf +++ b/infra/modules/aws/api/main.tf @@ -89,7 +89,7 @@ resource "aws_cloudwatch_metric_alarm" "api_5xx_rate" { namespace = "AWS/ApiGateway" metric_name = local.apigw_http_5xx_metric stat = "Sum" - period = 60 + period = 60 # most aws metrics are emitted at 1-minute intervals, so using a shorter period can lead to more volatile alarms dimensions = { ApiId = aws_apigatewayv2_api.http_api.id diff --git a/infra/modules/aws/consumer/main.tf b/infra/modules/aws/consumer/main.tf index 9c57296..fafd7d9 100644 --- a/infra/modules/aws/consumer/main.tf +++ 
b/infra/modules/aws/consumer/main.tf @@ -45,14 +45,14 @@ resource "aws_lambda_event_source_mapping" "sqs" { } resource "aws_cloudwatch_metric_alarm" "dlq_new_messages" { - alarm_name = local.sqs_dlq_name + alarm_name = "${local.sqs_dlq_name}-new-messages" alarm_description = "New messages sent to DLQ ${local.sqs_dlq_name}" actions_enabled = true namespace = "AWS/SQS" metric_name = "NumberOfMessagesSent" statistic = "Sum" - period = 60 + period = 60 # most aws metrics are emitted at 1-minute intervals, so using a shorter period can lead to more volatile alarms evaluation_periods = var.sqs_dlq_alarm_evaluation_periods datapoints_to_alarm = var.sqs_dlq_alarm_datapoints_to_alarm @@ -65,4 +65,3 @@ resource "aws_cloudwatch_metric_alarm" "dlq_new_messages" { QueueName = local.sqs_dlq_name } } - diff --git a/infra/modules/aws/consumer/variables.tf b/infra/modules/aws/consumer/variables.tf index 02bd554..49b8449 100644 --- a/infra/modules/aws/consumer/variables.tf +++ b/infra/modules/aws/consumer/variables.tf @@ -52,7 +52,7 @@ variable "provisioned_config" { variable "sqs_dlq_alarm_threshold" { type = number - description = "The threshold for the SQS DLQ alarm" + description = "Age in seconds, e.g. 300 for 5 minutes of messages in the DLQ, to trigger the alarm" } variable "sqs_dlq_alarm_evaluation_periods" { diff --git a/justfile b/justfile index 72da92e..1746f67 100644 --- a/justfile +++ b/justfile @@ -459,8 +459,9 @@ test-send-dlq-messages: echo "Sending messages to SQS DLQ at $SQS_DLQ_QUEUE_URL..." - for i in {1..10}; do + for i in {1..180}; do aws sqs send-message --region $AWS_REGION --queue-url "$SQS_DLQ_QUEUE_URL" --message-body "Test message $i" + sleep 1 done echo "Finished sending messages." 
\ No newline at end of file From bf187430fc76c2d8afa6b38f0f2855e1e40a8ef9 Mon Sep 17 00:00:00 2001 From: chrispsheehan Date: Fri, 6 Feb 2026 12:33:35 +0000 Subject: [PATCH 19/23] chore: pass in alarm name from resource --- infra/modules/aws/api/main.tf | 2 +- infra/modules/aws/consumer/main.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/infra/modules/aws/api/main.tf b/infra/modules/aws/api/main.tf index 1991bae..f0101e7 100644 --- a/infra/modules/aws/api/main.tf +++ b/infra/modules/aws/api/main.tf @@ -14,7 +14,7 @@ module "lambda_api" { deployment_config = var.deployment_config codedeploy_alarm_names = [ - local.api_5xx_alarm_name + aws_cloudwatch_metric_alarm.api_5xx_rate.alarm_name ] provisioned_config = var.provisioned_config diff --git a/infra/modules/aws/consumer/main.tf b/infra/modules/aws/consumer/main.tf index fafd7d9..81ce76c 100644 --- a/infra/modules/aws/consumer/main.tf +++ b/infra/modules/aws/consumer/main.tf @@ -19,7 +19,7 @@ module "lambda_consumer" { deployment_config = var.deployment_config codedeploy_alarm_names = [ - local.sqs_dlq_name + aws_cloudwatch_metric_alarm.dlq_new_messages.alarm_name ] provisioned_config = var.provisioned_config From e9f8422ff387dd90164fb0ce23613ae0f211aeb3 Mon Sep 17 00:00:00 2001 From: chrispsheehan Date: Fri, 6 Feb 2026 14:51:31 +0000 Subject: [PATCH 20/23] fix: circular dep --- infra/modules/aws/api/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/infra/modules/aws/api/main.tf b/infra/modules/aws/api/main.tf index f0101e7..1991bae 100644 --- a/infra/modules/aws/api/main.tf +++ b/infra/modules/aws/api/main.tf @@ -14,7 +14,7 @@ module "lambda_api" { deployment_config = var.deployment_config codedeploy_alarm_names = [ - aws_cloudwatch_metric_alarm.api_5xx_rate.alarm_name + local.api_5xx_alarm_name ] provisioned_config = var.provisioned_config From 5beb15d3bd7462ee40a83a09e4f6030c9ce56402 Mon Sep 17 00:00:00 2001 From: chrispsheehan Date: Fri, 6 Feb 2026 15:03:20 +0000 
Subject: [PATCH 21/23] chore: sqs back to canary --- infra/live/dev/aws/consumer/terragrunt.hcl | 4 +++- infra/live/prod/aws/consumer/terragrunt.hcl | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/infra/live/dev/aws/consumer/terragrunt.hcl b/infra/live/dev/aws/consumer/terragrunt.hcl index f6d95b2..663e7f2 100644 --- a/infra/live/dev/aws/consumer/terragrunt.hcl +++ b/infra/live/dev/aws/consumer/terragrunt.hcl @@ -15,7 +15,9 @@ inputs = { sqs_dlq_alarm_datapoints_to_alarm = 1 deployment_config = { - strategy = "all_at_once" + strategy = "canary" + percentage = 50 + interval_minutes = 3 # this should be > the CloudWatch alarm evaluation period to ensure we catch the alarm if it triggers } provisioned_config = { diff --git a/infra/live/prod/aws/consumer/terragrunt.hcl b/infra/live/prod/aws/consumer/terragrunt.hcl index bc2b1bc..3859906 100644 --- a/infra/live/prod/aws/consumer/terragrunt.hcl +++ b/infra/live/prod/aws/consumer/terragrunt.hcl @@ -15,7 +15,9 @@ inputs = { sqs_dlq_alarm_datapoints_to_alarm = 3 deployment_config = { - strategy = "all_at_once" + strategy = "canary" + percentage = 10 + interval_minutes = 5 # this should be > the CloudWatch alarm evaluation period to ensure we catch the alarm if it triggers } provisioned_config = { From ea5ed85ff11bd1aa166c54e4ec062bbcec731f70 Mon Sep 17 00:00:00 2001 From: chrispsheehan Date: Fri, 6 Feb 2026 15:06:32 +0000 Subject: [PATCH 22/23] fix: rm comments for hcl validate --- infra/live/dev/aws/api/terragrunt.hcl | 6 +++--- infra/live/dev/aws/consumer/terragrunt.hcl | 4 ++-- infra/live/prod/aws/api/terragrunt.hcl | 6 +++--- infra/live/prod/aws/consumer/terragrunt.hcl | 4 ++-- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/infra/live/dev/aws/api/terragrunt.hcl b/infra/live/dev/aws/api/terragrunt.hcl index f82c435..f3cef8d 100644 --- a/infra/live/dev/aws/api/terragrunt.hcl +++ b/infra/live/dev/aws/api/terragrunt.hcl @@ -10,19 +10,19 @@ inputs = { deployment_config = { strategy = 
"canary" percentage = 10 - interval_minutes = 3 # this should be > the CloudWatch alarm evaluation period to ensure we catch the alarm if it triggers + interval_minutes = 3 } provisioned_config = { auto_scale = { max = 2 - min = 1 # always have 1 lambda ready to go + min = 1 trigger_percent = 20 scale_in_cooldown_seconds = 60 scale_out_cooldown_seconds = 60 } - reserved_concurrency = 10 # limit the amount of concurrent executions to avoid throttling, but allow some bursting + reserved_concurrency = 10 } } diff --git a/infra/live/dev/aws/consumer/terragrunt.hcl b/infra/live/dev/aws/consumer/terragrunt.hcl index 663e7f2..c0d1f4d 100644 --- a/infra/live/dev/aws/consumer/terragrunt.hcl +++ b/infra/live/dev/aws/consumer/terragrunt.hcl @@ -10,14 +10,14 @@ locals { inputs = { sqs_queue_name = local.sqs_queue_name - sqs_dlq_alarm_threshold = 1 # fail when any messages are in the DLQ (quick fail for testing) + sqs_dlq_alarm_threshold = 1 sqs_dlq_alarm_evaluation_periods = 1 sqs_dlq_alarm_datapoints_to_alarm = 1 deployment_config = { strategy = "canary" percentage = 50 - interval_minutes = 3 # this should be > the CloudWatch alarm evaluation period to ensure we catch the alarm if it triggers + interval_minutes = 3 } provisioned_config = { diff --git a/infra/live/prod/aws/api/terragrunt.hcl b/infra/live/prod/aws/api/terragrunt.hcl index 456994b..ee0ef64 100644 --- a/infra/live/prod/aws/api/terragrunt.hcl +++ b/infra/live/prod/aws/api/terragrunt.hcl @@ -10,19 +10,19 @@ inputs = { deployment_config = { strategy = "canary" percentage = 10 - interval_minutes = 5 # this should be > the CloudWatch alarm evaluation period to ensure we catch the alarm if it triggers + interval_minutes = 5 } provisioned_config = { auto_scale = { max = 2 - min = 1 # always have 1 lambda ready to go + min = 1 trigger_percent = 20 scale_in_cooldown_seconds = 60 scale_out_cooldown_seconds = 60 } - reserved_concurrency = 10 # limit the amount of concurrent executions to avoid throttling, but allow some 
bursting + reserved_concurrency = 10 } } diff --git a/infra/live/prod/aws/consumer/terragrunt.hcl b/infra/live/prod/aws/consumer/terragrunt.hcl index 3859906..e744018 100644 --- a/infra/live/prod/aws/consumer/terragrunt.hcl +++ b/infra/live/prod/aws/consumer/terragrunt.hcl @@ -10,14 +10,14 @@ locals { inputs = { sqs_queue_name = local.sqs_queue_name - sqs_dlq_alarm_threshold = 5 # fail when there are 5 messages in the DLQ + sqs_dlq_alarm_threshold = 5 sqs_dlq_alarm_evaluation_periods = 3 sqs_dlq_alarm_datapoints_to_alarm = 3 deployment_config = { strategy = "canary" percentage = 10 - interval_minutes = 5 # this should be > the CloudWatch alarm evaluation period to ensure we catch the alarm if it triggers + interval_minutes = 5 } provisioned_config = { From 799fcf4e3eeeed2335cfbd3f60d1c8fb84f81650 Mon Sep 17 00:00:00 2001 From: chrispsheehan Date: Fri, 6 Feb 2026 15:11:46 +0000 Subject: [PATCH 23/23] fix: fmt --- infra/live/dev/aws/consumer/terragrunt.hcl | 9 ++++----- infra/live/prod/aws/consumer/terragrunt.hcl | 7 +++---- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/infra/live/dev/aws/consumer/terragrunt.hcl b/infra/live/dev/aws/consumer/terragrunt.hcl index c0d1f4d..bddf4dc 100644 --- a/infra/live/dev/aws/consumer/terragrunt.hcl +++ b/infra/live/dev/aws/consumer/terragrunt.hcl @@ -3,21 +3,20 @@ include { } locals { - aws_account_id = get_aws_account_id() - sqs_queue_name = "${local.aws_account_id}-dev-serverless-consumer-queue" + sqs_queue_name = "dev-serverless-consumer-queue" } inputs = { sqs_queue_name = local.sqs_queue_name - sqs_dlq_alarm_threshold = 1 - sqs_dlq_alarm_evaluation_periods = 1 + sqs_dlq_alarm_threshold = 1 # fail when any messages are in the DLQ (quick fail for testing) + sqs_dlq_alarm_evaluation_periods = 1 sqs_dlq_alarm_datapoints_to_alarm = 1 deployment_config = { strategy = "canary" percentage = 50 - interval_minutes = 3 + interval_minutes = 3 # this should be > the CloudWatch alarm evaluation period to ensure we catch the 
alarm if it triggers } provisioned_config = { diff --git a/infra/live/prod/aws/consumer/terragrunt.hcl b/infra/live/prod/aws/consumer/terragrunt.hcl index e744018..b0930ac 100644 --- a/infra/live/prod/aws/consumer/terragrunt.hcl +++ b/infra/live/prod/aws/consumer/terragrunt.hcl @@ -3,21 +3,20 @@ include { } locals { - aws_account_id = get_aws_account_id() - sqs_queue_name = "${local.aws_account_id}-dev-serverless-consumer-queue" + sqs_queue_name = "serverless-consumer-queue" } inputs = { sqs_queue_name = local.sqs_queue_name - sqs_dlq_alarm_threshold = 5 + sqs_dlq_alarm_threshold = 5 # fail when there are 5 messages in the DLQ sqs_dlq_alarm_evaluation_periods = 3 sqs_dlq_alarm_datapoints_to_alarm = 3 deployment_config = { strategy = "canary" percentage = 10 - interval_minutes = 5 + interval_minutes = 3 # this should be > the CloudWatch alarm evaluation period to ensure we catch the alarm if it triggers } provisioned_config = {