chrispsheehan · chrispsheehan · Feb 6, 2026 · Feb 4, 2026 · Feb 4, 2026 · Feb 5, 2026
diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml
@@ -104,6 +104,14 @@ jobs:
           aws_oidc_role_arn: ${{ env.AWS_OIDC_ROLE_ARN }}
           just_action: lambda-upload-bundle
 
+      - name: Set Alarms to OK for CodeDeploy (if applicable)
+        uses: chrispsheehan/just-aws-oidc-action@0.3.0
+        env:
+          FUNCTION_NAME: ${{ steps.get_infra_detail.outputs.lambda_function_name }}
+        with:
+          aws_oidc_role_arn: ${{ env.AWS_OIDC_ROLE_ARN }}
+          just_action: lambda-set-code-deploy-alarms  # NOTE(review): recipe is outside this diff - confirm what it resets
+
       - name: Run CodeDeploy
         uses: chrispsheehan/just-aws-oidc-action@0.3.0
         env:

diff --git a/.github/workflows/infra.yml b/.github/workflows/infra.yml
@@ -48,7 +48,7 @@ jobs:
     needs: oidc
     runs-on: ubuntu-latest
     strategy:
-      fail-fast: true
+      fail-fast: false # do not cancel sibling matrix jobs: an interrupted terraform run can leave its state lock held
       matrix:
         value: ${{ fromJson(inputs.matrix) }}
     steps:

diff --git a/infra/live/dev/aws/api/terragrunt.hcl b/infra/live/dev/aws/api/terragrunt.hcl
@@ -6,6 +6,24 @@ inputs = {
   api_5xx_alarm_threshold           = 20.0
   api_5xx_alarm_evaluation_periods  = 1
   api_5xx_alarm_datapoints_to_alarm = 1
+
+  deployment_config = {
+    strategy         = "canary"
+    percentage       = 10 # traffic percentage for the canary shift
+    interval_minutes = 3  # keep > the 5xx alarm window (1 x 60s here) so the alarm can fire before the rollout completes
+  }
+
+  provisioned_config = {
+    auto_scale = {
+      max                        = 2
+      min                        = 1 # always have 1 lambda ready to go
+      trigger_percent            = 20
+      scale_in_cooldown_seconds  = 60
+      scale_out_cooldown_seconds = 60
+    }
+
+    reserved_concurrency = 10 # cap concurrent executions at 10 (above the auto_scale max of 2)
+  }
 }
 
 terraform {

diff --git a/infra/live/dev/aws/consumer/terragrunt.hcl b/infra/live/dev/aws/consumer/terragrunt.hcl
@@ -2,10 +2,33 @@ include {
   path = find_in_parent_folders("root.hcl")
 }
 
+locals {
+  sqs_queue_name = "dev-serverless-consumer-queue" # single source for inputs.sqs_queue_name and sqs_scale.queue_name
+}
+
 inputs = {
-  sqs_dlq_alarm_threshold           = 5
+  sqs_queue_name = local.sqs_queue_name
+
+  sqs_dlq_alarm_threshold           = 1 # alarm on the first DLQ message counted in a period (quick fail for testing)
   sqs_dlq_alarm_evaluation_periods  = 1
   sqs_dlq_alarm_datapoints_to_alarm = 1
+
+  deployment_config = {
+    strategy         = "canary"
+    percentage       = 50
+    interval_minutes = 3 # keep > the DLQ alarm window (1 evaluation period x 60s here) so the alarm can fire before the rollout completes
+  }
+
+  provisioned_config = {
+    sqs_scale = {
+      min                        = 1
+      max                        = 5
+      visible_messages           = 10
+      queue_name                 = local.sqs_queue_name # same value as sqs_queue_name above
+      scale_in_cooldown_seconds  = 60
+      scale_out_cooldown_seconds = 60
+    }
+  }
 }
 
 terraform {

diff --git a/infra/live/prod/aws/api/terragrunt.hcl b/infra/live/prod/aws/api/terragrunt.hcl
@@ -4,8 +4,26 @@ include {
 
 inputs = {
   api_5xx_alarm_threshold           = 5.0
-  api_5xx_alarm_evaluation_periods  = 1
-  api_5xx_alarm_datapoints_to_alarm = 1
+  api_5xx_alarm_evaluation_periods  = 3
+  api_5xx_alarm_datapoints_to_alarm = 3
+
+  deployment_config = {
+    strategy         = "canary"
+    percentage       = 10 # traffic percentage for the canary shift
+    interval_minutes = 5  # keep > the 5xx alarm window (3 x 60s here) so the alarm can fire before the rollout completes
+  }
+
+  provisioned_config = {
+    auto_scale = {
+      max                        = 2
+      min                        = 1 # always have 1 lambda ready to go
+      trigger_percent            = 20
+      scale_in_cooldown_seconds  = 60
+      scale_out_cooldown_seconds = 60
+    }
+
+    reserved_concurrency = 10 # cap concurrent executions at 10 (above the auto_scale max of 2)
+  }
 }
 
 terraform {

diff --git a/infra/live/prod/aws/consumer/terragrunt.hcl b/infra/live/prod/aws/consumer/terragrunt.hcl
@@ -2,10 +2,33 @@ include {
   path = find_in_parent_folders("root.hcl")
 }
 
+locals {
+  sqs_queue_name = "serverless-consumer-queue" # single source for inputs.sqs_queue_name and sqs_scale.queue_name
+}
+
 inputs = {
-  sqs_dlq_alarm_threshold           = 5
-  sqs_dlq_alarm_evaluation_periods  = 1
-  sqs_dlq_alarm_datapoints_to_alarm = 1
+  sqs_queue_name = local.sqs_queue_name
+
+  sqs_dlq_alarm_threshold           = 5 # fail when there are 5 messages in the DLQ
+  sqs_dlq_alarm_evaluation_periods  = 3
+  sqs_dlq_alarm_datapoints_to_alarm = 3
+
+  deployment_config = {
+    strategy         = "canary"
+    percentage       = 10
+    interval_minutes = 5 # must be > the alarm window (3 evaluation periods x 60s = 3 min) so the alarm can trip before the rollout completes
+  }
+
+  provisioned_config = {
+    sqs_scale = {
+      min                        = 1
+      max                        = 5
+      visible_messages           = 10
+      queue_name                 = local.sqs_queue_name # same value as sqs_queue_name above
+      scale_in_cooldown_seconds  = 60
+      scale_out_cooldown_seconds = 60
+    }
+  }
 }
 
 terraform {

diff --git a/infra/modules/aws/_shared/lambda/locals.tf b/infra/modules/aws/_shared/lambda/locals.tf
@@ -44,4 +44,9 @@ locals {
   pc_trigger_percent             = try(var.provisioned_config.auto_scale.trigger_percent, 70) / 100
   pc_sqs_target_visible_messages = try(var.provisioned_config.sqs_scale.visible_messages, 0)
   pc_sqs_queue_name              = try(var.provisioned_config.sqs_scale.queue_name, "")
+
+  codedeploy_alarm_tags = { # one tag per entry in var.codedeploy_alarm_names
+    for idx, alarm in var.codedeploy_alarm_names :
+    "CodeDeployAlarm${idx + 1}" => alarm # 1-based keys: CodeDeployAlarm1, CodeDeployAlarm2, ...
+  }
 }
diff --git a/infra/modules/aws/_shared/lambda/main.tf b/infra/modules/aws/_shared/lambda/main.tf
@@ -49,11 +49,14 @@ resource "aws_lambda_function" "lambda" {
   }
 
   # tags for identifying the code deploy app and its deployment config. Used in CI/CD pipelines.
-  tags = {
-    CodeDeployApplication = aws_codedeploy_app.app.name
-    CodeDeployGroup       = aws_codedeploy_deployment_group.dg.deployment_group_name
-    DeploymentStrategy    = local.deploy_config.type
-  }
+  tags = merge(
+    {
+      CodeDeployApplication = aws_codedeploy_app.app.name
+      CodeDeployGroup       = aws_codedeploy_deployment_group.dg.deployment_group_name
+      DeploymentStrategy    = local.deploy_config.type
+    },
+    local.codedeploy_alarm_tags # adds one CodeDeployAlarm<N> tag per entry in var.codedeploy_alarm_names
+  )
 
   lifecycle {
     # Do not update on changes to the initial s3 file version

diff --git a/infra/modules/aws/api/main.tf b/infra/modules/aws/api/main.tf
@@ -11,27 +11,13 @@ module "lambda_api" {
     DEBUG_DELAY_MS = 500
   }
 
-  deployment_config = {
-    strategy         = "canary"
-    percentage       = 10
-    interval_minutes = 3 # this is > the alarm evaluation period to ensure we catch the alarm if it triggers
-  }
+  deployment_config = var.deployment_config
 
   codedeploy_alarm_names = [
     local.api_5xx_alarm_name
   ]
 
-  provisioned_config = {
-    auto_scale = {
-      max                        = 2
-      min                        = 1 # always have 1 lambda ready to go
-      trigger_percent            = 20
-      scale_in_cooldown_seconds  = 60
-      scale_out_cooldown_seconds = 60
-    }
-
-    reserved_concurrency = 10 # limit the amount of concurrent executions to avoid throttling, but allow some bursting
-  }
+  provisioned_config = var.provisioned_config
 }
 
 resource "aws_apigatewayv2_api" "http_api" {
@@ -103,7 +89,7 @@ resource "aws_cloudwatch_metric_alarm" "api_5xx_rate" {
       namespace   = "AWS/ApiGateway"
       metric_name = local.apigw_http_5xx_metric
       stat        = "Sum"
-      period      = 60
+      period      = 60 # most AWS metrics are emitted at 1-minute intervals; a shorter period can make alarms more volatile
 
       dimensions = {
         ApiId = aws_apigatewayv2_api.http_api.id

diff --git a/infra/modules/aws/api/variables.tf b/infra/modules/aws/api/variables.tf
@@ -15,6 +15,35 @@ variable "lambda_bucket" {
 }
 ### end of static vars set in root.hcl ###
 
+variable "deployment_config" { # no default: every caller must set this
+  description = "Traffic shifting: all_at_once | canary | linear"
+  type = object({
+    strategy         = string           # all_at_once | canary | linear
+    percentage       = optional(number) # 1..99; required for canary/linear
+    interval_minutes = optional(number) # >= 1; required for canary/linear
+  })
+}
+
+variable "provisioned_config" {
+  description = "Either fixed provisioned concurrency (fixed) or autoscaled (auto_scale); omit/zero = none"
+  type = object({
+    fixed                = optional(number) # 0/omit = off, >0 = fixed PC
+    reserved_concurrency = optional(number) # 0/omit = no concurrency limit, >0 = limited concurrency
+
+    auto_scale = optional(object({
+      min                        = number
+      max                        = number
+      trigger_percent            = optional(number) # shared lambda locals wrap this lookup in try(..., 70)
+      scale_in_cooldown_seconds  = optional(number)
+      scale_out_cooldown_seconds = optional(number)
+    }))
+  })
+  default = { # applies only when the caller omits provisioned_config entirely
+    fixed                = 0
+    reserved_concurrency = 1 # NOTE(review): per the comment above, >0 limits concurrency, so this default caps it at 1 - confirm intended
+  }
+}
+
 variable "api_5xx_alarm_threshold" {
   type        = number
   description = "The threshold for the API 5xx error rate alarm"

diff --git a/infra/modules/aws/consumer/main.tf b/infra/modules/aws/consumer/main.tf
@@ -16,26 +16,13 @@ module "lambda_consumer" {
     module.sqs_queue.sqs_queue_read_policy_arn
   ]
 
-  deployment_config = {
-    strategy         = "canary"
-    percentage       = 10
-    interval_minutes = 3 # this is > the alarm evaluation period to ensure we catch the alarm if it triggers
-  }
+  deployment_config = var.deployment_config
 
   codedeploy_alarm_names = [
-    local.sqs_dlq_name
+    aws_cloudwatch_metric_alarm.dlq_new_messages.alarm_name
   ]
 
-  provisioned_config = {
-    sqs_scale = {
-      min                        = 1
-      max                        = 5
-      visible_messages           = 10
-      queue_name                 = module.sqs_queue.sqs_queue_name
-      scale_in_cooldown_seconds  = 60
-      scale_out_cooldown_seconds = 60
-    }
-  }
+  provisioned_config = var.provisioned_config
 }
 
 # configure a deadletter queue (DLQ) for the SQS queue used by the Lambda consumer
@@ -57,19 +44,20 @@ resource "aws_lambda_event_source_mapping" "sqs" {
   function_response_types = ["ReportBatchItemFailures"]
 }
 
-resource "aws_cloudwatch_metric_alarm" "dlq_messages_present" {
-  alarm_name        = local.sqs_dlq_name
-  alarm_description = "Messages present in DLQ ${local.sqs_dlq_name}"
+resource "aws_cloudwatch_metric_alarm" "dlq_new_messages" {
+  alarm_name        = "${local.sqs_dlq_name}-new-messages"
+  alarm_description = "New messages sent to DLQ ${local.sqs_dlq_name}"
   actions_enabled   = true
 
-  namespace           = "AWS/SQS"
-  metric_name         = "ApproximateNumberOfMessagesVisible"
-  statistic           = "Sum"
-  period              = 60
+  namespace   = "AWS/SQS"
+  metric_name = "NumberOfMessagesSent" # NOTE(review): SQS docs say messages moved to a DLQ by failed processing are not counted here - confirm this alarm can fire
+  statistic   = "Sum"
+  period      = 60 # most AWS metrics are emitted at 1-minute intervals; a shorter period can make alarms more volatile
+
   evaluation_periods  = var.sqs_dlq_alarm_evaluation_periods
   datapoints_to_alarm = var.sqs_dlq_alarm_datapoints_to_alarm
 
-  comparison_operator = "GreaterThanThreshold"
+  comparison_operator = "GreaterThanOrEqualToThreshold"
   threshold           = var.sqs_dlq_alarm_threshold
   treat_missing_data  = "notBreaching"
 

diff --git a/infra/modules/aws/consumer/variables.tf b/infra/modules/aws/consumer/variables.tf
@@ -15,9 +15,44 @@ variable "lambda_bucket" {
 }
 ### end of static vars set in root.hcl ###
 
+variable "sqs_queue_name" { # no default: set per environment in the live consumer terragrunt.hcl
+  type        = string
+  description = "The name of the SQS queue"
+}
+
+variable "deployment_config" { # no default: every caller must set this
+  description = "Traffic shifting: all_at_once | canary | linear"
+  type = object({
+    strategy         = string           # all_at_once | canary | linear
+    percentage       = optional(number) # 1..99; required for canary/linear
+    interval_minutes = optional(number) # >= 1; required for canary/linear
+  })
+}
+
+variable "provisioned_config" {
+  description = "Either fixed provisioned concurrency (fixed) or autoscaled on SQS queue depth (sqs_scale); omit/zero = none"
+  type = object({
+    fixed                = optional(number) # 0/omit = off, >0 = fixed PC
+    reserved_concurrency = optional(number) # 0/omit = no concurrency limit, >0 = limited concurrency
+
+    sqs_scale = optional(object({
+      min                        = number
+      max                        = number
+      visible_messages           = number
+      queue_name                 = optional(string) # shared lambda locals wrap this lookup in try(..., "")
+      scale_in_cooldown_seconds  = optional(number)
+      scale_out_cooldown_seconds = optional(number)
+    }))
+  })
+  default = { # applies only when the caller omits provisioned_config entirely
+    fixed                = 0
+    reserved_concurrency = 1 # NOTE(review): per the comment above, >0 limits concurrency, so this default caps it at 1 - confirm intended
+  }
+}
+
 variable "sqs_dlq_alarm_threshold" {
   type        = number
-  description = "The threshold for the SQS DLQ alarm"
+  description = "Minimum number of messages sent to the DLQ in one 60-second alarm period that breaches the alarm"
 }
 
 variable "sqs_dlq_alarm_evaluation_periods" {