diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml
index 0599860..2dace32 100644
--- a/.github/workflows/deploy.yml
+++ b/.github/workflows/deploy.yml
@@ -104,6 +104,14 @@ jobs:
           aws_oidc_role_arn: ${{ env.AWS_OIDC_ROLE_ARN }}
           just_action: lambda-upload-bundle
 
+      - name: Set Alarms to OK for CodeDeploy (if applicable)  # runs ahead of the "Run CodeDeploy" step below
+        uses: chrispsheehan/just-aws-oidc-action@0.3.0
+        env:
+          FUNCTION_NAME: ${{ steps.get_infra_detail.outputs.lambda_function_name }}
+        with:
+          aws_oidc_role_arn: ${{ env.AWS_OIDC_ROLE_ARN }}
+          just_action: lambda-set-code-deploy-alarms  # sets every alarm named in the function's CodeDeployAlarm<N> tags to OK
+
       - name: Run CodeDeploy
         uses: chrispsheehan/just-aws-oidc-action@0.3.0
         env:
diff --git a/.github/workflows/infra.yml b/.github/workflows/infra.yml
index 21f2596..9012188 100644
--- a/.github/workflows/infra.yml
+++ b/.github/workflows/infra.yml
@@ -48,7 +48,7 @@ jobs:
     needs: oidc
     runs-on: ubuntu-latest
     strategy:
-      fail-fast: true
+      fail-fast: false # keep sibling matrix jobs running when one fails; this is to prevent terraform lock issues
       matrix:
         value: ${{ fromJson(inputs.matrix) }}
     steps:
diff --git a/infra/live/dev/aws/api/terragrunt.hcl b/infra/live/dev/aws/api/terragrunt.hcl
index 37e4cc2..f3cef8d 100644
--- a/infra/live/dev/aws/api/terragrunt.hcl
+++ b/infra/live/dev/aws/api/terragrunt.hcl
@@ -6,6 +6,24 @@ inputs = {
   api_5xx_alarm_threshold           = 20.0
   api_5xx_alarm_evaluation_periods  = 1
   api_5xx_alarm_datapoints_to_alarm = 1
+
+  deployment_config = {
+    strategy         = "canary"
+    percentage       = 10
+    interval_minutes = 3 # this should be > the CloudWatch alarm evaluation period to ensure we catch the alarm if it triggers
+  }
+
+  provisioned_config = {
+    auto_scale = {
+      max                        = 2
+      min                        = 1 # always have 1 lambda ready to go
+      trigger_percent            = 20
+      scale_in_cooldown_seconds  = 60
+      scale_out_cooldown_seconds = 60
+    }
+
+    reserved_concurrency = 10 # limit the amount of concurrent executions to avoid throttling, but allow some bursting
+  }
 }
 
 terraform {
diff --git a/infra/live/dev/aws/consumer/terragrunt.hcl b/infra/live/dev/aws/consumer/terragrunt.hcl
index 9f0160a..bddf4dc 100644
--- a/infra/live/dev/aws/consumer/terragrunt.hcl
+++ b/infra/live/dev/aws/consumer/terragrunt.hcl
@@ -2,10 +2,33 @@ include {
   path = find_in_parent_folders("root.hcl")
 }
 
+locals {
+  sqs_queue_name = "dev-serverless-consumer-queue" # defined once; reused for the sqs_queue_name input and sqs_scale.queue_name
+}
+
 inputs = {
-  sqs_dlq_alarm_threshold           = 5
+  sqs_queue_name = local.sqs_queue_name
+
+  sqs_dlq_alarm_threshold           = 1 # fail when any messages are in the DLQ (quick fail for testing)
   sqs_dlq_alarm_evaluation_periods  = 1
   sqs_dlq_alarm_datapoints_to_alarm = 1
+
+  deployment_config = {
+    strategy         = "canary"
+    percentage       = 50
+    interval_minutes = 3 # this should be > the CloudWatch alarm evaluation period to ensure we catch the alarm if it triggers
+  }
+
+  provisioned_config = {
+    sqs_scale = {
+      min                        = 1
+      max                        = 5
+      visible_messages           = 10
+      queue_name                 = local.sqs_queue_name
+      scale_in_cooldown_seconds  = 60
+      scale_out_cooldown_seconds = 60
+    }
+  }
 }
 
 terraform {
diff --git a/infra/live/prod/aws/api/terragrunt.hcl b/infra/live/prod/aws/api/terragrunt.hcl
index 35f660c..ee0ef64 100644
--- a/infra/live/prod/aws/api/terragrunt.hcl
+++ b/infra/live/prod/aws/api/terragrunt.hcl
@@ -4,8 +4,26 @@ include {
 
 inputs = {
   api_5xx_alarm_threshold           = 5.0
-  api_5xx_alarm_evaluation_periods  = 1
-  api_5xx_alarm_datapoints_to_alarm = 1
+  api_5xx_alarm_evaluation_periods  = 3
+  api_5xx_alarm_datapoints_to_alarm = 3
+
+  deployment_config = {
+    strategy         = "canary"
+    percentage       = 10
+    interval_minutes = 5 # this should be > the CloudWatch alarm evaluation period to ensure we catch the alarm if it triggers
+  }
+
+  provisioned_config = {
+    auto_scale = {
+      max                        = 2
+      min                        = 1 # always have 1 lambda ready to go
+      trigger_percent            = 20
+      scale_in_cooldown_seconds  = 60
+      scale_out_cooldown_seconds = 60
+    }
+
+    reserved_concurrency = 10 # limit the amount of concurrent executions to avoid throttling, but allow some bursting
+  }
 }
 
 terraform {
diff --git a/infra/live/prod/aws/consumer/terragrunt.hcl b/infra/live/prod/aws/consumer/terragrunt.hcl
index 9f0160a..b0930ac 100644
--- a/infra/live/prod/aws/consumer/terragrunt.hcl
+++ b/infra/live/prod/aws/consumer/terragrunt.hcl
@@ -2,10 +2,33 @@ include {
   path = find_in_parent_folders("root.hcl")
 }
 
+locals {
+  sqs_queue_name = "serverless-consumer-queue" # defined once; reused for the sqs_queue_name input and sqs_scale.queue_name
+}
+
 inputs = {
-  sqs_dlq_alarm_threshold           = 5
-  sqs_dlq_alarm_evaluation_periods  = 1
-  sqs_dlq_alarm_datapoints_to_alarm = 1
+  sqs_queue_name = local.sqs_queue_name
+
+  sqs_dlq_alarm_threshold           = 5 # fail when there are 5 messages in the DLQ
+  sqs_dlq_alarm_evaluation_periods  = 3
+  sqs_dlq_alarm_datapoints_to_alarm = 3
+
+  deployment_config = {
+    strategy         = "canary"
+    percentage       = 10
+    interval_minutes = 5 # must be > the CloudWatch alarm evaluation window (3 periods x 60s = 3 min here) so the alarm can trigger during the canary stage
+  }
+
+  provisioned_config = {
+    sqs_scale = {
+      min                        = 1
+      max                        = 5
+      visible_messages           = 10
+      queue_name                 = local.sqs_queue_name
+      scale_in_cooldown_seconds  = 60
+      scale_out_cooldown_seconds = 60
+    }
+  }
 }
 
 terraform {
diff --git a/infra/modules/aws/_shared/lambda/locals.tf b/infra/modules/aws/_shared/lambda/locals.tf
index fac9ddb..1a7ae80 100644
--- a/infra/modules/aws/_shared/lambda/locals.tf
+++ b/infra/modules/aws/_shared/lambda/locals.tf
@@ -44,4 +44,9 @@ locals {
   pc_trigger_percent             = try(var.provisioned_config.auto_scale.trigger_percent, 70) / 100
   pc_sqs_target_visible_messages = try(var.provisioned_config.sqs_scale.visible_messages, 0)
   pc_sqs_queue_name              = try(var.provisioned_config.sqs_scale.queue_name, "")
+
+  codedeploy_alarm_tags = {
+    for idx, alarm in var.codedeploy_alarm_names :
+    "CodeDeployAlarm${idx + 1}" => alarm # idx is 0-based, so tags are CodeDeployAlarm1, CodeDeployAlarm2, ... each holding one alarm name
+  }
 }
diff --git a/infra/modules/aws/_shared/lambda/main.tf b/infra/modules/aws/_shared/lambda/main.tf
index e91b01a..8f16444 100644
--- a/infra/modules/aws/_shared/lambda/main.tf
+++ b/infra/modules/aws/_shared/lambda/main.tf
@@ -49,11 +49,14 @@ resource "aws_lambda_function" "lambda" {
   }
 
   # tags for identifying the code deploy app and its deployment config. Used in CI/CD pipelines.
-  tags = {
-    CodeDeployApplication = aws_codedeploy_app.app.name
-    CodeDeployGroup       = aws_codedeploy_deployment_group.dg.deployment_group_name
-    DeploymentStrategy    = local.deploy_config.type
-  }
+  tags = merge(
+    {
+      CodeDeployApplication = aws_codedeploy_app.app.name
+      CodeDeployGroup       = aws_codedeploy_deployment_group.dg.deployment_group_name
+      DeploymentStrategy    = local.deploy_config.type
+    },
+    local.codedeploy_alarm_tags
+  )
 
   lifecycle {
     # Do not update on changes to the initial s3 file version
diff --git a/infra/modules/aws/api/main.tf b/infra/modules/aws/api/main.tf
index 93c91b1..1991bae 100644
--- a/infra/modules/aws/api/main.tf
+++ b/infra/modules/aws/api/main.tf
@@ -11,27 +11,13 @@ module "lambda_api" {
     DEBUG_DELAY_MS = 500
   }
 
-  deployment_config = {
-    strategy         = "canary"
-    percentage       = 10
-    interval_minutes = 3 # this is > the alarm evaluation period to ensure we catch the alarm if it triggers
-  }
+  deployment_config = var.deployment_config
 
   codedeploy_alarm_names = [
     local.api_5xx_alarm_name
   ]
 
-  provisioned_config = {
-    auto_scale = {
-      max                        = 2
-      min                        = 1 # always have 1 lambda ready to go
-      trigger_percent            = 20
-      scale_in_cooldown_seconds  = 60
-      scale_out_cooldown_seconds = 60
-    }
-
-    reserved_concurrency = 10 # limit the amount of concurrent executions to avoid throttling, but allow some bursting
-  }
+  provisioned_config = var.provisioned_config
 }
 
 resource "aws_apigatewayv2_api" "http_api" {
@@ -103,7 +89,7 @@ resource "aws_cloudwatch_metric_alarm" "api_5xx_rate" {
       namespace   = "AWS/ApiGateway"
       metric_name = local.apigw_http_5xx_metric
       stat        = "Sum"
-      period      = 60
+      period      = 60 # most aws metrics are emitted at 1-minute intervals, so using a shorter period can lead to more volatile alarms
 
       dimensions = {
         ApiId = aws_apigatewayv2_api.http_api.id
diff --git a/infra/modules/aws/api/variables.tf b/infra/modules/aws/api/variables.tf
index a75acdf..17d0029 100644
--- a/infra/modules/aws/api/variables.tf
+++ b/infra/modules/aws/api/variables.tf
@@ -15,6 +15,35 @@ variable "lambda_bucket" {
 }
 ### end of static vars set in root.hcl ###
 
+variable "deployment_config" {
+  type = object({
+    strategy         = string           # one of: all_at_once, canary, linear
+    percentage       = optional(number) # 1..99; required for canary/linear
+    interval_minutes = optional(number) # >= 1; required for canary/linear
+  })
+  description = "Traffic shifting: all_at_once | canary | linear"
+}
+
+variable "provisioned_config" {
+  type = object({
+    reserved_concurrency = optional(number) # omitted or 0 => no concurrency limit; > 0 => limited concurrency
+    fixed                = optional(number) # omitted or 0 => off; > 0 => fixed provisioned concurrency
+
+    auto_scale = optional(object({
+      max                        = number
+      min                        = number
+      trigger_percent            = optional(number)
+      scale_out_cooldown_seconds = optional(number)
+      scale_in_cooldown_seconds  = optional(number)
+    }))
+  })
+  default = {
+    reserved_concurrency = 1
+    fixed                = 0
+  }
+  description = "Either fixed provisioned concurrency (fixed) or autoscaled (auto_scale); omit/zero = none"
+}
+
 variable "api_5xx_alarm_threshold" {
   type        = number
   description = "The threshold for the API 5xx error rate alarm"
diff --git a/infra/modules/aws/consumer/main.tf b/infra/modules/aws/consumer/main.tf
index a2c8e3c..81ce76c 100644
--- a/infra/modules/aws/consumer/main.tf
+++ b/infra/modules/aws/consumer/main.tf
@@ -16,26 +16,13 @@ module "lambda_consumer" {
     module.sqs_queue.sqs_queue_read_policy_arn
   ]
 
-  deployment_config = {
-    strategy         = "canary"
-    percentage       = 10
-    interval_minutes = 3 # this is > the alarm evaluation period to ensure we catch the alarm if it triggers
-  }
+  deployment_config = var.deployment_config
 
   codedeploy_alarm_names = [
-    local.sqs_dlq_name
+    aws_cloudwatch_metric_alarm.dlq_new_messages.alarm_name
   ]
 
-  provisioned_config = {
-    sqs_scale = {
-      min                        = 1
-      max                        = 5
-      visible_messages           = 10
-      queue_name                 = module.sqs_queue.sqs_queue_name
-      scale_in_cooldown_seconds  = 60
-      scale_out_cooldown_seconds = 60
-    }
-  }
+  provisioned_config = var.provisioned_config
 }
 
 # configure a deadletter queue (DLQ) for the SQS queue used by the Lambda consumer
@@ -57,19 +44,20 @@ resource "aws_lambda_event_source_mapping" "sqs" {
   function_response_types = ["ReportBatchItemFailures"]
 }
 
-resource "aws_cloudwatch_metric_alarm" "dlq_messages_present" {
-  alarm_name        = local.sqs_dlq_name
-  alarm_description = "Messages present in DLQ ${local.sqs_dlq_name}"
+resource "aws_cloudwatch_metric_alarm" "dlq_new_messages" {
+  alarm_name        = "${local.sqs_dlq_name}-new-messages"
+  alarm_description = "New messages sent to DLQ ${local.sqs_dlq_name}"
   actions_enabled   = true
 
-  namespace           = "AWS/SQS"
-  metric_name         = "ApproximateNumberOfMessagesVisible"
-  statistic           = "Sum"
-  period              = 60
+  namespace   = "AWS/SQS"
+  metric_name = "NumberOfMessagesSent" # NOTE(review): SQS docs say this metric counts only messages sent to a DLQ directly, not ones moved there after failed processing — confirm this alarm can fire on real consumer failures
+  statistic   = "Sum"
+  period      = 60 # most AWS metrics are emitted at 1-minute intervals, so using a shorter period can lead to more volatile alarms
+
   evaluation_periods  = var.sqs_dlq_alarm_evaluation_periods
   datapoints_to_alarm = var.sqs_dlq_alarm_datapoints_to_alarm
 
-  comparison_operator = "GreaterThanThreshold"
+  comparison_operator = "GreaterThanOrEqualToThreshold"
   threshold           = var.sqs_dlq_alarm_threshold
   treat_missing_data  = "notBreaching"
 
diff --git a/infra/modules/aws/consumer/variables.tf b/infra/modules/aws/consumer/variables.tf
index 6968594..49b8449 100644
--- a/infra/modules/aws/consumer/variables.tf
+++ b/infra/modules/aws/consumer/variables.tf
@@ -15,9 +15,44 @@ variable "lambda_bucket" {
 }
 ### end of static vars set in root.hcl ###
 
+variable "sqs_queue_name" {
+  description = "The name of the SQS queue"
+  type        = string
+}
+
+variable "deployment_config" {
+  type = object({
+    strategy         = string           # one of: all_at_once, canary, linear
+    percentage       = optional(number) # 1..99; required for canary/linear
+    interval_minutes = optional(number) # >= 1; required for canary/linear
+  })
+  description = "Traffic shifting: all_at_once | canary | linear"
+}
+
+variable "provisioned_config" {
+  description = "Either fixed provisioned concurrency (fixed) or autoscaled (sqs_scale); omit/zero = none"
+  type = object({
+    fixed                = optional(number) # 0/omit = off, >0 = fixed PC
+    reserved_concurrency = optional(number) # 0/omit = no concurrency limit, >0 = limited concurrency
+
+    sqs_scale = optional(object({
+      min                        = number
+      max                        = number
+      visible_messages           = number
+      queue_name                 = optional(string)
+      scale_in_cooldown_seconds  = optional(number)
+      scale_out_cooldown_seconds = optional(number)
+    }))
+  })
+  default = {
+    fixed                = 0
+    reserved_concurrency = 1
+  }
+}
+
 variable "sqs_dlq_alarm_threshold" {
   type        = number
-  description = "The threshold for the SQS DLQ alarm"
+  description = "Number of messages sent to the DLQ within one alarm period at or above which the alarm triggers"
 }
 
 variable "sqs_dlq_alarm_evaluation_periods" {
diff --git a/justfile b/justfile
index 33e3da3..1746f67 100644
--- a/justfile
+++ b/justfile
@@ -242,6 +242,11 @@ lambda-upload-bundle:
 
 lambda-get-function-arn:
     #!/usr/bin/env bash
+    if [[ -z "${FUNCTION_NAME:-}" ]]; then
+        echo "❌ FUNCTION_NAME environment variable is not set." >&2 # stderr: callers capture this recipe's stdout via $(just lambda-get-function-arn)
+        exit 1
+    fi
+
     aws lambda get-function \
         --function-name $FUNCTION_NAME \
         --query 'Configuration.FunctionArn' \
@@ -266,6 +271,43 @@ lambda-get-code-deploy-group:
         --output text
 
 
+lambda-get-code-deploy-alarms:
+    #!/usr/bin/env bash
+    set -euo pipefail
+
+    # Print the function's CodeDeployAlarm<N> tag values as a compact JSON array,
+    # ordered numerically by N. The function is resolved through
+    # the lambda-get-function-arn recipe.
+    LAMBDA_ARN="$(just lambda-get-function-arn)"
+    TAGS_JSON="$(aws lambda list-tags --resource "$LAMBDA_ARN" --query 'Tags' --output json)"
+
+    jq -c '
+        to_entries
+        | map(select(.key | test("^CodeDeployAlarm[0-9]+$")))
+        | sort_by(.key | sub("^CodeDeployAlarm"; "") | tonumber)
+        | map(.value)
+      ' <<< "$TAGS_JSON"
+
+
+lambda-set-code-deploy-alarms:
+    #!/usr/bin/env bash
+    set -euo pipefail
+
+    ALARMS_JSON=$(just lambda-get-code-deploy-alarms)
+
+    ALARMS=$(jq -r '.[]' <<< "$ALARMS_JSON") # one alarm name per line
+
+    # Reset each alarm to OK; read line by line so names with spaces or glob characters stay intact
+    while IFS= read -r ALARM_NAME; do
+        if [[ -z "$ALARM_NAME" ]]; then continue; fi # empty list => nothing to reset
+        echo "Setting alarm to OK: $ALARM_NAME"
+        aws cloudwatch set-alarm-state \
+            --alarm-name "$ALARM_NAME" \
+            --state-value OK \
+            --state-reason "Reset by CI/CD" < /dev/null # keep aws from consuming the loop's stdin
+    done <<< "$ALARMS"
+
+
 lambda-deploy:
     #!/usr/bin/env bash
     set -euo pipefail
@@ -294,7 +336,10 @@ lambda-deploy:
         --s3-location bucket=$BUCKET_NAME,key=$APP_SPEC_KEY,bundleType=zip \
         --query "deploymentId" --output text)
 
-    echo "🚀 Started deployment: $DEPLOYMENT_ID"
+    echo "🚀 Deployment started: $DEPLOYMENT_ID"
+    echo "🏷️ CodeDeploy App: $CODE_DEPLOY_APP_NAME | Group: $CODE_DEPLOY_GROUP_NAME"
+    echo "📦 AppSpec artifact: s3://$BUCKET_NAME/$APP_SPEC_KEY"
+    echo "⏳ Monitoring deployment status…"
 
     if [[ -z "$DEPLOYMENT_ID" || "$DEPLOYMENT_ID" == "None" ]]; then
         echo "❌ Failed to create deployment — no deployment ID returned."
@@ -414,8 +459,9 @@ test-send-dlq-messages:
 
     echo "Sending messages to SQS DLQ at $SQS_DLQ_QUEUE_URL..."
 
-    for i in {1..10}; do
+    for i in {1..180}; do
         aws sqs send-message --region $AWS_REGION --queue-url "$SQS_DLQ_QUEUE_URL" --message-body "Test message $i"
+        sleep 1
     done
 
     echo "Finished sending messages."
\ No newline at end of file