diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 0599860..2dace32 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -104,6 +104,14 @@ jobs: aws_oidc_role_arn: ${{ env.AWS_OIDC_ROLE_ARN }} just_action: lambda-upload-bundle + - name: Set Alarms to OK for CodeDeploy (if applicable) + uses: chrispsheehan/just-aws-oidc-action@0.3.0 + env: + FUNCTION_NAME: ${{ steps.get_infra_detail.outputs.lambda_function_name }} + with: + aws_oidc_role_arn: ${{ env.AWS_OIDC_ROLE_ARN }} + just_action: lambda-set-code-deploy-alarms + - name: Run CodeDeploy uses: chrispsheehan/just-aws-oidc-action@0.3.0 env: diff --git a/.github/workflows/infra.yml b/.github/workflows/infra.yml index 21f2596..9012188 100644 --- a/.github/workflows/infra.yml +++ b/.github/workflows/infra.yml @@ -48,7 +48,7 @@ jobs: needs: oidc runs-on: ubuntu-latest strategy: - fail-fast: true + fail-fast: false # this is to prevent terraform lock issues matrix: value: ${{ fromJson(inputs.matrix) }} steps: diff --git a/infra/live/dev/aws/api/terragrunt.hcl b/infra/live/dev/aws/api/terragrunt.hcl index 37e4cc2..f3cef8d 100644 --- a/infra/live/dev/aws/api/terragrunt.hcl +++ b/infra/live/dev/aws/api/terragrunt.hcl @@ -6,6 +6,24 @@ inputs = { api_5xx_alarm_threshold = 20.0 api_5xx_alarm_evaluation_periods = 1 api_5xx_alarm_datapoints_to_alarm = 1 + + deployment_config = { + strategy = "canary" + percentage = 10 + interval_minutes = 3 + } + + provisioned_config = { + auto_scale = { + max = 2 + min = 1 + trigger_percent = 20 + scale_in_cooldown_seconds = 60 + scale_out_cooldown_seconds = 60 + } + + reserved_concurrency = 10 + } } terraform { diff --git a/infra/live/dev/aws/consumer/terragrunt.hcl b/infra/live/dev/aws/consumer/terragrunt.hcl index 9f0160a..bddf4dc 100644 --- a/infra/live/dev/aws/consumer/terragrunt.hcl +++ b/infra/live/dev/aws/consumer/terragrunt.hcl @@ -2,10 +2,33 @@ include { path = find_in_parent_folders("root.hcl") } +locals { + sqs_queue_name = 
"dev-serverless-consumer-queue" +} + inputs = { - sqs_dlq_alarm_threshold = 5 + sqs_queue_name = local.sqs_queue_name + + sqs_dlq_alarm_threshold = 1 # fail when any messages are in the DLQ (quick fail for testing) sqs_dlq_alarm_evaluation_periods = 1 sqs_dlq_alarm_datapoints_to_alarm = 1 + + deployment_config = { + strategy = "canary" + percentage = 50 + interval_minutes = 3 # this should be > the CloudWatch alarm evaluation period to ensure we catch the alarm if it triggers + } + + provisioned_config = { + sqs_scale = { + min = 1 + max = 5 + visible_messages = 10 + queue_name = local.sqs_queue_name + scale_in_cooldown_seconds = 60 + scale_out_cooldown_seconds = 60 + } + } } terraform { diff --git a/infra/live/prod/aws/api/terragrunt.hcl b/infra/live/prod/aws/api/terragrunt.hcl index 35f660c..ee0ef64 100644 --- a/infra/live/prod/aws/api/terragrunt.hcl +++ b/infra/live/prod/aws/api/terragrunt.hcl @@ -4,8 +4,26 @@ include { inputs = { api_5xx_alarm_threshold = 5.0 - api_5xx_alarm_evaluation_periods = 1 - api_5xx_alarm_datapoints_to_alarm = 1 + api_5xx_alarm_evaluation_periods = 3 + api_5xx_alarm_datapoints_to_alarm = 3 + + deployment_config = { + strategy = "canary" + percentage = 10 + interval_minutes = 5 + } + + provisioned_config = { + auto_scale = { + max = 2 + min = 1 + trigger_percent = 20 + scale_in_cooldown_seconds = 60 + scale_out_cooldown_seconds = 60 + } + + reserved_concurrency = 10 + } } terraform { diff --git a/infra/live/prod/aws/consumer/terragrunt.hcl b/infra/live/prod/aws/consumer/terragrunt.hcl index 9f0160a..b0930ac 100644 --- a/infra/live/prod/aws/consumer/terragrunt.hcl +++ b/infra/live/prod/aws/consumer/terragrunt.hcl @@ -2,10 +2,33 @@ include { path = find_in_parent_folders("root.hcl") } +locals { + sqs_queue_name = "serverless-consumer-queue" +} + inputs = { - sqs_dlq_alarm_threshold = 5 - sqs_dlq_alarm_evaluation_periods = 1 - sqs_dlq_alarm_datapoints_to_alarm = 1 + sqs_queue_name = local.sqs_queue_name + + sqs_dlq_alarm_threshold = 5 # 
fail when there are 5 messages in the DLQ + sqs_dlq_alarm_evaluation_periods = 3 + sqs_dlq_alarm_datapoints_to_alarm = 3 + + deployment_config = { + strategy = "canary" + percentage = 10 + interval_minutes = 5 # must be > the CloudWatch alarm evaluation window (3 one-minute evaluation periods here) to ensure we catch the alarm if it triggers + } + + provisioned_config = { + sqs_scale = { + min = 1 + max = 5 + visible_messages = 10 + queue_name = local.sqs_queue_name + scale_in_cooldown_seconds = 60 + scale_out_cooldown_seconds = 60 + } + } } terraform { diff --git a/infra/modules/aws/_shared/lambda/locals.tf b/infra/modules/aws/_shared/lambda/locals.tf index fac9ddb..1a7ae80 100644 --- a/infra/modules/aws/_shared/lambda/locals.tf +++ b/infra/modules/aws/_shared/lambda/locals.tf @@ -44,4 +44,9 @@ locals { pc_trigger_percent = try(var.provisioned_config.auto_scale.trigger_percent, 70) / 100 pc_sqs_target_visible_messages = try(var.provisioned_config.sqs_scale.visible_messages, 0) pc_sqs_queue_name = try(var.provisioned_config.sqs_scale.queue_name, "") + + codedeploy_alarm_tags = { + for idx, alarm in var.codedeploy_alarm_names : + "CodeDeployAlarm${idx + 1}" => alarm + } } diff --git a/infra/modules/aws/_shared/lambda/main.tf b/infra/modules/aws/_shared/lambda/main.tf index e91b01a..8f16444 100644 --- a/infra/modules/aws/_shared/lambda/main.tf +++ b/infra/modules/aws/_shared/lambda/main.tf @@ -49,11 +49,14 @@ resource "aws_lambda_function" "lambda" { } # tags for identifying the code deploy app and its deployment config. Used in CI/CD pipelines.
- tags = { - CodeDeployApplication = aws_codedeploy_app.app.name - CodeDeployGroup = aws_codedeploy_deployment_group.dg.deployment_group_name - DeploymentStrategy = local.deploy_config.type - } + tags = merge( + { + CodeDeployApplication = aws_codedeploy_app.app.name + CodeDeployGroup = aws_codedeploy_deployment_group.dg.deployment_group_name + DeploymentStrategy = local.deploy_config.type + }, + local.codedeploy_alarm_tags + ) lifecycle { # Do not update on changes to the initial s3 file version diff --git a/infra/modules/aws/api/main.tf b/infra/modules/aws/api/main.tf index 93c91b1..1991bae 100644 --- a/infra/modules/aws/api/main.tf +++ b/infra/modules/aws/api/main.tf @@ -11,27 +11,13 @@ module "lambda_api" { DEBUG_DELAY_MS = 500 } - deployment_config = { - strategy = "canary" - percentage = 10 - interval_minutes = 3 # this is > the alarm evaluation period to ensure we catch the alarm if it triggers - } + deployment_config = var.deployment_config codedeploy_alarm_names = [ local.api_5xx_alarm_name ] - provisioned_config = { - auto_scale = { - max = 2 - min = 1 # always have 1 lambda ready to go - trigger_percent = 20 - scale_in_cooldown_seconds = 60 - scale_out_cooldown_seconds = 60 - } - - reserved_concurrency = 10 # limit the amount of concurrent executions to avoid throttling, but allow some bursting - } + provisioned_config = var.provisioned_config } resource "aws_apigatewayv2_api" "http_api" { @@ -103,7 +89,7 @@ resource "aws_cloudwatch_metric_alarm" "api_5xx_rate" { namespace = "AWS/ApiGateway" metric_name = local.apigw_http_5xx_metric stat = "Sum" - period = 60 + period = 60 # most aws metrics are emitted at 1-minute intervals, so using a shorter period can lead to more volatile alarms dimensions = { ApiId = aws_apigatewayv2_api.http_api.id diff --git a/infra/modules/aws/api/variables.tf b/infra/modules/aws/api/variables.tf index a75acdf..17d0029 100644 --- a/infra/modules/aws/api/variables.tf +++ b/infra/modules/aws/api/variables.tf @@ -15,6 +15,35 @@ 
variable "lambda_bucket" { } ### end of static vars set in root.hcl ### +variable "deployment_config" { + description = "Traffic shifting: all_at_once | canary | linear" + type = object({ + strategy = string # all_at_once | canary | linear + percentage = optional(number) # 1..99 (req for canary/linear) + interval_minutes = optional(number) # >=1 (req for canary/linear) + }) +} + +variable "provisioned_config" { + description = "Either fixed provisioned concurrency (fixed) or autoscaled (auto_scale); omit/zero = none" + type = object({ + fixed = optional(number) # 0/omit = off, >0 = fixed PC + reserved_concurrency = optional(number) # 0/omit = no concurrency limit, >0 = limited concurrency + + auto_scale = optional(object({ + min = number + max = number + trigger_percent = optional(number) + scale_in_cooldown_seconds = optional(number) + scale_out_cooldown_seconds = optional(number) + })) + }) + default = { + fixed = 0 + reserved_concurrency = 1 + } +} + variable "api_5xx_alarm_threshold" { type = number description = "The threshold for the API 5xx error rate alarm" diff --git a/infra/modules/aws/consumer/main.tf b/infra/modules/aws/consumer/main.tf index a2c8e3c..81ce76c 100644 --- a/infra/modules/aws/consumer/main.tf +++ b/infra/modules/aws/consumer/main.tf @@ -16,26 +16,13 @@ module "lambda_consumer" { module.sqs_queue.sqs_queue_read_policy_arn ] - deployment_config = { - strategy = "canary" - percentage = 10 - interval_minutes = 3 # this is > the alarm evaluation period to ensure we catch the alarm if it triggers - } + deployment_config = var.deployment_config codedeploy_alarm_names = [ - local.sqs_dlq_name + aws_cloudwatch_metric_alarm.dlq_new_messages.alarm_name ] - provisioned_config = { - sqs_scale = { - min = 1 - max = 5 - visible_messages = 10 - queue_name = module.sqs_queue.sqs_queue_name - scale_in_cooldown_seconds = 60 - scale_out_cooldown_seconds = 60 - } - } + provisioned_config = var.provisioned_config } # configure a deadletter queue (DLQ) for the 
SQS queue used by the Lambda consumer @@ -57,19 +44,20 @@ resource "aws_lambda_event_source_mapping" "sqs" { function_response_types = ["ReportBatchItemFailures"] } -resource "aws_cloudwatch_metric_alarm" "dlq_messages_present" { - alarm_name = local.sqs_dlq_name - alarm_description = "Messages present in DLQ ${local.sqs_dlq_name}" +resource "aws_cloudwatch_metric_alarm" "dlq_new_messages" { + alarm_name = "${local.sqs_dlq_name}-new-messages" + alarm_description = "New messages sent to DLQ ${local.sqs_dlq_name}" actions_enabled = true - namespace = "AWS/SQS" - metric_name = "ApproximateNumberOfMessagesVisible" - statistic = "Sum" - period = 60 + namespace = "AWS/SQS" + metric_name = "NumberOfMessagesSent" # NOTE(review): the SQS CloudWatch docs state NumberOfMessagesSent counts only messages sent to a DLQ manually, not messages moved there after failed processing; confirm this alarm fires on real redrives + statistic = "Sum" + period = 60 # most aws metrics are emitted at 1-minute intervals, so using a shorter period can lead to more volatile alarms + evaluation_periods = var.sqs_dlq_alarm_evaluation_periods datapoints_to_alarm = var.sqs_dlq_alarm_datapoints_to_alarm - comparison_operator = "GreaterThanThreshold" + comparison_operator = "GreaterThanOrEqualToThreshold" threshold = var.sqs_dlq_alarm_threshold treat_missing_data = "notBreaching" diff --git a/infra/modules/aws/consumer/variables.tf b/infra/modules/aws/consumer/variables.tf index 6968594..49b8449 100644 --- a/infra/modules/aws/consumer/variables.tf +++ b/infra/modules/aws/consumer/variables.tf @@ -15,9 +15,44 @@ variable "lambda_bucket" { } ### end of static vars set in root.hcl ### +variable "sqs_queue_name" { + type = string + description = "The name of the SQS queue" +} + +variable "deployment_config" { + description = "Traffic shifting: all_at_once | canary | linear" + type = object({ + strategy = string # all_at_once | canary | linear + percentage = optional(number) # 1..99 (req for canary/linear) + interval_minutes = optional(number) # >=1 (req for canary/linear) + }) +} + +variable "provisioned_config" { + description = "Either fixed provisioned concurrency (fixed) or autoscaled (sqs_scale);
omit/zero = none" + type = object({ + fixed = optional(number) # 0/omit = off, >0 = fixed PC + reserved_concurrency = optional(number) # 0/omit = no concurrency limit, >0 = limited concurrency + + sqs_scale = optional(object({ + min = number + max = number + visible_messages = number + queue_name = optional(string) + scale_in_cooldown_seconds = optional(number) + scale_out_cooldown_seconds = optional(number) + })) + }) + default = { + fixed = 0 + reserved_concurrency = 1 + } +} + variable "sqs_dlq_alarm_threshold" { type = number - description = "The threshold for the SQS DLQ alarm" + description = "Number of messages sent to the DLQ within one alarm period at or above which the alarm triggers, e.g. 1 to fail on any message" } variable "sqs_dlq_alarm_evaluation_periods" { diff --git a/justfile b/justfile index 33e3da3..1746f67 100644 --- a/justfile +++ b/justfile @@ -242,6 +242,11 @@ lambda-upload-bundle: lambda-get-function-arn: #!/usr/bin/env bash + if [[ -z "$FUNCTION_NAME" ]]; then + echo "❌ FUNCTION_NAME environment variable is not set."
+ exit 1 + fi + aws lambda get-function \ --function-name $FUNCTION_NAME \ --query 'Configuration.FunctionArn' \ @@ -266,6 +271,43 @@ lambda-get-code-deploy-group: --output text +lambda-get-code-deploy-alarms: + #!/usr/bin/env bash + set -euo pipefail + + FUNCTION_ARN=$(just lambda-get-function-arn) + + aws lambda list-tags \ + --resource "$FUNCTION_ARN" \ + --query 'Tags' \ + --output json \ + | jq -c ' + to_entries + | map(select(.key | test("^CodeDeployAlarm[0-9]+$"))) + | sort_by(.key | sub("^CodeDeployAlarm"; "") | tonumber) + | map(.value) + ' + + +lambda-set-code-deploy-alarms: + #!/usr/bin/env bash + set -euo pipefail + + ALARMS_JSON=$(just lambda-get-code-deploy-alarms) + + # Convert JSON array to a newline-separated list (one alarm name per line) + ALARMS=$(echo "$ALARMS_JSON" | jq -r '.[]') + + # Reset each alarm to OK + for ALARM_NAME in $ALARMS; do + echo "Setting alarm to OK: $ALARM_NAME" + aws cloudwatch set-alarm-state \ + --alarm-name "$ALARM_NAME" \ + --state-value OK \ + --state-reason "Reset by CI/CD" + done + + lambda-deploy: #!/usr/bin/env bash set -euo pipefail @@ -294,7 +336,10 @@ lambda-deploy: --s3-location bucket=$BUCKET_NAME,key=$APP_SPEC_KEY,bundleType=zip \ --query "deploymentId" --output text) - echo "🚀 Started deployment: $DEPLOYMENT_ID" + echo "🚀 Deployment started: $DEPLOYMENT_ID" + echo "🏷️ CodeDeploy App: $CODE_DEPLOY_APP_NAME | Group: $CODE_DEPLOY_GROUP_NAME" + echo "📦 AppSpec artifact: s3://$BUCKET_NAME/$APP_SPEC_KEY" + echo "⏳ Monitoring deployment status…" if [[ -z "$DEPLOYMENT_ID" || "$DEPLOYMENT_ID" == "None" ]]; then echo "❌ Failed to create deployment — no deployment ID returned." @@ -414,8 +459,9 @@ test-send-dlq-messages: echo "Sending messages to SQS DLQ at $SQS_DLQ_QUEUE_URL..." - for i in {1..10}; do + for i in {1..180}; do aws sqs send-message --region $AWS_REGION --queue-url "$SQS_DLQ_QUEUE_URL" --message-body "Test message $i" + sleep 1 done echo "Finished sending messages." \ No newline at end of file