Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions .github/workflows/deploy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,14 @@ jobs:
aws_oidc_role_arn: ${{ env.AWS_OIDC_ROLE_ARN }}
just_action: lambda-upload-bundle

# Runs the `lambda-set-code-deploy-alarms` just recipe for the Lambda function
# reported by the get_infra_detail step, ahead of the "Run CodeDeploy" step.
# NOTE(review): the recipe body is not visible here - presumably it resets the
# function's CodeDeploy alarms to OK so a stale ALARM state cannot fail the
# deployment; confirm against the justfile.
- name: Set Alarms to OK for CodeDeploy (if applicable)
uses: chrispsheehan/just-aws-oidc-action@0.3.0
env:
FUNCTION_NAME: ${{ steps.get_infra_detail.outputs.lambda_function_name }}
with:
aws_oidc_role_arn: ${{ env.AWS_OIDC_ROLE_ARN }}
just_action: lambda-set-code-deploy-alarms

- name: Run CodeDeploy
uses: chrispsheehan/just-aws-oidc-action@0.3.0
env:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/infra.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ jobs:
needs: oidc
runs-on: ubuntu-latest
strategy:
fail-fast: true
fail-fast: false # this is to prevent terraform lock issues
matrix:
value: ${{ fromJson(inputs.matrix) }}
steps:
Expand Down
18 changes: 18 additions & 0 deletions infra/live/dev/aws/api/terragrunt.hcl
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,24 @@ inputs = {
api_5xx_alarm_threshold = 20.0
api_5xx_alarm_evaluation_periods = 1
api_5xx_alarm_datapoints_to_alarm = 1

deployment_config = {
strategy = "canary"
percentage = 10
interval_minutes = 3
}

provisioned_config = {
auto_scale = {
max = 2
min = 1
trigger_percent = 20
scale_in_cooldown_seconds = 60
scale_out_cooldown_seconds = 60
}

reserved_concurrency = 10
}
}

terraform {
Expand Down
25 changes: 24 additions & 1 deletion infra/live/dev/aws/consumer/terragrunt.hcl
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,33 @@ include {
path = find_in_parent_folders("root.hcl")
}

locals {
sqs_queue_name = "dev-serverless-consumer-queue"
}

inputs = {
sqs_dlq_alarm_threshold = 5
sqs_queue_name = local.sqs_queue_name

sqs_dlq_alarm_threshold = 1 # fail when any messages are in the DLQ (quick fail for testing)
sqs_dlq_alarm_evaluation_periods = 1
sqs_dlq_alarm_datapoints_to_alarm = 1

deployment_config = {
strategy = "canary"
percentage = 50
interval_minutes = 3 # this should be > the CloudWatch alarm evaluation period to ensure we catch the alarm if it triggers
}

provisioned_config = {
sqs_scale = {
min = 1
max = 5
visible_messages = 10
queue_name = local.sqs_queue_name
scale_in_cooldown_seconds = 60
scale_out_cooldown_seconds = 60
}
}
}

terraform {
Expand Down
22 changes: 20 additions & 2 deletions infra/live/prod/aws/api/terragrunt.hcl
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,26 @@ include {

inputs = {
api_5xx_alarm_threshold = 5.0
api_5xx_alarm_evaluation_periods = 1
api_5xx_alarm_datapoints_to_alarm = 1
api_5xx_alarm_evaluation_periods = 3
api_5xx_alarm_datapoints_to_alarm = 3

deployment_config = {
strategy = "canary"
percentage = 10
interval_minutes = 5
}

provisioned_config = {
auto_scale = {
max = 2
min = 1
trigger_percent = 20
scale_in_cooldown_seconds = 60
scale_out_cooldown_seconds = 60
}

reserved_concurrency = 10
}
}

terraform {
Expand Down
29 changes: 26 additions & 3 deletions infra/live/prod/aws/consumer/terragrunt.hcl
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,33 @@ include {
path = find_in_parent_folders("root.hcl")
}

locals {
sqs_queue_name = "serverless-consumer-queue"
}

inputs = {
sqs_dlq_alarm_threshold = 5
sqs_dlq_alarm_evaluation_periods = 1
sqs_dlq_alarm_datapoints_to_alarm = 1
sqs_queue_name = local.sqs_queue_name

sqs_dlq_alarm_threshold = 5 # fail when there are 5 messages in the DLQ
sqs_dlq_alarm_evaluation_periods = 3
sqs_dlq_alarm_datapoints_to_alarm = 3

# Canary rollout for the prod consumer: shift 10% of traffic first, then wait
# interval_minutes before shifting the rest.
deployment_config = {
  strategy   = "canary"
  percentage = 10
  # Must be strictly greater than the DLQ alarm's evaluation window so a
  # triggered alarm is caught before the remaining traffic is shifted. The alarm
  # above needs 3 of 3 one-minute datapoints (3 minutes), so 3 was not enough;
  # 5 matches the prod api configuration.
  interval_minutes = 5
}

provisioned_config = {
sqs_scale = {
min = 1
max = 5
visible_messages = 10
queue_name = local.sqs_queue_name
scale_in_cooldown_seconds = 60
scale_out_cooldown_seconds = 60
}
}
}

terraform {
Expand Down
5 changes: 5 additions & 0 deletions infra/modules/aws/_shared/lambda/locals.tf
Original file line number Diff line number Diff line change
Expand Up @@ -44,4 +44,9 @@ locals {
pc_trigger_percent = try(var.provisioned_config.auto_scale.trigger_percent, 70) / 100
pc_sqs_target_visible_messages = try(var.provisioned_config.sqs_scale.visible_messages, 0)
pc_sqs_queue_name = try(var.provisioned_config.sqs_scale.queue_name, "")

# Map of tag key => alarm name, one entry per element of
# var.codedeploy_alarm_names, keyed "CodeDeployAlarm1", "CodeDeployAlarm2", ...
# (1-based: idx + 1). Merged into the Lambda function's tags next to the
# CodeDeploy application/group tags.
# NOTE(review): alarm names become tag values, so they must satisfy AWS tag
# value limits - confirm for long or unusual alarm names.
codedeploy_alarm_tags = {
for idx, alarm in var.codedeploy_alarm_names :
"CodeDeployAlarm${idx + 1}" => alarm
}
}
13 changes: 8 additions & 5 deletions infra/modules/aws/_shared/lambda/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -49,11 +49,14 @@ resource "aws_lambda_function" "lambda" {
}

# tags for identifying the code deploy app and its deployment config. Used in CI/CD pipelines.
tags = {
CodeDeployApplication = aws_codedeploy_app.app.name
CodeDeployGroup = aws_codedeploy_deployment_group.dg.deployment_group_name
DeploymentStrategy = local.deploy_config.type
}
tags = merge(
{
CodeDeployApplication = aws_codedeploy_app.app.name
CodeDeployGroup = aws_codedeploy_deployment_group.dg.deployment_group_name
DeploymentStrategy = local.deploy_config.type
},
local.codedeploy_alarm_tags
)

lifecycle {
# Do not update on changes to the initial s3 file version
Expand Down
20 changes: 3 additions & 17 deletions infra/modules/aws/api/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -11,27 +11,13 @@ module "lambda_api" {
DEBUG_DELAY_MS = 500
}

deployment_config = {
strategy = "canary"
percentage = 10
interval_minutes = 3 # this is > the alarm evaluation period to ensure we catch the alarm if it triggers
}
deployment_config = var.deployment_config

codedeploy_alarm_names = [
local.api_5xx_alarm_name
]

provisioned_config = {
auto_scale = {
max = 2
min = 1 # always have 1 lambda ready to go
trigger_percent = 20
scale_in_cooldown_seconds = 60
scale_out_cooldown_seconds = 60
}

reserved_concurrency = 10 # limit the amount of concurrent executions to avoid throttling, but allow some bursting
}
provisioned_config = var.provisioned_config
}

resource "aws_apigatewayv2_api" "http_api" {
Expand Down Expand Up @@ -103,7 +89,7 @@ resource "aws_cloudwatch_metric_alarm" "api_5xx_rate" {
namespace = "AWS/ApiGateway"
metric_name = local.apigw_http_5xx_metric
stat = "Sum"
period = 60
period = 60 # most aws metrics are emitted at 1-minute intervals, so using a shorter period can lead to more volatile alarms

dimensions = {
ApiId = aws_apigatewayv2_api.http_api.id
Expand Down
29 changes: 29 additions & 0 deletions infra/modules/aws/api/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,35 @@ variable "lambda_bucket" {
}
### end of static vars set in root.hcl ###

# Traffic-shifting settings for the API Lambda deployment. The value is passed
# straight through to module "lambda_api" (deployment_config = var.deployment_config)
# and has no default, so every environment must set it explicitly.
variable "deployment_config" {
  description = "Traffic shifting: all_at_once | canary | linear"
  type = object({
    strategy         = string           # all_at_once | canary | linear
    percentage       = optional(number) # 1..99 (req for canary/linear)
    interval_minutes = optional(number) # >=1 (req for canary/linear)
  })

  # Reject unknown strategies at plan time instead of deep inside the module.
  validation {
    condition     = contains(["all_at_once", "canary", "linear"], var.deployment_config.strategy)
    error_message = "The deployment_config.strategy value must be one of: all_at_once, canary, linear."
  }

  # percentage and interval_minutes are only optional for all_at_once.
  validation {
    condition = var.deployment_config.strategy == "all_at_once" || (
      var.deployment_config.percentage != null &&
      var.deployment_config.interval_minutes != null
    )
    error_message = "The deployment_config.percentage and deployment_config.interval_minutes values are required for the canary and linear strategies."
  }
}

# Concurrency settings for the API Lambda, passed through to module "lambda_api"
# (provisioned_config = var.provisioned_config). Callers choose between a fixed
# provisioned-concurrency value and the auto_scale object.
# NOTE(review): when a caller omits this variable the default below sets
# reserved_concurrency = 1, which per the field comment (">0 = limited
# concurrency") limits the function to a single concurrent execution - confirm
# that this is the intended fallback.
variable "provisioned_config" {
description = "Either fixed provisioned concurrency (fixed) or autoscaled (auto_scale); omit/zero = none"
type = object({
fixed = optional(number) # 0/omit = off, >0 = fixed PC
reserved_concurrency = optional(number) # 0/omit = no concurrency limit, >0 = limited concurrency

# Autoscaled provisioned concurrency: min/max are mandatory, the rest optional.
auto_scale = optional(object({
min = number
max = number
trigger_percent = optional(number)
scale_in_cooldown_seconds = optional(number)
scale_out_cooldown_seconds = optional(number)
}))
})
default = {
fixed = 0
reserved_concurrency = 1
}
}

variable "api_5xx_alarm_threshold" {
type = number
description = "The threshold for the API 5xx error rate alarm"
Expand Down
36 changes: 12 additions & 24 deletions infra/modules/aws/consumer/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -16,26 +16,13 @@ module "lambda_consumer" {
module.sqs_queue.sqs_queue_read_policy_arn
]

deployment_config = {
strategy = "canary"
percentage = 10
interval_minutes = 3 # this is > the alarm evaluation period to ensure we catch the alarm if it triggers
}
deployment_config = var.deployment_config

codedeploy_alarm_names = [
local.sqs_dlq_name
aws_cloudwatch_metric_alarm.dlq_new_messages.alarm_name
]

provisioned_config = {
sqs_scale = {
min = 1
max = 5
visible_messages = 10
queue_name = module.sqs_queue.sqs_queue_name
scale_in_cooldown_seconds = 60
scale_out_cooldown_seconds = 60
}
}
provisioned_config = var.provisioned_config
}

# configure a deadletter queue (DLQ) for the SQS queue used by the Lambda consumer
Expand All @@ -57,19 +44,20 @@ resource "aws_lambda_event_source_mapping" "sqs" {
function_response_types = ["ReportBatchItemFailures"]
}

resource "aws_cloudwatch_metric_alarm" "dlq_messages_present" {
alarm_name = local.sqs_dlq_name
alarm_description = "Messages present in DLQ ${local.sqs_dlq_name}"
resource "aws_cloudwatch_metric_alarm" "dlq_new_messages" {
alarm_name = "${local.sqs_dlq_name}-new-messages"
alarm_description = "New messages sent to DLQ ${local.sqs_dlq_name}"
actions_enabled = true

namespace = "AWS/SQS"
metric_name = "ApproximateNumberOfMessagesVisible"
statistic = "Sum"
period = 60
# NOTE(review): the Amazon SQS CloudWatch metrics reference states that
# NumberOfMessagesSent counts messages sent to a dead-letter queue manually, but
# NOT messages moved there after a failed processing attempt (redrive policy).
# If this DLQ is only fed by redrive, the alarm may never leave OK - confirm, and
# consider ApproximateNumberOfMessagesVisible if so.
namespace = "AWS/SQS"
metric_name = "NumberOfMessagesSent"
statistic = "Sum"
period = 60 # most aws metrics are emitted at 1-minute intervals, so using a shorter period can lead to more volatile alarms

evaluation_periods = var.sqs_dlq_alarm_evaluation_periods
datapoints_to_alarm = var.sqs_dlq_alarm_datapoints_to_alarm

comparison_operator = "GreaterThanThreshold"
comparison_operator = "GreaterThanOrEqualToThreshold"
threshold = var.sqs_dlq_alarm_threshold
treat_missing_data = "notBreaching"

Expand Down
37 changes: 36 additions & 1 deletion infra/modules/aws/consumer/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,44 @@ variable "lambda_bucket" {
}
### end of static vars set in root.hcl ###

# Name of the consumer's SQS queue. The dev and prod terragrunt inputs set it
# from local.sqs_queue_name and reuse the same local for sqs_scale.queue_name.
variable "sqs_queue_name" {
type = string
description = "The name of the SQS queue"
}

# Traffic-shifting settings for the consumer Lambda deployment. The value is
# passed straight through to module "lambda_consumer"
# (deployment_config = var.deployment_config) and has no default, so every
# environment must set it explicitly.
variable "deployment_config" {
  description = "Traffic shifting: all_at_once | canary | linear"
  type = object({
    strategy         = string           # all_at_once | canary | linear
    percentage       = optional(number) # 1..99 (req for canary/linear)
    interval_minutes = optional(number) # >=1 (req for canary/linear)
  })

  # Reject unknown strategies at plan time instead of deep inside the module.
  validation {
    condition     = contains(["all_at_once", "canary", "linear"], var.deployment_config.strategy)
    error_message = "The deployment_config.strategy value must be one of: all_at_once, canary, linear."
  }

  # percentage and interval_minutes are only optional for all_at_once.
  validation {
    condition = var.deployment_config.strategy == "all_at_once" || (
      var.deployment_config.percentage != null &&
      var.deployment_config.interval_minutes != null
    )
    error_message = "The deployment_config.percentage and deployment_config.interval_minutes values are required for the canary and linear strategies."
  }
}

# Concurrency settings for the consumer Lambda, passed through to module
# "lambda_consumer" (provisioned_config = var.provisioned_config). Callers choose
# between a fixed provisioned-concurrency value and the sqs_scale object.
# NOTE(review): when a caller omits this variable the default below sets
# reserved_concurrency = 1, which per the field comment (">0 = limited
# concurrency") limits the function to a single concurrent execution - confirm
# that this is the intended fallback.
variable "provisioned_config" {
description = "Either fixed provisioned concurrency (fixed) or autoscaled (auto_scale); omit/zero = none"
type = object({
fixed = optional(number) # 0/omit = off, >0 = fixed PC
reserved_concurrency = optional(number) # 0/omit = no concurrency limit, >0 = limited concurrency

# Queue-driven scaling: min, max and visible_messages are mandatory.
# NOTE(review): the description above still says "auto_scale", but this module's
# type only declares sqs_scale - confirm which wording is intended.
sqs_scale = optional(object({
min = number
max = number
visible_messages = number
queue_name = optional(string)
scale_in_cooldown_seconds = optional(number)
scale_out_cooldown_seconds = optional(number)
}))
})
default = {
fixed = 0
reserved_concurrency = 1
}
}

variable "sqs_dlq_alarm_threshold" {
type = number
description = "The threshold for the SQS DLQ alarm"
description = "Number of messages sent to the DLQ within one 60-second alarm period at or above which the alarm triggers"
}

variable "sqs_dlq_alarm_evaluation_periods" {
Expand Down
Loading