From 7e11ef1f0cff238d58c12e6bc4b8cadc1fd4566d Mon Sep 17 00:00:00 2001 From: Seamus McShane Date: Mon, 21 Oct 2024 10:05:55 +0100 Subject: [PATCH] LIME-1208 - Align DL F.E scaling with HMRC KBV F.E with MinCapacity at 4 - MinCapacity 4 aimed at handling large initial burst traffic from 0% load --- deploy/template.yaml | 158 +++++++++++++++++++++---------------------- 1 file changed, 79 insertions(+), 79 deletions(-) diff --git a/deploy/template.yaml b/deploy/template.yaml index db60faa..d6ac451 100644 --- a/deploy/template.yaml +++ b/deploy/template.yaml @@ -53,6 +53,7 @@ Conditions: - !Equals [!Ref Environment, production] IsProduction: !Equals [!Ref Environment, production] IsPerformance: !Or + - !Equals [!Ref Environment, dev] - !Equals [!Ref Environment, build] - !Equals [!Ref Environment, production] UsePermissionsBoundary: !Not @@ -69,7 +70,6 @@ Mappings: dynatraceSecretArn: arn:aws:secretsmanager:eu-west-2:216552277552:secret:DynatraceNonProductionVariables fargateCPUsize: "256" fargateRAMsize: "512" - desiredTaskCount: 2 ga4Disabled: "false" uaDisabled: "false" languageToggleDisabled: "false" @@ -78,7 +78,6 @@ Mappings: dynatraceSecretArn: arn:aws:secretsmanager:eu-west-2:216552277552:secret:DynatraceNonProductionVariables fargateCPUsize: "1024" fargateRAMsize: "2048" - desiredTaskCount: 2 ga4Disabled: "false" uaDisabled: "false" languageToggleDisabled: "false" @@ -87,7 +86,6 @@ Mappings: dynatraceSecretArn: arn:aws:secretsmanager:eu-west-2:216552277552:secret:DynatraceNonProductionVariables fargateCPUsize: "256" fargateRAMsize: "512" - desiredTaskCount: 2 ga4Disabled: "false" uaDisabled: "false" languageToggleDisabled: "false" @@ -96,7 +94,6 @@ Mappings: dynatraceSecretArn: arn:aws:secretsmanager:eu-west-2:216552277552:secret:DynatraceNonProductionVariables fargateCPUsize: "512" fargateRAMsize: "1024" - desiredTaskCount: 2 ga4Disabled: "false" uaDisabled: "false" languageToggleDisabled: "false" @@ -105,7 +102,6 @@ Mappings: dynatraceSecretArn: 
arn:aws:secretsmanager:eu-west-2:216552277552:secret:DynatraceProductionVariables fargateCPUsize: "1024" fargateRAMsize: "2048" - desiredTaskCount: 2 ga4Disabled: "false" uaDisabled: "false" languageToggleDisabled: "false" @@ -284,6 +280,9 @@ Resources: HealthCheckEnabled: TRUE HealthCheckProtocol: HTTP HealthCheckPath: /healthcheck + HealthCheckTimeoutSeconds: 2 + HealthCheckIntervalSeconds: 5 + HealthyThresholdCount: 2 Matcher: HttpCode: 200 Port: 80 @@ -443,10 +442,6 @@ Resources: - UseCanaryDeployment - CODE_DEPLOY - ECS - DesiredCount: !FindInMap - - EnvironmentConfiguration - - !Ref "Environment" - - desiredTaskCount EnableECSManagedTags: false HealthCheckGracePeriodSeconds: !If - UseCanaryDeployment @@ -808,12 +803,33 @@ Resources: ArnLike: "kms:EncryptionContext:aws:logs:arn": !Sub "arn:aws:logs:${AWS::Region}:${AWS::AccountId}:*" + PassportFrontSessionsTable: + Type: AWS::DynamoDB::Table + Properties: + # checkov:skip=CKV_AWS_28: Point in time recovery is not necessary for this table. 
+ TableName: !Sub "cri-passport-front-sessions-${Environment}" + BillingMode: "PAY_PER_REQUEST" + AttributeDefinitions: + - AttributeName: "id" + AttributeType: "S" + KeySchema: + - AttributeName: "id" + KeyType: "HASH" + TimeToLiveSpecification: + AttributeName: "expires" + Enabled: true + SSESpecification: + # checkov:skip=CKV_AWS_119: Implement Customer Managed Keys in PYIC-1391 + SSEEnabled: true + SSEType: KMS + + # ECS Autoscaling ECSAutoScalingTarget: Condition: IsPerformance Type: AWS::ApplicationAutoScaling::ScalableTarget Properties: + MinCapacity: 4 MaxCapacity: 60 - MinCapacity: 2 ResourceId: !Join - '/' - - "service" @@ -823,36 +839,15 @@ Resources: ScalableDimension: ecs:service:DesiredCount ServiceNamespace: ecs - ECSAutoScalingPolicy: + EcsStepScaleOutPolicy: Condition: IsPerformance DependsOn: ECSAutoScalingTarget Type: AWS::ApplicationAutoScaling::ScalingPolicy Properties: - PolicyName: ECSAutoScalingPolicy - PolicyType: TargetTrackingScaling - ResourceId: !Join - - "/" - - - "service" - - !Ref PassportFrontEcsCluster - - !GetAtt PassportFrontEcsService.Name - ScalableDimension: ecs:service:DesiredCount - ServiceNamespace: ecs - TargetTrackingScalingPolicyConfiguration: - PredefinedMetricSpecification: - PredefinedMetricType: ECSServiceAverageCPUUtilization - TargetValue: 60 - ScaleInCooldown: 420 - ScaleOutCooldown: 60 - - StepScaleInPolicy: - Condition: IsPerformance - DependsOn: ECSAutoScalingTarget - Type: AWS::ApplicationAutoScaling::ScalingPolicy - Properties: - PolicyName: StepScalingInPolicy + PolicyName: EcsStepScalingOutPolicy PolicyType: StepScaling ResourceId: !Join - - '/' + - "/" - - "service" - !Ref PassportFrontEcsCluster - !GetAtt PassportFrontEcsService.Name @@ -860,20 +855,41 @@ Resources: ServiceNamespace: ecs StepScalingPolicyConfiguration: AdjustmentType: PercentChangeInCapacity - Cooldown: 420 + Cooldown: + 180 # The policy will continue to respond to additional alarm breaches, + # even while a scaling activity is in 
progress. This means Application + # Auto Scaling will evaluate all alarm breaches as they occur. + # A cooldown period is used to protect against over-scaling due to + # multiple alarm breaches occurring in rapid succession. + MinAdjustmentMagnitude: 1 StepAdjustments: - - MetricIntervalUpperBound: -40 - ScalingAdjustment: -50 + - MetricIntervalUpperBound: 0 # 60% + ScalingAdjustment: 100 # Scale by 100% of containers if the metric is breached + # with <60% utilisation + - MetricIntervalLowerBound: 0 # 60% + MetricIntervalUpperBound: 30 # 90% + ScalingAdjustment: 200 # Scale by 200% of containers if the metric is breached + # with 60-90% utilisation + - MetricIntervalLowerBound: 30 # 90% + MetricIntervalUpperBound: 35 # 95% + ScalingAdjustment: 300 # Scale by 300% of containers if the metric is breached + # with 90-95% utilisation + - MetricIntervalLowerBound: 35 # 95% + ScalingAdjustment: + 500 # Scale by 500% of containers if the metric is breached + # with >95% utilisation + # Note: CPU can scale greater than 100% in a burst mode + # on Fargate, so leave the upper bound open - StepScaleOutPolicy: + EcsStepScaleInPolicy: Condition: IsPerformance DependsOn: ECSAutoScalingTarget Type: AWS::ApplicationAutoScaling::ScalingPolicy Properties: - PolicyName: StepScalingOutPolicy + PolicyName: EcsStepScalingInPolicy PolicyType: StepScaling ResourceId: !Join - - '/' + - "/" - - "service" - !Ref PassportFrontEcsCluster - !GetAtt PassportFrontEcsService.Name @@ -881,51 +897,55 @@ Resources: ServiceNamespace: ecs StepScalingPolicyConfiguration: AdjustmentType: PercentChangeInCapacity - Cooldown: 120 - MinAdjustmentMagnitude: 5 + Cooldown: + 180 # The policy will continue to respond to additional alarm breaches, + # even while a scaling activity is in progress. This means Application + # Auto Scaling will evaluate all alarm breaches as they occur. + # A cooldown period is used to protect against under-scaling due to + # multiple alarm breaches occurring in rapid succession. 
StepAdjustments: - - MetricIntervalLowerBound: 20 - MetricIntervalUpperBound: 30 - ScalingAdjustment: 200 - - MetricIntervalLowerBound: 30 - MetricIntervalUpperBound: 35 - ScalingAdjustment: 300 - - MetricIntervalLowerBound: 35 - ScalingAdjustment: 500 + - MetricIntervalUpperBound: -15 # 5% + ScalingAdjustment: -90 # Scale down by 90% of containers if the metric is breached + # with <5% utilisation + - MetricIntervalLowerBound: -15 # 5% + MetricIntervalUpperBound: 0 # 20% + ScalingAdjustment: + -50 # Scale down by 50% of containers if the metric is breached + # with 5-20% utilisation - StepScaleOutAlarm: + EcsStepScaleOutAlarm: Condition: IsPerformance DependsOn: ECSAutoScalingTarget Type: AWS::CloudWatch::Alarm Properties: ActionsEnabled: true AlarmActions: - - !Ref StepScaleOutPolicy - AlarmDescription: "PassportFrontClusterOver60PercentCPU" + - !Ref EcsStepScaleOutPolicy + AlarmDescription: "EcsClusterOver60PercentCPU" ComparisonOperator: "GreaterThanThreshold" - DatapointsToAlarm: "2" + DatapointsToAlarm: "1" Dimensions: - Name: ClusterName Value: !Ref PassportFrontEcsCluster - Name: ServiceName Value: !GetAtt PassportFrontEcsService.Name Unit: "Percent" - EvaluationPeriods: "2" + EvaluationPeriods: "1" MetricName: "CPUUtilization" Namespace: "AWS/ECS" Statistic: "Average" Period: "60" Threshold: "60" - StepScaleInAlarm: + EcsStepScaleInAlarm: Condition: IsPerformance DependsOn: ECSAutoScalingTarget Type: AWS::CloudWatch::Alarm Properties: ActionsEnabled: true AlarmActions: - - !Ref StepScaleInPolicy - AlarmDescription: "PassportFrontClusterUnder60PercentCPU" + - !Ref EcsStepScaleInPolicy + AlarmDescription: "EcsClusterUnder60PercentCPU" ComparisonOperator: "LessThanThreshold" DatapointsToAlarm: "5" Dimensions: @@ -939,27 +959,7 @@ Resources: Namespace: "AWS/ECS" Statistic: "Average" Period: "60" - Threshold: "60" - - PassportFrontSessionsTable: - Type: AWS::DynamoDB::Table - Properties: - # checkov:skip=CKV_AWS_28: Point in time recovery is not necessary for this 
table. - TableName: !Sub "cri-passport-front-sessions-${Environment}" - BillingMode: "PAY_PER_REQUEST" - AttributeDefinitions: - - AttributeName: "id" - AttributeType: "S" - KeySchema: - - AttributeName: "id" - KeyType: "HASH" - TimeToLiveSpecification: - AttributeName: "expires" - Enabled: true - SSESpecification: - # checkov:skip=CKV_AWS_119: Implement Customer Managed Keys in PYIC-1391 - SSEEnabled: true - SSEType: KMS + Threshold: "20" #################################################################### # #