From 806e6ffd0b0be63e7680f3eb4bba5c4793e2ad68 Mon Sep 17 00:00:00 2001 From: Kyle Rarey Date: Tue, 6 Oct 2020 15:56:37 -0400 Subject: [PATCH] Add instance health checks and make idempotent for CI --- CHANGELOG.md | 3 ++ README.md | 39 ++++++++++++++++++++++++++ cmd/root.go | 1 + deploy/deploy.go | 73 ++++++++++++++++++++++++++++++++++++++++-------- 4 files changed, 104 insertions(+), 12 deletions(-) create mode 100644 CHANGELOG.md create mode 100644 README.md diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..10a3300 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,3 @@ +# v1.0.0 +- Will now test whether any VMSS instances report that they are not the latest model. If all instances report they are the latest model, exit early with a successful status code. This lets us stay idempotent when running in CI/CD +- Added support for VM health checks. The script will wait (until the global timeout) for all instances to report healthy before moving forward with scaling down the outgoing instance set. This feature can be disabled with the `--skip-health-check` flag. \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..d94f813 --- /dev/null +++ b/README.md @@ -0,0 +1,39 @@ +# Azure VM Scale Set - Blue/Green Upgrade Utility +This utility performs a blue/green upgrade on an Azure VM Scale Set. This is useful when a VMSS is set to Manual upgrade mode, and the user wishes to perform an update by scaling in a set of replacement nodes, rather than asking Azure to remediate a set of already-running instances. The upgrade is carried out by performing the following steps: + +- Verify VM Scale Set instances require update. We assume a given scale set requires update if any instances report they are not the 'latest model'. 
+- Double the capacity of the Scale Set +- (Optional) Wait for all VMSS instance health checks to succeed +- Apply scale-in protection to all instances in the replacement set +- Reduce the capacity of the Scale Set by half +- Remove scale-in protection from the remaining instances + +The entire operation is subject to a 20-minute global timeout. In the event this timeout is exceeded, or an error is encountered anywhere else in the process, we eagerly bail out of the operation and leave the scale set in place. This is done so that the failed instances are available for root cause analysis, and to protect applications like [HashiCorp Consul](https://github.com/hashicorp/consul) that may experience data loss if the outgoing instance set is inadvertently stopped before a migration has been fully verified. + +## Configuration +Currently, we assume the user has a valid set of credentials provided by Azure CLI. These can be generated using the following commands: +- Azure AD User: `az login` +- Azure AD Service Principal: `az login --service-principal` +- Managed Service Identity: `az login --identity` + +The Azure Subscription, Resource Group, and VM Scale Set name must be provided at run time. The relevant flags can be found by running the following command: + +``` +$ ./azure-cluster-upgrade --help +Interacts with the Azure API to perform a blue/green deployment. + +Expects a Virtual Machine Scale Set whose configuration has recently been updated. +Expands the chosen scale set by a factor of two, and once all VMs have entered the +'Running' state, protects the replacement instances and reduces Scale Set capacity +to its original value. 
+ +Usage: + azure-cluster-upgrade [flags] + +Flags: + -h, --help help for azure-cluster-upgrade + -r, --resource-group string Resource Group name + --skip-health-check Skip testing instance health checks + -s, --subscription-id string Subscription ID + -v, --vm-scale-set string Virtual Machine Scale Set name +``` diff --git a/cmd/root.go b/cmd/root.go index 2d6531e..74dad6c 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -36,6 +36,7 @@ func init() { rootCmd.Flags().StringP("subscription-id", "s", "", "Subscription ID") rootCmd.Flags().StringP("resource-group", "r", "", "Resource Group name") rootCmd.Flags().StringP("vm-scale-set", "v", "", "Virtual Machine Scale Set name") + rootCmd.Flags().Bool("skip-health-check", false, "Skip testing instance health checks") rootCmd.MarkFlagRequired("subscription-id") rootCmd.MarkFlagRequired("resource-group") diff --git a/deploy/deploy.go b/deploy/deploy.go index 9451545..73e7461 100644 --- a/deploy/deploy.go +++ b/deploy/deploy.go @@ -3,7 +3,6 @@ package deploy import ( "context" "math" - "os" "sync" "time" @@ -55,12 +54,12 @@ func (s *azureSession) setVMProtection(ctx context.Context, protect bool) ([]com if protect { filter = "properties/latestModelApplied eq true" - log.Info("Applying scale-in protection to new instances...") + log.Info("Applying scale-in protection to new instances") } else { // Leave this defaulted to an empty string for now // This will un-protect ALL members of the VMSS upon completion // filter = "properties/latestModelApplied eq false" - log.Info("Removing scale-in protection from Scale Set instances...") + log.Info("Removing scale-in protection from Scale Set instances") } for vms, err := client.ListComplete(ctx, s.ResourceGroupName, s.ScaleSetName, filter, "", ""); vms.NotDone(); err = vms.Next() { @@ -157,7 +156,7 @@ func (s *azureSession) scaleVMSSByFactor(ctx context.Context, factor float64) er // Ick newCapacity := int64(math.Floor(float64(*scaleSet.Sku.Capacity) * factor)) - log.Infof("Scaling 
VMSS %s to %d instances...", *scaleSet.Name, newCapacity) + log.Infof("Scaling VMSS %s to %d instances", *scaleSet.Name, newCapacity) future, err := client.Update( ctx, @@ -183,6 +182,47 @@ func (s *azureSession) scaleVMSSByFactor(ctx context.Context, factor float64) er return nil } +func (s *azureSession) instancesNeedUpgrade(ctx context.Context) (bool, error) { + client := s.getVMSSVMClient() + const filter string = "properties/latestModelApplied eq false" + + vms, err := client.ListComplete(ctx, s.ResourceGroupName, s.ScaleSetName, filter, "", "") + if err != nil { + return false, err + } + return vms.NotDone(), nil +} + +func (s *azureSession) awaitInstanceHealthChecks(ctx context.Context) error { + client := s.getVMSSVMClient() + const healthy string = "HealthState/healthy" + + log.Info("Waiting for instance health checks to pass") + // Rely on the context timeout to kill this if we run too long + for true { + instanceUnhealthy := false + for vms, err := client.ListComplete(ctx, s.ResourceGroupName, s.ScaleSetName, "", "", "instanceView"); vms.NotDone(); err = vms.Next() { + if err != nil { + return err + } + + vm := vms.Value() + if *vm.InstanceView.VMHealth.Status.Code != healthy { + instanceUnhealthy = true + break + } + } + + if instanceUnhealthy { + log.Info("VM instances do not yet report healthy. Backing off and retrying in 30 seconds.") + time.Sleep(30 * time.Second) + } else { + break + } + } + return nil +} + // Initializes a new azureSession struct. Mostly used to get // rid of unnecessary variable passing and allow the chosen // authorizer to be easily replaced. @@ -215,43 +255,52 @@ func Run(cmd *cobra.Command, args []string) { ) if err != nil { log.Fatal(err) - os.Exit(1) + } + + cont, err := sess.instancesNeedUpgrade(ctx) + if err != nil { + log.Fatal(err) + } else if !cont { + log.Info("All VMs report up-to-date. 
Nothing to do.") + return } if err = sess.scaleVMSSByFactor(ctx, 2); err != nil { log.Fatal(err) - os.Exit(1) } - log.Info("Waiting for new instances to reach Running state...") + skip, _ := cmd.Flags().GetBool("skip-health-check") + if !skip { + err = sess.awaitInstanceHealthChecks(ctx) + if err != nil { + log.Fatal(err) + } + } else { + log.Warn("Health checks skipped by user") + } // Protect newly-created instances scaleOutFutures, err := sess.setVMProtection(ctx, true) if err != nil { log.Fatal(err) - os.Exit(1) } if err = sess.awaitVMFutures(ctx, scaleOutFutures); err != nil { log.Fatal(err) - os.Exit(1) } // Halve VMSS Capacity if err = sess.scaleVMSSByFactor(ctx, 0.5); err != nil { log.Fatal(err) - os.Exit(1) } // Un-protect instances scaleInFutures, err := sess.setVMProtection(ctx, false) if err != nil { log.Fatal(err) - os.Exit(1) } if err = sess.awaitVMFutures(ctx, scaleInFutures); err != nil { log.Fatal(err) - os.Exit(1) } }