Add instance health checks and make idempotent for CI

krarey · Oct 6, 2020 · 806e6ff · 806e6ff
1 parent 9d85ad6
commit 806e6ff
Show file tree

Hide file tree

Showing 4 changed files with 104 additions and 12 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -0,0 +1,3 @@
+# v1.0.0
+- Will now test whether any VMSS instances report that they are not the latest model. If all instances report they are the latest model, exit early with a successful status code. This lets us stay idempotent when running in CI/CD
+- Added support for VM health checks. The script will wait (until the global timeout) for all instances to report healthy before moving forward with scaling down the outgoing instance set. This feature can be disabled with the `--skip-health-check` flag.
diff --git a/README.md b/README.md
@@ -0,0 +1,39 @@
+# Azure VM Scale Set - Blue/Green Upgrade Utility 
+This utility performs a blue/green upgrade on an Azure VM Scale Set. This is useful when a VMSS is set to Manual upgrade mode, and the user wishes to perform an update by scaling in a set of replacement nodes, rather than asking Azure to remediate a set of already-running instances. The upgrade is carried out by performing the following steps:
+
+- Verify VM Scale Set instances require update. We assume a given scale set requires update if any instances report they are not the 'latest model'.
+- Double the capacity of the Scale Set
+- (Optional) Wait for all VMSS instance health checks to succeed
+- Apply scale-in protection to all instances in the replacement set
+- Reduce the capacity of the Scale Set by half
+- Remove scale-in protection from the remaining instances
+
+The entire operation is subject to a 20-minute global timeout. In the event this timeout is exceeded, or an error is encountered anywhere else in the process, we eagerly bail out of the operation and leave the scale set in place. This is done so that the failed instances are available for root cause analysis, and to protect applications like [HashiCorp Consul](https://github.com/hashicorp/consul) that may experience data loss if the outgoing instance set is inadvertently stopped before a migration has been fully verified.
+
+## Configuration
+Currently, we assume the user has a valid set of credentials provided by Azure CLI. These can be generated using the following commands:
+- Azure AD User: `az login`
+- Azure AD Service Principal: `az login --service-principal`
+- Managed Service Identity: `az login --identity`
+
+The Azure Subscription, Resource Group, and VM Scale Set name must be provided at run time. The relevant flags can be found by running the following command:
+
+```
+$ ./azure-cluster-upgrade --help
+Interacts with the Azure API to perform a blue/green deployment.
+
+Expects a Virtual Machine Scale Set whose configuration has recently been updated.
+Expands the chosen scale set by a factor of two, and once all VMs have entered the
+'Running' state, protects the replacement instances and reduces Scale Set capacity
+to its original value.
+
+Usage:
+  azure-cluster-upgrade [flags]
+
+Flags:
+  -h, --help                     help for azure-cluster-upgrade
+  -r, --resource-group string    Resource Group name
+      --skip-health-check        Skip testing instance health checks
+  -s, --subscription-id string   Subscription ID
+  -v, --vm-scale-set string      Virtual Machine Scale Set name
+```
diff --git a/cmd/root.go b/cmd/root.go
@@ -36,6 +36,7 @@ func init() {
 	rootCmd.Flags().StringP("subscription-id", "s", "", "Subscription ID")
 	rootCmd.Flags().StringP("resource-group", "r", "", "Resource Group name")
 	rootCmd.Flags().StringP("vm-scale-set", "v", "", "Virtual Machine Scale Set name")
+	rootCmd.Flags().Bool("skip-health-check", false, "Skip testing instance health checks")
 
 	rootCmd.MarkFlagRequired("subscription-id")
 	rootCmd.MarkFlagRequired("resource-group")

diff --git a/deploy/deploy.go b/deploy/deploy.go
@@ -3,7 +3,6 @@ package deploy
 import (
 	"context"
 	"math"
-	"os"
 	"sync"
 	"time"
 
@@ -55,12 +54,12 @@ func (s *azureSession) setVMProtection(ctx context.Context, protect bool) ([]com
 
 	if protect {
 		filter = "properties/latestModelApplied eq true"
-		log.Info("Applying scale-in protection to new instances...")
+		log.Info("Applying scale-in protection to new instances")
 	} else {
 		// Leave this defaulted to an empty string for now
 		// This will un-protect ALL members of the VMSS upon completion
 		// filter = "properties/latestModelApplied eq false"
-		log.Info("Removing scale-in protection from Scale Set instances...")
+		log.Info("Removing scale-in protection from Scale Set instances")
 	}
 
 	for vms, err := client.ListComplete(ctx, s.ResourceGroupName, s.ScaleSetName, filter, "", ""); vms.NotDone(); err = vms.Next() {
@@ -157,7 +156,7 @@ func (s *azureSession) scaleVMSSByFactor(ctx context.Context, factor float64) er
 	// Ick
 	newCapacity := int64(math.Floor(float64(*scaleSet.Sku.Capacity) * factor))
 
-	log.Infof("Scaling VMSS %s to %d instances...", *scaleSet.Name, newCapacity)
+	log.Infof("Scaling VMSS %s to %d instances", *scaleSet.Name, newCapacity)
 
 	future, err := client.Update(
 		ctx,
@@ -183,6 +182,47 @@ func (s *azureSession) scaleVMSSByFactor(ctx context.Context, factor float64) er
 	return nil
 }
 
+// instancesNeedUpgrade lists the scale set's instances using a filter that
+// selects only those whose latestModelApplied property is false, and reports
+// whether that listing has at least one entry.
+//
+// It returns (false, err) if the list call fails.
+func (s *azureSession) instancesNeedUpgrade(ctx context.Context) (bool, error) {
+	// Restrict the listing to instances that are not on the latest model.
+	const outdatedFilter = "properties/latestModelApplied eq false"
+
+	vmClient := s.getVMSSVMClient()
+	outdated, listErr := vmClient.ListComplete(ctx, s.ResourceGroupName, s.ScaleSetName, outdatedFilter, "", "")
+	if listErr != nil {
+		return false, listErr
+	}
+
+	// The iterator is "not done" only when it currently points at a value,
+	// i.e. the filtered listing is non-empty.
+	needsUpgrade := outdated.NotDone()
+	return needsUpgrade, nil
+}
+
+// awaitInstanceHealthChecks polls the scale set's instance views until every
+// listed instance reports the health status code "HealthState/healthy".
+//
+// Between polls it waits 30 seconds. The wait is interrupted as soon as ctx is
+// cancelled or its deadline passes, in which case ctx.Err() is returned. Any
+// error from listing instances (including the first page) is returned
+// immediately rather than being treated as "healthy".
+//
+// An instance whose instance view carries no health status (for example,
+// because health data has not been reported yet) is treated as not yet
+// healthy instead of causing a nil-pointer panic. If the scale set never
+// reports health data, this loops until ctx expires; callers can bypass the
+// check with the --skip-health-check flag.
+func (s *azureSession) awaitInstanceHealthChecks(ctx context.Context) error {
+	client := s.getVMSSVMClient()
+	const healthy = "HealthState/healthy"
+	const retryInterval = 30 * time.Second
+
+	log.Info("Waiting for instance health checks to pass")
+	// Rely on the context timeout to kill this if we run too long
+	for {
+		vms, err := client.ListComplete(ctx, s.ResourceGroupName, s.ScaleSetName, "", "", "instanceView")
+		if err != nil {
+			// Checked explicitly: a failed initial call yields an empty
+			// iterator, which would otherwise look like "no unhealthy VMs".
+			return err
+		}
+
+		instanceUnhealthy := false
+		for vms.NotDone() {
+			vm := vms.Value()
+
+			// Every level of the health status is a pointer and may be unset.
+			view := vm.InstanceView
+			if view == nil || view.VMHealth == nil || view.VMHealth.Status == nil ||
+				view.VMHealth.Status.Code == nil || *view.VMHealth.Status.Code != healthy {
+				instanceUnhealthy = true
+				break
+			}
+
+			if err := vms.NextWithContext(ctx); err != nil {
+				return err
+			}
+		}
+
+		if !instanceUnhealthy {
+			return nil
+		}
+
+		log.Info("VM instances do not yet report healthy. Backing off and retrying in 30 seconds.")
+		// Sleep, but wake up early if the global timeout fires.
+		select {
+		case <-ctx.Done():
+			return ctx.Err()
+		case <-time.After(retryInterval):
+		}
+	}
+}
+
 // Initializes a new azureSession struct. Mostly used to get
 // rid of unnecessary variable passing and allow the chosen
 // authorizer to be easily replaced.
@@ -215,43 +255,52 @@ func Run(cmd *cobra.Command, args []string) {
 	)
 	if err != nil {
 		log.Fatal(err)
-		os.Exit(1)
+	}
+
+	cont, err := sess.instancesNeedUpgrade(ctx)
+	if err != nil {
+		log.Fatal(err)
+	} else if !cont {
+		log.Info("All VMs report up-to-date. Nothing to do.")
+		return
 	}
 
 	if err = sess.scaleVMSSByFactor(ctx, 2); err != nil {
 		log.Fatal(err)
-		os.Exit(1)
 	}
 
-	log.Info("Waiting for new instances to reach Running state...")
+	skip, _ := cmd.Flags().GetBool("skip-health-check")
+	if !skip {
+		err = sess.awaitInstanceHealthChecks(ctx)
+		if err != nil {
+			log.Fatal(err)
+		}
+	} else {
+		log.Warn("Health checks skipped by user")
+	}
 
 	// Protect newly-created instances
 	scaleOutFutures, err := sess.setVMProtection(ctx, true)
 	if err != nil {
 		log.Fatal(err)
-		os.Exit(1)
 	}
 
 	if err = sess.awaitVMFutures(ctx, scaleOutFutures); err != nil {
 		log.Fatal(err)
-		os.Exit(1)
 	}
 
 	// Halve VMSS Capacity
 	if err = sess.scaleVMSSByFactor(ctx, 0.5); err != nil {
 		log.Fatal(err)
-		os.Exit(1)
 	}
 
 	// Un-protect instances
 	scaleInFutures, err := sess.setVMProtection(ctx, false)
 	if err != nil {
 		log.Fatal(err)
-		os.Exit(1)
 	}
 
 	if err = sess.awaitVMFutures(ctx, scaleInFutures); err != nil {
 		log.Fatal(err)
-		os.Exit(1)
 	}
 }