Skip to content

Commit

Permalink
Add instance health checks and make idempotent for CI
Browse files Browse the repository at this point in the history
  • Loading branch information
krarey committed Oct 6, 2020
1 parent 9d85ad6 commit 806e6ff
Show file tree
Hide file tree
Showing 4 changed files with 104 additions and 12 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# v1.0.0
- Will now test whether any VMSS instances report that they are not the latest model. If all instances report they are the latest model, exit early with a successful status code. This lets us stay idempotent when running in CI/CD
- Added support for VM health checks. The script will wait (until the global timeout) for all instances to report healthy before moving forward with scaling down the outgoing instance set. This feature can be disabled with the `--skip-health-check` flag.
39 changes: 39 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Azure VM Scale Set - Blue/Green Upgrade Utility
This utility performs a blue/green upgrade on an Azure VM Scale Set. This is useful when a VMSS is set to Manual upgrade mode, and the user wishes to perform an update by scaling in a set of replacement nodes, rather than asking Azure to remediate a set of already-running instances. The upgrade is carried out by performing the following steps:

- Verify VM Scale Set instances require update. We assume a given scale set requires update if any instances report they are not the 'latest model'.
- Double the capacity of the Scale Set
- (Optional) Wait for all VMSS instance health checks to succeed
- Apply scale-in protection to all instances in the replacement set
- Reduce the capacity of the Scale Set by half
- Remove scale-in protection from the remaining instances

The entire operation is subject to a 20-minute global timeout. In the event this timeout is exceeded, or an error is encountered anywhere else in the process, we eagerly bail out of the operation and leave the scale set in place. This is done so that the failed instances are available for root cause analysis, and to protect applications like [HashiCorp Consul](https://github.com/hashicorp/consul) that may experience data loss if the outgoing instance set is inadvertently stopped before a migration has been fully verified.

## Configuration
Currently, we assume the user has a valid set of credentials provided by Azure CLI. These can be generated using the following commands:
- Azure AD User: `az login`
- Azure AD Service Principal: `az login --service-principal`
- Managed Service Identity: `az login --identity`

The Azure Subscription, Resource Group, and VM Scale Set name must be provided at run time. The relevant flags can be found by running the following command:

```
$ ./azure-cluster-upgrade --help
Interacts with the Azure API to perform a blue/green deployment.
Expects a Virtual Machine Scale Set whose configuration has recently been updated.
Expands the chosen scale set by a factor of two, and once all VMs have entered the
'Running' state, protects the replacement instances and reduces Scale Set capacity
to its original value.
Usage:
azure-cluster-upgrade [flags]
Flags:
-h, --help help for azure-cluster-upgrade
-r, --resource-group string Resource Group name
--skip-health-check Skip testing instance health checks
-s, --subscription-id string Subscription ID
-v, --vm-scale-set string Virtual Machine Scale Set name
```
1 change: 1 addition & 0 deletions cmd/root.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ func init() {
rootCmd.Flags().StringP("subscription-id", "s", "", "Subscription ID")
rootCmd.Flags().StringP("resource-group", "r", "", "Resource Group name")
rootCmd.Flags().StringP("vm-scale-set", "v", "", "Virtual Machine Scale Set name")
rootCmd.Flags().Bool("skip-health-check", false, "Skip testing instance health checks")

rootCmd.MarkFlagRequired("subscription-id")
rootCmd.MarkFlagRequired("resource-group")
Expand Down
73 changes: 61 additions & 12 deletions deploy/deploy.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ package deploy
import (
"context"
"math"
"os"
"sync"
"time"

Expand Down Expand Up @@ -55,12 +54,12 @@ func (s *azureSession) setVMProtection(ctx context.Context, protect bool) ([]com

if protect {
filter = "properties/latestModelApplied eq true"
log.Info("Applying scale-in protection to new instances...")
log.Info("Applying scale-in protection to new instances")
} else {
// Leave this defaulted to an empty string for now
// This will un-protect ALL members of the VMSS upon completion
// filter = "properties/latestModelApplied eq false"
log.Info("Removing scale-in protection from Scale Set instances...")
log.Info("Removing scale-in protection from Scale Set instances")
}

for vms, err := client.ListComplete(ctx, s.ResourceGroupName, s.ScaleSetName, filter, "", ""); vms.NotDone(); err = vms.Next() {
Expand Down Expand Up @@ -157,7 +156,7 @@ func (s *azureSession) scaleVMSSByFactor(ctx context.Context, factor float64) er
// Ick
newCapacity := int64(math.Floor(float64(*scaleSet.Sku.Capacity) * factor))

log.Infof("Scaling VMSS %s to %d instances...", *scaleSet.Name, newCapacity)
log.Infof("Scaling VMSS %s to %d instances", *scaleSet.Name, newCapacity)

future, err := client.Update(
ctx,
Expand All @@ -183,6 +182,47 @@ func (s *azureSession) scaleVMSSByFactor(ctx context.Context, factor float64) er
return nil
}

// instancesNeedUpgrade reports whether the scale set still has instances that
// are not on the latest model.
//
// It lists the scale set's VMs using the server-side filter
// "properties/latestModelApplied eq false" and returns the iterator's NotDone
// result, which is expected to be true when that listing holds at least one
// instance. If the listing call fails, it returns false and the error.
func (s *azureSession) instancesNeedUpgrade(ctx context.Context) (bool, error) {
	const outdatedFilter string = "properties/latestModelApplied eq false"

	outdated, err := s.getVMSSVMClient().ListComplete(
		ctx,
		s.ResourceGroupName,
		s.ScaleSetName,
		outdatedFilter,
		"",
		"",
	)
	if err != nil {
		return false, err
	}

	// Only the presence of a first result matters; the instances themselves
	// are never inspected.
	return outdated.NotDone(), nil
}

// awaitInstanceHealthChecks blocks until every instance in the scale set
// reports the VM health status code "HealthState/healthy".
//
// On each pass the instance list is fetched again with the instance view
// expanded. If any instance is not healthy, the function backs off for 30
// seconds and retries. There is no internal retry limit: the caller bounds
// the total wait through ctx.
//
// It returns nil once a full pass finds no unhealthy instance, the API error
// if listing or paging fails, or ctx.Err() if ctx ends during the back-off.
func (s *azureSession) awaitInstanceHealthChecks(ctx context.Context) error {
	client := s.getVMSSVMClient()
	const healthy string = "HealthState/healthy"
	const retryInterval = 30 * time.Second

	log.Info("Waiting for instance health checks to pass")
	// Rely on the context timeout to kill this if we run too long
	for {
		// Check the listing error before touching the iterator. A failed
		// call can hand back an empty iterator, which would otherwise look
		// like "no unhealthy instances" and be reported as success.
		vms, err := client.ListComplete(ctx, s.ResourceGroupName, s.ScaleSetName, "", "", "instanceView")
		if err != nil {
			return err
		}

		instanceUnhealthy := false
		for vms.NotDone() {
			vm := vms.Value()

			// An instance with no health status yet is treated as unhealthy
			// instead of dereferencing a nil pointer.
			// NOTE(review): InstanceView appears to be promoted from an
			// embedded properties pointer in the SDK type; confirm whether
			// that pointer can itself be nil and needs its own guard.
			if vm.InstanceView == nil ||
				vm.InstanceView.VMHealth == nil ||
				vm.InstanceView.VMHealth.Status == nil ||
				vm.InstanceView.VMHealth.Status.Code == nil ||
				*vm.InstanceView.VMHealth.Status.Code != healthy {
				instanceUnhealthy = true
				break
			}

			if err := vms.Next(); err != nil {
				return err
			}
		}

		if !instanceUnhealthy {
			return nil
		}

		log.Info("VM instances do not yet report healthy. Backing off and retrying in 30 seconds.")
		// Wait out the back-off, but stop immediately if ctx is cancelled or
		// its deadline passes.
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-time.After(retryInterval):
		}
	}
}

// Initializes a new azureSession struct. Mostly used to get
// rid of unnecessary variable passing and allow the chosen
// authorizer to be easily replaced.
Expand Down Expand Up @@ -215,43 +255,52 @@ func Run(cmd *cobra.Command, args []string) {
)
if err != nil {
log.Fatal(err)
os.Exit(1)
}

cont, err := sess.instancesNeedUpgrade(ctx)
if err != nil {
log.Fatal(err)
} else if !cont {
log.Info("All VMs report up-to-date. Nothing to do.")
return
}

if err = sess.scaleVMSSByFactor(ctx, 2); err != nil {
log.Fatal(err)
os.Exit(1)
}

log.Info("Waiting for new instances to reach Running state...")
skip, _ := cmd.Flags().GetBool("skip-health-check")
if !skip {
err = sess.awaitInstanceHealthChecks(ctx)
if err != nil {
log.Fatal(err)
}
} else {
log.Warn("Health checks skipped by user")
}

// Protect newly-created instances
scaleOutFutures, err := sess.setVMProtection(ctx, true)
if err != nil {
log.Fatal(err)
os.Exit(1)
}

if err = sess.awaitVMFutures(ctx, scaleOutFutures); err != nil {
log.Fatal(err)
os.Exit(1)
}

// Halve VMSS Capacity
if err = sess.scaleVMSSByFactor(ctx, 0.5); err != nil {
log.Fatal(err)
os.Exit(1)
}

// Un-protect instances
scaleInFutures, err := sess.setVMProtection(ctx, false)
if err != nil {
log.Fatal(err)
os.Exit(1)
}

if err = sess.awaitVMFutures(ctx, scaleInFutures); err != nil {
log.Fatal(err)
os.Exit(1)
}
}

0 comments on commit 806e6ff

Please sign in to comment.