Skip to content

Commit

Permalink
autoupdate: implement halt-on-error strategy
Browse files Browse the repository at this point in the history
  • Loading branch information
hugoShaka committed Dec 9, 2024
1 parent b91395a commit 73afca5
Show file tree
Hide file tree
Showing 2 changed files with 595 additions and 0 deletions.
131 changes: 131 additions & 0 deletions lib/autoupdate/rollout/strategy_haltonerror.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
package rollout

import (
"context"
"log/slog"
"time"

"github.com/gravitational/trace"
"github.com/jonboulle/clockwork"

"github.com/gravitational/teleport/api/gen/proto/go/teleport/autoupdate/v1"
update "github.com/gravitational/teleport/api/types/autoupdate"
)

// haltOnErrorStrategy is the rollout strategy identified by
// update.AgentsStrategyHaltOnError. Groups are processed in order and a group
// is only started once every group before it is DONE.
type haltOnErrorStrategy struct {
	// log is the strategy logger; the constructor tags it with a "strategy" attribute.
	log *slog.Logger
	// clock provides the current time used when evaluating group transitions.
	clock clockwork.Clock
}

// name returns the identifier of this strategy, update.AgentsStrategyHaltOnError.
func (h *haltOnErrorStrategy) name() string {
	return update.AgentsStrategyHaltOnError
}

// newHaltOnErrorStrategy builds a halt-on-error rollout strategy.
//
// Both log and clock are mandatory: a trace.BadParameter error is returned
// when either one is nil. The stored logger is derived from log and carries
// a "strategy" attribute set to update.AgentsStrategyHaltOnError.
func newHaltOnErrorStrategy(log *slog.Logger, clock clockwork.Clock) (rolloutStrategy, error) {
	// Validate the dependencies up front, logger first.
	switch {
	case log == nil:
		return nil, trace.BadParameter("missing log")
	case clock == nil:
		return nil, trace.BadParameter("missing clock")
	}

	strategy := &haltOnErrorStrategy{
		log:   log.With("strategy", update.AgentsStrategyHaltOnError),
		clock: clock,
	}
	return strategy, nil
}

// progressRollout walks the groups in order and updates their state in place.
//
// A group may only become ACTIVE when every group before it is DONE. The loop
// never stops at the first non-DONE group, because a later group may already
// be ACTIVE and still has to be moved to DONE. For example, in a
// dev/staging/prod setup, "dev" might get rolled back while "staging" is still
// ACTIVE: "prod" must not start, but "staging" still needs to reach DONE.
//
// An error is returned when the start criteria of an UNSTARTED group cannot be
// evaluated, or when a group is in an unknown state. In both cases the
// remaining groups are left untouched. ctx is not used by this implementation.
func (h *haltOnErrorStrategy) progressRollout(ctx context.Context, groups []*autoupdate.AutoUpdateAgentRolloutStatusGroup) error {
	// Short local aliases for the generated enum values.
	const (
		unstarted  = autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_UNSTARTED
		active     = autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ACTIVE
		done       = autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_DONE
		rolledBack = autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ROLLEDBACK
	)

	now := h.clock.Now()
	// allPriorDone stays true only while every group seen so far was already DONE.
	allPriorDone := true

	for idx, current := range groups {
		switch current.State {
		case done:
			// Already updated: nothing to do, the next group can be considered.

		case rolledBack:
			// Manually rolled back: the group itself is left untouched, but it
			// blocks every following group from starting.
			allPriorDone = false

		case active:
			// The group is being updated; check whether it can move to DONE.
			finished, reason := isDoneHaltOnError(current, now)
			target := active
			if finished {
				target = done
			}
			setGroupState(current, target, reason, now)
			// NOTE(review): the flag is cleared even when the group just became
			// DONE, so the following group can only start on a later call.
			// Confirm this is intended.
			allPriorDone = false

		case unstarted:
			var prior *autoupdate.AutoUpdateAgentRolloutStatusGroup
			if idx > 0 {
				prior = groups[idx-1]
			}
			// The start criteria are evaluated before looking at allPriorDone so
			// that an invalid configuration is always reported.
			startable, err := canStartHaltOnError(current, prior, now)
			if err != nil {
				// In halt-on-error rollouts, groups are dependent: failing to
				// transition a group must prevent the others from transitioning.
				setGroupState(current, current.State, updateReasonReconcilerError, now)
				return err
			}
			if !allPriorDone {
				// At least one previous group is not DONE.
				setGroupState(current, current.State, updateReasonPreviousGroupsNotDone, now)
			} else if startable {
				// Every previous group is DONE and the start criteria are met.
				setGroupState(current, active, updateReasonCanStart, now)
			} else {
				// Every previous group is DONE, but the start criteria are not met yet.
				setGroupState(current, current.State, updateReasonCannotStart, now)
			}
			allPriorDone = false

		default:
			return trace.BadParameter("unknown autoupdate group state: %v", current.State)
		}
	}
	return nil
}

// canStartHaltOnError reports whether group is allowed to start at time now.
//
// When the group has a non-zero ConfigWaitDays, now must not be before
// midnight of the day that is ConfigWaitDays days after previousGroup's start
// time. Once that criteria passes (or when there is no wait), the decision is
// delegated to inWindow.
//
// A trace.BadParameter error is returned when wait days are configured but
// previousGroup is nil (first group) or has no usable start time.
func canStartHaltOnError(group, previousGroup *autoupdate.AutoUpdateAgentRolloutStatusGroup, now time.Time) (bool, error) {
	// No wait days configured: only the window check applies.
	if group.ConfigWaitDays == 0 {
		return inWindow(group, now)
	}

	if previousGroup == nil {
		return false, trace.BadParameter("The first group cannot have non-zero wait days")
	}

	previousStart := previousGroup.StartTime.AsTime()
	// Both the zero time.Time and the Unix epoch are treated as "no start time".
	if previousStart.IsZero() || previousStart.Unix() == 0 {
		return false, trace.BadParameter("The previous group doesn't have a start time, cannot check the 'wait_days' criteria")
	}

	// Take the day of the previous group start, add 'wait_days' and truncate to midnight.
	shifted := previousStart.AddDate(0, 0, int(group.ConfigWaitDays))
	year, month, day := shifted.Date()
	earliestStart := time.Date(year, month, day, 0, 0, 0, 0, previousStart.Location())

	// The wait_days criteria is not met while we are still before that midnight.
	if now.Before(earliestStart) {
		return false, nil
	}

	return inWindow(group, now)
}

// isDoneHaltOnError reports whether an active group can be considered done at
// time now, together with the reason to record on the group.
//
// Status reporting from groups/agents is not implemented yet, so a group is
// considered done once more than one hour has elapsed since its start time.
// This will change as agent status reports and aggregated agent counts are
// introduced.
func isDoneHaltOnError(group *autoupdate.AutoUpdateAgentRolloutStatusGroup, now time.Time) (bool, string) {
	// The maintenance is considered complete strictly after this instant.
	deadline := group.StartTime.AsTime().Add(time.Hour)
	if now.After(deadline) {
		return true, updateReasonUpdateComplete
	}
	return false, updateReasonUpdateInProgress
}
Loading

0 comments on commit 73afca5

Please sign in to comment.