From 73afca5adfa295576b54b132b8497500deee729c Mon Sep 17 00:00:00 2001 From: hugoShaka Date: Tue, 3 Dec 2024 18:03:37 -0500 Subject: [PATCH] autoupdate: implement halt-on-error strategy --- .../rollout/strategy_haltonerror.go | 131 +++++ .../rollout/strategy_haltonerror_test.go | 464 ++++++++++++++++++ 2 files changed, 595 insertions(+) create mode 100644 lib/autoupdate/rollout/strategy_haltonerror.go create mode 100644 lib/autoupdate/rollout/strategy_haltonerror_test.go diff --git a/lib/autoupdate/rollout/strategy_haltonerror.go b/lib/autoupdate/rollout/strategy_haltonerror.go new file mode 100644 index 0000000000000..522b1c8cc3b4d --- /dev/null +++ b/lib/autoupdate/rollout/strategy_haltonerror.go @@ -0,0 +1,131 @@ +package rollout + +import ( + "context" + "log/slog" + "time" + + "github.com/gravitational/trace" + "github.com/jonboulle/clockwork" + + "github.com/gravitational/teleport/api/gen/proto/go/teleport/autoupdate/v1" + update "github.com/gravitational/teleport/api/types/autoupdate" +) + +type haltOnErrorStrategy struct { + log *slog.Logger + clock clockwork.Clock +} + +func (h *haltOnErrorStrategy) name() string { + return update.AgentsStrategyHaltOnError +} + +func newHaltOnErrorStrategy(log *slog.Logger, clock clockwork.Clock) (rolloutStrategy, error) { + if log == nil { + return nil, trace.BadParameter("missing log") + } + if clock == nil { + return nil, trace.BadParameter("missing clock") + } + return &haltOnErrorStrategy{ + log: log.With("strategy", update.AgentsStrategyHaltOnError), + clock: clock, + }, nil +} + +func (h *haltOnErrorStrategy) progressRollout(ctx context.Context, groups []*autoupdate.AutoUpdateAgentRolloutStatusGroup) error { + now := h.clock.Now() + // We process every group in order, all the previous groups must be in the DONE state + // for the next group to become active. Even if some early groups are not DONE, + // later groups might be ACTIVE and need to transition to DONE, so we cannot + // return early and must process every group. + // + // For example, in a dev/staging/prod setup, the "dev" group might get rolled + // back while "staging" is still ACTIVE. We must not start PROD but still need + // to transition "staging" to DONE. + previousGroupsAreDone := true + + for i, group := range groups { + switch group.State { + case autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_UNSTARTED: + var previousGroup *autoupdate.AutoUpdateAgentRolloutStatusGroup + if i != 0 { + previousGroup = groups[i-1] + } + canStart, err := canStartHaltOnError(group, previousGroup, now) + if err != nil { + // In halt-on-error rollouts, groups are dependent. + // Failing to transition a group should prevent other groups from transitioning. + setGroupState(group, group.State, updateReasonReconcilerError, now) + return err + } + switch { + case previousGroupsAreDone && canStart: + // We can start + setGroupState(group, autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ACTIVE, updateReasonCanStart, now) + case previousGroupsAreDone: + // All previous groups are OK, but time-related criterias are not OK + setGroupState(group, group.State, updateReasonCannotStart, now) + default: + // At least one previous group is not DONE + setGroupState(group, group.State, updateReasonPreviousGroupsNotDone, now) + } + previousGroupsAreDone = false + case autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ROLLEDBACK: + // The group has been manually rolled back. We don't touch anything and + // don't process the next groups. + previousGroupsAreDone = false + case autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_DONE: + // The group has already been updated, we can look at the next group + case autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ACTIVE: + // The group is currently being updated. We check if we can transition it to the done state + done, reason := isDoneHaltOnError(group, now) + + if done { + // We transition to the done state. We continue processing the groups as we might be able to start the next one. + setGroupState(group, autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_DONE, reason, now) + } else { + setGroupState(group, autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ACTIVE, reason, now) + } + previousGroupsAreDone = false + + default: + return trace.BadParameter("unknown autoupdate group state: %v", group.State) + } + } + return nil +} + +func canStartHaltOnError(group, previousGroup *autoupdate.AutoUpdateAgentRolloutStatusGroup, now time.Time) (bool, error) { + // check wait days + if group.ConfigWaitDays != 0 { + if previousGroup == nil { + return false, trace.BadParameter("The first group cannot have non-zero wait days") + } + + previousStart := previousGroup.StartTime.AsTime() + if previousStart.IsZero() || previousStart.Unix() == 0 { + return false, trace.BadParameter("The previous group doesn't have a start time, cannot check the 'wait_days' criteria") + } + + // Take the day of the previous group start, add 'wait_days' and truncate to midnight. + y, m, d := previousStart.AddDate(0, 0, int(group.ConfigWaitDays)).Date() + // Check if the wait_day criteria is OK, if we are at least after 'wait_days' since the previous start. + if now.Before(time.Date(y, m, d, 0, 0, 0, 0, previousStart.Location())) { + return false, nil + } + } + + return inWindow(group, now) +} + +func isDoneHaltOnError(group *autoupdate.AutoUpdateAgentRolloutStatusGroup, now time.Time) (bool, string) { + // Currently we don't implement status reporting from groups/agents. + // So we just wait 60 minutes and consider the maintenance done. + // This will change as we introduce agent status report and aggregated agent counts. + if group.StartTime.AsTime().Add(time.Hour).Before(now) { + return true, updateReasonUpdateComplete + } + return false, updateReasonUpdateInProgress +} diff --git a/lib/autoupdate/rollout/strategy_haltonerror_test.go b/lib/autoupdate/rollout/strategy_haltonerror_test.go new file mode 100644 index 0000000000000..91b8018228118 --- /dev/null +++ b/lib/autoupdate/rollout/strategy_haltonerror_test.go @@ -0,0 +1,464 @@ +package rollout + +import ( + "context" + "testing" + "time" + + "github.com/jonboulle/clockwork" + "github.com/stretchr/testify/require" + "google.golang.org/protobuf/types/known/timestamppb" + + "github.com/gravitational/teleport/api/gen/proto/go/teleport/autoupdate/v1" + "github.com/gravitational/teleport/lib/utils" +) + +func Test_canStartHaltOnError(t *testing.T) { + now := testSunday + yesterday := testSaturday + + tests := []struct { + name string + group *autoupdate.AutoUpdateAgentRolloutStatusGroup + previousGroup *autoupdate.AutoUpdateAgentRolloutStatusGroup + want bool + wantErr require.ErrorAssertionFunc + }{ + { + name: "first group, no wait_days", + group: &autoupdate.AutoUpdateAgentRolloutStatusGroup{ + Name: "test-group", + ConfigDays: everyWeekday, + ConfigStartHour: int32(now.Hour()), + ConfigWaitDays: 0, + }, + want: true, + wantErr: require.NoError, + }, + { + name: "first group, wait_days (invalid)", + group: &autoupdate.AutoUpdateAgentRolloutStatusGroup{ + Name: "test-group", + ConfigDays: everyWeekday, + ConfigStartHour: int32(now.Hour()), + ConfigWaitDays: 1, + }, + want: false, + wantErr: require.Error, + }, + { + name: "second group, no wait_days", + group: &autoupdate.AutoUpdateAgentRolloutStatusGroup{ + Name: "test-group", + ConfigDays: everyWeekday, + ConfigStartHour: int32(now.Hour()), + ConfigWaitDays: 0, + }, + previousGroup: &autoupdate.AutoUpdateAgentRolloutStatusGroup{ + Name: "previous-group", + StartTime: timestamppb.New(now), + State: autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_DONE, + ConfigDays: everyWeekday, + ConfigStartHour: int32(now.Hour()), + ConfigWaitDays: 0, + }, + want: true, + wantErr: require.NoError, + }, + { + name: "second group, wait_days not over", + group: &autoupdate.AutoUpdateAgentRolloutStatusGroup{ + Name: "test-group", + ConfigDays: everyWeekday, + ConfigStartHour: int32(now.Hour()), + ConfigWaitDays: 2, + }, + previousGroup: &autoupdate.AutoUpdateAgentRolloutStatusGroup{ + Name: "previous-group", + StartTime: timestamppb.New(yesterday), + State: autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_DONE, + ConfigDays: everyWeekday, + ConfigStartHour: int32(now.Hour()), + ConfigWaitDays: 0, + }, + want: false, + wantErr: require.NoError, + }, + { + name: "second group, wait_days over", + group: &autoupdate.AutoUpdateAgentRolloutStatusGroup{ + Name: "test-group", + ConfigDays: everyWeekday, + ConfigStartHour: int32(now.Hour()), + ConfigWaitDays: 1, + }, + previousGroup: &autoupdate.AutoUpdateAgentRolloutStatusGroup{ + Name: "previous-group", + StartTime: timestamppb.New(yesterday), + State: autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_DONE, + ConfigDays: everyWeekday, + ConfigStartHour: int32(now.Hour()), + ConfigWaitDays: 0, + }, + want: true, + wantErr: require.NoError, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got, err := canStartHaltOnError(tt.group, tt.previousGroup, now) + tt.wantErr(t, err) + require.Equal(t, tt.want, got) + }) + } +} + +func Test_progressGroupsHaltOnError(t *testing.T) { + clock := clockwork.NewFakeClockAt(testSunday) + log := utils.NewSlogLoggerForTests() + strategy, err := newHaltOnErrorStrategy(log, clock) + require.NoError(t, err) + + fewMinutesAgo := clock.Now().Add(-5 * time.Minute) + yesterday := testSaturday + canStartToday := everyWeekday + cannotStartToday := everyWeekdayButSunday + ctx := context.Background() + + group1Name := "group1" + group2Name := "group2" + group3Name := "group3" + + tests := []struct { + name string + initialState []*autoupdate.AutoUpdateAgentRolloutStatusGroup + expectedState []*autoupdate.AutoUpdateAgentRolloutStatusGroup + }{ + { + name: "single group unstarted -> unstarted", + initialState: []*autoupdate.AutoUpdateAgentRolloutStatusGroup{ + { + Name: group1Name, + State: autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_UNSTARTED, + LastUpdateTime: timestamppb.New(yesterday), + LastUpdateReason: updateReasonCreated, + ConfigDays: cannotStartToday, + ConfigStartHour: matchingStartHour, + }, + }, + expectedState: []*autoupdate.AutoUpdateAgentRolloutStatusGroup{ + { + Name: group1Name, + State: autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_UNSTARTED, + LastUpdateTime: timestamppb.New(clock.Now()), + LastUpdateReason: updateReasonCannotStart, + ConfigDays: cannotStartToday, + ConfigStartHour: matchingStartHour, + }, + }, + }, + { + name: "single group unstarted -> active", + initialState: []*autoupdate.AutoUpdateAgentRolloutStatusGroup{ + { + Name: group1Name, + State: autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_UNSTARTED, + LastUpdateTime: timestamppb.New(yesterday), + LastUpdateReason: updateReasonCreated, + ConfigDays: canStartToday, + ConfigStartHour: matchingStartHour, + }, + }, + expectedState: []*autoupdate.AutoUpdateAgentRolloutStatusGroup{ + { + Name: group1Name, + State: autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ACTIVE, + StartTime: timestamppb.New(clock.Now()), + LastUpdateTime: timestamppb.New(clock.Now()), + LastUpdateReason: updateReasonCanStart, + ConfigDays: canStartToday, + ConfigStartHour: matchingStartHour, + }, + }, + }, + { + name: "single group active -> active", + initialState: []*autoupdate.AutoUpdateAgentRolloutStatusGroup{ + { + Name: group1Name, + State: autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ACTIVE, + StartTime: timestamppb.New(fewMinutesAgo), + LastUpdateTime: timestamppb.New(fewMinutesAgo), + LastUpdateReason: updateReasonCanStart, + ConfigDays: canStartToday, + ConfigStartHour: matchingStartHour, + }, + }, + expectedState: []*autoupdate.AutoUpdateAgentRolloutStatusGroup{ + { + Name: group1Name, + State: autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ACTIVE, + StartTime: timestamppb.New(fewMinutesAgo), + LastUpdateTime: timestamppb.New(clock.Now()), + LastUpdateReason: updateReasonUpdateInProgress, + ConfigDays: canStartToday, + ConfigStartHour: matchingStartHour, + }, + }, + }, + { + name: "single group active -> done", + initialState: []*autoupdate.AutoUpdateAgentRolloutStatusGroup{ + { + Name: group1Name, + State: autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ACTIVE, + StartTime: timestamppb.New(yesterday), + LastUpdateTime: timestamppb.New(yesterday), + LastUpdateReason: updateReasonUpdateInProgress, + ConfigDays: canStartToday, + ConfigStartHour: matchingStartHour, + }, + }, + expectedState: []*autoupdate.AutoUpdateAgentRolloutStatusGroup{ + { + Name: group1Name, + State: autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_DONE, + StartTime: timestamppb.New(yesterday), + LastUpdateTime: timestamppb.New(clock.Now()), + LastUpdateReason: updateReasonUpdateComplete, + ConfigDays: canStartToday, + ConfigStartHour: matchingStartHour, + }, + }, + }, + { + name: "single group done -> done", + initialState: []*autoupdate.AutoUpdateAgentRolloutStatusGroup{ + { + Name: group1Name, + State: autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_DONE, + StartTime: timestamppb.New(yesterday), + LastUpdateTime: timestamppb.New(yesterday), + LastUpdateReason: updateReasonUpdateComplete, + ConfigDays: canStartToday, + ConfigStartHour: matchingStartHour, + }, + }, + expectedState: []*autoupdate.AutoUpdateAgentRolloutStatusGroup{ + { + Name: group1Name, + State: autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_DONE, + StartTime: timestamppb.New(yesterday), + LastUpdateTime: timestamppb.New(yesterday), + LastUpdateReason: updateReasonUpdateComplete, + ConfigDays: canStartToday, + ConfigStartHour: matchingStartHour, + }, + }, + }, + { + name: "single group rolledback -> rolledback", + initialState: []*autoupdate.AutoUpdateAgentRolloutStatusGroup{ + { + Name: group1Name, + State: autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ROLLEDBACK, + StartTime: timestamppb.New(yesterday), + LastUpdateTime: timestamppb.New(yesterday), + LastUpdateReason: "manual_rollback", + ConfigDays: canStartToday, + ConfigStartHour: matchingStartHour, + }, + }, + expectedState: []*autoupdate.AutoUpdateAgentRolloutStatusGroup{ + { + Name: group1Name, + State: autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ROLLEDBACK, + StartTime: timestamppb.New(yesterday), + LastUpdateTime: timestamppb.New(yesterday), + LastUpdateReason: "manual_rollback", + ConfigDays: canStartToday, + ConfigStartHour: matchingStartHour, + }, + }, + }, + { + name: "first group done, second should activate, third should not progress", + initialState: []*autoupdate.AutoUpdateAgentRolloutStatusGroup{ + { + Name: group1Name, + State: autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_DONE, + StartTime: timestamppb.New(yesterday), + LastUpdateTime: timestamppb.New(yesterday), + LastUpdateReason: updateReasonUpdateComplete, + ConfigDays: canStartToday, + ConfigStartHour: matchingStartHour, + }, + { + Name: group2Name, + State: autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_UNSTARTED, + LastUpdateTime: timestamppb.New(yesterday), + LastUpdateReason: updateReasonCreated, + ConfigDays: canStartToday, + ConfigStartHour: matchingStartHour, + ConfigWaitDays: 1, + }, + { + Name: group3Name, + State: autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_UNSTARTED, + LastUpdateTime: timestamppb.New(yesterday), + LastUpdateReason: updateReasonCreated, + ConfigDays: canStartToday, + ConfigStartHour: matchingStartHour, + ConfigWaitDays: 0, + }, + }, + expectedState: []*autoupdate.AutoUpdateAgentRolloutStatusGroup{ + { + Name: group1Name, + State: autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_DONE, + StartTime: timestamppb.New(yesterday), + LastUpdateTime: timestamppb.New(yesterday), + LastUpdateReason: updateReasonUpdateComplete, + ConfigDays: canStartToday, + ConfigStartHour: matchingStartHour, + }, + { + Name: group2Name, + State: autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ACTIVE, + StartTime: timestamppb.New(clock.Now()), + LastUpdateTime: timestamppb.New(clock.Now()), + LastUpdateReason: updateReasonCanStart, + ConfigDays: canStartToday, + ConfigStartHour: matchingStartHour, + ConfigWaitDays: 1, + }, + { + Name: group3Name, + State: autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_UNSTARTED, + LastUpdateTime: timestamppb.New(clock.Now()), + LastUpdateReason: updateReasonPreviousGroupsNotDone, + ConfigDays: canStartToday, + ConfigStartHour: matchingStartHour, + ConfigWaitDays: 0, + }, + }, + }, + { + name: "first group rolledback, second should not start", + initialState: []*autoupdate.AutoUpdateAgentRolloutStatusGroup{ + { + Name: group1Name, + State: autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ROLLEDBACK, + StartTime: timestamppb.New(yesterday), + LastUpdateTime: timestamppb.New(yesterday), + LastUpdateReason: "manual_rollback", + ConfigDays: canStartToday, + ConfigStartHour: matchingStartHour, + }, + { + Name: group2Name, + State: autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_UNSTARTED, + LastUpdateTime: timestamppb.New(yesterday), + LastUpdateReason: updateReasonCreated, + ConfigDays: canStartToday, + ConfigStartHour: matchingStartHour, + ConfigWaitDays: 1, + }, + }, + expectedState: []*autoupdate.AutoUpdateAgentRolloutStatusGroup{ + { + Name: group1Name, + State: autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ROLLEDBACK, + StartTime: timestamppb.New(yesterday), + LastUpdateTime: timestamppb.New(yesterday), + LastUpdateReason: "manual_rollback", + ConfigDays: canStartToday, + ConfigStartHour: matchingStartHour, + }, + { + Name: group2Name, + State: autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_UNSTARTED, + LastUpdateTime: timestamppb.New(clock.Now()), + LastUpdateReason: updateReasonPreviousGroupsNotDone, + ConfigDays: canStartToday, + ConfigStartHour: matchingStartHour, + ConfigWaitDays: 1, + }, + }, + }, + { + name: "first group rolledback, second is active and should become done, third should not progress", + initialState: []*autoupdate.AutoUpdateAgentRolloutStatusGroup{ + { + Name: group1Name, + State: autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ROLLEDBACK, + StartTime: timestamppb.New(yesterday), + LastUpdateTime: timestamppb.New(yesterday), + LastUpdateReason: "manual_rollback", + ConfigDays: canStartToday, + ConfigStartHour: matchingStartHour, + }, + { + Name: group2Name, + State: autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ACTIVE, + StartTime: timestamppb.New(yesterday), + LastUpdateTime: timestamppb.New(yesterday), + LastUpdateReason: updateReasonCanStart, + ConfigDays: canStartToday, + ConfigStartHour: matchingStartHour, + ConfigWaitDays: 0, + }, + { + Name: group3Name, + State: autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_UNSTARTED, + LastUpdateTime: timestamppb.New(yesterday), + LastUpdateReason: updateReasonCreated, + ConfigDays: canStartToday, + ConfigStartHour: matchingStartHour, + ConfigWaitDays: 0, + }, + }, + expectedState: []*autoupdate.AutoUpdateAgentRolloutStatusGroup{ + { + Name: group1Name, + State: autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ROLLEDBACK, + StartTime: timestamppb.New(yesterday), + LastUpdateTime: timestamppb.New(yesterday), + LastUpdateReason: "manual_rollback", + ConfigDays: canStartToday, + ConfigStartHour: matchingStartHour, + }, + { + Name: group2Name, + State: autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_DONE, + StartTime: timestamppb.New(yesterday), + LastUpdateTime: timestamppb.New(clock.Now()), + LastUpdateReason: updateReasonUpdateComplete, + ConfigDays: canStartToday, + ConfigStartHour: matchingStartHour, + ConfigWaitDays: 0, + }, + { + Name: group3Name, + State: autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_UNSTARTED, + LastUpdateTime: timestamppb.New(clock.Now()), + LastUpdateReason: updateReasonPreviousGroupsNotDone, + ConfigDays: canStartToday, + ConfigStartHour: matchingStartHour, + ConfigWaitDays: 0, + }, + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := strategy.progressRollout(ctx, tt.initialState) + require.NoError(t, err) + // We use require.Equal instead of Elements match because group order matters. + // It's not super important for time-based, but is crucial for halt-on-error. + // So it's better to be more conservative and validate order never changes for + // both strategies. + require.Equal(t, tt.expectedState, tt.initialState) + }) + } +}