From 6981f888341ec0dd0cc8bfd1475cb494475f603b Mon Sep 17 00:00:00 2001
From: Wantong Jiang
Date: Thu, 9 Jan 2025 00:44:10 +0000
Subject: [PATCH] fix policyObservedClusterCount

---
 .../updaterun/execution_integration_test.go | 3 +-
 pkg/controllers/updaterun/initialization.go | 10 ++++
 .../initialization_integration_test.go | 48 +++++++++++++++++++
 .../updaterun/validation_integration_test.go | 7 +--
 4 files changed, 64 insertions(+), 4 deletions(-)

diff --git a/pkg/controllers/updaterun/execution_integration_test.go b/pkg/controllers/updaterun/execution_integration_test.go
index aa09a5aed..eb0fee47a 100644
--- a/pkg/controllers/updaterun/execution_integration_test.go
+++ b/pkg/controllers/updaterun/execution_integration_test.go
@@ -283,7 +283,8 @@ var _ = Describe("UpdateRun execution tests", func() {
 waitEndTime := meta.FindStatusCondition(updateRun.Status.StagesStatus[0].AfterStageTaskStatus[0].Conditions, string(placementv1beta1.AfterStageTaskConditionWaitTimeElapsed)).LastTransitionTime.Time
 // In this test, I set wait time to be 4 seconds, while stageClusterUpdatingWaitTime is 3 seconds.
 // So it needs 2 rounds of reconcile to wait for the waitTime to elapse, waitEndTime - waitStartTime should be around 6 seconds.
- Expect(waitStartTime.Add(updateStrategy.Spec.Stages[0].AfterStageTasks[0].WaitTime.Duration).Before(waitEndTime)).Should(BeTrue())
+ Expect(waitStartTime.Add(updateStrategy.Spec.Stages[0].AfterStageTasks[0].WaitTime.Duration).Before(waitEndTime)).Should(BeTrue(),
+ fmt.Sprintf("waitEndTime %v did not exceed waitStartTime %v by at least the wait time %v", waitEndTime, waitStartTime, updateStrategy.Spec.Stages[0].AfterStageTasks[0].WaitTime.Duration))
 })
 It("Should mark the 1st cluster in the 2nd stage as succeeded after marking the binding available", func() {
diff --git a/pkg/controllers/updaterun/initialization.go b/pkg/controllers/updaterun/initialization.go
index 7c60d1c89..c199887b8 100644
--- a/pkg/controllers/updaterun/initialization.go
+++ b/pkg/controllers/updaterun/initialization.go
@@ -199,6 +199,16 @@ func (r *Reconciler) collectScheduledClusters(
 // no more retries here.
 return nil, nil, fmt.Errorf("%w: %s", errInitializedFailed, nobindingErr.Error())
 }
+
+ if updateRun.Status.PolicyObservedClusterCount == -1 {
+ // For the pickAll policy, the observed cluster count is not included in the policy snapshot. We set it to the number of selected bindings.
+ updateRun.Status.PolicyObservedClusterCount = len(selectedBindings)
+ } else if updateRun.Status.PolicyObservedClusterCount != len(selectedBindings) {
+ countErr := controller.NewUnexpectedBehaviorError(fmt.Errorf("the number of selected bindings %d is not equal to the observed cluster count %d", len(selectedBindings), updateRun.Status.PolicyObservedClusterCount))
+ klog.ErrorS(countErr, "Failed to collect clusterResourceBindings", "clusterResourcePlacement", placementName, "latestPolicySnapshot", latestPolicySnapshot.Name, "clusterStagedUpdateRun", updateRunRef)
+ // no more retries here.
+ return nil, nil, fmt.Errorf("%w: %s", errInitializedFailed, countErr.Error())
+ }
 return selectedBindings, toBeDeletedBindings, nil
}
diff --git a/pkg/controllers/updaterun/initialization_integration_test.go b/pkg/controllers/updaterun/initialization_integration_test.go
index 3befa65e0..9c1834ba7 100644
--- a/pkg/controllers/updaterun/initialization_integration_test.go
+++ b/pkg/controllers/updaterun/initialization_integration_test.go
@@ -379,6 +379,17 @@ var _ = Describe("Updaterun initialization tests", func() {
 })
 It("Should not report error if there are only to-be-deleted clusters", func() {
+ By("Updating the policy snapshot to use pickAll policy to avoid the binding count check")
+ policySnapshot.Spec.Policy.PlacementType = placementv1beta1.PickAllPlacementType
+ Expect(k8sClient.Update(ctx, policySnapshot)).To(Succeed())
+ meta.SetStatusCondition(&policySnapshot.Status.Conditions, metav1.Condition{
+ Type: string(placementv1beta1.PolicySnapshotScheduled),
+ Status: metav1.ConditionTrue,
+ ObservedGeneration: policySnapshot.Generation,
+ Reason: "scheduled",
+ })
+ Expect(k8sClient.Status().Update(ctx, policySnapshot)).Should(Succeed())
+
 By("Creating a to-be-deleted clusterResourceBinding")
 binding := generateTestClusterResourceBinding(policySnapshot.Name+"a", "cluster-0", placementv1beta1.BindingStateUnscheduled)
 Expect(k8sClient.Create(ctx, binding)).To(Succeed())
@@ -391,6 +402,43 @@ var _ = Describe("Updaterun initialization tests", func() {
 validateFailedInitCondition(ctx, updateRun, "referenced clusterStagedUpdateStrategy not found")
 })
+ It("Should fail to initialize if the number of selected bindings does not match the observed cluster count", func() {
+ By("Creating only one scheduled clusterResourceBinding")
+ Expect(k8sClient.Create(ctx, resourceBindings[0])).To(Succeed())
+
+ By("Creating a new clusterStagedUpdateRun")
+ Expect(k8sClient.Create(ctx, updateRun)).To(Succeed())
+
+ By("Validating the initialization failed")
+ validateFailedInitCondition(ctx, updateRun, "the number of selected bindings 1 is not equal to the observed cluster count 10")
+ })
+
+ It("Should update the ObservedClusterCount to the number of scheduled bindings if it's a pickAll policy", func() {
+ By("Updating the policy snapshot to use pickAll policy")
+ policySnapshot.Spec.Policy.PlacementType = placementv1beta1.PickAllPlacementType
+ Expect(k8sClient.Update(ctx, policySnapshot)).To(Succeed())
+ meta.SetStatusCondition(&policySnapshot.Status.Conditions, metav1.Condition{
+ Type: string(placementv1beta1.PolicySnapshotScheduled),
+ Status: metav1.ConditionTrue,
+ ObservedGeneration: policySnapshot.Generation,
+ Reason: "scheduled",
+ })
+ Expect(k8sClient.Status().Update(ctx, policySnapshot)).Should(Succeed())
+
+ By("Creating only one scheduled clusterResourceBinding")
+ Expect(k8sClient.Create(ctx, resourceBindings[0])).To(Succeed())
+
+ By("Creating a new clusterStagedUpdateRun")
+ Expect(k8sClient.Create(ctx, updateRun)).To(Succeed())
+
+ By("Validating the initialization did not fail due to no selected clusters")
+ // It should fail only because the referenced strategy is not found.
+ validateFailedInitCondition(ctx, updateRun, "referenced clusterStagedUpdateStrategy not found")
+
+ By("Validating the ObservedClusterCount is updated")
+ Expect(updateRun.Status.PolicyObservedClusterCount).To(Equal(1), "failed to update the updateRun PolicyObservedClusterCount status")
+ })
+
 It("Should fail to initialize if the bindings with latest policy snapshots are not in Scheduled or Bound state", func() {
 By("Creating a not scheduled clusterResourceBinding")
clusterResourceBinding") binding := generateTestClusterResourceBinding(policySnapshot.Name, "cluster-1", placementv1beta1.BindingStateUnscheduled) diff --git a/pkg/controllers/updaterun/validation_integration_test.go b/pkg/controllers/updaterun/validation_integration_test.go index 161eeaf76..abef28aea 100644 --- a/pkg/controllers/updaterun/validation_integration_test.go +++ b/pkg/controllers/updaterun/validation_integration_test.go @@ -333,12 +333,13 @@ var _ = Describe("UpdateRun validation tests", func() { }) It("Should fail to validate if the number of clusters has changed in a stage", func() { - By("Deleting a cluster resource binding") - Expect(k8sClient.Delete(ctx, resourceBindings[0])).Should(Succeed()) + By("Changing 1st cluster's so that it's selected by the 1st stage") + targetClusters[0].Labels["region"] = regionEastus + Expect(k8sClient.Update(ctx, targetClusters[0])).Should(Succeed()) By("Validating the validation failed") wantStatus = generateFailedValidationStatus(updateRun, wantStatus) - validateClusterStagedUpdateRunStatus(ctx, updateRun, wantStatus, "the number of clusters in index `1` stage has changed") + validateClusterStagedUpdateRunStatus(ctx, updateRun, wantStatus, "the number of clusters in index `0` stage has changed") }) It("Should fail to validate if the cluster name has changed in a stage", func() {