From ee637261e86258e3e86ac90140a883c2deb68b2a Mon Sep 17 00:00:00 2001 From: mprahl Date: Fri, 1 Dec 2023 12:43:45 -0500 Subject: [PATCH] Add retries to some templating related failures In particular, after the replicated policy controller was created, multiple replicated policies for the same cluster can now be processed at the same time. This led to race conditions of each goroutine trying to create the policy encryption key for the first time. This adds more retries to make hub templates more resilient. Relates: https://issues.redhat.com/browse/ACM-8744 Signed-off-by: mprahl (cherry picked from commit 49da24d85788e1aaac37ae7943807ae55257c853) --- controllers/propagator/encryption.go | 9 ++++++++- controllers/propagator/propagation.go | 17 +++++++++++++++-- .../propagator/replicatedpolicy_controller.go | 13 ++++++++----- 3 files changed, 31 insertions(+), 8 deletions(-) diff --git a/controllers/propagator/encryption.go b/controllers/propagator/encryption.go index 6551fdeb..51a5c9c6 100644 --- a/controllers/propagator/encryption.go +++ b/controllers/propagator/encryption.go @@ -64,7 +64,14 @@ func (r *Propagator) getEncryptionKey(ctx context.Context, clusterName string) ( } err = r.Create(ctx, encryptionSecret) - if err != nil { + if k8serrors.IsAlreadyExists(err) { + // Some kind of race condition occurred (e.g. cache not updated in time), so just refetch the encryption + // secret. + err := r.Get(ctx, objectKey, encryptionSecret) + if err != nil { + return nil, fmt.Errorf("failed to get the Secret %s/%s: %w", clusterName, EncryptionKeySecret, err) + } + } else if err != nil { return nil, fmt.Errorf("failed to create the Secret %s/%s: %w", clusterName, EncryptionKeySecret, err) } } else if err != nil { diff --git a/controllers/propagator/propagation.go b/controllers/propagator/propagation.go index 05015105..307b2a3a 100644 --- a/controllers/propagator/propagation.go +++ b/controllers/propagator/propagation.go @@ -5,6 +5,7 @@ package propagator import ( "context" + "errors" "fmt" "strconv" "strings" @@ -34,6 +35,8 @@ const ( TriggerUpdateAnnotation = "policy.open-cluster-management.io/trigger-update" ) +var ErrRetryable = errors.New("") + type Propagator struct { client.Client Scheme *runtime.Scheme @@ -297,7 +300,7 @@ func (r *ReplicatedPolicyReconciler) processTemplates( if err != nil { log.Error(err, "Failed to get/generate the policy encryption key") - return err + return fmt.Errorf("%w%w", ErrRetryable, err) } // Get/generate the initialization vector @@ -366,6 +369,16 @@ func (r *ReplicatedPolicyReconciler) processTemplates( } } + // If the failure was due to a Kubernetes API error that could be recoverable, let's retry it. + // Missing objects are handled by the templating library sending reconcile requests when they get created. + if errors.Is(tplErr, templates.ErrMissingAPIResource) || + k8serrors.IsInternalError(tplErr) || + k8serrors.IsServiceUnavailable(tplErr) || + k8serrors.IsTimeout(tplErr) || + k8serrors.IsTooManyRequests(tplErr) { + tplErr = fmt.Errorf("%w%w", ErrRetryable, tplErr) + } + return tplErr } @@ -405,7 +418,7 @@ func (r *ReplicatedPolicyReconciler) processTemplates( if templateResult.CacheCleanUp != nil { err := templateResult.CacheCleanUp() if err != nil { - return err + return fmt.Errorf("%w%w", ErrRetryable, err) } } diff --git a/controllers/propagator/replicatedpolicy_controller.go b/controllers/propagator/replicatedpolicy_controller.go index e575a299..e9a6b3ca 100644 --- a/controllers/propagator/replicatedpolicy_controller.go +++ b/controllers/propagator/replicatedpolicy_controller.go @@ -2,6 +2,7 @@ package propagator import ( "context" + "errors" "fmt" "strings" "sync" @@ -236,11 +237,13 @@ func (r *ReplicatedPolicyReconciler) Reconcile(ctx context.Context, request ctrl } } - // resolve hubTemplate before replicating - // #nosec G104 -- any errors are logged and recorded in the processTemplates method, - // but the ignored status will be handled appropriately by the policy controllers on - // the managed cluster(s). - _ = r.processTemplates(ctx, desiredReplicatedPolicy, decision.Cluster, rootPolicy) + // Any errors to expose to the user are logged and recorded in the processTemplates method. Only retry + // the request if it's determined to be a retryable error (i.e. don't retry syntax errors). + err := r.processTemplates(ctx, desiredReplicatedPolicy, decision.Cluster, rootPolicy) + if errors.Is(err, ErrRetryable) { + // Return the error if it's retryable, which will utilize controller-runtime's exponential backoff. + return reconcile.Result{}, err + } } else { watcherErr := r.TemplateResolver.UncacheWatcher(instanceObjID) if watcherErr != nil {