diff --git a/controllers/propagator/encryption.go b/controllers/propagator/encryption.go
index 6551fdeb..51a5c9c6 100644
--- a/controllers/propagator/encryption.go
+++ b/controllers/propagator/encryption.go
@@ -64,7 +64,14 @@ func (r *Propagator) getEncryptionKey(ctx context.Context, clusterName string) (
 		}
 
 		err = r.Create(ctx, encryptionSecret)
-		if err != nil {
+		if k8serrors.IsAlreadyExists(err) {
+			// Some kind of race condition occurred (e.g. cache not updated in time), so just refetch the encryption
+			// secret.
+			err := r.Get(ctx, objectKey, encryptionSecret)
+			if err != nil {
+				return nil, fmt.Errorf("failed to get the Secret %s/%s: %w", clusterName, EncryptionKeySecret, err)
+			}
+		} else if err != nil {
 			return nil, fmt.Errorf("failed to create the Secret %s/%s: %w", clusterName, EncryptionKeySecret, err)
 		}
 	} else if err != nil {
diff --git a/controllers/propagator/propagation.go b/controllers/propagator/propagation.go
index 05015105..307b2a3a 100644
--- a/controllers/propagator/propagation.go
+++ b/controllers/propagator/propagation.go
@@ -5,6 +5,7 @@ package propagator
 
 import (
 	"context"
+	"errors"
 	"fmt"
 	"strconv"
 	"strings"
@@ -34,6 +35,8 @@ const (
 	TriggerUpdateAnnotation = "policy.open-cluster-management.io/trigger-update"
 )
 
+var ErrRetryable = errors.New("")
+
 type Propagator struct {
 	client.Client
 	Scheme *runtime.Scheme
@@ -297,7 +300,7 @@ func (r *ReplicatedPolicyReconciler) processTemplates(
 	if err != nil {
 		log.Error(err, "Failed to get/generate the policy encryption key")
 
-		return err
+		return fmt.Errorf("%w%w", ErrRetryable, err)
 	}
 
 	// Get/generate the initialization vector
@@ -366,6 +369,16 @@ func (r *ReplicatedPolicyReconciler) processTemplates(
 		}
 	}
 
+	// If the failure was due to a Kubernetes API error that could be recoverable, let's retry it.
+	// Missing objects are handled by the templating library sending reconcile requests when they get created.
+	if errors.Is(tplErr, templates.ErrMissingAPIResource) ||
+		k8serrors.IsInternalError(tplErr) ||
+		k8serrors.IsServiceUnavailable(tplErr) ||
+		k8serrors.IsTimeout(tplErr) ||
+		k8serrors.IsTooManyRequests(tplErr) {
+		tplErr = fmt.Errorf("%w%w", ErrRetryable, tplErr)
+	}
+
 	return tplErr
 }
 
@@ -405,7 +418,7 @@ func (r *ReplicatedPolicyReconciler) processTemplates(
 	if templateResult.CacheCleanUp != nil {
 		err := templateResult.CacheCleanUp()
 		if err != nil {
-			return err
+			return fmt.Errorf("%w%w", ErrRetryable, err)
 		}
 	}
 
diff --git a/controllers/propagator/replicatedpolicy_controller.go b/controllers/propagator/replicatedpolicy_controller.go
index e575a299..e9a6b3ca 100644
--- a/controllers/propagator/replicatedpolicy_controller.go
+++ b/controllers/propagator/replicatedpolicy_controller.go
@@ -2,6 +2,7 @@ package propagator
 
 import (
 	"context"
+	"errors"
 	"fmt"
 	"strings"
 	"sync"
@@ -236,11 +237,13 @@ func (r *ReplicatedPolicyReconciler) Reconcile(ctx context.Context, request ctrl
 			}
 		}
 
-		// resolve hubTemplate before replicating
-		// #nosec G104 -- any errors are logged and recorded in the processTemplates method,
-		// but the ignored status will be handled appropriately by the policy controllers on
-		// the managed cluster(s).
-		_ = r.processTemplates(ctx, desiredReplicatedPolicy, decision.Cluster, rootPolicy)
+		// Any errors to expose to the user are logged and recorded in the processTemplates method. Only retry
+		// the request if it's determined to be a retryable error (i.e. don't retry syntax errors).
+		err := r.processTemplates(ctx, desiredReplicatedPolicy, decision.Cluster, rootPolicy)
+		if errors.Is(err, ErrRetryable) {
+			// Return the error if it's retryable, which will utilize controller-runtime's exponential backoff.
+			return reconcile.Result{}, err
+		}
 	} else {
 		watcherErr := r.TemplateResolver.UncacheWatcher(instanceObjID)
 		if watcherErr != nil {
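Note on the wrapping pattern used above: ErrRetryable is a sentinel error with an empty message, so prefixing it with fmt.Errorf("%w%w", ErrRetryable, err) (multiple %w verbs require Go 1.20+) marks an error as retryable without changing the user-facing message, and the reconciler detects it with errors.Is. A minimal standalone sketch of that pattern follows; doWork is a hypothetical stand-in for processTemplates and is not part of the diff:

package main

import (
	"errors"
	"fmt"
)

// Sentinel with an empty message, mirroring ErrRetryable in propagation.go.
var ErrRetryable = errors.New("")

// doWork is a hypothetical stand-in for processTemplates.
func doWork() error {
	apiErr := errors.New("the server is currently unable to handle the request")

	// Wrapping with two %w verbs keeps apiErr in the error chain and adds the
	// retryable marker without altering the printed message.
	return fmt.Errorf("%w%w", ErrRetryable, apiErr)
}

func main() {
	err := doWork()

	// errors.Is walks the wrapped chain, so callers can decide whether to requeue.
	if errors.Is(err, ErrRetryable) {
		fmt.Println("retryable:", err) // prints the original message unchanged
	}
}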