From e97b9956fec0c9305403557dc90c8767c0878a60 Mon Sep 17 00:00:00 2001 From: Andrew Lavery <laverya@umich.edu> Date: Mon, 7 Oct 2024 23:13:44 +0200 Subject: [PATCH] record events in the installation object (#1292) * record events in the installation object * properly set k0s upgrade state * add 'events' permissions, only send events on change * f * fix checks... --- ...embedded-cluster-operator-clusterrole.yaml | 7 ++ .../controllers/installation_controller.go | 34 +++++++--- operator/pkg/charts/detection.go | 6 +- operator/pkg/charts/helm_test.go | 18 ++--- operator/pkg/charts/reconcile.go | 68 +++++++++++++------ operator/pkg/charts/reconcile_test.go | 6 +- operator/pkg/cli/root.go | 1 + operator/pkg/upgrade/upgrade.go | 2 +- 8 files changed, 97 insertions(+), 45 deletions(-) diff --git a/operator/charts/embedded-cluster-operator/templates/embedded-cluster-operator-clusterrole.yaml b/operator/charts/embedded-cluster-operator/templates/embedded-cluster-operator-clusterrole.yaml index f33b61e41..f8f867a98 100644 --- a/operator/charts/embedded-cluster-operator/templates/embedded-cluster-operator-clusterrole.yaml +++ b/operator/charts/embedded-cluster-operator/templates/embedded-cluster-operator-clusterrole.yaml @@ -69,6 +69,13 @@ rules: - patch - update - watch +- apiGroups: + - "" + resources: + - events + verbs: + - patch + - create - apiGroups: - embeddedcluster.replicated.com resources: diff --git a/operator/controllers/installation_controller.go b/operator/controllers/installation_controller.go index 4b7d2e7ee..32b451b8d 100644 --- a/operator/controllers/installation_controller.go +++ b/operator/controllers/installation_controller.go @@ -36,6 +36,7 @@ import ( "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/discovery" + "k8s.io/client-go/tools/record" "k8s.io/utils/ptr" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" @@ -140,6 +141,7 @@ type InstallationReconciler struct { client.Client Discovery discovery.DiscoveryInterface Scheme *runtime.Scheme + Recorder record.EventRecorder } // NodeHasChanged returns true if the node configuration has changed when compared to @@ -205,9 +207,11 @@ func (r *InstallationReconciler) ReconcileNodeStatuses(ctx context.Context, in * return nil, fmt.Errorf("failed to update node status: %w", err) } if isnew { + r.Recorder.Eventf(in, corev1.EventTypeNormal, "NodeAdded", "Node %s has been added", node.Name) batch.NodesAdded = append(batch.NodesAdded, event) continue } + r.Recorder.Eventf(in, corev1.EventTypeNormal, "NodeUpdated", "Node %s has been updated", node.Name) batch.NodesUpdated = append(batch.NodesUpdated, event) } trimmed := []v1beta1.NodeStatus{} @@ -219,6 +223,7 @@ func (r *InstallationReconciler) ReconcileNodeStatuses(ctx context.Context, in * rmevent := metrics.NodeRemovedEvent{ ClusterID: in.Spec.ClusterID, NodeName: nodeStatus.Name, } + r.Recorder.Eventf(in, corev1.EventTypeNormal, "NodeRemoved", "Node %s has been removed", nodeStatus.Name) batch.NodesRemoved = append(batch.NodesRemoved, rmevent) } sort.SliceStable(trimmed, func(i, j int) bool { return trimmed[i].Name < trimmed[j].Name }) @@ -379,6 +384,7 @@ func (r *InstallationReconciler) ReconcileK0sVersion(ctx context.Context, in *v1 return fmt.Errorf("failed to determine if k0s should be upgraded: %w", err) } if shouldUpgrade { + r.Recorder.Eventf(in, corev1.EventTypeNormal, "K0sUpgrade", "Upgrading k0s to %s", desiredVersion) log.Info("Starting k0s autopilot upgrade plan", "version", desiredVersion) // there is no autopilot plan in the cluster so we are free to @@ -408,7 +414,7 @@ func (r *InstallationReconciler) ReconcileK0sVersion(ctx context.Context, in *v1 // actually upgrades the k0s version. we need to make sure that this is the second plan // before setting the installation state to the plan state. if isAutopilotUpgradeToVersion(&plan, desiredVersion) { - r.SetStateBasedOnPlan(in, plan) + r.SetStateBasedOnPlan(in, plan, desiredVersion) return nil } } @@ -582,7 +588,7 @@ func (r *InstallationReconciler) ReconcileHAStatus(ctx context.Context, in *v1be // SetStateBasedOnPlan sets the installation state based on the Plan state. For now we do not // report anything fancy but we should consider reporting here a summary of how many nodes // have been upgraded and how many are still pending. -func (r *InstallationReconciler) SetStateBasedOnPlan(in *v1beta1.Installation, plan apv1b2.Plan) { +func (r *InstallationReconciler) SetStateBasedOnPlan(in *v1beta1.Installation, plan apv1b2.Plan, desiredVersion string) { reason := autopilot.ReasonForState(plan) switch plan.Status.State { case "": @@ -598,14 +604,17 @@ func (r *InstallationReconciler) SetStateBasedOnPlan(in *v1beta1.Installation, p case apcore.PlanMissingSignalNode: fallthrough case apcore.PlanApplyFailed: + r.Recorder.Eventf(in, corev1.EventTypeNormal, "K0sUpgradeFailed", "Upgrade of k0s to %s failed (%q)", desiredVersion, plan.Status.State) in.Status.SetState(v1beta1.InstallationStateFailed, reason, nil) case apcore.PlanSchedulable: fallthrough case apcore.PlanSchedulableWait: in.Status.SetState(v1beta1.InstallationStateInstalling, reason, nil) case apcore.PlanCompleted: + r.Recorder.Eventf(in, corev1.EventTypeNormal, "K0sUpgradeComplete", "Upgrade of k0s to %s completed", desiredVersion) in.Status.SetState(v1beta1.InstallationStateKubernetesInstalled, reason, nil) default: + r.Recorder.Eventf(in, corev1.EventTypeNormal, "K0sUpgradeUnknownState", "Upgrade of k0s to %s has an unknown state %q", desiredVersion, plan.Status.State) in.Status.SetState(v1beta1.InstallationStateFailed, reason, nil) } } @@ -660,12 +669,15 @@ func (r *InstallationReconciler) DisableOldInstallations(ctx context.Context, it }) for _, in := range items[1:] { in.Status.NodesStatus = nil - in.Status.SetState( - v1beta1.InstallationStateObsolete, - "This is not the most recent installation object", - nil, - ) - r.Status().Update(ctx, &in) + if in.Status.State != v1beta1.InstallationStateObsolete { + r.Recorder.Eventf(&in, corev1.EventTypeNormal, "Obsolete", "This has been obsoleted by a newer installation object") + in.Status.SetState( + v1beta1.InstallationStateObsolete, + "This is not the most recent installation object", + nil, + ) + r.Status().Update(ctx, &in) + } } } @@ -841,9 +853,13 @@ func (r *InstallationReconciler) Reconcile(ctx context.Context, req ctrl.Request // reconcile the add-ons (k0s helm extensions). log.Info("Reconciling addons") - if err := charts.ReconcileHelmCharts(ctx, r.Client, in); err != nil { + ev, err := charts.ReconcileHelmCharts(ctx, r.Client, in) + if err != nil { return ctrl.Result{}, fmt.Errorf("failed to reconcile helm charts: %w", err) } + if ev != nil { + r.Recorder.Event(in, corev1.EventTypeNormal, ev.Reason, ev.Message) + } if err := r.ReconcileHAStatus(ctx, in); err != nil { return ctrl.Result{}, fmt.Errorf("failed to reconcile HA status: %w", err) diff --git a/operator/pkg/charts/detection.go b/operator/pkg/charts/detection.go index 31f1b3a70..99c9427d8 100644 --- a/operator/pkg/charts/detection.go +++ b/operator/pkg/charts/detection.go @@ -84,9 +84,9 @@ func YamlDiff(a, b string) (bool, error) { } // check if all charts in the combinedConfigs are installed successfully with the desired version and values -func DetectChartCompletion(existingHelm *v1beta1.HelmExtensions, installedCharts v1beta2.ChartList) ([]string, []string, error) { +func DetectChartCompletion(existingHelm *v1beta1.HelmExtensions, installedCharts v1beta2.ChartList) ([]string, map[string]string, error) { incompleteCharts := []string{} - chartErrors := []string{} + chartErrors := map[string]string{} if existingHelm == nil { return incompleteCharts, chartErrors, nil } @@ -115,7 +115,7 @@ func DetectChartCompletion(existingHelm *v1beta1.HelmExtensions, installedCharts } if installedChart.Status.Error != "" { - chartErrors = append(chartErrors, installedChart.Status.Error) + chartErrors[chart.Name] = installedChart.Status.Error diffDetected = false } diff --git a/operator/pkg/charts/helm_test.go b/operator/pkg/charts/helm_test.go index ef23020dd..9bfd344eb 100644 --- a/operator/pkg/charts/helm_test.go +++ b/operator/pkg/charts/helm_test.go @@ -16,7 +16,7 @@ func Test_detectChartCompletion(t *testing.T) { tests := []struct { name string args args - wantChartErrors []string + wantChartErrors map[string]string wantIncompleteCharts []string }{ { @@ -58,7 +58,7 @@ func Test_detectChartCompletion(t *testing.T) { }, }, wantIncompleteCharts: []string{}, - wantChartErrors: []string{}, + wantChartErrors: map[string]string{}, }, { name: "new chart", @@ -90,7 +90,7 @@ func Test_detectChartCompletion(t *testing.T) { }, }, wantIncompleteCharts: []string{"test2"}, - wantChartErrors: []string{}, + wantChartErrors: map[string]string{}, }, { name: "removed chart", @@ -126,7 +126,7 @@ func Test_detectChartCompletion(t *testing.T) { }, }, wantIncompleteCharts: []string{}, - wantChartErrors: []string{}, + wantChartErrors: map[string]string{}, }, { name: "added and removed chart", @@ -154,7 +154,7 @@ func Test_detectChartCompletion(t *testing.T) { }, }, wantIncompleteCharts: []string{"test2"}, - wantChartErrors: []string{}, + wantChartErrors: map[string]string{}, }, { name: "no drift, but error", @@ -195,7 +195,7 @@ func Test_detectChartCompletion(t *testing.T) { }, }, wantIncompleteCharts: []string{}, - wantChartErrors: []string{"test chart error", "test chart two error"}, + wantChartErrors: map[string]string{"test": "test chart error", "test2": "test chart two error"}, }, { name: "drift and error", @@ -236,7 +236,7 @@ func Test_detectChartCompletion(t *testing.T) { }, }, wantIncompleteCharts: []string{}, - wantChartErrors: []string{"test chart error", "test chart two error"}, + wantChartErrors: map[string]string{"test": "test chart error", "test2": "test chart two error"}, }, { name: "drift values", @@ -270,7 +270,7 @@ func Test_detectChartCompletion(t *testing.T) { }, }, wantIncompleteCharts: []string{"test"}, - wantChartErrors: []string{}, + wantChartErrors: map[string]string{}, }, { name: "values hash differs", @@ -304,7 +304,7 @@ func Test_detectChartCompletion(t *testing.T) { }, }, wantIncompleteCharts: []string{"test"}, - wantChartErrors: []string{}, + wantChartErrors: map[string]string{}, }, } for _, tt := range tests { diff --git a/operator/pkg/charts/reconcile.go b/operator/pkg/charts/reconcile.go index b15036c6f..8ebe60f1e 100644 --- a/operator/pkg/charts/reconcile.go +++ b/operator/pkg/charts/reconcile.go @@ -3,6 +3,7 @@ package charts import ( "context" "fmt" + "sort" "strings" v1beta3 "github.com/k0sproject/k0s/pkg/apis/helm/v1beta1" @@ -13,13 +14,18 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" ) +type RecordedEvent struct { + Reason string + Message string +} + // ReconcileHelmCharts reconciles the helm charts from the Installation metadata with the clusterconfig object. -func ReconcileHelmCharts(ctx context.Context, cli client.Client, in *v1beta1.Installation) error { +func ReconcileHelmCharts(ctx context.Context, cli client.Client, in *v1beta1.Installation) (*RecordedEvent, error) { if in.Spec.Config == nil || in.Spec.Config.Version == "" { if in.Status.State == v1beta1.InstallationStateKubernetesInstalled { in.Status.SetState(v1beta1.InstallationStateInstalled, "Installed", nil) } - return nil + return nil, nil } log := controllerruntime.LoggerFrom(ctx) @@ -27,13 +33,13 @@ func ReconcileHelmCharts(ctx context.Context, cli client.Client, in *v1beta1.Ins if in.Status.State == v1beta1.InstallationStateFailed || !in.Status.GetKubernetesInstalled() { log.Info("Skipping helm chart reconciliation", "state", in.Status.State) - return nil + return nil, nil } meta, err := release.MetadataFor(ctx, in, cli) if err != nil { in.Status.SetState(v1beta1.InstallationStateHelmChartUpdateFailure, err.Error(), nil) - return nil + return nil, nil } // skip if the new release has no addon configs - this should not happen in production @@ -42,24 +48,24 @@ func ReconcileHelmCharts(ctx context.Context, cli client.Client, in *v1beta1.Ins if in.Status.State == v1beta1.InstallationStateKubernetesInstalled { in.Status.SetState(v1beta1.InstallationStateInstalled, "Installed", nil) } - return nil + return nil, nil } // fetch the current clusterConfig var clusterConfig v1beta2.ClusterConfig if err := cli.Get(ctx, client.ObjectKey{Name: "k0s", Namespace: "kube-system"}, &clusterConfig); err != nil { - return fmt.Errorf("failed to get cluster config: %w", err) + return nil, fmt.Errorf("failed to get cluster config: %w", err) } combinedConfigs, err := K0sHelmExtensionsFromInstallation(ctx, in, meta, &clusterConfig) if err != nil { - return fmt.Errorf("failed to get helm charts from installation: %w", err) + return nil, fmt.Errorf("failed to get helm charts from installation: %w", err) } cfgs := &v1beta2.HelmExtensions{} cfgs, err = v1beta1.ConvertTo(*combinedConfigs, cfgs) if err != nil { - return fmt.Errorf("failed to convert chart types: %w", err) + return nil, fmt.Errorf("failed to convert chart types: %w", err) } existingHelm := &v1beta2.HelmExtensions{} @@ -69,17 +75,17 @@ func ReconcileHelmCharts(ctx context.Context, cli client.Client, in *v1beta1.Ins chartDrift, changedCharts, err := DetectChartDrift(cfgs, existingHelm) if err != nil { - return fmt.Errorf("failed to check chart drift: %w", err) + return nil, fmt.Errorf("failed to check chart drift: %w", err) } // detect drift between the cluster config and the installer metadata var installedCharts v1beta3.ChartList if err := cli.List(ctx, &installedCharts); err != nil { - return fmt.Errorf("failed to list installed charts: %w", err) + return nil, fmt.Errorf("failed to list installed charts: %w", err) } pendingCharts, chartErrors, err := DetectChartCompletion(existingHelm, installedCharts) if err != nil { - return fmt.Errorf("failed to check chart completion: %w", err) + return nil, fmt.Errorf("failed to check chart completion: %w", err) } // If any chart has errors, update installer state and return @@ -87,39 +93,60 @@ func ReconcileHelmCharts(ctx context.Context, cli client.Client, in *v1beta1.Ins // we should update the cluster instead of letting chart errors stop deployment permanently // otherwise if there are errors we need to abort if len(chartErrors) > 0 && !chartDrift { - chartErrorString := strings.Join(chartErrors, ",") - chartErrorString = "failed to update helm charts: " + chartErrorString + chartErrorString := "" + chartsWithErrors := []string{} + for k := range chartErrors { + chartsWithErrors = append(chartsWithErrors, k) + } + sort.Strings(chartsWithErrors) + for _, chartName := range chartsWithErrors { + chartErrorString += fmt.Sprintf("%s: %s\n", chartName, chartErrors[chartName]) + } + + chartErrorString = "failed to update helm charts: \n" + chartErrorString log.Info("Chart errors", "errors", chartErrorString) if len(chartErrorString) > 1024 { chartErrorString = chartErrorString[:1024] } + var ev *RecordedEvent + if in.Status.State != v1beta1.InstallationStateHelmChartUpdateFailure || chartErrorString != in.Status.Reason { + ev = &RecordedEvent{Reason: "ChartErrors", Message: fmt.Sprintf("Chart errors %v", chartsWithErrors)} + } in.Status.SetState(v1beta1.InstallationStateHelmChartUpdateFailure, chartErrorString, nil) - return nil + return ev, nil } // If all addons match their target version + values, mark installation as complete if len(pendingCharts) == 0 && !chartDrift { + var ev *RecordedEvent + if in.Status.State != v1beta1.InstallationStateInstalled { + ev = &RecordedEvent{Reason: "AddonsUpgraded", Message: "Addons upgraded"} + } in.Status.SetState(v1beta1.InstallationStateInstalled, "Addons upgraded", nil) - return nil + return ev, nil } if len(pendingCharts) > 0 { // If there are pending charts, mark the installation as pending with a message about the pending charts + var ev *RecordedEvent + if in.Status.State != v1beta1.InstallationStatePendingChartCreation || strings.Join(pendingCharts, ",") != strings.Join(in.Status.PendingCharts, ",") { + ev = &RecordedEvent{Reason: "PendingHelmCharts", Message: fmt.Sprintf("Pending helm charts %v", pendingCharts)} + } in.Status.SetState(v1beta1.InstallationStatePendingChartCreation, fmt.Sprintf("Pending charts: %v", pendingCharts), pendingCharts) - return nil + return ev, nil } if in.Status.State == v1beta1.InstallationStateAddonsInstalling { // after the first time we apply new helm charts, this will be set to InstallationStateAddonsInstalling // and we will not re-apply the charts to the k0s cluster config while waiting for those changes to propagate - return nil + return nil, nil } if !chartDrift { // if there is no drift, we should not reapply the cluster config // however, the charts have not been applied yet, so we should not mark the installation as complete // this should not happen on upgrades - return nil + return nil, nil } // Replace the current chart configs with the new chart configs @@ -128,7 +155,8 @@ func ReconcileHelmCharts(ctx context.Context, cli client.Client, in *v1beta1.Ins log.Info("Updating cluster config with new helm charts", "updated charts", changedCharts) //Update the clusterConfig if err := cli.Update(ctx, &clusterConfig); err != nil { - return fmt.Errorf("failed to update cluster config: %w", err) + return nil, fmt.Errorf("failed to update cluster config: %w", err) } - return nil + ev := RecordedEvent{Reason: "HelmChartsUpdated", Message: fmt.Sprintf("Updated helm charts %v", changedCharts)} + return &ev, nil } diff --git a/operator/pkg/charts/reconcile_test.go b/operator/pkg/charts/reconcile_test.go index e1d7dd374..99a78b57b 100644 --- a/operator/pkg/charts/reconcile_test.go +++ b/operator/pkg/charts/reconcile_test.go @@ -189,7 +189,7 @@ func TestInstallationReconciler_ReconcileHelmCharts(t *testing.T) { }, out: v1beta1.InstallationStatus{ State: v1beta1.InstallationStateHelmChartUpdateFailure, - Reason: "failed to update helm charts: exterror", + Reason: "failed to update helm charts: \nextchart: exterror\n", }, releaseMeta: ectypes.ReleaseMetadata{ Configs: v1beta1.Helm{ @@ -253,7 +253,7 @@ func TestInstallationReconciler_ReconcileHelmCharts(t *testing.T) { }, out: v1beta1.InstallationStatus{ State: v1beta1.InstallationStateHelmChartUpdateFailure, - Reason: "failed to update helm charts: metaerror", + Reason: "failed to update helm charts: \nmetachart: metaerror\n", }, releaseMeta: ectypes.ReleaseMetadata{ Configs: v1beta1.Helm{ @@ -889,7 +889,7 @@ password: original`, req.NoError(k0shelmv1beta1.AddToScheme(sch)) fakeCli := fake.NewClientBuilder().WithScheme(sch).WithRuntimeObjects(tt.fields.State...).Build() - err := ReconcileHelmCharts(context.Background(), fakeCli, &tt.in) + _, err := ReconcileHelmCharts(context.Background(), fakeCli, &tt.in) req.NoError(err) req.Equal(tt.out, tt.in.Status) diff --git a/operator/pkg/cli/root.go b/operator/pkg/cli/root.go index eb931257d..dbbdd7be8 100644 --- a/operator/pkg/cli/root.go +++ b/operator/pkg/cli/root.go @@ -66,6 +66,7 @@ func RootCmd() *cobra.Command { Client: mgr.GetClient(), Scheme: mgr.GetScheme(), Discovery: discovery.NewDiscoveryClientForConfigOrDie(ctrl.GetConfigOrDie()), + Recorder: mgr.GetEventRecorderFor("installation-controller"), }).SetupWithManager(mgr); err != nil { setupLog.Error(err, "unable to create controller", "controller", "Installation") os.Exit(1) diff --git a/operator/pkg/upgrade/upgrade.go b/operator/pkg/upgrade/upgrade.go index c6fa3db41..c59014c2e 100644 --- a/operator/pkg/upgrade/upgrade.go +++ b/operator/pkg/upgrade/upgrade.go @@ -140,7 +140,7 @@ func chartUpgrade(ctx context.Context, cli client.Client, original *clusterv1bet input := original.DeepCopy() input.Status.SetState(v1beta1.InstallationStateKubernetesInstalled, "", nil) - err := charts.ReconcileHelmCharts(ctx, cli, input) + _, err := charts.ReconcileHelmCharts(ctx, cli, input) if err != nil { return fmt.Errorf("failed to reconcile helm charts: %w", err) }