From e97b9956fec0c9305403557dc90c8767c0878a60 Mon Sep 17 00:00:00 2001
From: Andrew Lavery <laverya@umich.edu>
Date: Mon, 7 Oct 2024 23:13:44 +0200
Subject: [PATCH] record events in the installation object (#1292)

* record events in the installation object

* properly set k0s upgrade state

* add 'events' permissions, only send events on change

* f

* fix checks...
---
 ...embedded-cluster-operator-clusterrole.yaml |  7 ++
 .../controllers/installation_controller.go    | 34 +++++++---
 operator/pkg/charts/detection.go              |  6 +-
 operator/pkg/charts/helm_test.go              | 18 ++---
 operator/pkg/charts/reconcile.go              | 68 +++++++++++++------
 operator/pkg/charts/reconcile_test.go         |  6 +-
 operator/pkg/cli/root.go                      |  1 +
 operator/pkg/upgrade/upgrade.go               |  2 +-
 8 files changed, 97 insertions(+), 45 deletions(-)

diff --git a/operator/charts/embedded-cluster-operator/templates/embedded-cluster-operator-clusterrole.yaml b/operator/charts/embedded-cluster-operator/templates/embedded-cluster-operator-clusterrole.yaml
index f33b61e41..f8f867a98 100644
--- a/operator/charts/embedded-cluster-operator/templates/embedded-cluster-operator-clusterrole.yaml
+++ b/operator/charts/embedded-cluster-operator/templates/embedded-cluster-operator-clusterrole.yaml
@@ -69,6 +69,13 @@ rules:
   - patch
   - update
   - watch
+- apiGroups:
+  - ""
+  resources:
+  - events
+  verbs:
+  - patch
+  - create
 - apiGroups:
   - embeddedcluster.replicated.com
   resources:
diff --git a/operator/controllers/installation_controller.go b/operator/controllers/installation_controller.go
index 4b7d2e7ee..32b451b8d 100644
--- a/operator/controllers/installation_controller.go
+++ b/operator/controllers/installation_controller.go
@@ -36,6 +36,7 @@ import (
 	"k8s.io/apimachinery/pkg/runtime"
 	"k8s.io/apimachinery/pkg/types"
 	"k8s.io/client-go/discovery"
+	"k8s.io/client-go/tools/record"
 	"k8s.io/utils/ptr"
 	ctrl "sigs.k8s.io/controller-runtime"
 	"sigs.k8s.io/controller-runtime/pkg/client"
@@ -140,6 +141,7 @@ type InstallationReconciler struct {
 	client.Client
 	Discovery discovery.DiscoveryInterface
 	Scheme    *runtime.Scheme
+	Recorder  record.EventRecorder
 }
 
 // NodeHasChanged returns true if the node configuration has changed when compared to
@@ -205,9 +207,11 @@ func (r *InstallationReconciler) ReconcileNodeStatuses(ctx context.Context, in *
 			return nil, fmt.Errorf("failed to update node status: %w", err)
 		}
 		if isnew {
+			r.Recorder.Eventf(in, corev1.EventTypeNormal, "NodeAdded", "Node %s has been added", node.Name)
 			batch.NodesAdded = append(batch.NodesAdded, event)
 			continue
 		}
+		r.Recorder.Eventf(in, corev1.EventTypeNormal, "NodeUpdated", "Node %s has been updated", node.Name)
 		batch.NodesUpdated = append(batch.NodesUpdated, event)
 	}
 	trimmed := []v1beta1.NodeStatus{}
@@ -219,6 +223,7 @@ func (r *InstallationReconciler) ReconcileNodeStatuses(ctx context.Context, in *
 		rmevent := metrics.NodeRemovedEvent{
 			ClusterID: in.Spec.ClusterID, NodeName: nodeStatus.Name,
 		}
+		r.Recorder.Eventf(in, corev1.EventTypeNormal, "NodeRemoved", "Node %s has been removed", nodeStatus.Name)
 		batch.NodesRemoved = append(batch.NodesRemoved, rmevent)
 	}
 	sort.SliceStable(trimmed, func(i, j int) bool { return trimmed[i].Name < trimmed[j].Name })
@@ -379,6 +384,7 @@ func (r *InstallationReconciler) ReconcileK0sVersion(ctx context.Context, in *v1
 			return fmt.Errorf("failed to determine if k0s should be upgraded: %w", err)
 		}
 		if shouldUpgrade {
+			r.Recorder.Eventf(in, corev1.EventTypeNormal, "K0sUpgrade", "Upgrading k0s to %s", desiredVersion)
 			log.Info("Starting k0s autopilot upgrade plan", "version", desiredVersion)
 
 			// there is no autopilot plan in the cluster so we are free to
@@ -408,7 +414,7 @@ func (r *InstallationReconciler) ReconcileK0sVersion(ctx context.Context, in *v1
 		// actually upgrades the k0s version. we need to make sure that this is the second plan
 		// before setting the installation state to the plan state.
 		if isAutopilotUpgradeToVersion(&plan, desiredVersion) {
-			r.SetStateBasedOnPlan(in, plan)
+			r.SetStateBasedOnPlan(in, plan, desiredVersion)
 			return nil
 		}
 	}
@@ -582,7 +588,7 @@ func (r *InstallationReconciler) ReconcileHAStatus(ctx context.Context, in *v1be
 // SetStateBasedOnPlan sets the installation state based on the Plan state. For now we do not
 // report anything fancy but we should consider reporting here a summary of how many nodes
 // have been upgraded and how many are still pending.
-func (r *InstallationReconciler) SetStateBasedOnPlan(in *v1beta1.Installation, plan apv1b2.Plan) {
+func (r *InstallationReconciler) SetStateBasedOnPlan(in *v1beta1.Installation, plan apv1b2.Plan, desiredVersion string) {
 	reason := autopilot.ReasonForState(plan)
 	switch plan.Status.State {
 	case "":
@@ -598,14 +604,17 @@ func (r *InstallationReconciler) SetStateBasedOnPlan(in *v1beta1.Installation, p
 	case apcore.PlanMissingSignalNode:
 		fallthrough
 	case apcore.PlanApplyFailed:
+		r.Recorder.Eventf(in, corev1.EventTypeNormal, "K0sUpgradeFailed", "Upgrade of k0s to %s failed (%q)", desiredVersion, plan.Status.State)
 		in.Status.SetState(v1beta1.InstallationStateFailed, reason, nil)
 	case apcore.PlanSchedulable:
 		fallthrough
 	case apcore.PlanSchedulableWait:
 		in.Status.SetState(v1beta1.InstallationStateInstalling, reason, nil)
 	case apcore.PlanCompleted:
+		r.Recorder.Eventf(in, corev1.EventTypeNormal, "K0sUpgradeComplete", "Upgrade of k0s to %s completed", desiredVersion)
 		in.Status.SetState(v1beta1.InstallationStateKubernetesInstalled, reason, nil)
 	default:
+		r.Recorder.Eventf(in, corev1.EventTypeNormal, "K0sUpgradeUnknownState", "Upgrade of k0s to %s has an unknown state %q", desiredVersion, plan.Status.State)
 		in.Status.SetState(v1beta1.InstallationStateFailed, reason, nil)
 	}
 }
@@ -660,12 +669,15 @@ func (r *InstallationReconciler) DisableOldInstallations(ctx context.Context, it
 	})
 	for _, in := range items[1:] {
 		in.Status.NodesStatus = nil
-		in.Status.SetState(
-			v1beta1.InstallationStateObsolete,
-			"This is not the most recent installation object",
-			nil,
-		)
-		r.Status().Update(ctx, &in)
+		if in.Status.State != v1beta1.InstallationStateObsolete {
+			r.Recorder.Eventf(&in, corev1.EventTypeNormal, "Obsolete", "This has been obsoleted by a newer installation object")
+			in.Status.SetState(
+				v1beta1.InstallationStateObsolete,
+				"This is not the most recent installation object",
+				nil,
+			)
+			r.Status().Update(ctx, &in)
+		}
 	}
 }
 
@@ -841,9 +853,13 @@ func (r *InstallationReconciler) Reconcile(ctx context.Context, req ctrl.Request
 
 	// reconcile the add-ons (k0s helm extensions).
 	log.Info("Reconciling addons")
-	if err := charts.ReconcileHelmCharts(ctx, r.Client, in); err != nil {
+	ev, err := charts.ReconcileHelmCharts(ctx, r.Client, in)
+	if err != nil {
 		return ctrl.Result{}, fmt.Errorf("failed to reconcile helm charts: %w", err)
 	}
+	if ev != nil {
+		r.Recorder.Event(in, corev1.EventTypeNormal, ev.Reason, ev.Message)
+	}
 
 	if err := r.ReconcileHAStatus(ctx, in); err != nil {
 		return ctrl.Result{}, fmt.Errorf("failed to reconcile HA status: %w", err)
diff --git a/operator/pkg/charts/detection.go b/operator/pkg/charts/detection.go
index 31f1b3a70..99c9427d8 100644
--- a/operator/pkg/charts/detection.go
+++ b/operator/pkg/charts/detection.go
@@ -84,9 +84,9 @@ func YamlDiff(a, b string) (bool, error) {
 }
 
 // check if all charts in the combinedConfigs are installed successfully with the desired version and values
-func DetectChartCompletion(existingHelm *v1beta1.HelmExtensions, installedCharts v1beta2.ChartList) ([]string, []string, error) {
+func DetectChartCompletion(existingHelm *v1beta1.HelmExtensions, installedCharts v1beta2.ChartList) ([]string, map[string]string, error) {
 	incompleteCharts := []string{}
-	chartErrors := []string{}
+	chartErrors := map[string]string{}
 	if existingHelm == nil {
 		return incompleteCharts, chartErrors, nil
 	}
@@ -115,7 +115,7 @@ func DetectChartCompletion(existingHelm *v1beta1.HelmExtensions, installedCharts
 				}
 
 				if installedChart.Status.Error != "" {
-					chartErrors = append(chartErrors, installedChart.Status.Error)
+					chartErrors[chart.Name] = installedChart.Status.Error
 					diffDetected = false
 				}
 
diff --git a/operator/pkg/charts/helm_test.go b/operator/pkg/charts/helm_test.go
index ef23020dd..9bfd344eb 100644
--- a/operator/pkg/charts/helm_test.go
+++ b/operator/pkg/charts/helm_test.go
@@ -16,7 +16,7 @@ func Test_detectChartCompletion(t *testing.T) {
 	tests := []struct {
 		name                 string
 		args                 args
-		wantChartErrors      []string
+		wantChartErrors      map[string]string
 		wantIncompleteCharts []string
 	}{
 		{
@@ -58,7 +58,7 @@ func Test_detectChartCompletion(t *testing.T) {
 				},
 			},
 			wantIncompleteCharts: []string{},
-			wantChartErrors:      []string{},
+			wantChartErrors:      map[string]string{},
 		},
 		{
 			name: "new chart",
@@ -90,7 +90,7 @@ func Test_detectChartCompletion(t *testing.T) {
 				},
 			},
 			wantIncompleteCharts: []string{"test2"},
-			wantChartErrors:      []string{},
+			wantChartErrors:      map[string]string{},
 		},
 		{
 			name: "removed chart",
@@ -126,7 +126,7 @@ func Test_detectChartCompletion(t *testing.T) {
 				},
 			},
 			wantIncompleteCharts: []string{},
-			wantChartErrors:      []string{},
+			wantChartErrors:      map[string]string{},
 		},
 		{
 			name: "added and removed chart",
@@ -154,7 +154,7 @@ func Test_detectChartCompletion(t *testing.T) {
 				},
 			},
 			wantIncompleteCharts: []string{"test2"},
-			wantChartErrors:      []string{},
+			wantChartErrors:      map[string]string{},
 		},
 		{
 			name: "no drift, but error",
@@ -195,7 +195,7 @@ func Test_detectChartCompletion(t *testing.T) {
 				},
 			},
 			wantIncompleteCharts: []string{},
-			wantChartErrors:      []string{"test chart error", "test chart two error"},
+			wantChartErrors:      map[string]string{"test": "test chart error", "test2": "test chart two error"},
 		},
 		{
 			name: "drift and error",
@@ -236,7 +236,7 @@ func Test_detectChartCompletion(t *testing.T) {
 				},
 			},
 			wantIncompleteCharts: []string{},
-			wantChartErrors:      []string{"test chart error", "test chart two error"},
+			wantChartErrors:      map[string]string{"test": "test chart error", "test2": "test chart two error"},
 		},
 		{
 			name: "drift values",
@@ -270,7 +270,7 @@ func Test_detectChartCompletion(t *testing.T) {
 				},
 			},
 			wantIncompleteCharts: []string{"test"},
-			wantChartErrors:      []string{},
+			wantChartErrors:      map[string]string{},
 		},
 		{
 			name: "values hash differs",
@@ -304,7 +304,7 @@ func Test_detectChartCompletion(t *testing.T) {
 				},
 			},
 			wantIncompleteCharts: []string{"test"},
-			wantChartErrors:      []string{},
+			wantChartErrors:      map[string]string{},
 		},
 	}
 	for _, tt := range tests {
diff --git a/operator/pkg/charts/reconcile.go b/operator/pkg/charts/reconcile.go
index b15036c6f..8ebe60f1e 100644
--- a/operator/pkg/charts/reconcile.go
+++ b/operator/pkg/charts/reconcile.go
@@ -3,6 +3,7 @@ package charts
 import (
 	"context"
 	"fmt"
+	"sort"
 	"strings"
 
 	v1beta3 "github.com/k0sproject/k0s/pkg/apis/helm/v1beta1"
@@ -13,13 +14,18 @@ import (
 	"sigs.k8s.io/controller-runtime/pkg/client"
 )
 
+type RecordedEvent struct {
+	Reason  string
+	Message string
+}
+
 // ReconcileHelmCharts reconciles the helm charts from the Installation metadata with the clusterconfig object.
-func ReconcileHelmCharts(ctx context.Context, cli client.Client, in *v1beta1.Installation) error {
+func ReconcileHelmCharts(ctx context.Context, cli client.Client, in *v1beta1.Installation) (*RecordedEvent, error) {
 	if in.Spec.Config == nil || in.Spec.Config.Version == "" {
 		if in.Status.State == v1beta1.InstallationStateKubernetesInstalled {
 			in.Status.SetState(v1beta1.InstallationStateInstalled, "Installed", nil)
 		}
-		return nil
+		return nil, nil
 	}
 
 	log := controllerruntime.LoggerFrom(ctx)
@@ -27,13 +33,13 @@ func ReconcileHelmCharts(ctx context.Context, cli client.Client, in *v1beta1.Ins
 	if in.Status.State == v1beta1.InstallationStateFailed ||
 		!in.Status.GetKubernetesInstalled() {
 		log.Info("Skipping helm chart reconciliation", "state", in.Status.State)
-		return nil
+		return nil, nil
 	}
 
 	meta, err := release.MetadataFor(ctx, in, cli)
 	if err != nil {
 		in.Status.SetState(v1beta1.InstallationStateHelmChartUpdateFailure, err.Error(), nil)
-		return nil
+		return nil, nil
 	}
 
 	// skip if the new release has no addon configs - this should not happen in production
@@ -42,24 +48,24 @@ func ReconcileHelmCharts(ctx context.Context, cli client.Client, in *v1beta1.Ins
 		if in.Status.State == v1beta1.InstallationStateKubernetesInstalled {
 			in.Status.SetState(v1beta1.InstallationStateInstalled, "Installed", nil)
 		}
-		return nil
+		return nil, nil
 	}
 
 	// fetch the current clusterConfig
 	var clusterConfig v1beta2.ClusterConfig
 	if err := cli.Get(ctx, client.ObjectKey{Name: "k0s", Namespace: "kube-system"}, &clusterConfig); err != nil {
-		return fmt.Errorf("failed to get cluster config: %w", err)
+		return nil, fmt.Errorf("failed to get cluster config: %w", err)
 	}
 
 	combinedConfigs, err := K0sHelmExtensionsFromInstallation(ctx, in, meta, &clusterConfig)
 	if err != nil {
-		return fmt.Errorf("failed to get helm charts from installation: %w", err)
+		return nil, fmt.Errorf("failed to get helm charts from installation: %w", err)
 	}
 
 	cfgs := &v1beta2.HelmExtensions{}
 	cfgs, err = v1beta1.ConvertTo(*combinedConfigs, cfgs)
 	if err != nil {
-		return fmt.Errorf("failed to convert chart types: %w", err)
+		return nil, fmt.Errorf("failed to convert chart types: %w", err)
 	}
 
 	existingHelm := &v1beta2.HelmExtensions{}
@@ -69,17 +75,17 @@ func ReconcileHelmCharts(ctx context.Context, cli client.Client, in *v1beta1.Ins
 
 	chartDrift, changedCharts, err := DetectChartDrift(cfgs, existingHelm)
 	if err != nil {
-		return fmt.Errorf("failed to check chart drift: %w", err)
+		return nil, fmt.Errorf("failed to check chart drift: %w", err)
 	}
 
 	// detect drift between the cluster config and the installer metadata
 	var installedCharts v1beta3.ChartList
 	if err := cli.List(ctx, &installedCharts); err != nil {
-		return fmt.Errorf("failed to list installed charts: %w", err)
+		return nil, fmt.Errorf("failed to list installed charts: %w", err)
 	}
 	pendingCharts, chartErrors, err := DetectChartCompletion(existingHelm, installedCharts)
 	if err != nil {
-		return fmt.Errorf("failed to check chart completion: %w", err)
+		return nil, fmt.Errorf("failed to check chart completion: %w", err)
 	}
 
 	// If any chart has errors, update installer state and return
@@ -87,39 +93,60 @@ func ReconcileHelmCharts(ctx context.Context, cli client.Client, in *v1beta1.Ins
 	// we should update the cluster instead of letting chart errors stop deployment permanently
 	// otherwise if there are errors we need to abort
 	if len(chartErrors) > 0 && !chartDrift {
-		chartErrorString := strings.Join(chartErrors, ",")
-		chartErrorString = "failed to update helm charts: " + chartErrorString
+		chartErrorString := ""
+		chartsWithErrors := []string{}
+		for k := range chartErrors {
+			chartsWithErrors = append(chartsWithErrors, k)
+		}
+		sort.Strings(chartsWithErrors)
+		for _, chartName := range chartsWithErrors {
+			chartErrorString += fmt.Sprintf("%s: %s\n", chartName, chartErrors[chartName])
+		}
+
+		chartErrorString = "failed to update helm charts: \n" + chartErrorString
 		log.Info("Chart errors", "errors", chartErrorString)
 		if len(chartErrorString) > 1024 {
 			chartErrorString = chartErrorString[:1024]
 		}
+		var ev *RecordedEvent
+		if in.Status.State != v1beta1.InstallationStateHelmChartUpdateFailure || chartErrorString != in.Status.Reason {
+			ev = &RecordedEvent{Reason: "ChartErrors", Message: fmt.Sprintf("Chart errors %v", chartsWithErrors)}
+		}
 		in.Status.SetState(v1beta1.InstallationStateHelmChartUpdateFailure, chartErrorString, nil)
-		return nil
+		return ev, nil
 	}
 
 	// If all addons match their target version + values, mark installation as complete
 	if len(pendingCharts) == 0 && !chartDrift {
+		var ev *RecordedEvent
+		if in.Status.State != v1beta1.InstallationStateInstalled {
+			ev = &RecordedEvent{Reason: "AddonsUpgraded", Message: "Addons upgraded"}
+		}
 		in.Status.SetState(v1beta1.InstallationStateInstalled, "Addons upgraded", nil)
-		return nil
+		return ev, nil
 	}
 
 	if len(pendingCharts) > 0 {
 		// If there are pending charts, mark the installation as pending with a message about the pending charts
+		var ev *RecordedEvent
+		if in.Status.State != v1beta1.InstallationStatePendingChartCreation || strings.Join(pendingCharts, ",") != strings.Join(in.Status.PendingCharts, ",") {
+			ev = &RecordedEvent{Reason: "PendingHelmCharts", Message: fmt.Sprintf("Pending helm charts %v", pendingCharts)}
+		}
 		in.Status.SetState(v1beta1.InstallationStatePendingChartCreation, fmt.Sprintf("Pending charts: %v", pendingCharts), pendingCharts)
-		return nil
+		return ev, nil
 	}
 
 	if in.Status.State == v1beta1.InstallationStateAddonsInstalling {
 		// after the first time we apply new helm charts, this will be set to InstallationStateAddonsInstalling
 		// and we will not re-apply the charts to the k0s cluster config while waiting for those changes to propagate
-		return nil
+		return nil, nil
 	}
 
 	if !chartDrift {
 		// if there is no drift, we should not reapply the cluster config
 		// however, the charts have not been applied yet, so we should not mark the installation as complete
 		// this should not happen on upgrades
-		return nil
+		return nil, nil
 	}
 
 	// Replace the current chart configs with the new chart configs
@@ -128,7 +155,8 @@ func ReconcileHelmCharts(ctx context.Context, cli client.Client, in *v1beta1.Ins
 	log.Info("Updating cluster config with new helm charts", "updated charts", changedCharts)
 	//Update the clusterConfig
 	if err := cli.Update(ctx, &clusterConfig); err != nil {
-		return fmt.Errorf("failed to update cluster config: %w", err)
+		return nil, fmt.Errorf("failed to update cluster config: %w", err)
 	}
-	return nil
+	ev := RecordedEvent{Reason: "HelmChartsUpdated", Message: fmt.Sprintf("Updated helm charts %v", changedCharts)}
+	return &ev, nil
 }
diff --git a/operator/pkg/charts/reconcile_test.go b/operator/pkg/charts/reconcile_test.go
index e1d7dd374..99a78b57b 100644
--- a/operator/pkg/charts/reconcile_test.go
+++ b/operator/pkg/charts/reconcile_test.go
@@ -189,7 +189,7 @@ func TestInstallationReconciler_ReconcileHelmCharts(t *testing.T) {
 			},
 			out: v1beta1.InstallationStatus{
 				State:  v1beta1.InstallationStateHelmChartUpdateFailure,
-				Reason: "failed to update helm charts: exterror",
+				Reason: "failed to update helm charts: \nextchart: exterror\n",
 			},
 			releaseMeta: ectypes.ReleaseMetadata{
 				Configs: v1beta1.Helm{
@@ -253,7 +253,7 @@ func TestInstallationReconciler_ReconcileHelmCharts(t *testing.T) {
 			},
 			out: v1beta1.InstallationStatus{
 				State:  v1beta1.InstallationStateHelmChartUpdateFailure,
-				Reason: "failed to update helm charts: metaerror",
+				Reason: "failed to update helm charts: \nmetachart: metaerror\n",
 			},
 			releaseMeta: ectypes.ReleaseMetadata{
 				Configs: v1beta1.Helm{
@@ -889,7 +889,7 @@ password: original`,
 			req.NoError(k0shelmv1beta1.AddToScheme(sch))
 			fakeCli := fake.NewClientBuilder().WithScheme(sch).WithRuntimeObjects(tt.fields.State...).Build()
 
-			err := ReconcileHelmCharts(context.Background(), fakeCli, &tt.in)
+			_, err := ReconcileHelmCharts(context.Background(), fakeCli, &tt.in)
 			req.NoError(err)
 			req.Equal(tt.out, tt.in.Status)
 
diff --git a/operator/pkg/cli/root.go b/operator/pkg/cli/root.go
index eb931257d..dbbdd7be8 100644
--- a/operator/pkg/cli/root.go
+++ b/operator/pkg/cli/root.go
@@ -66,6 +66,7 @@ func RootCmd() *cobra.Command {
 				Client:    mgr.GetClient(),
 				Scheme:    mgr.GetScheme(),
 				Discovery: discovery.NewDiscoveryClientForConfigOrDie(ctrl.GetConfigOrDie()),
+				Recorder:  mgr.GetEventRecorderFor("installation-controller"),
 			}).SetupWithManager(mgr); err != nil {
 				setupLog.Error(err, "unable to create controller", "controller", "Installation")
 				os.Exit(1)
diff --git a/operator/pkg/upgrade/upgrade.go b/operator/pkg/upgrade/upgrade.go
index c6fa3db41..c59014c2e 100644
--- a/operator/pkg/upgrade/upgrade.go
+++ b/operator/pkg/upgrade/upgrade.go
@@ -140,7 +140,7 @@ func chartUpgrade(ctx context.Context, cli client.Client, original *clusterv1bet
 	input := original.DeepCopy()
 	input.Status.SetState(v1beta1.InstallationStateKubernetesInstalled, "", nil)
 
-	err := charts.ReconcileHelmCharts(ctx, cli, input)
+	_, err := charts.ReconcileHelmCharts(ctx, cli, input)
 	if err != nil {
 		return fmt.Errorf("failed to reconcile helm charts: %w", err)
 	}