Skip to content

Commit

Permalink
Add Config Option to Control Kubernetes Node Removal (#577)
Browse files Browse the repository at this point in the history
  • Loading branch information
bschimke95 authored Aug 2, 2024
1 parent 9e90fdc commit aad1105
Show file tree
Hide file tree
Showing 8 changed files with 83 additions and 21 deletions.
8 changes: 8 additions & 0 deletions src/k8s/api/v1/annotations.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
package apiv1

const (
	// AnnotationSkipCleanupKubernetesNodeOnRemove, if set on the cluster config,
	// limits node removal to the microcluster and local file cleanup; the
	// Kubernetes node object itself is left in place.
	// This is useful when an external controller (e.g. CAPI) is responsible for
	// the Kubernetes node lifecycle.
	// By default, k8sd deletes the Kubernetes node when a node is removed from
	// the cluster.
	AnnotationSkipCleanupKubernetesNodeOnRemove = "k8sd/v1alpha/lifecycle/skip-cleanup-kubernetes-node-on-remove"
)
1 change: 1 addition & 0 deletions src/k8s/cmd/k8s/k8s_bootstrap_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ var testCases = []testCase{
Enabled: utils.Pointer(true),
},
CloudProvider: utils.Pointer("external"),
Annotations: map[string]string{apiv1.AnnotationSkipCleanupKubernetesNodeOnRemove: "true"},
},
ControlPlaneTaints: []string{"node-role.kubernetes.io/control-plane:NoSchedule"},
PodCIDR: utils.Pointer("10.100.0.0/16"),
Expand Down
2 changes: 2 additions & 0 deletions src/k8s/cmd/k8s/testdata/bootstrap-config-full.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ cluster-config:
metrics-server:
enabled: true
cloud-provider: external
annotations:
k8sd/v1alpha/lifecycle/skip-cleanup-kubernetes-node-on-remove: true
control-plane-taints:
- node-role.kubernetes.io/control-plane:NoSchedule
pod-cidr: 10.100.0.0/16
Expand Down
12 changes: 12 additions & 0 deletions src/k8s/pkg/k8sd/api/cluster_remove.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"net/http"

apiv1 "github.com/canonical/k8s/api/v1"
databaseutil "github.com/canonical/k8s/pkg/k8sd/database/util"
"github.com/canonical/k8s/pkg/log"
"github.com/canonical/k8s/pkg/utils"
"github.com/canonical/k8s/pkg/utils/control"
Expand Down Expand Up @@ -72,6 +73,17 @@ func (e *Endpoints) postClusterRemove(s state.State, r *http.Request) response.R
return response.SyncResponse(true, nil)
}

cfg, err := databaseutil.GetClusterConfig(ctx, s)
if err != nil {
return response.InternalError(fmt.Errorf("failed to get cluster config: %w", err))
}

if _, ok := cfg.Annotations[apiv1.AnnotationSkipCleanupKubernetesNodeOnRemove]; ok {
// Explicitly skip removing the node from Kubernetes.
log.Info("Skipping Kubernetes worker node removal")
return response.SyncResponse(true, nil)
}

client, err := snap.KubernetesClient("")
if err != nil {
return response.InternalError(fmt.Errorf("failed to create k8s client: %w", err))
Expand Down
38 changes: 19 additions & 19 deletions src/k8s/pkg/k8sd/app/hooks_remove.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"net"
"os"

apiv1 "github.com/canonical/k8s/api/v1"
databaseutil "github.com/canonical/k8s/pkg/k8sd/database/util"
"github.com/canonical/k8s/pkg/k8sd/pki"
"github.com/canonical/k8s/pkg/k8sd/setup"
Expand All @@ -23,7 +24,7 @@ import (
func (a *App) onPreRemove(ctx context.Context, s state.State, force bool) (rerr error) {
snap := a.Snap()

log := log.FromContext(ctx).WithValues("hook", "preremove")
log := log.FromContext(ctx).WithValues("hook", "preremove", "node", s.Name())
log.Info("Running preremove hook")

log.Info("Waiting for node to finish microcluster join before removing")
Expand All @@ -43,8 +44,19 @@ func (a *App) onPreRemove(ctx context.Context, s state.State, force bool) (rerr
return notPending, nil
})

cfg, clusterConfigErr := databaseutil.GetClusterConfig(ctx, s)
if clusterConfigErr == nil {
if cfg, err := databaseutil.GetClusterConfig(ctx, s); err == nil {
if _, ok := cfg.Annotations[apiv1.AnnotationSkipCleanupKubernetesNodeOnRemove]; !ok {
c, err := snap.KubernetesClient("")
if err != nil {
log.Error(err, "Failed to create Kubernetes client", err)
}

log.Info("Deleting node from Kubernetes cluster")
if err := c.DeleteNode(ctx, s.Name()); err != nil {
log.Error(err, "Failed to remove k8s node %q: %w", s.Name(), err)
}
}

switch cfg.Datastore.GetType() {
case "k8s-dqlite":
client, err := snap.K8sDqliteClient(ctx)
Expand Down Expand Up @@ -72,17 +84,7 @@ func (a *App) onPreRemove(ctx context.Context, s state.State, force bool) (rerr
default:
}
} else {
log.Error(clusterConfigErr, "Failed to retrieve cluster config")
}

c, err := snap.KubernetesClient("")
if err != nil {
log.Error(err, "Failed to create Kubernetes client", err)
}

log.Info("Deleting node from Kubernetes cluster")
if err := c.DeleteNode(ctx, s.Name()); err != nil {
log.Error(err, "Failed to remove k8s node %q: %w", s.Name(), err)
log.Error(err, "Failed to retrieve cluster config")
}

for _, dir := range []string{snap.ServiceArgumentsDir()} {
Expand Down Expand Up @@ -115,11 +117,9 @@ func (a *App) onPreRemove(ctx context.Context, s state.State, force bool) (rerr
log.Error(err, "failed to cleanup control plane certificates")
}

if clusterConfigErr == nil {
log.Info("Stopping control plane services")
if err := snaputil.StopControlPlaneServices(ctx, snap); err != nil {
log.Error(err, "Failed to stop control-plane services")
}
log.Info("Stopping control plane services")
if err := snaputil.StopControlPlaneServices(ctx, snap); err != nil {
log.Error(err, "Failed to stop control-plane services")
}

return nil
Expand Down
2 changes: 1 addition & 1 deletion src/k8s/pkg/k8sd/types/cluster_config_defaults_test.go
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
package types_test

import (
"github.com/canonical/k8s/pkg/utils"
"testing"

"github.com/canonical/k8s/pkg/k8sd/types"
"github.com/canonical/k8s/pkg/utils"
. "github.com/onsi/gomega"
)

Expand Down
9 changes: 9 additions & 0 deletions tests/integration/templates/bootstrap-no-k8s-node-remove.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Bootstrap configuration used by the clustering integration tests to verify
# that k8sd skips deleting the Kubernetes node object on `k8s remove-node`
# when the skip-cleanup annotation is set (see test_no_remove).
cluster-config:
  network:
    enabled: true
  dns:
    enabled: true
  metrics-server:
    enabled: true
  # Presence of this annotation (value is not inspected by the hook) makes
  # k8sd leave the Kubernetes node to an external lifecycle controller.
  annotations:
    k8sd/v1alpha/lifecycle/skip-cleanup-kubernetes-node-on-remove: true
32 changes: 31 additions & 1 deletion tests/integration/tests/test_clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from typing import List

import pytest
from test_util import harness, util
from test_util import config, harness, util

LOG = logging.getLogger(__name__)

Expand Down Expand Up @@ -64,3 +64,33 @@ def test_worker_nodes(instances: List[harness.Instance]):
] and other_joining_node.id in [
node["metadata"]["name"] for node in nodes
], f"only {cluster_node.id} should be left in cluster"


@pytest.mark.node_count(3)
@pytest.mark.bootstrap_config(
    (config.MANIFESTS_DIR / "bootstrap-no-k8s-node-remove.yaml").read_text()
)
def test_no_remove(instances: List[harness.Instance]):
    """Verify `k8s remove-node` leaves the Kubernetes node objects intact when
    the skip-cleanup-kubernetes-node-on-remove annotation is set at bootstrap.

    Bootstraps a 3-node cluster (2 control-plane, 1 worker), removes the
    joining control-plane and the worker, and asserts all 3 nodes remain.
    """
    cluster_node = instances[0]
    joining_cp = instances[1]
    joining_worker = instances[2]

    join_token = util.get_join_token(cluster_node, joining_cp)
    join_token_worker = util.get_join_token(cluster_node, joining_worker, "--worker")
    util.join_cluster(joining_cp, join_token)
    util.join_cluster(joining_worker, join_token_worker)

    util.wait_until_k8s_ready(cluster_node, instances)
    nodes = util.ready_nodes(cluster_node)
    assert len(nodes) == 3, "nodes should have joined cluster"

    assert "control-plane" in util.get_local_node_status(cluster_node)
    assert "control-plane" in util.get_local_node_status(joining_cp)
    assert "worker" in util.get_local_node_status(joining_worker)

    # Bug fix: re-query the node list *after* each removal attempt. The
    # original captured `nodes` once before running `remove-node`, so the
    # length assertions checked a stale list and could never fail.
    cluster_node.exec(["k8s", "remove-node", joining_cp.id])
    nodes = util.ready_nodes(cluster_node)
    assert len(nodes) == 3, "cp node should not have been removed from cluster"

    cluster_node.exec(["k8s", "remove-node", joining_worker.id])
    nodes = util.ready_nodes(cluster_node)
    assert len(nodes) == 3, "worker node should not have been removed from cluster"

0 comments on commit aad1105

Please sign in to comment.