Skip to content

Commit

Permalink
Improve join logic to handle unreachable nodes (#560)
Browse files Browse the repository at this point in the history
---------

Co-authored-by: Mateo Florido <[email protected]>
Co-authored-by: Louise K. Schmidtgen <[email protected]>
  • Loading branch information
3 people authored Jul 31, 2024
1 parent 4e15dae commit f7167b5
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 4 deletions.
17 changes: 13 additions & 4 deletions src/k8s/pkg/k8sd/app/hooks_bootstrap.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,16 +70,25 @@ func (a *App) onBootstrapWorkerNode(ctx context.Context, s state.State, encodedT
}
// TODO(neoaggelos): figure out how to use the microcluster client instead

// Get remote certificate from the cluster member
cert, err := utils.GetRemoteCertificate(token.JoinAddresses[0])
// Get remote certificate from the cluster member. We only need one node to be reachable for this.
// One might fail because the node is not part of the cluster anymore but was at the time the token was created.
var cert *x509.Certificate
var address string
var err error
for _, address = range token.JoinAddresses {
cert, err = utils.GetRemoteCertificate(address)
if err == nil {
break
}
}
if err != nil {
return fmt.Errorf("failed to get certificate of cluster member: %w", err)
}

// verify that the fingerprint of the certificate matches the fingerprint of the token
fingerprint := utils.CertFingerprint(cert)
if fingerprint != token.Fingerprint {
return fmt.Errorf("fingerprint from token (%q) does not match fingerprint of node %q (%q)", token.Fingerprint, token.JoinAddresses[0], fingerprint)
return fmt.Errorf("fingerprint from token (%q) does not match fingerprint of node %q (%q)", token.Fingerprint, address, fingerprint)
}

// Create the http client with trusted certificate
Expand All @@ -104,7 +113,7 @@ func (a *App) onBootstrapWorkerNode(ctx context.Context, s state.State, encodedT
return fmt.Errorf("failed to prepare worker info request: %w", err)
}

httpRequest, err := http.NewRequest("POST", fmt.Sprintf("https://%s/1.0/k8sd/worker/info", token.JoinAddresses[0]), bytes.NewBuffer(requestBody))
httpRequest, err := http.NewRequest("POST", fmt.Sprintf("https://%s/1.0/k8sd/worker/info", address), bytes.NewBuffer(requestBody))
if err != nil {
return fmt.Errorf("failed to prepare HTTP request: %w", err)
}
Expand Down
28 changes: 28 additions & 0 deletions tests/integration/tests/test_clustering_race.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#
# Copyright 2024 Canonical, Ltd.
#
from typing import List

import pytest
from test_util import harness, util


@pytest.mark.node_count(3)
def test_wrong_token_race(instances: List[harness.Instance]):
    """Join a node using a token issued before another node was removed.

    Sequence exercised:
      1. ``instances[1]`` joins the cluster hosted on ``instances[0]``.
      2. A token for ``instances[2]`` is requested while ``instances[1]``
         is still a member.
      3. ``instances[1]`` is removed and a second token is requested for
         ``instances[2]``.
      4. The two tokens are required to differ, then ``instances[2]`` joins
         with the *older* token from step 2.
    """
    seed_node = instances[0]

    # Grow the cluster by one member so that it can be removed again later.
    departing_node = instances[1]
    util.join_cluster(departing_node, util.get_join_token(seed_node, departing_node))

    # Token requested while the soon-to-be-removed node is still a member.
    late_joiner = instances[2]
    stale_token = util.get_join_token(seed_node, late_joiner)

    seed_node.exec(["k8s", "remove-node", departing_node.id])

    # Token requested after the removal.
    fresh_token = util.get_join_token(seed_node, late_joiner)

    # A join token contains the ip addresses of all cluster nodes, so removing
    # a node is expected to yield a different token.
    assert stale_token != fresh_token, "join token is not updated after node removal"

    # Deliberately join with the older token, requested before the removal.
    util.join_cluster(late_joiner, stale_token)

0 comments on commit f7167b5

Please sign in to comment.