From 4291ca381ab8b62ab2e20bc0d6494005367b78cf Mon Sep 17 00:00:00 2001 From: Lucian Petrut Date: Wed, 13 Nov 2024 13:31:09 +0200 Subject: [PATCH 1/9] Fix flaky tests (#767) --- .github/workflows/cron-jobs.yaml | 4 +- .github/workflows/integration-informing.yaml | 6 +- .github/workflows/integration.yaml | 10 ++- .github/workflows/nightly-test.yaml | 2 +- .../patches/strict/0001-Strict-patch.patch | 77 ++++++++++++++++--- k8s/lib.sh | 11 +++ ...tstrap-session.yaml => bootstrap-all.yaml} | 0 tests/integration/tests/conftest.py | 48 ++---------- tests/integration/tests/test_cleanup.py | 40 +--------- tests/integration/tests/test_clustering.py | 22 +++++- tests/integration/tests/test_dns.py | 20 +++-- tests/integration/tests/test_gateway.py | 35 ++++++--- tests/integration/tests/test_ingress.py | 33 +++++--- .../integration/tests/test_metrics_server.py | 20 +++-- tests/integration/tests/test_network.py | 20 +++-- tests/integration/tests/test_storage.py | 40 ++++++---- tests/integration/tests/test_util/config.py | 2 +- tests/integration/tests/test_util/util.py | 42 ++++++++++ 18 files changed, 275 insertions(+), 157 deletions(-) rename tests/integration/templates/{bootstrap-session.yaml => bootstrap-all.yaml} (100%) diff --git a/.github/workflows/cron-jobs.yaml b/.github/workflows/cron-jobs.yaml index bd9e125e7..59a1227a1 100644 --- a/.github/workflows/cron-jobs.yaml +++ b/.github/workflows/cron-jobs.yaml @@ -108,6 +108,8 @@ jobs: format: "sarif" output: "trivy-k8s-repo-scan--results.sarif" severity: "MEDIUM,HIGH,CRITICAL" + env: + TRIVY_DB_REPOSITORY: "public.ecr.aws/aquasecurity/trivy-db" - name: Gather Trivy repo scan results run: | cp trivy-k8s-repo-scan--results.sarif ./sarifs/ @@ -119,7 +121,7 @@ jobs: for var in $(env | grep -o '^TRIVY_[^=]*'); do unset "$var" done - ./trivy rootfs ./squashfs-root/ --format sarif > sarifs/snap.sarif + ./trivy --db-repository public.ecr.aws/aquasecurity/trivy-db rootfs ./squashfs-root/ --format sarif > sarifs/snap.sarif - name: Get 
HEAD sha run: | SHA="$(git rev-parse HEAD)" diff --git a/.github/workflows/integration-informing.yaml b/.github/workflows/integration-informing.yaml index 49c0b46ee..d4ca93ae9 100644 --- a/.github/workflows/integration-informing.yaml +++ b/.github/workflows/integration-informing.yaml @@ -60,14 +60,14 @@ jobs: os: ["ubuntu:20.04"] patch: ["strict", "moonray"] fail-fast: false - runs-on: ubuntu-20.04 + runs-on: ["self-hosted", "Linux", "AMD64", "jammy", "large"] steps: - name: Check out code uses: actions/checkout@v4 - name: Setup Python uses: actions/setup-python@v5 with: - python-version: '3.8' + python-version: '3.10' - name: Install tox run: pip install tox - name: Install lxd @@ -76,6 +76,8 @@ jobs: sudo lxd init --auto sudo usermod --append --groups lxd $USER sg lxd -c 'lxc version' + sudo iptables -I DOCKER-USER -i lxdbr0 -j ACCEPT + sudo iptables -I DOCKER-USER -o lxdbr0 -m conntrack --ctstate RELATED,ESTABLISHED -j ACCEPT - name: Download snap uses: actions/download-artifact@v4 with: diff --git a/.github/workflows/integration.yaml b/.github/workflows/integration.yaml index 3214f1645..bfcc903a0 100644 --- a/.github/workflows/integration.yaml +++ b/.github/workflows/integration.yaml @@ -73,7 +73,7 @@ jobs: fail-fast: false matrix: os: ["ubuntu:20.04", "ubuntu:22.04", "ubuntu:24.04"] - runs-on: ubuntu-20.04 + runs-on: ["self-hosted", "Linux", "AMD64", "jammy", "large"] needs: build steps: @@ -82,7 +82,7 @@ jobs: - name: Setup Python uses: actions/setup-python@v5 with: - python-version: '3.8' + python-version: '3.10' - name: Install tox run: pip install tox - name: Install lxd @@ -91,6 +91,8 @@ jobs: sudo lxd init --auto sudo usermod --append --groups lxd $USER sg lxd -c 'lxc version' + sudo iptables -I DOCKER-USER -i lxdbr0 -j ACCEPT + sudo iptables -I DOCKER-USER -o lxdbr0 -m conntrack --ctstate RELATED,ESTABLISHED -j ACCEPT - name: Download snap uses: actions/download-artifact@v4 with: @@ -163,6 +165,8 @@ jobs: format: "sarif" output: 
"trivy-k8s-repo-scan--results.sarif" severity: "MEDIUM,HIGH,CRITICAL" + env: + TRIVY_DB_REPOSITORY: "public.ecr.aws/aquasecurity/trivy-db" - name: Gather Trivy repo scan results run: | cp trivy-k8s-repo-scan--results.sarif ./manual-trivy/sarifs/ @@ -173,7 +177,7 @@ jobs: done cp build/k8s.snap . unsquashfs k8s.snap - ./manual-trivy/trivy rootfs ./squashfs-root/ --format sarif > ./manual-trivy/sarifs/snap.sarif + ./manual-trivy/trivy --db-repository public.ecr.aws/aquasecurity/trivy-db rootfs ./squashfs-root/ --format sarif > ./manual-trivy/sarifs/snap.sarif - name: Upload Trivy scan results to GitHub Security tab uses: github/codeql-action/upload-sarif@v3 with: diff --git a/.github/workflows/nightly-test.yaml b/.github/workflows/nightly-test.yaml index e8dab660c..d2fd49972 100644 --- a/.github/workflows/nightly-test.yaml +++ b/.github/workflows/nightly-test.yaml @@ -17,7 +17,7 @@ jobs: release: ["latest/edge"] fail-fast: false # TODO: remove once arm64 works - runs-on: ${{ matrix.arch == 'arm64' && 'Ubuntu_ARM64_4C_16G_01' || 'ubuntu-20.04' }} + runs-on: ${{ matrix.arch == 'arm64' && ["self-hosted", "Linux", "ARM64", "jammy", "large"] || ["self-hosted", "Linux", "AMD64", "jammy", "large"] }} steps: - name: Checking out repo diff --git a/build-scripts/patches/strict/0001-Strict-patch.patch b/build-scripts/patches/strict/0001-Strict-patch.patch index 2ca7c50fa..7bbc5a71b 100644 --- a/build-scripts/patches/strict/0001-Strict-patch.patch +++ b/build-scripts/patches/strict/0001-Strict-patch.patch @@ -1,16 +1,17 @@ -From 3338580f4e22b001615320c40b1c1ad95f8a945e Mon Sep 17 00:00:00 2001 +From 94dadc0e3963e0b01af66e490500c619ec45c019 Mon Sep 17 00:00:00 2001 From: Angelos Kolaitis Date: Fri, 10 May 2024 19:17:55 +0300 Subject: [PATCH] Strict patch --- - k8s/hack/init.sh | 6 +- - k8s/wrappers/services/containerd | 5 - - snap/snapcraft.yaml | 168 ++++++++++++++++++++++++++++++- - 3 files changed, 172 insertions(+), 7 deletions(-) + k8s/hack/init.sh | 6 +- + 
k8s/wrappers/services/containerd | 5 - + snap/snapcraft.yaml | 171 +++++++++++++++++++++- + tests/integration/tests/test_util/util.py | 38 +++-- + 4 files changed, 198 insertions(+), 22 deletions(-) diff --git a/k8s/hack/init.sh b/k8s/hack/init.sh -index a0b57c7d..d53b528a 100755 +index a0b57c7..d53b528 100755 --- a/k8s/hack/init.sh +++ b/k8s/hack/init.sh @@ -1,3 +1,7 @@ @@ -23,7 +24,7 @@ index a0b57c7d..d53b528a 100755 +"${DIR}/connect-interfaces.sh" +"${DIR}/network-requirements.sh" diff --git a/k8s/wrappers/services/containerd b/k8s/wrappers/services/containerd -index c3f71a01..a82e1c03 100755 +index c3f71a0..a82e1c0 100755 --- a/k8s/wrappers/services/containerd +++ b/k8s/wrappers/services/containerd @@ -21,9 +21,4 @@ You can try to apply the profile manually by running: @@ -37,7 +38,7 @@ index c3f71a01..a82e1c03 100755 - k8s::common::execute_service containerd diff --git a/snap/snapcraft.yaml b/snap/snapcraft.yaml -index 54b5fc0b..01631684 100644 +index 9d21e55..26f49ad 100644 --- a/snap/snapcraft.yaml +++ b/snap/snapcraft.yaml @@ -7,7 +7,7 @@ description: |- @@ -49,7 +50,7 @@ index 54b5fc0b..01631684 100644 base: core20 environment: REAL_PATH: $PATH -@@ -216,6 +216,20 @@ parts: +@@ -217,6 +217,20 @@ parts: apps: k8s: command: k8s/wrappers/commands/k8s @@ -70,7 +71,7 @@ index 54b5fc0b..01631684 100644 containerd: command: k8s/wrappers/services/containerd daemon: notify -@@ -226,43 +240,195 @@ apps: +@@ -227,43 +241,198 @@ apps: restart-condition: always start-timeout: 5m before: [kubelet] @@ -263,9 +264,61 @@ index 54b5fc0b..01631684 100644 + plugs: + - network + - network-bind -+ - process-control + - network-control ++ - network-observe ++ - process-control + - firewall-control ++ - system-observe ++ - mount-observe +diff --git a/tests/integration/tests/test_util/util.py b/tests/integration/tests/test_util/util.py +index 3e54d68..295c458 100644 +--- a/tests/integration/tests/test_util/util.py ++++ b/tests/integration/tests/test_util/util.py +@@ -191,21 
+191,29 @@ def remove_k8s_snap(instance: harness.Instance): + ["snap", "remove", config.SNAP_NAME, "--purge"] + ) + +- LOG.info("Waiting for shims to go away...") +- stubbornly(retries=20, delay_s=5).on(instance).until( +- lambda p: all( +- x not in p.stdout.decode() +- for x in ["containerd-shim", "cilium", "coredns", "/pause"] +- ) +- ).exec(["ps", "-fea"]) +- +- LOG.info("Waiting for kubelet and containerd mounts to go away...") +- stubbornly(retries=20, delay_s=5).on(instance).until( +- lambda p: all( +- x not in p.stdout.decode() +- for x in ["/var/lib/kubelet/pods", "/run/containerd/io.containerd"] +- ) +- ).exec(["mount"]) ++ # NOTE(lpetrut): on "strict", the snap remove hook is unable to: ++ # * terminate processes ++ # * remove network namespaces ++ # * list mounts ++ # ++ # https://paste.ubuntu.com/p/WscCCfnvGH/plain/ ++ # https://paste.ubuntu.com/p/sSnJVvZkrr/plain/ ++ # ++ # LOG.info("Waiting for shims to go away...") ++ # stubbornly(retries=20, delay_s=5).on(instance).until( ++ # lambda p: all( ++ # x not in p.stdout.decode() ++ # for x in ["containerd-shim", "cilium", "coredns", "/pause"] ++ # ) ++ # ).exec(["ps", "-fea"]) ++ # ++ # LOG.info("Waiting for kubelet and containerd mounts to go away...") ++ # stubbornly(retries=20, delay_s=5).on(instance).until( ++ # lambda p: all( ++ # x not in p.stdout.decode() ++ # for x in ["/var/lib/kubelet/pods", "/run/containerd/io.containerd"] ++ # ) ++ # ).exec(["mount"]) + + # NOTE(neoaggelos): Temporarily disable this as it fails on strict. + # For details, `snap changes` then `snap change $remove_k8s_snap_change`. -- -2.34.1 +2.43.0 diff --git a/k8s/lib.sh b/k8s/lib.sh index 3ef47f516..a30f3a30a 100755 --- a/k8s/lib.sh +++ b/k8s/lib.sh @@ -60,6 +60,10 @@ k8s::remove::containers() { # delete cni network namespaces ip netns list | cut -f1 -d' ' | grep -- "^cni-" | xargs -n1 -r -t ip netns delete || true + # The PVC loopback devices use container paths, making them tricky to identify. 
+ # We'll rely on the volume mount paths (/var/lib/kubelet/*). + local LOOP_DEVICES=`cat /proc/mounts | grep /var/lib/kubelet/pods | grep /dev/loop | cut -d " " -f 1` + # unmount Pod NFS volumes forcefully, as unmounting them normally may hang otherwise. cat /proc/mounts | grep /run/containerd/io.containerd. | grep "nfs[34]" | cut -f2 -d' ' | xargs -r -t umount -f || true cat /proc/mounts | grep /var/lib/kubelet/pods | grep "nfs[34]" | cut -f2 -d' ' | xargs -r -t umount -f || true @@ -79,6 +83,13 @@ k8s::remove::containers() { # so kubelet won't try to access inexistent plugins on reinstallation. find /var/lib/kubelet/plugins/ -name "*.sock" | xargs rm -f || true rm /var/lib/kubelet/plugins_registry/*.sock || true + + cat /proc/mounts | grep /var/snap/k8s/common/var/lib/containerd/ | cut -f2 -d' ' | xargs -r -t umount || true + + # cleanup loopback devices + for dev in $LOOP_DEVICES; do + losetup -d $dev + done } # Run a ctr command against the local containerd socket diff --git a/tests/integration/templates/bootstrap-session.yaml b/tests/integration/templates/bootstrap-all.yaml similarity index 100% rename from tests/integration/templates/bootstrap-session.yaml rename to tests/integration/templates/bootstrap-all.yaml diff --git a/tests/integration/tests/conftest.py b/tests/integration/tests/conftest.py index 20164d3b9..04efacd98 100644 --- a/tests/integration/tests/conftest.py +++ b/tests/integration/tests/conftest.py @@ -185,55 +185,17 @@ def instances( # Cleanup after each test. # We cannot execute _harness_clean() here as this would also - # remove the session_instance. The harness ensures that everything is cleaned up + # remove session scoped instances. The harness ensures that everything is cleaned up # at the end of the test session. 
for instance in instances: if config.INSPECTION_REPORTS_DIR is not None: LOG.debug("Generating inspection reports for test instances") _generate_inspection_report(h, instance.id) - h.delete_instance(instance.id) - - -@pytest.fixture(scope="session") -def session_instance( - h: harness.Harness, tmp_path_factory: pytest.TempPathFactory, request -) -> Generator[harness.Instance, None, None]: - """Constructs and bootstraps an instance that persists over a test session. - - Bootstraps the instance with all k8sd features enabled to reduce testing time. - """ - LOG.info("Setup node and enable all features") - - tmp_path = tmp_path_factory.mktemp("data") - instance = h.new_instance() - snap = next(snap_versions(request)) - util.setup_k8s_snap(instance, tmp_path, snap) - - bootstrap_config_path = "/home/ubuntu/bootstrap-session.yaml" - instance.send_file( - (config.MANIFESTS_DIR / "bootstrap-session.yaml").as_posix(), - bootstrap_config_path, - ) - - instance_default_ip = util.get_default_ip(instance) - - instance.exec(["k8s", "bootstrap", "--file", bootstrap_config_path]) - instance_default_cidr = util.get_default_cidr(instance, instance_default_ip) - - lb_cidr = util.find_suitable_cidr( - parent_cidr=instance_default_cidr, - excluded_ips=[instance_default_ip], - ) - - instance.exec( - ["k8s", "set", f"load-balancer.cidrs={lb_cidr}", "load-balancer.l2-mode=true"] - ) - util.wait_until_k8s_ready(instance, [instance]) - util.wait_for_network(instance) - util.wait_for_dns(instance) - - yield instance + try: + util.remove_k8s_snap(instance) + finally: + h.delete_instance(instance.id) @pytest.fixture(scope="function") diff --git a/tests/integration/tests/test_cleanup.py b/tests/integration/tests/test_cleanup.py index fb19a1ebf..18ce44400 100644 --- a/tests/integration/tests/test_cleanup.py +++ b/tests/integration/tests/test_cleanup.py @@ -5,7 +5,7 @@ from typing import List import pytest -from test_util import config, harness, util +from test_util import harness, util LOG = 
logging.getLogger(__name__) @@ -16,40 +16,4 @@ def test_node_cleanup(instances: List[harness.Instance]): util.wait_for_dns(instance) util.wait_for_network(instance) - LOG.info("Uninstall k8s...") - instance.exec(["snap", "remove", config.SNAP_NAME, "--purge"]) - - LOG.info("Waiting for shims to go away...") - util.stubbornly(retries=5, delay_s=5).on(instance).until( - lambda p: all( - x not in p.stdout.decode() - for x in ["containerd-shim", "cilium", "coredns", "/pause"] - ) - ).exec(["ps", "-fea"]) - - LOG.info("Waiting for kubelet and containerd mounts to go away...") - util.stubbornly(retries=5, delay_s=5).on(instance).until( - lambda p: all( - x not in p.stdout.decode() - for x in ["/var/lib/kubelet/pods", "/run/containerd/io.containerd"] - ) - ).exec(["mount"]) - - # NOTE(neoaggelos): Temporarily disable this as it fails on strict. - # For details, `snap changes` then `snap change $remove_k8s_snap_change`. - # Example output follows: - # - # 2024-02-23T14:10:42Z ERROR ignoring failure in hook "remove": - # ----- - # ... 
- # ip netns delete cni-UUID1 - # Cannot remove namespace file "/run/netns/cni-UUID1": Device or resource busy - # ip netns delete cni-UUID2 - # Cannot remove namespace file "/run/netns/cni-UUID2": Device or resource busy - # ip netns delete cni-UUID3 - # Cannot remove namespace file "/run/netns/cni-UUID3": Device or resource busy - - # LOG.info("Waiting for CNI network namespaces to go away...") - # util.stubbornly(retries=5, delay_s=5).on(instance).until( - # lambda p: "cni-" not in p.stdout.decode() - # ).exec(["ip", "netns", "list"]) + util.remove_k8s_snap(instance) diff --git a/tests/integration/tests/test_clustering.py b/tests/integration/tests/test_clustering.py index 8235e56cf..b5a24df31 100644 --- a/tests/integration/tests/test_clustering.py +++ b/tests/integration/tests/test_clustering.py @@ -119,7 +119,19 @@ def test_no_remove(instances: List[harness.Instance]): assert "control-plane" in util.get_local_node_status(joining_cp) assert "worker" in util.get_local_node_status(joining_worker) - cluster_node.exec(["k8s", "remove-node", joining_cp.id]) + # TODO: k8sd sometimes fails when requested to remove nodes immediately + # after bootstrapping the cluster. It seems that it takes a little + # longer for trust store changes to be propagated to all nodes, which + # should probably be fixed on the microcluster side. + # + # For now, we'll perform some retries. 
+ # + # failed to POST /k8sd/cluster/remove: failed to delete cluster member + # k8s-integration-c1aee0-2: No truststore entry found for node with name + # "k8s-integration-c1aee0-2" + util.stubbornly(retries=3, delay_s=5).on(cluster_node).exec( + ["k8s", "remove-node", joining_cp.id] + ) nodes = util.ready_nodes(cluster_node) assert len(nodes) == 3, "cp node should not have been removed from cluster" cluster_node.exec(["k8s", "remove-node", joining_worker.id]) @@ -142,10 +154,12 @@ def test_skip_services_stop_on_remove(instances: List[harness.Instance]): join_token_worker = util.get_join_token(cluster_node, worker, "--worker") util.join_cluster(worker, join_token_worker) - # We don't care if the node is ready or the CNI is up. - util.stubbornly(retries=5, delay_s=3).until(util.get_nodes(cluster_node) == 3) + util.wait_until_k8s_ready(cluster_node, instances) - cluster_node.exec(["k8s", "remove-node", joining_cp.id]) + # TODO: skip retrying this once the microcluster trust store issue is addressed. + util.stubbornly(retries=3, delay_s=5).on(cluster_node).exec( + ["k8s", "remove-node", joining_cp.id] + ) nodes = util.ready_nodes(cluster_node) assert len(nodes) == 2, "cp node should have been removed from the cluster" services = joining_cp.exec( diff --git a/tests/integration/tests/test_dns.py b/tests/integration/tests/test_dns.py index e51285d4b..72b13b82a 100644 --- a/tests/integration/tests/test_dns.py +++ b/tests/integration/tests/test_dns.py @@ -2,14 +2,22 @@ # Copyright 2024 Canonical, Ltd. 
# import logging +from typing import List -from test_util import harness, util +import pytest +from test_util import config, harness, util LOG = logging.getLogger(__name__) -def test_dns(session_instance: harness.Instance): - session_instance.exec( +@pytest.mark.bootstrap_config((config.MANIFESTS_DIR / "bootstrap-all.yaml").read_text()) +def test_dns(instances: List[harness.Instance]): + instance = instances[0] + util.wait_until_k8s_ready(instance, [instance]) + util.wait_for_network(instance) + util.wait_for_dns(instance) + + instance.exec( [ "k8s", "kubectl", @@ -23,7 +31,7 @@ def test_dns(session_instance: harness.Instance): ], ) - util.stubbornly(retries=3, delay_s=1).on(session_instance).exec( + util.stubbornly(retries=3, delay_s=1).on(instance).exec( [ "k8s", "kubectl", @@ -37,14 +45,14 @@ def test_dns(session_instance: harness.Instance): ] ) - result = session_instance.exec( + result = instance.exec( ["k8s", "kubectl", "exec", "busybox", "--", "nslookup", "kubernetes.default"], capture_output=True, ) assert "10.152.183.1 kubernetes.default.svc.cluster.local" in result.stdout.decode() - result = session_instance.exec( + result = instance.exec( ["k8s", "kubectl", "exec", "busybox", "--", "nslookup", "canonical.com"], capture_output=True, check=False, diff --git a/tests/integration/tests/test_gateway.py b/tests/integration/tests/test_gateway.py index dd5f33ba0..c2116e7cb 100644 --- a/tests/integration/tests/test_gateway.py +++ b/tests/integration/tests/test_gateway.py @@ -6,8 +6,10 @@ import subprocess import time from pathlib import Path +from typing import List -from test_util import harness, util +import pytest +from test_util import config, harness, util from test_util.config import MANIFESTS_DIR LOG = logging.getLogger(__name__) @@ -66,20 +68,35 @@ def get_external_service_ip(instance: harness.Instance) -> str: return gateway_ip -def test_gateway(session_instance: harness.Instance): +@pytest.mark.bootstrap_config((config.MANIFESTS_DIR / 
"bootstrap-all.yaml").read_text()) +def test_gateway(instances: List[harness.Instance]): + instance = instances[0] + instance_default_ip = util.get_default_ip(instance) + instance_default_cidr = util.get_default_cidr(instance, instance_default_ip) + lb_cidr = util.find_suitable_cidr( + parent_cidr=instance_default_cidr, + excluded_ips=[instance_default_ip], + ) + instance.exec( + ["k8s", "set", f"load-balancer.cidrs={lb_cidr}", "load-balancer.l2-mode=true"] + ) + util.wait_until_k8s_ready(instance, [instance]) + util.wait_for_network(instance) + util.wait_for_dns(instance) + manifest = MANIFESTS_DIR / "gateway-test.yaml" - session_instance.exec( + instance.exec( ["k8s", "kubectl", "apply", "-f", "-"], input=Path(manifest).read_bytes(), ) LOG.info("Waiting for nginx pod to show up...") - util.stubbornly(retries=5, delay_s=10).on(session_instance).until( + util.stubbornly(retries=5, delay_s=10).on(instance).until( lambda p: "my-nginx" in p.stdout.decode() ).exec(["k8s", "kubectl", "get", "pod", "-o", "json"]) LOG.info("Nginx pod showed up.") - util.stubbornly(retries=3, delay_s=1).on(session_instance).exec( + util.stubbornly(retries=3, delay_s=1).on(instance).exec( [ "k8s", "kubectl", @@ -97,7 +114,7 @@ def test_gateway(session_instance: harness.Instance): gateway_http_port = None result = ( util.stubbornly(retries=7, delay_s=3) - .on(session_instance) + .on(instance) .until(lambda p: get_gateway_service_node_port(p) is not None) .exec(["k8s", "kubectl", "get", "service", "-o", "json"]) ) @@ -106,12 +123,12 @@ def test_gateway(session_instance: harness.Instance): assert gateway_http_port is not None, "No Gateway nodePort found." # Test the Gateway service via loadbalancer IP. - util.stubbornly(retries=5, delay_s=5).on(session_instance).until( + util.stubbornly(retries=5, delay_s=5).on(instance).until( lambda p: "Welcome to nginx!" 
in p.stdout.decode() ).exec(["curl", f"localhost:{gateway_http_port}"]) - gateway_ip = get_external_service_ip(session_instance) + gateway_ip = get_external_service_ip(instance) assert gateway_ip is not None, "No Gateway IP found." - util.stubbornly(retries=5, delay_s=5).on(session_instance).until( + util.stubbornly(retries=5, delay_s=5).on(instance).until( lambda p: "Welcome to nginx!" in p.stdout.decode() ).exec(["curl", f"{gateway_ip}", "-H", "Host: foo.bar.com"]) diff --git a/tests/integration/tests/test_ingress.py b/tests/integration/tests/test_ingress.py index ad8a9541e..c39115f83 100644 --- a/tests/integration/tests/test_ingress.py +++ b/tests/integration/tests/test_ingress.py @@ -8,7 +8,8 @@ from pathlib import Path from typing import List -from test_util import harness, util +import pytest +from test_util import config, harness, util from test_util.config import MANIFESTS_DIR LOG = logging.getLogger(__name__) @@ -72,11 +73,25 @@ def get_external_service_ip(instance: harness.Instance, service_namespace) -> st return ingress_ip -def test_ingress(session_instance: List[harness.Instance]): +@pytest.mark.bootstrap_config((config.MANIFESTS_DIR / "bootstrap-all.yaml").read_text()) +def test_ingress(instances: List[harness.Instance]): + instance = instances[0] + instance_default_ip = util.get_default_ip(instance) + instance_default_cidr = util.get_default_cidr(instance, instance_default_ip) + lb_cidr = util.find_suitable_cidr( + parent_cidr=instance_default_cidr, + excluded_ips=[instance_default_ip], + ) + instance.exec( + ["k8s", "set", f"load-balancer.cidrs={lb_cidr}", "load-balancer.l2-mode=true"] + ) + util.wait_until_k8s_ready(instance, [instance]) + util.wait_for_network(instance) + util.wait_for_dns(instance) result = ( util.stubbornly(retries=7, delay_s=3) - .on(session_instance) + .on(instance) .until(lambda p: get_ingress_service_node_port(p) is not None) .exec(["k8s", "kubectl", "get", "service", "-A", "-o", "json"]) ) @@ -86,18 +101,18 @@ def 
test_ingress(session_instance: List[harness.Instance]): assert ingress_http_port is not None, "No ingress nodePort found." manifest = MANIFESTS_DIR / "ingress-test.yaml" - session_instance.exec( + instance.exec( ["k8s", "kubectl", "apply", "-f", "-"], input=Path(manifest).read_bytes(), ) LOG.info("Waiting for nginx pod to show up...") - util.stubbornly(retries=5, delay_s=10).on(session_instance).until( + util.stubbornly(retries=5, delay_s=10).on(instance).until( lambda p: "my-nginx" in p.stdout.decode() ).exec(["k8s", "kubectl", "get", "pod", "-o", "json"]) LOG.info("Nginx pod showed up.") - util.stubbornly(retries=3, delay_s=1).on(session_instance).exec( + util.stubbornly(retries=3, delay_s=1).on(instance).exec( [ "k8s", "kubectl", @@ -111,19 +126,19 @@ def test_ingress(session_instance: List[harness.Instance]): ] ) - util.stubbornly(retries=5, delay_s=5).on(session_instance).until( + util.stubbornly(retries=5, delay_s=5).on(instance).until( lambda p: "Welcome to nginx!" in p.stdout.decode() ).exec(["curl", f"localhost:{ingress_http_port}", "-H", "Host: foo.bar.com"]) # Test the ingress service via loadbalancer IP ingress_ip = get_external_service_ip( - session_instance, + instance, [ {"service": "ck-ingress-contour-envoy", "namespace": "projectcontour"}, {"service": "cilium-ingress", "namespace": "kube-system"}, ], ) assert ingress_ip is not None, "No ingress IP found." - util.stubbornly(retries=5, delay_s=5).on(session_instance).until( + util.stubbornly(retries=5, delay_s=5).on(instance).until( lambda p: "Welcome to nginx!" in p.stdout.decode() ).exec(["curl", f"{ingress_ip}", "-H", "Host: foo.bar.com"]) diff --git a/tests/integration/tests/test_metrics_server.py b/tests/integration/tests/test_metrics_server.py index 1fa0331c9..0759a41e4 100644 --- a/tests/integration/tests/test_metrics_server.py +++ b/tests/integration/tests/test_metrics_server.py @@ -2,20 +2,28 @@ # Copyright 2024 Canonical, Ltd. 
# import logging +from typing import List -from test_util import harness, util +import pytest +from test_util import config, harness, util LOG = logging.getLogger(__name__) -def test_metrics_server(session_instance: harness.Instance): +@pytest.mark.bootstrap_config((config.MANIFESTS_DIR / "bootstrap-all.yaml").read_text()) +def test_metrics_server(instances: List[harness.Instance]): + instance = instances[0] + util.wait_until_k8s_ready(instance, [instance]) + util.wait_for_network(instance) + util.wait_for_dns(instance) + LOG.info("Waiting for metrics-server pod to show up...") - util.stubbornly(retries=15, delay_s=5).on(session_instance).until( + util.stubbornly(retries=15, delay_s=5).on(instance).until( lambda p: "metrics-server" in p.stdout.decode() ).exec(["k8s", "kubectl", "get", "pod", "-n", "kube-system", "-o", "json"]) LOG.info("Metrics-server pod showed up.") - util.stubbornly(retries=3, delay_s=1).on(session_instance).exec( + util.stubbornly(retries=3, delay_s=1).on(instance).exec( [ "k8s", "kubectl", @@ -31,6 +39,6 @@ def test_metrics_server(session_instance: harness.Instance): ] ) - util.stubbornly(retries=15, delay_s=5).on(session_instance).until( - lambda p: session_instance.id in p.stdout.decode() + util.stubbornly(retries=15, delay_s=5).on(instance).until( + lambda p: instance.id in p.stdout.decode() ).exec(["k8s", "kubectl", "top", "node"]) diff --git a/tests/integration/tests/test_network.py b/tests/integration/tests/test_network.py index e4d483b4a..838a5c249 100644 --- a/tests/integration/tests/test_network.py +++ b/tests/integration/tests/test_network.py @@ -4,21 +4,29 @@ import json import logging from pathlib import Path +from typing import List -from test_util import harness, util +import pytest +from test_util import config, harness, util from test_util.config import MANIFESTS_DIR LOG = logging.getLogger(__name__) -def test_network(session_instance: harness.Instance): +@pytest.mark.bootstrap_config((config.MANIFESTS_DIR / 
"bootstrap-all.yaml").read_text()) +def test_network(instances: List[harness.Instance]): + instance = instances[0] + util.wait_until_k8s_ready(instance, [instance]) + util.wait_for_network(instance) + util.wait_for_dns(instance) + manifest = MANIFESTS_DIR / "nginx-pod.yaml" - p = session_instance.exec( + p = instance.exec( ["k8s", "kubectl", "apply", "-f", "-"], input=Path(manifest).read_bytes(), ) - util.stubbornly(retries=3, delay_s=1).on(session_instance).exec( + util.stubbornly(retries=3, delay_s=1).on(instance).exec( [ "k8s", "kubectl", @@ -32,7 +40,7 @@ def test_network(session_instance: harness.Instance): ] ) - p = session_instance.exec( + p = instance.exec( [ "k8s", "kubectl", @@ -51,6 +59,6 @@ def test_network(session_instance: harness.Instance): assert len(out["items"]) > 0, "No NGINX pod found" podIP = out["items"][0]["status"]["podIP"] - util.stubbornly(retries=5, delay_s=5).on(session_instance).until( + util.stubbornly(retries=5, delay_s=5).on(instance).until( lambda p: "Welcome to nginx!" 
in p.stdout.decode() ).exec(["curl", "-s", f"http://{podIP}"]) diff --git a/tests/integration/tests/test_storage.py b/tests/integration/tests/test_storage.py index 2a8ce2c6f..497e401d9 100644 --- a/tests/integration/tests/test_storage.py +++ b/tests/integration/tests/test_storage.py @@ -5,8 +5,10 @@ import logging import subprocess from pathlib import Path +from typing import List -from test_util import harness, util +import pytest +from test_util import config, harness, util from test_util.config import MANIFESTS_DIR LOG = logging.getLogger(__name__) @@ -20,14 +22,20 @@ def check_pvc_bound(p: subprocess.CompletedProcess) -> bool: return False -def test_storage(session_instance: harness.Instance): +@pytest.mark.bootstrap_config((config.MANIFESTS_DIR / "bootstrap-all.yaml").read_text()) +def test_storage(instances: List[harness.Instance]): + instance = instances[0] + util.wait_until_k8s_ready(instance, [instance]) + util.wait_for_network(instance) + util.wait_for_dns(instance) + LOG.info("Waiting for storage provisioner pod to show up...") - util.stubbornly(retries=15, delay_s=5).on(session_instance).until( + util.stubbornly(retries=15, delay_s=5).on(instance).until( lambda p: "ck-storage" in p.stdout.decode() ).exec(["k8s", "kubectl", "get", "pod", "-n", "kube-system", "-o", "json"]) LOG.info("Storage provisioner pod showed up.") - util.stubbornly(retries=3, delay_s=1).on(session_instance).exec( + util.stubbornly(retries=3, delay_s=1).on(instance).exec( [ "k8s", "kubectl", @@ -44,18 +52,18 @@ def test_storage(session_instance: harness.Instance): ) manifest = MANIFESTS_DIR / "storage-setup.yaml" - session_instance.exec( + instance.exec( ["k8s", "kubectl", "apply", "-f", "-"], input=Path(manifest).read_bytes(), ) LOG.info("Waiting for storage writer pod to show up...") - util.stubbornly(retries=3, delay_s=10).on(session_instance).until( + util.stubbornly(retries=3, delay_s=10).on(instance).until( lambda p: "storage-writer-pod" in p.stdout.decode() ).exec(["k8s", 
"kubectl", "get", "pod", "-o", "json"]) LOG.info("Storage writer pod showed up.") - util.stubbornly(retries=3, delay_s=1).on(session_instance).exec( + util.stubbornly(retries=3, delay_s=1).on(instance).exec( [ "k8s", "kubectl", @@ -70,16 +78,16 @@ def test_storage(session_instance: harness.Instance): ) LOG.info("Waiting for storage to get provisioned...") - util.stubbornly(retries=3, delay_s=1).on(session_instance).until( - check_pvc_bound - ).exec(["k8s", "kubectl", "get", "pvc", "-o", "json"]) + util.stubbornly(retries=3, delay_s=1).on(instance).until(check_pvc_bound).exec( + ["k8s", "kubectl", "get", "pvc", "-o", "json"] + ) LOG.info("Storage got provisioned and pvc is bound.") - util.stubbornly(retries=5, delay_s=10).on(session_instance).until( + util.stubbornly(retries=5, delay_s=10).on(instance).until( lambda p: "LOREM IPSUM" in p.stdout.decode() ).exec(["k8s", "kubectl", "logs", "storage-writer-pod"]) - util.stubbornly(retries=3, delay_s=1).on(session_instance).exec( + util.stubbornly(retries=3, delay_s=1).on(instance).exec( [ "k8s", "kubectl", @@ -92,18 +100,18 @@ def test_storage(session_instance: harness.Instance): ) manifest = MANIFESTS_DIR / "storage-test.yaml" - session_instance.exec( + instance.exec( ["k8s", "kubectl", "apply", "-f", "-"], input=Path(manifest).read_bytes(), ) LOG.info("Waiting for storage reader pod to show up...") - util.stubbornly(retries=3, delay_s=10).on(session_instance).until( + util.stubbornly(retries=3, delay_s=10).on(instance).until( lambda p: "storage-reader-pod" in p.stdout.decode() ).exec(["k8s", "kubectl", "get", "pod", "-o", "json"]) LOG.info("Storage reader pod showed up.") - util.stubbornly(retries=3, delay_s=1).on(session_instance).exec( + util.stubbornly(retries=3, delay_s=1).on(instance).exec( [ "k8s", "kubectl", @@ -117,7 +125,7 @@ def test_storage(session_instance: harness.Instance): ] ) - util.stubbornly(retries=5, delay_s=10).on(session_instance).until( + util.stubbornly(retries=5, 
delay_s=10).on(instance).until( lambda p: "LOREM IPSUM" in p.stdout.decode() ).exec(["k8s", "kubectl", "logs", "storage-reader-pod"]) diff --git a/tests/integration/tests/test_util/config.py b/tests/integration/tests/test_util/config.py index f85021573..f44ae122e 100644 --- a/tests/integration/tests/test_util/config.py +++ b/tests/integration/tests/test_util/config.py @@ -7,7 +7,7 @@ DIR = Path(__file__).absolute().parent # The following defaults are used to define how long to wait for a condition to be met. -DEFAULT_WAIT_RETRIES = int(os.getenv("TEST_DEFAULT_WAIT_RETRIES") or 30) +DEFAULT_WAIT_RETRIES = int(os.getenv("TEST_DEFAULT_WAIT_RETRIES") or 60) DEFAULT_WAIT_DELAY_S = int(os.getenv("TEST_DEFAULT_WAIT_DELAY_S") or 5) MANIFESTS_DIR = DIR / ".." / ".." / "templates" diff --git a/tests/integration/tests/test_util/util.py b/tests/integration/tests/test_util/util.py index 352589c48..5ae53f2e2 100644 --- a/tests/integration/tests/test_util/util.py +++ b/tests/integration/tests/test_util/util.py @@ -185,6 +185,48 @@ def setup_k8s_snap( instance.exec(["/snap/k8s/current/k8s/hack/init.sh"], stdout=subprocess.DEVNULL) +def remove_k8s_snap(instance: harness.Instance): + LOG.info("Uninstall k8s...") + stubbornly(retries=20, delay_s=5).on(instance).exec( + ["snap", "remove", config.SNAP_NAME, "--purge"] + ) + + LOG.info("Waiting for shims to go away...") + stubbornly(retries=20, delay_s=5).on(instance).until( + lambda p: all( + x not in p.stdout.decode() + for x in ["containerd-shim", "cilium", "coredns", "/pause"] + ) + ).exec(["ps", "-fea"]) + + LOG.info("Waiting for kubelet and containerd mounts to go away...") + stubbornly(retries=20, delay_s=5).on(instance).until( + lambda p: all( + x not in p.stdout.decode() + for x in ["/var/lib/kubelet/pods", "/run/containerd/io.containerd"] + ) + ).exec(["mount"]) + + # NOTE(neoaggelos): Temporarily disable this as it fails on strict. + # For details, `snap changes` then `snap change $remove_k8s_snap_change`. 
+ # Example output follows: + # + # 2024-02-23T14:10:42Z ERROR ignoring failure in hook "remove": + # ----- + # ... + # ip netns delete cni-UUID1 + # Cannot remove namespace file "/run/netns/cni-UUID1": Device or resource busy + # ip netns delete cni-UUID2 + # Cannot remove namespace file "/run/netns/cni-UUID2": Device or resource busy + # ip netns delete cni-UUID3 + # Cannot remove namespace file "/run/netns/cni-UUID3": Device or resource busy + + # LOG.info("Waiting for CNI network namespaces to go away...") + # stubbornly(retries=5, delay_s=5).on(instance).until( + # lambda p: "cni-" not in p.stdout.decode() + # ).exec(["ip", "netns", "list"]) + + def wait_until_k8s_ready( control_node: harness.Instance, instances: List[harness.Instance], From 5b032a933106e7931a1fbe6b3b2998cfbbf56f8a Mon Sep 17 00:00:00 2001 From: Lucian Petrut Date: Wed, 13 Nov 2024 15:03:11 +0200 Subject: [PATCH 2/9] Fix lint and sbom jobs (#791) --- build-scripts/hack/generate-sbom.py | 14 ++++++++------ .../patches/strict/0001-Strict-patch.patch | 2 +- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/build-scripts/hack/generate-sbom.py b/build-scripts/hack/generate-sbom.py index 6ea25c9ea..c9cd22a27 100755 --- a/build-scripts/hack/generate-sbom.py +++ b/build-scripts/hack/generate-sbom.py @@ -139,13 +139,15 @@ def k8s_snap_c_dqlite_components(manifest, extra_files): def rock_cilium(manifest, extra_files): LOG.info("Generating SBOM info for Cilium rocks") + cilium_version = "1.15.2" + with util.git_repo(CILIUM_ROCK_REPO, CILIUM_ROCK_TAG) as d: rock_repo_commit = util.parse_output(["git", "rev-parse", "HEAD"], cwd=d) - rockcraft = (d / "cilium/rockcraft.yaml").read_text() - operator_rockcraft = (d / "cilium-operator-generic/rockcraft.yaml").read_text() + rockcraft = (d / f"{cilium_version}/cilium/rockcraft.yaml").read_text() + operator_rockcraft = (d / f"{cilium_version}/cilium-operator-generic/rockcraft.yaml").read_text() - extra_files["cilium/rockcraft.yaml"] = rockcraft - 
extra_files["cilium-operator-generic/rockcraft.yaml"] = operator_rockcraft + extra_files[f"{cilium_version}/cilium/rockcraft.yaml"] = rockcraft + extra_files[f"{cilium_version}/cilium-operator-generic/rockcraft.yaml"] = operator_rockcraft rockcraft_yaml = yaml.safe_load(rockcraft) repo_url = rockcraft_yaml["parts"]["cilium"]["source"] @@ -169,10 +171,10 @@ def rock_cilium(manifest, extra_files): }, "language": "go", "details": [ - "cilium/rockcraft.yaml", + f"{cilium_version}/cilium/rockcraft.yaml", "cilium/go.mod", "cilium/go.sum", - "cilium-operator-generic/rockcraft.yaml", + f"{cilium_version}/cilium-operator-generic/rockcraft.yaml", "cilium-operator-generic/go.mod", "cilium-operator-generic/go.sum", ], diff --git a/build-scripts/patches/strict/0001-Strict-patch.patch b/build-scripts/patches/strict/0001-Strict-patch.patch index 7bbc5a71b..bed2ed6ff 100644 --- a/build-scripts/patches/strict/0001-Strict-patch.patch +++ b/build-scripts/patches/strict/0001-Strict-patch.patch @@ -299,7 +299,7 @@ index 3e54d68..295c458 100644 + # * list mounts + # + # https://paste.ubuntu.com/p/WscCCfnvGH/plain/ -+ # https://paste.ubuntu.com/p/sSnJVvZkrr/plain/ ++ # https://paste.ubuntu.com/p/sSnJVvZkrr/plain/ + # + # LOG.info("Waiting for shims to go away...") + # stubbornly(retries=20, delay_s=5).on(instance).until( From 2e7135300921a97d926eb8a27bdfa9ff2f03706c Mon Sep 17 00:00:00 2001 From: Lucian Petrut Date: Thu, 14 Nov 2024 09:23:47 +0200 Subject: [PATCH 3/9] Increase timeouts and log test timestamps (#792) * Configure pytest to log timestamps We'll configure pytest to use log timestamps, which allows us to correlate test logs with kubernetes logs. * Bump test timeouts again We're waiting about 5 minutes for new nodes to become ready, however sometimes this isn't enough. It can take around one minute just to pull the cilium image. For this reason, we'll double the test timeouts again. 
--- tests/branch_management/tox.ini | 2 ++ tests/integration/tests/test_util/config.py | 2 +- tests/integration/tox.ini | 2 ++ 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/branch_management/tox.ini b/tests/branch_management/tox.ini index 4ee5619c2..a7f2cd8de 100644 --- a/tests/branch_management/tox.ini +++ b/tests/branch_management/tox.ini @@ -38,6 +38,8 @@ commands = --tb native \ --log-cli-level DEBUG \ --disable-warnings \ + --log-format "%(asctime)s %(levelname)s %(message)s" \ + --log-date-format "%Y-%m-%d %H:%M:%S" \ {posargs} \ {tox_root}/tests pass_env = diff --git a/tests/integration/tests/test_util/config.py b/tests/integration/tests/test_util/config.py index f44ae122e..40e375c23 100644 --- a/tests/integration/tests/test_util/config.py +++ b/tests/integration/tests/test_util/config.py @@ -7,7 +7,7 @@ DIR = Path(__file__).absolute().parent # The following defaults are used to define how long to wait for a condition to be met. -DEFAULT_WAIT_RETRIES = int(os.getenv("TEST_DEFAULT_WAIT_RETRIES") or 60) +DEFAULT_WAIT_RETRIES = int(os.getenv("TEST_DEFAULT_WAIT_RETRIES") or 120) DEFAULT_WAIT_DELAY_S = int(os.getenv("TEST_DEFAULT_WAIT_DELAY_S") or 5) MANIFESTS_DIR = DIR / ".." / ".." 
/ "templates" diff --git a/tests/integration/tox.ini b/tests/integration/tox.ini index 1b33bcda9..bdd82d029 100644 --- a/tests/integration/tox.ini +++ b/tests/integration/tox.ini @@ -37,6 +37,8 @@ commands = --maxfail 1 \ --tb native \ --log-cli-level DEBUG \ + --log-format "%(asctime)s %(levelname)s %(message)s" \ + --log-date-format "%Y-%m-%d %H:%M:%S" \ --disable-warnings \ {posargs} \ {toxinidir}/tests From aa8b78fd8010b25b36d7fff8951ba46b2f1d1a6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Berkay=20Tekin=20=C3=96z?= Date: Thu, 14 Nov 2024 10:24:29 +0300 Subject: [PATCH 4/9] Remove strict testing from PRs (#793) --- .github/workflows/integration-informing.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/integration-informing.yaml b/.github/workflows/integration-informing.yaml index d4ca93ae9..9ade424d0 100644 --- a/.github/workflows/integration-informing.yaml +++ b/.github/workflows/integration-informing.yaml @@ -21,7 +21,7 @@ jobs: runs-on: ubuntu-20.04 strategy: matrix: - patch: ["strict", "moonray"] + patch: ["moonray"] fail-fast: false steps: - name: Harden Runner @@ -58,7 +58,7 @@ jobs: strategy: matrix: os: ["ubuntu:20.04"] - patch: ["strict", "moonray"] + patch: ["moonray"] fail-fast: false runs-on: ["self-hosted", "Linux", "AMD64", "jammy", "large"] steps: From e1dd58e65154623962d62f93322add4bb817647a Mon Sep 17 00:00:00 2001 From: eaudetcobello Date: Thu, 14 Nov 2024 12:09:16 -0500 Subject: [PATCH 5/9] Verify Microk8s Installation Status (#785) --- src/k8s/cmd/k8s/k8s_bootstrap.go | 19 +++++++++ src/k8s/pkg/client/snapd/snap_info.go | 41 +++++++++++++++++++ .../tests/test_util/test_bootstrap.py | 16 ++++++++ 3 files changed, 76 insertions(+) create mode 100644 src/k8s/pkg/client/snapd/snap_info.go create mode 100644 tests/integration/tests/test_util/test_bootstrap.py diff --git a/src/k8s/cmd/k8s/k8s_bootstrap.go b/src/k8s/cmd/k8s/k8s_bootstrap.go index 5510ed34e..9ba6525d3 100644 --- 
a/src/k8s/cmd/k8s/k8s_bootstrap.go +++ b/src/k8s/cmd/k8s/k8s_bootstrap.go @@ -13,6 +13,7 @@ import ( apiv1 "github.com/canonical/k8s-snap-api/api/v1" cmdutil "github.com/canonical/k8s/cmd/util" + "github.com/canonical/k8s/pkg/client/snapd" "github.com/canonical/k8s/pkg/config" "github.com/canonical/k8s/pkg/k8sd/features" "github.com/canonical/k8s/pkg/utils" @@ -47,6 +48,24 @@ func newBootstrapCmd(env cmdutil.ExecutionEnvironment) *cobra.Command { Long: "Generate certificates, configure service arguments and start the Kubernetes services.", PreRun: chainPreRunHooks(hookRequireRoot(env), hookInitializeFormatter(env, &opts.outputFormat), hookCheckLXD()), Run: func(cmd *cobra.Command, args []string) { + snapdClient, err := snapd.NewClient() + if err != nil { + cmd.PrintErrln("Error: failed to create snapd client: %w", err) + env.Exit(1) + return + } + microk8sInfo, err := snapdClient.GetSnapInfo("microk8s") + if err != nil { + cmd.PrintErrln("Error: failed to check if microk8s is installed: %w", err) + env.Exit(1) + return + } + if microk8sInfo.StatusCode == 200 && microk8sInfo.HasInstallDate() { + cmd.PrintErrln("Error: microk8s snap is installed. 
Please remove it using the following command and try again:\n\n sudo snap remove microk8s") + env.Exit(1) + return + } + if opts.interactive && opts.configFile != "" { cmd.PrintErrln("Error: --interactive and --file flags cannot be set at the same time.") env.Exit(1) diff --git a/src/k8s/pkg/client/snapd/snap_info.go b/src/k8s/pkg/client/snapd/snap_info.go new file mode 100644 index 000000000..b09f61ec4 --- /dev/null +++ b/src/k8s/pkg/client/snapd/snap_info.go @@ -0,0 +1,41 @@ +package snapd + +import ( + "encoding/json" + "fmt" + "io" + "time" +) + +type SnapInfoResult struct { + InstallDate time.Time `json:"install-date"` +} + +type SnapInfoResponse struct { + StatusCode int `json:"status-code"` + Result SnapInfoResult `json:"result"` +} + +func (c *Client) GetSnapInfo(snap string) (*SnapInfoResponse, error) { + resp, err := c.client.Get(fmt.Sprintf("http://localhost/v2/snaps/%s", snap)) + if err != nil { + return nil, fmt.Errorf("failed to get snapd snap info: %w", err) + } + defer resp.Body.Close() + + resBody, err := io.ReadAll(resp.Body) + if err != nil { + return nil, fmt.Errorf("client: could not read response body: %w", err) + } + + var snapInfoResponse SnapInfoResponse + if err := json.Unmarshal(resBody, &snapInfoResponse); err != nil { + return nil, fmt.Errorf("client: could not unmarshal response body: %w", err) + } + + return &snapInfoResponse, nil +} + +func (s SnapInfoResponse) HasInstallDate() bool { + return !s.Result.InstallDate.IsZero() +} diff --git a/tests/integration/tests/test_util/test_bootstrap.py b/tests/integration/tests/test_util/test_bootstrap.py new file mode 100644 index 000000000..1bcc688dd --- /dev/null +++ b/tests/integration/tests/test_util/test_bootstrap.py @@ -0,0 +1,16 @@ +# +# Copyright 2024 Canonical, Ltd. 
+# +from typing import List + +import pytest +from test_util import harness + + +@pytest.mark.node_count(1) +@pytest.mark.disable_k8s_bootstrapping() +def test_microk8s_installed(instances: List[harness.Instance]): + instance = instances[0] + instance.exec("snap install microk8s --classic".split()) + result = instance.exec("k8s bootstrap".split(), capture_output=True, check=False) + assert "Error: microk8s snap is installed" in result.stderr.decode() From a8e140be5569dc4b0e742d99bc27ac130d5a8183 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Berkay=20Tekin=20=C3=96z?= Date: Fri, 15 Nov 2024 10:01:14 +0300 Subject: [PATCH 6/9] Cleanup left-over iptables rules from kubeproxy and cilium (#788) --- k8s/lib.sh | 2 ++ snap/snapcraft.yaml | 1 + src/k8s/pkg/k8sd/features/cilium/cleanup.go | 21 +++++++++++++++++++++ 3 files changed, 24 insertions(+) diff --git a/k8s/lib.sh b/k8s/lib.sh index a30f3a30a..ba6d96b1a 100755 --- a/k8s/lib.sh +++ b/k8s/lib.sh @@ -46,6 +46,8 @@ k8s::common::is_strict() { # Cleanup configuration left by the network feature k8s::remove::network() { k8s::common::setup_env + + "${SNAP}/bin/kube-proxy" --cleanup || true k8s::cmd::k8s x-cleanup network || true } diff --git a/snap/snapcraft.yaml b/snap/snapcraft.yaml index 9d21e55f1..435f40fb2 100644 --- a/snap/snapcraft.yaml +++ b/snap/snapcraft.yaml @@ -164,6 +164,7 @@ parts: - ethtool - hostname - iproute2 + - ipset - kmod - libatm1 - libnss-resolve diff --git a/src/k8s/pkg/k8sd/features/cilium/cleanup.go b/src/k8s/pkg/k8sd/features/cilium/cleanup.go index bb97321e8..679e56135 100644 --- a/src/k8s/pkg/k8sd/features/cilium/cleanup.go +++ b/src/k8s/pkg/k8sd/features/cilium/cleanup.go @@ -5,6 +5,7 @@ import ( "fmt" "os" "os/exec" + "strings" "github.com/canonical/k8s/pkg/snap" ) @@ -18,5 +19,25 @@ func CleanupNetwork(ctx context.Context, snap snap.Snap) error { } } + for _, cmd := range []string{"iptables", "ip6tables", "iptables-legacy", "ip6tables-legacy"} { + out, err := exec.Command(fmt.Sprintf("%s-save", 
cmd)).Output() + if err != nil { + return fmt.Errorf("failed to read iptables rules: %w", err) + } + + lines := strings.Split(string(out), "\n") + for i, line := range lines { + if strings.Contains(strings.ToLower(line), "cilium") { + lines[i] = "" + } + } + + restore := exec.Command(fmt.Sprintf("%s-restore", cmd)) + restore.Stdin = strings.NewReader(strings.Join(lines, "\n")) + if err := restore.Run(); err != nil { + return fmt.Errorf("failed to restore iptables rules: %w", err) + } + } + return nil } From 352819381595cde4bfbdccfb90c7738bb2d23bb5 Mon Sep 17 00:00:00 2001 From: Homayoon Alimohammadi Date: Fri, 15 Nov 2024 18:28:35 +0400 Subject: [PATCH 7/9] Add in-place upgrade explanation (#770) --- .../src/capi/explanation/in-place-upgrades.md | 132 ++++++++++++++++++ docs/src/capi/explanation/index.md | 2 +- 2 files changed, 133 insertions(+), 1 deletion(-) create mode 100644 docs/src/capi/explanation/in-place-upgrades.md diff --git a/docs/src/capi/explanation/in-place-upgrades.md b/docs/src/capi/explanation/in-place-upgrades.md new file mode 100644 index 000000000..5196fd7d1 --- /dev/null +++ b/docs/src/capi/explanation/in-place-upgrades.md @@ -0,0 +1,132 @@ +# In-Place Upgrades + +Regularly upgrading the Kubernetes version of the machines in a cluster +is important. While rolling upgrades are a popular strategy, certain +situations will require in-place upgrades: + +- Resource constraints (i.e. cost of additional machines). +- Expensive manual setup process for nodes. + +## Annotations + +CAPI machines are considered immutable. Consequently, machines are replaced +instead of reconfigured. +While CAPI doesn't support in-place upgrades, {{product}} CAPI does +by leveraging annotations for the implementation. +For a deeper understanding of the CAPI design decisions, consider reading about +[machine immutability in CAPI][1], and Kubernetes objects: [`labels`][2], +[`spec` and `status`][3]. 
+ +## Controllers + +In {{product}} CAPI, there are two main types of controllers that handle the +process of performing in-place upgrades: + +- Single Machine In-Place Upgrade Controller +- Orchestrated In-Place Upgrade Controller + +The core component of performing an in-place upgrade is the `Single Machine +Upgrader`. The controller watches for annotations on machines and reconciles +them to ensure the upgrades happen smoothly. + +The `Orchestrator` watches for certain annotations on +machine owners, reconciles them and upgrades groups of owned machines. +It’s responsible for ensuring that all the machines owned by the +reconciled object get upgraded successfully. + +The main annotations that drive the upgrade process are as follows: + +- `v1beta2.k8sd.io/in-place-upgrade-to` --> `upgrade-to` : Instructs +the controller to perform an upgrade with the specified option/method. +- `v1beta2.k8sd.io/in-place-upgrade-status` --> `status` : As soon as the +controller starts the upgrade process, the object will be marked with the +`status` annotation which can either be `in-progress`, `failed` or `done`. +- `v1beta2.k8sd.io/in-place-upgrade-release` --> `release` : When the +upgrade is performed successfully, this annotation will indicate the current +Kubernetes release/version installed on the machine. + +For a complete list of annotations and their values please +refer to the [annotations reference page][4]. This explanation proceeds +to use abbreviations of the mentioned annotations. + +### Single Machine In-Place Upgrade Controller + +The Machine objects can be marked with the `upgrade-to` annotation to +trigger an in-place upgrade for that machine. While watching for changes +on the machines, the single machine upgrade controller notices this annotation +and attempts to upgrade the Kubernetes version of that machine to the +specified version. 
+ +Upgrade methods or options can be specified to upgrade to a snap channel, +revision, or a local snap file already placed on the +machine in air-gapped environments. + +A successfully upgraded machine shows the following annotations: + +```yaml +annotations: + v1beta2.k8sd.io/in-place-upgrade-release: "channel=1.31/stable" + v1beta2.k8sd.io/in-place-upgrade-status: "done" +``` + +If the upgrade fails, the controller will mark the machine and retry +the upgrade immediately: + +```yaml +annotations: + # the `upgrade-to` causes the retry to happen + v1beta2.k8sd.io/in-place-upgrade-to: "channel=1.31/stable" + v1beta2.k8sd.io/in-place-upgrade-status: "failed" + + # orchestrator will notice this annotation and knows that the + # upgrade for this machine failed + v1beta2.k8sd.io/in-place-upgrade-last-failed-attempt-at: "Sat, 7 Nov + 2024 13:30:00 +0400" +``` + +By applying and removing annotations, the single machine +upgrader determines the upgrade status of the machine it’s trying to +reconcile and takes necessary actions to successfully complete an +in-place upgrade. The following diagram shows the flow of the in-place +upgrade of a single machine: + +![Diagram][img-single-machine] + +### Machine Upgrade Process + +The {{product}}'s `k8sd` daemon exposes endpoints that can be used to +interact with the cluster. The single machine upgrader calls the +`/snap/refresh` endpoint on the machine to trigger the upgrade +process while checking `/snap/refresh-status` periodically. + +![Diagram][img-k8sd-call] + +### In-place upgrades on large workload clusters + +While the “Single Machine In-Place Upgrade Controller” is responsible +for upgrading individual machines, the "Orchestrated In-Place Upgrade +Controller" ensures that groups of machines will get upgraded. +By applying the `upgrade-to` annotation on an object that owns machines +(e.g. 
a `MachineDeployment`), this controller will mark the owned machines +one by one which will cause the "Single Machine Upgrader" to pick up those +annotations and upgrade the machines. To avoid undesirable situations + like quorum loss or severe downtime, these upgrades happen in sequence. + +The failures and successes of individual machine upgrades will be reported back +to the orchestrator by the single machine upgrader via annotations. + +The illustrated flow of orchestrated in-place upgrades: + +![Diagram][img-orchestrated] + + + +[img-single-machine]: https://assets.ubuntu.com/v1/1200f040-single-machine.png +[img-k8sd-call]: https://assets.ubuntu.com/v1/518eb73a-k8sd-call.png +[img-orchestrated]: https://assets.ubuntu.com/v1/8f302a00-orchestrated.png + + +[1]: https://cluster-api.sigs.k8s.io/user/concepts#machine-immutability-in-place-upgrade-vs-replace +[2]: https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/ +[3]: https://kubernetes.io/docs/concepts/overview/working-with-objects/#object-spec-and-status +[4]: ../reference/annotations.md diff --git a/docs/src/capi/explanation/index.md b/docs/src/capi/explanation/index.md index f10ada1ac..d4ad076be 100644 --- a/docs/src/capi/explanation/index.md +++ b/docs/src/capi/explanation/index.md @@ -16,7 +16,7 @@ Overview about security capi-ck8s.md - +in-place-upgrades.md ``` From b4055dfb278f1bccfef3b25f9e00ab35aa63b5c0 Mon Sep 17 00:00:00 2001 From: Niamh Hennigan Date: Fri, 15 Nov 2024 06:30:29 -0800 Subject: [PATCH 8/9] Arch diagrams (#801) * fix pic order in a previous PR the order of the arch docs got mixed up. 
Back in correct places now * clarify k8sd diagram fix typos in diagram and help clarify in text and on diagram difference in deploying with Juju vs snap --- docs/src/assets/k8sd-component.puml | 6 +++--- docs/src/snap/reference/architecture.md | 15 ++++++++------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/docs/src/assets/k8sd-component.puml b/docs/src/assets/k8sd-component.puml index f95cd278c..3e39ce90b 100644 --- a/docs/src/assets/k8sd-component.puml +++ b/docs/src/assets/k8sd-component.puml @@ -16,13 +16,13 @@ Container(K8sSnapDistribution.State, "State", $techn="", $descr="Datastores hold Container(K8sSnapDistribution.KubernetesServices, "Kubernetes Services", $techn="", $descr="API server, kubelet, kube-proxy, scheduler, kube-controller", $tags="", $link="") Container_Boundary("K8sSnapDistribution.K8sd_boundary", "K8sd", $tags="") { - Component(K8sSnapDistribution.K8sd.CLI, "CLI", $techn="CLI", $descr="The CLI the offered", $tags="", $link="") + Component(K8sSnapDistribution.K8sd.CLI, "CLI", $techn="CLI", $descr="The CLI offered", $tags="", $link="") Component(K8sSnapDistribution.K8sd.APIviaHTTP, "API via HTTP", $techn="REST", $descr="The API interface offered", $tags="", $link="") - Component(K8sSnapDistribution.K8sd.CLustermanagement, "CLuster management", $techn="", $descr="Management of the cluster with the help of MicroCluster", $tags="", $link="") + Component(K8sSnapDistribution.K8sd.CLustermanagement, "Cluster management", $techn="", $descr="Management of the cluster with the help of MicroCluster", $tags="", $link="") } Rel(K8sAdmin, K8sSnapDistribution.K8sd.CLI, "Sets up and configured the cluster", $techn="", $tags="", $link="") -Rel(CharmK8s, K8sSnapDistribution.K8sd.APIviaHTTP, "Orchestrates the lifecycle management of K8s", $techn="", $tags="", $link="") +Rel(CharmK8s, K8sSnapDistribution.K8sd.APIviaHTTP, "Orchestrates the lifecycle management of K8s when deployed with Juju", $techn="", $tags="", $link="") 
Rel(K8sSnapDistribution.K8sd.CLustermanagement, K8sSnapDistribution.KubernetesServices, "Configures", $techn="", $tags="", $link="") Rel(K8sSnapDistribution.KubernetesServices, K8sSnapDistribution.State, "Uses by default", $techn="", $tags="", $link="") Rel(K8sSnapDistribution.K8sd.CLustermanagement, K8sSnapDistribution.State, "Keeps state in", $techn="", $tags="", $link="") diff --git a/docs/src/snap/reference/architecture.md b/docs/src/snap/reference/architecture.md index 44981a45b..a24c5ff6a 100644 --- a/docs/src/snap/reference/architecture.md +++ b/docs/src/snap/reference/architecture.md @@ -10,7 +10,7 @@ current design of {{product}}, following the [C4 model]. This overview of {{product}} demonstrates the interactions of Kubernetes with users and with other systems. -![cluster2][] +![cluster5][] Two actors interact with the Kubernetes snap: @@ -19,7 +19,8 @@ Two actors interact with the Kubernetes snap: access to the cluster. That initial user is able to configure the cluster to match their needs and of course create other users that may or may not have admin privileges. The K8s admin is also able to maintain workloads running - in the cluster. + in the cluster. If you deploy {{product}} from a snap, this is how the cluster + is manually orchestrated. - **K8s user**: A user consuming the workloads hosted in the cluster. Users do not have access to the Kubernetes API server. They need to access the cluster @@ -51,7 +52,7 @@ distribution. We have identified the following: Looking more closely at what is contained within the K8s snap itself: -![cluster3][] +![cluster1][] The `k8s` snap distribution includes the following: @@ -72,7 +73,7 @@ The `k8s` snap distribution includes the following: K8sd is the component that implements and exposes the operations functionality needed for managing the Kubernetes cluster. 
-![cluster4][] +![cluster2][] At the core of the `k8sd` functionality we have the cluster manager that is responsible for configuring the services, workload and features we deem @@ -104,7 +105,7 @@ This functionality is exposed via the following interfaces: Canonical `k8s` Charms encompass two primary components: the [`k8s` charm][K8s charm] and the [`k8s-worker` charm][K8s-worker charm]. -![cluster1][] +![cluster4][] Charms are instantiated on a machine as a Juju unit, and a collection of units constitutes an application. Both `k8s` and `k8s-worker` units are responsible @@ -139,9 +140,9 @@ and flexible {{product}} deployment managed through Juju. [cluster1]: https://assets.ubuntu.com/v1/dfc43753-cluster1.svg -[cluster2]: https://assets.ubuntu.com/v1/0e486a5d-cluster2.svg -[cluster3]: https://assets.ubuntu.com/v1/24fd1773-cluster3.svg +[cluster2]: https://assets.ubuntu.com/v1/f634743e-k8sd.svg [cluster4]: https://assets.ubuntu.com/v1/24fd1773-cluster4.svg +[cluster5]: https://assets.ubuntu.com/v1/bcfe150f-overview.svg [C4 model]: https://c4model.com/ From 81bb027b3947d9656aacbc092ec6cecbe89b7a58 Mon Sep 17 00:00:00 2001 From: eaudetcobello Date: Fri, 15 Nov 2024 10:22:45 -0500 Subject: [PATCH 9/9] [Docs] How to use cloud storage (#794) --- .../assets/how-to-cloud-storage-aws-ccm.yaml | 170 ++++++ docs/src/snap/howto/storage/cloud.md | 496 ++++++++++++++++++ docs/src/snap/howto/storage/index.md | 3 +- 3 files changed, 668 insertions(+), 1 deletion(-) create mode 100644 docs/src/assets/how-to-cloud-storage-aws-ccm.yaml create mode 100644 docs/src/snap/howto/storage/cloud.md diff --git a/docs/src/assets/how-to-cloud-storage-aws-ccm.yaml b/docs/src/assets/how-to-cloud-storage-aws-ccm.yaml new file mode 100644 index 000000000..fa6dc3cb9 --- /dev/null +++ b/docs/src/assets/how-to-cloud-storage-aws-ccm.yaml @@ -0,0 +1,170 @@ +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: aws-cloud-controller-manager + namespace: kube-system + labels: + k8s-app: 
aws-cloud-controller-manager +spec: + selector: + matchLabels: + k8s-app: aws-cloud-controller-manager + updateStrategy: + type: RollingUpdate + template: + metadata: + labels: + k8s-app: aws-cloud-controller-manager + spec: + nodeSelector: + node-role.kubernetes.io/control-plane: "" + tolerations: + - key: node.cloudprovider.kubernetes.io/uninitialized + value: "true" + effect: NoSchedule + - effect: NoSchedule + key: node-role.kubernetes.io/control-plane + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node-role.kubernetes.io/control-plane + operator: Exists + serviceAccountName: cloud-controller-manager + containers: + - name: aws-cloud-controller-manager + image: registry.k8s.io/provider-aws/cloud-controller-manager:v1.28.3 + args: + - --v=2 + - --cloud-provider=aws + - --use-service-account-credentials=true + - --configure-cloud-routes=false + resources: + requests: + cpu: 200m + hostNetwork: true +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: cloud-controller-manager + namespace: kube-system +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: cloud-controller-manager:apiserver-authentication-reader + namespace: kube-system +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: extension-apiserver-authentication-reader +subjects: + - apiGroup: "" + kind: ServiceAccount + name: cloud-controller-manager + namespace: kube-system +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: system:cloud-controller-manager +rules: +- apiGroups: + - "" + resources: + - events + verbs: + - create + - patch + - update +- apiGroups: + - "" + resources: + - nodes + verbs: + - '*' +- apiGroups: + - "" + resources: + - nodes/status + verbs: + - patch +- apiGroups: + - "" + resources: + - services + verbs: + - list + - patch + - update + - watch +- apiGroups: + - "" + resources: + - services/status + verbs: + 
- list + - patch + - update + - watch +- apiGroups: + - "" + resources: + - serviceaccounts + verbs: + - create + - get + - list + - watch +- apiGroups: + - "" + resources: + - persistentvolumes + verbs: + - get + - list + - update + - watch +- apiGroups: + - "" + resources: + - endpoints + verbs: + - create + - get + - list + - watch + - update +- apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - create + - get + - list + - watch + - update +- apiGroups: + - "" + resources: + - serviceaccounts/token + verbs: + - create +--- +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: system:cloud-controller-manager +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: system:cloud-controller-manager +subjects: + - apiGroup: "" + kind: ServiceAccount + name: cloud-controller-manager + namespace: kube-system diff --git a/docs/src/snap/howto/storage/cloud.md b/docs/src/snap/howto/storage/cloud.md new file mode 100644 index 000000000..920b297dc --- /dev/null +++ b/docs/src/snap/howto/storage/cloud.md @@ -0,0 +1,496 @@ +# How to use cloud storage + +{{product}} simplifies the process of integrating and managing cloud storage +solutions like Amazon EBS. This guide provides steps to configure IAM policies, +deploy the cloud controller manager, and set up the necessary drivers for you +to take advantage of cloud storage solutions in the context of Kubernetes. + +## What you'll need + +This guide is for AWS and assumes the following: + +- You have root or sudo access to an Amazon EC2 instance +- You can create roles and policies in AWS + + +## Set IAM Policies + +Your instance will need a few IAM policies to be able to communicate with the +AWS APIs. The policies provided here are quite open and should be scoped down +based on your security requirements. + +You will most likely want to create a role for your instance. You can call this +role "k8s-control-plane" or "k8s-worker". 
Then, define and attach the following +policies to the role. Once the role is created with the required policies, +attach the role to the instance. + +For a control plane node: + +```{dropdown} Control Plane Policies +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "autoscaling:DescribeAutoScalingGroups", + "autoscaling:DescribeLaunchConfigurations", + "autoscaling:DescribeTags", + "ec2:DescribeInstances", + "ec2:DescribeRegions", + "ec2:DescribeRouteTables", + "ec2:DescribeSecurityGroups", + "ec2:DescribeSubnets", + "ec2:DescribeVolumes", + "ec2:DescribeAvailabilityZones", + "ec2:CreateSecurityGroup", + "ec2:CreateTags", + "ec2:CreateVolume", + "ec2:ModifyInstanceAttribute", + "ec2:ModifyVolume", + "ec2:AttachVolume", + "ec2:AuthorizeSecurityGroupIngress", + "ec2:CreateRoute", + "ec2:DeleteRoute", + "ec2:DeleteSecurityGroup", + "ec2:DeleteVolume", + "ec2:DetachVolume", + "ec2:RevokeSecurityGroupIngress", + "ec2:DescribeVpcs", + "ec2:DescribeInstanceTopology", + "elasticloadbalancing:AddTags", + "elasticloadbalancing:AttachLoadBalancerToSubnets", + "elasticloadbalancing:ApplySecurityGroupsToLoadBalancer", + "elasticloadbalancing:CreateLoadBalancer", + "elasticloadbalancing:CreateLoadBalancerPolicy", + "elasticloadbalancing:CreateLoadBalancerListeners", + "elasticloadbalancing:ConfigureHealthCheck", + "elasticloadbalancing:DeleteLoadBalancer", + "elasticloadbalancing:DeleteLoadBalancerListeners", + "elasticloadbalancing:DescribeLoadBalancers", + "elasticloadbalancing:DescribeLoadBalancerAttributes", + "elasticloadbalancing:DetachLoadBalancerFromSubnets", + "elasticloadbalancing:DeregisterInstancesFromLoadBalancer", + "elasticloadbalancing:ModifyLoadBalancerAttributes", + "elasticloadbalancing:RegisterInstancesWithLoadBalancer", + "elasticloadbalancing:SetLoadBalancerPoliciesForBackendServer", + "elasticloadbalancing:AddTags", + "elasticloadbalancing:CreateListener", + "elasticloadbalancing:CreateTargetGroup", +
"elasticloadbalancing:DeleteListener", + "elasticloadbalancing:DeleteTargetGroup", + "elasticloadbalancing:DescribeListeners", + "elasticloadbalancing:DescribeLoadBalancerPolicies", + "elasticloadbalancing:DescribeTargetGroups", + "elasticloadbalancing:DescribeTargetHealth", + "elasticloadbalancing:ModifyListener", + "elasticloadbalancing:ModifyTargetGroup", + "elasticloadbalancing:RegisterTargets", + "elasticloadbalancing:DeregisterTargets", + "elasticloadbalancing:SetLoadBalancerPoliciesOfListener", + "iam:CreateServiceLinkedRole", + "kms:DescribeKey" + ], + "Resource": [ + "*" + ] + } + ] +} +``` + +For a worker node: + +```{dropdown} Worker Policies +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "ec2:DescribeInstances", + "ec2:DescribeRegions", + "ecr:GetAuthorizationToken", + "ecr:BatchCheckLayerAvailability", + "ecr:GetDownloadUrlForLayer", + "ecr:GetRepositoryPolicy", + "ecr:DescribeRepositories", + "ecr:ListImages", + "ecr:BatchGetImage" + ], + "Resource": "*" + } + ] +} +``` + +## Add a tag to your EC2 Instance + +A cluster using the AWS cloud provider needs to label existing nodes and +resources with a ClusterID or the kube-controller-manager will not start. Add +the following tag to your instance, making sure to replace the placeholder id +with your own (this can simply be "k8s" or "my-k8s-cluster"). + +``` +kubernetes.io/cluster/<your-cluster-id>=owned +``` + +## Set your host name + +The cloud controller manager uses the node name to correctly associate the node +with an EC2 instance. In {{product}}, the node name is derived from the +hostname of the machine. Therefore, before bootstrapping the cluster, you must +first set an appropriate host name. + +```bash +echo "$(sudo cloud-init query ds.meta_data.local-hostname)" | sudo tee /etc/hostname +``` + +Then, reboot the machine. + +```bash +sudo reboot +``` + +When the machine is up, use `hostname -f` to check the host name. 
It should +look like: + +```bash +ip-172-31-11-86.us-east-2.compute.internal +``` + +This host name format is called IP-based naming and is specific to AWS. + + +## Bootstrap {{product}} + +Now that your machine has an appropriate host name, you are ready to bootstrap +{{product}}. + +First, create a bootstrap configuration file that sets the cloud-provider +configuration to "external". + +```bash +echo "cluster-config: + cloud-provider: external" > bootstrap-config.yaml +``` + +Then, bootstrap the cluster: + +```bash +sudo k8s bootstrap --file ./bootstrap-config.yaml +sudo k8s status --wait-ready +``` + +## Deploy the cloud controller manager + +Now that you have an appropriate host name, policies, and a {{product}} +cluster, you have everything you need to deploy the cloud controller manager. + +Here is a YAML definition file that sets appropriate defaults for you. It +configures the necessary service accounts, roles, and daemonsets: + +```{dropdown} CCM deployment manifest +```yaml +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: aws-cloud-controller-manager + namespace: kube-system + labels: + k8s-app: aws-cloud-controller-manager +spec: + selector: + matchLabels: + k8s-app: aws-cloud-controller-manager + updateStrategy: + type: RollingUpdate + template: + metadata: + labels: + k8s-app: aws-cloud-controller-manager + spec: + nodeSelector: + node-role.kubernetes.io/control-plane: "" + tolerations: + - key: node.cloudprovider.kubernetes.io/uninitialized + value: "true" + effect: NoSchedule + - effect: NoSchedule + key: node-role.kubernetes.io/control-plane + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node-role.kubernetes.io/control-plane + operator: Exists + serviceAccountName: cloud-controller-manager + containers: + - name: aws-cloud-controller-manager + image: registry.k8s.io/provider-aws/cloud-controller-manager:v1.28.3 + args: + - --v=2 + - --cloud-provider=aws 
+ - --use-service-account-credentials=true + - --configure-cloud-routes=false + resources: + requests: + cpu: 200m + hostNetwork: true +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: cloud-controller-manager + namespace: kube-system +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: cloud-controller-manager:apiserver-authentication-reader + namespace: kube-system +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: extension-apiserver-authentication-reader +subjects: + - apiGroup: "" + kind: ServiceAccount + name: cloud-controller-manager + namespace: kube-system +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: system:cloud-controller-manager +rules: +- apiGroups: + - "" + resources: + - events + verbs: + - create + - patch + - update +- apiGroups: + - "" + resources: + - nodes + verbs: + - '*' +- apiGroups: + - "" + resources: + - nodes/status + verbs: + - patch +- apiGroups: + - "" + resources: + - services + verbs: + - list + - patch + - update + - watch +- apiGroups: + - "" + resources: + - services/status + verbs: + - list + - patch + - update + - watch +- apiGroups: + - "" + resources: + - serviceaccounts + verbs: + - create + - get + - list + - watch +- apiGroups: + - "" + resources: + - persistentvolumes + verbs: + - get + - list + - update + - watch +- apiGroups: + - "" + resources: + - endpoints + verbs: + - create + - get + - list + - watch + - update +- apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - create + - get + - list + - watch + - update +- apiGroups: + - "" + resources: + - serviceaccounts/token + verbs: + - create +--- +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: system:cloud-controller-manager +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: system:cloud-controller-manager +subjects: + - apiGroup: "" + kind: ServiceAccount + name: 
cloud-controller-manager + namespace: kube-system +``` + +You can apply the CCM manifest easily by running the following command: + +```bash +sudo k8s kubectl apply -f https://raw.githubusercontent.com/canonical/k8s-snap/main/docs/src/assets/how-to-cloud-storage-aws-ccm.yaml +``` + +After a moment, you should see the cloud controller manager pod was +successfully deployed. + +```bash +NAME READY STATUS RESTARTS AGE +aws-cloud-controller-manager-ndbtq 1/1 Running 1 (3h51m ago) 9h +``` + +## Deploy the EBS CSI Driver + +Now that the cloud controller manager is deployed and can communicate with AWS, +you are ready to deploy the EBS CSI driver. The easiest way to deploy the +driver is with the Helm chart. Luckily, {{product}} has a built-in Helm +command. + +If you want to create encrypted drives, you need to add the following statement +to the policy you are using for the instance. + +```json +{ + "Effect": "Allow", + "Action": [ + "kms:Decrypt", + "kms:GenerateDataKeyWithoutPlaintext", + "kms:CreateGrant" + ], + "Resource": "*" +} +``` + +Then, add the Helm repo for the EBS CSI Driver. + +```bash +sudo k8s helm repo add aws-ebs-csi-driver https://kubernetes-sigs.github.io/aws-ebs-csi-driver +sudo k8s helm repo update +``` + +Finally, install the Helm chart, making sure to set the correct region as an +argument. + +```bash +sudo k8s helm upgrade --install aws-ebs-csi-driver \ + --namespace kube-system \ + aws-ebs-csi-driver/aws-ebs-csi-driver \ + --set controller.region=<REGION> +``` + +Once the command completes, you can verify the pods are successfully deployed: + +```bash +sudo k8s kubectl get pods -n kube-system -l app.kubernetes.io/name=aws-ebs-csi-driver +``` + +```bash +NAME READY STATUS RESTARTS AGE +ebs-csi-controller-78bcd46cf8-5zk8q 5/5 Running 2 (3h48m ago) 8h +ebs-csi-controller-78bcd46cf8-g7l5h 5/5 Running 1 (3h48m ago) 8h +ebs-csi-node-nx6rg 3/3 Running 0 9h +``` + +The status of all pods should be "Running". 
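+ +## Deploy a workload + +Everything is in place for you to deploy a workload that dynamically creates +and uses an EBS volume. + +First, create a StorageClass and a PersistentVolumeClaim: + +``` +sudo k8s kubectl apply -f - <<EOF +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: +  name: ebs-sc +provisioner: ebs.csi.aws.com +volumeBindingMode: WaitForFirstConsumer +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: +  name: ebs-claim +spec: +  accessModes: +    - ReadWriteOnce +  storageClassName: ebs-sc +  resources: +    requests: +      storage: 10Gi +EOF +``` + +Then, create a Pod that uses the claim: + +``` +sudo k8s kubectl apply -f - <<EOF +apiVersion: v1 +kind: Pod +metadata: +  name: app +spec: +  containers: +    - name: app +      image: nginx +      volumeMounts: +        - name: persistent-storage +          mountPath: /data +  volumes: +    - name: persistent-storage +      persistentVolumeClaim: +        claimName: ebs-claim +EOF +``` + +Once the Pod is running, if you open the +`EC2 > Volumes` page in AWS, you should see a 10Gi gp3 volume. diff --git a/docs/src/snap/howto/storage/index.md b/docs/src/snap/howto/storage/index.md index 43728a9ba..f79732d4d 100644 --- a/docs/src/snap/howto/storage/index.md +++ b/docs/src/snap/howto/storage/index.md @@ -14,4 +14,5 @@ default storage built-in to {{product}}. Use default storage <storage> Use Ceph storage <ceph> -``` \ No newline at end of file +Use cloud storage <cloud> +```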