Merge pull request #1202 from Nordix/add-healthcheck-test/minna
🌱 Add healthcheck e2e test
metal3-io-bot authored Oct 17, 2023
2 parents fcc863a + 2626875 commit 429a4e9
Showing 5 changed files with 291 additions and 15 deletions.
12 changes: 9 additions & 3 deletions test/e2e/common.go
@@ -583,6 +583,7 @@ func MachineToVMName(ctx context.Context, cli client.Client, m *clusterv1.Machin
return "", fmt.Errorf("no matching Metal3Machine found for current Machine")
}

// MachineToIPAddress gets the IPAddress of a Machine, following machine -> m3machine -> m3data -> IPAddress.
func MachineToIPAddress(ctx context.Context, cli client.Client, m *clusterv1.Machine) (string, error) {
m3Machine := &infrav1.Metal3Machine{}
err := cli.Get(ctx, types.NamespacedName{
@@ -630,6 +631,7 @@ func MachineToIPAddress(ctx context.Context, cli client.Client, m *clusterv1.Mac
return string(IPAddress.Spec.Address), nil
}

// runCommand runs a command via SSH. If logFolder is "", no logs are saved.
func runCommand(logFolder, filename, machineIP, user, command string) error {
home, err := os.UserHomeDir()
if err != nil {
@@ -661,7 +663,6 @@ func runCommand(logFolder, filename, machineIP, user, command string) error {
if err != nil {
return fmt.Errorf("couldn't open a new session: %w", err)
}
logFile := path.Join(logFolder, filename)
var stdoutBuf bytes.Buffer
var stderrBuf bytes.Buffer
session.Stdout = &stdoutBuf
@@ -670,14 +671,19 @@ func runCommand(logFolder, filename, machineIP, user, command string) error {
return fmt.Errorf("unable to send command %q: %w", "sudo "+command, err)
}
result := strings.TrimSuffix(stdoutBuf.String(), "\n") + "\n" + strings.TrimSuffix(stderrBuf.String(), "\n")
if err := os.WriteFile(logFile, []byte(result), 0400); err != nil {
return fmt.Errorf("error writing log file: %w", err)
if logFolder != "" {
// Write logs if a folder path is provided.
logFile := path.Join(logFolder, filename)
if err := os.WriteFile(logFile, []byte(result), 0400); err != nil {
return fmt.Errorf("error writing log file: %w", err)
}
}
return nil
}

type Metal3LogCollector struct{}

// CollectMachineLog collects specific logs from machines.
func (Metal3LogCollector) CollectMachineLog(ctx context.Context, cli client.Client, m *clusterv1.Machine, outputPath string) error {
VMName, err := MachineToVMName(ctx, cli, m)
if err != nil {
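The empty-logFolder path added above is what the new healthcheck test relies on when it stops kubelet over SSH, since there is no artifact folder it should write to. A minimal usage sketch of the two call styles follows; the IP address and artifact path are illustrative, not values taken from the suite (runCommand prefixes the command with sudo):

// Persist the command output under an artifact folder.
if err := runCommand("/tmp/artifacts/machine-0", "kubelet-status.log", "192.168.111.20", "metal3", "systemctl status kubelet"); err != nil {
    Logf("failed to collect kubelet status: %v", err)
}

// Pass an empty logFolder to discard the output instead of writing it to disk.
if err := runCommand("", "", "192.168.111.20", "metal3", "systemctl stop kubelet"); err != nil {
    Logf("failed to stop kubelet: %v", err)
}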
51 changes: 51 additions & 0 deletions test/e2e/healthcheck_test.go
@@ -0,0 +1,51 @@
package e2e

import (
"os"
"path/filepath"
"strings"

. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)

/*
* Healthcheck Test:
* - For both worker and controlplane machines:
* - Create and deploy machinehealthcheck.
* - Stop kubelet on the machine.
* - Wait for the healthcheck to notice the unhealthy machine.
* - Wait for the remediation request to be created.
* - Wait for the machine to appear as healthy again.
* - Wait for the remediation request to be deleted.
**/

var _ = Describe("When testing healthcheck [healthcheck]", func() {
BeforeEach(func() {
osType := strings.ToLower(os.Getenv("OS"))
Expect(osType).ToNot(Equal(""))
validateGlobals(specName)

// We need to override clusterctl apply log folder to avoid getting our credentials exposed.
clusterctlLogFolder = filepath.Join(os.TempDir(), "clusters", bootstrapClusterProxy.GetName())
})

It("Should remediate unhealthy machines", func() {
By("Fetching cluster configuration")
k8sVersion := e2eConfig.GetVariable("KUBERNETES_VERSION")
By("Provision Workload cluster")
targetCluster, _ = createTargetCluster(k8sVersion)

healthcheck(ctx, func() HealthCheckInput {
return HealthCheckInput{
BootstrapClusterProxy: bootstrapClusterProxy,
ClusterName: clusterName,
Namespace: namespace,
}
})
})

AfterEach(func() {
DumpSpecResourcesAndCleanup(ctx, specName, bootstrapClusterProxy, artifactFolder, namespace, e2eConfig.GetIntervals, clusterName, clusterctlLogFolder, skipCleanup)
})
})
203 changes: 203 additions & 0 deletions test/e2e/healthchek.go
@@ -0,0 +1,203 @@
package e2e

import (
"context"
"fmt"
"time"

infrav1 "github.com/metal3-io/cluster-api-provider-metal3/api/v1beta1"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/intstr"

clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
"sigs.k8s.io/cluster-api/test/framework"

"sigs.k8s.io/controller-runtime/pkg/client"
)

const (
timeout = 40 * time.Minute
freq = 30 * time.Second
)

type HealthCheckInput struct {
BootstrapClusterProxy framework.ClusterProxy
ClusterName string
Namespace string
}

func healthcheck(ctx context.Context, inputGetter func() HealthCheckInput) {
input := inputGetter()
cli := input.BootstrapClusterProxy.GetClient()
namespace := input.Namespace
clusterName := input.ClusterName
controlplaneM3Machines, workerM3Machines := GetMetal3Machines(ctx, cli, clusterName, namespace)

// Worker
By("Healthchecking the workers")
workerHealthcheck, err := DeployWorkerHealthCheck(ctx, cli, namespace, clusterName)
Expect(err).ToNot(HaveOccurred())
workerMachineName, err := Metal3MachineToMachineName(workerM3Machines[0])
Expect(err).ToNot(HaveOccurred())
workerMachine := GetMachine(ctx, cli, client.ObjectKey{Name: workerMachineName, Namespace: namespace})
workerIP, err := MachineToIPAddress(ctx, cli, &workerMachine)
Expect(err).ToNot(HaveOccurred())
Expect(runCommand("", "", workerIP, "metal3", "systemctl stop kubelet")).To(Succeed())
// Wait until node is marked unhealthy and then check that it becomes healthy again
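// With a single worker machine, CurrentHealthy is expected to drop from 1 to 0 and then recover to 1.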
Logf("Waiting for unhealthy worker...")
WaitForHealthCheckCurrentHealthyToMatch(ctx, cli, 0, workerHealthcheck, timeout, freq)
Logf("Waiting for remediationrequest to exist ...")
WaitForRemediationRequest(ctx, cli, client.ObjectKeyFromObject(&workerMachine), true, timeout, freq)
Logf("Waiting for worker to get healthy again...")
WaitForHealthCheckCurrentHealthyToMatch(ctx, cli, 1, workerHealthcheck, timeout, freq)
Logf("Waiting for remediationrequest to not exist ...")
WaitForRemediationRequest(ctx, cli, client.ObjectKeyFromObject(&workerMachine), false, timeout, freq)

// Controlplane
By("Healthchecking the controlplane")
controlplaneHealthcheck, err := DeployControlplaneHealthCheck(ctx, cli, namespace, clusterName)
Expect(err).ToNot(HaveOccurred())
controlplaneMachineName, err := Metal3MachineToMachineName(controlplaneM3Machines[0])
Expect(err).ToNot(HaveOccurred())
controlplaneMachine := GetMachine(ctx, cli, client.ObjectKey{Name: controlplaneMachineName, Namespace: namespace})
controlplaneIP, err := MachineToIPAddress(ctx, cli, &controlplaneMachine)
Expect(err).ToNot(HaveOccurred())
Expect(runCommand("", "", controlplaneIP, "metal3", "systemctl stop kubelet")).To(Succeed())
// Wait until node is marked unhealthy and then check that it becomes healthy again
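// With three controlplane machines, CurrentHealthy is expected to drop from 3 to 2 and then recover to 3.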
Logf("Waiting for unhealthy controlplane ...")
WaitForHealthCheckCurrentHealthyToMatch(ctx, cli, 2, controlplaneHealthcheck, timeout, freq)
Logf("Waiting for remediationrequest to exist ...")
WaitForRemediationRequest(ctx, cli, client.ObjectKeyFromObject(&controlplaneMachine), true, timeout, freq)
Logf("Waiting for controlplane to be healthy again...")
WaitForHealthCheckCurrentHealthyToMatch(ctx, cli, 3, controlplaneHealthcheck, timeout, freq)
Logf("Waiting for remediationrequest to not exist ...")
WaitForRemediationRequest(ctx, cli, client.ObjectKeyFromObject(&controlplaneMachine), false, timeout, freq)
}

// DeployControlplaneHealthCheck creates a MachineHealthcheck and Metal3RemediationTemplate for controlplane machines.
func DeployControlplaneHealthCheck(ctx context.Context, cli client.Client, namespace, clusterName string) (*clusterv1.MachineHealthCheck, error) {
remediationTemplateName := "controlplane-remediation-request"
healthCheckName := "controlplane-healthcheck"
matchLabels := map[string]string{
"cluster.x-k8s.io/control-plane": "",
}
healthcheck, err := DeployMachineHealthCheck(ctx, cli, namespace, clusterName, remediationTemplateName, healthCheckName, matchLabels)
if err != nil {
return nil, fmt.Errorf("creating controlplane healthcheck failed: %w", err)
}
return healthcheck, nil
}

// DeployWorkerHealthCheck creates a MachineHealthcheck and Metal3RemediationTemplate for worker machines.
func DeployWorkerHealthCheck(ctx context.Context, cli client.Client, namespace, clusterName string) (*clusterv1.MachineHealthCheck, error) {
remediationTemplateName := "worker-remediation-request"
healthCheckName := "worker-healthcheck"
matchLabels := map[string]string{
"nodepool": "nodepool-0",
}
healthcheck, err := DeployMachineHealthCheck(ctx, cli, namespace, clusterName, remediationTemplateName, healthCheckName, matchLabels)
if err != nil {
return nil, fmt.Errorf("creating worker healthcheck failed: %w", err)
}
return healthcheck, nil
}

// DeployMachineHealthCheck creates a MachineHealthcheck and Metal3RemediationTemplate with given values.
func DeployMachineHealthCheck(ctx context.Context, cli client.Client, namespace, clusterName, remediationTemplateName, healthCheckName string, matchLabels map[string]string) (*clusterv1.MachineHealthCheck, error) {
remediationTemplate := infrav1.Metal3RemediationTemplate{
TypeMeta: metav1.TypeMeta{
Kind: "Metal3RemediationTemplate",
},
ObjectMeta: metav1.ObjectMeta{
Name: remediationTemplateName,
Namespace: namespace,
},
Spec: infrav1.Metal3RemediationTemplateSpec{
Template: infrav1.Metal3RemediationTemplateResource{
Spec: infrav1.Metal3RemediationSpec{
Strategy: &infrav1.RemediationStrategy{
Type: infrav1.RebootRemediationStrategy,
RetryLimit: 1,
Timeout: &metav1.Duration{Duration: time.Second * 300},
},
},
},
},
}

err := cli.Create(ctx, &remediationTemplate)
if err != nil {
return nil, fmt.Errorf("couldn't create remediation template: %w", err)
}

healthCheck := &clusterv1.MachineHealthCheck{
TypeMeta: metav1.TypeMeta{
Kind: "MachineHealthCheck",
},
ObjectMeta: metav1.ObjectMeta{
Name: healthCheckName,
Namespace: namespace,
},
Spec: clusterv1.MachineHealthCheckSpec{
ClusterName: clusterName,
Selector: metav1.LabelSelector{
MatchLabels: matchLabels,
},
UnhealthyConditions: []clusterv1.UnhealthyCondition{
{
Type: corev1.NodeReady,
Status: corev1.ConditionUnknown,
Timeout: metav1.Duration{
Duration: time.Second * 300,
},
},
{
Type: corev1.NodeReady,
Status: "False",
Timeout: metav1.Duration{
Duration: time.Second * 300,
},
},
},
MaxUnhealthy: &intstr.IntOrString{
Type: intstr.String,
StrVal: "100%",
},
NodeStartupTimeout: &clusterv1.ZeroDuration,
RemediationTemplate: &corev1.ObjectReference{
Kind: "Metal3RemediationTemplate",
APIVersion: "infrastructure.cluster.x-k8s.io/v1beta1",
Name: remediationTemplateName,
},
},
}
err = cli.Create(ctx, healthCheck)
if err != nil {
return nil, fmt.Errorf("couldn't create healthCheck: %w", err)
}
return healthCheck, nil
}

// WaitForHealthCheckCurrentHealthyToMatch waits until the healthcheck's CurrentHealthy count matches the given number.
func WaitForHealthCheckCurrentHealthyToMatch(ctx context.Context, cli client.Client, number int32, healthcheck *clusterv1.MachineHealthCheck, timeout, frequency time.Duration) {
Eventually(func(g Gomega) int32 {
g.Expect(cli.Get(ctx, client.ObjectKeyFromObject(healthcheck), healthcheck)).To(Succeed())
return healthcheck.Status.CurrentHealthy
}, timeout, frequency).Should(Equal(number))
}

// WaitForRemediationRequest waits until the remediation request created by the healthcheck either exists or no longer exists, depending on toExist.
func WaitForRemediationRequest(ctx context.Context, cli client.Client, healthcheckName types.NamespacedName, toExist bool, timeout, frequency time.Duration) {
Eventually(func(g Gomega) {
remediation := &infrav1.Metal3Remediation{}
if toExist {
g.Expect(cli.Get(ctx, healthcheckName, remediation)).To(Succeed())
} else {
g.Expect(cli.Get(ctx, healthcheckName, remediation)).NotTo(Succeed())
}
}, timeout, frequency).Should(Succeed())
}
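DeployControlplaneHealthCheck and DeployWorkerHealthCheck only differ in the object names and label selector they pass through, so a healthcheck for any other machine set can be deployed the same way. A sketch with a hypothetical second worker pool; the names, the nodepool label and the expected machine count are illustrative and not part of this change:

// Deploy a MachineHealthCheck plus Metal3RemediationTemplate for a second,
// hypothetical worker pool and wait until its two machines report healthy.
mhc, err := DeployMachineHealthCheck(ctx, cli, namespace, clusterName,
    "nodepool-1-remediation-request", "nodepool-1-healthcheck",
    map[string]string{"nodepool": "nodepool-1"})
Expect(err).ToNot(HaveOccurred())
WaitForHealthCheckCurrentHealthyToMatch(ctx, cli, 2, mhc, timeout, freq)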
@@ -18,7 +18,7 @@ import (
"sigs.k8s.io/controller-runtime/pkg/client"
)

type Metal3RemediationInput struct {
type NodeDeletionRemediation struct {
E2EConfig *clusterctl.E2EConfig
BootstrapClusterProxy framework.ClusterProxy
TargetCluster framework.ClusterProxy
@@ -28,7 +28,7 @@ }
}

/*
* Metal3 Remediation Test
* Node Deletion Remediation Test
*
* This test evaluates node deletion in reboot remediation feature added to CAPM3 Remediation Controller.
* issue #392: Reboot remediation is incomplete
@@ -51,12 +51,10 @@ type Metal3RemediationInput struct {
* Metal3Remediation test ensures that Metal3 Remediation Controller can effectively remediate worker nodes by orchestrating
* the reboot process and validating the successful recovery of the nodes. It helps ensure the stability and
* resiliency of the cluster by allowing workloads to be seamlessly migrated from unhealthy nodes to healthy nodes
*
* TODO: Add full metal3remediation test issue #1060: Add Healthcheck Test to E2E for CAPM3.
*/

func metal3remediation(ctx context.Context, inputGetter func() Metal3RemediationInput) {
Logf("Starting metal3 remediation tests")
func nodeDeletionRemediation(ctx context.Context, inputGetter func() NodeDeletionRemediation) {
Logf("Starting node deletion remediation tests")
input := inputGetter()
bootstrapClient := input.BootstrapClusterProxy.GetClient()
targetClient := input.TargetCluster.GetClient()
@@ -128,7 +126,7 @@ func metal3remediation(ctx context.Context, inputGetter func() Metal3Remediation
return apierrors.IsNotFound(err)
}, 2*time.Minute, 10*time.Second).Should(BeTrue(), "Metal3Remediation should have been deleted")

By("METAL3REMEDIATION TESTS PASSED!")
By("NODE DELETION TESTS PASSED!")
}

func waitForNodeDeletion(ctx context.Context, cl client.Client, name string, intervals ...interface{}) {
28 changes: 23 additions & 5 deletions test/e2e/remediation_based_feature_test.go
@@ -20,7 +20,7 @@ import (
* 1. Metal3Remediation Test: This test specifically evaluates the Metal3 Remediation Controller's node deletion feature in the reboot remediation strategy.
* 2. Remediation Test: This test focuses on verifying various annotations and actions related to remediation in the CAPM3 (Cluster API Provider for Metal3).
*
* Metal3Remediation Test:
* NodeDeletionRemediation Test:
* - Retrieve the list of Metal3 machines associated with the worker nodes.
* - Identify the target worker Metal3Machine and its corresponding BareMetalHost (BMH) object.
* - Create a Metal3Remediation resource with a remediation strategy of type "Reboot" and a specified timeout.
@@ -31,6 +31,15 @@ import (
* - Delete the Metal3Remediation resource.
* - Verify that the Metal3Remediation resource has been successfully deleted.
*
* Healthcheck Test:
* - For both worker and controlplane machines:
* - Create and deploy machinehealthcheck.
* - Stop kubelet on the machine.
* - Wait for the healthcheck to notice the unhealthy machine.
* - Wait for the remediation request to be created.
* - Wait for the machine to appear healthy again.
* - Wait until the remediation request has been deleted.
*
* Remediation Test:
* - Reboot Annotation: Mark a worker BMH for reboot and wait for the associated VM to transition to the "shutoff" state and then to the "running" state.
* - Poweroff Annotation: Verify the power off and power on actions by turning off and on the specified machines.
@@ -62,9 +71,9 @@ var _ = Describe("Testing nodes remediation [remediation] [features]", func() {
targetCluster, _ = createTargetCluster(e2eConfig.GetVariable("KUBERNETES_VERSION"))

// Run Metal3Remediation test first, doesn't work after remediation...
By("Running Metal3Remediation tests")
metal3remediation(ctx, func() Metal3RemediationInput {
return Metal3RemediationInput{
By("Running node deletion remediation tests")
nodeDeletionRemediation(ctx, func() NodeDeletionRemediation {
return NodeDeletionRemediation{
E2EConfig: e2eConfig,
BootstrapClusterProxy: bootstrapClusterProxy,
TargetCluster: targetCluster,
@@ -74,7 +83,16 @@ }
}
})

By("Running remediation tests")
By("Running healthcheck tests")
healthcheck(ctx, func() HealthCheckInput {
return HealthCheckInput{
BootstrapClusterProxy: bootstrapClusterProxy,
ClusterName: clusterName,
Namespace: namespace,
}
})

By("Running annotated powercycle remediation tests")
remediation(ctx, func() RemediationInput {
return RemediationInput{
E2EConfig: e2eConfig,
