Skip to content

Commit

Permalink
test: automate scale test execution
Browse files Browse the repository at this point in the history
Signed-off-by: Alex Castilio dos Santos <[email protected]>
  • Loading branch information
alexcastilio committed Jan 21, 2025
1 parent b3cd0ec commit 09bfdd9
Show file tree
Hide file tree
Showing 10 changed files with 279 additions and 46 deletions.
24 changes: 24 additions & 0 deletions .github/workflows/daily-scale-test.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Daily Scale Test: invokes the reusable scale-test workflow once a day
# with a large-cluster parameter set.
name: Daily Scale Test

on:
  push:
    branches:
      # NOTE(review): push trigger on a personal branch — presumably left in
      # for developing this workflow; confirm it should remain after merge.
      - alexcastilio/scale-test-workflow
  schedule:
    # Every day at 00:00 UTC.
    - cron: "0 0 * * *"

permissions:
  contents: read
  id-token: write  # required for federated (OIDC) Azure login

jobs:
  call-scale-test:
    uses: ./.github/workflows/scale-test.yaml
    with:
      num_deployments: 300
      num_replicas: 100
      # TODO: Fix values — num_netpol/num_nodes below are placeholders.
      num_netpol: 300
      num_nodes: 300
      # NOTE(review): cleanup disabled — the daily run leaves its cluster
      # and resource group behind; confirm this is intentional.
      cleanup: false
    secrets: inherit
26 changes: 12 additions & 14 deletions .github/workflows/scale-test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ on:
description: "Image Namespace (if not set, default namespace will be used)"
type: string
image_tag:
description: "Image Tag (if not set, default for this commit will be used)"
description: "Image Tag (if not set, latest commit from 'main' will be used)"
type: string
num_deployments:
description: "Number of Traffic Deployments"
Expand All @@ -36,25 +36,21 @@ on:

workflow_call:
inputs:
resource_group:
description: "Azure Resource Group"
required: true
type: string
cluster_name:
description: "AKS Cluster Name"
required: true
type: string
num_deployments:
description: "Number of Traffic Deployments"
default: 1000
default: 100
type: number
num_replicas:
description: "Number of Traffic Replicas per Deployment"
default: 40
default: 10
type: number
num_netpol:
description: "Number of Network Policies"
default: 1000
default: 100
type: number
num_nodes:
description: "Number of nodes per pool"
default: 100
type: number
cleanup:
description: "Clean up environment after test"
Expand Down Expand Up @@ -100,8 +96,10 @@ jobs:
IMAGE_NAMESPACE: ${{ github.repository }}
TAG: ${{ inputs.image_tag }}
AZURE_APP_INSIGHTS_KEY: ${{ secrets.AZURE_APP_INSIGHTS_KEY }}
NODES_PER_POOL: ${{ inputs.num_nodes }}
CREATE_INFRA: ${{ github.event_name != 'workflow_dispatch' }}
shell: bash
run: |
set -euo pipefail
[[ $TAG == "" ]] && TAG=$(make version)
go test -v ./test/e2e/. -timeout 300m -tags=scale -count=1 -args -create-infra=false -delete-infra=false
[[ $TAG == "" ]] && TAG=$(curl -s https://api.github.com/repos/microsoft/retina/commits | jq -r '.[0].sha' | cut -c1-7)
go test -v ./test/e2e/. -timeout 300m -tags=scale -count=1 -args -create-infra=$(echo $CREATE_INFRA) -delete-infra=$(echo $CREATE_INFRA)
53 changes: 51 additions & 2 deletions test/e2e/common/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,12 @@ package common

import (
"flag"
"os"
"os/user"
"strconv"
"testing"
"time"

"github.com/microsoft/retina/test/e2e/framework/params"
"github.com/stretchr/testify/require"
)

Expand All @@ -30,10 +30,59 @@ var (
Architectures = []string{"amd64", "arm64"}
CreateInfra = flag.Bool("create-infra", true, "create a Resource group, vNET and AKS cluster for testing")
DeleteInfra = flag.Bool("delete-infra", true, "delete a Resource group, vNET and AKS cluster for testing")
ScaleTestInfra = ScaleTestInfraHandler{
location: params.Location,
subscriptionID: params.SubscriptionID,
resourceGroup: params.ResourceGroup,
clusterName: params.ClusterName,
nodesPerPool: params.NodesPerPool,
}
)

// ScaleTestInfraHandler exposes the Azure infrastructure parameters for the
// scale test, substituting documented defaults when a value was not supplied
// (values are populated from environment variables via the params package).
type ScaleTestInfraHandler struct {
	location       string
	subscriptionID string
	resourceGroup  string
	clusterName    string
	nodesPerPool   string
}

// GetSubscriptionID returns the configured subscription ID; it may be empty,
// as no default exists for a subscription.
func (s ScaleTestInfraHandler) GetSubscriptionID() string {
	return s.subscriptionID
}

// GetLocation returns the configured Azure region, defaulting to "westus2".
func (s ScaleTestInfraHandler) GetLocation() string {
	if loc := s.location; loc != "" {
		return loc
	}
	return "westus2"
}

// GetResourceGroup returns the configured resource group; when unset, the
// cluster name doubles as the resource-group name.
func (s ScaleTestInfraHandler) GetResourceGroup() string {
	if rg := s.resourceGroup; rg != "" {
		return rg
	}
	return s.GetClusterName()
}

// GetNodesPerPool returns the node count per pool as a string,
// defaulting to "100".
func (s ScaleTestInfraHandler) GetNodesPerPool() string {
	if n := s.nodesPerPool; n != "" {
		return n
	}
	return "100"
}

// GetClusterName returns the configured cluster name,
// defaulting to "retina-scale-test".
func (s ScaleTestInfraHandler) GetClusterName() string {
	if name := s.clusterName; name != "" {
		return name
	}
	return "retina-scale-test"
}

func ClusterNameForE2ETest(t *testing.T) string {
clusterName := os.Getenv("CLUSTER_NAME")
clusterName := params.ClusterName
if clusterName == "" {
curuser, err := user.Current()
require.NoError(t, err)
Expand Down
38 changes: 35 additions & 3 deletions test/e2e/framework/azure/create-cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,24 @@ type CreateCluster struct {
ResourceGroupName string
Location string
ClusterName string
podCidr string
vmSize string
networkPluginMode string
NodesPerPool int32
}

// SetPodCidr overrides the pod CIDR used when creating the cluster and
// returns the receiver so calls can be chained.
func (c *CreateCluster) SetPodCidr(cidr string) *CreateCluster {
	c.podCidr = cidr
	return c
}

// SetVMSize overrides the agent-pool VM size and returns the receiver so
// calls can be chained.
func (c *CreateCluster) SetVMSize(size string) *CreateCluster {
	c.vmSize = size
	return c
}

// SetNetworkPluginMode overrides the network plugin mode (e.g. "overlay")
// and returns the receiver so calls can be chained.
func (c *CreateCluster) SetNetworkPluginMode(mode string) *CreateCluster {
	c.networkPluginMode = mode
	return c
}

func (c *CreateCluster) Run() error {
Expand All @@ -36,8 +54,22 @@ func (c *CreateCluster) Run() error {
if err != nil {
return fmt.Errorf("failed to create client: %w", err)
}
if c.NodesPerPool == 0 {
c.NodesPerPool = MaxNumberOfNodes
}

template := GetStarterClusterTemplate(c.Location, c.NodesPerPool)
if c.podCidr != "" {
template.Properties.NetworkProfile.PodCidr = to.Ptr(c.podCidr)
}
if c.vmSize != "" {
template.Properties.AgentPoolProfiles[0].VMSize = to.Ptr(c.vmSize)
}
if c.networkPluginMode != "" {
template.Properties.NetworkProfile.NetworkPluginMode = to.Ptr(armcontainerservice.NetworkPluginMode(c.networkPluginMode))
}

poller, err := clientFactory.NewManagedClustersClient().BeginCreateOrUpdate(ctx, c.ResourceGroupName, c.ClusterName, GetStarterClusterTemplate(c.Location), nil)
poller, err := clientFactory.NewManagedClustersClient().BeginCreateOrUpdate(ctx, c.ResourceGroupName, c.ClusterName, template, nil)
if err != nil {
return fmt.Errorf("failed to finish the create cluster request: %w", err)
}
Expand All @@ -49,7 +81,7 @@ func (c *CreateCluster) Run() error {
return nil
}

func GetStarterClusterTemplate(location string) armcontainerservice.ManagedCluster {
func GetStarterClusterTemplate(location string, numOfNodes int32) armcontainerservice.ManagedCluster {
id := armcontainerservice.ResourceIdentityTypeSystemAssigned
return armcontainerservice.ManagedCluster{
Location: to.Ptr(location),
Expand All @@ -70,7 +102,7 @@ func GetStarterClusterTemplate(location string) armcontainerservice.ManagedClust
{
Type: to.Ptr(armcontainerservice.AgentPoolTypeVirtualMachineScaleSets),
// AvailabilityZones: []*string{to.Ptr("1")},
Count: to.Ptr[int32](MaxNumberOfNodes),
Count: to.Ptr[int32](numOfNodes),
EnableNodePublicIP: to.Ptr(false),
Mode: to.Ptr(armcontainerservice.AgentPoolModeSystem),
OSType: to.Ptr(armcontainerservice.OSTypeLinux),
Expand Down
2 changes: 1 addition & 1 deletion test/e2e/framework/azure/enable-ama.go
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ az aks update --enable-azure-monitor-metrics \
return fmt.Errorf("failed to write cluster JSON to file for AMA: %w", err)
}

poller, err := aksClientFactory.NewManagedClustersClient().BeginCreateOrUpdate(ctx, c.ResourceGroupName, c.ClusterName, GetStarterClusterTemplate(c.Location), nil)
poller, err := aksClientFactory.NewManagedClustersClient().BeginCreateOrUpdate(ctx, c.ResourceGroupName, c.ClusterName, GetStarterClusterTemplate(c.Location, MaxNumberOfNodes), nil)
if err != nil {
return fmt.Errorf("failed to finish the update cluster request for AMA: %w", err)
}
Expand Down
76 changes: 76 additions & 0 deletions test/e2e/framework/kubernetes/label-nodes.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
package kubernetes

import (
	"context"
	"encoding/json"
	"fmt"
	"log"
	"strings"
	"time"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/clientcmd"
)

// patchStringValue is a single JSON Patch (RFC 6902) operation carrying a
// string value; a slice of these is marshaled and sent as a JSONPatchType
// patch to the Kubernetes API.
type patchStringValue struct {
	Op    string `json:"op"`    // patch operation, e.g. "add"
	Path  string `json:"path"`  // JSON pointer to the target field
	Value string `json:"value"` // value to set at Path
}

// LabelNodes is a test-framework step that applies a set of labels to every
// node in the cluster reachable through the given kubeconfig.
type LabelNodes struct {
	KubeConfigFilePath string            // path to the kubeconfig used to build the client
	Labels             map[string]string // label key/value pairs to add to each node
}

// Prevalidate is a no-op: labeling nodes has no preconditions to check.
// It exists to satisfy the framework's step interface.
func (l *LabelNodes) Prevalidate() error {
	return nil
}

// Run lists every node in the cluster and applies l.Labels to each one via a
// JSON Patch (RFC 6902). It returns on the first node that fails to patch.
func (l *LabelNodes) Run() error {
	// Nothing to do when no labels were requested; avoids sending an empty
	// patch to every node.
	if len(l.Labels) == 0 {
		return nil
	}

	config, err := clientcmd.BuildConfigFromFlags("", l.KubeConfigFilePath)
	if err != nil {
		return fmt.Errorf("error building kubeconfig: %w", err)
	}

	clientset, err := kubernetes.NewForConfig(config)
	if err != nil {
		return fmt.Errorf("error creating Kubernetes client: %w", err)
	}

	ctx, cancel := context.WithTimeout(context.Background(), defaultTimeoutSeconds*time.Second)
	defer cancel()

	nodes, err := clientset.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
	if err != nil {
		return fmt.Errorf("failed to get nodes: %w", err)
	}

	// RFC 6902 requires "~" and "/" inside a path segment to be escaped as
	// "~0" and "~1"; label keys such as "kubernetes.io/os" contain "/".
	escaper := strings.NewReplacer("~", "~0", "/", "~1")
	patch := make([]patchStringValue, 0, len(l.Labels))
	for k, v := range l.Labels {
		patch = append(patch, patchStringValue{
			Op:    "add",
			Path:  "/metadata/labels/" + escaper.Replace(k),
			Value: v,
		})
	}
	b, err := json.Marshal(patch)
	if err != nil {
		return fmt.Errorf("failed to marshal patch: %w", err)
	}

	for i := range nodes.Items {
		log.Println("Labeling node", nodes.Items[i].Name)
		_, err = clientset.CoreV1().Nodes().Patch(ctx, nodes.Items[i].Name, types.JSONPatchType, b, metav1.PatchOptions{})
		if err != nil {
			// Was "failed to patch pod" — this step patches nodes.
			return fmt.Errorf("failed to patch node: %w", err)
		}
	}

	return nil
}

// Stop is a no-op: the step performs a one-shot patch and starts no
// background work that would need stopping.
func (l *LabelNodes) Stop() error {
	return nil
}
17 changes: 17 additions & 0 deletions test/e2e/framework/params/params.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
package params

import (
"os"
)

var (
Location = os.Getenv("LOCATION")
SubscriptionID = os.Getenv("AZURE_SUBSCRIPTION_ID")
ResourceGroup = os.Getenv("AZURE_RESOURCE_GROUP")
ClusterName = os.Getenv("CLUSTER_NAME")
NodesPerPool = os.Getenv("NODES_PER_POOL")
NumDeployments = os.Getenv("NUM_DEPLOYMENTS")
NumReplicas = os.Getenv("NUM_REPLICAS")
NumNetworkPolicies = os.Getenv("NUM_NET_POL")
CleanUp = os.Getenv("CLEANUP")
)
1 change: 1 addition & 0 deletions test/e2e/jobs/jobs.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ func CreateTestInfra(subID, rg, clusterName, location, kubeConfigFilePath string
PodCidr: "10.128.0.0/9",
DNSServiceIP: "192.168.0.10",
ServiceCidr: "192.168.0.0/28",
NodesPerPool: 1,
}, nil)

job.AddStep(&azure.GetAKSKubeConfig{
Expand Down
47 changes: 47 additions & 0 deletions test/e2e/jobs/scale.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ import (
"time"

"github.com/microsoft/retina/test/e2e/common"
"github.com/microsoft/retina/test/e2e/framework/azure"
"github.com/microsoft/retina/test/e2e/framework/generic"
"github.com/microsoft/retina/test/e2e/framework/kubernetes"
"github.com/microsoft/retina/test/e2e/framework/scaletest"
"github.com/microsoft/retina/test/e2e/framework/types"
Expand Down Expand Up @@ -45,6 +47,51 @@ func DefaultScaleTestOptions() scaletest.Options {
}
}

// GetScaleTestInfra builds the job that prepares infrastructure for a scale
// test. When createInfra is true it creates a resource group and an AKS
// cluster (overlay network plugin mode, 100.64.0.0/10 pod CIDR,
// Standard_D4_v3 nodes) and fetches its kubeconfig; otherwise it only
// fetches the kubeconfig of an existing cluster. In both cases it then
// labels all nodes with scale-test=true and loads image flags from the
// environment.
func GetScaleTestInfra(subID, rg, clusterName, location, kubeConfigFilePath string, nodesPerPool int32, createInfra bool) *types.Job {
	job := types.NewJob("Get scale test infrastructure")

	if createInfra {
		job.AddStep(&azure.CreateResourceGroup{
			SubscriptionID:    subID,
			ResourceGroupName: rg,
			Location:          location,
		}, nil)

		// NOTE(review): SubscriptionID/ResourceGroupName/Location are not set
		// on CreateCluster — presumably the job framework propagates them
		// from the CreateResourceGroup step above; confirm.
		job.AddStep((&azure.CreateCluster{
			ClusterName:  clusterName,
			NodesPerPool: nodesPerPool,
		}).
			SetPodCidr("100.64.0.0/10").
			SetVMSize("Standard_D4_v3").
			SetNetworkPluginMode("overlay"), nil)

		// NOTE(review): same assumption here — only the kubeconfig path is
		// set; cluster coordinates are expected to come from earlier steps.
		job.AddStep(&azure.GetAKSKubeConfig{
			KubeConfigFilePath: kubeConfigFilePath,
		}, nil)

	} else {
		// Existing cluster: everything needed to locate it is passed in.
		job.AddStep(&azure.GetAKSKubeConfig{
			KubeConfigFilePath: kubeConfigFilePath,
			ClusterName:        clusterName,
			SubscriptionID:     subID,
			ResourceGroupName:  rg,
			Location:           location,
		}, nil)
	}

	// Mark every node so scale-test workloads can target them by selector.
	job.AddStep(&kubernetes.LabelNodes{
		Labels: map[string]string{"scale-test": "true"},
	}, nil)

	// Resolve image tag/namespace/registry from the environment.
	job.AddStep(&generic.LoadFlags{
		TagEnv:            generic.DefaultTagEnv,
		ImageNamespaceEnv: generic.DefaultImageNamespace,
		ImageRegistryEnv:  generic.DefaultImageRegistry,
	}, nil)

	return job
}

func ScaleTest(opt *scaletest.Options) *types.Job {
job := types.NewJob("Scale Test")

Expand Down
Loading

0 comments on commit 09bfdd9

Please sign in to comment.