# Caller workflow for the scale test: invokes the reusable workflow defined in
# .github/workflows/scale-test.yaml with a small, fixed workload.
name: Scale Test

on:
  # NOTE(review): triggered by pushes to a development branch only; the daily
  # schedule below is still commented out.
  push:
    branches:
      - alexcastilio/scale-test-workflow
  # schedule:
  #   - cron: "0 0 * * *"

# permissions:
#   contents: read
#   id-token: write

jobs:
  call-scale-test:
    uses: ./.github/workflows/scale-test.yaml
    with:
      num_deployments: 10
      num_replicas: 10
      # TODO: Fix value
      num_netpol: 0
      cleanup: false
    # A reusable workflow does not receive the caller's secrets unless they are
    # passed explicitly. scale-test.yaml reads secrets.AZURE_APP_INSIGHTS_KEY,
    # so forward the caller's secrets to it.
    secrets: inherit
// ScaleTestInfraHandler holds the raw string settings that describe the scale
// test infrastructure and resolves each of them to a usable value, applying a
// default where a setting is empty.
type ScaleTestInfraHandler struct {
	location       string
	subscriptionID string
	resourceGroup  string
	clusterName    string
	nodesPerPool   string
}

// GetSubscriptionID returns the configured subscription ID. It fails the test
// immediately when the subscription ID is empty, because there is no sensible
// default for it.
func (s ScaleTestInfraHandler) GetSubscriptionID(t *testing.T) string {
	t.Helper()
	if s.subscriptionID == "" {
		t.Fatal("subscription ID must not be empty")
	}
	return s.subscriptionID
}

// GetLocation returns the configured location, or "westus2" when none is set.
func (s ScaleTestInfraHandler) GetLocation(t *testing.T) string {
	if s.location == "" {
		return "westus2"
	}
	return s.location
}

// GetResourceGroup returns the configured resource group. When none is set it
// falls back to the cluster name, so both resources share one name.
func (s ScaleTestInfraHandler) GetResourceGroup(t *testing.T) string {
	if s.resourceGroup != "" {
		return s.resourceGroup
	}
	// Use the cluster name as the resource group name by default.
	return s.GetClusterName(t)
}

// GetNodesPerPool returns the configured number of nodes per pool, or 5 when
// the setting is empty. It fails the test immediately when the setting is not
// a non-negative integer that fits in an int32.
func (s ScaleTestInfraHandler) GetNodesPerPool(t *testing.T) int32 {
	t.Helper()
	if s.nodesPerPool == "" {
		return 5
	}
	nodesPerPool, err := parseNodesPerPool(s.nodesPerPool)
	if err != nil {
		t.Fatalf("NODES_PER_POOL must be a non-negative 32-bit integer, got %q: %v", s.nodesPerPool, err)
	}
	return nodesPerPool
}

// GetClusterName returns the configured cluster name, or "retina-scale-test"
// when none is set.
func (s ScaleTestInfraHandler) GetClusterName(t *testing.T) string {
	if s.clusterName != "" {
		return s.clusterName
	}
	return "retina-scale-test"
}

// parseNodesPerPool converts raw to an int32 node count. Parsing with a 32-bit
// size makes out-of-range input an error instead of letting a later int32
// conversion wrap it silently; negative counts are rejected as out of range.
func parseNodesPerPool(raw string) (int32, error) {
	n, err := strconv.ParseInt(raw, 10, 32)
	if err != nil {
		return 0, err
	}
	if n < 0 {
		return 0, strconv.ErrRange
	}
	return int32(n), nil
}
// Test parameters, read once from the environment when the package is
// initialised. Every value is the raw string from the environment; an unset
// variable yields "" and callers are responsible for defaults and parsing.
var (
	// Location is read from LOCATION, falling back to AZURE_LOCATION (the
	// variable the scale test read before this package existed).
	Location = firstEnv("LOCATION", "AZURE_LOCATION")
	// SubscriptionID is read from AZURE_SUBSCRIPTION_ID.
	SubscriptionID = os.Getenv("AZURE_SUBSCRIPTION_ID")
	// ResourceGroup is read from AZURE_RESOURCE_GROUP.
	ResourceGroup = os.Getenv("AZURE_RESOURCE_GROUP")
	// ClusterName is read from CLUSTER_NAME.
	ClusterName = os.Getenv("CLUSTER_NAME")
	// NodesPerPool is read from NODES_PER_POOL.
	NodesPerPool = os.Getenv("NODES_PER_POOL")
	// NumDeployments is read from NUM_DEPLOYMENTS.
	NumDeployments = os.Getenv("NUM_DEPLOYMENTS")
	// NumReplicas is read from NUM_REPLICAS.
	NumReplicas = os.Getenv("NUM_REPLICAS")
	// NumNetworkPolicies is read from NUM_NET_POL.
	NumNetworkPolicies = os.Getenv("NUM_NET_POL")
	// CleanUp is read from CLEANUP.
	CleanUp = os.Getenv("CLEANUP")
)

// firstEnv returns the value of the first environment variable in keys that is
// set to a non-empty value, or "" when none of them is.
func firstEnv(keys ...string) string {
	for _, key := range keys {
		if value := os.Getenv(key); value != "" {
			return value
		}
	}
	return ""
}
+ // var NumRealDeployments int + // var NumReplicas int + // var err error + // + // if env.NumDeployments != "" { + // NumRealDeployments, err = strconv.Atoi(env.NumDeployments) + // require.NoError(t, err, "Failed to convert NUM_DEPLOYMENTS to int") + // } else { + // NumRealDeployments = 1000 + // } + // + // if env.NumReplicas != "" { + // NumReplicas, err = strconv.Atoi(env.NumReplicas) + // require.NoError(t, err, "Failed to convert NUM_REPLICAS to int") + // } else { + // NumReplicas = "40" + // NumNetworkPolicies := env.NumNetworkPolicies + // CleanUp := env.CleanUp + // + // if NumDeployments != "" { + // } else { + // NumRealDeployments = 1000 + // } + // if NumReplicas != "" { + // opt.NumRealReplicas, err = strconv.Atoi(NumReplicas) + // require.NoError(t, err) + // } + // if NumNetworkPolicies != "" { + // opt.NumNetworkPolicies, err = strconv.Atoi(NumNetworkPolicies) + // require.NoError(t, err) + // } + // if CleanUp != "" { + // opt.DeleteLabels, err = strconv.ParseBool(CleanUp) + // require.NoError(t, err) + // } return scaletest.Options{ Namespace: "scale-test", MaxKwokPodsPerNode: 0, @@ -43,6 +80,57 @@ func DefaultScaleTestOptions() scaletest.Options { } } +func GetScaleTestInfra(subID, rg, clusterName, location, kubeConfigFilePath string, nodesPerPool int32, createInfra bool) *types.Job { + job := types.NewJob("Get scale test infrastructure") + + if createInfra { + job.AddStep(&azure.CreateResourceGroup{ + SubscriptionID: subID, + ResourceGroupName: rg, + Location: location, + }, nil) + + job.AddStep(&azure.CreateVNet{ + VnetName: "testvnet", + VnetAddressSpace: "10.0.0.0/9", + }, nil) + + job.AddStep(&azure.CreateSubnet{ + SubnetName: "testsubnet", + SubnetAddressSpace: "10.0.0.0/12", + }, nil) + + job.AddStep(&azure.CreateNPMCluster{ + ClusterName: clusterName, + PodCidr: "10.128.0.0/9", + DNSServiceIP: "192.168.0.10", + ServiceCidr: "192.168.0.0/28", + NodesPerPool: nodesPerPool, + }, nil) + + job.AddStep(&azure.GetAKSKubeConfig{ + 
KubeConfigFilePath: kubeConfigFilePath, + }, nil) + + } else { + job.AddStep(&azure.GetAKSKubeConfig{ + KubeConfigFilePath: kubeConfigFilePath, + ClusterName: clusterName, + SubscriptionID: subID, + ResourceGroupName: rg, + Location: location, + }, nil) + } + + job.AddStep(&generic.LoadFlags{ + TagEnv: generic.DefaultTagEnv, + ImageNamespaceEnv: generic.DefaultImageNamespace, + ImageRegistryEnv: generic.DefaultImageRegistry, + }, nil) + + return job +} + func ScaleTest(opt *scaletest.Options) *types.Job { job := types.NewJob("Scale Test") @@ -66,7 +154,7 @@ func ScaleTest(opt *scaletest.Options) *types.Job { job.AddStep(&scaletest.GetAndPublishMetrics{ Labels: opt.LabelsToGetMetrics, AdditionalTelemetryProperty: opt.AdditionalTelemetryProperty, - OutputFilePath: os.Getenv("OUTPUT_FILEPATH"), + OutputFilePath: os.Getenv("OUTPUT_FILEPATH"), }, &types.StepOptions{ SkipSavingParametersToJob: true, RunInBackgroundWithID: "get-metrics", diff --git a/test/e2e/scale_test.go b/test/e2e/scale_test.go index 6769dccc09..aa302113ef 100644 --- a/test/e2e/scale_test.go +++ b/test/e2e/scale_test.go @@ -3,8 +3,6 @@ package retina import ( - "crypto/rand" - "math/big" "os" "path/filepath" "strconv" @@ -12,6 +10,7 @@ import ( "github.com/microsoft/retina/test/e2e/common" "github.com/microsoft/retina/test/e2e/framework/azure" + "github.com/microsoft/retina/test/e2e/framework/params" "github.com/microsoft/retina/test/e2e/framework/generic" "github.com/microsoft/retina/test/e2e/framework/helpers" "github.com/microsoft/retina/test/e2e/framework/types" @@ -23,25 +22,11 @@ func TestE2ERetina_Scale(t *testing.T) { ctx, cancel := helpers.Context(t) defer cancel() - clusterName := common.ClusterNameForE2ETest(t) - - subID := os.Getenv("AZURE_SUBSCRIPTION_ID") - require.NotEmpty(t, subID) - - location := os.Getenv("AZURE_LOCATION") - if location == "" { - nBig, err := rand.Int(rand.Reader, big.NewInt(int64(len(common.AzureLocations)))) - if err != nil { - t.Fatal("Failed to generate a secure 
random index", err) - } - location = common.AzureLocations[nBig.Int64()] - } - - rg := os.Getenv("AZURE_RESOURCE_GROUP") - if rg == "" { - // Use the cluster name as the resource group name by default. - rg = clusterName - } + clusterName := common.ScaleTestInfra.GetClusterName(t) + subID := common.ScaleTestInfra.GetSubscriptionID(t) + location := common.ScaleTestInfra.GetLocation(t) + rg := common.ScaleTestInfra.GetResourceGroup(t) + nodesPerPool := common.ScaleTestInfra.GetNodesPerPool(t) cwd, err := os.Getwd() require.NoError(t, err) @@ -56,10 +41,10 @@ func TestE2ERetina_Scale(t *testing.T) { opt := jobs.DefaultScaleTestOptions() opt.KubeconfigPath = kubeConfigFilePath - NumDeployments := os.Getenv("NUM_DEPLOYMENTS") - NumReplicas := os.Getenv("NUM_REPLICAS") - NumNetworkPolicies := os.Getenv("NUM_NET_POL") - CleanUp := os.Getenv("CLEANUP") + NumDeployments := params.NumDeployments + NumReplicas := params.NumReplicas + NumNetworkPolicies := params.NumNetworkPolicies + CleanUp := params.CleanUp if NumDeployments != "" { opt.NumRealDeployments, err = strconv.Atoi(NumDeployments) @@ -89,9 +74,11 @@ func TestE2ERetina_Scale(t *testing.T) { opt.LabelsToGetMetrics = map[string]string{"k8s-app": "retina"} + createInfra := *common.CreateInfra + // CreateTestInfra - createTestInfra := types.NewRunner(t, jobs.CreateTestInfra(subID, rg, clusterName, location, kubeConfigFilePath, *common.CreateInfra)) - createTestInfra.Run(ctx) + infra := types.NewRunner(t, jobs.GetScaleTestInfra(subID, rg, clusterName, location, kubeConfigFilePath, nodesPerPool, createInfra)) + infra.Run(ctx) t.Cleanup(func() { if *common.DeleteInfra {