This is an automated email from the ASF dual-hosted git repository.
wwei pushed a commit to branch soak-test
in repository https://gitbox.apache.org/repos/asf/yunikorn-release.git
The following commit(s) were added to refs/heads/soak-test by this push:
new 3b51902 [YUNIKORN-3008] Add framework core bits and add clusterloader
end to end test (#197)
3b51902 is described below
commit 3b519023c90bec0ebaef3383e8b5da8faafc8f3d
Author: Shravan Achar <[email protected]>
AuthorDate: Tue May 13 10:30:09 2025 -0700
[YUNIKORN-3008] Add framework core bits and add clusterloader end to end
test (#197)
Co-authored-by: Shravan Achar <[email protected]>
---
soak/autoscaler/run.go | 72 ++++++++++
soak/{pkg/setup => autoscaler}/setup.go | 101 ++++++-------
soak/{pkg/setup => autoscaler}/setup_test.go | 20 +--
soak/conf.yaml | 160 ++-------------------
soak/{conf.yaml => conf.yaml.example} | 1 +
soak/framework/config.go | 13 +-
soak/framework/interface.go | 33 +++++
soak/logger/log.go | 15 ++
soak/main.go | 35 ++++-
soak/pkg/setup/test_conf.yaml | 29 ----
soak/templates/autoscaler-configmap.yaml | 21 ---
soak/templates/kwok-node-template.yaml | 14 +-
soak/templates/kwok-provider-config.yaml | 46 ++++++
.../basic-scheduler-throughput/cl2-metadata.json | 1 +
soak/tests/basic-scheduler-throughput/config.yaml | 81 +++++++++++
.../basic-scheduler-throughput/pod-default.yaml | 25 ++++
16 files changed, 390 insertions(+), 277 deletions(-)
diff --git a/soak/autoscaler/run.go b/soak/autoscaler/run.go
new file mode 100644
index 0000000..a746065
--- /dev/null
+++ b/soak/autoscaler/run.go
@@ -0,0 +1,72 @@
+package autoscaler
+
+import (
+ "fmt"
+ "github.com/apache/yunikorn-release/soak/framework"
+ "github.com/apache/yunikorn-release/soak/logger"
+ "go.uber.org/zap"
+ "os/exec"
+ "path/filepath"
+ "strings"
+)
+
+var log = logger.Logger
+
+type AutoscalingScenario struct {
+ templateConf framework.Template
+ testCases []framework.TestCase
+}
+
+func New(config *framework.Config) *AutoscalingScenario {
+ for _, c := range config.Tests {
+ if c.Name == "autoscaling" {
+ return &AutoscalingScenario{
+ templateConf: c.Template,
+ testCases: c.TestCases,
+ }
+ }
+ }
+ return nil
+}
+
+func (a *AutoscalingScenario) GetName() string {
+ return "autoscaling"
+}
+
+func (a *AutoscalingScenario) Init() error {
+ if err := a.upgradeSchedulerPerConfig(); err != nil {
+ return err
+ }
+
+ return a.setAutoscalerPerConfig()
+}
+
+func (a *AutoscalingScenario) Tests() []framework.TestCase {
+ // enable or disable test cases here
+ return a.testCases
+}
+
+func (a *AutoscalingScenario) Run() ([]string, error) {
+ log := logger.Logger
+ results := make([]string, len(a.testCases))
+ for idx, tests := range a.testCases {
+ clusterLoaderConfigPath := tests.ClusterLoaderConfigPath
+ reportDir := filepath.Dir(clusterLoaderConfigPath)
+ args := []string{fmt.Sprintf("----testconfig=%s",
clusterLoaderConfigPath),
+ "--provider=kind", fmt.Sprintf("--kubeconfig=%s",
a.templateConf.Kubeconfig.Path),
+ "--v=4", fmt.Sprintf("--report-dir=%s", reportDir)}
+ cmd := exec.Command("clusterloader2", args...)
+ log.Info("Clusterloader command to be executed",
+ zap.String("command", fmt.Sprintf("clusterloader2 %s",
strings.Join(args, " "))))
+ results[idx] = reportDir
+ _, err := cmd.CombinedOutput()
+ if err != nil {
+ log.Error("Clusterloader command failed. Check results
directory for more info",
+ zap.String("command",
fmt.Sprintf("clusterloader2 %s", strings.Join(args, " "))))
+
+ return results, err
+
+ }
+ }
+ return results, nil
+}
diff --git a/soak/pkg/setup/setup.go b/soak/autoscaler/setup.go
similarity index 65%
rename from soak/pkg/setup/setup.go
rename to soak/autoscaler/setup.go
index 000375c..8d674f4 100644
--- a/soak/pkg/setup/setup.go
+++ b/soak/autoscaler/setup.go
@@ -14,12 +14,10 @@
limitations under the License.
*/
-package setup
+package autoscaler
import (
"fmt"
- "github.com/apache/yunikorn-core/pkg/log"
- "github.com/apache/yunikorn-release/soak/framework"
"github.com/apache/yunikorn-release/soak/pkg/constants"
"go.uber.org/zap"
"gopkg.in/yaml.v3"
@@ -29,23 +27,24 @@ import (
"strings"
)
-var logger *zap.Logger = log.Log(log.Test)
-
-func setK8sContext() error {
+func (a *AutoscalingScenario) setK8sContext() error {
homeDir, err := os.UserHomeDir()
if err != nil {
return fmt.Errorf("failed to get home directory: %v", err)
}
kubeconfigPath := filepath.Join(homeDir, ".kube", "config")
+ if len(a.templateConf.Kubeconfig.Path) > 0 {
+ kubeconfigPath = a.templateConf.Kubeconfig.Path
+ }
os.Setenv("KUBECONFIG", kubeconfigPath)
- logger.Info("Set KUBECONFIG", zap.String("path", kubeconfigPath))
+ log.Info("Set KUBECONFIG", zap.String("path", kubeconfigPath))
contextCmd := exec.Command("kubectl", "config", "use-context",
constants.KindSoakTestCluster)
contextOutput, err := contextCmd.CombinedOutput()
if err != nil {
return fmt.Errorf("failed to switch kubectl context: %v,
output: %s", err, string(contextOutput))
}
- logger.Info("Kubectl context switch output", zap.String("output",
strings.TrimSpace(string(contextOutput))))
+ log.Info("Kubectl context switch output", zap.String("output",
strings.TrimSpace(string(contextOutput))))
currentContextCmd := exec.Command("kubectl", "config",
"current-context")
_, err = currentContextCmd.CombinedOutput()
@@ -56,13 +55,16 @@ func setK8sContext() error {
return nil
}
-func upgradeSchedulerPerConfig(scheduler framework.SchedulerFields) error {
- if err := setK8sContext(); err != nil {
- logger.Fatal("failed to set kubernetes context", zap.Error(err))
+func (a *AutoscalingScenario) upgradeSchedulerPerConfig() error {
+ if err := a.setK8sContext(); err != nil {
+ log.Fatal("failed to set kubernetes context", zap.Error(err))
return err
}
- logger.Info("Scheduler details",
+ // TODO: Support multiple yunikorn scheduler config directives.
Currently take the first one
+ scheduler := a.templateConf.Scheduler[0]
+
+ log.Info("Scheduler details",
zap.String("VcoreRequests", scheduler.VcoreRequests),
zap.String("MemoryRequests", scheduler.MemoryRequests),
zap.String("VcoreLimits", scheduler.VcoreLimits),
@@ -96,7 +98,7 @@ func upgradeSchedulerPerConfig(scheduler
framework.SchedulerFields) error {
cmd := exec.Command("helm", args...)
- logger.Info("Helm command to be executed",
+ log.Info("Helm command to be executed",
zap.String("command", fmt.Sprintf("helm %s",
strings.Join(args, " "))))
output, err := cmd.CombinedOutput()
@@ -104,7 +106,7 @@ func upgradeSchedulerPerConfig(scheduler
framework.SchedulerFields) error {
return fmt.Errorf("helm upgrade failed: %v", err)
}
- logger.Info("Helm upgrade successful",
+ log.Info("Helm upgrade successful",
zap.String("command", fmt.Sprintf("helm %s",
strings.Join(args, " "))),
zap.String("output", string(output)))
}
@@ -113,74 +115,77 @@ func upgradeSchedulerPerConfig(scheduler
framework.SchedulerFields) error {
kubectlArgs := []string{"apply"}
kubectlArgs = append(kubectlArgs, "-f", scheduler.Path, "-n",
"yunikorn")
kubectlCmd := exec.Command("kubectl", kubectlArgs...)
- logger.Info("Kubectl command to be executed",
+ log.Info("Kubectl command to be executed",
zap.String("command", fmt.Sprintf("kubectl %s",
strings.Join(kubectlArgs, " "))))
kubectlOutput, err := kubectlCmd.CombinedOutput()
if err != nil {
return fmt.Errorf("kubectl apply failed: %v", err)
}
- logger.Info("Kubectl apply successful", zap.String("output",
strings.TrimSpace(string(kubectlOutput))))
+ log.Info("Kubectl apply successful", zap.String("output",
strings.TrimSpace(string(kubectlOutput))))
}
return nil
}
-func setAutoscalerPerConfig(node framework.NodeFields) error {
- if err := setK8sContext(); err != nil {
- logger.Fatal("failed to set kubernetes context", zap.Error(err))
+func (a *AutoscalingScenario) setAutoscalerPerConfig() error {
+ if err := a.setK8sContext(); err != nil {
+ log.Fatal("failed to set kubernetes context", zap.Error(err))
return err
}
- logger.Info("Node details",
- zap.String("path", node.Path),
- zap.String("NodesDesiredCount", node.DesiredCount),
- zap.String("maxCount", node.MaxCount))
+ // TODO: Support multiple kwok node configs. Currently take the first
node template
+ nodeConfig := a.templateConf.Node[0]
+
+ log.Info("Node details",
+ zap.String("path", nodeConfig.Path),
+ zap.String("NodesDesiredCount", nodeConfig.DesiredCount),
+ zap.String("maxCount", nodeConfig.MaxCount))
- templateContent, err := os.ReadFile(node.Path)
+ templateContent, err := os.ReadFile(nodeConfig.Path)
if err != nil {
- logger.Error("failed to read template file", zap.Error(err))
+ log.Error("failed to read template file", zap.Error(err))
return err
}
var nodeTemplate map[string]interface{}
err = yaml.Unmarshal(templateContent, &nodeTemplate)
if err != nil {
- logger.Error("failed to parse template YAML", zap.Error(err))
+ log.Error("failed to parse template YAML", zap.Error(err))
return err
}
metadata, ok := nodeTemplate["metadata"].(map[string]interface{})
if !ok {
- logger.Error("invalid metadata format in node template")
+ log.Error("invalid metadata format in node template")
return fmt.Errorf("invalid metadata format in node template")
}
annotations, ok := metadata["annotations"].(map[string]interface{})
if !ok {
- logger.Error("invalid annotations format in node template")
+ log.Error("invalid annotations format in node template")
return fmt.Errorf("invalid annotations format in node template")
}
- annotations["cluster-autoscaler.kwok.nodegroup/max-count"] =
node.MaxCount
- annotations["cluster-autoscaler.kwok.nodegroup/min-count"] =
node.DesiredCount
- annotations["cluster-autoscaler.kwok.nodegroup/desired-count"] =
node.DesiredCount
+ annotations["cluster-autoscaler.kwok.nodegroup/max-count"] =
nodeConfig.MaxCount
+ annotations["cluster-autoscaler.kwok.nodegroup/min-count"] =
nodeConfig.DesiredCount
+ annotations["cluster-autoscaler.kwok.nodegroup/desired-count"] =
nodeConfig.DesiredCount
- autoscalerConfigmapPath := "../../templates/autoscaler-configmap.yaml"
+ kwokProviderConfigmap := "../../templates/kwok-provider-config.yaml"
- autoscalerConfigmap, err := os.ReadFile(autoscalerConfigmapPath)
+ autoscalerConfigmap, err := os.ReadFile(kwokProviderConfigmap)
if err != nil {
- logger.Error("failed to read autoscaler configmap template",
zap.Error(err))
+ log.Error("failed to read autoscaler configmap template",
zap.Error(err))
return err
}
var autoscalerNodeList map[string]interface{}
err = yaml.Unmarshal(autoscalerConfigmap, &autoscalerNodeList)
if err != nil {
- logger.Error("failed to parse autoscalerConfigmap YAML",
zap.Error(err))
+ log.Error("failed to parse autoscalerConfigmap YAML",
zap.Error(err))
return err
}
- logger.Info("Autoscaler Node List", zap.Any("autoscalerNodeList",
autoscalerNodeList))
+ log.Info("Autoscaler Node List", zap.Any("autoscalerNodeList",
autoscalerNodeList))
var itemsSlice []interface{}
itemsSlice = append(itemsSlice, nodeTemplate)
@@ -188,14 +193,14 @@ func setAutoscalerPerConfig(node framework.NodeFields)
error {
autoscalerNodeListYaml, err := yaml.Marshal(autoscalerNodeList)
if err != nil {
- logger.Error("failed to convert updated autoscalerNodeList to
YAML", zap.Error(err))
+ log.Error("failed to convert updated autoscalerNodeList to
YAML", zap.Error(err))
return err
}
- logger.Info("Encoded autoscalerNodeListYaml",
zap.Any("autoscalerNodeListYaml", autoscalerNodeListYaml))
+ log.Info("Encoded autoscalerNodeListYaml",
zap.Any("autoscalerNodeListYaml", autoscalerNodeListYaml))
updatedAcCmTempFile, err := os.CreateTemp("",
"updated-autoscaler-configmap-temp.yaml")
if err != nil {
- logger.Error("failed to create
updated-autoscaler-configmap-temp file", zap.Error(err))
+ log.Error("failed to create updated-autoscaler-configmap-temp
file", zap.Error(err))
return err
}
@@ -204,11 +209,11 @@ func setAutoscalerPerConfig(node framework.NodeFields)
error {
if _, err = updatedAcCmTempFile.Write(autoscalerNodeListYaml); err !=
nil {
updatedAcCmTempFile.Close()
- logger.Error("failed to write to
updated-autoscaler-configmap-temp file", zap.Error(err))
+ log.Error("failed to write to updated-autoscaler-configmap-temp
file", zap.Error(err))
return err
}
if err = updatedAcCmTempFile.Close(); err != nil {
- logger.Error("failed to close updated-autoscaler-configmap-temp
file", zap.Error(err))
+ log.Error("failed to close updated-autoscaler-configmap-temp
file", zap.Error(err))
return err
}
@@ -216,31 +221,31 @@ func setAutoscalerPerConfig(node framework.NodeFields)
error {
deleteConfigMapCmd := exec.Command("kubectl", "delete", "cm",
"kwok-provider-templates")
deleteConfigMapCmdOutput, err := deleteConfigMapCmd.CombinedOutput()
if err != nil {
- logger.Error("fail to delete configmap", zap.Error(err))
+ log.Error("fail to delete configmap", zap.Error(err))
return err
}
- logger.Info(string(deleteConfigMapCmdOutput))
+ log.Info(string(deleteConfigMapCmdOutput))
// Create a new autoscaler configMap
createConfigMapCmd := exec.Command("kubectl", "create", "cm",
"kwok-provider-templates",
"--from-file=templates="+updatedAcCmTempFilePath)
createConfigMapCmdOutput, err := createConfigMapCmd.CombinedOutput()
if err != nil {
- logger.Error("fail to create new configmap", zap.Error(err))
+ log.Error("fail to create new configmap", zap.Error(err))
return err
}
- logger.Info(string(createConfigMapCmdOutput))
+ log.Info(string(createConfigMapCmdOutput))
// Restart the autoscaler pod after updating the configmap
restartAutoscalerPodCmd := exec.Command("kubectl", "rollout",
"restart", "deployment", "autoscaler-kwok-cluster-autoscaler")
restartAutoscalerPodCmdOutput, err :=
restartAutoscalerPodCmd.CombinedOutput()
if err != nil {
- logger.Error("failed to restart autoscaler deployment",
zap.Error(err))
+ log.Error("failed to restart autoscaler deployment",
zap.Error(err))
return err
}
- logger.Info("Restarted autoscaler deployment", zap.String("output",
string(restartAutoscalerPodCmdOutput)))
+ log.Info("Restarted autoscaler deployment", zap.String("output",
string(restartAutoscalerPodCmdOutput)))
- logger.Info("Successfully set up kwok provider cluster autoscaler for
desiredNodeCount and MaxNodeCount")
+ log.Info("Successfully set up kwok provider cluster autoscaler for
desiredNodeCount and MaxNodeCount")
return nil
}
diff --git a/soak/pkg/setup/setup_test.go b/soak/autoscaler/setup_test.go
similarity index 70%
rename from soak/pkg/setup/setup_test.go
rename to soak/autoscaler/setup_test.go
index eb64e57..b223983 100644
--- a/soak/pkg/setup/setup_test.go
+++ b/soak/autoscaler/setup_test.go
@@ -14,7 +14,7 @@
limitations under the License.
*/
-package setup
+package autoscaler
import (
"github.com/apache/yunikorn-release/soak/framework"
@@ -24,18 +24,12 @@ import (
)
func TestSetAutoScalerPerConfig(t *testing.T) {
- conf, err := framework.InitConfig("test_conf.yaml")
+ conf, err := framework.InitConfig("conf.yaml")
if err != nil {
- logger.Fatal("failed to parse config", zap.Error(err))
- }
- logger.Info("config successfully loaded", zap.Any("conf", conf))
-
- for _, test := range conf.Tests {
- if len(test.Template.Node) > 0 {
- for _, nodeTemplate := range test.Template.Node {
- err := setAutoscalerPerConfig(nodeTemplate)
- assert.NoError(t, err)
- }
- }
+ log.Fatal("failed to parse config", zap.Error(err))
}
+ log.Info("config successfully loaded", zap.Any("conf", conf))
+ a := New(conf)
+ err = a.setAutoscalerPerConfig()
+ assert.NoError(t, err)
}
diff --git a/soak/conf.yaml b/soak/conf.yaml
index a469951..99a2a97 100644
--- a/soak/conf.yaml
+++ b/soak/conf.yaml
@@ -1,4 +1,3 @@
-#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
@@ -16,150 +15,15 @@
# limitations under the License.
tests:
-- name: autoscaling
- template:
- kubeconfig:
- path: ../templates/kubeconfig
- node:
- - path: ../templates/nodeGroupTemplates.yaml
- maxCount: "$nodesMaxCount"
- desiredCount: "$nodesDesiredCount"
- job:
- - path: ../templates/jobATemplate.yaml
- count: "$numJobs"
- podCount: "$numPods"
- mode: "always" #one of ["always", "random-max-percent",
"fixed-percent"]
- value: "50" # when mode is "random-max-percent" or "fixed-percent"
- - path: ../templates/jobBTemplate.yaml
- count: "$numJobs"
- podCount: "$numPods"
- scheduler:
- - path: ../templates/autoscaling-queues.yaml
- vcoreRequests: 2
- vcoreLimits: 2
- memoryRequests: 16Gi
- memoryLimits: 16Gi
- testCases:
- - name: "1000-nodes-cluster"
- params:
- nodesMaxCount: 1000
- nodesDesiredCount: 20
- numPods: 5000
- numJobs: 200
- schedule: once
- labels: ["short"]
- # labels: ["soak-test"]
- threshold:
- maxRuntime: "10m"
- pendingPods: 0
- metrics:
- maxAllocationDelay: "5s"
- - name: "5000-nodes-cluster"
- params:
- nodesMaxCount: 5000
- nodesDesiredCount: 20
- numPods: 20000
- numJobs: 700
- schedule: once
- runs: 1
- # labels: ["soak-test", "benchmark-test"]
- labels: ["short"]
- threshold:
- maxRuntime: "60m"
- pendingPods: 0
- maxAllocationDelay: "20s"
- - name: "300-nodes-cluster-schedule"
- params:
- nodesMaxCount: 300
- nodesDesiredCount: 0
- numPods: 2000
- numJobs: 150
- schedule: "*/15 * * * *"
- runs: 10
- #labels: ["soak-test"]
- labels: ["super-long"]
- threshold:
- maxRuns: 10
- pendingPods: 0
- metrics:
- maxAllocationDelay: "5s"
-- name: chaos-faults
- template:
- kubeconfig:
- path: ../templates/kubeconfig
- node:
- - path: ../templates/nodeGroupTemplates.yaml
- maxCount: "$nodesMaxCount"
- desiredCount: "$nodesDesiredCount"
- job:
- - path: ../templates/jobATemplate.yaml
- count: "$numJobs"
- podCount: "$numPods"
- choas:
- - path: ../templates/chaos.yaml
- count: "$numChaos"
- scheduler:
- - path: ../templates/chaos-queues.yaml
- vcoreRequests: 2
- vcoreLimits: 2
- memoryRequests: 16Gi
- memoryLimits: 16Gi
- testCases:
- - name: "1000-nodes-cluster"
- params:
- nodesMaxCount: 1000
- nodesDesiredCount: 20
- numPods: 5000
- numJobs: 200
- numChaos: 0
- schedule: once
- labels: ["short"]
- # labels: ["soak-test", "benchmark-test", "integration-test"]
- threshold:
- maxRuntime: "10m"
- pendingPods: 0
- detectDeadlock: false
- metrics:
- schedulerRestarts: 0
- maxAllocationDelay: "10s"
- - name: "5000-nodes-cluster"
- params:
- nodesMaxCount: 5000
- nodesDesiredCount: 20
- numPods: 20000
- numJobs: 700
- numChaos: 200
- schedule: once
- runs: 1
- labels: ["long"]
- # labels: ["soak-test", "benchmark-test"]
- threshold:
- maxRuntime: "60m"
- pendingPods: 0
- detectDeadlock: true
- metrics:
- schedulerRestarts: 1
- maxAllocationDelay: "60s"
- - name: "300-nodes-cluster-schedule"
- params:
- nodesMaxCount: 300
- nodesDesiredCount: 0
- numPods: 2000
- numJobs: 150
- numChaos: 10
- schedule: "*/15 * * * *"
- runs: 10
- # labels: ["soak-test"]
- labels: ["super-long"]
- threshold:
- maxRuntime: "60m"
- pendingPods: 0
- detectDeadlock: true
- metrics:
- schedulerRestarts: 5
- maxAllocationDelay: "60s"
- prom:
- - query:
'sum(rate(go_memstats_heap_inuse_bytes{service="yunikorn"}[60m])) by (service)'
- expression: 'sprintf("%.0f", query_result / 1000000)'
- value: '20'
- op: '<='
\ No newline at end of file
+ - name: autoscaling
+ template:
+ node:
+ - path: ../../templates/kwok-node-template.yaml
+ maxCount: "10"
+ desiredCount: "5"
+ scheduler:
+ - path: ../../templates/autoscaling-queues.yaml
+ vcoreRequests: 2
+ vcoreLimits: 2
+ memoryRequests: 16Gi
+ memoryLimits: 16Gi
\ No newline at end of file
diff --git a/soak/conf.yaml b/soak/conf.yaml.example
similarity index 97%
copy from soak/conf.yaml
copy to soak/conf.yaml.example
index a469951..30c42f2 100644
--- a/soak/conf.yaml
+++ b/soak/conf.yaml.example
@@ -41,6 +41,7 @@ tests:
memoryLimits: 16Gi
testCases:
- name: "1000-nodes-cluster"
+ clusterLoaderConfigPath: ../tests/basic-scheduler-throughput/config.yaml
params:
nodesMaxCount: 1000
nodesDesiredCount: 20
diff --git a/soak/framework/config.go b/soak/framework/config.go
index 1de9b14..70582de 100644
--- a/soak/framework/config.go
+++ b/soak/framework/config.go
@@ -89,12 +89,13 @@ type Threshold struct {
}
type TestCase struct {
- Name string `yaml:"name,omitempty"`
- Params TestCaseParams `yaml:"params,omitempty"`
- Schedule string `yaml:"schedule,omitempty"`
- Runs int `yaml:"runs,omitempty"`
- Labels []string `yaml:"labels,omitempty"`
- Threshold Threshold `yaml:"threshold,omitempty"`
+ Name string `yaml:"name,omitempty"`
+ Params TestCaseParams `yaml:"params,omitempty"`
+ Schedule string `yaml:"schedule,omitempty"`
+ Runs int `yaml:"runs,omitempty"`
+ Labels []string `yaml:"labels,omitempty"`
+ Threshold Threshold `yaml:"threshold,omitempty"`
+ ClusterLoaderConfigPath string
`yaml:"clusterLoaderConfigPath,omitempty"`
}
type Test struct {
diff --git a/soak/framework/interface.go b/soak/framework/interface.go
new file mode 100644
index 0000000..e4ae8a2
--- /dev/null
+++ b/soak/framework/interface.go
@@ -0,0 +1,33 @@
+package framework
+
+import (
+ "github.com/apache/yunikorn-release/soak/logger"
+ "go.uber.org/zap"
+)
+
+var log = logger.Logger
+
+type Scenarios struct {
+ registeredTestScenarios map[string]TestScenario
+}
+
+var testScenarios Scenarios
+
+func init() {
+ testScenarios.registeredTestScenarios = make(map[string]TestScenario)
+}
+
+func Register(ts TestScenario) {
+ testScenarios.registeredTestScenarios[ts.GetName()] = ts
+ log.Info("register scenario", zap.String("scenarioName", ts.GetName()))
+}
+
+func GetRegisteredTestScenarios() map[string]TestScenario {
+ return testScenarios.registeredTestScenarios
+}
+
+type TestScenario interface {
+ GetName() string
+ Init() error
+ Run() ([]string, error)
+}
diff --git a/soak/logger/log.go b/soak/logger/log.go
new file mode 100644
index 0000000..36696d0
--- /dev/null
+++ b/soak/logger/log.go
@@ -0,0 +1,15 @@
+package logger
+
+import (
+ "github.com/apache/yunikorn-core/pkg/log"
+ "go.uber.org/zap"
+ "strconv"
+)
+
+var Logger *zap.Logger = log.Log(log.Test)
+
+func SetLogLevel(level int) {
+ log.UpdateLoggingConfig(map[string]string{
+ "log.level": strconv.Itoa(level),
+ })
+}
diff --git a/soak/main.go b/soak/main.go
index c048fbb..690c6c3 100644
--- a/soak/main.go
+++ b/soak/main.go
@@ -19,8 +19,9 @@
package main
import (
- "github.com/apache/yunikorn-core/pkg/log"
+ "github.com/apache/yunikorn-release/soak/autoscaler"
"github.com/apache/yunikorn-release/soak/framework"
+ "github.com/apache/yunikorn-release/soak/logger"
"go.uber.org/zap"
)
@@ -28,12 +29,36 @@ const (
ConfigFileName = "conf.yaml"
)
-var logger *zap.Logger = log.Log(log.Test)
-
func main() {
conf, err := framework.InitConfig(ConfigFileName)
+ log := logger.Logger
if err != nil {
- logger.Fatal("failed to parse config", zap.Error(err))
+ log.Fatal("failed to parse config", zap.Error(err))
+ }
+ log.Info("config successfully loaded", zap.Any("conf", conf))
+
+ // Register scenarios
+ a := autoscaler.New(conf)
+ if a != nil {
+ framework.Register(a)
+ }
+
+ for _, ts := range framework.GetRegisteredTestScenarios() {
+ err = ts.Init()
+ if err != nil {
+ log.Fatal("failed to initialize scenario",
zap.String("scenarioName", ts.GetName()),
+ zap.Error(err))
+ }
+
+ reportDirs, err := ts.Run()
+ if err != nil {
+ log.Error("failed to run scenario",
zap.String("scenarioName", ts.GetName()),
+ zap.Error(err))
+ }
+ log.Info("Reports are generated for scenario",
+ zap.String("scenarioName", ts.GetName()),
+ zap.Strings("reportDirectories", reportDirs))
+
}
- logger.Info("config successully loaded", zap.Any("conf", conf))
+
}
diff --git a/soak/pkg/setup/test_conf.yaml b/soak/pkg/setup/test_conf.yaml
deleted file mode 100644
index 99a2a97..0000000
--- a/soak/pkg/setup/test_conf.yaml
+++ /dev/null
@@ -1,29 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-tests:
- - name: autoscaling
- template:
- node:
- - path: ../../templates/kwok-node-template.yaml
- maxCount: "10"
- desiredCount: "5"
- scheduler:
- - path: ../../templates/autoscaling-queues.yaml
- vcoreRequests: 2
- vcoreLimits: 2
- memoryRequests: 16Gi
- memoryLimits: 16Gi
\ No newline at end of file
diff --git a/soak/templates/autoscaler-configmap.yaml
b/soak/templates/autoscaler-configmap.yaml
deleted file mode 100644
index 3ca6613..0000000
--- a/soak/templates/autoscaler-configmap.yaml
+++ /dev/null
@@ -1,21 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-apiVersion: v1
-items:
-kind: List
-metadata:
- resourceVersion: ""
diff --git a/soak/templates/kwok-node-template.yaml
b/soak/templates/kwok-node-template.yaml
index eea8399..c33e8c1 100644
--- a/soak/templates/kwok-node-template.yaml
+++ b/soak/templates/kwok-node-template.yaml
@@ -18,9 +18,9 @@ apiVersion: v1
kind: Node
metadata:
annotations:
- cluster-autoscaler.kwok.nodegroup/max-count:
- cluster-autoscaler.kwok.nodegroup/min-count:
- cluster-autoscaler.kwok.nodegroup/desired-count:
+ cluster-autoscaler.kwok.nodegroup/max-count: {{$MAX_COUNT}}
+ cluster-autoscaler.kwok.nodegroup/min-count: {{$MIN_COUNT}}
+ cluster-autoscaler.kwok.nodegroup/desired-count: {{$DESIRED_COUNT}}
labels:
beta.kubernetes.io/arch: amd64
beta.kubernetes.io/os: linux
@@ -31,13 +31,13 @@ metadata:
name: kwok-node
status:
allocatable:
- cpu: 32
+ cpu: "32"
memory: 256Gi
- pods: 110
+ pods: "110"
capacity:
- cpu: 32
+ cpu: "32"
memory: 256Gi
- pods: 110
+ pods: "110"
nodeInfo:
architecture: amd64
bootID: ""
diff --git a/soak/templates/kwok-provider-config.yaml
b/soak/templates/kwok-provider-config.yaml
new file mode 100644
index 0000000..c5c8760
--- /dev/null
+++ b/soak/templates/kwok-provider-config.yaml
@@ -0,0 +1,46 @@
+apiVersion: v1
+data:
+ config: |-
+ # if you see '\n' everywhere, remove all the trailing spaces
+ apiVersion: v1alpha1
+ readNodesFrom: configmap # possible values: [cluster,configmap]
+ nodegroups:
+ # to specify how to group nodes into a nodegroup
+ # e.g., you want to treat nodes with same instance type as a nodegroup
+ # node1: m5.xlarge
+ # node2: c5.xlarge
+ # node3: m5.xlarge
+ # nodegroup1: [node1,node3]
+ # nodegroup2: [node2]
+ fromNodeLabelKey: "kwok-nodegroup"
+ # you can either specify fromNodeLabelKey OR fromNodeAnnotation
+ # fromNodeAnnotation: "abc.domain.com/nodegroup"
+ nodes:
+ # gpuConfig:
+ # # to tell kwok provider what label should be considered as GPU label
+ # gpuLabelKey: "abc.domain.com/accelerator"
+ # availableGPUTypes:
+ # "nvidia-tesla-k80": {}
+ # "nvidia-tesla-p100": {}
+ configmap:
+ name: kwok-provider-templates
+ kwok: {} # default: fetch latest release of kwok from github and install it
+ # # you can also manually specify which kwok release you want to install
+ # # for example:
+ # kwok:
+ # release: v0.3.0
+ # # you can also disable installing kwok in CA code (and install your own
kwok release)
+ # kwok:
+ # install: false (true if not specified)
+kind: ConfigMap
+metadata:
+ annotations:
+ meta.helm.sh/release-name: autoscaler
+ meta.helm.sh/release-namespace: default
+ creationTimestamp: "2025-03-07T18:36:19Z"
+ labels:
+ app.kubernetes.io/managed-by: Helm
+ name: kwok-provider-config
+ namespace: default
+ resourceVersion: "3713"
+ uid: 6c058143-1de9-4f91-8944-51d59cdb17e1
diff --git a/soak/tests/basic-scheduler-throughput/cl2-metadata.json
b/soak/tests/basic-scheduler-throughput/cl2-metadata.json
new file mode 100644
index 0000000..9e26dfe
--- /dev/null
+++ b/soak/tests/basic-scheduler-throughput/cl2-metadata.json
@@ -0,0 +1 @@
+{}
\ No newline at end of file
diff --git a/soak/tests/basic-scheduler-throughput/config.yaml
b/soak/tests/basic-scheduler-throughput/config.yaml
new file mode 100644
index 0000000..0f4f8f5
--- /dev/null
+++ b/soak/tests/basic-scheduler-throughput/config.yaml
@@ -0,0 +1,81 @@
+{{$totalSchedulerThroughputPods := DefaultParam .CL2_SCHEDULER_THROUGHPUT_PODS
10}}
+{{$defaultQps := DefaultParam .CL2_DEFAULT_QPS 10}}
+{{$defaultBurst := DefaultParam .CL2_DEFAULT_BURST 5}}
+{{$uniformQps := DefaultParam .CL2_UNIFORM_QPS 10}}
+
+{{$SCHEDULER_THROUGHPUT_THRESHOLD := DefaultParam
.CL2_SCHEDULER_THROUGHPUT_THRESHOLD 10}}
+
+name: direct-scheduler-throughput
+namespace:
+ number: 1
+tuningSets:
+# default is a tuningset that is meant to be used when we don't have any
specific requirements on pace of operations.
+- name: default
+ globalQPSLoad:
+ qps: {{$defaultQps}}
+ burst: {{$defaultBurst}}
+- name: UniformQPS
+ qpsLoad:
+ qps: {{$uniformQps}}
+steps:
+- name: Creating scheduler throughput measurements
+ measurements:
+ - Identifier: DirectSchedulerThroughputPodStartupLatency
+ Method: PodStartupLatency
+ Params:
+ action: start
+ labelSelector: group = direct-scheduler-throughput
+ threshold: 5s
+ - Identifier: DirectSchedulingThroughput
+# TODO: Move to SchedulingThroughputPrometheus which requires cl2 prom stack
setup as pre-req
+ Method: SchedulingThroughput
+ Params:
+ action: start
+ labelSelector: group = direct-scheduler-throughput
+ measurmentInterval: 1s
+- name: create scheduler throughput pods
+ phases:
+ - namespaceRange:
+ min: 1
+ max: 1
+ replicasPerNamespace: {{$totalSchedulerThroughputPods}}
+ tuningSet: UniformQPS
+ objectBundle:
+ - basename: direct-scheduler-throughput-pod
+ objectTemplatePath: pod-default.yaml
+ templateFillMap:
+ Group: direct-scheduler-throughput
+- name: Waiting for scheduler throughput pods to be created
+ measurements:
+ - Identifier: WaitForDirectSchedulerThroughputPods
+ Method: WaitForRunningPods
+ Params:
+ action: gather
+ timeout: 5m
+ desiredPodCount: {{$totalSchedulerThroughputPods}}
+ labelSelector: group = direct-scheduler-throughput
+- name: Collecting scheduler throughput measurements
+ measurements:
+ - Identifier: DirectSchedulerThroughputPodStartupLatency
+ Method: PodStartupLatency
+ Params:
+ action: gather
+ schedulerName: yunikorn
+ - Identifier: DirectSchedulingThroughput
+ Method: SchedulingThroughput
+ Params:
+ action: gather
+ enableViolations: true
+ threshold: {{$SCHEDULER_THROUGHPUT_THRESHOLD}}
+- name: Delete scheduler throughput pods
+ phases:
+ - namespaceRange:
+ min: 1
+ max: 1
+ replicasPerNamespace: 0
+ tuningSet: default
+ objectBundle:
+ - basename: direct-scheduler-throughput-pod
+ objectTemplatePath: pod-default.yaml
+ templateFillMap:
+ Group: direct-scheduler-throughput
diff --git a/soak/tests/basic-scheduler-throughput/pod-default.yaml
b/soak/tests/basic-scheduler-throughput/pod-default.yaml
new file mode 100644
index 0000000..74d600f
--- /dev/null
+++ b/soak/tests/basic-scheduler-throughput/pod-default.yaml
@@ -0,0 +1,25 @@
+apiVersion: v1
+kind: Pod
+metadata:
+ generateName: pod-churn-
+ labels:
+ group: {{.Group}}
+spec:
+ schedulerName: yunikorn
+ affinity:
+ nodeAffinity:
+ requiredDuringSchedulingIgnoredDuringExecution:
+ nodeSelectorTerms:
+ - matchExpressions:
+ - key: kwok-nodegroup
+ operator: In
+ values:
+ - kind-worker
+ - kind-worker2
+ tolerations:
+ - key: "kwok-provider"
+ operator: "Exists"
+ effect: "NoSchedule"
+ containers:
+ - image: registry.k8s.io/pause:3.9
+ name: pause
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]