This is an automated email from the ASF dual-hosted git repository.
cdmikechen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/submarine.git
The following commit(s) were added to refs/heads/master by this push:
new 83f8aabf SUBMARINE-1324. Fix the experiment pods label selector after
using the new training operator
83f8aabf is described below
commit 83f8aabf59ba62df4d7827f2e6307038bb0a70e0
Author: cdmikechen <[email protected]>
AuthorDate: Thu Oct 6 08:18:11 2022 +0800
SUBMARINE-1324. Fix the experiment pods label selector after using the new
training operator
### What is this PR for?
After updating to the new training operator, the metrics/logs/params of the
experiment can no longer be shown. It turns out that the pod label key has
changed to `job-name`.
### What type of PR is it?
Bug Fix
### Todos
* [x] - Change label to job-name
### What is the Jira issue?
https://issues.apache.org/jira/browse/SUBMARINE-1324
### How should this be tested?
Consider adding tests after refactoring.
### Screenshots (if appropriate)

### Questions:
* Do the license files need updating? No
* Are there breaking changes for older versions? No
* Does this need new documentation? No
Author: cdmikechen <[email protected]>
Author: cdmikechen <[email protected]>
Signed-off-by: cdmikechen <[email protected]>
Closes #995 from cdmikechen/SUBMARINE-1324 and squashes the following
commits:
971f4a66 [cdmikechen] Add test
6bda7200 [cdmikechen] Change label to job-name
---
.../server/submitter/k8s/K8sSubmitter.java | 24 +++-------------------
.../submitter/k8s/model/mljob/MLJobFactory.java | 8 ++++++++
.../submitter/k8s/ExperimentSpecParserTest.java | 7 +++++++
3 files changed, 18 insertions(+), 21 deletions(-)
diff --git
a/submarine-server/server-submitter/submitter-k8s/src/main/java/org/apache/submarine/server/submitter/k8s/K8sSubmitter.java
b/submarine-server/server-submitter/submitter-k8s/src/main/java/org/apache/submarine/server/submitter/k8s/K8sSubmitter.java
index 9be76a9c..301fc38d 100644
---
a/submarine-server/server-submitter/submitter-k8s/src/main/java/org/apache/submarine/server/submitter/k8s/K8sSubmitter.java
+++
b/submarine-server/server-submitter/submitter-k8s/src/main/java/org/apache/submarine/server/submitter/k8s/K8sSubmitter.java
@@ -48,7 +48,6 @@ import org.apache.submarine.server.api.experiment.MlflowInfo;
import org.apache.submarine.server.api.experiment.TensorboardInfo;
import org.apache.submarine.server.api.model.ServeSpec;
import org.apache.submarine.server.api.notebook.Notebook;
-import org.apache.submarine.server.api.spec.ExperimentMeta;
import org.apache.submarine.server.api.spec.ExperimentSpec;
import org.apache.submarine.server.api.spec.NotebookSpec;
import org.apache.submarine.server.submitter.k8s.client.K8sClient;
@@ -77,10 +76,6 @@ public class K8sSubmitter implements Submitter {
private static final Logger LOG =
LoggerFactory.getLogger(K8sSubmitter.class);
- private static final String TF_JOB_SELECTOR_KEY = "tf-job-name=";
- private static final String PYTORCH_JOB_SELECTOR_KEY = "pytorch-job-name=";
- private static final String XGBoost_JOB_SELECTOR_KEY = "xgboost-job-name=";
-
// Add an exception Consumer, handle the problem that delete operation does
not have the resource
public static final Function<ApiException, Object>
API_EXCEPTION_404_CONSUMER = e -> {
if (e.getCode() != 404) {
@@ -261,7 +256,7 @@ public class K8sSubmitter implements Submitter {
experimentLog.setExperimentId(id);
try {
ListOptions listOptions = new ListOptions();
- listOptions.setLabelSelector(getJobLabelSelector(spec));
+ listOptions.setLabelSelector(MLJobFactory.getJobLabelSelector(spec));
final V1PodList podList =
k8sClient.getPodClient().list(getServerNamespace(), listOptions)
.throwsApiException().getObject();
for (V1Pod pod : podList.getItems()) {
@@ -280,7 +275,7 @@ public class K8sSubmitter implements Submitter {
experimentLog.setExperimentId(id);
try {
ListOptions listOptions = new ListOptions();
- listOptions.setLabelSelector(getJobLabelSelector(spec));
+ listOptions.setLabelSelector(MLJobFactory.getJobLabelSelector(spec));
final V1PodList podList =
k8sClient.getPodClient().list(getServerNamespace(), listOptions)
.throwsApiException().getObject();
for (V1Pod pod : podList.getItems()) {
@@ -397,7 +392,7 @@ public class K8sSubmitter implements Submitter {
spec.getMeta().getName(), agentPod.getMetadata().getName()));
dependents.add(agentPod);
- // delete resources
+ // delete resources
return deleteResourcesTransaction(notebookCR,
dependents.toArray(dependents.toArray(new K8sResource[0])));
}
@@ -438,17 +433,4 @@ public class K8sSubmitter implements Submitter {
deleteResourcesTransaction(seldonDeployment, istioVirtualService);
}
- private String getJobLabelSelector(ExperimentSpec experimentSpec) {
- if (experimentSpec.getMeta().getFramework()
-
.equalsIgnoreCase(ExperimentMeta.SupportedMLFramework.TENSORFLOW.getName())) {
- return TF_JOB_SELECTOR_KEY + experimentSpec.getMeta().getExperimentId();
- } else if (experimentSpec.getMeta().getFramework()
-
.equalsIgnoreCase(ExperimentMeta.SupportedMLFramework.XGBOOST.getName())) {
- return XGBoost_JOB_SELECTOR_KEY +
experimentSpec.getMeta().getExperimentId();
- }
- else {
- return PYTORCH_JOB_SELECTOR_KEY +
experimentSpec.getMeta().getExperimentId();
- }
- }
-
}
diff --git
a/submarine-server/server-submitter/submitter-k8s/src/main/java/org/apache/submarine/server/submitter/k8s/model/mljob/MLJobFactory.java
b/submarine-server/server-submitter/submitter-k8s/src/main/java/org/apache/submarine/server/submitter/k8s/model/mljob/MLJobFactory.java
index d766ea03..b591b7ca 100644
---
a/submarine-server/server-submitter/submitter-k8s/src/main/java/org/apache/submarine/server/submitter/k8s/model/mljob/MLJobFactory.java
+++
b/submarine-server/server-submitter/submitter-k8s/src/main/java/org/apache/submarine/server/submitter/k8s/model/mljob/MLJobFactory.java
@@ -52,4 +52,12 @@ public class MLJobFactory {
}
}
+ /**
+ * Get ml job labelSelector.
+ * The new training-operator has unified label.
+ */
+ public static String getJobLabelSelector(ExperimentSpec experimentSpec) {
+ return String.format("job-name=%s",
experimentSpec.getMeta().getExperimentId());
+ }
+
}
diff --git
a/submarine-server/server-submitter/submitter-k8s/src/test/java/org/apache/submarine/server/submitter/k8s/ExperimentSpecParserTest.java
b/submarine-server/server-submitter/submitter-k8s/src/test/java/org/apache/submarine/server/submitter/k8s/ExperimentSpecParserTest.java
index acbf48d5..04467b8c 100644
---
a/submarine-server/server-submitter/submitter-k8s/src/test/java/org/apache/submarine/server/submitter/k8s/ExperimentSpecParserTest.java
+++
b/submarine-server/server-submitter/submitter-k8s/src/test/java/org/apache/submarine/server/submitter/k8s/ExperimentSpecParserTest.java
@@ -84,6 +84,13 @@ public class ExperimentSpecParserTest extends SpecBuilder {
}
}
+ @Test
+ public void testValidLabel() throws IOException, URISyntaxException {
+ ExperimentSpec experimentSpec = (ExperimentSpec)
buildFromJsonFile(ExperimentSpec.class, tfJobReqFile);
+ String label = MLJobFactory.getJobLabelSelector(experimentSpec);
+ Assert.assertEquals("job-name=" +
experimentSpec.getMeta().getExperimentId(), label);
+ }
+
@Test
public void testValidTensorFlowExperiment() throws IOException,
URISyntaxException, InvalidSpecException {
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]