This is an automated email from the ASF dual-hosted git repository. cdmikechen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/submarine.git
The following commit(s) were added to refs/heads/master by this push: new 83f8aabf SUBMARINE-1324. Fix the experiment pods label selector after using the new training operator 83f8aabf is described below commit 83f8aabf59ba62df4d7827f2e6307038bb0a70e0 Author: cdmikechen <cdmikec...@hotmail.com> AuthorDate: Thu Oct 6 08:18:11 2022 +0800 SUBMARINE-1324. Fix the experiment pods label selector after using the new training operator ### What is this PR for? After updating to the new training operator, the metrics/logs/params of an experiment could no longer be shown, because the pod label had changed to `job-name`. ### What type of PR is it? Bug Fix ### Todos * [x] - Change label to job-name ### What is the Jira issue? https://issues.apache.org/jira/browse/SUBMARINE-1324 ### How should this be tested? Consider adding after refactoring. ### Screenshots (if appropriate) ![image](https://user-images.githubusercontent.com/12069428/189625495-7e640088-b6b8-4e04-85b3-3659f923256b.png) ### Questions: * Do the license files need updating? No * Are there breaking changes for older versions? No * Does this need new documentation? 
No Author: cdmikechen <cdmikec...@hotmail.com> Author: cdmikechen <cdmikec...@apache.org> Signed-off-by: cdmikechen <cdmikec...@apache.org> Closes #995 from cdmikechen/SUBMARINE-1324 and squashes the following commits: 971f4a66 [cdmikechen] Add test 6bda7200 [cdmikechen] Change label to job-name --- .../server/submitter/k8s/K8sSubmitter.java | 24 +++------------------- .../submitter/k8s/model/mljob/MLJobFactory.java | 8 ++++++++ .../submitter/k8s/ExperimentSpecParserTest.java | 7 +++++++ 3 files changed, 18 insertions(+), 21 deletions(-) diff --git a/submarine-server/server-submitter/submitter-k8s/src/main/java/org/apache/submarine/server/submitter/k8s/K8sSubmitter.java b/submarine-server/server-submitter/submitter-k8s/src/main/java/org/apache/submarine/server/submitter/k8s/K8sSubmitter.java index 9be76a9c..301fc38d 100644 --- a/submarine-server/server-submitter/submitter-k8s/src/main/java/org/apache/submarine/server/submitter/k8s/K8sSubmitter.java +++ b/submarine-server/server-submitter/submitter-k8s/src/main/java/org/apache/submarine/server/submitter/k8s/K8sSubmitter.java @@ -48,7 +48,6 @@ import org.apache.submarine.server.api.experiment.MlflowInfo; import org.apache.submarine.server.api.experiment.TensorboardInfo; import org.apache.submarine.server.api.model.ServeSpec; import org.apache.submarine.server.api.notebook.Notebook; -import org.apache.submarine.server.api.spec.ExperimentMeta; import org.apache.submarine.server.api.spec.ExperimentSpec; import org.apache.submarine.server.api.spec.NotebookSpec; import org.apache.submarine.server.submitter.k8s.client.K8sClient; @@ -77,10 +76,6 @@ public class K8sSubmitter implements Submitter { private static final Logger LOG = LoggerFactory.getLogger(K8sSubmitter.class); - private static final String TF_JOB_SELECTOR_KEY = "tf-job-name="; - private static final String PYTORCH_JOB_SELECTOR_KEY = "pytorch-job-name="; - private static final String XGBoost_JOB_SELECTOR_KEY = "xgboost-job-name="; - // Add an exception 
Consumer, handle the problem that delete operation does not have the resource public static final Function<ApiException, Object> API_EXCEPTION_404_CONSUMER = e -> { if (e.getCode() != 404) { @@ -261,7 +256,7 @@ public class K8sSubmitter implements Submitter { experimentLog.setExperimentId(id); try { ListOptions listOptions = new ListOptions(); - listOptions.setLabelSelector(getJobLabelSelector(spec)); + listOptions.setLabelSelector(MLJobFactory.getJobLabelSelector(spec)); final V1PodList podList = k8sClient.getPodClient().list(getServerNamespace(), listOptions) .throwsApiException().getObject(); for (V1Pod pod : podList.getItems()) { @@ -280,7 +275,7 @@ public class K8sSubmitter implements Submitter { experimentLog.setExperimentId(id); try { ListOptions listOptions = new ListOptions(); - listOptions.setLabelSelector(getJobLabelSelector(spec)); + listOptions.setLabelSelector(MLJobFactory.getJobLabelSelector(spec)); final V1PodList podList = k8sClient.getPodClient().list(getServerNamespace(), listOptions) .throwsApiException().getObject(); for (V1Pod pod : podList.getItems()) { @@ -397,7 +392,7 @@ public class K8sSubmitter implements Submitter { spec.getMeta().getName(), agentPod.getMetadata().getName())); dependents.add(agentPod); - // delete resources + // delete resources return deleteResourcesTransaction(notebookCR, dependents.toArray(dependents.toArray(new K8sResource[0]))); } @@ -438,17 +433,4 @@ public class K8sSubmitter implements Submitter { deleteResourcesTransaction(seldonDeployment, istioVirtualService); } - private String getJobLabelSelector(ExperimentSpec experimentSpec) { - if (experimentSpec.getMeta().getFramework() - .equalsIgnoreCase(ExperimentMeta.SupportedMLFramework.TENSORFLOW.getName())) { - return TF_JOB_SELECTOR_KEY + experimentSpec.getMeta().getExperimentId(); - } else if (experimentSpec.getMeta().getFramework() - .equalsIgnoreCase(ExperimentMeta.SupportedMLFramework.XGBOOST.getName())) { - return XGBoost_JOB_SELECTOR_KEY + 
experimentSpec.getMeta().getExperimentId(); - } - else { - return PYTORCH_JOB_SELECTOR_KEY + experimentSpec.getMeta().getExperimentId(); - } - } - } diff --git a/submarine-server/server-submitter/submitter-k8s/src/main/java/org/apache/submarine/server/submitter/k8s/model/mljob/MLJobFactory.java b/submarine-server/server-submitter/submitter-k8s/src/main/java/org/apache/submarine/server/submitter/k8s/model/mljob/MLJobFactory.java index d766ea03..b591b7ca 100644 --- a/submarine-server/server-submitter/submitter-k8s/src/main/java/org/apache/submarine/server/submitter/k8s/model/mljob/MLJobFactory.java +++ b/submarine-server/server-submitter/submitter-k8s/src/main/java/org/apache/submarine/server/submitter/k8s/model/mljob/MLJobFactory.java @@ -52,4 +52,12 @@ public class MLJobFactory { } } + /** + * Get ml job labelSelector. + * The new training-operator has unified label. + */ + public static String getJobLabelSelector(ExperimentSpec experimentSpec) { + return String.format("job-name=%s", experimentSpec.getMeta().getExperimentId()); + } + } diff --git a/submarine-server/server-submitter/submitter-k8s/src/test/java/org/apache/submarine/server/submitter/k8s/ExperimentSpecParserTest.java b/submarine-server/server-submitter/submitter-k8s/src/test/java/org/apache/submarine/server/submitter/k8s/ExperimentSpecParserTest.java index acbf48d5..04467b8c 100644 --- a/submarine-server/server-submitter/submitter-k8s/src/test/java/org/apache/submarine/server/submitter/k8s/ExperimentSpecParserTest.java +++ b/submarine-server/server-submitter/submitter-k8s/src/test/java/org/apache/submarine/server/submitter/k8s/ExperimentSpecParserTest.java @@ -84,6 +84,13 @@ public class ExperimentSpecParserTest extends SpecBuilder { } } + @Test + public void testValidLabel() throws IOException, URISyntaxException { + ExperimentSpec experimentSpec = (ExperimentSpec) buildFromJsonFile(ExperimentSpec.class, tfJobReqFile); + String label = MLJobFactory.getJobLabelSelector(experimentSpec); + 
Assert.assertEquals("job-name=" + experimentSpec.getMeta().getExperimentId(), label); + } + @Test public void testValidTensorFlowExperiment() throws IOException, URISyntaxException, InvalidSpecException { --------------------------------------------------------------------- To unsubscribe, e-mail: dev-unsubscr...@submarine.apache.org For additional commands, e-mail: dev-h...@submarine.apache.org