[
https://issues.apache.org/jira/browse/GOBBLIN-2057?focusedWorklogId=918457&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-918457
]
ASF GitHub Bot logged work on GOBBLIN-2057:
-------------------------------------------
Author: ASF GitHub Bot
Created on: 09/May/24 03:02
Start Date: 09/May/24 03:02
Worklog Time Spent: 10m
Work Description: phet commented on code in PR #3938:
URL: https://github.com/apache/gobblin/pull/3938#discussion_r1594897042
##########
gobblin-service/src/main/java/org/apache/gobblin/service/monitoring/DagManagementDagActionStoreChangeMonitor.java:
##########
@@ -63,10 +63,11 @@ protected void handleDagAction(DagActionStore.DagAction
dagAction, boolean isSta
case LAUNCH :
case REEVALUATE :
case KILL :
+ case RESUME:
Review Comment:
nit: let's alphabetize these
##########
gobblin-service/src/main/java/org/apache/gobblin/service/modules/orchestration/task/ResumeDagTask.java:
##########
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.gobblin.service.modules.orchestration.task;
+
+import org.apache.gobblin.service.modules.orchestration.DagActionStore;
+import org.apache.gobblin.service.modules.orchestration.DagTaskVisitor;
+import org.apache.gobblin.service.modules.orchestration.LeaseAttemptStatus;
+
+
+/**
+ * A {@link DagTask} responsible for killing running jobs.
Review Comment:
needs correction: this Javadoc describes killing running jobs, but the class is `ResumeDagTask`
also, I don't believe we put a blank line between javadoc and the class
##########
gobblin-service/src/main/java/org/apache/gobblin/service/modules/orchestration/proc/ResumeDagProc.java:
##########
@@ -0,0 +1,111 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.gobblin.service.modules.orchestration.proc;
+
+import java.io.IOException;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Set;
+
+import com.google.common.collect.Maps;
+
+import lombok.extern.slf4j.Slf4j;
+
+import org.apache.gobblin.metrics.event.TimingEvent;
+import org.apache.gobblin.service.ExecutionStatus;
+import org.apache.gobblin.service.modules.flowgraph.Dag;
+import
org.apache.gobblin.service.modules.orchestration.DagManagementStateStore;
+import org.apache.gobblin.service.modules.orchestration.DagManagerUtils;
+import org.apache.gobblin.service.modules.orchestration.TimingEventUtils;
+import org.apache.gobblin.service.modules.orchestration.task.ResumeDagTask;
+import org.apache.gobblin.service.modules.spec.JobExecutionPlan;
+
+import static org.apache.gobblin.service.ExecutionStatus.CANCELLED;
+import static org.apache.gobblin.service.ExecutionStatus.FAILED;
+import static org.apache.gobblin.service.ExecutionStatus.PENDING_RESUME;
+
+
+/**
+ * An implementation for {@link DagProc} that resumes a dag and submits the
job that failed/killed previously.
+ */
+@Slf4j
+public class ResumeDagProc extends DagProc<Optional<Dag<JobExecutionPlan>>> {
+
+ public ResumeDagProc(ResumeDagTask resumeDagTask) {
+ super(resumeDagTask);
+ }
+
+ @Override
+ protected Optional<Dag<JobExecutionPlan>> initialize(DagManagementStateStore
dagManagementStateStore)
+ throws IOException {
+ return dagManagementStateStore.getFailedDag(getDagId());
+ }
+
+ @Override
+ protected void act(DagManagementStateStore dagManagementStateStore,
Optional<Dag<JobExecutionPlan>> dag)
+ throws IOException {
+ log.info("Request to resume dag {}", getDagId());
+
+ if (!dag.isPresent()) {
+ // todo - add a metric here
+ log.error("Dag " + dagId + " was not found in dag state store");
+ return;
+ }
+
+ long flowResumeTime = System.currentTimeMillis();
+
+ // Set the flow and its failed or cancelled nodes to PENDING_RESUME so
that the flow will be resumed from the point before it failed
+ DagManagerUtils.emitFlowEvent(eventSubmitter, dag.get(),
TimingEvent.FlowTimings.FLOW_PENDING_RESUME);
+
+ for (Dag.DagNode<JobExecutionPlan> node : dag.get().getNodes()) {
+ ExecutionStatus executionStatus = node.getValue().getExecutionStatus();
+ if (executionStatus.equals(FAILED) || executionStatus.equals(CANCELLED))
{
+ node.getValue().setExecutionStatus(PENDING_RESUME);
+ // reset currentAttempts because we do not want to count previous
execution's attempts in deciding whether to retry a job
+ node.getValue().setCurrentAttempts(0);
+ DagManagerUtils.incrementJobGeneration(node);
+ Map<String, String> jobMetadata =
TimingEventUtils.getJobMetadata(Maps.newHashMap(), node.getValue());
+
eventSubmitter.getTimingEvent(TimingEvent.LauncherTimings.JOB_PENDING_RESUME).stop(jobMetadata);
+ }
+ // Set flowStartTime so that flow SLA will be based on current time
instead of original flow
+ node.getValue().setFlowStartTime(flowResumeTime);
+ }
+
+ dagManagementStateStore.checkpointDag(dag.get());
+ dagManagementStateStore.deleteFailedDag(dag.get());
+ resumeDag(dagManagementStateStore, dag.get());
+ }
+
+ private void resumeDag(DagManagementStateStore dagManagementStateStore,
Dag<JobExecutionPlan> dag) {
+ Set<Dag.DagNode<JobExecutionPlan>> nextNodes =
DagManagerUtils.getNext(dag);
+
+ if (nextNodes.size() > 1) {
+ handleMultipleJobs(nextNodes);
+ }
+
+ //Submit jobs from the dag ready for execution.
+ for (Dag.DagNode<JobExecutionPlan> dagNode : nextNodes) {
+ DagProcUtils.submitJobToExecutor(dagManagementStateStore, dagNode,
getDagId());
+ log.info("Submitted job {} for dagId {}",
DagManagerUtils.getJobName(dagNode), getDagId());
+ }
+ }
+
+ private void handleMultipleJobs(Set<Dag.DagNode<JobExecutionPlan>>
nextNodes) {
+ throw new UnsupportedOperationException("More than one start job is not
allowed");
+ }
Review Comment:
after this PR, I believe we'll be ready to implement this (all three cases)
##########
gobblin-service/src/main/java/org/apache/gobblin/service/modules/orchestration/proc/ResumeDagProc.java:
##########
@@ -0,0 +1,111 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.gobblin.service.modules.orchestration.proc;
+
+import java.io.IOException;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Set;
+
+import com.google.common.collect.Maps;
+
+import lombok.extern.slf4j.Slf4j;
+
+import org.apache.gobblin.metrics.event.TimingEvent;
+import org.apache.gobblin.service.ExecutionStatus;
+import org.apache.gobblin.service.modules.flowgraph.Dag;
+import
org.apache.gobblin.service.modules.orchestration.DagManagementStateStore;
+import org.apache.gobblin.service.modules.orchestration.DagManagerUtils;
+import org.apache.gobblin.service.modules.orchestration.TimingEventUtils;
+import org.apache.gobblin.service.modules.orchestration.task.ResumeDagTask;
+import org.apache.gobblin.service.modules.spec.JobExecutionPlan;
+
+import static org.apache.gobblin.service.ExecutionStatus.CANCELLED;
+import static org.apache.gobblin.service.ExecutionStatus.FAILED;
+import static org.apache.gobblin.service.ExecutionStatus.PENDING_RESUME;
+
+
+/**
+ * An implementation for {@link DagProc} that resumes a dag and submits the
job that failed/killed previously.
+ */
+@Slf4j
+public class ResumeDagProc extends DagProc<Optional<Dag<JobExecutionPlan>>> {
+
+ public ResumeDagProc(ResumeDagTask resumeDagTask) {
+ super(resumeDagTask);
+ }
+
+ @Override
+ protected Optional<Dag<JobExecutionPlan>> initialize(DagManagementStateStore
dagManagementStateStore)
+ throws IOException {
+ return dagManagementStateStore.getFailedDag(getDagId());
+ }
+
+ @Override
+ protected void act(DagManagementStateStore dagManagementStateStore,
Optional<Dag<JobExecutionPlan>> dag)
+ throws IOException {
+ log.info("Request to resume dag {}", getDagId());
+
+ if (!dag.isPresent()) {
+ // todo - add a metric here
+ log.error("Dag " + dagId + " was not found in dag state store");
+ return;
+ }
+
+ long flowResumeTime = System.currentTimeMillis();
+
+ // Set the flow and its failed or cancelled nodes to PENDING_RESUME so
that the flow will be resumed from the point before it failed
+ DagManagerUtils.emitFlowEvent(eventSubmitter, dag.get(),
TimingEvent.FlowTimings.FLOW_PENDING_RESUME);
+
+ for (Dag.DagNode<JobExecutionPlan> node : dag.get().getNodes()) {
+ ExecutionStatus executionStatus = node.getValue().getExecutionStatus();
+ if (executionStatus.equals(FAILED) || executionStatus.equals(CANCELLED))
{
+ node.getValue().setExecutionStatus(PENDING_RESUME);
+ // reset currentAttempts because we do not want to count previous
execution's attempts in deciding whether to retry a job
+ node.getValue().setCurrentAttempts(0);
+ DagManagerUtils.incrementJobGeneration(node);
+ Map<String, String> jobMetadata =
TimingEventUtils.getJobMetadata(Maps.newHashMap(), node.getValue());
+
eventSubmitter.getTimingEvent(TimingEvent.LauncherTimings.JOB_PENDING_RESUME).stop(jobMetadata);
+ }
+ // Set flowStartTime so that flow SLA will be based on current time
instead of original flow
+ node.getValue().setFlowStartTime(flowResumeTime);
+ }
+
+ dagManagementStateStore.checkpointDag(dag.get());
+ dagManagementStateStore.deleteFailedDag(dag.get());
Review Comment:
does `checkpointDag` effectively add this to the dag state store, whereas
`deleteFailedDag` removes it from the failed dag state store--basically a swap?
A code comment would likely help maintainers.
##########
gobblin-service/src/main/java/org/apache/gobblin/service/modules/orchestration/proc/ResumeDagProc.java:
##########
@@ -0,0 +1,111 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.gobblin.service.modules.orchestration.proc;
+
+import java.io.IOException;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Set;
+
+import com.google.common.collect.Maps;
+
+import lombok.extern.slf4j.Slf4j;
+
+import org.apache.gobblin.metrics.event.TimingEvent;
+import org.apache.gobblin.service.ExecutionStatus;
+import org.apache.gobblin.service.modules.flowgraph.Dag;
+import
org.apache.gobblin.service.modules.orchestration.DagManagementStateStore;
+import org.apache.gobblin.service.modules.orchestration.DagManagerUtils;
+import org.apache.gobblin.service.modules.orchestration.TimingEventUtils;
+import org.apache.gobblin.service.modules.orchestration.task.ResumeDagTask;
+import org.apache.gobblin.service.modules.spec.JobExecutionPlan;
+
+import static org.apache.gobblin.service.ExecutionStatus.CANCELLED;
+import static org.apache.gobblin.service.ExecutionStatus.FAILED;
+import static org.apache.gobblin.service.ExecutionStatus.PENDING_RESUME;
+
+
+/**
+ * An implementation for {@link DagProc} that resumes a dag and submits the
job that failed/killed previously.
+ */
+@Slf4j
+public class ResumeDagProc extends DagProc<Optional<Dag<JobExecutionPlan>>> {
+
+ public ResumeDagProc(ResumeDagTask resumeDagTask) {
+ super(resumeDagTask);
+ }
+
+ @Override
+ protected Optional<Dag<JobExecutionPlan>> initialize(DagManagementStateStore
dagManagementStateStore)
+ throws IOException {
+ return dagManagementStateStore.getFailedDag(getDagId());
+ }
+
+ @Override
+ protected void act(DagManagementStateStore dagManagementStateStore,
Optional<Dag<JobExecutionPlan>> dag)
+ throws IOException {
+ log.info("Request to resume dag {}", getDagId());
+
+ if (!dag.isPresent()) {
+ // todo - add a metric here
+ log.error("Dag " + dagId + " was not found in dag state store");
+ return;
+ }
+
+ long flowResumeTime = System.currentTimeMillis();
+
+ // Set the flow and its failed or cancelled nodes to PENDING_RESUME so
that the flow will be resumed from the point before it failed
+ DagManagerUtils.emitFlowEvent(eventSubmitter, dag.get(),
TimingEvent.FlowTimings.FLOW_PENDING_RESUME);
+
+ for (Dag.DagNode<JobExecutionPlan> node : dag.get().getNodes()) {
+ ExecutionStatus executionStatus = node.getValue().getExecutionStatus();
+ if (executionStatus.equals(FAILED) || executionStatus.equals(CANCELLED))
{
+ node.getValue().setExecutionStatus(PENDING_RESUME);
+ // reset currentAttempts because we do not want to count previous
execution's attempts in deciding whether to retry a job
+ node.getValue().setCurrentAttempts(0);
+ DagManagerUtils.incrementJobGeneration(node);
+ Map<String, String> jobMetadata =
TimingEventUtils.getJobMetadata(Maps.newHashMap(), node.getValue());
+
eventSubmitter.getTimingEvent(TimingEvent.LauncherTimings.JOB_PENDING_RESUME).stop(jobMetadata);
+ }
+ // Set flowStartTime so that flow SLA will be based on current time
instead of original flow
Review Comment:
nit: flow *deadlines* (rather than "SLA")
and to clarify, is this applicable to both job start deadline and flow
completion deadline?
##########
gobblin-service/src/main/java/org/apache/gobblin/service/modules/orchestration/proc/ResumeDagProc.java:
##########
@@ -0,0 +1,111 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.gobblin.service.modules.orchestration.proc;
+
+import java.io.IOException;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Set;
+
+import com.google.common.collect.Maps;
+
+import lombok.extern.slf4j.Slf4j;
+
+import org.apache.gobblin.metrics.event.TimingEvent;
+import org.apache.gobblin.service.ExecutionStatus;
+import org.apache.gobblin.service.modules.flowgraph.Dag;
+import
org.apache.gobblin.service.modules.orchestration.DagManagementStateStore;
+import org.apache.gobblin.service.modules.orchestration.DagManagerUtils;
+import org.apache.gobblin.service.modules.orchestration.TimingEventUtils;
+import org.apache.gobblin.service.modules.orchestration.task.ResumeDagTask;
+import org.apache.gobblin.service.modules.spec.JobExecutionPlan;
+
+import static org.apache.gobblin.service.ExecutionStatus.CANCELLED;
+import static org.apache.gobblin.service.ExecutionStatus.FAILED;
+import static org.apache.gobblin.service.ExecutionStatus.PENDING_RESUME;
+
+
+/**
+ * An implementation for {@link DagProc} that resumes a dag and submits the
job that failed/killed previously.
Review Comment:
nit: "... submits the job that previously failed or was killed."
##########
gobblin-service/src/main/java/org/apache/gobblin/service/modules/orchestration/proc/ResumeDagProc.java:
##########
@@ -0,0 +1,111 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.gobblin.service.modules.orchestration.proc;
+
+import java.io.IOException;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Set;
+
+import com.google.common.collect.Maps;
+
+import lombok.extern.slf4j.Slf4j;
+
+import org.apache.gobblin.metrics.event.TimingEvent;
+import org.apache.gobblin.service.ExecutionStatus;
+import org.apache.gobblin.service.modules.flowgraph.Dag;
+import
org.apache.gobblin.service.modules.orchestration.DagManagementStateStore;
+import org.apache.gobblin.service.modules.orchestration.DagManagerUtils;
+import org.apache.gobblin.service.modules.orchestration.TimingEventUtils;
+import org.apache.gobblin.service.modules.orchestration.task.ResumeDagTask;
+import org.apache.gobblin.service.modules.spec.JobExecutionPlan;
+
+import static org.apache.gobblin.service.ExecutionStatus.CANCELLED;
+import static org.apache.gobblin.service.ExecutionStatus.FAILED;
+import static org.apache.gobblin.service.ExecutionStatus.PENDING_RESUME;
+
+
+/**
+ * An implementation for {@link DagProc} that resumes a dag and submits the
job that failed/killed previously.
+ */
+@Slf4j
+public class ResumeDagProc extends DagProc<Optional<Dag<JobExecutionPlan>>> {
+
+ public ResumeDagProc(ResumeDagTask resumeDagTask) {
+ super(resumeDagTask);
+ }
+
+ @Override
+ protected Optional<Dag<JobExecutionPlan>> initialize(DagManagementStateStore
dagManagementStateStore)
+ throws IOException {
+ return dagManagementStateStore.getFailedDag(getDagId());
+ }
+
+ @Override
+ protected void act(DagManagementStateStore dagManagementStateStore,
Optional<Dag<JobExecutionPlan>> dag)
+ throws IOException {
+ log.info("Request to resume dag {}", getDagId());
+
+ if (!dag.isPresent()) {
+ // todo - add a metric here
+ log.error("Dag " + dagId + " was not found in dag state store");
+ return;
+ }
+
+ long flowResumeTime = System.currentTimeMillis();
+
+ // Set the flow and its failed or cancelled nodes to PENDING_RESUME so
that the flow will be resumed from the point before it failed
+ DagManagerUtils.emitFlowEvent(eventSubmitter, dag.get(),
TimingEvent.FlowTimings.FLOW_PENDING_RESUME);
+
+ for (Dag.DagNode<JobExecutionPlan> node : dag.get().getNodes()) {
+ ExecutionStatus executionStatus = node.getValue().getExecutionStatus();
+ if (executionStatus.equals(FAILED) || executionStatus.equals(CANCELLED))
{
+ node.getValue().setExecutionStatus(PENDING_RESUME);
+ // reset currentAttempts because we do not want to count previous
execution's attempts in deciding whether to retry a job
+ node.getValue().setCurrentAttempts(0);
+ DagManagerUtils.incrementJobGeneration(node);
Review Comment:
would be good to contextualize the currentAttempts "reset" with the
"generation" increment
##########
gobblin-service/src/test/java/org/apache/gobblin/service/modules/orchestration/proc/ResumeDagProcTest.java:
##########
@@ -0,0 +1,123 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.gobblin.service.modules.orchestration.proc;
+
+import java.io.IOException;
+import java.net.URISyntaxException;
+import java.util.List;
+import java.util.Optional;
+import java.util.concurrent.ExecutionException;
+import java.util.stream.Collectors;
+
+import org.mockito.Mockito;
+import org.testng.Assert;
+import org.testng.annotations.AfterClass;
+import org.testng.annotations.BeforeClass;
+import org.testng.annotations.Test;
+
+import com.typesafe.config.ConfigFactory;
+import com.typesafe.config.ConfigValueFactory;
+
+import org.apache.gobblin.configuration.ConfigurationKeys;
+import org.apache.gobblin.metastore.testing.ITestMetastoreDatabase;
+import org.apache.gobblin.metastore.testing.TestMetastoreDatabaseFactory;
+import org.apache.gobblin.runtime.api.FlowSpec;
+import org.apache.gobblin.runtime.api.Spec;
+import org.apache.gobblin.runtime.api.SpecProducer;
+import org.apache.gobblin.service.ExecutionStatus;
+import org.apache.gobblin.service.modules.flowgraph.Dag;
+import org.apache.gobblin.service.modules.orchestration.DagActionStore;
+import org.apache.gobblin.service.modules.orchestration.DagManager;
+import org.apache.gobblin.service.modules.orchestration.DagManagerTest;
+import org.apache.gobblin.service.modules.orchestration.DagManagerUtils;
+import
org.apache.gobblin.service.modules.orchestration.MostlyMySqlDagManagementStateStore;
+import
org.apache.gobblin.service.modules.orchestration.MostlyMySqlDagManagementStateStoreTest;
+import org.apache.gobblin.service.modules.orchestration.MysqlDagActionStore;
+import org.apache.gobblin.service.modules.orchestration.task.ResumeDagTask;
+import org.apache.gobblin.service.modules.spec.JobExecutionPlan;
+
+import static org.mockito.ArgumentMatchers.any;
+import static org.mockito.Mockito.doNothing;
+import static org.mockito.Mockito.doReturn;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.spy;
+
+
+public class ResumeDagProcTest {
+ private MostlyMySqlDagManagementStateStore dagManagementStateStore;
+ private ITestMetastoreDatabase testDb;
+
+ @BeforeClass
+ public void setUp() throws Exception {
+ testDb = TestMetastoreDatabaseFactory.get();
+ this.dagManagementStateStore =
spy(MostlyMySqlDagManagementStateStoreTest.getDummyDMSS(testDb));
+
doReturn(FlowSpec.builder().build()).when(this.dagManagementStateStore).getFlowSpec(any());
+ doNothing().when(this.dagManagementStateStore).tryAcquireQuota(any());
+ doNothing().when(this.dagManagementStateStore).addDagNodeState(any(),
any());
+ }
+
+ @AfterClass(alwaysRun = true)
+ public void tearDown() throws Exception {
+ if (testDb != null) {
+ testDb.close();
+ }
+ }
+
+ /*
+ This test creates a failed dag and launches a resume dag proc for it. It
then verifies that the next jobs are set to run.
+ */
+ @Test
+ public void resumeDag() throws IOException, URISyntaxException {
+ long flowExecutionId = System.currentTimeMillis();
+ Dag<JobExecutionPlan> dag = DagManagerTest.buildDag("1", flowExecutionId,
DagManager.FailureOption.FINISH_ALL_POSSIBLE.name(),
+ 5, "user5",
ConfigFactory.empty().withValue(ConfigurationKeys.FLOW_GROUP_KEY,
ConfigValueFactory.fromAnyRef("fg")));
+ // simulate a failed dag in store
+
dag.getNodes().get(0).getValue().setExecutionStatus(ExecutionStatus.COMPLETE);
+
dag.getNodes().get(1).getValue().setExecutionStatus(ExecutionStatus.FAILED);
+
dag.getNodes().get(2).getValue().setExecutionStatus(ExecutionStatus.COMPLETE);
+
dag.getNodes().get(4).getValue().setExecutionStatus(ExecutionStatus.COMPLETE);
+
doReturn(Optional.of(dag)).when(dagManagementStateStore).getFailedDag(any());
+
+ ResumeDagProc resumeDagProc = new ResumeDagProc(new ResumeDagTask(new
DagActionStore.DagAction("fg", "flow1",
+ String.valueOf(flowExecutionId),
MysqlDagActionStore.NO_JOB_NAME_DEFAULT, DagActionStore.DagActionType.RESUME),
+ null, mock(DagActionStore.class)));
+ resumeDagProc.process(this.dagManagementStateStore);
+
+ List<SpecProducer<Spec>> specProducers = dag.getNodes().stream().map(n -> {
+ try {
+ return DagManagerUtils.getSpecProducer(n);
+ } catch (ExecutionException | InterruptedException e) {
+ throw new RuntimeException(e);
+ }
+ }).collect(Collectors.toList());
+ int expectedNumOfResumedJobs = 1; // = number of resumed nodes
+
+ long resumedJobCount = specProducers.stream()
+ .mapToLong(p -> Mockito.mockingDetails(p)
+ .getInvocations()
+ .stream()
+ .filter(a -> a.getMethod().getName().equals("addSpec"))
+ .count())
+ .sum();
+ long addedDagNodeStates =
Mockito.mockingDetails(this.dagManagementStateStore).getInvocations().stream()
+ .filter(a ->
a.getMethod().getName().equals("addDagNodeState")).count();
+
+ Assert.assertEquals(resumedJobCount, expectedNumOfResumedJobs);
+ Assert.assertEquals(addedDagNodeStates, expectedNumOfResumedJobs);
Review Comment:
would the more common formulation not be equivalent?
```
Mockito.verify(this.dagManagementStateStore,
Mockito.times(expectedNumOfResumedJobs)).addDagNodeState(any(), any());
```
##########
gobblin-service/src/test/java/org/apache/gobblin/service/modules/orchestration/proc/ResumeDagProcTest.java:
##########
@@ -0,0 +1,123 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.gobblin.service.modules.orchestration.proc;
+
+import java.io.IOException;
+import java.net.URISyntaxException;
+import java.util.List;
+import java.util.Optional;
+import java.util.concurrent.ExecutionException;
+import java.util.stream.Collectors;
+
+import org.mockito.Mockito;
+import org.testng.Assert;
+import org.testng.annotations.AfterClass;
+import org.testng.annotations.BeforeClass;
+import org.testng.annotations.Test;
+
+import com.typesafe.config.ConfigFactory;
+import com.typesafe.config.ConfigValueFactory;
+
+import org.apache.gobblin.configuration.ConfigurationKeys;
+import org.apache.gobblin.metastore.testing.ITestMetastoreDatabase;
+import org.apache.gobblin.metastore.testing.TestMetastoreDatabaseFactory;
+import org.apache.gobblin.runtime.api.FlowSpec;
+import org.apache.gobblin.runtime.api.Spec;
+import org.apache.gobblin.runtime.api.SpecProducer;
+import org.apache.gobblin.service.ExecutionStatus;
+import org.apache.gobblin.service.modules.flowgraph.Dag;
+import org.apache.gobblin.service.modules.orchestration.DagActionStore;
+import org.apache.gobblin.service.modules.orchestration.DagManager;
+import org.apache.gobblin.service.modules.orchestration.DagManagerTest;
+import org.apache.gobblin.service.modules.orchestration.DagManagerUtils;
+import
org.apache.gobblin.service.modules.orchestration.MostlyMySqlDagManagementStateStore;
+import
org.apache.gobblin.service.modules.orchestration.MostlyMySqlDagManagementStateStoreTest;
+import org.apache.gobblin.service.modules.orchestration.MysqlDagActionStore;
+import org.apache.gobblin.service.modules.orchestration.task.ResumeDagTask;
+import org.apache.gobblin.service.modules.spec.JobExecutionPlan;
+
+import static org.mockito.ArgumentMatchers.any;
+import static org.mockito.Mockito.doNothing;
+import static org.mockito.Mockito.doReturn;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.spy;
+
+
+public class ResumeDagProcTest {
+ private MostlyMySqlDagManagementStateStore dagManagementStateStore;
+ private ITestMetastoreDatabase testDb;
+
+ @BeforeClass
+ public void setUp() throws Exception {
+ testDb = TestMetastoreDatabaseFactory.get();
+ this.dagManagementStateStore =
spy(MostlyMySqlDagManagementStateStoreTest.getDummyDMSS(testDb));
+
doReturn(FlowSpec.builder().build()).when(this.dagManagementStateStore).getFlowSpec(any());
+ doNothing().when(this.dagManagementStateStore).tryAcquireQuota(any());
+ doNothing().when(this.dagManagementStateStore).addDagNodeState(any(),
any());
+ }
+
+ @AfterClass(alwaysRun = true)
+ public void tearDown() throws Exception {
+ if (testDb != null) {
+ testDb.close();
+ }
+ }
+
+ /*
+ This test creates a failed dag and launches a resume dag proc for it. It
then verifies that the next jobs are set to run.
+ */
+ @Test
+ public void resumeDag() throws IOException, URISyntaxException {
+ long flowExecutionId = System.currentTimeMillis();
+ Dag<JobExecutionPlan> dag = DagManagerTest.buildDag("1", flowExecutionId,
DagManager.FailureOption.FINISH_ALL_POSSIBLE.name(),
+ 5, "user5",
ConfigFactory.empty().withValue(ConfigurationKeys.FLOW_GROUP_KEY,
ConfigValueFactory.fromAnyRef("fg")));
+ // simulate a failed dag in store
+
dag.getNodes().get(0).getValue().setExecutionStatus(ExecutionStatus.COMPLETE);
+
dag.getNodes().get(1).getValue().setExecutionStatus(ExecutionStatus.FAILED);
+
dag.getNodes().get(2).getValue().setExecutionStatus(ExecutionStatus.COMPLETE);
+
dag.getNodes().get(4).getValue().setExecutionStatus(ExecutionStatus.COMPLETE);
+
doReturn(Optional.of(dag)).when(dagManagementStateStore).getFailedDag(any());
+
+ ResumeDagProc resumeDagProc = new ResumeDagProc(new ResumeDagTask(new
DagActionStore.DagAction("fg", "flow1",
Review Comment:
let's give magic strings a `static final` name
...and was "flow1" shown above as "1" (on line 87)?
Issue Time Tracking
-------------------
Worklog Id: (was: 918457)
Time Spent: 0.5h (was: 20m)
> create resume dag proc
> ----------------------
>
> Key: GOBBLIN-2057
> URL: https://issues.apache.org/jira/browse/GOBBLIN-2057
> Project: Apache Gobblin
> Issue Type: Improvement
> Reporter: Arjun Singh Bora
> Priority: Major
> Time Spent: 0.5h
> Remaining Estimate: 0h
>
--
This message was sent by Atlassian Jira
(v8.20.10#820010)