[ 
https://issues.apache.org/jira/browse/GOBBLIN-2022?focusedWorklogId=911432&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-911432
 ]

ASF GitHub Bot logged work on GOBBLIN-2022:
-------------------------------------------

                Author: ASF GitHub Bot
            Created on: 26/Mar/24 00:17
            Start Date: 26/Mar/24 00:17
    Worklog Time Spent: 10m 
      Work Description: arjun4084346 commented on code in PR #3896:
URL: https://github.com/apache/gobblin/pull/3896#discussion_r1538392271


##########
gobblin-service/src/main/java/org/apache/gobblin/service/modules/orchestration/proc/ReevaluateDagProc.java:
##########
@@ -0,0 +1,199 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.gobblin.service.modules.orchestration.proc;
+
+import java.io.IOException;
+import java.util.Optional;
+import java.util.Set;
+
+import com.codahale.metrics.Timer;
+
+import lombok.extern.slf4j.Slf4j;
+
+import org.apache.gobblin.metrics.ServiceMetricNames;
+import org.apache.gobblin.metrics.event.TimingEvent;
+import org.apache.gobblin.runtime.api.DagActionStore;
+import org.apache.gobblin.service.ExecutionStatus;
+import org.apache.gobblin.service.modules.flowgraph.Dag;
+import org.apache.gobblin.service.modules.flowgraph.DagNodeId;
+import 
org.apache.gobblin.service.modules.orchestration.DagManagementStateStore;
+import org.apache.gobblin.service.modules.orchestration.DagManagerUtils;
+import org.apache.gobblin.service.modules.orchestration.task.ReevaluateDagTask;
+import org.apache.gobblin.service.modules.spec.JobExecutionPlan;
+import org.apache.gobblin.service.monitoring.FlowStatusGenerator;
+import org.apache.gobblin.service.monitoring.JobStatus;
+import org.apache.gobblin.service.monitoring.JobStatusRetriever;
+
+
+/**
+ * An implementation for {@link DagProc} that launches a new job if there 
exists a job whose pre-requisite jobs are
+ * completed successfully. If there are no more jobs to run and no job is 
running for the Dag, it cleans up the Dag.
+ */
+@Slf4j
+public class ReevaluateDagProc extends 
DagProc<Optional<Dag.DagNode<JobExecutionPlan>>, Void> {
+  private final JobStatusRetriever jobStatusRetriever;
+  private final Timer jobStatusPolledTimer;
+  private final DagNodeId dagNodeId;
+  private JobStatus jobStatus;
+
+  public ReevaluateDagProc(ReevaluateDagTask reEvaluateDagTask, 
JobStatusRetriever jobStatusRetriever) {
+    super(reEvaluateDagTask);
+    this.jobStatusRetriever = jobStatusRetriever;
+    this.jobStatusPolledTimer = 
metricContext.timer(ServiceMetricNames.JOB_STATUS_POLLED_TIMER);
+    this.dagNodeId = getDagNodeId();
+  }
+
+  @Override
+  protected Optional<Dag.DagNode<JobExecutionPlan>> 
initialize(DagManagementStateStore dagManagementStateStore)
+      throws IOException {
+    Optional<Dag.DagNode<JobExecutionPlan>> dagNode = 
dagManagementStateStore.getDagNode(this.dagNodeId);
+    if (!dagNode.isPresent()) {
+      log.error("DagNode not found for a Reevaluate DagAction with dag node id 
{}", this.dagNodeId);
+      return Optional.empty();
+    }
+    this.jobStatus = DagManagerUtils.pollJobStatus(dagNode.get(), 
this.jobStatusRetriever, this.jobStatusPolledTimer).get();
+    ExecutionStatus executionStatus = 
ExecutionStatus.valueOf(jobStatus.getEventName());
+    if 
(!FlowStatusGenerator.FINISHED_STATUSES.contains(executionStatus.name())) {
+      log.warn("Job status for dagNode {} is {}. Expected Statuses are {}", 
dagNodeId, executionStatus, FlowStatusGenerator.FINISHED_STATUSES);
+      return Optional.empty();
+    }
+    setStatus(dagManagementStateStore, dagNode.get(), executionStatus);
+    return dagNode;
+  }
+
+  @Override
+  protected Void act(DagManagementStateStore dagManagementStateStore, 
Optional<Dag.DagNode<JobExecutionPlan>> dagNode)
+      throws IOException {
+    if (!dagNode.isPresent()) {
+      return null;
+    }
+
+    ExecutionStatus executionStatus = 
dagNode.get().getValue().getExecutionStatus();
+    onJobFinish(dagManagementStateStore, dagNode.get(), executionStatus);
+    dagManagementStateStore.deleteDagNodeState(getDagId(), dagNode.get());
+
+    Dag<JobExecutionPlan> dag = 
dagManagementStateStore.getDag(getDagId()).get();
+
+    if (this.jobStatus.isShouldRetry()) {
+      log.info("Retrying job: {}, current attempts: {}, max attempts: {}",
+          DagManagerUtils.getFullyQualifiedJobName(dagNode.get()),
+          jobStatus.getCurrentAttempts(), jobStatus.getMaxAttempts());
+      dag.setFlowEvent(null);
+      DagProcUtils.submitJobToExecutor(dagManagementStateStore, dagNode.get(), 
getDagId());
+    }
+
+    if (!DagProcUtils.hasRunningJobs(getDagId(), dagManagementStateStore)) {
+      if (dag.getFlowEvent() == null) {
+        // If the dag flow event is not set, then it is successful
+        dag.setFlowEvent(TimingEvent.FlowTimings.FLOW_SUCCEEDED);
+        // send an event before cleaning up dag
+        DagManagerUtils.emitFlowEvent(eventSubmitter, dag, dag.getFlowEvent());
+        // todo - verify if work from PR#3641 is required
+        dagManagementStateStore.deleteDag(getDagId());
+      } else {
+        DagManagerUtils.emitFlowEvent(eventSubmitter, dag, dag.getFlowEvent());
+        dagManagementStateStore.markDagFailed(dag);
+      }
+    }
+
+    return null;
+  }
+
+  /**
+   * Sets status of a dag node inside the given Dag.
+   * todo - DMSS should support this functionality like an atomic get-and-set 
operation.
+   */
+  private void setStatus(DagManagementStateStore dagManagementStateStore,
+      Dag.DagNode<JobExecutionPlan> dagNode, ExecutionStatus executionStatus) 
throws IOException {
+    Dag<JobExecutionPlan> dag = 
dagManagementStateStore.getDag(getDagId()).get();
+    DagNodeId dagNodeId = dagNode.getValue().getId();
+    for (Dag.DagNode<JobExecutionPlan> node : dag.getNodes()) {
+      if (node.getValue().getId().equals(dagNodeId)) {
+        node.getValue().setExecutionStatus(executionStatus);
+        dagManagementStateStore.checkpointDag(dag);
+        return;
+      }
+    }
+    log.error("DagNode with id {} not found in Dag {}", dagNodeId, getDagId());
+  }
+
+  /**
+   * Method that defines the actions to be performed when a job finishes 
either successfully or with failure.
+   * This method updates the state of the dag and performs clean up actions as 
necessary.
+   */
+  private void onJobFinish(DagManagementStateStore dagManagementStateStore,
+      Dag.DagNode<JobExecutionPlan> dagNode, ExecutionStatus executionStatus)
+      throws IOException {
+    String jobName = DagManagerUtils.getFullyQualifiedJobName(dagNode);
+    log.info("Job {} of Dag {} has finished with status {}", jobName, 
getDagId(), executionStatus.name());
+    // Only decrement counters and quota for jobs that actually ran on the 
executor, not from a GaaS side failure/skip event
+    if (dagManagementStateStore.releaseQuota(dagNode)) {
+      
dagManagementStateStore.getDagManagerMetrics().decrementRunningJobMetrics(dagNode);
+    }
+
+    Dag<JobExecutionPlan> dag = 
dagManagementStateStore.getDag(getDagId()).get();
+
+    switch (executionStatus) {
+      case FAILED:
+        dag.setMessage("Flow failed because job " + jobName + " failed");
+        dag.setFlowEvent(TimingEvent.FlowTimings.FLOW_FAILED);
+        
dagManagementStateStore.getDagManagerMetrics().incrementExecutorFailed(dagNode);
+        break;
+      case CANCELLED:
+        dag.setFlowEvent(TimingEvent.FlowTimings.FLOW_CANCELLED);
+        break;
+      case COMPLETE:
+        
dagManagementStateStore.getDagManagerMetrics().incrementExecutorSuccess(dagNode);

Review Comment:
   a successful dag is recognized by empty flowEvent field.
   we can change this behavior, but for now it is done like this everywhere





Issue Time Tracking
-------------------

    Worklog Id:     (was: 911432)
    Time Spent: 1h 50m  (was: 1h 40m)

> create dag proc for taking actions on job completion
> ----------------------------------------------------
>
>                 Key: GOBBLIN-2022
>                 URL: https://issues.apache.org/jira/browse/GOBBLIN-2022
>             Project: Apache Gobblin
>          Issue Type: Task
>            Reporter: Arjun Singh Bora
>            Priority: Major
>          Time Spent: 1h 50m
>  Remaining Estimate: 0h
>




--
This message was sent by Atlassian Jira
(v8.20.10#820010)

Reply via email to