chaimt commented on a change in pull request #4633: AIRFLOW-3791: Dataflow -
Support check status if pipeline spans on multiple jobs
URL: https://github.com/apache/airflow/pull/4633#discussion_r303920379
##########
File path: airflow/contrib/hooks/gcp_dataflow_hook.py
##########
@@ -47,58 +47,131 @@ class DataflowJobStatus:
JOB_STATE_FAILED = "JOB_STATE_FAILED"
JOB_STATE_CANCELLED = "JOB_STATE_CANCELLED"
JOB_STATE_PENDING = "JOB_STATE_PENDING"
+ FAILED_END_STATES = {JOB_STATE_FAILED, JOB_STATE_CANCELLED}
+ SUCCEEDED_END_STATES = {JOB_STATE_DONE}
+ END_STATES = SUCCEEDED_END_STATES | FAILED_END_STATES
class _DataflowJob(LoggingMixin):
def __init__(self, dataflow, project_number, name, location, poll_sleep=10,
- job_id=None, num_retries=None):
+ job_id=None, num_retries=None, multiple_jobs=None):
self._dataflow = dataflow
self._project_number = project_number
self._job_name = name
self._job_location = location
+ self._multiple_jobs = multiple_jobs
self._job_id = job_id
self._num_retries = num_retries
- self._job = self._get_job()
+ if self._num_retries is None:
+ self._num_retries = 0
self._poll_sleep = poll_sleep
+ self._jobs = self._get_jobs()
+
+ def is_job_running(self):
+ """
+        Helper method to check if the job is still running in Dataflow
+
+ :return: True if job is running.
+ :rtype: bool
+ """
+ for job in self._jobs:
+ if job['currentState'] not in DataflowJobStatus.END_STATES:
+ return True
+ return False
def _get_job_id_from_name(self):
+ """
+ Helper method to get list of jobs that start with job name
+
+        :return: list of jobs including IDs
+ :rtype: list
+ """
jobs = self._dataflow.projects().locations().jobs().list(
projectId=self._project_number,
location=self._job_location
).execute(num_retries=self._num_retries)
- for job in jobs['jobs']:
- if job['name'].lower() == self._job_name.lower():
- self._job_id = job['id']
- return job
- return None
+ dataflow_jobs = []
+ if jobs:
+ for job in jobs['jobs']:
+ if job['name'].startswith(self._job_name.lower()):
+ dataflow_jobs.append(job)
+ if len(dataflow_jobs) == 1:
+ self._job_id = dataflow_jobs[0]['id']
+ return dataflow_jobs
+
+ def _get_jobs(self):
+ """
+ Helper method to get all jobs by name
- def _get_job(self):
- if self._job_id:
- job = self._dataflow.projects().locations().jobs().get(
+ :return: jobs
+ :rtype: list
+ """
+ if not self._multiple_jobs and self._job_id:
+ self._jobs = []
+ self._jobs.append(self._dataflow.projects().locations().jobs().get(
projectId=self._project_number,
location=self._job_location,
- jobId=self._job_id).execute(num_retries=self._num_retries)
+ jobId=self._job_id).execute(num_retries=self._num_retries))
elif self._job_name:
- job = self._get_job_id_from_name()
+ self._jobs = self._get_job_id_from_name()
else:
raise Exception('Missing both dataflow job ID and name.')
- if job and 'currentState' in job:
- self.log.info(
- 'Google Cloud DataFlow job %s is %s',
- job['name'], job['currentState']
- )
- elif job:
- self.log.info(
- 'Google Cloud DataFlow with job_id %s has name %s',
- self._job_id, job['name']
- )
- else:
- self.log.info(
- 'Google Cloud DataFlow job not available yet..'
- )
+ for job in self._jobs:
+ if job and 'currentState' in job:
+ self._job_state = job['currentState']
+ self.log.info(
+ 'Google Cloud DataFlow job %s is %s',
+ job['name'], job['currentState']
+ )
+ elif job:
+ self.log.info(
+ 'Google Cloud DataFlow with job_id %s has name %s',
+ self._job_id, job['name']
+ )
+ else:
+ self.log.info(
+ 'Google Cloud DataFlow job not available yet..'
+ )
- return job
+ return self._jobs
+
+ # pylint: disable=too-many-nested-blocks
+ def check_dataflow_job_state(self, job):
+ """
+ Helper method to check the state of all jobs in dataflow for this task
+        if a job has failed, raise an exception
+ :return: True if job is done.
+ :rtype: bool
+ :raise: Exception
+ """
+ if DataflowJobStatus.JOB_STATE_DONE == job['currentState']:
+ # check all jobs are done
+ count_not_done = 0
+ for inner_jobs in self._jobs:
+ if inner_jobs and 'currentState' in job:
+                    if not DataflowJobStatus.JOB_STATE_DONE == inner_jobs['currentState']:
+ count_not_done += 1
+ if count_not_done == 0:
+ return True
+ elif DataflowJobStatus.JOB_STATE_RUNNING == job['currentState'] and \
+ DataflowJobStatus.JOB_TYPE_STREAMING == job['type']:
+ return True
+ elif DataflowJobStatus.JOB_STATE_FAILED == job['currentState']:
+ raise Exception("Google Cloud Dataflow job {} has failed.".format(
+ job['name']))
+ elif DataflowJobStatus.JOB_STATE_CANCELLED == job['currentState']:
+            raise Exception("Google Cloud Dataflow job {} was cancelled.".format(
+ job['name']))
+ elif job['currentState'] in {DataflowJobStatus.JOB_STATE_RUNNING,
Review comment:
organized by logical order
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
With regards,
Apache Git Services