coufon commented on a change in pull request #5594: [AIRFLOW-4924] Loading DAGs asynchronously in Airflow webserver URL: https://github.com/apache/airflow/pull/5594#discussion_r304658408
########## File path: airflow/dag/stringified_dags.py ########## @@ -0,0 +1,139 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# DagCached is a new feature in Airflow that caches processed DAGs in the Airflow database. +# DAGs are stringified first and serialized by Pickle to be stored in the database. +# Stringified DAGs hold metadata of the original DAGs and tasks, and can be used by +# the Airflow webserver and scheduler. + +"""Methods to stringify DAGs and tasks to be compatible with pickle.""" + +import copy +import functools +import inspect +import logging + +from airflow import models + + +# Stringify all fields of DAGs and tasks except for time-related fields. 
# Fields on DAG / task objects that must survive as real objects (they are
# mostly datetime/timedelta values needed by the scheduler); every other
# field is stringified. NOTE(review): removed the 'test_field' entry — it
# looked like a leftover debugging artifact, not a real DAG attribute.
_dag_fields_to_keep = {
    'schedule_interval', 'start_date', 'end_date', 'dagrun_timeout',
    'timezone', 'last_loaded', '_schedule_interval'}

_task_fields_to_keep = {
    'retry_delay', 'max_retry_delay', 'start_date', 'end_date',
    'schedule_interval', 'sla', 'execution_timeout'}

# Types that are already pickle-safe and need no conversion.
_primitive_types = (int, bool, float, str, bytes)


def _is_primitive(x):
    """Return True if x is None or an instance of a pickle-safe primitive type."""
    return x is None or isinstance(x, _primitive_types)


def _stringify_dag_or_task(x, stringified_dags, is_dag):
    """Return a stringified copy of a DAG (is_dag=True) or task (is_dag=False).

    ``stringified_dags`` maps dag_id to the (possibly still in-progress)
    stringified DAG; it acts both as a memo and as the cycle breaker for
    reference loops such as task.dag / task._dag / dag.parent_dag.
    """
    if is_dag and x.dag_id in stringified_dags:
        return stringified_dags[x.dag_id]

    # Cast any operators defined in non-airflow modules to BaseOperator to
    # ensure unpickling succeeds even when the defining module is not
    # importable. The downside is that the task will be displayed as
    # BaseOperator in the UI.
    if not is_dag and not (
            x.__class__.__module__.startswith('airflow.operators') or
            x.__class__.__module__.startswith('airflow.contrib.operators')):
        new_x = object.__new__(models.BaseOperator)
    else:
        new_x = object.__new__(x.__class__)

    # Register the DAG *before* recursing into its fields so that any cycle
    # back to this DAG resolves to the stringified copy instead of recursing
    # forever.
    if is_dag:
        stringified_dags[x.dag_id] = new_x

    fields_to_keep = _dag_fields_to_keep if is_dag else _task_fields_to_keep
    for k, v in x.__dict__.items():
        new_x.__dict__[k] = (copy.deepcopy(v) if k in fields_to_keep else
                             _stringify(v, stringified_dags))
    new_x.is_stringified = True
    return new_x


def _stringify(x, stringified_dags):  # pylint: disable=too-many-return-statements
    """Return a deep copy of stringified data to make DAGs and tasks picklable.

    Fields are stringified depth-first. The order in which types are examined
    is roughly based on their frequency of occurrence.
    ``stringified_dags`` stores DAGs that are being stringified or have been
    stringified, for:
    (1) preventing an infinite recursion caused by task.dag, task._dag, and
        dag.parent_dag;
    (2) replacing the fields in (1) with their stringified counterparts.
    """
    try:
        if _is_primitive(x):
            return x
        elif isinstance(x, dict):
            return {k: _stringify(v, stringified_dags) for k, v in x.items()}
        elif isinstance(x, models.DAG):
            return _stringify_dag_or_task(x, stringified_dags, True)
        elif isinstance(x, models.BaseOperator):
            return _stringify_dag_or_task(x, stringified_dags, False)
        elif callable(x):
            return _get_python_source(x)
        elif isinstance(x, list):
            return [_stringify(v, stringified_dags) for v in x]
        elif isinstance(x, set):
            return {_stringify(v, stringified_dags) for v in x}
        elif isinstance(x, tuple):
            # Generator instead of a throwaway intermediate list.
            return tuple(_stringify(v, stringified_dags) for v in x)
        else:
            # Anything unrecognized falls back to its string representation.
            return str(x)
    except Exception:  # pylint: disable=broad-except
        # Best-effort: a field that cannot be stringified must not break the
        # whole DAG; record the failure and substitute a marker value.
        logging.warning('Failed to stringify.', exc_info=True)
        return 'failed_to_stringify'


def stringify(x):
    """Return a deep copy of stringified data to make DAGs and tasks picklable."""
    return _stringify(x, {})
