This is an automated email from the ASF dual-hosted git repository.

pankajkoti pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/airflow.git


The following commit(s) were added to refs/heads/main by this push:
     new 2ecf7fa07d Add DatabricksWorkflowTaskGroup (#39771)
2ecf7fa07d is described below

commit 2ecf7fa07d6d681c73ae4831801f9d98db298d89
Author: Pankaj Koti <pankajkoti...@gmail.com>
AuthorDate: Thu May 30 14:49:37 2024 +0530

    Add DatabricksWorkflowTaskGroup (#39771)
    
    This pull request introduces the
    [DatabricksWorkflowTaskGroup](https://github.com/astronomer/astro-provider-databricks/blob/main/src/astro_databricks/operators/workflow.py#L226)
    to the Airflow Databricks provider from the
    [astro-provider-databricks](https://github.com/astronomer/astro-provider-databricks/tree/main)
    repository. It is another in the series of pull requests contributing
    operators and features from that repository to the Airflow Databricks
    provider, the previous one being https://github.com/apache/airflow/pull/39178.
    
    The task group launches a
    [Databricks Workflow](https://docs.databricks.com/en/workflows/index.html)
    and runs the notebook jobs from within it, resulting in a
    [75% cost reduction](https://www.databricks.com/product/pricing)
    ($0.40/DBU for all-purpose compute, $0.07/DBU for Jobs compute) when
    compared to executing ``DatabricksNotebookOperator`` outside of
    ``DatabricksWorkflowTaskGroup``.
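    
    For reference, a minimal usage sketch that mirrors the example DAG added in
    this commit (the connection ID, notebook paths, and the trimmed-down cluster
    spec below are placeholders, not required values):
    
    ```python
    from airflow.models.dag import DAG
    from airflow.providers.databricks.operators.databricks import DatabricksNotebookOperator
    from airflow.providers.databricks.operators.databricks_workflow import DatabricksWorkflowTaskGroup
    from airflow.utils.timezone import datetime
    
    # Placeholder job cluster spec; see the full example DAG added in this commit.
    job_cluster_spec = [
        {
            "job_cluster_key": "Shared_job_cluster",
            "new_cluster": {
                "spark_version": "11.3.x-scala2.12",
                "node_type_id": "i3.xlarge",
                "num_workers": 2,
            },
        }
    ]
    
    with DAG(dag_id="example_databricks_workflow", start_date=datetime(2022, 1, 1), schedule_interval=None) as dag:
        workflow = DatabricksWorkflowTaskGroup(
            group_id="example_workflow",
            databricks_conn_id="databricks_conn",  # placeholder connection ID
            job_clusters=job_cluster_spec,
            notebook_params={"ts": "{{ ts }}"},  # passed to every notebook task in the workflow
        )
        with workflow:
            notebook_1 = DatabricksNotebookOperator(
                task_id="notebook_1",
                databricks_conn_id="databricks_conn",
                notebook_path="/Shared/Notebook_1",  # placeholder workspace path
                source="WORKSPACE",
                job_cluster_key="Shared_job_cluster",
            )
            notebook_2 = DatabricksNotebookOperator(
                task_id="notebook_2",
                databricks_conn_id="databricks_conn",
                notebook_path="/Shared/Notebook_2",
                source="WORKSPACE",
                job_cluster_key="Shared_job_cluster",
            )
            notebook_1 >> notebook_2
    ```
    
    On execution, the task group's ``launch`` task creates (or resets) the
    Databricks job and triggers the run; each ``DatabricksNotebookOperator``
    inside the group then monitors its corresponding Databricks task run.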
    
    ---------
    Co-authored-by: Daniel Imberman <daniel.imber...@gmail.com>
    Co-authored-by: Tatiana Al-Chueyr <tatiana.alchu...@gmail.com>
    Co-authored-by: Wei Lee <weilee...@gmail.com>
---
 airflow/providers/databricks/hooks/databricks.py   |  18 ++
 .../providers/databricks/operators/databricks.py   | 141 ++++++++--
 .../databricks/operators/databricks_workflow.py    | 312 +++++++++++++++++++++
 airflow/providers/databricks/provider.yaml         |  10 +
 ...icks_workflow_task_group_airflow_graph_view.png | Bin 0 -> 188072 bytes
 .../img/workflow_run_databricks_graph_view.png     | Bin 0 -> 405053 bytes
 .../operators/workflow.rst                         |  71 +++++
 generated/provider_dependencies.json               |   1 +
 .../databricks/operators/test_databricks.py        | 119 ++++++++
 .../operators/test_databricks_workflow.py          | 233 +++++++++++++++
 .../databricks/example_databricks_workflow.py      | 118 ++++++++
 11 files changed, 998 insertions(+), 25 deletions(-)

diff --git a/airflow/providers/databricks/hooks/databricks.py b/airflow/providers/databricks/hooks/databricks.py
index 710074d239..ee5349add0 100644
--- a/airflow/providers/databricks/hooks/databricks.py
+++ b/airflow/providers/databricks/hooks/databricks.py
@@ -29,6 +29,7 @@ or the ``api/2.1/jobs/runs/submit``
 from __future__ import annotations
 
 import json
+from enum import Enum
 from typing import Any
 
 from requests import exceptions as requests_exceptions
@@ -63,6 +64,23 @@ WORKSPACE_GET_STATUS_ENDPOINT = ("GET", "api/2.0/workspace/get-status")
 SPARK_VERSIONS_ENDPOINT = ("GET", "api/2.0/clusters/spark-versions")
 
 
+class RunLifeCycleState(Enum):
+    """Enum for the run life cycle state concept of Databricks runs.
+
+    See more information at: https://docs.databricks.com/api/azure/workspace/jobs/listruns#runs-state-life_cycle_state
+    """
+
+    BLOCKED = "BLOCKED"
+    INTERNAL_ERROR = "INTERNAL_ERROR"
+    PENDING = "PENDING"
+    QUEUED = "QUEUED"
+    RUNNING = "RUNNING"
+    SKIPPED = "SKIPPED"
+    TERMINATED = "TERMINATED"
+    TERMINATING = "TERMINATING"
+    WAITING_FOR_RETRY = "WAITING_FOR_RETRY"
+
+
 class RunState:
     """Utility class for the run state concept of Databricks runs."""
 
diff --git a/airflow/providers/databricks/operators/databricks.py b/airflow/providers/databricks/operators/databricks.py
index ff8de10132..d6118f247f 100644
--- a/airflow/providers/databricks/operators/databricks.py
+++ b/airflow/providers/databricks/operators/databricks.py
@@ -29,13 +29,18 @@ from deprecated import deprecated
 from airflow.configuration import conf
 from airflow.exceptions import AirflowException, AirflowProviderDeprecationWarning
 from airflow.models import BaseOperator, BaseOperatorLink, XCom
-from airflow.providers.databricks.hooks.databricks import DatabricksHook, RunState
+from airflow.providers.databricks.hooks.databricks import DatabricksHook, RunLifeCycleState, RunState
+from airflow.providers.databricks.operators.databricks_workflow import (
+    DatabricksWorkflowTaskGroup,
+    WorkflowRunMetadata,
+)
 from airflow.providers.databricks.triggers.databricks import DatabricksExecutionTrigger
 from airflow.providers.databricks.utils.databricks import normalise_json_content, validate_trigger_event
 
 if TYPE_CHECKING:
     from airflow.models.taskinstancekey import TaskInstanceKey
     from airflow.utils.context import Context
+    from airflow.utils.task_group import TaskGroup
 
 DEFER_METHOD_NAME = "execute_complete"
 XCOM_RUN_ID_KEY = "run_id"
@@ -926,7 +931,10 @@ class DatabricksNotebookOperator(BaseOperator):
     :param deferrable: Run operator in the deferrable mode.
     """
 
-    template_fields = ("notebook_params",)
+    template_fields = (
+        "notebook_params",
+        "workflow_run_metadata",
+    )
     CALLER = "DatabricksNotebookOperator"
 
     def __init__(
@@ -944,6 +952,7 @@ class DatabricksNotebookOperator(BaseOperator):
         databricks_retry_args: dict[Any, Any] | None = None,
         wait_for_termination: bool = True,
         databricks_conn_id: str = "databricks_default",
+        workflow_run_metadata: dict | None = None,
+        deferrable: bool = conf.getboolean("operators", "default_deferrable", fallback=False),
         **kwargs: Any,
     ):
@@ -962,6 +971,10 @@ class DatabricksNotebookOperator(BaseOperator):
         self.databricks_conn_id = databricks_conn_id
         self.databricks_run_id: int | None = None
         self.deferrable = deferrable
+
+        # This is used to store the metadata of the Databricks job run when the job is launched from within DatabricksWorkflowTaskGroup.
+        self.workflow_run_metadata: dict | None = workflow_run_metadata
+
         super().__init__(**kwargs)
 
     @cached_property
@@ -1016,6 +1029,79 @@ class DatabricksNotebookOperator(BaseOperator):
         """Get the databricks task ID using dag_id and task_id. Removes 
illegal characters."""
         return f"{self.dag_id}__{task_id.replace('.', '__')}"
 
+    @property
+    def _databricks_workflow_task_group(self) -> DatabricksWorkflowTaskGroup | 
None:
+        """
+        Traverse up parent TaskGroups until the `is_databricks` flag 
associated with the root DatabricksWorkflowTaskGroup is found.
+
+        If found, returns the task group. Otherwise, return None.
+        """
+        parent_tg: TaskGroup | DatabricksWorkflowTaskGroup | None = 
self.task_group
+
+        while parent_tg:
+            if getattr(parent_tg, "is_databricks", False):
+                return parent_tg  # type: ignore[return-value]
+
+            if getattr(parent_tg, "task_group", None):
+                parent_tg = parent_tg.task_group
+            else:
+                return None
+
+        return None
+
+    def _extend_workflow_notebook_packages(
+        self, databricks_workflow_task_group: DatabricksWorkflowTaskGroup
+    ) -> None:
+        """Extend the task group packages into the notebook's packages, 
without adding any duplicates."""
+        for task_group_package in 
databricks_workflow_task_group.notebook_packages:
+            exists = any(
+                task_group_package == existing_package for existing_package in 
self.notebook_packages
+            )
+            if not exists:
+                self.notebook_packages.append(task_group_package)
+
+    def _convert_to_databricks_workflow_task(
+        self, relevant_upstreams: list[BaseOperator], context: Context | None = None
+    ) -> dict[str, object]:
+        """Convert the operator to a Databricks workflow task that can be a task in a workflow."""
+        databricks_workflow_task_group = self._databricks_workflow_task_group
+        if not databricks_workflow_task_group:
+            raise AirflowException(
+                "Calling `_convert_to_databricks_workflow_task` without a parent TaskGroup."
+            )
+
+        if hasattr(databricks_workflow_task_group, "notebook_packages"):
+            self._extend_workflow_notebook_packages(databricks_workflow_task_group)
+
+        if hasattr(databricks_workflow_task_group, "notebook_params"):
+            self.notebook_params = {
+                **self.notebook_params,
+                **databricks_workflow_task_group.notebook_params,
+            }
+
+        base_task_json = self._get_task_base_json()
+        result = {
+            "task_key": self._get_databricks_task_id(self.task_id),
+            "depends_on": [
+                {"task_key": self._get_databricks_task_id(task_id)}
+                for task_id in self.upstream_task_ids
+                if task_id in relevant_upstreams
+            ],
+            **base_task_json,
+        }
+
+        if self.existing_cluster_id and self.job_cluster_key:
+            raise ValueError(
+                "Both existing_cluster_id and job_cluster_key are set. Only 
one can be set per task."
+            )
+
+        if self.existing_cluster_id:
+            result["existing_cluster_id"] = self.existing_cluster_id
+        elif self.job_cluster_key:
+            result["job_cluster_key"] = self.job_cluster_key
+
+        return result
+
     def _get_run_json(self) -> dict[str, Any]:
         """Get run json to be used for task submissions."""
         run_json = {
@@ -1039,6 +1125,17 @@ class DatabricksNotebookOperator(BaseOperator):
         self.log.info("Check the job run in Databricks: %s", url)
         return self.databricks_run_id
 
+    def _handle_terminal_run_state(self, run_state: RunState) -> None:
+        if run_state.life_cycle_state != RunLifeCycleState.TERMINATED.value:
+            raise AirflowException(
+                f"Databricks job failed with state 
{run_state.life_cycle_state}. Message: {run_state.state_message}"
+            )
+        if not run_state.is_successful:
+            raise AirflowException(
+                f"Task failed. Final state {run_state.result_state}. Reason: 
{run_state.state_message}"
+            )
+        self.log.info("Task succeeded. Final state %s.", 
run_state.result_state)
+
     def monitor_databricks_job(self) -> None:
         if self.databricks_run_id is None:
             raise ValueError("Databricks job not yet launched. Please run 
launch_notebook_job first.")
@@ -1063,34 +1160,28 @@ class DatabricksNotebookOperator(BaseOperator):
             run = self._hook.get_run(self.databricks_run_id)
             run_state = RunState(**run["state"])
             self.log.info(
-                "task %s %s", self._get_databricks_task_id(self.task_id), 
run_state.life_cycle_state
-            )
-            self.log.info("Current state of the job: %s", 
run_state.life_cycle_state)
-        if run_state.life_cycle_state != "TERMINATED":
-            raise AirflowException(
-                f"Databricks job failed with state 
{run_state.life_cycle_state}. "
-                f"Message: {run_state.state_message}"
+                "Current state of the databricks task %s is %s",
+                self._get_databricks_task_id(self.task_id),
+                run_state.life_cycle_state,
             )
-        if not run_state.is_successful:
-            raise AirflowException(
-                f"Task failed. Final state {run_state.result_state}. Reason: 
{run_state.state_message}"
-            )
-        self.log.info("Task succeeded. Final state %s.", 
run_state.result_state)
+        self._handle_terminal_run_state(run_state)
 
     def execute(self, context: Context) -> None:
-        self.launch_notebook_job()
+        if self._databricks_workflow_task_group:
+            # If we are in a DatabricksWorkflowTaskGroup, we should have an upstream task launched.
+            if not self.workflow_run_metadata:
+                launch_task_id = next(task for task in self.upstream_task_ids if task.endswith(".launch"))
+                self.workflow_run_metadata = context["ti"].xcom_pull(task_ids=launch_task_id)
+            workflow_run_metadata = WorkflowRunMetadata(  # type: ignore[arg-type]
+                **self.workflow_run_metadata
+            )
+            self.databricks_run_id = workflow_run_metadata.run_id
+            self.databricks_conn_id = workflow_run_metadata.conn_id
+        else:
+            self.launch_notebook_job()
         if self.wait_for_termination:
             self.monitor_databricks_job()
 
     def execute_complete(self, context: dict | None, event: dict) -> None:
         run_state = RunState.from_json(event["run_state"])
-        if run_state.life_cycle_state != "TERMINATED":
-            raise AirflowException(
-                f"Databricks job failed with state 
{run_state.life_cycle_state}. "
-                f"Message: {run_state.state_message}"
-            )
-        if not run_state.is_successful:
-            raise AirflowException(
-                f"Task failed. Final state {run_state.result_state}. Reason: 
{run_state.state_message}"
-            )
-        self.log.info("Task succeeded. Final state %s.", 
run_state.result_state)
+        self._handle_terminal_run_state(run_state)
diff --git a/airflow/providers/databricks/operators/databricks_workflow.py b/airflow/providers/databricks/operators/databricks_workflow.py
new file mode 100644
index 0000000000..8203145314
--- /dev/null
+++ b/airflow/providers/databricks/operators/databricks_workflow.py
@@ -0,0 +1,312 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from __future__ import annotations
+
+import json
+import time
+from dataclasses import dataclass
+from functools import cached_property
+from typing import TYPE_CHECKING, Any
+
+from mergedeep import merge
+
+from airflow.exceptions import AirflowException
+from airflow.models import BaseOperator
+from airflow.providers.databricks.hooks.databricks import DatabricksHook, RunLifeCycleState
+from airflow.utils.task_group import TaskGroup
+
+if TYPE_CHECKING:
+    from types import TracebackType
+
+    from airflow.models.taskmixin import DAGNode
+    from airflow.utils.context import Context
+
+
+@dataclass
+class WorkflowRunMetadata:
+    """
+    Metadata for a Databricks workflow run.
+
+    :param run_id: The ID of the Databricks workflow run.
+    :param job_id: The ID of the Databricks workflow job.
+    :param conn_id: The connection ID used to connect to Databricks.
+    """
+
+    conn_id: str
+    job_id: str
+    run_id: int
+
+
+def _flatten_node(
+    node: TaskGroup | BaseOperator | DAGNode, tasks: list[BaseOperator] | None = None
+) -> list[BaseOperator]:
+    """Flatten a node (either a TaskGroup or Operator) to a list of nodes."""
+    if tasks is None:
+        tasks = []
+    if isinstance(node, BaseOperator):
+        return [node]
+
+    if isinstance(node, TaskGroup):
+        new_tasks = []
+        for _, child in node.children.items():
+            new_tasks += _flatten_node(child, tasks)
+
+        return tasks + new_tasks
+
+    return tasks
+
+
+class _CreateDatabricksWorkflowOperator(BaseOperator):
+    """
+    Creates a Databricks workflow from a DatabricksWorkflowTaskGroup specified in a DAG.
+
+    :param task_id: The task_id of the operator
+    :param databricks_conn_id: The connection ID to use when connecting to Databricks.
+    :param existing_clusters: A list of existing clusters to use for the workflow.
+    :param extra_job_params: A dictionary of extra properties which will override the default Databricks
+        Workflow Job definitions.
+    :param job_clusters: A list of job clusters to use for the workflow.
+    :param max_concurrent_runs: The maximum number of concurrent runs for the workflow.
+    :param notebook_params: A dictionary of notebook parameters to pass to the workflow. These parameters
+        will be passed to all notebooks in the workflow.
+    :param tasks_to_convert: A list of tasks to convert to a Databricks workflow. This list can also be
+        populated after instantiation using the `add_task` method.
+    """
+
+    template_fields = ("notebook_params",)
+    caller = "_CreateDatabricksWorkflowOperator"
+
+    def __init__(
+        self,
+        task_id: str,
+        databricks_conn_id: str,
+        existing_clusters: list[str] | None = None,
+        extra_job_params: dict[str, Any] | None = None,
+        job_clusters: list[dict[str, object]] | None = None,
+        max_concurrent_runs: int = 1,
+        notebook_params: dict | None = None,
+        tasks_to_convert: list[BaseOperator] | None = None,
+        **kwargs,
+    ):
+        self.databricks_conn_id = databricks_conn_id
+        self.existing_clusters = existing_clusters or []
+        self.extra_job_params = extra_job_params or {}
+        self.job_clusters = job_clusters or []
+        self.max_concurrent_runs = max_concurrent_runs
+        self.notebook_params = notebook_params or {}
+        self.tasks_to_convert = tasks_to_convert or []
+        self.relevant_upstreams = [task_id]
+        super().__init__(task_id=task_id, **kwargs)
+
+    def _get_hook(self, caller: str) -> DatabricksHook:
+        return DatabricksHook(
+            self.databricks_conn_id,
+            caller=caller,
+        )
+
+    @cached_property
+    def _hook(self) -> DatabricksHook:
+        return self._get_hook(caller=self.caller)
+
+    def add_task(self, task: BaseOperator) -> None:
+        """Add a task to the list of tasks to convert to a Databricks 
workflow."""
+        self.tasks_to_convert.append(task)
+
+    @property
+    def job_name(self) -> str:
+        if not self.task_group:
+            raise AirflowException("Task group must be set before accessing 
job_name")
+        return f"{self.dag_id}.{self.task_group.group_id}"
+
+    def create_workflow_json(self, context: Context | None = None) -> dict[str, object]:
+        """Create a workflow json to be used in the Databricks API."""
+        task_json = [
+            task._convert_to_databricks_workflow_task(  # type: ignore[attr-defined]
+                relevant_upstreams=self.relevant_upstreams, context=context
+            )
+            for task in self.tasks_to_convert
+        ]
+
+        default_json = {
+            "name": self.job_name,
+            "email_notifications": {"no_alert_for_skipped_runs": False},
+            "timeout_seconds": 0,
+            "tasks": task_json,
+            "format": "MULTI_TASK",
+            "job_clusters": self.job_clusters,
+            "max_concurrent_runs": self.max_concurrent_runs,
+        }
+        return merge(default_json, self.extra_job_params)
+
+    def _create_or_reset_job(self, context: Context) -> int:
+        job_spec = self.create_workflow_json(context=context)
+        existing_jobs = self._hook.list_jobs(job_name=self.job_name)
+        job_id = existing_jobs[0]["job_id"] if existing_jobs else None
+        if job_id:
+            self.log.info(
+                "Updating existing Databricks workflow job %s with spec %s",
+                self.job_name,
+                json.dumps(job_spec, indent=2),
+            )
+            self._hook.reset_job(job_id, job_spec)
+        else:
+            self.log.info(
+                "Creating new Databricks workflow job %s with spec %s",
+                self.job_name,
+                json.dumps(job_spec, indent=2),
+            )
+            job_id = self._hook.create_job(job_spec)
+        return job_id
+
+    def _wait_for_job_to_start(self, run_id: int) -> None:
+        run_url = self._hook.get_run_page_url(run_id)
+        self.log.info("Check the progress of the Databricks job at %s", 
run_url)
+        life_cycle_state = self._hook.get_run_state(run_id).life_cycle_state
+        if life_cycle_state not in (
+            RunLifeCycleState.PENDING.value,
+            RunLifeCycleState.RUNNING.value,
+            RunLifeCycleState.BLOCKED.value,
+        ):
+            raise AirflowException(f"Could not start the workflow job. State: 
{life_cycle_state}")
+        while life_cycle_state in (RunLifeCycleState.PENDING.value, 
RunLifeCycleState.BLOCKED.value):
+            self.log.info("Waiting for the Databricks job to start running")
+            time.sleep(5)
+            life_cycle_state = 
self._hook.get_run_state(run_id).life_cycle_state
+        self.log.info("Databricks job started. State: %s", life_cycle_state)
+
+    def execute(self, context: Context) -> Any:
+        if not isinstance(self.task_group, DatabricksWorkflowTaskGroup):
+            raise AirflowException("Task group must be a 
DatabricksWorkflowTaskGroup")
+
+        job_id = self._create_or_reset_job(context)
+
+        run_id = self._hook.run_now(
+            {
+                "job_id": job_id,
+                "jar_params": self.task_group.jar_params,
+                "notebook_params": self.notebook_params,
+                "python_params": self.task_group.python_params,
+                "spark_submit_params": self.task_group.spark_submit_params,
+            }
+        )
+
+        self._wait_for_job_to_start(run_id)
+
+        return {
+            "conn_id": self.databricks_conn_id,
+            "job_id": job_id,
+            "run_id": run_id,
+        }
+
+
+class DatabricksWorkflowTaskGroup(TaskGroup):
+    """
+    A task group that takes a list of tasks and creates a databricks workflow.
+
+    The DatabricksWorkflowTaskGroup takes a list of tasks and creates a databricks workflow
+    based on the metadata produced by those tasks. For a task to be eligible for this
+    TaskGroup, it must contain the ``_convert_to_databricks_workflow_task`` method. If any tasks
+    do not contain this method then the Taskgroup will raise an error at parse time.
+
+    .. seealso::
+        For more information on how to use this operator, take a look at the guide:
+        :ref:`howto/operator:DatabricksWorkflowTaskGroup`
+
+    :param databricks_conn_id: The name of the databricks connection to use.
+    :param existing_clusters: A list of existing clusters to use for this workflow.
+    :param extra_job_params: A dictionary containing properties which will override the default
+        Databricks Workflow Job definitions.
+    :param jar_params: A list of jar parameters to pass to the workflow. These parameters will be passed to all jar
+        tasks in the workflow.
+    :param job_clusters: A list of job clusters to use for this workflow.
+    :param max_concurrent_runs: The maximum number of concurrent runs for this workflow.
+    :param notebook_packages: A list of dictionary of Python packages to be installed. Packages defined
+        at the workflow task group level are installed for each of the notebook tasks under it. And
+        packages defined at the notebook task level are installed specific for the notebook task.
+    :param notebook_params: A dictionary of notebook parameters to pass to the workflow. These parameters
+        will be passed to all notebook tasks in the workflow.
+    :param python_params: A list of python parameters to pass to the workflow. These parameters will be passed to
+        all python tasks in the workflow.
+    :param spark_submit_params: A list of spark submit parameters to pass to the workflow. These parameters
+        will be passed to all spark submit tasks.
+    """
+
+    is_databricks = True
+
+    def __init__(
+        self,
+        databricks_conn_id: str,
+        existing_clusters: list[str] | None = None,
+        extra_job_params: dict[str, Any] | None = None,
+        jar_params: list[str] | None = None,
+        job_clusters: list[dict] | None = None,
+        max_concurrent_runs: int = 1,
+        notebook_packages: list[dict[str, Any]] | None = None,
+        notebook_params: dict | None = None,
+        python_params: list | None = None,
+        spark_submit_params: list | None = None,
+        **kwargs,
+    ):
+        self.databricks_conn_id = databricks_conn_id
+        self.existing_clusters = existing_clusters or []
+        self.extra_job_params = extra_job_params or {}
+        self.jar_params = jar_params or []
+        self.job_clusters = job_clusters or []
+        self.max_concurrent_runs = max_concurrent_runs
+        self.notebook_packages = notebook_packages or []
+        self.notebook_params = notebook_params or {}
+        self.python_params = python_params or []
+        self.spark_submit_params = spark_submit_params or []
+        super().__init__(**kwargs)
+
+    def __exit__(
+        self, _type: type[BaseException] | None, _value: BaseException | None, _tb: TracebackType | None
+    ) -> None:
+        """Exit the context manager and add tasks to a single ``_CreateDatabricksWorkflowOperator``."""
+        roots = list(self.get_roots())
+        tasks = _flatten_node(self)
+
+        create_databricks_workflow_task = _CreateDatabricksWorkflowOperator(
+            dag=self.dag,
+            task_group=self,
+            task_id="launch",
+            databricks_conn_id=self.databricks_conn_id,
+            existing_clusters=self.existing_clusters,
+            extra_job_params=self.extra_job_params,
+            job_clusters=self.job_clusters,
+            max_concurrent_runs=self.max_concurrent_runs,
+            notebook_params=self.notebook_params,
+        )
+
+        for task in tasks:
+            if not (
+                hasattr(task, "_convert_to_databricks_workflow_task")
+                and callable(task._convert_to_databricks_workflow_task)
+            ):
+                raise AirflowException(
+                    f"Task {task.task_id} does not support conversion to 
databricks workflow task."
+                )
+
+            task.workflow_run_metadata = create_databricks_workflow_task.output
+            
create_databricks_workflow_task.relevant_upstreams.append(task.task_id)
+            create_databricks_workflow_task.add_task(task)
+
+        for root_task in roots:
+            root_task.set_upstream(create_databricks_workflow_task)
+
+        super().__exit__(_type, _value, _tb)
diff --git a/airflow/providers/databricks/provider.yaml b/airflow/providers/databricks/provider.yaml
index a6e2174640..80506dc16c 100644
--- a/airflow/providers/databricks/provider.yaml
+++ b/airflow/providers/databricks/provider.yaml
@@ -72,6 +72,7 @@ dependencies:
   # The 2.9.1 (to be released soon) already contains the fix
   - databricks-sql-connector>=2.0.0, <3.0.0, !=2.9.0
   - aiohttp>=3.9.2, <4
+  - mergedeep>=1.3.4
 
 additional-extras:
   # pip install apache-airflow-providers-databricks[sdk]
@@ -108,6 +109,12 @@ integrations:
       - /docs/apache-airflow-providers-databricks/operators/repos_delete.rst
     logo: /integration-logos/databricks/Databricks.png
     tags: [service]
+  - integration-name: Databricks Workflow
+    external-doc-url: https://docs.databricks.com/en/workflows/index.html
+    how-to-guide:
+      - /docs/apache-airflow-providers-databricks/operators/workflow.rst
+    logo: /integration-logos/databricks/Databricks.png
+    tags: [service]
 
 operators:
   - integration-name: Databricks
@@ -119,6 +126,9 @@ operators:
   - integration-name: Databricks Repos
     python-modules:
       - airflow.providers.databricks.operators.databricks_repos
+  - integration-name: Databricks Workflow
+    python-modules:
+      - airflow.providers.databricks.operators.databricks_workflow
 
 hooks:
   - integration-name: Databricks
diff --git a/docs/apache-airflow-providers-databricks/img/databricks_workflow_task_group_airflow_graph_view.png b/docs/apache-airflow-providers-databricks/img/databricks_workflow_task_group_airflow_graph_view.png
new file mode 100644
index 0000000000..3a3cb669e0
Binary files /dev/null and b/docs/apache-airflow-providers-databricks/img/databricks_workflow_task_group_airflow_graph_view.png differ
diff --git a/docs/apache-airflow-providers-databricks/img/workflow_run_databricks_graph_view.png b/docs/apache-airflow-providers-databricks/img/workflow_run_databricks_graph_view.png
new file mode 100644
index 0000000000..cb189b8105
Binary files /dev/null and b/docs/apache-airflow-providers-databricks/img/workflow_run_databricks_graph_view.png differ
diff --git a/docs/apache-airflow-providers-databricks/operators/workflow.rst b/docs/apache-airflow-providers-databricks/operators/workflow.rst
new file mode 100644
index 0000000000..f58514dd51
--- /dev/null
+++ b/docs/apache-airflow-providers-databricks/operators/workflow.rst
@@ -0,0 +1,71 @@
+ .. Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+
+ ..   http://www.apache.org/licenses/LICENSE-2.0
+
+ .. Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.
+
+.. _howto/operator:DatabricksWorkflowTaskGroup:
+
+
+DatabricksWorkflowTaskGroup
+===========================
+
+Use the :class:`~airflow.providers.databricks.operators.databricks_workflow.DatabricksWorkflowTaskGroup` to launch and monitor
+Databricks notebook job runs as Airflow tasks. The task group launches a `Databricks Workflow <https://docs.databricks.com/en/workflows/index.html/>`_ and runs the notebook jobs from within it, resulting in a `75% cost reduction <https://www.databricks.com/product/pricing>`_ ($0.40/DBU for all-purpose compute, $0.07/DBU for Jobs compute) when compared to executing ``DatabricksNotebookOperator`` outside of ``DatabricksWorkflowTaskGroup``.
+
+
+There are a few advantages to defining your Databricks Workflows in Airflow:
+
+=======================================  =============================================  ==================================
+Authoring interface                      via Databricks (Web-based with Databricks UI)  via Airflow(Code with Airflow DAG)
+=======================================  =============================================  ==================================
+Workflow compute pricing                 ✅                                              ✅
+Notebook code in source control          ✅                                              ✅
+Workflow structure in source control                                                     ✅
+Retry from beginning                     ✅                                              ✅
+Retry single task                                                                        ✅
+Task groups within Workflows                                                             ✅
+Trigger workflows from other DAGs                                                        ✅
+Workflow-level parameters                                                                ✅
+=======================================  =============================================  ==================================
+
+Examples
+--------
+
+Example of what a DAG looks like with a DatabricksWorkflowTaskGroup
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+.. exampleinclude:: /../../tests/system/providers/databricks/example_databricks_workflow.py
+    :language: python
+    :start-after: [START howto_databricks_workflow_notebook]
+    :end-before: [END howto_databricks_workflow_notebook]
+
+With this example, Airflow will produce a job named ``<dag_name>.test_workflow_<USER>_<GROUP_ID>`` that will
+run task ``notebook_1`` and then ``notebook_2``. The job will be created in the databricks workspace
+if it does not already exist. If the job already exists, it will be updated to match
+the workflow defined in the DAG.
+
+The following image displays the resulting Databricks Workflow in the Airflow UI (based on the above example provided)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+.. image:: ../img/databricks_workflow_task_group_airflow_graph_view.png
+
+The corresponding Databricks Workflow in the Databricks UI for the run triggered from the Airflow DAG is depicted below
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. image:: ../img/workflow_run_databricks_graph_view.png
+
+
+To minimize update conflicts, we recommend that you keep parameters in the ``notebook_params`` of the
+``DatabricksWorkflowTaskGroup`` and not in the ``DatabricksNotebookOperator`` whenever possible.
+This is because tasks in the ``DatabricksWorkflowTaskGroup`` are passed in at job trigger time and
+do not modify the job definition.
diff --git a/generated/provider_dependencies.json b/generated/provider_dependencies.json
index 7f29984d32..01c1e33785 100644
--- a/generated/provider_dependencies.json
+++ b/generated/provider_dependencies.json
@@ -411,6 +411,7 @@
       "apache-airflow-providers-common-sql>=1.10.0",
       "apache-airflow>=2.7.0",
       "databricks-sql-connector>=2.0.0, <3.0.0, !=2.9.0",
+      "mergedeep>=1.3.4",
       "requests>=2.27.0,<3"
     ],
     "devel-deps": [
diff --git a/tests/providers/databricks/operators/test_databricks.py b/tests/providers/databricks/operators/test_databricks.py
index d6e7eb3892..2774385ea5 100644
--- a/tests/providers/databricks/operators/test_databricks.py
+++ b/tests/providers/databricks/operators/test_databricks.py
@@ -2014,3 +2014,122 @@ class TestDatabricksNotebookOperator:
             "Set it instead to `None` if you desire the task to run 
indefinitely."
         )
         assert str(exc_info.value) == exception_message
+
+    def test_extend_workflow_notebook_packages(self):
+        """Test that the operator can extend the notebook packages of a 
Databricks workflow task group."""
+        databricks_workflow_task_group = MagicMock()
+        databricks_workflow_task_group.notebook_packages = [
+            {"pypi": {"package": "numpy"}},
+            {"pypi": {"package": "pandas"}},
+        ]
+
+        operator = DatabricksNotebookOperator(
+            notebook_path="/path/to/notebook",
+            source="WORKSPACE",
+            task_id="test_task",
+            notebook_packages=[
+                {"pypi": {"package": "numpy"}},
+                {"pypi": {"package": "scipy"}},
+            ],
+        )
+
+        operator._extend_workflow_notebook_packages(databricks_workflow_task_group)
+
+        assert operator.notebook_packages == [
+            {"pypi": {"package": "numpy"}},
+            {"pypi": {"package": "scipy"}},
+            {"pypi": {"package": "pandas"}},
+        ]
+
+    def test_convert_to_databricks_workflow_task(self):
+        """Test that the operator can convert itself to a Databricks workflow 
task."""
+        dag = DAG(dag_id="example_dag", start_date=datetime.now())
+        operator = DatabricksNotebookOperator(
+            notebook_path="/path/to/notebook",
+            source="WORKSPACE",
+            task_id="test_task",
+            notebook_packages=[
+                {"pypi": {"package": "numpy"}},
+                {"pypi": {"package": "scipy"}},
+            ],
+            dag=dag,
+        )
+
+        databricks_workflow_task_group = MagicMock()
+        databricks_workflow_task_group.notebook_packages = [
+            {"pypi": {"package": "numpy"}},
+        ]
+        databricks_workflow_task_group.notebook_params = {"param1": "value1"}
+
+        operator.notebook_packages = [{"pypi": {"package": "pandas"}}]
+        operator.notebook_params = {"param2": "value2"}
+        operator.task_group = databricks_workflow_task_group
+        operator.task_id = "test_task"
+        operator.upstream_task_ids = ["upstream_task"]
+        relevant_upstreams = [MagicMock(task_id="upstream_task")]
+
+        task_json = operator._convert_to_databricks_workflow_task(relevant_upstreams)
+
+        expected_json = {
+            "task_key": "example_dag__test_task",
+            "depends_on": [],
+            "timeout_seconds": 0,
+            "email_notifications": {},
+            "notebook_task": {
+                "notebook_path": "/path/to/notebook",
+                "source": "WORKSPACE",
+                "base_parameters": {
+                    "param2": "value2",
+                    "param1": "value1",
+                },
+            },
+            "libraries": [
+                {"pypi": {"package": "pandas"}},
+                {"pypi": {"package": "numpy"}},
+            ],
+        }
+
+        assert task_json == expected_json
+
+    def test_convert_to_databricks_workflow_task_no_task_group(self):
+        """Test that an error is raised if the operator is not in a 
TaskGroup."""
+        operator = DatabricksNotebookOperator(
+            notebook_path="/path/to/notebook",
+            source="WORKSPACE",
+            task_id="test_task",
+            notebook_packages=[
+                {"pypi": {"package": "numpy"}},
+                {"pypi": {"package": "scipy"}},
+            ],
+        )
+        operator.task_group = None
+        relevant_upstreams = [MagicMock(task_id="upstream_task")]
+
+        with pytest.raises(
+            AirflowException,
+            match="Calling `_convert_to_databricks_workflow_task` without a 
parent TaskGroup.",
+        ):
+            operator._convert_to_databricks_workflow_task(relevant_upstreams)
+
+    def test_convert_to_databricks_workflow_task_cluster_conflict(self):
+        """Test that an error is raised if both `existing_cluster_id` and 
`job_cluster_key` are set."""
+        operator = DatabricksNotebookOperator(
+            notebook_path="/path/to/notebook",
+            source="WORKSPACE",
+            task_id="test_task",
+            notebook_packages=[
+                {"pypi": {"package": "numpy"}},
+                {"pypi": {"package": "scipy"}},
+            ],
+        )
+        databricks_workflow_task_group = MagicMock()
+        operator.existing_cluster_id = "existing-cluster-id"
+        operator.job_cluster_key = "job-cluster-key"
+        operator.task_group = databricks_workflow_task_group
+        relevant_upstreams = [MagicMock(task_id="upstream_task")]
+
+        with pytest.raises(
+            ValueError,
+            match="Both existing_cluster_id and job_cluster_key are set. Only 
one can be set per task.",
+        ):
+            operator._convert_to_databricks_workflow_task(relevant_upstreams)
diff --git a/tests/providers/databricks/operators/test_databricks_workflow.py b/tests/providers/databricks/operators/test_databricks_workflow.py
new file mode 100644
index 0000000000..99f1a9d148
--- /dev/null
+++ b/tests/providers/databricks/operators/test_databricks_workflow.py
@@ -0,0 +1,233 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from __future__ import annotations
+
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from airflow import DAG
+from airflow.exceptions import AirflowException
+from airflow.models.baseoperator import BaseOperator
+from airflow.operators.empty import EmptyOperator
+from airflow.providers.databricks.hooks.databricks import RunLifeCycleState
+from airflow.providers.databricks.operators.databricks_workflow import (
+    DatabricksWorkflowTaskGroup,
+    _CreateDatabricksWorkflowOperator,
+    _flatten_node,
+)
+from airflow.utils import timezone
+
+pytestmark = pytest.mark.db_test
+
+DEFAULT_DATE = timezone.datetime(2021, 1, 1)
+
+
+@pytest.fixture
+def mock_databricks_hook():
+    """Provide a mock DatabricksHook."""
+    with patch("airflow.providers.databricks.operators.databricks_workflow.DatabricksHook") as mock_hook:
+        yield mock_hook
+
+
+@pytest.fixture
+def context():
+    """Provide a mock context object."""
+    return MagicMock()
+
+
+@pytest.fixture
+def mock_task_group():
+    """Provide a mock DatabricksWorkflowTaskGroup with necessary attributes."""
+    mock_group = MagicMock(spec=DatabricksWorkflowTaskGroup)
+    mock_group.group_id = "test_group"
+    return mock_group
+
+
+def test_flatten_node():
+    """Test that _flatten_node returns a flat list of operators."""
+    task_group = MagicMock(spec=DatabricksWorkflowTaskGroup)
+    base_operator = MagicMock(spec=BaseOperator)
+    task_group.children = {"task1": base_operator, "task2": base_operator}
+
+    result = _flatten_node(task_group)
+    assert result == [base_operator, base_operator]
+
+
+def test_create_workflow_json(mock_databricks_hook, context, mock_task_group):
+    """Test that _CreateDatabricksWorkflowOperator.create_workflow_json 
returns the expected JSON."""
+    operator = _CreateDatabricksWorkflowOperator(
+        task_id="test_task",
+        databricks_conn_id="databricks_default",
+    )
+    operator.task_group = mock_task_group
+
+    task = MagicMock(spec=BaseOperator)
+    task._convert_to_databricks_workflow_task = MagicMock(return_value={})
+    operator.add_task(task)
+
+    workflow_json = operator.create_workflow_json(context=context)
+
+    assert ".test_group" in workflow_json["name"]
+    assert "tasks" in workflow_json
+    assert workflow_json["format"] == "MULTI_TASK"
+    assert workflow_json["email_notifications"] == 
{"no_alert_for_skipped_runs": False}
+    assert workflow_json["job_clusters"] == []
+    assert workflow_json["max_concurrent_runs"] == 1
+    assert workflow_json["timeout_seconds"] == 0
+
+
+def test_create_or_reset_job_existing(mock_databricks_hook, context, mock_task_group):
+    """Test that _CreateDatabricksWorkflowOperator._create_or_reset_job resets the job if it already exists."""
+    operator = _CreateDatabricksWorkflowOperator(task_id="test_task", databricks_conn_id="databricks_default")
+    operator.task_group = mock_task_group
+    operator._hook.list_jobs.return_value = [{"job_id": 123}]
+    operator._hook.create_job.return_value = 123
+
+    job_id = operator._create_or_reset_job(context)
+    assert job_id == 123
+    operator._hook.reset_job.assert_called_once()
+
+
+def test_create_or_reset_job_new(mock_databricks_hook, context, mock_task_group):
+    """Test that _CreateDatabricksWorkflowOperator._create_or_reset_job creates a new job if it does not exist."""
+    operator = _CreateDatabricksWorkflowOperator(task_id="test_task", databricks_conn_id="databricks_default")
+    operator.task_group = mock_task_group
+    operator._hook.list_jobs.return_value = []
+    operator._hook.create_job.return_value = 456
+
+    job_id = operator._create_or_reset_job(context)
+    assert job_id == 456
+    operator._hook.create_job.assert_called_once()
+
+
+def test_wait_for_job_to_start(mock_databricks_hook):
+    """Test that _CreateDatabricksWorkflowOperator._wait_for_job_to_start 
waits for the job to start."""
+    operator = _CreateDatabricksWorkflowOperator(task_id="test_task", 
databricks_conn_id="databricks_default")
+    mock_hook_instance = mock_databricks_hook.return_value
+    mock_hook_instance.get_run_state.side_effect = [
+        MagicMock(life_cycle_state=RunLifeCycleState.PENDING.value),
+        MagicMock(life_cycle_state=RunLifeCycleState.RUNNING.value),
+    ]
+
+    operator._wait_for_job_to_start(123)
+    mock_hook_instance.get_run_state.assert_called()
+
+
+def test_execute(mock_databricks_hook, context, mock_task_group):
+    """Test that _CreateDatabricksWorkflowOperator.execute runs the task 
group."""
+    operator = _CreateDatabricksWorkflowOperator(task_id="test_task", 
databricks_conn_id="databricks_default")
+    operator.task_group = mock_task_group
+    mock_task_group.jar_params = {}
+    mock_task_group.python_params = {}
+    mock_task_group.spark_submit_params = {}
+
+    mock_hook_instance = mock_databricks_hook.return_value
+    mock_hook_instance.run_now.return_value = 789
+    mock_hook_instance.list_jobs.return_value = [{"job_id": 123}]
+    mock_hook_instance.get_run_state.return_value = MagicMock(
+        life_cycle_state=RunLifeCycleState.RUNNING.value
+    )
+
+    task = MagicMock(spec=BaseOperator)
+    task._convert_to_databricks_workflow_task = MagicMock(return_value={})
+    operator.add_task(task)
+
+    result = operator.execute(context)
+
+    assert result == {
+        "conn_id": "databricks_default",
+        "job_id": 123,
+        "run_id": 789,
+    }
+    mock_hook_instance.run_now.assert_called_once()
+
+
+def test_execute_invalid_task_group(context):
+    """Test that _CreateDatabricksWorkflowOperator.execute raises an exception 
if the task group is invalid."""
+    operator = _CreateDatabricksWorkflowOperator(task_id="test_task", 
databricks_conn_id="databricks_default")
+    operator.task_group = MagicMock()  # Not a DatabricksWorkflowTaskGroup
+
+    with pytest.raises(AirflowException, match="Task group must be a 
DatabricksWorkflowTaskGroup"):
+        operator.execute(context)
+
+
+@pytest.fixture
+def mock_databricks_workflow_operator():
+    with patch(
+        "airflow.providers.databricks.operators.databricks_workflow._CreateDatabricksWorkflowOperator"
+    ) as mock_operator:
+        yield mock_operator
+
+
+def test_task_group_initialization():
+    """Test that DatabricksWorkflowTaskGroup initializes correctly."""
+    with DAG(dag_id="example_databricks_workflow_dag", 
start_date=DEFAULT_DATE) as example_dag:
+        with DatabricksWorkflowTaskGroup(
+            group_id="test_databricks_workflow", 
databricks_conn_id="databricks_conn"
+        ) as task_group:
+            task_1 = EmptyOperator(task_id="task1")
+            task_1._convert_to_databricks_workflow_task = 
MagicMock(return_value={})
+        assert task_group.group_id == "test_databricks_workflow"
+        assert task_group.databricks_conn_id == "databricks_conn"
+        assert task_group.dag == example_dag
+
+
+def test_task_group_exit_creates_operator(mock_databricks_workflow_operator):
+    """Test that DatabricksWorkflowTaskGroup creates a 
_CreateDatabricksWorkflowOperator on exit."""
+    with DAG(dag_id="example_databricks_workflow_dag", 
start_date=DEFAULT_DATE) as example_dag:
+        with DatabricksWorkflowTaskGroup(
+            group_id="test_databricks_workflow",
+            databricks_conn_id="databricks_conn",
+        ) as task_group:
+            task1 = MagicMock(task_id="task1")
+            task1._convert_to_databricks_workflow_task = 
MagicMock(return_value={})
+            task2 = MagicMock(task_id="task2")
+            task2._convert_to_databricks_workflow_task = 
MagicMock(return_value={})
+
+            task_group.add(task1)
+            task_group.add(task2)
+
+            task1.set_downstream(task2)
+
+    mock_databricks_workflow_operator.assert_called_once_with(
+        dag=example_dag,
+        task_group=task_group,
+        task_id="launch",
+        databricks_conn_id="databricks_conn",
+        existing_clusters=[],
+        extra_job_params={},
+        job_clusters=[],
+        max_concurrent_runs=1,
+        notebook_params={},
+    )
+
+
+def test_task_group_root_tasks_set_upstream_to_operator(mock_databricks_workflow_operator):
+    """Test that tasks added to a DatabricksWorkflowTaskGroup are set upstream to the operator."""
+    with DAG(dag_id="example_databricks_workflow_dag", start_date=DEFAULT_DATE):
+        with DatabricksWorkflowTaskGroup(
+            group_id="test_databricks_workflow1",
+            databricks_conn_id="databricks_conn",
+        ) as task_group:
+            task1 = MagicMock(task_id="task1")
+            task1._convert_to_databricks_workflow_task = MagicMock(return_value={})
+            task_group.add(task1)
+
+    create_operator_instance = mock_databricks_workflow_operator.return_value
+    task1.set_upstream.assert_called_once_with(create_operator_instance)
diff --git a/tests/system/providers/databricks/example_databricks_workflow.py b/tests/system/providers/databricks/example_databricks_workflow.py
new file mode 100644
index 0000000000..6b05f34684
--- /dev/null
+++ b/tests/system/providers/databricks/example_databricks_workflow.py
@@ -0,0 +1,118 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""Example DAG for using the DatabricksWorkflowTaskGroup and 
DatabricksNotebookOperator."""
+
+from __future__ import annotations
+
+import os
+from datetime import timedelta
+
+from airflow.models.dag import DAG
+from airflow.providers.databricks.operators.databricks import DatabricksNotebookOperator
+from airflow.providers.databricks.operators.databricks_workflow import DatabricksWorkflowTaskGroup
+from airflow.utils.timezone import datetime
+
+EXECUTION_TIMEOUT = int(os.getenv("EXECUTION_TIMEOUT", 6))
+
+DATABRICKS_CONN_ID = os.getenv("DATABRICKS_CONN_ID", "databricks_conn")
+DATABRICKS_NOTIFICATION_EMAIL = os.getenv("DATABRICKS_NOTIFICATION_EMAIL", 
"your_em...@serviceprovider.com")
+
+GROUP_ID = os.getenv("DATABRICKS_GROUP_ID", "1234").replace(".", "_")
+USER = os.environ.get("USER")
+
+job_cluster_spec = [
+    {
+        "job_cluster_key": "Shared_job_cluster",
+        "new_cluster": {
+            "cluster_name": "",
+            "spark_version": "11.3.x-scala2.12",
+            "aws_attributes": {
+                "first_on_demand": 1,
+                "availability": "SPOT_WITH_FALLBACK",
+                "zone_id": "us-east-2b",
+                "spot_bid_price_percent": 100,
+                "ebs_volume_count": 0,
+            },
+            "node_type_id": "i3.xlarge",
+            "spark_env_vars": {"PYSPARK_PYTHON": 
"/databricks/python3/bin/python3"},
+            "enable_elastic_disk": False,
+            "data_security_mode": "LEGACY_SINGLE_USER_STANDARD",
+            "runtime_engine": "STANDARD",
+            "num_workers": 8,
+        },
+    }
+]
+dag = DAG(
+    dag_id="example_databricks_workflow",
+    start_date=datetime(2022, 1, 1),
+    schedule_interval=None,
+    catchup=False,
+    tags=["example", "databricks"],
+)
+with dag:
+    # [START howto_databricks_workflow_notebook]
+    task_group = DatabricksWorkflowTaskGroup(
+        group_id=f"test_workflow_{USER}_{GROUP_ID}",
+        databricks_conn_id=DATABRICKS_CONN_ID,
+        job_clusters=job_cluster_spec,
+        notebook_params={"ts": "{{ ts }}"},
+        notebook_packages=[
+            {
+                "pypi": {
+                    "package": "simplejson==3.18.0",  # Pin specification 
version of a package like this.
+                    "repo": "https://pypi.org/simple";,  # You can specify your 
required Pypi index here.
+                }
+            },
+        ],
+        extra_job_params={
+            "email_notifications": {
+                "on_start": [DATABRICKS_NOTIFICATION_EMAIL],
+            },
+        },
+    )
+    with task_group:
+        notebook_1 = DatabricksNotebookOperator(
+            task_id="workflow_notebook_1",
+            databricks_conn_id=DATABRICKS_CONN_ID,
+            notebook_path="/Shared/Notebook_1",
+            notebook_packages=[{"pypi": {"package": "Faker"}}],
+            source="WORKSPACE",
+            job_cluster_key="Shared_job_cluster",
+            execution_timeout=timedelta(seconds=600),
+        )
+        notebook_2 = DatabricksNotebookOperator(
+            task_id="workflow_notebook_2",
+            databricks_conn_id=DATABRICKS_CONN_ID,
+            notebook_path="/Shared/Notebook_2",
+            source="WORKSPACE",
+            job_cluster_key="Shared_job_cluster",
+            notebook_params={"foo": "bar", "ds": "{{ ds }}"},
+        )
+        notebook_1 >> notebook_2
+    # [END howto_databricks_workflow_notebook]
+
+    from tests.system.utils.watcher import watcher
+
+    # This test needs watcher in order to properly mark success/failure
+    # when "tearDown" task with trigger rule is part of the DAG
+    list(dag.tasks) >> watcher()
+
+from tests.system.utils import get_test_run  # noqa: E402
+
+# Needed to run the example DAG with pytest (see: tests/system/README.md#run_via_pytest)
+test_run = get_test_run(dag)
