This is an automated email from the ASF dual-hosted git repository.
eladkal pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/airflow.git
The following commit(s) were added to refs/heads/main by this push:
new b7a0983b66 docs for `DataprocSubmitJobOperator` (Presto job) (#32798)
b7a0983b66 is described below
commit b7a0983b668ba2d5c817b812daec68943a3d2bc2
Author: max <[email protected]>
AuthorDate: Wed Jul 26 08:51:01 2023 +0200
docs for `DataprocSubmitJobOperator` (Presto job) (#32798)
---
.../operators/cloud/dataproc.rst | 20 ++++
.../cloud/dataproc/example_dataproc_presto.py | 119 +++++++++++++++++++++
2 files changed, 139 insertions(+)
diff --git a/docs/apache-airflow-providers-google/operators/cloud/dataproc.rst
b/docs/apache-airflow-providers-google/operators/cloud/dataproc.rst
index 8144e0bfe4..d13227c135 100644
--- a/docs/apache-airflow-providers-google/operators/cloud/dataproc.rst
+++ b/docs/apache-airflow-providers-google/operators/cloud/dataproc.rst
@@ -75,6 +75,18 @@ With this configuration we can create the cluster:
:start-after: [START how_to_cloud_dataproc_create_cluster_operator_in_gke]
:end-before: [END how_to_cloud_dataproc_create_cluster_operator_in_gke]
+You can also create a Dataproc cluster with the optional Presto component.
+To do so, please use the following configuration.
+Note that the default image might not support the chosen optional component.
+If this is the case, please specify the correct ``image_version``, which you can find in the
+`documentation <https://cloud.google.com/dataproc/docs/concepts/components/overview#available_optional_components>`__.
+
+.. exampleinclude:: /../../tests/system/providers/google/cloud/dataproc/example_dataproc_presto.py
+ :language: python
+ :dedent: 0
+ :start-after: [START how_to_cloud_dataproc_create_cluster]
+ :end-before: [END how_to_cloud_dataproc_create_cluster]
+
You can use deferrable mode for this action in order to run the operator
asynchronously:
.. exampleinclude::
/../../tests/system/providers/google/cloud/dataproc/example_dataproc_cluster_deferrable.py
@@ -239,6 +251,14 @@ Example of the configuration for a SparkR:
:start-after: [START how_to_cloud_dataproc_sparkr_config]
:end-before: [END how_to_cloud_dataproc_sparkr_config]
+Example of the configuration for a Presto Job:
+
+.. exampleinclude:: /../../tests/system/providers/google/cloud/dataproc/example_dataproc_presto.py
+ :language: python
+ :dedent: 0
+ :start-after: [START how_to_cloud_dataproc_presto_config]
+ :end-before: [END how_to_cloud_dataproc_presto_config]
+
Working with workflows templates
--------------------------------
diff --git
a/tests/system/providers/google/cloud/dataproc/example_dataproc_presto.py
b/tests/system/providers/google/cloud/dataproc/example_dataproc_presto.py
new file mode 100644
index 0000000000..48e3d46614
--- /dev/null
+++ b/tests/system/providers/google/cloud/dataproc/example_dataproc_presto.py
@@ -0,0 +1,119 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Example Airflow DAG for DataprocSubmitJobOperator with a Presto job.
+"""
+from __future__ import annotations
+
+import os
+from datetime import datetime
+
+from airflow import models
+from airflow.providers.google.cloud.operators.dataproc import (
+ DataprocCreateClusterOperator,
+ DataprocDeleteClusterOperator,
+ DataprocSubmitJobOperator,
+)
+from airflow.utils.trigger_rule import TriggerRule
+
+ENV_ID = os.environ.get("SYSTEM_TESTS_ENV_ID")
+DAG_ID = "dataproc_presto"
+PROJECT_ID = os.environ.get("SYSTEM_TESTS_GCP_PROJECT")
+
+CLUSTER_NAME = f"cluster-{ENV_ID}-{DAG_ID}".replace("_", "-")
+REGION = "europe-west1"
+
+# Cluster definition
+# [START how_to_cloud_dataproc_create_cluster]
+CLUSTER_CONFIG = {
+ "master_config": {
+ "num_instances": 1,
+ "machine_type_uri": "n1-standard-4",
+ "disk_config": {"boot_disk_type": "pd-standard", "boot_disk_size_gb":
1024},
+ },
+ "worker_config": {
+ "num_instances": 2,
+ "machine_type_uri": "n1-standard-4",
+ "disk_config": {"boot_disk_type": "pd-standard", "boot_disk_size_gb":
1024},
+ },
+ "software_config": {
+ "optional_components": [
+ "PRESTO",
+ ],
+ "image_version": "2.0",
+ },
+}
+# [END how_to_cloud_dataproc_create_cluster]
+
+# Jobs definitions
+# [START how_to_cloud_dataproc_presto_config]
+PRESTO_JOB = {
+ "reference": {"project_id": PROJECT_ID},
+ "placement": {"cluster_name": CLUSTER_NAME},
+ "presto_job": {"query_list": {"queries": ["SHOW CATALOGS"]}},
+}
+# [END how_to_cloud_dataproc_presto_config]
+
+
+with models.DAG(
+ DAG_ID,
+ schedule="@once",
+ start_date=datetime(2021, 1, 1),
+ catchup=False,
+ tags=["example", "dataproc", "presto"],
+) as dag:
+ create_cluster = DataprocCreateClusterOperator(
+ task_id="create_cluster",
+ project_id=PROJECT_ID,
+ cluster_config=CLUSTER_CONFIG,
+ region=REGION,
+ cluster_name=CLUSTER_NAME,
+ )
+
+ presto_task = DataprocSubmitJobOperator(
+ task_id="presto_task", job=PRESTO_JOB, region=REGION,
project_id=PROJECT_ID
+ )
+
+ delete_cluster = DataprocDeleteClusterOperator(
+ task_id="delete_cluster",
+ project_id=PROJECT_ID,
+ cluster_name=CLUSTER_NAME,
+ region=REGION,
+ trigger_rule=TriggerRule.ALL_DONE,
+ )
+
+ (
+ # TEST SETUP
+ create_cluster
+ # TEST BODY
+ >> presto_task
+ # TEST TEARDOWN
+ >> delete_cluster
+ )
+
+ from tests.system.utils.watcher import watcher
+
+ # This test needs the watcher in order to properly mark success/failure
+ # when a "teardown" task with a trigger rule is part of the DAG
+ list(dag.tasks) >> watcher()
+
+
+from tests.system.utils import get_test_run # noqa: E402
+
+# Needed to run the example DAG with pytest (see: tests/system/README.md#run_via_pytest)
+test_run = get_test_run(dag)