tatiana commented on code in PR #50392:
URL: https://github.com/apache/airflow/pull/50392#discussion_r2084143507


##########
providers/databricks/src/airflow/providers/databricks/utils/openlineage.py:
##########
@@ -0,0 +1,328 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+from __future__ import annotations
+
+import datetime
+import json
+import logging
+from typing import TYPE_CHECKING, Any
+
+import requests
+
+from airflow.providers.common.compat.openlineage.check import 
require_openlineage_version
+from airflow.providers.databricks.version_compat import AIRFLOW_V_3_0_PLUS
+from airflow.utils import timezone
+
+if TYPE_CHECKING:
+    from openlineage.client.event_v2 import RunEvent
+    from openlineage.client.facet_v2 import JobFacet
+
+    from airflow.providers.databricks.hooks.databricks_sql import 
DatabricksSqlHook
+
+
+log = logging.getLogger(__name__)
+
+
def _get_logical_date(task_instance):
    """Return the logical date for *task_instance*, with version-appropriate fallbacks."""
    # todo: remove when min airflow version >= 3.0
    if AIRFLOW_V_3_0_PLUS:
        # On Airflow 3 the dag_run carries the date; run_after is the fallback
        # when logical_date is empty.
        dag_run = task_instance.get_template_context()["dag_run"]
        return dag_run.logical_date or dag_run.run_after

    # Airflow 2.x: newer versions expose logical_date, older ones execution_date.
    if hasattr(task_instance, "logical_date"):
        return task_instance.logical_date
    return task_instance.execution_date
+
+
def _get_dag_run_clear_number(task_instance):
    """Return the clear_number of the dag run that *task_instance* belongs to."""
    # todo: remove when min airflow version >= 3.0
    if not AIRFLOW_V_3_0_PLUS:
        return task_instance.dag_run.clear_number
    context = task_instance.get_template_context()
    return context["dag_run"].clear_number
+
+
+# todo: move this run_id logic into OpenLineage's listener to avoid differences
def _get_ol_run_id(task_instance) -> str:
    """
    Build the OpenLineage run_id for the given TaskInstance.

    The id has to be produced exactly the way OpenLineage's listener produces
    it; only then do the events emitted here line up with the events emitted
    for the Airflow task itself, giving a proper parent/child connection.
    """
    from airflow.providers.openlineage.plugins.adapter import OpenLineageAdapter

    # Reproduce the listener's run-id derivation for this task instance.
    run_id = OpenLineageAdapter.build_task_instance_run_id(
        dag_id=task_instance.dag_id,
        task_id=task_instance.task_id,
        try_number=task_instance.try_number,
        map_index=task_instance.map_index,
        logical_date=_get_logical_date(task_instance),
    )
    return run_id
+
+
+# todo: move this run_id logic into OpenLineage's listener to avoid differences
def _get_ol_dag_run_id(task_instance) -> str:
    """Build the OpenLineage run_id of the dag run owning *task_instance*."""
    from airflow.providers.openlineage.plugins.adapter import OpenLineageAdapter

    dag_run_id = OpenLineageAdapter.build_dag_run_id(
        dag_id=task_instance.dag_id,
        clear_number=_get_dag_run_clear_number(task_instance),
        logical_date=_get_logical_date(task_instance),
    )
    return dag_run_id
+
+
def _get_parent_run_facet(task_instance):
    """
    Retrieve the ParentRunFacet associated with a specific Airflow task instance.

    This facet helps link OpenLineage events of child jobs - such as queries executed within
    external systems (e.g., Databricks) by the Airflow task - to the original Airflow task execution.
    Establishing this connection enables better lineage tracking and observability.
    """
    from openlineage.client.facet_v2 import parent_run

    from airflow.providers.openlineage.conf import namespace

    ol_namespace = namespace()
    parent_job_name = f"{task_instance.dag_id}.{task_instance.task_id}"

    # The direct parent is the task instance; the root is the dag run itself.
    parent = parent_run.Run(runId=_get_ol_run_id(task_instance))
    root = parent_run.Root(
        run=parent_run.RootRun(runId=_get_ol_dag_run_id(task_instance)),
        job=parent_run.RootJob(name=task_instance.dag_id, namespace=ol_namespace),
    )

    return parent_run.ParentRunFacet(
        run=parent,
        job=parent_run.Job(namespace=ol_namespace, name=parent_job_name),
        root=root,
    )
+
+
def _run_api_call(hook: DatabricksSqlHook, query_ids: list[str]) -> list[dict]:
    """
    Retrieve execution details for specific queries from Databricks's query history API.

    :param hook: Databricks SQL hook providing the host and the auth token.
    :param query_ids: Statement IDs to look up in the query history.
    :return: List of per-query detail dicts, or an empty list when the call fails.
    """
    if not hook._token:
        # get_conn() has the side effect of initializing the auth token.
        hook.get_conn()

    # https://docs.databricks.com/api/azure/workspace/queryhistory/list
    response = requests.get(
        url=f"https://{hook.host}/api/2.0/sql/history/queries",
        headers={"Authorization": f"Bearer {hook._token}"},
        # `json=` also sets the Content-Type header, unlike `data=json.dumps(...)`.
        json={"filter_by": {"statement_ids": query_ids}},
        # Always bound the request so a slow/unresponsive API cannot hang the worker.
        timeout=30,
    )
    if response.status_code != 200:
        log.warning(
            "OpenLineage could not retrieve Databricks queries details. API error received: `%s`: `%s`",
            response.status_code,
            response.text,
        )
        return []

    # A 200 response may omit `res` entirely when there are no matching queries.
    return response.json().get("res", [])
+
+
+def _get_queries_details_from_databricks(

Review Comment:
   How much does the `DatabricksSqlHook`'s performance degrade when we additionally 
fetch this metadata from Databricks?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@airflow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to