mik-laj commented on a change in pull request #5539: [AIRFLOW-4811] Implement GCP DLP' Hook and Operators URL: https://github.com/apache/airflow/pull/5539#discussion_r303125436
########## File path: airflow/contrib/hooks/gcp_dlp_hook.py ########## @@ -0,0 +1,1362 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +This module contains a CloudDLPHook +which allows you to connect to GCP Cloud DLP service. +""" + +import time +from google.cloud.dlp_v2 import DlpServiceClient +from google.cloud.dlp_v2.types import DlpJob + +from airflow import AirflowException +from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook + + +# pylint: disable=R0904, C0302 +class CloudDLPHook(GoogleCloudBaseHook): + """ + Hook for Google Cloud Data Loss Prevention (DLP) APIs. + Cloud DLP allows clients to detect the presence of Personally Identifiable + Information (PII) and other privacy-sensitive data in user-supplied, + unstructured data streams, like text blocks or images. The service also + includes methods for sensitive data redaction and scheduling of data scans + on Google Cloud Platform based data sets. + + :param gcp_conn_id: The connection ID to use when fetching connection info. + :type gcp_conn_id: str + :param delegate_to: The account to impersonate, if any. + For this to work, the service account making the request must have + domain-wide delegation enabled. 
+ :type delegate_to: str + """ + + def __init__(self, + gcp_conn_id="google_cloud_default", + delegate_to=None): + super().__init__(gcp_conn_id, delegate_to) + self._client = None + + def get_conn(self): + """ + Provides a client for interacting with the Cloud DLP API. + + :return: GCP Cloud DLP API Client + :rtype: google.cloud.dlp_v2.DlpServiceClient + """ + if not self._client: + self._client = DlpServiceClient(credentials=self._get_credentials()) + return self._client + + @GoogleCloudBaseHook.catch_http_exception + def cancel_dlp_job( + self, name, retry=None, timeout=None, metadata=None + ): + """ + Starts asynchronous cancellation on a long-running DlpJob. + + :param name: The name of the DlpJob resource to be cancelled. + :type name: str + :param retry: (Optional) A retry object used to retry requests. + If None is specified, requests will not be retried. + :type retry: google.api_core.retry.Retry + :param timeout: (Optional) The amount of time, in seconds, to wait for the request + to complete. Note that if retry is specified, the timeout applies to each + individual attempt. + :type timeout: float + :param metadata: (Optional) Additional metadata that is provided to the method. + :type metadata: sequence[tuple[str, str]]] + """ + + client = self.get_conn() + + if not name: + raise AirflowException( + "Please provide the name of the DlpJob resource to be cancelled." + ) + + client.cancel_dlp_job( + name=name, retry=retry, timeout=timeout, metadata=metadata + ) + + @GoogleCloudBaseHook.catch_http_exception + def create_deidentify_template( + self, + parent, + deidentify_template=None, + template_id=None, + retry=None, + timeout=None, + metadata=None, + ): + """ + Creates a DeidentifyTemplate for re-using frequently used configuration for + de-identifying content, images, and storage. + + :param parent: The parent resource name. + :type parent: str + :param deidentify_template: (Optional) The DeidentifyTemplate to create. 
+ :type deidentify_template: dict or google.cloud.dlp_v2.types.DeidentifyTemplate + :param template_id: (Optional) The template ID. + :type template_id: str + :param retry: (Optional) A retry object used to retry requests. + If None is specified, requests will not be retried. + :type retry: google.api_core.retry.Retry + :param timeout: (Optional) The amount of time, in seconds, to wait for the request + to complete. Note that if retry is specified, the timeout applies to each + individual attempt. + :type timeout: float + :param metadata: (Optional) Additional metadata that is provided to the method. + :type metadata: sequence[tuple[str, str]]] + :rtype: google.cloud.dlp_v2.types.DeidentifyTemplate + """ + + client = self.get_conn() + + if not parent: + raise AirflowException("Please provide the parent resource name.") + + return client.create_deidentify_template( + parent=parent, + deidentify_template=deidentify_template, + template_id=template_id, + retry=retry, + timeout=timeout, + metadata=metadata, + ) + + @GoogleCloudBaseHook.catch_http_exception + def create_dlp_job( + self, + parent, + inspect_job=None, + risk_job=None, + job_id=None, + retry=None, + timeout=None, + metadata=None, + wait_until_finished=True, + polling_interval_in_seconds=60 + ): + """ + Creates a new job to inspect storage or calculate risk metrics. + + :param parent: The parent resource name. + :type parent: str Review comment: This contains the project ID. I think it's worth adding support for setting the project ID by configuring the connection. Please look at: `@GoogleCloudBaseHook.fallback_to_default_project_id` ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services