This is an automated email from the ASF dual-hosted git repository.

potiuk pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/airflow.git


The following commit(s) were added to refs/heads/main by this push:
     new 42017f5bd8 Protect against manually updated generated dependencies 
(#37056)
42017f5bd8 is described below

commit 42017f5bd85749673c34fa0ef0f68151502eb50c
Author: Jarek Potiuk <ja...@potiuk.com>
AuthorDate: Sun Jan 28 15:09:32 2024 +0100

    Protect against manually updated generated dependencies (#37056)
    
    The "generated/provider_dependencies.json" is regenerated automatically
    together with pyproject.toml from all the provider.yaml files. The
    file contains information about dependencies for all providers and
    it is used to determine a number of actions when we build providers
    (for example it is used to automatically generate cross-provider
    dependencies, and determine whether provider is ready to be released)
    
    The "dependencies" from provider_dependencies.json are also
    reflected in pyproject.toml file in order to determine what should
    be installed when you install specific editable extra, so pyproject.toml
    is also generated automatically together with
    provider_dependencies.json. All is good when that generation is done
    automatically, but so far, when you updated provider_dependencies.json
    manually, the pyproject.toml was not regenerated - it was actually
    skipped from regeneration.
    
    This PR changes it by storing the hash of provider_dependencies.json (mixed
    with the hash of the script that generates them) in the generated
    pyproject.toml file. This way, both provider_dependencies.json and
    pyproject.toml will always be regenerated by the
    "update_providers_dependencies.py" pre-commit whenever there is any
    inconsistency between provider.yaml files, provider_dependencies.json
    or pyproject.toml.
---
 pyproject.toml                                     | 175 +++++++++++----------
 .../pre_commit_update_providers_dependencies.py    |  30 +++-
 ..._commit_update_providers_dependencies.py.md5sum |   2 +-
 3 files changed, 115 insertions(+), 92 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index a84b4af06f..e82b5897f9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -524,15 +524,16 @@ winrm = [
 # If you want to modify these - modify the corresponding provider.yaml instead.
 
#############################################################################################################
 # START OF GENERATED DEPENDENCIES
-airbyte = [
+# Hash of dependencies: 6e486f782b4745afd1f0f19dabe1253c
+airbyte = [ # source: airflow/providers/airbyte/provider.yaml
   "apache-airflow[http]",
 ]
-alibaba = [
+alibaba = [ # source: airflow/providers/alibaba/provider.yaml
   "alibabacloud_adb20211201>=1.0.0",
   "alibabacloud_tea_openapi>=0.3.7",
   "oss2>=2.14.0",
 ]
-amazon = [
+amazon = [ # source: airflow/providers/amazon/provider.yaml
   "PyAthena>=3.0.10",
   "apache-airflow[common_sql]",
   "apache-airflow[http]",
@@ -554,82 +555,82 @@ amazon = [
   "mypy-boto3-s3>=1.33.0",
   "s3fs>=2023.10.0",
 ]
-apache-beam = [
+apache-beam = [ # source: airflow/providers/apache/beam/provider.yaml
   "apache-beam>=2.53.0",
   "pyarrow>=14.0.1",
 ]
-apache-cassandra = [
+apache-cassandra = [ # source: airflow/providers/apache/cassandra/provider.yaml
   "cassandra-driver>=3.13.0",
 ]
-apache-drill = [
+apache-drill = [ # source: airflow/providers/apache/drill/provider.yaml
   "apache-airflow[common_sql]",
   "sqlalchemy-drill>=1.1.0",
 ]
-apache-druid = [
+apache-druid = [ # source: airflow/providers/apache/druid/provider.yaml
   "apache-airflow[common_sql]",
   "pydruid>=0.4.1",
 ]
-apache-flink = [
+apache-flink = [ # source: airflow/providers/apache/flink/provider.yaml
   "apache-airflow[cncf_kubernetes]",
   "cryptography>=2.0.0",
 ]
-apache-hdfs = [
+apache-hdfs = [ # source: airflow/providers/apache/hdfs/provider.yaml
   "hdfs[avro,dataframe,kerberos]>=2.0.4",
 ]
-apache-hive = [
+apache-hive = [ # source: airflow/providers/apache/hive/provider.yaml
   "apache-airflow[common_sql]",
   "hmsclient>=0.1.0",
   "pandas>=1.2.5",
   "pyhive[hive-pure-sasl]>=0.7.0",
   "thrift>=0.9.2",
 ]
-apache-impala = [
+apache-impala = [ # source: airflow/providers/apache/impala/provider.yaml
   "impyla>=0.18.0,<1.0",
 ]
-apache-kafka = [
+apache-kafka = [ # source: airflow/providers/apache/kafka/provider.yaml
   "asgiref",
   "confluent-kafka>=1.8.2",
 ]
-apache-kylin = [
+apache-kylin = [ # source: airflow/providers/apache/kylin/provider.yaml
   "kylinpy>=2.6",
 ]
-apache-livy = [
+apache-livy = [ # source: airflow/providers/apache/livy/provider.yaml
   "aiohttp",
   "apache-airflow[http]",
   "asgiref",
 ]
-apache-pig = [
+apache-pig = [ # source: airflow/providers/apache/pig/provider.yaml
 ]
-apache-pinot = [
+apache-pinot = [ # source: airflow/providers/apache/pinot/provider.yaml
   "apache-airflow[common_sql]",
   "pinotdb>0.4.7",
 ]
-apache-spark = [
+apache-spark = [ # source: airflow/providers/apache/spark/provider.yaml
   "grpcio-status>=1.59.0",
   "pyspark",
 ]
-apprise = [
+apprise = [ # source: airflow/providers/apprise/provider.yaml
   "apprise",
 ]
-arangodb = [
+arangodb = [ # source: airflow/providers/arangodb/provider.yaml
   "python-arango>=7.3.2",
 ]
-asana = [
+asana = [ # source: airflow/providers/asana/provider.yaml
   "asana>=0.10,<4.0.0",
 ]
-atlassian-jira = [
+atlassian-jira = [ # source: airflow/providers/atlassian/jira/provider.yaml
   "atlassian-python-api>=1.14.2,!=3.41.6",
   "beautifulsoup4",
 ]
-celery = [
+celery = [ # source: airflow/providers/celery/provider.yaml
   "celery>=5.3.0,<6,!=5.3.3,!=5.3.2",
   "flower>=1.0.0",
   "google-re2>=1.0",
 ]
-cloudant = [
+cloudant = [ # source: airflow/providers/cloudant/provider.yaml
   "cloudant>=2.0",
 ]
-cncf-kubernetes = [
+cncf-kubernetes = [ # source: airflow/providers/cncf/kubernetes/provider.yaml
   "aiofiles>=23.2.0",
   "asgiref>=3.5.2",
   "cryptography>=2.0.0",
@@ -637,15 +638,15 @@ cncf-kubernetes = [
   "kubernetes>=21.7.0,<24",
   "kubernetes_asyncio>=18.20.1,<25",
 ]
-cohere = [
+cohere = [ # source: airflow/providers/cohere/provider.yaml
   "cohere>=4.37",
 ]
-common-io = [
+common-io = [ # source: airflow/providers/common/io/provider.yaml
 ]
-common-sql = [
+common-sql = [ # source: airflow/providers/common/sql/provider.yaml
   "sqlparse>=0.4.2",
 ]
-databricks = [
+databricks = [ # source: airflow/providers/databricks/provider.yaml
   "aiohttp>=3.6.3, <4",
   "apache-airflow[common_sql]",
   "databricks-sql-connector>=2.0.0, <3.0.0, !=2.9.0",
@@ -653,48 +654,48 @@ databricks = [
   # Devel dependencies for the databricks provider
   "deltalake>=0.12.0",
 ]
-datadog = [
+datadog = [ # source: airflow/providers/datadog/provider.yaml
   "datadog>=0.14.0",
 ]
-dbt-cloud = [
+dbt-cloud = [ # source: airflow/providers/dbt/cloud/provider.yaml
   "aiohttp",
   "apache-airflow[http]",
   "asgiref",
 ]
-dingding = [
+dingding = [ # source: airflow/providers/dingding/provider.yaml
   "apache-airflow[http]",
 ]
-discord = [
+discord = [ # source: airflow/providers/discord/provider.yaml
   "apache-airflow[http]",
 ]
-docker = [
+docker = [ # source: airflow/providers/docker/provider.yaml
   "docker>=5.0.3",
   "python-dotenv>=0.21.0",
 ]
-elasticsearch = [
+elasticsearch = [ # source: airflow/providers/elasticsearch/provider.yaml
   "apache-airflow[common_sql]",
   "elasticsearch>=8.10,<9",
 ]
-exasol = [
+exasol = [ # source: airflow/providers/exasol/provider.yaml
   "apache-airflow[common_sql]",
   "pandas>=1.2.5",
   "pyexasol>=0.5.1",
 ]
-fab = [
+fab = [ # source: airflow/providers/fab/provider.yaml
   "flask-appbuilder==4.3.10",
   "flask-login>=0.6.2",
   "flask>=2.2,<2.3",
   "google-re2>=1.0",
 ]
-facebook = [
+facebook = [ # source: airflow/providers/facebook/provider.yaml
   "facebook-business>=6.0.2",
 ]
-ftp = [
+ftp = [ # source: airflow/providers/ftp/provider.yaml
 ]
-github = [
+github = [ # source: airflow/providers/github/provider.yaml
   "PyGithub!=1.58",
 ]
-google = [
+google = [ # source: airflow/providers/google/provider.yaml
   "PyOpenSSL",
   "apache-airflow[common_sql]",
   "asgiref>=3.5.2",
@@ -754,34 +755,34 @@ google = [
   "sqlalchemy-bigquery>=1.2.1",
   "sqlalchemy-spanner>=1.6.2",
 ]
-grpc = [
+grpc = [ # source: airflow/providers/grpc/provider.yaml
   "google-auth-httplib2>=0.0.1",
   "google-auth>=1.0.0, <3.0.0",
   "grpcio>=1.15.0",
 ]
-hashicorp = [
+hashicorp = [ # source: airflow/providers/hashicorp/provider.yaml
   "hvac>=1.1.0",
 ]
-http = [
+http = [ # source: airflow/providers/http/provider.yaml
   "aiohttp",
   "asgiref",
   "requests>=2.26.0",
   "requests_toolbelt",
 ]
-imap = [
+imap = [ # source: airflow/providers/imap/provider.yaml
 ]
-influxdb = [
+influxdb = [ # source: airflow/providers/influxdb/provider.yaml
   "influxdb-client>=1.19.0",
   "requests>=2.26.0",
 ]
-jdbc = [
+jdbc = [ # source: airflow/providers/jdbc/provider.yaml
   "apache-airflow[common_sql]",
   "jaydebeapi>=1.1.1",
 ]
-jenkins = [
+jenkins = [ # source: airflow/providers/jenkins/provider.yaml
   "python-jenkins>=1.0.0",
 ]
-microsoft-azure = [
+microsoft-azure = [ # source: airflow/providers/microsoft/azure/provider.yaml
   "adal>=1.2.7",
   "adlfs>=2023.10.0",
   "azure-batch>=8.0.0",
@@ -806,147 +807,147 @@ microsoft-azure = [
   # Devel dependencies for the microsoft.azure provider
   "pywinrm",
 ]
-microsoft-mssql = [
+microsoft-mssql = [ # source: airflow/providers/microsoft/mssql/provider.yaml
   "apache-airflow[common_sql]",
   "pymssql>=2.1.8",
 ]
-microsoft-psrp = [
+microsoft-psrp = [ # source: airflow/providers/microsoft/psrp/provider.yaml
   "pypsrp>=0.8.0",
 ]
-microsoft-winrm = [
+microsoft-winrm = [ # source: airflow/providers/microsoft/winrm/provider.yaml
   "pywinrm>=0.4",
 ]
-mongo = [
+mongo = [ # source: airflow/providers/mongo/provider.yaml
   "dnspython>=1.13.0",
   "pymongo>=3.6.0",
   # Devel dependencies for the mongo provider
   "mongomock",
 ]
-mysql = [
+mysql = [ # source: airflow/providers/mysql/provider.yaml
   "apache-airflow[common_sql]",
   "mysql-connector-python>=8.0.29",
   "mysqlclient>=1.3.6",
 ]
-neo4j = [
+neo4j = [ # source: airflow/providers/neo4j/provider.yaml
   "neo4j>=4.2.1",
 ]
-odbc = [
+odbc = [ # source: airflow/providers/odbc/provider.yaml
   "apache-airflow[common_sql]",
   "pyodbc",
 ]
-openai = [
+openai = [ # source: airflow/providers/openai/provider.yaml
   "openai[datalib]>=1.0",
 ]
-openfaas = [
+openfaas = [ # source: airflow/providers/openfaas/provider.yaml
 ]
-openlineage = [
+openlineage = [ # source: airflow/providers/openlineage/provider.yaml
   "apache-airflow[common_sql]",
   "attrs>=22.2",
   "openlineage-integration-common>=0.28.0",
   "openlineage-python>=0.28.0",
 ]
-opensearch = [
+opensearch = [ # source: airflow/providers/opensearch/provider.yaml
   "opensearch-py>=2.2.0",
 ]
-opsgenie = [
+opsgenie = [ # source: airflow/providers/opsgenie/provider.yaml
   "opsgenie-sdk>=2.1.5",
 ]
-oracle = [
+oracle = [ # source: airflow/providers/oracle/provider.yaml
   "apache-airflow[common_sql]",
   "oracledb>=1.0.0",
 ]
-pagerduty = [
+pagerduty = [ # source: airflow/providers/pagerduty/provider.yaml
   "pdpyras>=4.1.2",
 ]
-papermill = [
+papermill = [ # source: airflow/providers/papermill/provider.yaml
   "ipykernel",
   "papermill[all]>=2.4.0",
   "scrapbook[all]",
 ]
-pgvector = [
+pgvector = [ # source: airflow/providers/pgvector/provider.yaml
   "apache-airflow[postgres]",
   "pgvector>=0.2.3",
 ]
-pinecone = [
+pinecone = [ # source: airflow/providers/pinecone/provider.yaml
   "pinecone-client>=2.2.4,<3.0",
 ]
-postgres = [
+postgres = [ # source: airflow/providers/postgres/provider.yaml
   "apache-airflow[common_sql]",
   "psycopg2-binary>=2.8.0",
 ]
-presto = [
+presto = [ # source: airflow/providers/presto/provider.yaml
   "apache-airflow[common_sql]",
   "pandas>=1.2.5",
   "presto-python-client>=0.8.4",
 ]
-redis = [
+redis = [ # source: airflow/providers/redis/provider.yaml
   "redis>=4.5.2,<5.0.0,!=4.5.5",
 ]
-salesforce = [
+salesforce = [ # source: airflow/providers/salesforce/provider.yaml
   "pandas>=1.2.5",
   "simple-salesforce>=1.0.0",
 ]
-samba = [
+samba = [ # source: airflow/providers/samba/provider.yaml
   "smbprotocol>=1.5.0",
 ]
-segment = [
+segment = [ # source: airflow/providers/segment/provider.yaml
   "analytics-python>=1.2.9",
 ]
-sendgrid = [
+sendgrid = [ # source: airflow/providers/sendgrid/provider.yaml
   "sendgrid>=6.0.0",
 ]
-sftp = [
+sftp = [ # source: airflow/providers/sftp/provider.yaml
   "apache-airflow[ssh]",
   "paramiko>=2.8.0",
 ]
-singularity = [
+singularity = [ # source: airflow/providers/singularity/provider.yaml
   "spython>=0.0.56",
 ]
-slack = [
+slack = [ # source: airflow/providers/slack/provider.yaml
   "apache-airflow[common_sql]",
   "slack_sdk>=3.19.0",
 ]
-smtp = [
+smtp = [ # source: airflow/providers/smtp/provider.yaml
 ]
-snowflake = [
+snowflake = [ # source: airflow/providers/snowflake/provider.yaml
   "apache-airflow[common_sql]",
   "snowflake-connector-python>=2.7.8",
   "snowflake-sqlalchemy>=1.1.0",
 ]
-sqlite = [
+sqlite = [ # source: airflow/providers/sqlite/provider.yaml
   "apache-airflow[common_sql]",
 ]
-ssh = [
+ssh = [ # source: airflow/providers/ssh/provider.yaml
   "paramiko>=2.6.0",
   "sshtunnel>=0.3.2",
 ]
-tableau = [
+tableau = [ # source: airflow/providers/tableau/provider.yaml
   "tableauserverclient",
 ]
-tabular = [
+tabular = [ # source: airflow/providers/tabular/provider.yaml
   # Devel dependencies for the tabular provider
   "pyiceberg>=0.5.0",
 ]
-telegram = [
+telegram = [ # source: airflow/providers/telegram/provider.yaml
   "python-telegram-bot>=20.2",
 ]
-trino = [
+trino = [ # source: airflow/providers/trino/provider.yaml
   "apache-airflow[common_sql]",
   "pandas>=1.2.5",
   "trino>=0.318.0",
 ]
-vertica = [
+vertica = [ # source: airflow/providers/vertica/provider.yaml
   "apache-airflow[common_sql]",
   "vertica-python>=0.5.1",
 ]
-weaviate = [
+weaviate = [ # source: airflow/providers/weaviate/provider.yaml
   "pandas>=1.2.5",
   "weaviate-client>=3.24.2",
 ]
-yandex = [
+yandex = [ # source: airflow/providers/yandex/provider.yaml
   "yandexcloud>=0.228.0",
 ]
-zendesk = [
+zendesk = [ # source: airflow/providers/zendesk/provider.yaml
   "zenpy>=2.0.24",
 ]
 all = [
diff --git a/scripts/ci/pre_commit/pre_commit_update_providers_dependencies.py 
b/scripts/ci/pre_commit/pre_commit_update_providers_dependencies.py
index 50cebee3e5..ca502e0d0b 100755
--- a/scripts/ci/pre_commit/pre_commit_update_providers_dependencies.py
+++ b/scripts/ci/pre_commit/pre_commit_update_providers_dependencies.py
@@ -20,6 +20,7 @@ from __future__ import annotations
 import hashlib
 import json
 import os
+import re
 import sys
 from ast import Import, ImportFrom, NodeVisitor, parse
 from collections import defaultdict
@@ -237,7 +238,10 @@ def generate_dependencies(
     for dependency, dependency_info in dependencies.items():
         if dependency_info["state"] in ["suspended", "removed"]:
             continue
-        result_content.append(f"{normalize_extra(dependency)} = [")
+        result_content.append(
+            f"{normalize_extra(dependency)} = "
+            f"[ # source: airflow/providers/{dependency.replace('.', 
'/')}/provider.yaml"
+        )
         deps = dependency_info["deps"]
         if not isinstance(deps, list):
             raise TypeError(f"Wrong type of 'deps' {deps} for {dependency} in 
{DEPENDENCIES_JSON_FILE_PATH}")
@@ -280,7 +284,7 @@ def get_dependency_type(dependency_type: str) -> 
ParsedDependencyTypes | None:
     return None
 
 
-def update_pyproject_toml(dependencies: dict[str, dict[str, list[str] | str]]):
+def update_pyproject_toml(dependencies: dict[str, dict[str, list[str] | str]], 
dependencies_hash: str):
     file_content = PYPROJECT_TOML_FILE_PATH.read_text()
     result_content: list[str] = []
     copying = True
@@ -291,6 +295,7 @@ def update_pyproject_toml(dependencies: dict[str, dict[str, 
list[str] | str]]):
             result_content.append(line)
         if line.strip().startswith(GENERATED_DEPENDENCIES_START):
             copying = False
+            result_content.append(f"# Hash of dependencies: 
{dependencies_hash}")
             generate_dependencies(result_content, dependencies)
         elif line.strip().startswith(GENERATED_DEPENDENCIES_END):
             copying = True
@@ -325,6 +330,16 @@ def calculate_my_hash():
     return hash_md5.hexdigest()
 
 
+def calculate_dependencies_hash(dependencies: str):
+    my_file = MY_FILE.resolve()
+    hash_md5 = hashlib.md5()
+    hash_md5.update(my_file.read_bytes())
+    hash_md5.update(dependencies.encode(encoding="utf-8"))
+    return hash_md5.hexdigest()
+
+
+HASH_REGEXP = re.compile(r"# Hash of dependencies: (?P<hash>[a-f0-9]+)")
+
 if __name__ == "__main__":
     find_all_providers_and_provider_files()
     num_files = len(ALL_PROVIDER_FILES)
@@ -367,7 +382,14 @@ if __name__ == "__main__":
     new_dependencies = json.dumps(unique_sorted_dependencies, indent=2) + "\n"
     old_md5sum = MY_MD5SUM_FILE.read_text().strip() if MY_MD5SUM_FILE.exists() 
else ""
     new_md5sum = calculate_my_hash()
-    if new_dependencies != old_dependencies or new_md5sum != old_md5sum:
+    find_hash = HASH_REGEXP.findall(PYPROJECT_TOML_FILE_PATH.read_text())
+    dependencies_hash_from_pyproject_toml = find_hash[0] if find_hash else ""
+    dependencies_hash = calculate_dependencies_hash(new_dependencies)
+    if (
+        new_dependencies != old_dependencies
+        or new_md5sum != old_md5sum
+        or dependencies_hash_from_pyproject_toml != dependencies_hash
+    ):
         
DEPENDENCIES_JSON_FILE_PATH.write_text(json.dumps(unique_sorted_dependencies, 
indent=2) + "\n")
         if os.environ.get("CI"):
             console.print()
@@ -386,7 +408,7 @@ if __name__ == "__main__":
             )
             console.print(f"Written {DEPENDENCIES_JSON_FILE_PATH}")
             console.print()
-            update_pyproject_toml(unique_sorted_dependencies)
+            update_pyproject_toml(unique_sorted_dependencies, 
dependencies_hash)
             console.print(f"Written {PYPROJECT_TOML_FILE_PATH}")
             console.print()
             MY_MD5SUM_FILE.write_text(new_md5sum + "\n")
diff --git 
a/scripts/ci/pre_commit/pre_commit_update_providers_dependencies.py.md5sum 
b/scripts/ci/pre_commit/pre_commit_update_providers_dependencies.py.md5sum
index 610f5562c8..0bce5d16b0 100644
--- a/scripts/ci/pre_commit/pre_commit_update_providers_dependencies.py.md5sum
+++ b/scripts/ci/pre_commit/pre_commit_update_providers_dependencies.py.md5sum
@@ -1 +1 @@
-ed25c4f6b220c14b40bbf370fee9388e
+5f442e24a09b079464bde7b552f812d1

Reply via email to