This is an automated email from the ASF dual-hosted git repository. eladkal pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/airflow.git
The following commit(s) were added to refs/heads/main by this push: new 681859c7bf Change default `parquet_row_group_size` in `BaseSQLToGCSOperator` (#36817) 681859c7bf is described below commit 681859c7bffabce0c294060d811db2fb16851816 Author: Renze Post <renze.p...@gmail.com> AuthorDate: Thu Jan 18 15:20:12 2024 +0100 Change default `parquet_row_group_size` in `BaseSQLToGCSOperator` (#36817) * Change default parquet_row_group_size in BaseSQLToGCSOperator * Add change to changelog * Added a better change description * Remove unnecessary extra newline * Applied suggested changes Co-authored-by: Andrey Anshin <andrey.ans...@taragol.is> * Applied suggested changes Co-authored-by: Elad Kalif <45845474+elad...@users.noreply.github.com> --------- Co-authored-by: Andrey Anshin <andrey.ans...@taragol.is> Co-authored-by: Elad Kalif <45845474+elad...@users.noreply.github.com> --- airflow/providers/google/CHANGELOG.rst | 10 ++++++++++ airflow/providers/google/cloud/transfers/sql_to_gcs.py | 4 ++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/airflow/providers/google/CHANGELOG.rst b/airflow/providers/google/CHANGELOG.rst index 3187a397b4..9249209235 100644 --- a/airflow/providers/google/CHANGELOG.rst +++ b/airflow/providers/google/CHANGELOG.rst @@ -27,6 +27,16 @@ Changelog --------- +.. note:: + The default value of ``parquet_row_group_size`` in ``BaseSQLToGCSOperator`` has changed from 1 to + 100000, in order to have a default that provides better compression efficiency and performance of + reading the data in the output Parquet files. In many cases, the previous value of 1 resulted in + very large files, long task durations and out of memory issues. A default value of 100000 may require + more memory to execute the operator, in which case users can override the ``parquet_row_group_size`` + parameter in the operator. 
All operators that are derived from ``BaseSQLToGCSOperator`` are affected + when ``export_format`` is ``parquet``: ``MySQLToGCSOperator``, ``PrestoToGCSOperator``, + ``OracleToGCSOperator``, ``TrinoToGCSOperator``, ``MSSQLToGCSOperator`` and ``PostgresToGCSOperator``. Due to the above, we treat this change as a bug fix. + 10.13.1 ....... diff --git a/airflow/providers/google/cloud/transfers/sql_to_gcs.py b/airflow/providers/google/cloud/transfers/sql_to_gcs.py index dcadaf7859..1529430c97 100644 --- a/airflow/providers/google/cloud/transfers/sql_to_gcs.py +++ b/airflow/providers/google/cloud/transfers/sql_to_gcs.py @@ -85,7 +85,7 @@ class BaseSQLToGCSOperator(BaseOperator): :param parquet_row_group_size: The approximate number of rows in each row group when using parquet format. Using a large row group size can reduce the file size and improve the performance of reading the data, but it needs more memory to - execute the operator. (default: 1) + execute the operator. (default: 100000) """ template_fields: Sequence[str] = ( @@ -123,7 +123,7 @@ class BaseSQLToGCSOperator(BaseOperator): exclude_columns: set | None = None, partition_columns: list | None = None, write_on_empty: bool = False, - parquet_row_group_size: int = 1, + parquet_row_group_size: int = 100000, **kwargs, ) -> None: super().__init__(**kwargs)