Taragolis commented on code in PR #35037: URL: https://github.com/apache/airflow/pull/35037#discussion_r1364518573
########## tests/providers/amazon/aws/hooks/test_s3.py: ########## @@ -1047,6 +1047,33 @@ def test_download_file_with_preserve_name_file_already_exists(self, tmp_path): use_autogenerated_subdir=False, ) + @mock.patch.object(S3Hook, "get_session") + def test_download_file_with_extra_args_sanitizes_values(self, mock_session): + bucket = "test_bucket" + s3_key = "test_key" + encryption_key = "abcd123" + + s3_hook = S3Hook( + extra_args={ + "SSECustomerKey": encryption_key, + "SSECustomerAlgorithm": "AES256", + "invalid_arg": "should be dropped", + } + ) + + mock_obj = Mock() + mock_resource = Mock() + mock_resource.resource.return_value = mock_obj + mock_session.return_value = mock_resource + + s3_hook.download_file(key=s3_key, bucket_name=bucket) + + mock_obj.load.assert_called_once_with( + key=s3_key, + bucket_name=bucket, + extra_args={"SSECustomerKey": encryption_key, "SSECustomerAlgorithm": "AES256"}, + ) Review Comment: ```suggestion mock_obj = Mock(name="MockedS3Object") mock_resource = Mock(name="MockedBoto3Resource") mock_resource.return_value.Object = mock_obj mock_session.return_value.resource = mock_resource s3_hook.download_file(key=s3_key, bucket_name=bucket) mock_obj.assert_called_once_with(bucket, s3_key) mock_obj.return_value.load.assert_called_once_with( SSECustomerKey=encryption_key, SSECustomerAlgorithm="AES256" ) ``` ########## airflow/providers/amazon/aws/hooks/s3.py: ########## @@ -912,14 +912,27 @@ def get_key(self, key: str, bucket_name: str | None = None) -> S3ResourceObject: :param bucket_name: the name of the bucket :return: the key object from the bucket """ + + def __sanitize_extra_args() -> dict[str, str]: + """Parse extra_args and return a dict with only the args listed in ALLOWED_DOWNLOAD_ARGS.""" + return { + arg_name: arg_value + for (arg_name, arg_value) in self.extra_args.items() + if arg_name in S3Transfer(self.conn).ALLOWED_DOWNLOAD_ARGS Review Comment: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/customizations/s3.html#boto3.s3.transfer.S3Transfer.ALLOWED_DOWNLOAD_ARGS ```suggestion if arg_name in S3Transfer.ALLOWED_DOWNLOAD_ARGS ``` ########## airflow/providers/amazon/aws/hooks/s3.py: ########## @@ -912,14 +912,27 @@ def get_key(self, key: str, bucket_name: str | None = None) -> S3ResourceObject: :param bucket_name: the name of the bucket :return: the key object from the bucket """ + + def __sanitize_extra_args() -> dict[str, str]: + """Parse extra_args and return a dict with only the args listed in ALLOWED_DOWNLOAD_ARGS.""" + return { + arg_name: arg_value + for (arg_name, arg_value) in self.extra_args.items() + if arg_name in S3Transfer(self.conn).ALLOWED_DOWNLOAD_ARGS + } + s3_resource = self.get_session().resource( "s3", endpoint_url=self.conn_config.endpoint_url, config=self.config, verify=self.verify, ) obj = s3_resource.Object(bucket_name, key) - obj.load() + + # TODO inline this after debugging + new_args = __sanitize_extra_args() + + obj.load(**new_args) Review Comment: According to documentation [S3.Object.load()](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3/object/load.html#load) doesn't support any of the parameters, see comment from previous attempt to fix it: https://github.com/apache/airflow/pull/25835#discussion_r950511996. However I thought it might support this arguments by non obvious way https://github.com/boto/boto3/blob/4c34032a4dfe5371bb7ab808c65e3187398d5153/boto3/resources/factory.py#L558-L575 . So I guess you tried to use it on actual AWS environment and it works, am I right? I think the bigger problem, that we tried to use here High-level Client (aka resource) and which behaviour is unpredictable. IMO we should try to replace where it possible s3 resource by s3 client, which is more flexible and at least predictable, sometime in the future. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@airflow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org