hussein-awala commented on code in PR #34729: URL: https://github.com/apache/airflow/pull/34729#discussion_r1367713729
########## airflow/io/__init__.py: ########## @@ -0,0 +1,90 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +from __future__ import annotations + +import logging +from typing import ( + TYPE_CHECKING, + Callable, +) + +from fsspec.implementations.local import LocalFileSystem + +from airflow.compat.functools import cache +from airflow.providers_manager import ProvidersManager +from airflow.stats import Stats +from airflow.utils.module_loading import import_string + +if TYPE_CHECKING: + from fsspec import AbstractFileSystem + +log = logging.getLogger(__name__) + + +def _file(_: str | None) -> LocalFileSystem: + return LocalFileSystem() + + +# builtin supported filesystems +_BUILTIN_SCHEME_TO_FS: dict[str, Callable[[str | None], AbstractFileSystem]] = { + "file": _file, +} + + +@cache +def _register_filesystems() -> dict[str, Callable[[str | None], AbstractFileSystem]]: Review Comment: I suggest adding a new configuration to select which providers we want to load filesystems from, with a default value of `"*"`, to reduce the loading time and avoid loading some filesystems that will not be used (for example, when the user installs the google provider only to use ads manager operators, but doesn't use GCS at all). 
########## airflow/providers/common/io/operators/file_transfer.py: ########## @@ -0,0 +1,82 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +from __future__ import annotations + +from typing import TYPE_CHECKING + +from airflow.io.store.path import ObjectStoragePath +from airflow.models import BaseOperator + +if TYPE_CHECKING: + from airflow.utils.context import Context + + +class FileTransferOperator(BaseOperator): Review Comment: I wonder if we should use `ObjectTransferOperator` to be aligned with the other classes (`ObjectStorage` and `ObjectStoragePath`) ########## docs/apache-airflow/core-concepts/objectstorage.rst: ########## @@ -0,0 +1,206 @@ + .. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + .. http://www.apache.org/licenses/LICENSE-2.0 + + .. 
Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + + +.. _concepts:objectstorage: + +Object Storage +============== + +.. versionadded:: 2.8.0 + +Airflow provides a generic abstraction on top of object stores, like s3, gcs, and azure blob storage. +This abstraction allows you to use a variety of object storage systems in your DAGs without having to +change you code to deal with every different object storage system. In addition, it allows you to use +most of the standard Python modules, like ``shutil``, that can work with file-like objects. + +Support for a particular object storage system is dependent on the providers you have installed. For Review Comment: ```suggestion Support for a particular object storage system depends on the providers you have installed. For ``` ########## docs/apache-airflow/core-concepts/objectstorage.rst: ########## @@ -0,0 +1,206 @@ + .. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + .. http://www.apache.org/licenses/LICENSE-2.0 + + .. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + + +.. 
_concepts:objectstorage: + +Object Storage +============== + +.. versionadded:: 2.8.0 + +Airflow provides a generic abstraction on top of object stores, like s3, gcs, and azure blob storage. +This abstraction allows you to use a variety of object storage systems in your DAGs without having to +change you code to deal with every different object storage system. In addition, it allows you to use +most of the standard Python modules, like ``shutil``, that can work with file-like objects. + +Support for a particular object storage system is dependent on the providers you have installed. For +example, if you have installed the ``apache-airflow-providers-google`` provider, you will be able to +use the ``gcs`` scheme for object storage. Out of the box, Airflow provides support for the ``file`` +scheme. + +.. note:: + Support for s3 requires you to install ``apache-airflow-providers-amazon[s3fs]``. This is because + it depends on ``aiobotocore``, which is not installed by default as it can create dependency + challenges with ``botocore``. + + +.. _concepts:basic-use: + +Basic Use +--------- + +To use object storage you instantiate a Path-like (see below) object with the URI of the object you +want to interact with. For example, to point to a bucket in s3 you would do the following: Review Comment: ```suggestion To use object storage, you need to instantiate a Path-like (see below) object with the URI of the object you want to interact with. For example, to point to a bucket in s3, you would do the following: ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@airflow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org