This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new 513f78b47c Support create object store source tables without depending 
on environment variables (#5732)
513f78b47c is described below

commit 513f78b47c2105d0849bb8d7f9d33d6928338315
Author: r.4ntix <[email protected]>
AuthorDate: Thu Apr 6 03:02:27 2023 +0800

    Support create object store source tables without depending on environment 
variables (#5732)
    
    * feat: support to obtain necessary information when creating an object 
store table through ListingTableFactory
    
    * fix: enable "aws" feature of object_store
    
    * fix: format Cargo.toml
    
    * chore: update cli docs
    
    * fix: format cli.md
    
    * chore: move code for registering the object store dynamically out of core 
and into datafusion-cli
    
    * chore: improve documentation of cli
    
    * chore: fix typos in doc
    
    * chore: isolate the object store builder to a separate file, add unit tests
    
    * chore: improve docs
---
 datafusion-cli/src/exec.rs                         | 175 ++++++++++++++-
 datafusion-cli/src/main.rs                         |   5 +-
 datafusion-cli/src/object_storage.rs               | 235 +++++++++------------
 datafusion/core/Cargo.toml                         |   6 +-
 .../core/src/datasource/listing_table_factory.rs   |  14 +-
 datafusion/core/src/execution/context.rs           |   5 +
 docs/source/user-guide/cli.md                      | 110 ++++++++--
 7 files changed, 375 insertions(+), 175 deletions(-)

diff --git a/datafusion-cli/src/exec.rs b/datafusion-cli/src/exec.rs
index abf2250a35..3234ea000f 100644
--- a/datafusion-cli/src/exec.rs
+++ b/datafusion-cli/src/exec.rs
@@ -20,16 +20,26 @@
 use crate::{
     command::{Command, OutputFormat},
     helper::CliHelper,
+    object_storage::{
+        get_gcs_object_store_builder, get_oss_object_store_builder,
+        get_s3_object_store_builder,
+    },
     print_options::PrintOptions,
 };
-use datafusion::error::Result;
-use datafusion::prelude::SessionContext;
+use datafusion::{
+    datasource::listing::ListingTableUrl,
+    error::{DataFusionError, Result},
+    logical_expr::CreateExternalTable,
+};
+use datafusion::{logical_expr::LogicalPlan, prelude::SessionContext};
+use object_store::ObjectStore;
 use rustyline::error::ReadlineError;
 use rustyline::Editor;
-use std::fs::File;
 use std::io::prelude::*;
 use std::io::BufReader;
 use std::time::Instant;
+use std::{fs::File, sync::Arc};
+use url::Url;
 
 /// run and execute SQL statements and commands from a file, against a context 
with the given print options
 pub async fn exec_from_lines(
@@ -165,9 +175,166 @@ async fn exec_and_print(
     sql: String,
 ) -> Result<()> {
     let now = Instant::now();
-    let df = ctx.sql(&sql).await?;
+
+    let plan = ctx.state().create_logical_plan(&sql).await?;
+    let df = match &plan {
+        LogicalPlan::CreateExternalTable(cmd) => {
+            create_external_table(&ctx, cmd)?;
+            ctx.execute_logical_plan(plan).await?
+        }
+        _ => ctx.execute_logical_plan(plan).await?,
+    };
+
     let results = df.collect().await?;
     print_options.print_batches(&results, now)?;
 
     Ok(())
 }
+
+fn create_external_table(ctx: &SessionContext, cmd: &CreateExternalTable) -> 
Result<()> {
+    let table_path = ListingTableUrl::parse(&cmd.location)?;
+    let scheme = table_path.scheme();
+    let url: &Url = table_path.as_ref();
+
+    // registering the cloud object store dynamically using cmd.options
+    let store = match scheme {
+        "s3" => {
+            let builder = get_s3_object_store_builder(url, cmd)?;
+            Arc::new(builder.build()?) as Arc<dyn ObjectStore>
+        }
+        "oss" => {
+            let builder = get_oss_object_store_builder(url, cmd)?;
+            Arc::new(builder.build()?) as Arc<dyn ObjectStore>
+        }
+        "gs" | "gcs" => {
+            let builder = get_gcs_object_store_builder(url, cmd)?;
+            Arc::new(builder.build()?) as Arc<dyn ObjectStore>
+        }
+        _ => {
+            // for other types, try to get from the object_store_registry
+            ctx.runtime_env()
+                .object_store_registry
+                .get_store(url)
+                .map_err(|_| {
+                    DataFusionError::Execution(format!(
+                        "Unsupported object store scheme: {}",
+                        scheme
+                    ))
+                })?
+        }
+    };
+
+    ctx.runtime_env().register_object_store(url, store);
+
+    Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    async fn create_external_table_test(location: &str, sql: &str) -> 
Result<()> {
+        let ctx = SessionContext::new();
+        let plan = ctx.state().create_logical_plan(&sql).await?;
+
+        match &plan {
+            LogicalPlan::CreateExternalTable(cmd) => {
+                create_external_table(&ctx, cmd)?;
+            }
+            _ => assert!(false),
+        };
+
+        ctx.runtime_env()
+            .object_store(ListingTableUrl::parse(location)?)?;
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn create_object_store_table_s3() -> Result<()> {
+        let access_key_id = "fake_access_key_id";
+        let secret_access_key = "fake_secret_access_key";
+        let region = "fake_us-east-2";
+        let session_token = "fake_session_token";
+        let location = "s3://bucket/path/file.parquet";
+
+        // Missing region
+        let sql = format!("CREATE EXTERNAL TABLE test STORED AS PARQUET
+            OPTIONS('access_key_id' '{access_key_id}', 'secret_access_key' 
'{secret_access_key}') LOCATION '{location}'");
+        let err = create_external_table_test(location, &sql)
+            .await
+            .unwrap_err();
+        assert!(err.to_string().contains("Missing region"));
+
+        // Should be OK
+        let sql = format!("CREATE EXTERNAL TABLE test STORED AS PARQUET
+            OPTIONS('access_key_id' '{access_key_id}', 'secret_access_key' 
'{secret_access_key}', 'region' '{region}', 'session_token' '{session_token}') 
LOCATION '{location}'");
+        create_external_table_test(location, &sql).await?;
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn create_object_store_table_oss() -> Result<()> {
+        let access_key_id = "fake_access_key_id";
+        let secret_access_key = "fake_secret_access_key";
+        let endpoint = "fake_endpoint";
+        let location = "oss://bucket/path/file.parquet";
+
+        // Should be OK
+        let sql = format!("CREATE EXTERNAL TABLE test STORED AS PARQUET
+            OPTIONS('access_key_id' '{access_key_id}', 'secret_access_key' 
'{secret_access_key}', 'endpoint' '{endpoint}') LOCATION '{location}'");
+        create_external_table_test(location, &sql).await?;
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn create_object_store_table_gcs() -> Result<()> {
+        let service_account_path = "fake_service_account_path";
+        let service_account_key =
+            "{\"private_key\": 
\"fake_private_key.pem\",\"client_email\":\"fake_client_email\"}";
+        let application_credentials_path = "fake_application_credentials_path";
+        let location = "gcs://bucket/path/file.parquet";
+
+        // for service_account_path
+        let sql = format!("CREATE EXTERNAL TABLE test STORED AS PARQUET
+            OPTIONS('service_account_path' '{service_account_path}') LOCATION 
'{location}'");
+        let err = create_external_table_test(location, &sql)
+            .await
+            .unwrap_err();
+        assert!(err.to_string().contains("No such file or directory"));
+
+        // for service_account_key
+        let sql = format!("CREATE EXTERNAL TABLE test STORED AS PARQUET 
OPTIONS('service_account_key' '{service_account_key}') LOCATION '{location}'");
+        let err = create_external_table_test(location, &sql)
+            .await
+            .unwrap_err();
+        assert!(err.to_string().contains("No RSA key found in pem file"));
+
+        // for application_credentials_path
+        let sql = format!("CREATE EXTERNAL TABLE test STORED AS PARQUET
+            OPTIONS('application_credentials_path' 
'{application_credentials_path}') LOCATION '{location}'");
+        let err = create_external_table_test(location, &sql)
+            .await
+            .unwrap_err();
+        assert!(err.to_string().contains("A configuration file was passed"));
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn create_external_table_local_file() -> Result<()> {
+        let location = "/path/to/file.parquet";
+
+        // Ensure that local files are also registered
+        let sql =
+            format!("CREATE EXTERNAL TABLE test STORED AS PARQUET LOCATION 
'{location}'");
+        let err = create_external_table_test(location, &sql)
+            .await
+            .unwrap_err();
+        assert!(err.to_string().contains("No such file or directory"));
+
+        Ok(())
+    }
+}
diff --git a/datafusion-cli/src/main.rs b/datafusion-cli/src/main.rs
index 8061f4d516..4c1dd2f94e 100644
--- a/datafusion-cli/src/main.rs
+++ b/datafusion-cli/src/main.rs
@@ -21,7 +21,6 @@ use datafusion::execution::context::SessionConfig;
 use datafusion::execution::runtime_env::{RuntimeConfig, RuntimeEnv};
 use datafusion::prelude::SessionContext;
 use datafusion_cli::catalog::DynamicFileCatalog;
-use datafusion_cli::object_storage::DatafusionCliObjectStoreRegistry;
 use datafusion_cli::{
     exec, print_format::PrintFormat, print_options::PrintOptions, 
DATAFUSION_CLI_VERSION,
 };
@@ -148,9 +147,7 @@ pub async fn main() -> Result<()> {
 }
 
 fn create_runtime_env() -> Result<RuntimeEnv> {
-    let object_store_registry = DatafusionCliObjectStoreRegistry::new();
-    let rn_config =
-        
RuntimeConfig::new().with_object_store_registry(Arc::new(object_store_registry));
+    let rn_config = RuntimeConfig::new();
     RuntimeEnv::new(rn_config)
 }
 
diff --git a/datafusion-cli/src/object_storage.rs 
b/datafusion-cli/src/object_storage.rs
index 229a1571f0..87018bbe15 100644
--- a/datafusion-cli/src/object_storage.rs
+++ b/datafusion-cli/src/object_storage.rs
@@ -15,100 +15,95 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use datafusion::error::Result;
-use std::{env, str::FromStr, sync::Arc};
-
-use datafusion::datasource::object_store::{
-    DefaultObjectStoreRegistry, ObjectStoreRegistry,
+use datafusion::{
+    error::{DataFusionError, Result},
+    logical_expr::CreateExternalTable,
 };
-use datafusion::error::DataFusionError;
-use object_store::{aws::AmazonS3Builder, gcp::GoogleCloudStorageBuilder, 
ObjectStore};
+use object_store::{aws::AmazonS3Builder, gcp::GoogleCloudStorageBuilder};
 use url::Url;
 
-#[derive(Debug, PartialEq, Eq, clap::ArgEnum, Clone)]
-pub enum ObjectStoreScheme {
-    S3,
-    GCS,
-}
+pub fn get_s3_object_store_builder(
+    url: &Url,
+    cmd: &CreateExternalTable,
+) -> Result<AmazonS3Builder> {
+    let bucket_name = get_bucket_name(url)?;
+    let mut builder = 
AmazonS3Builder::from_env().with_bucket_name(bucket_name);
+
+    if let (Some(access_key_id), Some(secret_access_key)) = (
+        cmd.options.get("access_key_id"),
+        cmd.options.get("secret_access_key"),
+    ) {
+        builder = builder
+            .with_access_key_id(access_key_id)
+            .with_secret_access_key(secret_access_key);
+    }
 
-impl FromStr for ObjectStoreScheme {
-    type Err = DataFusionError;
-
-    fn from_str(input: &str) -> Result<Self> {
-        match input {
-            "s3" => Ok(ObjectStoreScheme::S3),
-            "gs" | "gcs" => Ok(ObjectStoreScheme::GCS),
-            _ => Err(DataFusionError::Execution(format!(
-                "Unsupported object store scheme {}",
-                input
-            ))),
-        }
+    if let Some(session_token) = cmd.options.get("session_token") {
+        builder = builder.with_token(session_token);
     }
-}
 
-/// An [`ObjectStoreRegistry`] that can automatically create S3 and GCS stores 
for a given URL
-#[derive(Debug, Default)]
-pub struct DatafusionCliObjectStoreRegistry {
-    inner: DefaultObjectStoreRegistry,
+    if let Some(region) = cmd.options.get("region") {
+        builder = builder.with_region(region);
+    }
+
+    Ok(builder)
 }
 
-impl DatafusionCliObjectStoreRegistry {
-    pub fn new() -> Self {
-        Default::default()
+pub fn get_oss_object_store_builder(
+    url: &Url,
+    cmd: &CreateExternalTable,
+) -> Result<AmazonS3Builder> {
+    let bucket_name = get_bucket_name(url)?;
+    let mut builder = AmazonS3Builder::from_env()
+        .with_virtual_hosted_style_request(true)
+        .with_bucket_name(bucket_name)
+        // oss don't care about the "region" field
+        .with_region("do_not_care");
+
+    if let (Some(access_key_id), Some(secret_access_key)) = (
+        cmd.options.get("access_key_id"),
+        cmd.options.get("secret_access_key"),
+    ) {
+        builder = builder
+            .with_access_key_id(access_key_id)
+            .with_secret_access_key(secret_access_key);
     }
-}
 
-impl ObjectStoreRegistry for DatafusionCliObjectStoreRegistry {
-    fn register_store(
-        &self,
-        url: &Url,
-        store: Arc<dyn ObjectStore>,
-    ) -> Option<Arc<dyn ObjectStore>> {
-        self.inner.register_store(url, store)
+    if let Some(endpoint) = cmd.options.get("endpoint") {
+        builder = builder.with_endpoint(endpoint);
     }
 
-    fn get_store(&self, url: &Url) -> Result<Arc<dyn ObjectStore>> {
-        self.inner.get_store(url).or_else(|_| {
-            let store =
-                ObjectStoreScheme::from_str(url.scheme()).map(
-                    |scheme| match scheme {
-                        ObjectStoreScheme::S3 => build_s3_object_store(url),
-                        ObjectStoreScheme::GCS => build_gcs_object_store(url),
-                    },
-                )??;
+    Ok(builder)
+}
 
-            self.inner.register_store(url, store.clone());
+pub fn get_gcs_object_store_builder(
+    url: &Url,
+    cmd: &CreateExternalTable,
+) -> Result<GoogleCloudStorageBuilder> {
+    let bucket_name = get_bucket_name(url)?;
+    let mut builder = 
GoogleCloudStorageBuilder::from_env().with_bucket_name(bucket_name);
 
-            Ok(store)
-        })
+    if let Some(service_account_path) = 
cmd.options.get("service_account_path") {
+        builder = builder.with_service_account_path(service_account_path);
     }
-}
 
-fn build_s3_object_store(url: &Url) -> Result<Arc<dyn ObjectStore>> {
-    let host = get_host_name(url)?;
-    match AmazonS3Builder::from_env().with_bucket_name(host).build() {
-        Ok(s3) => Ok(Arc::new(s3)),
-        Err(err) => Err(DataFusionError::External(Box::new(err))),
+    if let Some(service_account_key) = cmd.options.get("service_account_key") {
+        builder = builder.with_service_account_key(service_account_key);
     }
-}
 
-fn build_gcs_object_store(url: &Url) -> Result<Arc<dyn ObjectStore>> {
-    let host = get_host_name(url)?;
-    let mut builder = GoogleCloudStorageBuilder::new().with_bucket_name(host);
-
-    if let Ok(path) = env::var("GCP_SERVICE_ACCOUNT_PATH") {
-        builder = builder.with_service_account_path(path);
-    }
-    match builder.build() {
-        Ok(gcs) => Ok(Arc::new(gcs)),
-        Err(err) => Err(DataFusionError::External(Box::new(err))),
+    if let Some(application_credentials_path) =
+        cmd.options.get("application_credentials_path")
+    {
+        builder = 
builder.with_application_credentials(application_credentials_path);
     }
+
+    Ok(builder)
 }
 
-fn get_host_name(url: &Url) -> Result<&str> {
+fn get_bucket_name(url: &Url) -> Result<&str> {
     url.host_str().ok_or_else(|| {
         DataFusionError::Execution(format!(
-            "Not able to parse hostname from url, {}",
+            "Not able to parse bucket name from url: {}",
             url.as_str()
         ))
     })
@@ -116,76 +111,34 @@ fn get_host_name(url: &Url) -> Result<&str> {
 
 #[cfg(test)]
 mod tests {
-    use std::{env, str::FromStr};
-
-    use datafusion::datasource::object_store::ObjectStoreRegistry;
-    use url::Url;
-
-    use super::DatafusionCliObjectStoreRegistry;
-
-    #[test]
-    fn s3_provider_no_host() {
-        let no_host_url = "s3:///";
-        let registry = DatafusionCliObjectStoreRegistry::new();
-        let err = registry
-            .get_store(&Url::from_str(no_host_url).unwrap())
-            .unwrap_err();
-        assert!(err
-            .to_string()
-            .contains("Not able to parse hostname from url"))
-    }
-
-    #[test]
-    fn gs_provider_no_host() {
-        let no_host_url = "gs:///";
-        let registry = DatafusionCliObjectStoreRegistry::new();
-        let err = registry
-            .get_store(&Url::from_str(no_host_url).unwrap())
-            .unwrap_err();
-        assert!(err
-            .to_string()
-            .contains("Not able to parse hostname from url"))
-    }
-
-    #[test]
-    fn gcs_provider_no_host() {
-        let no_host_url = "gcs:///";
-        let registry = DatafusionCliObjectStoreRegistry::new();
-        let err = registry
-            .get_store(&Url::from_str(no_host_url).unwrap())
-            .unwrap_err();
-        assert!(err
-            .to_string()
-            .contains("Not able to parse hostname from url"))
-    }
-
-    #[test]
-    fn unknown_object_store_type() {
-        let unknown = "unknown://bucket_name/path";
-        let registry = DatafusionCliObjectStoreRegistry::new();
-        let err = registry
-            .get_store(&Url::from_str(unknown).unwrap())
-            .unwrap_err();
-        assert!(err
-            .to_string()
-            .contains("Unsupported object store scheme unknown"))
-    }
+    use datafusion::{
+        datasource::listing::ListingTableUrl, logical_expr::LogicalPlan,
+        prelude::SessionContext,
+    };
+
+    use super::*;
+
+    #[ignore] // https://github.com/apache/arrow-rs/issues/4021
+    #[tokio::test]
+    async fn oss_object_store_builder() -> Result<()> {
+        let access_key_id = "access_key_id";
+        let secret_access_key = "secret_access_key";
+        let region = "us-east-2";
+        let location = "s3://bucket/path/file.parquet";
+        let table_url = ListingTableUrl::parse(location)?;
+        let sql = format!("CREATE EXTERNAL TABLE test STORED AS PARQUET 
OPTIONS('access_key_id' '{access_key_id}', 'secret_access_key' 
'{secret_access_key}', 'region' '{region}') LOCATION '{location}'");
+
+        let ctx = SessionContext::new();
+        let plan = ctx.state().create_logical_plan(&sql).await?;
+
+        match &plan {
+            LogicalPlan::CreateExternalTable(cmd) => {
+                let _builder = 
get_oss_object_store_builder(table_url.as_ref(), cmd)?;
+                // get the actual configuration information, then assert_eq!
+            }
+            _ => assert!(false),
+        }
 
-    #[test]
-    fn s3_region_validation() {
-        let s3 = "s3://bucket_name/path";
-        let registry = DatafusionCliObjectStoreRegistry::new();
-        let err = registry.get_store(&Url::from_str(s3).unwrap()).unwrap_err();
-        assert!(err.to_string().contains("Generic S3 error: Missing region"));
-
-        env::set_var("AWS_REGION", "us-east-1");
-        let url = Url::from_str(s3).expect("Unable to parse s3 url");
-        let res = registry.get_store(&url);
-        let msg = match res {
-            Err(e) => format!("{e}"),
-            Ok(_) => "".to_string(),
-        };
-        assert_eq!("".to_string(), msg); // Fail with error message
-        env::remove_var("AWS_REGION");
+        Ok(())
     }
 }
diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml
index df2a964a18..c9639864e2 100644
--- a/datafusion/core/Cargo.toml
+++ b/datafusion/core/Cargo.toml
@@ -19,11 +19,7 @@
 name = "datafusion"
 description = "DataFusion is an in-memory query engine that uses Apache Arrow 
as the memory model"
 keywords = ["arrow", "query", "sql"]
-include = [
-    "benches/*.rs",
-    "src/**/*.rs",
-    "Cargo.toml",
-]
+include = ["benches/*.rs", "src/**/*.rs", "Cargo.toml"]
 version = { workspace = true }
 edition = { workspace = true }
 readme = { workspace = true }
diff --git a/datafusion/core/src/datasource/listing_table_factory.rs 
b/datafusion/core/src/datasource/listing_table_factory.rs
index 6c7b058520..34afe75a6e 100644
--- a/datafusion/core/src/datasource/listing_table_factory.rs
+++ b/datafusion/core/src/datasource/listing_table_factory.rs
@@ -17,6 +17,14 @@
 
 //! Factory for creating ListingTables with default options
 
+use std::str::FromStr;
+use std::sync::Arc;
+
+use arrow::datatypes::{DataType, SchemaRef};
+use async_trait::async_trait;
+use datafusion_common::DataFusionError;
+use datafusion_expr::CreateExternalTable;
+
 use crate::datasource::datasource::TableProviderFactory;
 use crate::datasource::file_format::avro::AvroFormat;
 use crate::datasource::file_format::csv::CsvFormat;
@@ -29,12 +37,6 @@ use crate::datasource::listing::{
 };
 use crate::datasource::TableProvider;
 use crate::execution::context::SessionState;
-use arrow::datatypes::{DataType, SchemaRef};
-use async_trait::async_trait;
-use datafusion_common::DataFusionError;
-use datafusion_expr::CreateExternalTable;
-use std::str::FromStr;
-use std::sync::Arc;
 
 /// A `TableProviderFactory` capable of creating new `ListingTable`s
 pub struct ListingTableFactory {}
diff --git a/datafusion/core/src/execution/context.rs 
b/datafusion/core/src/execution/context.rs
index eb9f6ba3c4..003d640c29 100644
--- a/datafusion/core/src/execution/context.rs
+++ b/datafusion/core/src/execution/context.rs
@@ -319,6 +319,11 @@ impl SessionContext {
         // create a query planner
         let plan = self.state().create_logical_plan(sql).await?;
 
+        self.execute_logical_plan(plan).await
+    }
+
+    /// Execute the [`LogicalPlan`], return a [`DataFrame`]
+    pub async fn execute_logical_plan(&self, plan: LogicalPlan) -> 
Result<DataFrame> {
         match plan {
             LogicalPlan::Dml(DmlStatement {
                 table_name,
diff --git a/docs/source/user-guide/cli.md b/docs/source/user-guide/cli.md
index d3512a6dca..ef65561f28 100644
--- a/docs/source/user-guide/cli.md
+++ b/docs/source/user-guide/cli.md
@@ -180,15 +180,49 @@ STORED AS CSV
 LOCATION '/path/to/aggregate_test_100.csv';
 ```
 
-## Querying S3 Data Sources
+## Registering S3 Data Sources
 
-The CLI can query data in S3 if the following environment variables are 
defined:
+[AWS S3](https://aws.amazon.com/s3/) data sources can be registered by 
executing a `CREATE EXTERNAL TABLE` SQL statement.
 
-- `AWS_DEFAULT_REGION`
-- `AWS_ACCESS_KEY_ID`
-- `AWS_SECRET_ACCESS_KEY`
+```sql
+CREATE EXTERNAL TABLE test
+STORED AS PARQUET
+OPTIONS(
+    'access_key_id' '******',
+    'secret_access_key' '******',
+    'region' 'us-east-2'
+)
+LOCATION 's3://bucket/path/file.parquet';
+```
+
+The supported OPTIONS are:
+
+- access_key_id
+- secret_access_key
+- session_token
+- region
+
+It is also possible to simplify SQL statements by using environment variables.
+
+```bash
+$ export AWS_DEFAULT_REGION=us-east-2
+$ export AWS_SECRET_ACCESS_KEY=******
+$ export AWS_ACCESS_KEY_ID=******
+
+$ datafusion-cli
+DataFusion CLI v21.0.0
+❯ create external table test stored as parquet location 
's3://bucket/path/file.parquet';
+0 rows in set. Query took 0.374 seconds.
+❯ select * from test;
++----------+----------+
+| column_1 | column_2 |
++----------+----------+
+| 1        | 2        |
++----------+----------+
+1 row in set. Query took 0.171 seconds.
+```
 
-Details of the environment variables that can be used are
+Details of the environment variables that can be used are:
 
 - AWS_ACCESS_KEY_ID -> access_key_id
 - AWS_SECRET_ACCESS_KEY -> secret_access_key
@@ -198,19 +232,56 @@ Details of the environment variables that can be used are
 - AWS_CONTAINER_CREDENTIALS_RELATIVE_URI -> 
<https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task-iam-roles.html>
 - AWS_ALLOW_HTTP -> set to "true" to permit HTTP connections without TLS
 
-Example:
+## Registering OSS Data Sources
 
-```bash
-$ aws s3 cp test.csv s3://my-bucket/
-upload: ./test.csv to s3://my-bucket/test.csv
+[Alibaba cloud 
OSS](https://www.alibabacloud.com/product/object-storage-service) data sources 
can be registered by executing a `CREATE EXTERNAL TABLE` SQL statement.
 
-$ export AWS_DEFAULT_REGION=us-east-2
-$ export AWS_SECRET_ACCESS_KEY=***************************
-$ export AWS_ACCESS_KEY_ID=**************
+```sql
+CREATE EXTERNAL TABLE test
+STORED AS PARQUET
+OPTIONS(
+    'access_key_id' '******',
+    'secret_access_key' '******',
+    'endpoint' 'https://bucket.oss-cn-hangzhou.aliyuncs.com'
+)
+LOCATION 'oss://bucket/path/file.parquet';
+```
+
+The supported OPTIONS are:
+
+- access_key_id
+- secret_access_key
+- endpoint
+
+Note that the `endpoint` format of OSS needs to be: 
`https://{bucket}.{oss-region-endpoint}`
+
+## Registering GCS Data Sources
+
+[Google Cloud Storage](https://cloud.google.com/storage) data sources can be 
registered by executing a `CREATE EXTERNAL TABLE` SQL statement.
+
+```sql
+CREATE EXTERNAL TABLE test
+STORED AS PARQUET
+OPTIONS(
+    'service_account_path' '/tmp/gcs.json',
+)
+LOCATION 'gs://bucket/path/file.parquet';
+```
+
+The supported OPTIONS are:
+
+- service_account_path -> location of service account file
+- service_account_key -> JSON serialized service account key
+- application_credentials_path -> location of application credentials file
+
+It is also possible to simplify SQL statements by using environment variables.
+
+```bash
+$ export GOOGLE_SERVICE_ACCOUNT=/tmp/gcs.json
 
 $ datafusion-cli
-DataFusion CLI v14.0.0
-❯ create external table test stored as csv location 's3://my-bucket/test.csv';
+DataFusion CLI v21.0.0
+❯ create external table test stored as parquet location 
'gs://bucket/path/file.parquet';
 0 rows in set. Query took 0.374 seconds.
 ❯ select * from test;
 +----------+----------+
@@ -221,6 +292,15 @@ DataFusion CLI v14.0.0
 1 row in set. Query took 0.171 seconds.
 ```
 
+Details of the environment variables that can be used are:
+
+- GOOGLE_SERVICE_ACCOUNT: location of service account file
+- GOOGLE_SERVICE_ACCOUNT_PATH: (alias) location of service account file
+- SERVICE_ACCOUNT: (alias) location of service account file
+- GOOGLE_SERVICE_ACCOUNT_KEY: JSON serialized service account key
+- GOOGLE_BUCKET: bucket name
+- GOOGLE_BUCKET_NAME: (alias) bucket name
+
 ## Commands
 
 Available commands inside DataFusion CLI are:

Reply via email to