This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 513f78b47c Support create object store source tables without depending
on environment variables (#5732)
513f78b47c is described below
commit 513f78b47c2105d0849bb8d7f9d33d6928338315
Author: r.4ntix <[email protected]>
AuthorDate: Thu Apr 6 03:02:27 2023 +0800
Support create object store source tables without depending on environment
variables (#5732)
* feat: support to obtain necessary information when creating an object
store table through ListingTableFactory
* fix: enable "aws" feature of object_store
* fix: format Cargo.toml
* chore: update cli docs
* fix: format cli.md
* chore: move code for registering the object store dynamically out of core
and into datafusion-cli
* chore: improve documentation of cli
* chore: fix typos in doc
* chore: isolate the object store builder to a separate file, add unit tests
* chore: improve docs
---
datafusion-cli/src/exec.rs | 175 ++++++++++++++-
datafusion-cli/src/main.rs | 5 +-
datafusion-cli/src/object_storage.rs | 235 +++++++++------------
datafusion/core/Cargo.toml | 6 +-
.../core/src/datasource/listing_table_factory.rs | 14 +-
datafusion/core/src/execution/context.rs | 5 +
docs/source/user-guide/cli.md | 110 ++++++++--
7 files changed, 375 insertions(+), 175 deletions(-)
diff --git a/datafusion-cli/src/exec.rs b/datafusion-cli/src/exec.rs
index abf2250a35..3234ea000f 100644
--- a/datafusion-cli/src/exec.rs
+++ b/datafusion-cli/src/exec.rs
@@ -20,16 +20,26 @@
use crate::{
command::{Command, OutputFormat},
helper::CliHelper,
+ object_storage::{
+ get_gcs_object_store_builder, get_oss_object_store_builder,
+ get_s3_object_store_builder,
+ },
print_options::PrintOptions,
};
-use datafusion::error::Result;
-use datafusion::prelude::SessionContext;
+use datafusion::{
+ datasource::listing::ListingTableUrl,
+ error::{DataFusionError, Result},
+ logical_expr::CreateExternalTable,
+};
+use datafusion::{logical_expr::LogicalPlan, prelude::SessionContext};
+use object_store::ObjectStore;
use rustyline::error::ReadlineError;
use rustyline::Editor;
-use std::fs::File;
use std::io::prelude::*;
use std::io::BufReader;
use std::time::Instant;
+use std::{fs::File, sync::Arc};
+use url::Url;
/// run and execute SQL statements and commands from a file, against a context
with the given print options
pub async fn exec_from_lines(
@@ -165,9 +175,166 @@ async fn exec_and_print(
sql: String,
) -> Result<()> {
let now = Instant::now();
- let df = ctx.sql(&sql).await?;
+
+ let plan = ctx.state().create_logical_plan(&sql).await?;
+ let df = match &plan {
+ LogicalPlan::CreateExternalTable(cmd) => {
+ create_external_table(&ctx, cmd)?;
+ ctx.execute_logical_plan(plan).await?
+ }
+ _ => ctx.execute_logical_plan(plan).await?,
+ };
+
let results = df.collect().await?;
print_options.print_batches(&results, now)?;
Ok(())
}
+
+fn create_external_table(ctx: &SessionContext, cmd: &CreateExternalTable) ->
Result<()> {
+ let table_path = ListingTableUrl::parse(&cmd.location)?;
+ let scheme = table_path.scheme();
+ let url: &Url = table_path.as_ref();
+
+ // registering the cloud object store dynamically using cmd.options
+ let store = match scheme {
+ "s3" => {
+ let builder = get_s3_object_store_builder(url, cmd)?;
+ Arc::new(builder.build()?) as Arc<dyn ObjectStore>
+ }
+ "oss" => {
+ let builder = get_oss_object_store_builder(url, cmd)?;
+ Arc::new(builder.build()?) as Arc<dyn ObjectStore>
+ }
+ "gs" | "gcs" => {
+ let builder = get_gcs_object_store_builder(url, cmd)?;
+ Arc::new(builder.build()?) as Arc<dyn ObjectStore>
+ }
+ _ => {
+ // for other types, try to get from the object_store_registry
+ ctx.runtime_env()
+ .object_store_registry
+ .get_store(url)
+ .map_err(|_| {
+ DataFusionError::Execution(format!(
+ "Unsupported object store scheme: {}",
+ scheme
+ ))
+ })?
+ }
+ };
+
+ ctx.runtime_env().register_object_store(url, store);
+
+ Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ async fn create_external_table_test(location: &str, sql: &str) ->
Result<()> {
+ let ctx = SessionContext::new();
+ let plan = ctx.state().create_logical_plan(&sql).await?;
+
+ match &plan {
+ LogicalPlan::CreateExternalTable(cmd) => {
+ create_external_table(&ctx, cmd)?;
+ }
+ _ => assert!(false),
+ };
+
+ ctx.runtime_env()
+ .object_store(ListingTableUrl::parse(location)?)?;
+
+ Ok(())
+ }
+
+ #[tokio::test]
+ async fn create_object_store_table_s3() -> Result<()> {
+ let access_key_id = "fake_access_key_id";
+ let secret_access_key = "fake_secret_access_key";
+ let region = "fake_us-east-2";
+ let session_token = "fake_session_token";
+ let location = "s3://bucket/path/file.parquet";
+
+ // Missing region
+ let sql = format!("CREATE EXTERNAL TABLE test STORED AS PARQUET
+ OPTIONS('access_key_id' '{access_key_id}', 'secret_access_key'
'{secret_access_key}') LOCATION '{location}'");
+ let err = create_external_table_test(location, &sql)
+ .await
+ .unwrap_err();
+ assert!(err.to_string().contains("Missing region"));
+
+ // Should be OK
+ let sql = format!("CREATE EXTERNAL TABLE test STORED AS PARQUET
+ OPTIONS('access_key_id' '{access_key_id}', 'secret_access_key'
'{secret_access_key}', 'region' '{region}', 'session_token' '{session_token}')
LOCATION '{location}'");
+ create_external_table_test(location, &sql).await?;
+
+ Ok(())
+ }
+
+ #[tokio::test]
+ async fn create_object_store_table_oss() -> Result<()> {
+ let access_key_id = "fake_access_key_id";
+ let secret_access_key = "fake_secret_access_key";
+ let endpoint = "fake_endpoint";
+ let location = "oss://bucket/path/file.parquet";
+
+ // Should be OK
+ let sql = format!("CREATE EXTERNAL TABLE test STORED AS PARQUET
+ OPTIONS('access_key_id' '{access_key_id}', 'secret_access_key'
'{secret_access_key}', 'endpoint' '{endpoint}') LOCATION '{location}'");
+ create_external_table_test(location, &sql).await?;
+
+ Ok(())
+ }
+
+ #[tokio::test]
+ async fn create_object_store_table_gcs() -> Result<()> {
+ let service_account_path = "fake_service_account_path";
+ let service_account_key =
+ "{\"private_key\":
\"fake_private_key.pem\",\"client_email\":\"fake_client_email\"}";
+ let application_credentials_path = "fake_application_credentials_path";
+ let location = "gcs://bucket/path/file.parquet";
+
+ // for service_account_path
+ let sql = format!("CREATE EXTERNAL TABLE test STORED AS PARQUET
+ OPTIONS('service_account_path' '{service_account_path}') LOCATION
'{location}'");
+ let err = create_external_table_test(location, &sql)
+ .await
+ .unwrap_err();
+ assert!(err.to_string().contains("No such file or directory"));
+
+ // for service_account_key
+ let sql = format!("CREATE EXTERNAL TABLE test STORED AS PARQUET
OPTIONS('service_account_key' '{service_account_key}') LOCATION '{location}'");
+ let err = create_external_table_test(location, &sql)
+ .await
+ .unwrap_err();
+ assert!(err.to_string().contains("No RSA key found in pem file"));
+
+ // for application_credentials_path
+ let sql = format!("CREATE EXTERNAL TABLE test STORED AS PARQUET
+ OPTIONS('application_credentials_path'
'{application_credentials_path}') LOCATION '{location}'");
+ let err = create_external_table_test(location, &sql)
+ .await
+ .unwrap_err();
+ assert!(err.to_string().contains("A configuration file was passed"));
+
+ Ok(())
+ }
+
+ #[tokio::test]
+ async fn create_external_table_local_file() -> Result<()> {
+ let location = "/path/to/file.parquet";
+
+ // Ensure that local files are also registered
+ let sql =
+ format!("CREATE EXTERNAL TABLE test STORED AS PARQUET LOCATION
'{location}'");
+ let err = create_external_table_test(location, &sql)
+ .await
+ .unwrap_err();
+ assert!(err.to_string().contains("No such file or directory"));
+
+ Ok(())
+ }
+}
diff --git a/datafusion-cli/src/main.rs b/datafusion-cli/src/main.rs
index 8061f4d516..4c1dd2f94e 100644
--- a/datafusion-cli/src/main.rs
+++ b/datafusion-cli/src/main.rs
@@ -21,7 +21,6 @@ use datafusion::execution::context::SessionConfig;
use datafusion::execution::runtime_env::{RuntimeConfig, RuntimeEnv};
use datafusion::prelude::SessionContext;
use datafusion_cli::catalog::DynamicFileCatalog;
-use datafusion_cli::object_storage::DatafusionCliObjectStoreRegistry;
use datafusion_cli::{
exec, print_format::PrintFormat, print_options::PrintOptions,
DATAFUSION_CLI_VERSION,
};
@@ -148,9 +147,7 @@ pub async fn main() -> Result<()> {
}
fn create_runtime_env() -> Result<RuntimeEnv> {
- let object_store_registry = DatafusionCliObjectStoreRegistry::new();
- let rn_config =
-
RuntimeConfig::new().with_object_store_registry(Arc::new(object_store_registry));
+ let rn_config = RuntimeConfig::new();
RuntimeEnv::new(rn_config)
}
diff --git a/datafusion-cli/src/object_storage.rs
b/datafusion-cli/src/object_storage.rs
index 229a1571f0..87018bbe15 100644
--- a/datafusion-cli/src/object_storage.rs
+++ b/datafusion-cli/src/object_storage.rs
@@ -15,100 +15,95 @@
// specific language governing permissions and limitations
// under the License.
-use datafusion::error::Result;
-use std::{env, str::FromStr, sync::Arc};
-
-use datafusion::datasource::object_store::{
- DefaultObjectStoreRegistry, ObjectStoreRegistry,
+use datafusion::{
+ error::{DataFusionError, Result},
+ logical_expr::CreateExternalTable,
};
-use datafusion::error::DataFusionError;
-use object_store::{aws::AmazonS3Builder, gcp::GoogleCloudStorageBuilder,
ObjectStore};
+use object_store::{aws::AmazonS3Builder, gcp::GoogleCloudStorageBuilder};
use url::Url;
-#[derive(Debug, PartialEq, Eq, clap::ArgEnum, Clone)]
-pub enum ObjectStoreScheme {
- S3,
- GCS,
-}
+pub fn get_s3_object_store_builder(
+ url: &Url,
+ cmd: &CreateExternalTable,
+) -> Result<AmazonS3Builder> {
+ let bucket_name = get_bucket_name(url)?;
+ let mut builder =
AmazonS3Builder::from_env().with_bucket_name(bucket_name);
+
+ if let (Some(access_key_id), Some(secret_access_key)) = (
+ cmd.options.get("access_key_id"),
+ cmd.options.get("secret_access_key"),
+ ) {
+ builder = builder
+ .with_access_key_id(access_key_id)
+ .with_secret_access_key(secret_access_key);
+ }
-impl FromStr for ObjectStoreScheme {
- type Err = DataFusionError;
-
- fn from_str(input: &str) -> Result<Self> {
- match input {
- "s3" => Ok(ObjectStoreScheme::S3),
- "gs" | "gcs" => Ok(ObjectStoreScheme::GCS),
- _ => Err(DataFusionError::Execution(format!(
- "Unsupported object store scheme {}",
- input
- ))),
- }
+ if let Some(session_token) = cmd.options.get("session_token") {
+ builder = builder.with_token(session_token);
}
-}
-/// An [`ObjectStoreRegistry`] that can automatically create S3 and GCS stores
for a given URL
-#[derive(Debug, Default)]
-pub struct DatafusionCliObjectStoreRegistry {
- inner: DefaultObjectStoreRegistry,
+ if let Some(region) = cmd.options.get("region") {
+ builder = builder.with_region(region);
+ }
+
+ Ok(builder)
}
-impl DatafusionCliObjectStoreRegistry {
- pub fn new() -> Self {
- Default::default()
+pub fn get_oss_object_store_builder(
+ url: &Url,
+ cmd: &CreateExternalTable,
+) -> Result<AmazonS3Builder> {
+ let bucket_name = get_bucket_name(url)?;
+ let mut builder = AmazonS3Builder::from_env()
+ .with_virtual_hosted_style_request(true)
+ .with_bucket_name(bucket_name)
+ // oss don't care about the "region" field
+ .with_region("do_not_care");
+
+ if let (Some(access_key_id), Some(secret_access_key)) = (
+ cmd.options.get("access_key_id"),
+ cmd.options.get("secret_access_key"),
+ ) {
+ builder = builder
+ .with_access_key_id(access_key_id)
+ .with_secret_access_key(secret_access_key);
}
-}
-impl ObjectStoreRegistry for DatafusionCliObjectStoreRegistry {
- fn register_store(
- &self,
- url: &Url,
- store: Arc<dyn ObjectStore>,
- ) -> Option<Arc<dyn ObjectStore>> {
- self.inner.register_store(url, store)
+ if let Some(endpoint) = cmd.options.get("endpoint") {
+ builder = builder.with_endpoint(endpoint);
}
- fn get_store(&self, url: &Url) -> Result<Arc<dyn ObjectStore>> {
- self.inner.get_store(url).or_else(|_| {
- let store =
- ObjectStoreScheme::from_str(url.scheme()).map(
- |scheme| match scheme {
- ObjectStoreScheme::S3 => build_s3_object_store(url),
- ObjectStoreScheme::GCS => build_gcs_object_store(url),
- },
- )??;
+ Ok(builder)
+}
- self.inner.register_store(url, store.clone());
+pub fn get_gcs_object_store_builder(
+ url: &Url,
+ cmd: &CreateExternalTable,
+) -> Result<GoogleCloudStorageBuilder> {
+ let bucket_name = get_bucket_name(url)?;
+ let mut builder =
GoogleCloudStorageBuilder::from_env().with_bucket_name(bucket_name);
- Ok(store)
- })
+ if let Some(service_account_path) =
cmd.options.get("service_account_path") {
+ builder = builder.with_service_account_path(service_account_path);
}
-}
-fn build_s3_object_store(url: &Url) -> Result<Arc<dyn ObjectStore>> {
- let host = get_host_name(url)?;
- match AmazonS3Builder::from_env().with_bucket_name(host).build() {
- Ok(s3) => Ok(Arc::new(s3)),
- Err(err) => Err(DataFusionError::External(Box::new(err))),
+ if let Some(service_account_key) = cmd.options.get("service_account_key") {
+ builder = builder.with_service_account_key(service_account_key);
}
-}
-fn build_gcs_object_store(url: &Url) -> Result<Arc<dyn ObjectStore>> {
- let host = get_host_name(url)?;
- let mut builder = GoogleCloudStorageBuilder::new().with_bucket_name(host);
-
- if let Ok(path) = env::var("GCP_SERVICE_ACCOUNT_PATH") {
- builder = builder.with_service_account_path(path);
- }
- match builder.build() {
- Ok(gcs) => Ok(Arc::new(gcs)),
- Err(err) => Err(DataFusionError::External(Box::new(err))),
+ if let Some(application_credentials_path) =
+ cmd.options.get("application_credentials_path")
+ {
+ builder =
builder.with_application_credentials(application_credentials_path);
}
+
+ Ok(builder)
}
-fn get_host_name(url: &Url) -> Result<&str> {
+fn get_bucket_name(url: &Url) -> Result<&str> {
url.host_str().ok_or_else(|| {
DataFusionError::Execution(format!(
- "Not able to parse hostname from url, {}",
+ "Not able to parse bucket name from url: {}",
url.as_str()
))
})
@@ -116,76 +111,34 @@ fn get_host_name(url: &Url) -> Result<&str> {
#[cfg(test)]
mod tests {
- use std::{env, str::FromStr};
-
- use datafusion::datasource::object_store::ObjectStoreRegistry;
- use url::Url;
-
- use super::DatafusionCliObjectStoreRegistry;
-
- #[test]
- fn s3_provider_no_host() {
- let no_host_url = "s3:///";
- let registry = DatafusionCliObjectStoreRegistry::new();
- let err = registry
- .get_store(&Url::from_str(no_host_url).unwrap())
- .unwrap_err();
- assert!(err
- .to_string()
- .contains("Not able to parse hostname from url"))
- }
-
- #[test]
- fn gs_provider_no_host() {
- let no_host_url = "gs:///";
- let registry = DatafusionCliObjectStoreRegistry::new();
- let err = registry
- .get_store(&Url::from_str(no_host_url).unwrap())
- .unwrap_err();
- assert!(err
- .to_string()
- .contains("Not able to parse hostname from url"))
- }
-
- #[test]
- fn gcs_provider_no_host() {
- let no_host_url = "gcs:///";
- let registry = DatafusionCliObjectStoreRegistry::new();
- let err = registry
- .get_store(&Url::from_str(no_host_url).unwrap())
- .unwrap_err();
- assert!(err
- .to_string()
- .contains("Not able to parse hostname from url"))
- }
-
- #[test]
- fn unknown_object_store_type() {
- let unknown = "unknown://bucket_name/path";
- let registry = DatafusionCliObjectStoreRegistry::new();
- let err = registry
- .get_store(&Url::from_str(unknown).unwrap())
- .unwrap_err();
- assert!(err
- .to_string()
- .contains("Unsupported object store scheme unknown"))
- }
+ use datafusion::{
+ datasource::listing::ListingTableUrl, logical_expr::LogicalPlan,
+ prelude::SessionContext,
+ };
+
+ use super::*;
+
+ #[ignore] // https://github.com/apache/arrow-rs/issues/4021
+ #[tokio::test]
+ async fn oss_object_store_builder() -> Result<()> {
+ let access_key_id = "access_key_id";
+ let secret_access_key = "secret_access_key";
+ let region = "us-east-2";
+ let location = "s3://bucket/path/file.parquet";
+ let table_url = ListingTableUrl::parse(location)?;
+ let sql = format!("CREATE EXTERNAL TABLE test STORED AS PARQUET
OPTIONS('access_key_id' '{access_key_id}', 'secret_access_key'
'{secret_access_key}', 'region' '{region}') LOCATION '{location}'");
+
+ let ctx = SessionContext::new();
+ let plan = ctx.state().create_logical_plan(&sql).await?;
+
+ match &plan {
+ LogicalPlan::CreateExternalTable(cmd) => {
+ let _builder =
get_oss_object_store_builder(table_url.as_ref(), cmd)?;
+ // get the actual configuration information, then assert_eq!
+ }
+ _ => assert!(false),
+ }
- #[test]
- fn s3_region_validation() {
- let s3 = "s3://bucket_name/path";
- let registry = DatafusionCliObjectStoreRegistry::new();
- let err = registry.get_store(&Url::from_str(s3).unwrap()).unwrap_err();
- assert!(err.to_string().contains("Generic S3 error: Missing region"));
-
- env::set_var("AWS_REGION", "us-east-1");
- let url = Url::from_str(s3).expect("Unable to parse s3 url");
- let res = registry.get_store(&url);
- let msg = match res {
- Err(e) => format!("{e}"),
- Ok(_) => "".to_string(),
- };
- assert_eq!("".to_string(), msg); // Fail with error message
- env::remove_var("AWS_REGION");
+ Ok(())
}
}
diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml
index df2a964a18..c9639864e2 100644
--- a/datafusion/core/Cargo.toml
+++ b/datafusion/core/Cargo.toml
@@ -19,11 +19,7 @@
name = "datafusion"
description = "DataFusion is an in-memory query engine that uses Apache Arrow
as the memory model"
keywords = ["arrow", "query", "sql"]
-include = [
- "benches/*.rs",
- "src/**/*.rs",
- "Cargo.toml",
-]
+include = ["benches/*.rs", "src/**/*.rs", "Cargo.toml"]
version = { workspace = true }
edition = { workspace = true }
readme = { workspace = true }
diff --git a/datafusion/core/src/datasource/listing_table_factory.rs
b/datafusion/core/src/datasource/listing_table_factory.rs
index 6c7b058520..34afe75a6e 100644
--- a/datafusion/core/src/datasource/listing_table_factory.rs
+++ b/datafusion/core/src/datasource/listing_table_factory.rs
@@ -17,6 +17,14 @@
//! Factory for creating ListingTables with default options
+use std::str::FromStr;
+use std::sync::Arc;
+
+use arrow::datatypes::{DataType, SchemaRef};
+use async_trait::async_trait;
+use datafusion_common::DataFusionError;
+use datafusion_expr::CreateExternalTable;
+
use crate::datasource::datasource::TableProviderFactory;
use crate::datasource::file_format::avro::AvroFormat;
use crate::datasource::file_format::csv::CsvFormat;
@@ -29,12 +37,6 @@ use crate::datasource::listing::{
};
use crate::datasource::TableProvider;
use crate::execution::context::SessionState;
-use arrow::datatypes::{DataType, SchemaRef};
-use async_trait::async_trait;
-use datafusion_common::DataFusionError;
-use datafusion_expr::CreateExternalTable;
-use std::str::FromStr;
-use std::sync::Arc;
/// A `TableProviderFactory` capable of creating new `ListingTable`s
pub struct ListingTableFactory {}
diff --git a/datafusion/core/src/execution/context.rs
b/datafusion/core/src/execution/context.rs
index eb9f6ba3c4..003d640c29 100644
--- a/datafusion/core/src/execution/context.rs
+++ b/datafusion/core/src/execution/context.rs
@@ -319,6 +319,11 @@ impl SessionContext {
// create a query planner
let plan = self.state().create_logical_plan(sql).await?;
+ self.execute_logical_plan(plan).await
+ }
+
+ /// Execute the [`LogicalPlan`], return a [`DataFrame`]
+ pub async fn execute_logical_plan(&self, plan: LogicalPlan) ->
Result<DataFrame> {
match plan {
LogicalPlan::Dml(DmlStatement {
table_name,
diff --git a/docs/source/user-guide/cli.md b/docs/source/user-guide/cli.md
index d3512a6dca..ef65561f28 100644
--- a/docs/source/user-guide/cli.md
+++ b/docs/source/user-guide/cli.md
@@ -180,15 +180,49 @@ STORED AS CSV
LOCATION '/path/to/aggregate_test_100.csv';
```
-## Querying S3 Data Sources
+## Registering S3 Data Sources
-The CLI can query data in S3 if the following environment variables are
defined:
+[AWS S3](https://aws.amazon.com/s3/) data sources can be registered by
executing a `CREATE EXTERNAL TABLE` SQL statement.
-- `AWS_DEFAULT_REGION`
-- `AWS_ACCESS_KEY_ID`
-- `AWS_SECRET_ACCESS_KEY`
+```sql
+CREATE EXTERNAL TABLE test
+STORED AS PARQUET
+OPTIONS(
+ 'access_key_id' '******',
+ 'secret_access_key' '******',
+ 'region' 'us-east-2'
+)
+LOCATION 's3://bucket/path/file.parquet';
+```
+
+The supported OPTIONS are:
+
+- access_key_id
+- secret_access_key
+- session_token
+- region
+
+It is also possible to simplify SQL statements by using environment variables.
+
+```bash
+$ export AWS_DEFAULT_REGION=us-east-2
+$ export AWS_SECRET_ACCESS_KEY=******
+$ export AWS_ACCESS_KEY_ID=******
+
+$ datafusion-cli
+DataFusion CLI v21.0.0
+❯ create external table test stored as parquet location
's3://bucket/path/file.parquet';
+0 rows in set. Query took 0.374 seconds.
+❯ select * from test;
++----------+----------+
+| column_1 | column_2 |
++----------+----------+
+| 1 | 2 |
++----------+----------+
+1 row in set. Query took 0.171 seconds.
+```
-Details of the environment variables that can be used are
+Details of the environment variables that can be used are:
- AWS_ACCESS_KEY_ID -> access_key_id
- AWS_SECRET_ACCESS_KEY -> secret_access_key
@@ -198,19 +232,56 @@ Details of the environment variables that can be used are
- AWS_CONTAINER_CREDENTIALS_RELATIVE_URI ->
<https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task-iam-roles.html>
- AWS_ALLOW_HTTP -> set to "true" to permit HTTP connections without TLS
-Example:
+## Registering OSS Data Sources
-```bash
-$ aws s3 cp test.csv s3://my-bucket/
-upload: ./test.csv to s3://my-bucket/test.csv
+[Alibaba cloud
OSS](https://www.alibabacloud.com/product/object-storage-service) data sources
can be registered by executing a `CREATE EXTERNAL TABLE` SQL statement.
-$ export AWS_DEFAULT_REGION=us-east-2
-$ export AWS_SECRET_ACCESS_KEY=***************************
-$ export AWS_ACCESS_KEY_ID=**************
+```sql
+CREATE EXTERNAL TABLE test
+STORED AS PARQUET
+OPTIONS(
+ 'access_key_id' '******',
+ 'secret_access_key' '******',
+ 'endpoint' 'https://bucket.oss-cn-hangzhou.aliyuncs.com'
+)
+LOCATION 'oss://bucket/path/file.parquet';
+```
+
+The supported OPTIONS are:
+
+- access_key_id
+- secret_access_key
+- endpoint
+
+Note that the OSS `endpoint` must be in the format
`https://{bucket}.{oss-region-endpoint}`
+
+## Registering GCS Data Sources
+
+[Google Cloud Storage](https://cloud.google.com/storage) data sources can be
registered by executing a `CREATE EXTERNAL TABLE` SQL statement.
+
+```sql
+CREATE EXTERNAL TABLE test
+STORED AS PARQUET
+OPTIONS(
+  'service_account_path' '/tmp/gcs.json'
+)
+LOCATION 'gs://bucket/path/file.parquet';
+```
+
+The supported OPTIONS are:
+
+- service_account_path -> location of service account file
+- service_account_key -> JSON serialized service account key
+- application_credentials_path -> location of application credentials file
+
+It is also possible to simplify SQL statements by using environment variables.
+
+```bash
+$ export GOOGLE_SERVICE_ACCOUNT=/tmp/gcs.json
$ datafusion-cli
-DataFusion CLI v14.0.0
-❯ create external table test stored as csv location 's3://my-bucket/test.csv';
+DataFusion CLI v21.0.0
+❯ create external table test stored as parquet location
'gs://bucket/path/file.parquet';
0 rows in set. Query took 0.374 seconds.
❯ select * from test;
+----------+----------+
@@ -221,6 +292,15 @@ DataFusion CLI v14.0.0
1 row in set. Query took 0.171 seconds.
```
+Details of the environment variables that can be used are:
+
+- GOOGLE_SERVICE_ACCOUNT: location of service account file
+- GOOGLE_SERVICE_ACCOUNT_PATH: (alias) location of service account file
+- SERVICE_ACCOUNT: (alias) location of service account file
+- GOOGLE_SERVICE_ACCOUNT_KEY: JSON serialized service account key
+- GOOGLE_BUCKET: bucket name
+- GOOGLE_BUCKET_NAME: (alias) bucket name
+
## Commands
Available commands inside DataFusion CLI are: