martin-g commented on code in PR #17702:
URL: https://github.com/apache/datafusion/pull/17702#discussion_r2528968804
##########
datafusion/proto/src/logical_plan/mod.rs:
##########
@@ -625,7 +625,7 @@ impl AsLogicalPlan for LogicalPlanNode {
create_extern_table.name.as_ref(),
"CreateExternalTable",
)?,
- location: create_extern_table.location.clone(),
+ locations: vec![create_extern_table.location.clone()],
Review Comment:
This should split by comma, no ?
At line 1502 below you serialize the locations to a location by joining them
with comma.
I.e. `vec!["path1", "path2"]` serializes to `"path1,path2"` but deserializes
back to `vec!["path1,path2"]`
##########
datafusion/core/src/test_util/mod.rs:
##########
@@ -194,7 +194,7 @@ impl TableProviderFactory for TestTableFactory {
#[derive(Debug)]
pub struct TestTableProvider {
/// URL of table files or folder
- pub url: String,
+ pub url: Vec<String>,
Review Comment:
```suggestion
pub urls: Vec<String>,
```
?
`location` has been renamed to `locations` already
##########
datafusion/sql/src/statement.rs:
##########
@@ -1548,7 +1548,7 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
PlanCreateExternalTable {
schema: df_schema,
name,
- location,
+ locations: location.split(",").map(|x|
x.to_string()).collect(),
Review Comment:
```suggestion
locations: location.split(",").map(|x|
x.to_string().trim()).collect(),
```
##########
datafusion/core/src/datasource/listing_table_factory.rs:
##########
@@ -63,137 +65,190 @@ impl TableProviderFactory for ListingTableFactory {
))?
.create(session_state, &cmd.options)?;
- let mut table_path = ListingTableUrl::parse(&cmd.location)?;
- let file_extension = match table_path.is_collection() {
- // Setting the extension to be empty instead of allowing the
default extension seems
- // odd, but was done to ensure existing behavior isn't modified.
It seems like this
- // could be refactored to either use the default extension or set
the fully expected
- // extension when compression is included (e.g. ".csv.gz")
- true => "",
- false => &get_extension(cmd.location.as_str()),
+ let file_extension = match cmd.locations.len() {
+ 1 => {
+ let table_path = ListingTableUrl::parse(&cmd.locations[0])?;
+ match table_path.is_collection() {
+ // Setting the extension to be empty instead of allowing
the default extension seems
+ // odd, but was done to ensure existing behavior isn't
modified. It seems like this
+ // could be refactored to either use the default extension
or set the fully expected
+ // extension when compression is included (e.g. ".csv.gz").
+ // We do the same if there are multiple locations provided
for the table.
+ true => "",
+ false => &get_extension(cmd.locations[0].as_str()),
+ }
+ }
+ _ => "",
Review Comment:
You could make an attempt here - if all files have the same extension then
use it, otherwise fall back to `""`
##########
datafusion/proto/tests/cases/roundtrip_logical_plan.rs:
##########
@@ -196,7 +196,7 @@ impl LogicalExtensionCodec for TestTableProviderCodec {
})?;
assert_eq!(msg.table_name, table_ref.to_string());
let provider = TestTableProvider {
- url: msg.url,
+ url: vec![msg.url],
Review Comment:
Same error as earlier - here you need to split by comma because the encode
below joins by comma.
But as I already said the idea of using comma as separator is fragile.
##########
datafusion/core/src/datasource/listing_table_factory.rs:
##########
@@ -63,137 +65,190 @@ impl TableProviderFactory for ListingTableFactory {
))?
.create(session_state, &cmd.options)?;
- let mut table_path = ListingTableUrl::parse(&cmd.location)?;
- let file_extension = match table_path.is_collection() {
- // Setting the extension to be empty instead of allowing the
default extension seems
- // odd, but was done to ensure existing behavior isn't modified.
It seems like this
- // could be refactored to either use the default extension or set
the fully expected
- // extension when compression is included (e.g. ".csv.gz")
- true => "",
- false => &get_extension(cmd.location.as_str()),
+ let file_extension = match cmd.locations.len() {
+ 1 => {
+ let table_path = ListingTableUrl::parse(&cmd.locations[0])?;
+ match table_path.is_collection() {
+ // Setting the extension to be empty instead of allowing
the default extension seems
+ // odd, but was done to ensure existing behavior isn't
modified. It seems like this
+ // could be refactored to either use the default extension
or set the fully expected
+ // extension when compression is included (e.g. ".csv.gz").
+ // We do the same if there are multiple locations provided
for the table.
+ true => "",
+ false => &get_extension(cmd.locations[0].as_str()),
+ }
+ }
+ _ => "",
};
+
let mut options = ListingOptions::new(file_format)
.with_session_config_options(session_state.config())
.with_file_extension(file_extension);
- let (provided_schema, table_partition_cols) = if
cmd.schema.fields().is_empty() {
- let infer_parts = session_state
- .config_options()
- .execution
- .listing_table_factory_infer_partitions;
- let part_cols = if cmd.table_partition_cols.is_empty() &&
infer_parts {
- options
- .infer_partitions(session_state, &table_path)
- .await?
- .into_iter()
- } else {
- cmd.table_partition_cols.clone().into_iter()
- };
-
- (
- None,
- part_cols
- .map(|p| {
- (
- p,
- DataType::Dictionary(
- Box::new(DataType::UInt16),
- Box::new(DataType::Utf8),
- ),
- )
- })
- .collect::<Vec<_>>(),
+ let table_paths: Vec<ListingTableUrl> = cmd
+ .locations
+ .iter()
+ .map(|loc| ListingTableUrl::parse(loc))
+ .collect::<Result<Vec<_>>>()?;
+
+ // We use the first location to infer the partition columns,
+ // primarily for performance and simplicity reasons.
+ let partition_columns = infer_partition_columns(
+ &options,
+ session_state,
+ &table_paths[0],
Review Comment:
Could this fail with out of bounds error ?
Above while extracting the file extension you make a check whether the
locations.len() is 1 or anything else (`_ => ""`), i.e. even 0 will match there.
##########
datafusion-cli/src/exec.rs:
##########
@@ -524,14 +529,16 @@ mod tests {
if let LogicalPlan::Ddl(DdlStatement::CreateExternalTable(cmd)) =
&plan {
let format = config_file_type_from_str(&cmd.file_type);
- register_object_store_and_config_extensions(
- &ctx,
- &cmd.location,
- &cmd.options,
- format,
- false,
- )
- .await?;
+ for location in &cmd.locations {
+ register_object_store_and_config_extensions(
+ &ctx,
+ &location,
+ &cmd.options,
+ format.clone(),
+ false,
+ )
+ .await?;
Review Comment:
At line 428 you join the `register_futures`. Here you don't. Why the
difference ?
##########
datafusion/proto/src/logical_plan/mod.rs:
##########
@@ -1499,7 +1499,7 @@ impl AsLogicalPlan for LogicalPlanNode {
logical_plan_type:
Some(LogicalPlanType::CreateExternalTable(
protobuf::CreateExternalTableNode {
name: Some(name.clone().into()),
- location: location.clone(),
+ location: location.clone().join(","),
Review Comment:
Comma is an allowed character in file names.
Joining the file names by comma may break later when splitting if a file
already has a comma in its name.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]