alamb opened a new issue, #8285:
URL: https://github.com/apache/arrow-datafusion/issues/8285
### Describe the bug
We are seeing the following bug in our user queries:
Internal error: PhysicalOptimizer rule 'join_selection' failed, due to
generate a different schema, original schema: Schema { fields: [Field { name:
"id", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false,
metadata: {} }], metadata: {} }, new schema: Schema { fields: [Field { name:
"id", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false,
metadata: {"metadata_key": "the id field"} }], metadata: {} }.
### To Reproduce
Run a query like this where the relevant schema has metadata:
```sql
query error
SELECT "data"."id"
FROM
(
(SELECT "id" FROM "table_with_metadata")
UNION
(SELECT "id" FROM "table_with_metadata")
) as "data",
(
SELECT "id" FROM "table_with_metadata"
) as "samples"
WHERE "data"."id" = "samples"."id";
```
```
diff --git a/datafusion/sqllogictest/src/test_context.rs
b/datafusion/sqllogictest/src/test_context.rs
index b2314f34f3..f5ab8f71aa 100644
--- a/datafusion/sqllogictest/src/test_context.rs
+++ b/datafusion/sqllogictest/src/test_context.rs
@@ -35,6 +35,7 @@ use datafusion::{
};
use datafusion_common::DataFusionError;
use log::info;
+use std::collections::HashMap;
use std::fs::File;
use std::io::Write;
use std::path::Path;
@@ -57,8 +58,8 @@ impl TestContext {
}
}
- /// Create a SessionContext, configured for the specific test, if
- /// possible.
+ /// Create a SessionContext, configured for the specific sqllogictest
+ /// test (.slt file), if possible.
///
/// If `None` is returned (e.g. because some needed feature is not
/// enabled), the file should be skipped
@@ -67,7 +68,7 @@ impl TestContext {
// hardcode target partitions so plans are deterministic
.with_target_partitions(4);
- let test_ctx =
TestContext::new(SessionContext::new_with_config(config));
+ let mut test_ctx =
TestContext::new(SessionContext::new_with_config(config));
let file_name =
relative_path.file_name().unwrap().to_str().unwrap();
match file_name {
@@ -86,10 +87,8 @@ impl TestContext {
"avro.slt" => {
#[cfg(feature = "avro")]
{
- let mut test_ctx = test_ctx;
info!("Registering avro tables");
register_avro_tables(&mut test_ctx).await;
- return Some(test_ctx);
}
#[cfg(not(feature = "avro"))]
{
@@ -99,10 +98,11 @@ impl TestContext {
}
"joins.slt" => {
info!("Registering partition table tables");
-
- let mut test_ctx = test_ctx;
register_partition_table(&mut test_ctx).await;
- return Some(test_ctx);
+ }
+ "metadata.slt" => {
+ info!("Registering metadata table tables");
+ register_metadata_tables(test_ctx.session_ctx()).await;
}
_ => {
info!("Using default SessionContext");
@@ -299,3 +299,31 @@ fn table_with_many_types() -> Arc<dyn TableProvider> {
let provider = MemTable::try_new(Arc::new(schema),
vec![vec![batch]]).unwrap();
Arc::new(provider)
}
+
+/// Registers a table_with_metadata that contains both field level and Table level metadata
+pub async fn register_metadata_tables(ctx: &SessionContext) {
+    let id = Field::new("id", DataType::Int32, true).with_metadata(HashMap::from([(
+        String::from("metadata_key"),
+        String::from("the id field"),
+    )]));
+    let name = Field::new("name", DataType::Utf8, true).with_metadata(HashMap::from([(
+        String::from("metadata_key"),
+        String::from("the name field"),
+    )]));
+
+    let schema = Schema::new(vec![id, name]).with_metadata(HashMap::from([(
+        String::from("metadata_key"),
+        String::from("the entire schema"),
+    )]));
+
+    let batch = RecordBatch::try_new(
+        Arc::new(schema),
+        vec![
+            Arc::new(Int32Array::from(vec![Some(1), None, Some(3)])) as _,
+            Arc::new(StringArray::from(vec![None, Some("bar"), Some("baz")])) as _,
+        ],
+    )
+    .unwrap();
+
+    ctx.register_batch("table_with_metadata", batch).unwrap();
+}
diff --git a/datafusion/sqllogictest/test_files/metadata.slt
b/datafusion/sqllogictest/test_files/metadata.slt
new file mode 100644
index 0000000000..a57eba2e3c
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/metadata.slt
@@ -0,0 +1,57 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+##########
+## Tests for tables that have metadata on each field as well as metadata on
+## the schema itself.
+##########
+
+query IT
+select * from table_with_metadata;
+----
+1 NULL
+NULL bar
+3 baz
+
+query I
+SELECT (
+ SELECT id FROM table_with_metadata
+ ) UNION (
+ SELECT id FROM table_with_metadata
+ );
+----
+1
+NULL
+3
+
+query error
+SELECT "data"."id"
+FROM
+ (
+ (SELECT "id" FROM "table_with_metadata")
+ UNION
+ (SELECT "id" FROM "table_with_metadata")
+ ) as "data",
+ (
+ SELECT "id" FROM "table_with_metadata"
+ ) as "samples"
+WHERE "data"."id" = "samples"."id";
+----
+DataFusion error: join_selection
+caused by
+Internal error: PhysicalOptimizer rule 'join_selection' failed, due to
generate a different schema, original schema: Schema { fields: [Field { name:
"id", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false,
metadata: {} }], metadata: {} }, new schema: Schema { fields: [Field { name:
"id", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false,
metadata: {"metadata_key": "the id field"} }], metadata: {} }.
+This was likely caused by a bug in DataFusion's code and we would welcome
that you file an bug report in our issue tracker
```
### Expected behavior
The query should run successfully
### Additional context
_No response_
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]