This is an automated email from the ASF dual-hosted git repository. dongjoon pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new e38383acc121 [SPARK-48844][SQL] USE INVALID_EMPTY_LOCATION instead of UNSUPPORTED_DATASOURCE_FOR_DIRECT_QUERY when path is empty e38383acc121 is described below commit e38383acc121d79e195d95a9009c9eac94841cc2 Author: Kent Yao <y...@apache.org> AuthorDate: Wed Jul 10 08:20:45 2024 -0700 [SPARK-48844][SQL] USE INVALID_EMPTY_LOCATION instead of UNSUPPORTED_DATASOURCE_FOR_DIRECT_QUERY when path is empty ### What changes were proposed in this pull request? When running SQL on valid datasource files directly, if the given path is an empty string, we currently report UNSUPPORTED_DATASOURCE_FOR_DIRECT_QUERY, which claims the datasource is invalid. The reason is that the `hadoop.Path` class cannot be constructed with an empty string, and we wrap the resulting `IllegalArgumentException` (IAE) with UNSUPPORTED_DATASOURCE_FOR_DIRECT_QUERY. In this PR, we check the path up front to avoid this misleading error message. ### Why are the changes needed? Trivial bugfix. Although this error rarely occurs in REPL environments, it can still happen when the query is built with string interpolation. ### Does this PR introduce _any_ user-facing change? Yes, a different error class is reported. ### How was this patch tested? New tests. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #47267 from yaooqinn/SPARK-48844. 
Authored-by: Kent Yao <y...@apache.org> Signed-off-by: Dongjoon Hyun <dh...@apple.com> --- .../spark/sql/execution/datasources/rules.scala | 6 + .../analyzer-results/sql-on-files.sql.out | 167 +++++++++++++++++++ .../resources/sql-tests/inputs/sql-on-files.sql | 19 +++ .../sql-tests/results/sql-on-files.sql.out | 179 +++++++++++++++++++++ .../test-data/before_1582_date_v2_4.snappy.orc | Bin 0 -> 201 bytes .../src/test/resources/test-data/cars.csv | 7 + .../resources/test-data/dec-in-fixed-len.parquet | Bin 0 -> 460 bytes .../test/resources/test-data/with-map-fields.json | 5 + 8 files changed, 383 insertions(+) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala index 2f39a1962d2c..e4c3cd20dedb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala @@ -60,6 +60,12 @@ class ResolveSQLOnFile(sparkSession: SparkSession) extends Rule[LogicalPlan] { errorClass = "UNSUPPORTED_DATASOURCE_FOR_DIRECT_QUERY", messageParameters = Map("dataSourceType" -> ident.head)) } + if (isFileFormat && ident.last.isEmpty) { + unresolved.failAnalysis( + errorClass = "INVALID_EMPTY_LOCATION", + messageParameters = Map("location" -> ident.last)) + } + dataSource } diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/sql-on-files.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/sql-on-files.sql.out new file mode 100644 index 000000000000..78e2a876da86 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/sql-on-files.sql.out @@ -0,0 +1,167 @@ +-- Automatically generated by SQLQueryTestSuite +-- !query +SELECT * FROM parquet.`` +-- !query analysis +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "INVALID_EMPTY_LOCATION", + "sqlState" : "42K05", + "messageParameters" : { + "location" : "" + }, + 
"queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 15, + "stopIndex" : 24, + "fragment" : "parquet.``" + } ] +} + + +-- !query +SELECT * FROM parquet.`/file/not/found` +-- !query analysis +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "PATH_NOT_FOUND", + "sqlState" : "42K03", + "messageParameters" : { + "path" : "file:/file/not/found" + } +} + + +-- !query +SELECT * FROM parquet.`src/test/resources/test-data/dec-in-fixed-len.parquet` LIMIT 1 +-- !query analysis +GlobalLimit 1 ++- LocalLimit 1 + +- Project [fixed_len_dec#x] + +- Relation [fixed_len_dec#x] parquet + + +-- !query +SELECT * FROM orc.`` +-- !query analysis +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "INVALID_EMPTY_LOCATION", + "sqlState" : "42K05", + "messageParameters" : { + "location" : "" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 15, + "stopIndex" : 20, + "fragment" : "orc.``" + } ] +} + + +-- !query +SELECT * FROM orc.`/file/not/found` +-- !query analysis +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "PATH_NOT_FOUND", + "sqlState" : "42K03", + "messageParameters" : { + "path" : "file:/file/not/found" + } +} + + +-- !query +SELECT * FROM orc.`src/test/resources/test-data/before_1582_date_v2_4.snappy.orc` LIMIT 1 +-- !query analysis +GlobalLimit 1 ++- LocalLimit 1 + +- Project [dt#x] + +- Relation [dt#x] orc + + +-- !query +SELECT * FROM csv.`` +-- !query analysis +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "INVALID_EMPTY_LOCATION", + "sqlState" : "42K05", + "messageParameters" : { + "location" : "" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 15, + "stopIndex" : 20, + "fragment" : "csv.``" + } ] +} + + +-- !query +SELECT * FROM csv.`/file/not/found` +-- !query analysis +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "PATH_NOT_FOUND", + "sqlState" : "42K03", + "messageParameters" : { + "path" : "file:/file/not/found" 
+ } +} + + +-- !query +SELECT * FROM csv.`src/test/resources/test-data/cars.csv` LIMIT 1 +-- !query analysis +GlobalLimit 1 ++- LocalLimit 1 + +- Project [_c0#x, _c1#x, _c2#x, _c3#x, _c4#x] + +- Relation [_c0#x,_c1#x,_c2#x,_c3#x,_c4#x] csv + + +-- !query +SELECT * FROM json.`` +-- !query analysis +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "INVALID_EMPTY_LOCATION", + "sqlState" : "42K05", + "messageParameters" : { + "location" : "" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 15, + "stopIndex" : 21, + "fragment" : "json.``" + } ] +} + + +-- !query +SELECT * FROM json.`/file/not/found` +-- !query analysis +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "PATH_NOT_FOUND", + "sqlState" : "42K03", + "messageParameters" : { + "path" : "file:/file/not/found" + } +} + + +-- !query +SELECT * FROM json.`src/test/resources/test-data/with-map-fields.json` LIMIT 1 +-- !query analysis +GlobalLimit 1 ++- LocalLimit 1 + +- Project [id#xL, intervals#x] + +- Relation [id#xL,intervals#x] json diff --git a/sql/core/src/test/resources/sql-tests/inputs/sql-on-files.sql b/sql/core/src/test/resources/sql-tests/inputs/sql-on-files.sql new file mode 100644 index 000000000000..aee8aaa4d195 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/sql-on-files.sql @@ -0,0 +1,19 @@ +-- Parquet +SELECT * FROM parquet.``; +SELECT * FROM parquet.`/file/not/found`; +SELECT * FROM parquet.`src/test/resources/test-data/dec-in-fixed-len.parquet` LIMIT 1; + +-- ORC +SELECT * FROM orc.``; +SELECT * FROM orc.`/file/not/found`; +SELECT * FROM orc.`src/test/resources/test-data/before_1582_date_v2_4.snappy.orc` LIMIT 1; + +-- CSV +SELECT * FROM csv.``; +SELECT * FROM csv.`/file/not/found`; +SELECT * FROM csv.`src/test/resources/test-data/cars.csv` LIMIT 1; + +-- JSON +SELECT * FROM json.``; +SELECT * FROM json.`/file/not/found`; +SELECT * FROM json.`src/test/resources/test-data/with-map-fields.json` LIMIT 1; diff --git 
a/sql/core/src/test/resources/sql-tests/results/sql-on-files.sql.out b/sql/core/src/test/resources/sql-tests/results/sql-on-files.sql.out new file mode 100644 index 000000000000..5c1e5697d029 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/sql-on-files.sql.out @@ -0,0 +1,179 @@ +-- Automatically generated by SQLQueryTestSuite +-- !query +SELECT * FROM parquet.`` +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "INVALID_EMPTY_LOCATION", + "sqlState" : "42K05", + "messageParameters" : { + "location" : "" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 15, + "stopIndex" : 24, + "fragment" : "parquet.``" + } ] +} + + +-- !query +SELECT * FROM parquet.`/file/not/found` +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "PATH_NOT_FOUND", + "sqlState" : "42K03", + "messageParameters" : { + "path" : "file:/file/not/found" + } +} + + +-- !query +SELECT * FROM parquet.`src/test/resources/test-data/dec-in-fixed-len.parquet` LIMIT 1 +-- !query schema +struct<fixed_len_dec:decimal(10,2)> +-- !query output +0.00 + + +-- !query +SELECT * FROM orc.`` +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "INVALID_EMPTY_LOCATION", + "sqlState" : "42K05", + "messageParameters" : { + "location" : "" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 15, + "stopIndex" : 20, + "fragment" : "orc.``" + } ] +} + + +-- !query +SELECT * FROM orc.`/file/not/found` +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "PATH_NOT_FOUND", + "sqlState" : "42K03", + "messageParameters" : { + "path" : "file:/file/not/found" + } +} + + +-- !query +SELECT * FROM orc.`src/test/resources/test-data/before_1582_date_v2_4.snappy.orc` LIMIT 1 +-- !query schema +struct<dt:date> +-- !query output 
+1200-01-01 + + +-- !query +SELECT * FROM csv.`` +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "INVALID_EMPTY_LOCATION", + "sqlState" : "42K05", + "messageParameters" : { + "location" : "" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 15, + "stopIndex" : 20, + "fragment" : "csv.``" + } ] +} + + +-- !query +SELECT * FROM csv.`/file/not/found` +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "PATH_NOT_FOUND", + "sqlState" : "42K03", + "messageParameters" : { + "path" : "file:/file/not/found" + } +} + + +-- !query +SELECT * FROM csv.`src/test/resources/test-data/cars.csv` LIMIT 1 +-- !query schema +struct<_c0:string,_c1:string,_c2:string,_c3:string,_c4:string> +-- !query output +year make model comment blank + + +-- !query +SELECT * FROM json.`` +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "INVALID_EMPTY_LOCATION", + "sqlState" : "42K05", + "messageParameters" : { + "location" : "" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 15, + "stopIndex" : 21, + "fragment" : "json.``" + } ] +} + + +-- !query +SELECT * FROM json.`/file/not/found` +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "PATH_NOT_FOUND", + "sqlState" : "42K03", + "messageParameters" : { + "path" : "file:/file/not/found" + } +} + + +-- !query +SELECT * FROM json.`src/test/resources/test-data/with-map-fields.json` LIMIT 1 +-- !query schema +struct<id:bigint,intervals:struct<a:struct<endTime:bigint,startTime:bigint>,b:struct<endTime:bigint,startTime:bigint>>> +-- !query output +1 {"a":{"endTime":211,"startTime":111},"b":{"endTime":221,"startTime":121}} diff --git a/sql/hive-thriftserver/src/test/resources/test-data/before_1582_date_v2_4.snappy.orc 
b/sql/hive-thriftserver/src/test/resources/test-data/before_1582_date_v2_4.snappy.orc new file mode 100644 index 000000000000..ebe01743b2e2 Binary files /dev/null and b/sql/hive-thriftserver/src/test/resources/test-data/before_1582_date_v2_4.snappy.orc differ diff --git a/sql/hive-thriftserver/src/test/resources/test-data/cars.csv b/sql/hive-thriftserver/src/test/resources/test-data/cars.csv new file mode 100644 index 000000000000..40ded573ade5 --- /dev/null +++ b/sql/hive-thriftserver/src/test/resources/test-data/cars.csv @@ -0,0 +1,7 @@ + +year,make,model,comment,blank +"2012","Tesla","S","No comment", + +1997,Ford,E350,"Go get one now they are going fast", +2015,Chevy,Volt + diff --git a/sql/hive-thriftserver/src/test/resources/test-data/dec-in-fixed-len.parquet b/sql/hive-thriftserver/src/test/resources/test-data/dec-in-fixed-len.parquet new file mode 100644 index 000000000000..6ad37d563951 Binary files /dev/null and b/sql/hive-thriftserver/src/test/resources/test-data/dec-in-fixed-len.parquet differ diff --git a/sql/hive-thriftserver/src/test/resources/test-data/with-map-fields.json b/sql/hive-thriftserver/src/test/resources/test-data/with-map-fields.json new file mode 100644 index 000000000000..576fbb9b8758 --- /dev/null +++ b/sql/hive-thriftserver/src/test/resources/test-data/with-map-fields.json @@ -0,0 +1,5 @@ +{ "id": 1, "intervals": { "a": { "startTime": 111, "endTime": 211 }, "b": { "startTime": 121, "endTime": 221 }}} +{ "id": 2, "intervals": { "a": { "startTime": 112, "endTime": 212 }, "b": { "startTime": 122, "endTime": 222 }}} +{ "id": 3, "intervals": { "a": { "startTime": 113, "endTime": 213 }, "b": { "startTime": 123, "endTime": 223 }}} +{ "id": 4, "intervals": { }} +{ "id": 5 } \ No newline at end of file --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org