[spark] branch master updated: [SPARK-40998][SQL] Rename the error class `_LEGACY_ERROR_TEMP_0040` to `INVALID_IDENTIFIER`
This is an automated email from the ASF dual-hosted git repository.

maxgekk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/master by this push:
     new a3c6cd64512 [SPARK-40998][SQL] Rename the error class `_LEGACY_ERROR_TEMP_0040` to `INVALID_IDENTIFIER`
a3c6cd64512 is described below

commit a3c6cd64512f2a1f5ad312f4a06db1935e514a07
Author: Max Gekk
AuthorDate: Thu Nov 3 08:44:43 2022 +0300

    [SPARK-40998][SQL] Rename the error class `_LEGACY_ERROR_TEMP_0040` to `INVALID_IDENTIFIER`

    ### What changes were proposed in this pull request?
    In this PR, I propose to assign the proper name `INVALID_IDENTIFIER` to the legacy error class `_LEGACY_ERROR_TEMP_0040`, and to modify the test suite to use `checkError()`, which checks the error class name, context, etc.

    ### Why are the changes needed?
    A proper name improves the user experience with Spark SQL.

    ### Does this PR introduce _any_ user-facing change?
    Yes, the PR changes a user-facing error message.

    ### How was this patch tested?
    By running the modified test suites:
    ```
    $ build/sbt "core/testOnly *SparkThrowableSuite"
    $ build/sbt "test:testOnly *ErrorParserSuite"
    ```

    Closes #38484 from MaxGekk/invalid-identifier-error-class.

    Authored-by: Max Gekk
    Signed-off-by: Max Gekk
---
 core/src/main/resources/error/error-classes.json   | 10 ++--
 .../spark/sql/catalyst/parser/ParseDriver.scala    |  2 +-
 .../spark/sql/errors/QueryParsingErrors.scala      |  4 +-
 .../sql/catalyst/parser/ErrorParserSuite.scala     | 66 +++---
 4 files changed, 41 insertions(+), 41 deletions(-)

diff --git a/core/src/main/resources/error/error-classes.json b/core/src/main/resources/error/error-classes.json
index 7ec5e11a206..d1b4c4f030c 100644
--- a/core/src/main/resources/error/error-classes.json
+++ b/core/src/main/resources/error/error-classes.json
@@ -569,6 +569,11 @@
     ],
     "sqlState" : "22023"
   },
+  "INVALID_IDENTIFIER" : {
+    "message" : [
+      "The identifier <ident> is invalid. Please, consider quoting it with back-quotes as `<ident>`."
+    ]
+  },
   "INVALID_JSON_SCHEMA_MAP_TYPE" : {
     "message" : [
       "Input schema can only contain STRING as a key type for a MAP."
     ]
   },
@@ -1376,11 +1381,6 @@
       "Unsupported SQL statement."
     ]
   },
-  "_LEGACY_ERROR_TEMP_0040" : {
-    "message" : [
-      "Possibly unquoted identifier <ident> detected. Please consider quoting it with back-quotes as `<ident>`."
-    ]
-  },
   "_LEGACY_ERROR_TEMP_0041" : {
     "message" : [
       "Found duplicate clauses: ."
     ]
   },

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParseDriver.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParseDriver.scala
index 498d2d9ee13..10a213373ad 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParseDriver.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParseDriver.scala
@@ -325,7 +325,7 @@ case object PostProcessor extends SqlBaseParserBaseListener {
   override def exitErrorIdent(ctx: SqlBaseParser.ErrorIdentContext): Unit = {
     val ident = ctx.getParent.getText
-    throw QueryParsingErrors.unquotedIdentifierError(ident, ctx)
+    throw QueryParsingErrors.invalidIdentifierError(ident, ctx)
   }

   /** Remove the back ticks from an Identifier.
    */

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryParsingErrors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryParsingErrors.scala
index 204b28f3725..1fce265bece 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryParsingErrors.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryParsingErrors.scala
@@ -448,9 +448,9 @@ private[sql] object QueryParsingErrors extends QueryErrorsBase {
       Some("_LEGACY_ERROR_TEMP_0039"))
   }

-  def unquotedIdentifierError(ident: String, ctx: ErrorIdentContext): Throwable = {
+  def invalidIdentifierError(ident: String, ctx: ErrorIdentContext): Throwable = {
     new ParseException(
-      errorClass = "_LEGACY_ERROR_TEMP_0040",
+      errorClass = "INVALID_IDENTIFIER",
       messageParameters = Map("ident" -> ident),
       ctx)
   }

diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ErrorParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ErrorParserSuite.scala
index b48a950d9d5..e88ccae2ac5 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ErrorParserSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ErrorParserSuite.scala
@@ -42,22 +42,22 @@ class ErrorParserSuite extends AnalysisTest {
   test("hyphen in identifier - DDL tests") {
     checkError(
       exception = parseException("USE test-test"),
-      errorClass = "_LEGACY_ERROR_TEMP_0040",
+
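A minimal, hypothetical PySpark repro of the renamed error (not part of the commit above), assuming a running `SparkSession` named `spark` on a build that includes this change; the query and expected error class come from the test suite in the diff:

```python
from pyspark.sql.utils import ParseException

try:
    spark.sql("USE test-test")  # hyphenated identifier without back-quotes
except ParseException as e:
    # Expected: the INVALID_IDENTIFIER error class (formerly _LEGACY_ERROR_TEMP_0040),
    # suggesting that `test-test` be quoted with back-quotes.
    print(e)
```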
[spark] branch master updated: [SPARK-40977][CONNECT][PYTHON] Complete Support for Union in Python client
This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/master by this push:
     new 10722044f42 [SPARK-40977][CONNECT][PYTHON] Complete Support for Union in Python client
10722044f42 is described below

commit 10722044f429b1a825018673ca139d698559f6fa
Author: Rui Wang
AuthorDate: Thu Nov 3 13:53:23 2022 +0900

    [SPARK-40977][CONNECT][PYTHON] Complete Support for Union in Python client

    ### What changes were proposed in this pull request?
    1. Improve testing coverage for `Union` and `UnionAll` (they are actually both `UnionAll`).
    2. Add the API which does `UnionByName`.

    ### Why are the changes needed?
    Improve API coverage.

    ### Does this PR introduce _any_ user-facing change?
    No

    ### How was this patch tested?
    UT

    Closes #38453 from amaliujia/python_union.

    Authored-by: Rui Wang
    Signed-off-by: Hyukjin Kwon
---
 python/pyspark/sql/connect/dataframe.py            | 27 ++
 python/pyspark/sql/connect/plan.py                 |  6 -
 .../sql/tests/connect/test_connect_plan_only.py    | 10
 3 files changed, 42 insertions(+), 1 deletion(-)

diff --git a/python/pyspark/sql/connect/dataframe.py b/python/pyspark/sql/connect/dataframe.py
index b9ddb0db300..b9ba4b99ba0 100644
--- a/python/pyspark/sql/connect/dataframe.py
+++ b/python/pyspark/sql/connect/dataframe.py
@@ -293,6 +293,33 @@ class DataFrame(object):
             raise ValueError("Argument to Union does not contain a valid plan.")
         return DataFrame.withPlan(plan.UnionAll(self._plan, other._plan), session=self._session)

+    def unionByName(self, other: "DataFrame", allowMissingColumns: bool = False) -> "DataFrame":
+        """Returns a new :class:`DataFrame` containing union of rows in this and another
+        :class:`DataFrame`.
+
+        This is different from both `UNION ALL` and `UNION DISTINCT` in SQL. To do a SQL-style set
+        union (that does deduplication of elements), use this function followed by :func:`distinct`.
+
+        .. versionadded:: 3.4.0
+
+        Parameters
+        ----------
+        other : :class:`DataFrame`
+            Another :class:`DataFrame` that needs to be combined.
+        allowMissingColumns : bool, optional, default False
+            Specify whether to allow missing columns.
+
+        Returns
+        -------
+        :class:`DataFrame`
+            Combined DataFrame.
+        """
+        if other._plan is None:
+            raise ValueError("Argument to UnionByName does not contain a valid plan.")
+        return DataFrame.withPlan(
+            plan.UnionAll(self._plan, other._plan, allowMissingColumns), session=self._session
+        )
+
     def where(self, condition: Expression) -> "DataFrame":
         return self.filter(condition)

diff --git a/python/pyspark/sql/connect/plan.py b/python/pyspark/sql/connect/plan.py
index 2f1f70ec1a9..cc59a493d5a 100644
--- a/python/pyspark/sql/connect/plan.py
+++ b/python/pyspark/sql/connect/plan.py
@@ -606,9 +606,12 @@ class Join(LogicalPlan):

 class UnionAll(LogicalPlan):
-    def __init__(self, child: Optional["LogicalPlan"], other: "LogicalPlan") -> None:
+    def __init__(
+        self, child: Optional["LogicalPlan"], other: "LogicalPlan", by_name: bool = False
+    ) -> None:
         super().__init__(child)
         self.other = other
+        self.by_name = by_name

     def plan(self, session: Optional["RemoteSparkSession"]) -> proto.Relation:
         assert self._child is not None
@@ -617,6 +620,7 @@ class UnionAll(LogicalPlan):
         rel.set_op.right_input.CopyFrom(self.other.plan(session))
         rel.set_op.set_op_type = proto.SetOperation.SET_OP_TYPE_UNION
         rel.set_op.is_all = True
+        rel.set_op.by_name = self.by_name
         return rel

     def print(self, indent: int = 0) -> str:

diff --git a/python/pyspark/sql/tests/connect/test_connect_plan_only.py b/python/pyspark/sql/tests/connect/test_connect_plan_only.py
index e40a54b7d0c..8a9b98e73fd 100644
--- a/python/pyspark/sql/tests/connect/test_connect_plan_only.py
+++ b/python/pyspark/sql/tests/connect/test_connect_plan_only.py
@@ -190,6 +190,16 @@ class SparkConnectTestsPlanOnly(PlanOnlyTestFixture):
         self.assertIsNotNone(plan.root, "Root relation must be set")
         self.assertIsNotNone(plan.root.read)

+    def test_union(self):
+        df1 = self.connect.readTable(table_name=self.tbl_name)
+        df2 = self.connect.readTable(table_name=self.tbl_name)
+        plan1 = df1.union(df2)._plan.to_proto(self.connect)
+        self.assertTrue(plan1.root.set_op.is_all)
+        plan2 = df1.union(df2)._plan.to_proto(self.connect)
+        self.assertTrue(plan2.root.set_op.is_all)
+        plan3 = df1.unionByName(df2, True)._plan.to_proto(self.connect)
+
[spark] branch master updated: [SPARK-40995][CONNECT][DOC][FOLLOW-UP] Fix the typo in the doc name
This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/master by this push:
     new 39824f1dae2 [SPARK-40995][CONNECT][DOC][FOLLOW-UP] Fix the typo in the doc name
39824f1dae2 is described below

commit 39824f1dae2caa62292f75c818c0d28d281bc415
Author: Rui Wang
AuthorDate: Thu Nov 3 11:09:11 2022 +0900

    [SPARK-40995][CONNECT][DOC][FOLLOW-UP] Fix the typo in the doc name

    ### What changes were proposed in this pull request?
    Fix the typo in the doc filename: `coient` -> `client`.

    ### Why are the changes needed?
    Fix typo.

    ### Does this PR introduce _any_ user-facing change?
    No

    ### How was this patch tested?
    UT

    Closes #38487 from amaliujia/follow_up_docs.

    Authored-by: Rui Wang
    Signed-off-by: Hyukjin Kwon
---
 .../docs/{coient-connection-string.md => client-connection-string.md} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)

diff --git a/connector/connect/docs/coient-connection-string.md b/connector/connect/docs/client-connection-string.md
similarity index 100%
rename from connector/connect/docs/coient-connection-string.md
rename to connector/connect/docs/client-connection-string.md
[spark] branch master updated (e63b7da85e4 -> adb41ca8480)
This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git

    from e63b7da85e4 [SPARK-40995][CONNECT][DOC] Defining Spark Connect Client Connection String
     add adb41ca8480 [SPARK-40989][CONNECT][PYTHON][TESTS] Improve `session.sql` testing coverage in Python client

No new revisions were added by this update.

Summary of changes:
 python/pyspark/sql/tests/connect/test_connect_basic.py     | 4
 python/pyspark/sql/tests/connect/test_connect_plan_only.py | 4
 python/pyspark/testing/connectutils.py                     | 8 +++-
 3 files changed, 15 insertions(+), 1 deletion(-)
[spark] branch master updated (c4e6b2cecee -> e63b7da85e4)
This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git

    from c4e6b2cecee [SPARK-40985][BUILD] Upgrade RoaringBitmap to 0.9.35
     add e63b7da85e4 [SPARK-40995][CONNECT][DOC] Defining Spark Connect Client Connection String

No new revisions were added by this update.

Summary of changes:
 .../pyspark/sql => connector}/connect/README.md    |  26 -
 connector/connect/docs/coient-connection-string.md | 116 +
 2 files changed, 137 insertions(+), 5 deletions(-)
 rename {python/pyspark/sql => connector}/connect/README.md (72%)
 create mode 100644 connector/connect/docs/coient-connection-string.md
[spark] branch master updated: [SPARK-40985][BUILD] Upgrade RoaringBitmap to 0.9.35
This is an automated email from the ASF dual-hosted git repository.

srowen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/master by this push:
     new c4e6b2cecee [SPARK-40985][BUILD] Upgrade RoaringBitmap to 0.9.35
c4e6b2cecee is described below

commit c4e6b2cecee612035651c32ff5aba3bd2a17a283
Author: yangjie01
AuthorDate: Wed Nov 2 10:46:55 2022 -0500

    [SPARK-40985][BUILD] Upgrade RoaringBitmap to 0.9.35

    ### What changes were proposed in this pull request?
    This PR aims to upgrade RoaringBitmap to 0.9.35.

    ### Why are the changes needed?
    This version brings some bug fixes:
    - https://github.com/RoaringBitmap/RoaringBitmap/pull/587
    - https://github.com/RoaringBitmap/RoaringBitmap/issues/588

    The other changes are as follows:
    https://github.com/RoaringBitmap/RoaringBitmap/compare/0.9.32...0.9.35

    ### Does this PR introduce _any_ user-facing change?
    No

    ### How was this patch tested?
    Pass GitHub Actions

    Closes #38465 from LuciferYang/rbitmap-0935.

    Authored-by: yangjie01
    Signed-off-by: Sean Owen
---
 core/benchmarks/MapStatusesConvertBenchmark-jdk11-results.txt |  8
 core/benchmarks/MapStatusesConvertBenchmark-jdk17-results.txt | 10 +-
 core/benchmarks/MapStatusesConvertBenchmark-results.txt       | 10 +-
 dev/deps/spark-deps-hadoop-2-hive-2.3                         |  4 ++--
 dev/deps/spark-deps-hadoop-3-hive-2.3                         |  4 ++--
 pom.xml                                                       |  2 +-
 6 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/core/benchmarks/MapStatusesConvertBenchmark-jdk11-results.txt b/core/benchmarks/MapStatusesConvertBenchmark-jdk11-results.txt
index adac80834e4..06f7cc7c92c 100644
--- a/core/benchmarks/MapStatusesConvertBenchmark-jdk11-results.txt
+++ b/core/benchmarks/MapStatusesConvertBenchmark-jdk11-results.txt
@@ -2,12 +2,12 @@
 MapStatuses Convert Benchmark
-OpenJDK 64-Bit Server VM 11.0.16+8-LTS on Linux 5.15.0-1019-azure
+OpenJDK 64-Bit Server VM 11.0.16.1+1 on Linux 5.15.0-1022-azure
 Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
 MapStatuses Convert:                 Best Time(ms)   Avg Time(ms)   Stdev(ms)   Rate(M/s)   Per Row(ns)   Relative
 -------------------------------------------------------------------------------------------------------------------
-Num Maps: 5 Fetch partitions:500              1269           1276           8         0.0  1268666001.0       1.0X
-Num Maps: 5 Fetch partitions:1000             2672           2695          39         0.0  2671542753.0       0.5X
-Num Maps: 5 Fetch partitions:1500             4034           4069          50         0.0  4033696987.0       0.3X
+Num Maps: 5 Fetch partitions:500              1227           1262          47         0.0  1226744907.0       1.0X
+Num Maps: 5 Fetch partitions:1000             2620           2637          15         0.0  2620288061.0       0.5X
+Num Maps: 5 Fetch partitions:1500             3975           3990          17         0.0  3974979610.0       0.3X

diff --git a/core/benchmarks/MapStatusesConvertBenchmark-jdk17-results.txt b/core/benchmarks/MapStatusesConvertBenchmark-jdk17-results.txt
index 9911ae3326f..3b6f5c6695e 100644
--- a/core/benchmarks/MapStatusesConvertBenchmark-jdk17-results.txt
+++ b/core/benchmarks/MapStatusesConvertBenchmark-jdk17-results.txt
@@ -2,12 +2,12 @@
 MapStatuses Convert Benchmark
-OpenJDK 64-Bit Server VM 17.0.4+8-LTS on Linux 5.15.0-1019-azure
-Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz
+OpenJDK 64-Bit Server VM 17.0.4.1+1 on Linux 5.15.0-1022-azure
+Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
 MapStatuses Convert:                 Best Time(ms)   Avg Time(ms)   Stdev(ms)   Rate(M/s)   Per Row(ns)   Relative
 -------------------------------------------------------------------------------------------------------------------
-Num Maps: 5 Fetch partitions:500              1228           1238          17         0.0  1228191051.0       1.0X
-Num Maps: 5 Fetch partitions:1000             2380           2393          16         0.0  2379601524.0       0.5X
-Num Maps: 5 Fetch partitions:1500             3803           3857          55         0.0  3802550172.0       0.3X
+Num Maps: 5 Fetch partitions:500              1159           1184          38         0.0  1159155979.0       1.0X
+Num Maps: 5 Fetch partitions:1000             2329           2387          57         0.0  2328833805.0       0.5X
+Num Maps: 5 Fetch partitions:1500             3608           3712          92         0.0  3607631972.0
[spark] branch master updated: [SPARK-40957] Add in memory cache in HDFSMetadataLog
This is an automated email from the ASF dual-hosted git repository.

kabhwan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/master by this push:
     new 5fa2c13cbf8 [SPARK-40957] Add in memory cache in HDFSMetadataLog
5fa2c13cbf8 is described below

commit 5fa2c13cbf83c6c4c040f15bbbf66dbe49581bdc
Author: Jerry Peng
AuthorDate: Wed Nov 2 22:24:16 2022 +0900

    [SPARK-40957] Add in memory cache in HDFSMetadataLog

    ### What changes were proposed in this pull request?
    Every time an entry in the offset log or commit log needs to be accessed, we read it from disk, which is slow. A cache of recent entries can speed up reads. There is already an existing implementation of a caching mechanism in OffsetSeqLog. Let's replace it with an implementation in HDFSMetadataLog (the parent class) so that we can serve reads from the in-memory cache for both the offset log and the commit log.

    ### Why are the changes needed?
    Improve read speeds for entries in the offset log and commit log.

    ### Does this PR introduce _any_ user-facing change?
    No

    ### How was this patch tested?
    Existing unit tests should suffice.

    Closes #38430 from jerrypeng/SPARK-40957.

    Authored-by: Jerry Peng
    Signed-off-by: Jungtaek Lim
---
 .../org/apache/spark/sql/internal/SQLConf.scala    |   8 ++
 .../sql/execution/streaming/HDFSMetadataLog.scala  | 112 ++---
 .../sql/execution/streaming/OffsetSeqLog.scala     |  18
 3 files changed, 85 insertions(+), 53 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index abe9df8dd87..0f3dc3cf44c 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -2007,6 +2007,14 @@ object SQLConf {
       .booleanConf
       .createWithDefault(true)

+  val STREAMING_METADATA_CACHE_ENABLED =
+    buildConf("spark.sql.streaming.metadataCache.enabled")
+      .internal()
+      .doc("Whether the streaming HDFSMetadataLog caches the metadata of the latest two batches.")
+      .booleanConf
+      .createWithDefault(true)
+
+
   val VARIABLE_SUBSTITUTE_ENABLED =
     buildConf("spark.sql.variable.substitute")
       .doc("This enables substitution using syntax like `${var}`, `${system:var}`, " +

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala
index 8a037b55168..1d444655548 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala
@@ -19,7 +19,9 @@ package org.apache.spark.sql.execution.streaming

 import java.io._
 import java.nio.charset.StandardCharsets
+import java.util.{Collections, LinkedHashMap => JLinkedHashMap}

+import scala.collection.JavaConverters._
 import scala.reflect.ClassTag

 import org.apache.commons.io.IOUtils
@@ -30,6 +32,7 @@ import org.json4s.jackson.Serialization
 import org.apache.spark.internal.Logging
 import org.apache.spark.sql.SparkSession
 import org.apache.spark.sql.errors.QueryExecutionErrors
+import org.apache.spark.sql.internal.SQLConf

 /**
@@ -64,6 +67,17 @@ class HDFSMetadataLog[T <: AnyRef : ClassTag](sparkSession: SparkSession, path:
     fileManager.mkdirs(metadataPath)
   }

+  protected val metadataCacheEnabled: Boolean
+    = sparkSession.sessionState.conf.getConf(SQLConf.STREAMING_METADATA_CACHE_ENABLED)
+
+  /**
+   * Cache the latest two batches. [[StreamExecution]] usually just accesses the latest two batches
+   * when committing offsets, this cache will save some file system operations.
+   */
+  protected[sql] val batchCache = Collections.synchronizedMap(new JLinkedHashMap[Long, T](2) {
+    override def removeEldestEntry(e: java.util.Map.Entry[Long, T]): Boolean = size > 2
+  })
+
   /**
    * A `PathFilter` to filter only batch files
    */
@@ -113,10 +127,18 @@ class HDFSMetadataLog[T <: AnyRef : ClassTag](sparkSession: SparkSession, path:
    */
   override def add(batchId: Long, metadata: T): Boolean = {
     require(metadata != null, "'null' metadata cannot written to a metadata log")
-    addNewBatchByStream(batchId) { output => serialize(metadata, output) }
+    val res = addNewBatchByStream(batchId) { output => serialize(metadata, output) }
+    if (metadataCacheEnabled && res) batchCache.put(batchId, metadata)
+    res
   }

   override def get(batchId: Long): Option[T] = {
+    if (metadataCacheEnabled && batchCache.containsKey(batchId)) {
+      val metadata = batchCache.get(batchId)
+      assert(metadata != null)
+      return Some(metadata)
+    }
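The `removeEldestEntry` override above is what bounds the cache to the two most recent batches. For illustration only (not code from the commit; the class and method names are hypothetical), here is a rough Python sketch of the same eviction policy:

```python
from collections import OrderedDict
from typing import Any, Optional


class LatestBatchCache:
    """Insertion-ordered cache keeping only the newest `max_entries` batches,
    mirroring the LinkedHashMap.removeEldestEntry override in the diff."""

    def __init__(self, max_entries: int = 2) -> None:
        self._max_entries = max_entries
        self._entries: "OrderedDict[int, Any]" = OrderedDict()

    def put(self, batch_id: int, metadata: Any) -> None:
        self._entries[batch_id] = metadata
        self._entries.move_to_end(batch_id)    # a re-inserted batch counts as newest
        while len(self._entries) > self._max_entries:
            self._entries.popitem(last=False)  # evict the eldest entry

    def get(self, batch_id: int) -> Optional[Any]:
        return self._entries.get(batch_id)
```

Per the SQLConf diff, the real cache is on by default and can be disabled with the internal conf `spark.sql.streaming.metadataCache.enabled`.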
[spark] branch master updated: [SPARK-40374][SQL] Migrate type check failures of type creators onto error classes
This is an automated email from the ASF dual-hosted git repository.

maxgekk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/master by this push:
     new 68531ada34d [SPARK-40374][SQL] Migrate type check failures of type creators onto error classes
68531ada34d is described below

commit 68531ada34db72d352c39396f85458a8370af812
Author: panbingkun
AuthorDate: Wed Nov 2 14:51:36 2022 +0300

    [SPARK-40374][SQL] Migrate type check failures of type creators onto error classes

    ### What changes were proposed in this pull request?
    This PR replaces TypeCheckFailure by DataTypeMismatch in the type checks of the complex type creator expressions, which includes:
    1. CreateMap (3): https://github.com/apache/spark/blob/1431975723d8df30a25b2333eddcfd0bb6c57677/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala#L205-L214
    2. CreateNamedStruct (3): https://github.com/apache/spark/blob/1431975723d8df30a25b2333eddcfd0bb6c57677/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala#L445-L457
    3. UpdateFields (2): https://github.com/apache/spark/blob/1431975723d8df30a25b2333eddcfd0bb6c57677/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala#L670-L673

    ### Why are the changes needed?
    Migration onto error classes unifies Spark SQL error messages.

    ### Does this PR introduce _any_ user-facing change?
    Yes. The PR changes user-facing error messages.

    ### How was this patch tested?
    1. Add new UT
    2. Update existing UT
    3. Pass GA

    Closes #38463 from panbingkun/SPARK-40374.

    Authored-by: panbingkun
    Signed-off-by: Max Gekk
---
 core/src/main/resources/error/error-classes.json   | 20 ++
 .../catalyst/expressions/complexTypeCreator.scala  | 72 ++-
 .../analysis/ExpressionTypeCheckingSuite.scala     | 83 --
 .../catalyst/expressions/ComplexTypeSuite.scala    | 47
 .../main/scala/org/apache/spark/sql/Column.scala   |  2 +-
 .../apache/spark/sql/ColumnExpressionSuite.scala   | 82 -
 6 files changed, 250 insertions(+), 56 deletions(-)

diff --git a/core/src/main/resources/error/error-classes.json b/core/src/main/resources/error/error-classes.json
index fe2cd3a44bb..7ec5e11a206 100644
--- a/core/src/main/resources/error/error-classes.json
+++ b/core/src/main/resources/error/error-classes.json
@@ -138,6 +138,11 @@
       "Unable to convert column of type to JSON."
     ]
   },
+  "CANNOT_DROP_ALL_FIELDS" : {
+    "message" : [
+      "Cannot drop all fields in struct."
+    ]
+  },
   "CAST_WITHOUT_SUGGESTION" : {
     "message" : [
       "cannot cast to ."
     ]
   },
@@ -155,6 +160,21 @@
       "To convert values from to , you can use the functions instead."
     ]
   },
+  "CREATE_MAP_KEY_DIFF_TYPES" : {
+    "message" : [
+      "The given keys of function should all be the same type, but they are ."
+    ]
+  },
+  "CREATE_MAP_VALUE_DIFF_TYPES" : {
+    "message" : [
+      "The given values of function should all be the same type, but they are ."
+    ]
+  },
+  "CREATE_NAMED_STRUCT_WITHOUT_FOLDABLE_STRING" : {
+    "message" : [
+      "Only foldable `STRING` expressions are allowed to appear at odd position, but they are ."
+    ]
+  },
   "DATA_DIFF_TYPES" : {
     "message" : [
       "Input to should all be the same type, but it's ."
     ]
   },

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
index 27d4f506ac8..97c882fd176 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
@@ -22,6 +22,8 @@ import scala.collection.mutable.ArrayBuffer

 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.analysis.{Resolver, TypeCheckResult, TypeCoercion, UnresolvedAttribute, UnresolvedExtractValue}
 import org.apache.spark.sql.catalyst.analysis.FunctionRegistry.{FUNC_ALIAS, FunctionBuilder}
+import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.DataTypeMismatch
+import org.apache.spark.sql.catalyst.expressions.Cast._
 import org.apache.spark.sql.catalyst.expressions.codegen._
 import org.apache.spark.sql.catalyst.expressions.codegen.Block._
 import org.apache.spark.sql.catalyst.parser.CatalystSqlParser
@@ -202,16 +204,30 @@ case class CreateMap(children: Seq[Expression], useStringTypeWhenEmpty: Boolean)

   override def checkInputDataTypes(): TypeCheckResult = {
     if (children.size % 2 != 0)
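A hypothetical PySpark snippet (not part of the commit) that exercises one of the migrated checks; it assumes a running `SparkSession` named `spark` and that `create_map` keys of incompatible types fail analysis:

```python
from pyspark.sql.functions import array, create_map, lit
from pyspark.sql.utils import AnalysisException

df = spark.range(1)
try:
    # Map keys of incompatible types: an INT and an ARRAY<INT>.
    df.select(create_map(lit(1), lit("a"), array(lit(2)), lit("b"))).collect()
except AnalysisException as e:
    # Expected: a DATATYPE_MISMATCH error built from DataTypeMismatch,
    # instead of the old free-form TypeCheckFailure message.
    print(e)
```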
[spark] branch master updated: [SPARK-40248][SQL] Use larger number of bits to build Bloom filter
This is an automated email from the ASF dual-hosted git repository.

yumwang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/master by this push:
     new d627d8e4f48 [SPARK-40248][SQL] Use larger number of bits to build Bloom filter
d627d8e4f48 is described below

commit d627d8e4f4802b8200574a1a73c4bebe5d813a5a
Author: Yuming Wang
AuthorDate: Wed Nov 2 18:05:54 2022 +0800

    [SPARK-40248][SQL] Use larger number of bits to build Bloom filter

    ### What changes were proposed in this pull request?
    This PR makes the Bloom filter join use a larger number of bits to build the Bloom filter when the row count is available.

    ### Why are the changes needed?
    Without this fix, the Bloom filter join filters out less data when CBO is enabled than when it is disabled. For example, TPC-DS q64:

    | CBO is enabled | CBO is disabled |
    | -- | -- |
    | https://user-images.githubusercontent.com/5399861/187076753-2e9ccc72-0289-4537-a6d9-3a01a37bf6cd.png | https://user-images.githubusercontent.com/5399861/187076786-c982e711-52e2-4199-ba42-e1100f57287b.png |
    | https://user-images.githubusercontent.com/5399861/187075553-bd6956b7-8f1f-4df5-82b7-d010defb6d21.png | https://user-images.githubusercontent.com/5399861/187075588-254c3246-b9af-403c-8df7-d8344fd1d2a4.png |

    After this PR:

    | Build bloom filter | Filter data |
    | -- | -- |
    | https://user-images.githubusercontent.com/5399861/187075676-85b2afae-03a0-4430-9c4e-2679c6ef62f7.png | https://user-images.githubusercontent.com/5399861/187075713-41173dc1-d01d-476a-b218-5c67be823e1b.png |

    ### Does this PR introduce _any_ user-facing change?
    No.

    ### How was this patch tested?
    Unit test.

    Closes #37697 from wangyum/SPARK-40248.

    Lead-authored-by: Yuming Wang
    Co-authored-by: Yuming Wang
    Signed-off-by: Yuming Wang
---
 .../org/apache/spark/util/sketch/BloomFilter.java  |  9
 .../aggregate/BloomFilterAggregate.scala           | 16 +++
 .../catalyst/optimizer/InjectRuntimeFilter.scala   |  3 +--
 .../approved-plans-modified/q10.sf100/explain.txt  |  8
 .../q10.sf100/simplified.txt                       |  2 +-
 .../approved-plans-modified/q59.sf100/explain.txt  | 16 +++
 .../q59.sf100/simplified.txt                       |  4 ++--
 .../approved-plans-v1_4/q10.sf100/explain.txt      |  8
 .../approved-plans-v1_4/q10.sf100/simplified.txt   |  2 +-
 .../approved-plans-v1_4/q16.sf100/explain.txt      | 24 +++---
 .../approved-plans-v1_4/q16.sf100/simplified.txt   |  6 +++---
 .../approved-plans-v1_4/q2.sf100/explain.txt       | 16 +++
 .../approved-plans-v1_4/q2.sf100/simplified.txt    |  4 ++--
 .../approved-plans-v1_4/q32.sf100/explain.txt      |  8
 .../approved-plans-v1_4/q32.sf100/simplified.txt   |  2 +-
 .../approved-plans-v1_4/q40.sf100/explain.txt      |  8
 .../approved-plans-v1_4/q40.sf100/simplified.txt   |  2 +-
 .../approved-plans-v1_4/q59.sf100/explain.txt      | 16 +++
 .../approved-plans-v1_4/q59.sf100/simplified.txt   |  4 ++--
 .../approved-plans-v1_4/q64.sf100/explain.txt      |  8
 .../approved-plans-v1_4/q64.sf100/simplified.txt   |  2 +-
 .../approved-plans-v1_4/q69.sf100/explain.txt      |  8
 .../approved-plans-v1_4/q69.sf100/simplified.txt   |  2 +-
 .../approved-plans-v1_4/q80.sf100/explain.txt      | 16 +++
 .../approved-plans-v1_4/q80.sf100/simplified.txt   |  4 ++--
 .../approved-plans-v1_4/q85.sf100/explain.txt      | 16 +++
 .../approved-plans-v1_4/q85.sf100/simplified.txt   |  4 ++--
 .../approved-plans-v1_4/q92.sf100/explain.txt      |  8
 .../approved-plans-v1_4/q92.sf100/simplified.txt   |  2 +-
 .../approved-plans-v1_4/q94.sf100/explain.txt      | 24 +++---
 .../approved-plans-v1_4/q94.sf100/simplified.txt   |  6 +++---
 .../approved-plans-v1_4/q95.sf100/explain.txt      | 24 +++---
 .../approved-plans-v1_4/q95.sf100/simplified.txt   |  6 +++---
 .../approved-plans-v2_7/q10a.sf100/explain.txt     |  8
 .../approved-plans-v2_7/q10a.sf100/simplified.txt  |  2 +-
 .../approved-plans-v2_7/q64.sf100/explain.txt      |  8
 .../approved-plans-v2_7/q64.sf100/simplified.txt   |  2 +-
 .../approved-plans-v2_7/q80a.sf100/explain.txt     | 16 +++
 .../approved-plans-v2_7/q80a.sf100/simplified.txt  |  4 ++--
 .../spark/sql/BloomFilterAggregateQuerySuite.scala | 17 +++
 40 files changed, 189 insertions(+), 156 deletions(-)

diff --git a/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilter.java b/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilter.
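For intuition (an illustration, not code from the commit): Guava-style Bloom filters, which Spark's sketch library follows, size the bit array with the standard formula m = -n * ln(p) / (ln 2)^2, so a larger expected item count n (e.g. a row count taken from statistics) yields more bits for the same target false-positive probability. A quick Python check of that formula:

```python
import math

def optimal_num_of_bits(expected_items: int, fpp: float) -> int:
    """Standard Bloom filter sizing: m = -n * ln(p) / (ln 2)^2."""
    return int(-expected_items * math.log(fpp) / (math.log(2) ** 2))

# The larger the row-count estimate, the more bits are allocated, so the built
# filter has a lower false-positive rate and can reject more non-matching rows.
print(optimal_num_of_bits(1_000_000, 0.03))    # ~7.3 million bits
print(optimal_num_of_bits(100_000_000, 0.03))  # ~730 million bits
```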