[spark] branch branch-3.5 updated: [SPARK-44496][SQL][CONNECT] Move Interfaces needed by SCSC to sql/api
This is an automated email from the ASF dual-hosted git repository. hvanhovell pushed a commit to branch branch-3.5 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.5 by this push: new b9d6b9a2658 [SPARK-44496][SQL][CONNECT] Move Interfaces needed by SCSC to sql/api b9d6b9a2658 is described below commit b9d6b9a26589100db682bb6b6d66eb0fb49df85e Author: Rui Wang AuthorDate: Sat Jul 22 19:53:04 2023 -0400 [SPARK-44496][SQL][CONNECT] Move Interfaces needed by SCSC to sql/api ### What changes were proposed in this pull request? This PR moves some interfaces and utils that are needed by scala client to sql/api. ### Why are the changes needed? So that scala client does not need to depend on Spark. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? UT Closes #42092 from amaliujia/row_coder. Authored-by: Rui Wang Signed-off-by: Herman van Hovell (cherry picked from commit 515dfc166a95ecc8decce0f0cd99e06fe395f94f) Signed-off-by: Herman van Hovell --- .../spark/api/java/function/CoGroupFunction.java | 0 .../api/java/function/DoubleFlatMapFunction.java | 0 .../spark/api/java/function/DoubleFunction.java| 0 .../spark/api/java/function/FilterFunction.java| 0 .../spark/api/java/function/FlatMapFunction.java | 0 .../spark/api/java/function/FlatMapFunction2.java | 0 .../api/java/function/FlatMapGroupsFunction.java | 0 .../spark/api/java/function/ForeachFunction.java | 0 .../java/function/ForeachPartitionFunction.java| 0 .../apache/spark/api/java/function/Function.java | 0 .../apache/spark/api/java/function/Function0.java | 0 .../apache/spark/api/java/function/Function2.java | 0 .../apache/spark/api/java/function/Function3.java | 0 .../apache/spark/api/java/function/Function4.java | 0 .../spark/api/java/function/MapFunction.java | 0 .../spark/api/java/function/MapGroupsFunction.java | 0 .../api/java/function/MapPartitionsFunction.java | 0 
.../api/java/function/PairFlatMapFunction.java | 0 .../spark/api/java/function/PairFunction.java | 0 .../spark/api/java/function/ReduceFunction.java| 0 .../spark/api/java/function/VoidFunction.java | 0 .../spark/api/java/function/VoidFunction2.java | 0 .../spark/api/java/function/package-info.java | 0 .../apache/spark/api/java/function/package.scala | 0 .../org/apache/spark/util/SparkClassUtils.scala| 4 ++ connector/connect/client/jvm/pom.xml | 5 ++ .../main/scala/org/apache/spark/sql/Column.scala | 4 +- .../org/apache/spark/sql/DataFrameReader.scala | 8 +-- .../main/scala/org/apache/spark/sql/Dataset.scala | 12 ++--- .../main/scala/org/apache/spark/util/Utils.scala | 1 - project/MimaExcludes.scala | 29 +++ .../main/scala/org/apache/spark/sql/Encoder.scala | 0 .../src/main/scala/org/apache/spark/sql/Row.scala | 6 +-- .../scala/org/apache/spark/sql/SqlApiConf.scala| 2 + .../sql/catalyst/encoders/AgnosticEncoder.scala| 4 +- .../sql/catalyst/expressions/GenericRow.scala | 25 ++--- .../sql/catalyst/expressions/OrderUtils.scala | 25 + .../sql/catalyst/util/SparkCharVarcharUtils.scala | 60 ++ .../apache/spark/sql/errors/DataTypeErrors.scala | 28 ++ .../spark/sql/catalyst/expressions/ordering.scala | 9 +--- .../spark/sql/catalyst/expressions/rows.scala | 21 .../spark/sql/catalyst/util/CharVarcharUtils.scala | 38 +- .../spark/sql/errors/QueryCompilationErrors.scala | 8 +-- .../spark/sql/errors/QueryExecutionErrors.scala| 10 +--- .../apache/spark/sql/CharVarcharTestSuite.scala| 28 +- .../org/apache/spark/sql/jdbc/JDBCSuite.scala | 4 +- .../sql/test/DataFrameReaderWriterSuite.scala | 12 ++--- 47 files changed, 203 insertions(+), 140 deletions(-) diff --git a/core/src/main/java/org/apache/spark/api/java/function/CoGroupFunction.java b/common/utils/src/main/java/org/apache/spark/api/java/function/CoGroupFunction.java similarity index 100% rename from core/src/main/java/org/apache/spark/api/java/function/CoGroupFunction.java rename to 
common/utils/src/main/java/org/apache/spark/api/java/function/CoGroupFunction.java diff --git a/core/src/main/java/org/apache/spark/api/java/function/DoubleFlatMapFunction.java b/common/utils/src/main/java/org/apache/spark/api/java/function/DoubleFlatMapFunction.java similarity index 100% rename from core/src/main/java/org/apache/spark/api/java/function/DoubleFlatMapFunction.java rename to common/utils/src/main/java/org/apache/spark/api/java/function/DoubleFlatMapFunction.java diff --git a/core/src/main/java/org/apache/spark/api/java/function/DoubleFunction.java
[spark] branch master updated: [SPARK-44496][SQL][CONNECT] Move Interfaces needed by SCSC to sql/api
This is an automated email from the ASF dual-hosted git repository. hvanhovell pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 515dfc166a9 [SPARK-44496][SQL][CONNECT] Move Interfaces needed by SCSC to sql/api 515dfc166a9 is described below commit 515dfc166a95ecc8decce0f0cd99e06fe395f94f Author: Rui Wang AuthorDate: Sat Jul 22 19:53:04 2023 -0400 [SPARK-44496][SQL][CONNECT] Move Interfaces needed by SCSC to sql/api ### What changes were proposed in this pull request? This PR moves some interfaces and utils that are needed by scala client to sql/api. ### Why are the changes needed? So that scala client does not need to depend on Spark. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? UT Closes #42092 from amaliujia/row_coder. Authored-by: Rui Wang Signed-off-by: Herman van Hovell --- .../spark/api/java/function/CoGroupFunction.java | 0 .../api/java/function/DoubleFlatMapFunction.java | 0 .../spark/api/java/function/DoubleFunction.java| 0 .../spark/api/java/function/FilterFunction.java| 0 .../spark/api/java/function/FlatMapFunction.java | 0 .../spark/api/java/function/FlatMapFunction2.java | 0 .../api/java/function/FlatMapGroupsFunction.java | 0 .../spark/api/java/function/ForeachFunction.java | 0 .../java/function/ForeachPartitionFunction.java| 0 .../apache/spark/api/java/function/Function.java | 0 .../apache/spark/api/java/function/Function0.java | 0 .../apache/spark/api/java/function/Function2.java | 0 .../apache/spark/api/java/function/Function3.java | 0 .../apache/spark/api/java/function/Function4.java | 0 .../spark/api/java/function/MapFunction.java | 0 .../spark/api/java/function/MapGroupsFunction.java | 0 .../api/java/function/MapPartitionsFunction.java | 0 .../api/java/function/PairFlatMapFunction.java | 0 .../spark/api/java/function/PairFunction.java | 0 .../spark/api/java/function/ReduceFunction.java| 0 
.../spark/api/java/function/VoidFunction.java | 0 .../spark/api/java/function/VoidFunction2.java | 0 .../spark/api/java/function/package-info.java | 0 .../apache/spark/api/java/function/package.scala | 0 .../org/apache/spark/util/SparkClassUtils.scala| 4 ++ connector/connect/client/jvm/pom.xml | 5 ++ .../main/scala/org/apache/spark/sql/Column.scala | 4 +- .../org/apache/spark/sql/DataFrameReader.scala | 8 +-- .../main/scala/org/apache/spark/sql/Dataset.scala | 12 ++--- .../main/scala/org/apache/spark/util/Utils.scala | 1 - project/MimaExcludes.scala | 29 +++ .../main/scala/org/apache/spark/sql/Encoder.scala | 0 .../src/main/scala/org/apache/spark/sql/Row.scala | 6 +-- .../scala/org/apache/spark/sql/SqlApiConf.scala| 2 + .../sql/catalyst/encoders/AgnosticEncoder.scala| 4 +- .../sql/catalyst/expressions/GenericRow.scala | 25 ++--- .../sql/catalyst/expressions/OrderUtils.scala | 25 + .../sql/catalyst/util/SparkCharVarcharUtils.scala | 60 ++ .../apache/spark/sql/errors/DataTypeErrors.scala | 28 ++ .../spark/sql/catalyst/expressions/ordering.scala | 9 +--- .../spark/sql/catalyst/expressions/rows.scala | 21 .../spark/sql/catalyst/util/CharVarcharUtils.scala | 38 +- .../spark/sql/errors/QueryCompilationErrors.scala | 8 +-- .../spark/sql/errors/QueryExecutionErrors.scala| 10 +--- .../apache/spark/sql/CharVarcharTestSuite.scala| 28 +- .../org/apache/spark/sql/jdbc/JDBCSuite.scala | 4 +- .../sql/test/DataFrameReaderWriterSuite.scala | 12 ++--- 47 files changed, 203 insertions(+), 140 deletions(-) diff --git a/core/src/main/java/org/apache/spark/api/java/function/CoGroupFunction.java b/common/utils/src/main/java/org/apache/spark/api/java/function/CoGroupFunction.java similarity index 100% rename from core/src/main/java/org/apache/spark/api/java/function/CoGroupFunction.java rename to common/utils/src/main/java/org/apache/spark/api/java/function/CoGroupFunction.java diff --git a/core/src/main/java/org/apache/spark/api/java/function/DoubleFlatMapFunction.java 
b/common/utils/src/main/java/org/apache/spark/api/java/function/DoubleFlatMapFunction.java similarity index 100% rename from core/src/main/java/org/apache/spark/api/java/function/DoubleFlatMapFunction.java rename to common/utils/src/main/java/org/apache/spark/api/java/function/DoubleFlatMapFunction.java diff --git a/core/src/main/java/org/apache/spark/api/java/function/DoubleFunction.java b/common/utils/src/main/java/org/apache/spark/api/java/function/DoubleFunction.java similarity index 100% rename from
[spark] branch master updated: [MINOR][UI] Simplify columnDefs in stagepage.js
This is an automated email from the ASF dual-hosted git repository. sarutak pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 46440a4a542 [MINOR][UI] Simplify columnDefs in stagepage.js 46440a4a542 is described below commit 46440a4a542148bc05b8c0f80d1860e6380efdb6 Author: Kent Yao AuthorDate: Sat Jul 22 17:12:07 2023 +0900 [MINOR][UI] Simplify columnDefs in stagepage.js ### What changes were proposed in this pull request? Simplify `columnDefs` in stagepage.js ### Why are the changes needed? Reduce hard-coding in stagepage.js and the potential for inconsistency between hidden and shown columns in future changes. ### Does this PR introduce _any_ user-facing change? no ### How was this patch tested? Locally verified. https://github.com/apache/spark/assets/8326978/3b3595a4-7825-47d5-8c28-30ec916321e6 Closes #42101 from yaooqinn/m. Authored-by: Kent Yao Signed-off-by: Kousuke Saruta --- .../org/apache/spark/ui/static/stagepage.js| 35 ++ 1 file changed, 9 insertions(+), 26 deletions(-) diff --git a/core/src/main/resources/org/apache/spark/ui/static/stagepage.js b/core/src/main/resources/org/apache/spark/ui/static/stagepage.js index 50bf959d3aa..a8792593bf2 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/stagepage.js +++ b/core/src/main/resources/org/apache/spark/ui/static/stagepage.js @@ -235,11 +235,7 @@ function createDataTableForTaskSummaryMetricsTable(taskSummaryMetricsTable) { } ], "columnDefs": [ -{ "type": "duration", "targets": 1 }, -{ "type": "duration", "targets": 2 }, -{ "type": "duration", "targets": 3 }, -{ "type": "duration", "targets": 4 }, -{ "type": "duration", "targets": 5 } +{ "type": "duration", "targets": [1, 2, 3, 4, 5] } ], "paging": false, "info": false, @@ -592,22 +588,16 @@ $(document).ready(function () { // The targets: $id represents column id which comes from stagespage-template.html // #summary-executor-table.If the relative position of
the columns in the table // #summary-executor-table has changed,please be careful to adjust the column index here -// Input Size / Records -{"type": "size", "targets": 9}, -// Output Size / Records -{"type": "size", "targets": 10}, -// Shuffle Read Size / Records -{"type": "size", "targets": 11}, -// Shuffle Write Size / Records -{"type": "size", "targets": 12}, +// Input Size / Records - 9 +// Output Size / Records - 10 +// Shuffle Read Size / Records - 11 +// Shuffle Write Size / Records - 12 +{"type": "size", "targets": [9, 10, 11, 12]}, // Peak JVM Memory OnHeap / OffHeap -{"visible": false, "targets": 15}, // Peak Execution Memory OnHeap / OffHeap -{"visible": false, "targets": 16}, // Peak Storage Memory OnHeap / OffHeap -{"visible": false, "targets": 17}, // Peak Pool Memory Direct / Mapped -{"visible": false, "targets": 18} +{"visible": false, "targets": executorOptionalColumns}, ], "deferRender": true, "order": [[0, "asc"]], @@ -1079,15 +1069,8 @@ $(document).ready(function () { } ], "columnDefs": [ -{ "visible": false, "targets": 11 }, -{ "visible": false, "targets": 12 }, -{ "visible": false, "targets": 13 }, -{ "visible": false, "targets": 14 }, -{ "visible": false, "targets": 15 }, -{ "visible": false, "targets": 16 }, -{ "visible": false, "targets": 17 }, -{ "visible": false, "targets": 18 }, -{ "visible": false, "targets": 21 } +{ "visible": false, "targets": optionalColumns }, +{ "visible": false, "targets": 18 }, // accumulators ], "deferRender": true }; - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.5 updated: [MINOR][CONNECT] Remove redundant type cast in `ArtifactManager`
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch branch-3.5 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.5 by this push: new 6e19fb2853b [MINOR][CONNECT] Remove redundant type cast in `ArtifactManager` 6e19fb2853b is described below commit 6e19fb2853b26b5da3f02bc829e0c05e52d5a929 Author: yangjie01 AuthorDate: Sat Jul 22 16:50:27 2023 +0900 [MINOR][CONNECT] Remove redundant type cast in `ArtifactManager` ### What changes were proposed in this pull request? This PR just removes a redundant type cast in `ArtifactManager` ### Why are the changes needed? To remove a redundant type cast. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Pass GitHub Actions Closes #42107 from LuciferYang/redundant-cast. Authored-by: yangjie01 Signed-off-by: Hyukjin Kwon (cherry picked from commit 49401c7a679c807abd25613648bd156a221ccc7b) Signed-off-by: Hyukjin Kwon --- .../scala/org/apache/spark/sql/connect/client/ArtifactManager.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/connect/client/ArtifactManager.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/connect/client/ArtifactManager.scala index b24c36ea474..a0158170a5b 100644 --- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/connect/client/ArtifactManager.scala +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/connect/client/ArtifactManager.scala @@ -219,7 +219,7 @@ class ArtifactManager( .setUserContext(userContext) .setSessionId(sessionId) artifacts.foreach { artifact => - val in = new CheckedInputStream(artifact.storage.asInstanceOf[LocalData].stream, new CRC32) + val in = new CheckedInputStream(artifact.storage.stream, new CRC32) try { val data = proto.AddArtifactsRequest.ArtifactChunk .newBuilder() @@ -274,7 +274,7 @@
class ArtifactManager( .setUserContext(userContext) .setSessionId(sessionId) -val in = new CheckedInputStream(artifact.storage.asInstanceOf[LocalData].stream, new CRC32) +val in = new CheckedInputStream(artifact.storage.stream, new CRC32) try { // First RPC contains the `BeginChunkedArtifact` payload (`begin_chunk`). // Subsequent RPCs contains the `ArtifactChunk` payload (`chunk`). - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated: [MINOR][CONNECT] Remove redundant type cast in `ArtifactManager`
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 49401c7a679 [MINOR][CONNECT] Remove redundant type cast in `ArtifactManager` 49401c7a679 is described below commit 49401c7a679c807abd25613648bd156a221ccc7b Author: yangjie01 AuthorDate: Sat Jul 22 16:50:27 2023 +0900 [MINOR][CONNECT] Remove redundant type cast in `ArtifactManager` ### What changes were proposed in this pull request? This PR just removes a redundant type cast in `ArtifactManager` ### Why are the changes needed? To remove a redundant type cast. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Pass GitHub Actions Closes #42107 from LuciferYang/redundant-cast. Authored-by: yangjie01 Signed-off-by: Hyukjin Kwon --- .../scala/org/apache/spark/sql/connect/client/ArtifactManager.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/connect/client/ArtifactManager.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/connect/client/ArtifactManager.scala index b24c36ea474..a0158170a5b 100644 --- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/connect/client/ArtifactManager.scala +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/connect/client/ArtifactManager.scala @@ -219,7 +219,7 @@ class ArtifactManager( .setUserContext(userContext) .setSessionId(sessionId) artifacts.foreach { artifact => - val in = new CheckedInputStream(artifact.storage.asInstanceOf[LocalData].stream, new CRC32) + val in = new CheckedInputStream(artifact.storage.stream, new CRC32) try { val data = proto.AddArtifactsRequest.ArtifactChunk .newBuilder() @@ -274,7 +274,7 @@ class ArtifactManager( .setUserContext(userContext) .setSessionId(sessionId) -val in = new
CheckedInputStream(artifact.storage.asInstanceOf[LocalData].stream, new CRC32) +val in = new CheckedInputStream(artifact.storage.stream, new CRC32) try { // First RPC contains the `BeginChunkedArtifact` payload (`begin_chunk`). // Subsequent RPCs contains the `ArtifactChunk` payload (`chunk`). - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.5 updated: [SPARK-44380][PYTHON][FOLLOWUP] Set __doc__ for analyze static method when Arrow is enabled
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch branch-3.5 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.5 by this push: new 07d3e8e5287 [SPARK-44380][PYTHON][FOLLOWUP] Set __doc__ for analyze static method when Arrow is enabled 07d3e8e5287 is described below commit 07d3e8e52878ea9631d4757b67119a18fbdf0230 Author: Takuya UESHIN AuthorDate: Sat Jul 22 16:49:14 2023 +0900 [SPARK-44380][PYTHON][FOLLOWUP] Set __doc__ for analyze static method when Arrow is enabled ### What changes were proposed in this pull request? This is a follow-up of apache/spark#41948. Set `__doc__` for `analyze` static method when Arrow is enabled. ### Why are the changes needed? When Arrow is enabled, `analyze` static method doesn't have `__doc__` that should be the same as the original contents. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Updated the related tests. Closes #42111 from ueshin/issues/SPARK-44380/analyze_doc. Authored-by: Takuya UESHIN Signed-off-by: Hyukjin Kwon (cherry picked from commit 784b942196bb08a7959222f549722c6db3a3588e) Signed-off-by: Hyukjin Kwon --- python/pyspark/sql/tests/test_udtf.py | 8 +++- python/pyspark/sql/udtf.py| 3 +++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/python/pyspark/sql/tests/test_udtf.py b/python/pyspark/sql/tests/test_udtf.py index 2c76d2f7e15..0fdb1c9b8a1 100644 --- a/python/pyspark/sql/tests/test_udtf.py +++ b/python/pyspark/sql/tests/test_udtf.py @@ -728,6 +728,11 @@ class BaseUDTFTestsMixin: """Initialize the UDTF""" ... +@staticmethod +def analyze(x: AnalyzeArgument) -> AnalyzeResult: +"""Analyze the argument.""" +... + def eval(self, x: int): """Evaluate the input row.""" yield x + 1, @@ -736,9 +741,10 @@ class BaseUDTFTestsMixin: """Terminate the UDTF.""" ... 
-cls = udtf(TestUDTF, returnType="y: int").func +cls = udtf(TestUDTF).func self.assertIn("A UDTF for test", cls.__doc__) self.assertIn("Initialize the UDTF", cls.__init__.__doc__) +self.assertIn("Analyze the argument", cls.analyze.__doc__) self.assertIn("Evaluate the input row", cls.eval.__doc__) self.assertIn("Terminate the UDTF", cls.terminate.__doc__) diff --git a/python/pyspark/sql/udtf.py b/python/pyspark/sql/udtf.py index 3ab74193093..e930daa9f51 100644 --- a/python/pyspark/sql/udtf.py +++ b/python/pyspark/sql/udtf.py @@ -134,6 +134,9 @@ def _vectorize_udtf(cls: Type) -> Type: if hasattr(cls, "terminate"): getattr(vectorized_udtf, "terminate").__doc__ = getattr(cls, "terminate").__doc__ +if hasattr(vectorized_udtf, "analyze"): +getattr(vectorized_udtf, "analyze").__doc__ = getattr(cls, "analyze").__doc__ + return vectorized_udtf - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated: [SPARK-44380][PYTHON][FOLLOWUP] Set __doc__ for analyze static method when Arrow is enabled
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 784b942196b [SPARK-44380][PYTHON][FOLLOWUP] Set __doc__ for analyze static method when Arrow is enabled 784b942196b is described below commit 784b942196bb08a7959222f549722c6db3a3588e Author: Takuya UESHIN AuthorDate: Sat Jul 22 16:49:14 2023 +0900 [SPARK-44380][PYTHON][FOLLOWUP] Set __doc__ for analyze static method when Arrow is enabled ### What changes were proposed in this pull request? This is a follow-up of apache/spark#41948. Set `__doc__` for `analyze` static method when Arrow is enabled. ### Why are the changes needed? When Arrow is enabled, `analyze` static method doesn't have `__doc__` that should be the same as the original contents. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Updated the related tests. Closes #42111 from ueshin/issues/SPARK-44380/analyze_doc. Authored-by: Takuya UESHIN Signed-off-by: Hyukjin Kwon --- python/pyspark/sql/tests/test_udtf.py | 8 +++- python/pyspark/sql/udtf.py| 3 +++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/python/pyspark/sql/tests/test_udtf.py b/python/pyspark/sql/tests/test_udtf.py index 13ea86ebcb2..e67ec245795 100644 --- a/python/pyspark/sql/tests/test_udtf.py +++ b/python/pyspark/sql/tests/test_udtf.py @@ -749,6 +749,11 @@ class BaseUDTFTestsMixin: """Initialize the UDTF""" ... +@staticmethod +def analyze(x: AnalyzeArgument) -> AnalyzeResult: +"""Analyze the argument.""" +... + def eval(self, x: int): """Evaluate the input row.""" yield x + 1, @@ -757,9 +762,10 @@ class BaseUDTFTestsMixin: """Terminate the UDTF.""" ... 
-cls = udtf(TestUDTF, returnType="y: int").func +cls = udtf(TestUDTF).func self.assertIn("A UDTF for test", cls.__doc__) self.assertIn("Initialize the UDTF", cls.__init__.__doc__) +self.assertIn("Analyze the argument", cls.analyze.__doc__) self.assertIn("Evaluate the input row", cls.eval.__doc__) self.assertIn("Terminate the UDTF", cls.terminate.__doc__) diff --git a/python/pyspark/sql/udtf.py b/python/pyspark/sql/udtf.py index a35278deb9b..67d4ef33777 100644 --- a/python/pyspark/sql/udtf.py +++ b/python/pyspark/sql/udtf.py @@ -178,6 +178,9 @@ def _vectorize_udtf(cls: Type) -> Type: if hasattr(cls, "terminate"): getattr(vectorized_udtf, "terminate").__doc__ = getattr(cls, "terminate").__doc__ +if hasattr(vectorized_udtf, "analyze"): +getattr(vectorized_udtf, "analyze").__doc__ = getattr(cls, "analyze").__doc__ + return vectorized_udtf - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.5 updated: [SPARK-44485][SQL] Optimize TreeNode.generateTreeString
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch branch-3.5 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.5 by this push: new efca188b7b3 [SPARK-44485][SQL] Optimize TreeNode.generateTreeString efca188b7b3 is described below commit efca188b7b33d0c0fe3cf9ca83286741f7f57dc7 Author: Ziqi Liu AuthorDate: Sat Jul 22 16:43:31 2023 +0900 [SPARK-44485][SQL] Optimize TreeNode.generateTreeString ### What changes were proposed in this pull request? Optimize several critical code paths in `TreeNode.generateTreeString` ### Why are the changes needed? In `TreeNode.generateTreeString`, we observed inefficiency in Scala collection operations and virtual function calls in our internal workload. This inefficiency becomes significant in large plans (we hit an example of more than 1000 nodes). So it’s worth optimizing the super hot code path. By rewriting into native Java code (not as sweet as Scala syntactic sugar, though), we should be able to get rid of most of the overhead. - `ArrayBuffer.append` https://github.com/apache/spark/assets/22358241/3e1d2e5e-1eeb-46ef-ab7a-20f4cb75f602 - `Seq.last` https://github.com/apache/spark/assets/22358241/23f29695-8a01-4c8e-b75a-148a92278c2b - `SeqLike.$colon$plus` https://github.com/apache/spark/assets/22358241/f0526746-62d0-4556-99be-04a24ab805d2 - `StringOps.$times` https://github.com/apache/spark/assets/22358241/3a46f18e-7027-421e-aa5a-130d02e1c19c ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Existing UTs Closes #42095 from liuzqt/SPARK-44485. 
Authored-by: Ziqi Liu Signed-off-by: Hyukjin Kwon (cherry picked from commit 09c44fd05d10dccc131885f94a61da0885de0ad0) Signed-off-by: Hyukjin Kwon --- .../spark/sql/catalyst/util/StringUtils.scala | 8 ++--- .../apache/spark/sql/catalyst/trees/TreeNode.scala | 35 -- .../spark/sql/catalyst/util/StringUtils.scala | 2 +- .../sql/execution/WholeStageCodegenExec.scala | 4 +-- .../execution/adaptive/AdaptiveSparkPlanExec.scala | 8 +++-- .../sql/execution/adaptive/QueryStageExec.scala| 6 ++-- .../sql/execution/basicPhysicalOperators.scala | 2 +- 7 files changed, 42 insertions(+), 23 deletions(-) diff --git a/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala b/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala index c12a1f50daa..20fb8bb94bd 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala @@ -18,8 +18,6 @@ package org.apache.spark.sql.catalyst.util import java.util.concurrent.atomic.AtomicBoolean -import scala.collection.mutable.ArrayBuffer - import org.apache.spark.internal.Logging import org.apache.spark.unsafe.array.ByteArrayUtils @@ -29,7 +27,7 @@ import org.apache.spark.unsafe.array.ByteArrayUtils * the string. */ class StringConcat(val maxLength: Int = ByteArrayUtils.MAX_ROUNDED_ARRAY_LENGTH) { - protected val strings = new ArrayBuffer[String] + protected val strings = new java.util.ArrayList[String] protected var length: Int = 0 def atLimit: Boolean = length >= maxLength @@ -45,7 +43,7 @@ class StringConcat(val maxLength: Int = ByteArrayUtils.MAX_ROUNDED_ARRAY_LENGTH) if (!atLimit) { val available = maxLength - length val stringToAppend = if (available >= sLen) s else s.substring(0, available) -strings.append(stringToAppend) +strings.add(stringToAppend) } // Keeps the total length of appended strings. 
Note that we need to cap the length at @@ -62,7 +60,7 @@ class StringConcat(val maxLength: Int = ByteArrayUtils.MAX_ROUNDED_ARRAY_LENGTH) override def toString: String = { val finalLength = if (atLimit) maxLength else length val result = new java.lang.StringBuilder(finalLength) -strings.foreach(result.append) +strings.forEach(s => result.append(s)) result.toString } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala index 8d3f81666f8..9e605a45414 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala @@ -936,7 +936,8 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] addSuffix: Boolean, maxFields: Int, printOperatorId: Boolean): Unit = { -generateTreeString(0, Nil, append, verbose, "", addSuffix, maxFields, printOperatorId, 0) +generateTreeString(0, new java.util.ArrayList(), append,
[spark] branch master updated: [SPARK-44485][SQL] Optimize TreeNode.generateTreeString
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 09c44fd05d1 [SPARK-44485][SQL] Optimize TreeNode.generateTreeString 09c44fd05d1 is described below commit 09c44fd05d10dccc131885f94a61da0885de0ad0 Author: Ziqi Liu AuthorDate: Sat Jul 22 16:43:31 2023 +0900 [SPARK-44485][SQL] Optimize TreeNode.generateTreeString ### What changes were proposed in this pull request? Optimize several critical code paths in `TreeNode.generateTreeString` ### Why are the changes needed? In `TreeNode.generateTreeString`, we observed inefficiency in Scala collection operations and virtual function calls in our internal workload. This inefficiency becomes significant in large plans (we hit an example of more than 1000 nodes). So it’s worth optimizing the super hot code path. By rewriting into native Java code (not as sweet as Scala syntactic sugar, though), we should be able to get rid of most of the overhead. - `ArrayBuffer.append` https://github.com/apache/spark/assets/22358241/3e1d2e5e-1eeb-46ef-ab7a-20f4cb75f602 - `Seq.last` https://github.com/apache/spark/assets/22358241/23f29695-8a01-4c8e-b75a-148a92278c2b - `SeqLike.$colon$plus` https://github.com/apache/spark/assets/22358241/f0526746-62d0-4556-99be-04a24ab805d2 - `StringOps.$times` https://github.com/apache/spark/assets/22358241/3a46f18e-7027-421e-aa5a-130d02e1c19c ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Existing UTs Closes #42095 from liuzqt/SPARK-44485. 
Authored-by: Ziqi Liu Signed-off-by: Hyukjin Kwon --- .../spark/sql/catalyst/util/StringUtils.scala | 8 ++--- .../apache/spark/sql/catalyst/trees/TreeNode.scala | 35 -- .../spark/sql/catalyst/util/StringUtils.scala | 2 +- .../sql/execution/WholeStageCodegenExec.scala | 4 +-- .../execution/adaptive/AdaptiveSparkPlanExec.scala | 8 +++-- .../sql/execution/adaptive/QueryStageExec.scala| 6 ++-- .../sql/execution/basicPhysicalOperators.scala | 2 +- 7 files changed, 42 insertions(+), 23 deletions(-) diff --git a/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala b/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala index c12a1f50daa..20fb8bb94bd 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala @@ -18,8 +18,6 @@ package org.apache.spark.sql.catalyst.util import java.util.concurrent.atomic.AtomicBoolean -import scala.collection.mutable.ArrayBuffer - import org.apache.spark.internal.Logging import org.apache.spark.unsafe.array.ByteArrayUtils @@ -29,7 +27,7 @@ import org.apache.spark.unsafe.array.ByteArrayUtils * the string. */ class StringConcat(val maxLength: Int = ByteArrayUtils.MAX_ROUNDED_ARRAY_LENGTH) { - protected val strings = new ArrayBuffer[String] + protected val strings = new java.util.ArrayList[String] protected var length: Int = 0 def atLimit: Boolean = length >= maxLength @@ -45,7 +43,7 @@ class StringConcat(val maxLength: Int = ByteArrayUtils.MAX_ROUNDED_ARRAY_LENGTH) if (!atLimit) { val available = maxLength - length val stringToAppend = if (available >= sLen) s else s.substring(0, available) -strings.append(stringToAppend) +strings.add(stringToAppend) } // Keeps the total length of appended strings. 
Note that we need to cap the length at @@ -62,7 +60,7 @@ class StringConcat(val maxLength: Int = ByteArrayUtils.MAX_ROUNDED_ARRAY_LENGTH) override def toString: String = { val finalLength = if (atLimit) maxLength else length val result = new java.lang.StringBuilder(finalLength) -strings.foreach(result.append) +strings.forEach(s => result.append(s)) result.toString } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala index 8d3f81666f8..9e605a45414 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala @@ -936,7 +936,8 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] addSuffix: Boolean, maxFields: Int, printOperatorId: Boolean): Unit = { -generateTreeString(0, Nil, append, verbose, "", addSuffix, maxFields, printOperatorId, 0) +generateTreeString(0, new java.util.ArrayList(), append, verbose, "", addSuffix, maxFields, + printOperatorId, 0) } /** @@ -998,7 +999,7 @@ abstract class