(spark) branch master updated: [SPARK-46500][PS][TESTS] Reorganize `FrameParityPivotTests`
This is an automated email from the ASF dual-hosted git repository. ruifengz pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new d85ad1c14182 [SPARK-46500][PS][TESTS] Reorganize `FrameParityPivotTests` d85ad1c14182 is described below commit d85ad1c14182d847e8a5d5d49cf21cd9079b284f Author: Ruifeng Zheng AuthorDate: Mon Dec 25 17:43:57 2023 +0800 [SPARK-46500][PS][TESTS] Reorganize `FrameParityPivotTests` ### What changes were proposed in this pull request? Reorganize `FrameParityPivotTests`: break `test_pivot_table` into multiple tests ### Why are the changes needed? this test is slow ``` Starting test(python3.9): pyspark.pandas.tests.connect.computation.test_parity_pivot (temp output: /__w/spark/spark/python/target/5f37e442-9037-47cc-8c6b-e9a273299d0d/python3.9__pyspark.pandas.tests.connect.computation.test_parity_pivot__ozvdx_ay.log) Finished test(python3.9): pyspark.pandas.tests.connect.computation.test_parity_pivot (524s) ``` ### Does this PR introduce _any_ user-facing change? no, test only ### How was this patch tested? ci ### Was this patch authored or co-authored using generative AI tooling? no Closes #44478 from zhengruifeng/ps_test_pivot_multi. 
Authored-by: Ruifeng Zheng Signed-off-by: Ruifeng Zheng --- dev/sparktestsupport/modules.py| 8 ++ .../pyspark/pandas/tests/computation/test_pivot.py | 149 + .../pandas/tests/computation/test_pivot_table.py | 93 + .../tests/computation/test_pivot_table_adv.py | 93 + .../computation/test_pivot_table_multi_idx.py | 91 + .../computation/test_pivot_table_multi_idx_adv.py | 93 + .../tests/connect/computation/test_parity_pivot.py | 6 +- ..._parity_pivot.py => test_parity_pivot_table.py} | 10 +- ...ity_pivot.py => test_parity_pivot_table_adv.py} | 10 +- ...vot.py => test_parity_pivot_table_multi_idx.py} | 10 +- ...py => test_parity_pivot_table_multi_idx_adv.py} | 12 +- 11 files changed, 418 insertions(+), 157 deletions(-) diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index e4e3803a8f87..6f41b6f0eddf 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -824,6 +824,10 @@ pyspark_pandas_slow = Module( "pyspark.pandas.tests.computation.test_melt", "pyspark.pandas.tests.computation.test_missing_data", "pyspark.pandas.tests.computation.test_pivot", +"pyspark.pandas.tests.computation.test_pivot_table", +"pyspark.pandas.tests.computation.test_pivot_table_adv", +"pyspark.pandas.tests.computation.test_pivot_table_multi_idx", +"pyspark.pandas.tests.computation.test_pivot_table_multi_idx_adv", "pyspark.pandas.tests.computation.test_stats", "pyspark.pandas.tests.frame.test_attrs", "pyspark.pandas.tests.frame.test_axis", @@ -1162,6 +1166,10 @@ pyspark_pandas_connect_part2 = Module( python_test_goals=[ # pandas-on-Spark unittests "pyspark.pandas.tests.connect.computation.test_parity_pivot", +"pyspark.pandas.tests.connect.computation.test_parity_pivot_table", +"pyspark.pandas.tests.connect.computation.test_parity_pivot_table_adv", + "pyspark.pandas.tests.connect.computation.test_parity_pivot_table_multi_idx", + "pyspark.pandas.tests.connect.computation.test_parity_pivot_table_multi_idx_adv", 
"pyspark.pandas.tests.connect.computation.test_parity_stats", "pyspark.pandas.tests.connect.indexes.test_parity_base_slow", "pyspark.pandas.tests.connect.frame.test_parity_interpolate", diff --git a/python/pyspark/pandas/tests/computation/test_pivot.py b/python/pyspark/pandas/tests/computation/test_pivot.py index 8a373108ddc8..2670fa384dc7 100644 --- a/python/pyspark/pandas/tests/computation/test_pivot.py +++ b/python/pyspark/pandas/tests/computation/test_pivot.py @@ -61,149 +61,6 @@ class FramePivotMixin: # columns="a", values="b", fill_value=999).dtypes, pdf.pivot_table(index=['e', 'c'], # columns="a", values="b", fill_value=999).dtypes) -def test_pivot_table(self): -pdf = pd.DataFrame( -{ -"a": [4, 2, 3, 4, 8, 6], -"b": [1, 2, 2, 4, 2, 4], -"e": [10, 20, 20, 40, 20, 40], -"c": [1, 2, 9, 4, 7, 4], -"d": [-1, -2, -3, -4, -5, -6], -}, -index=np.random.rand(6), -) -psdf = ps.from_pandas(pdf) - -# Checking if both DataFrames have the same results -self.assert_eq( -psdf.pivot_table(columns="a", values="b").sort_index(), -pdf.pivot_table(columns="a", values="b").sort_index(), -almost=True, -
(spark) branch master updated: [MINOR][INFRA] Comments in GitHub scripts should start with #
This is an automated email from the ASF dual-hosted git repository. ruifengz pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new d1a333d554ad [MINOR][INFRA] Comments in GitHub scripts should start with # d1a333d554ad is described below commit d1a333d554ad1da59bca277641f9e7d26247a0c6 Author: panbingkun AuthorDate: Mon Dec 25 19:33:33 2023 +0800 [MINOR][INFRA] Comments in GitHub scripts should start with # ### What changes were proposed in this pull request? The pr aims to fix a typo in github action script, `comments` in GitHub scripts should start with `#`. ### Why are the changes needed? In the GitHub runtime logs, we often observe runtime prompts similar to the one below: https://github.com/panbingkun/spark/actions/runs/7167111730/job/19513821177 https://github.com/panbingkun/spark/actions/runs/7167111730/job/19513823103 https://github.com/apache/spark/assets/15246973/26d39f6c-e6e3-4887-9344-a8fa251a255e";> Although it does not affect the overall task execution, I think it is necessary to correct it. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Manually test. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #44473 from panbingkun/address_github. Authored-by: panbingkun Signed-off-by: Ruifeng Zheng --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 2385da332002..a337eaa7864d 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -264,7 +264,7 @@ jobs: export TERM=vt100 # Hive "other tests" test needs larger metaspace size based on experiment. 
if [[ "$MODULES_TO_TEST" == "hive" ]] && [[ "$EXCLUDED_TAGS" == "org.apache.spark.tags.SlowHiveTest" ]]; then export METASPACE_SIZE=2g; fi -// SPARK-46283: should delete the following env replacement after SPARK 3.x EOL +# SPARK-46283: should delete the following env replacement after SPARK 3.x EOL if [[ "$MODULES_TO_TEST" == *"streaming-kinesis-asl"* ]] && [[ "${{ inputs.branch }}" =~ ^branch-3 ]]; then MODULES_TO_TEST=${MODULES_TO_TEST//streaming-kinesis-asl, /} fi - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
(spark) branch master updated: [SPARK-46481] Execute immediate VariableReference foldable
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new c1d3fc1de30e [SPARK-46481] Execute immediate VariableReference foldable c1d3fc1de30e is described below commit c1d3fc1de30e6c3e453592cac485e674a864692c Author: milastdbx AuthorDate: Mon Dec 25 22:28:17 2023 +0800 [SPARK-46481] Execute immediate VariableReference foldable ### What changes were proposed in this pull request? As part of EXECUTE IMMEDIATE statement, we are doing variable resolution, and [previous PR ](https://github.com/apache/spark/pull/44093) introduced copy/paste issue from SET variable `canFold = false`. This is fine for SET command, but for parameters should be foldable to match regular query behaviour with same pattern. ### Why are the changes needed? To align parameterized and non-parameterized queries ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Manually Closes #44450 from milastdbx/dev/milast/executeImmediateFoldVarReference. 
Authored-by: milastdbx Signed-off-by: Wenchen Fan --- .../spark/sql/catalyst/analysis/executeImmediate.scala | 2 +- .../scala/org/apache/spark/sql/ParametersSuite.scala | 18 +- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/executeImmediate.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/executeImmediate.scala index 8fc373b71f25..7cc496616128 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/executeImmediate.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/executeImmediate.scala @@ -177,7 +177,7 @@ class SubstituteExecuteImmediate(val catalogManager: CatalogManager) private def getVariableReference(expr: Expression, nameParts: Seq[String]): VariableReference = { lookupVariable(nameParts) match { - case Some(variable) => variable.copy(canFold = false) + case Some(variable) => variable case _ => throw QueryCompilationErrors .unresolvedVariableError( diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ParametersSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ParametersSuite.scala index 974def7f3b85..2801948f6837 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/ParametersSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/ParametersSuite.scala @@ -21,11 +21,12 @@ import java.time.{Instant, LocalDate, LocalDateTime, ZoneId} import org.apache.spark.sql.catalyst.expressions.Literal import org.apache.spark.sql.catalyst.parser.ParseException +import org.apache.spark.sql.catalyst.plans.PlanTest import org.apache.spark.sql.functions.{array, call_function, lit, map, map_from_arrays, map_from_entries, str_to_map, struct} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession -class ParametersSuite extends QueryTest with SharedSparkSession { +class ParametersSuite extends QueryTest with SharedSparkSession with PlanTest { test("bind 
named parameters") { val sqlText = @@ -607,4 +608,19 @@ class ParametersSuite extends QueryTest with SharedSparkSession { callSitePattern = getCurrentClassCallSitePattern) ) } + + test("SPARK-46481: Test variable folding") { +sql("DECLARE a INT = 1") +sql("SET VAR a = 1") +val expected = sql("SELECT 42 WHERE 1 = 1").queryExecution.optimizedPlan +val variableDirectly = sql("SELECT 42 WHERE 1 = a").queryExecution.optimizedPlan +val parameterizedSpark = + spark.sql("SELECT 42 WHERE 1 = ?", Array(1)).queryExecution.optimizedPlan +val parameterizedSql = + spark.sql("EXECUTE IMMEDIATE 'SELECT 42 WHERE 1 = ?' USING a").queryExecution.optimizedPlan + +comparePlans(expected, variableDirectly) +comparePlans(expected, parameterizedSpark) +comparePlans(expected, parameterizedSql) + } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
(spark) branch master updated: [SPARK-46488][SQL] Skipping trimAll call during timestamp parsing
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 1a7a2f7a889f [SPARK-46488][SQL] Skipping trimAll call during timestamp parsing 1a7a2f7a889f is described below commit 1a7a2f7a889fe0270318b304cd50c148729dd90b Author: Stefan Kandic AuthorDate: Mon Dec 25 19:41:10 2023 +0300 [SPARK-46488][SQL] Skipping trimAll call during timestamp parsing ### What changes were proposed in this pull request? This PR is a follow up to [46173](https://github.com/apache/spark/pull/44110) which added skipping the trimAll calls during date parsing. Now I'm doing the same just for timestamp parsing. ### Why are the changes needed? These changes should drastically improve edge cases where the input string in a cast to date has many whitespaces as prefix/suffix. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? I added the tests to check for cases with prefixes and suffixes of whitespaces and control chars. Also there are benchmark tests in the previous [PR](https://github.com/apache/spark/pull/44110) ### Was this patch authored or co-authored using generative AI tooling? No Closes #44463 from stefankandic/str2timeStamp-skipTrim. 
Authored-by: Stefan Kandic Signed-off-by: Max Gekk --- .../sql/catalyst/util/SparkDateTimeUtils.scala | 66 -- .../sql/catalyst/util/DateTimeUtilsSuite.scala | 43 +- 2 files changed, 79 insertions(+), 30 deletions(-) diff --git a/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala b/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala index 35118b449e2f..ed4d68f553f1 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala @@ -315,18 +315,11 @@ trait SparkDateTimeUtils { var currentSegmentValue = 0 var currentSegmentDigits = 0 val bytes = s.getBytes -var j = 0 -var strEndTrimmed = bytes.length +var j = getTrimmedStart(bytes) +val strEndTrimmed = getTrimmedEnd(j, bytes) -while (j < bytes.length && UTF8String.isWhitespaceOrISOControl(bytes(j))) { - j += 1; -} -if (j == bytes.length) { - return None; -} - -while (strEndTrimmed > j && UTF8String.isWhitespaceOrISOControl(bytes(strEndTrimmed - 1))) { - strEndTrimmed -= 1; +if (j == strEndTrimmed) { + return None } if (bytes(j) == '-' || bytes(j) == '+') { @@ -418,7 +411,7 @@ trait SparkDateTimeUtils { (segment == 7 && digits <= 2) || (segment != 0 && segment != 6 && segment != 7 && digits > 0 && digits <= 2) } -if (s == null || s.trimAll().numBytes() == 0) { +if (s == null) { return (Array.empty, None, false) } var tz: Option[String] = None @@ -426,8 +419,14 @@ trait SparkDateTimeUtils { var i = 0 var currentSegmentValue = 0 var currentSegmentDigits = 0 -val bytes = s.trimAll().getBytes -var j = 0 +val bytes = s.getBytes +var j = getTrimmedStart(bytes) +val strEndTrimmed = getTrimmedEnd(j, bytes) + +if (j == strEndTrimmed) { + return (Array.empty, None, false) +} + var digitsMilli = 0 var justTime = false var yearSign: Option[Int] = None @@ -435,7 +434,7 @@ trait SparkDateTimeUtils { yearSign = if (bytes(j) == '-') Some(-1) else Some(1) 
j += 1 } -while (j < bytes.length) { +while (j < strEndTrimmed) { val b = bytes(j) val parsedValue = b - '0'.toByte if (parsedValue < 0 || parsedValue > 9) { @@ -504,8 +503,8 @@ trait SparkDateTimeUtils { currentSegmentValue = 0 currentSegmentDigits = 0 i += 1 -tz = Some(new String(bytes, j, bytes.length - j)) -j = bytes.length - 1 +tz = Some(new String(bytes, j, strEndTrimmed - j)) +j = strEndTrimmed - 1 } if (i == 6 && b != '.') { i += 1 @@ -619,6 +618,39 @@ trait SparkDateTimeUtils { case NonFatal(_) => None } } + + /** + * Returns the index of the first non-whitespace and non-ISO control character in the byte array. + * + * @param bytes The byte array to be processed. + * @return The start index after trimming. + */ + @inline private def getTrimmedStart(bytes: Array[Byte]) = { +var start = 0 + +while (start < bytes.length && UTF8String.isWhitespaceOrISOControl(bytes(start))) { + start += 1 +} + +start + } + + /** + * Returns the index of the last non-whitespace and non-ISO control character in the byte array
(spark) branch master updated: [SPARK-46499][BUILD] Bump sbt-eclipse 6.2.0
This is an automated email from the ASF dual-hosted git repository. dongjoon pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new a546ff4bbe5e [SPARK-46499][BUILD] Bump sbt-eclipse 6.2.0 a546ff4bbe5e is described below commit a546ff4bbe5e4836c3a4d16be961b01c68dfb351 Author: Cheng Pan AuthorDate: Mon Dec 25 15:09:03 2023 -0800 [SPARK-46499][BUILD] Bump sbt-eclipse 6.2.0 ### What changes were proposed in this pull request? Bump SBT plugin `sbt-eclipse` from 6.0.0 to 6.2.0 ### Why are the changes needed? Which brings the Java 21 support https://github.com/sbt/sbt-eclipse/releases/tag/6.2.0 ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Pass GA ### Was this patch authored or co-authored using generative AI tooling? No Closes #44476 from pan3793/SPARK-46499. Authored-by: Cheng Pan Signed-off-by: Dongjoon Hyun --- project/plugins.sbt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/project/plugins.sbt b/project/plugins.sbt index 4d5d1efaa7a3..628e1e6d8938 100644 --- a/project/plugins.sbt +++ b/project/plugins.sbt @@ -27,7 +27,7 @@ libraryDependencies += "com.google.guava" % "guava" % "31.0.1-jre" addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "2.1.5") -addSbtPlugin("com.github.sbt" % "sbt-eclipse" % "6.0.0") +addSbtPlugin("com.github.sbt" % "sbt-eclipse" % "6.2.0") addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "1.0.0") - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
(spark) branch master updated (a546ff4bbe5e -> 768aba1d0582)
This is an automated email from the ASF dual-hosted git repository. dongjoon pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git from a546ff4bbe5e [SPARK-46499][BUILD] Bump sbt-eclipse 6.2.0 add 768aba1d0582 [SPARK-46475][BUILD] Upgrade RoaringBitmap to 1.0.1 No new revisions were added by this update. Summary of changes: core/benchmarks/MapStatusesConvertBenchmark-jdk21-results.txt | 10 +- core/benchmarks/MapStatusesConvertBenchmark-results.txt | 10 +- dev/deps/spark-deps-hadoop-3-hive-2.3 | 2 +- pom.xml | 2 +- 4 files changed, 12 insertions(+), 12 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
(spark) branch master updated (768aba1d0582 -> e314d3cd82d6)
This is an automated email from the ASF dual-hosted git repository. dongjoon pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git from 768aba1d0582 [SPARK-46475][BUILD] Upgrade RoaringBitmap to 1.0.1 add e314d3cd82d6 [SPARK-46497][SQL][TESTS] Re-enable the test cases that were ignored in SPARK-45309 No new revisions were added by this update. Summary of changes: .../apache/spark/sql/hive/HiveSparkSubmitSuite.scala | 18 +- 1 file changed, 9 insertions(+), 9 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
(spark) branch master updated: [SPARK-46501][INFRA] List the python packages with the correct versions
This is an automated email from the ASF dual-hosted git repository. dongjoon pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new bb85bbcf16f7 [SPARK-46501][INFRA] List the python packages with the correct versions bb85bbcf16f7 is described below commit bb85bbcf16f7c837a47d0cf430bc94899709c254 Author: Ruifeng Zheng AuthorDate: Mon Dec 25 15:28:44 2023 -0800 [SPARK-46501][INFRA] List the python packages with the correct versions ### What changes were proposed in this pull request? List the python packages with the correct versions ### Why are the changes needed? the version here should be in `PYTHON_TO_TEST ` ### Does this PR introduce _any_ user-facing change? no, infra-only ### How was this patch tested? ci ### Was this patch authored or co-authored using generative AI tooling? no Closes #44479 from zhengruifeng/infra_pip_list. Authored-by: Ruifeng Zheng Signed-off-by: Dongjoon Hyun --- .github/workflows/build_and_test.yml | 10 +++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index a337eaa7864d..a4aca07d69f7 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -429,10 +429,14 @@ jobs: with: distribution: zulu java-version: ${{ matrix.java }} -- name: List Python packages (Python 3.9, PyPy3) +- name: List Python packages (${{ env.PYTHON_TO_TEST }}) + shell: 'script -q -e -c "bash {0}"' run: | -python3.9 -m pip list -pypy3 -m pip list +for py in $(echo $PYTHON_TO_TEST | tr "," "\n") +do + echo $py + $py -m pip list +done - name: Install Conda for pip packaging test if: contains(matrix.modules, 'pyspark-errors') run: | - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
(spark) branch master updated: [SPARK-40876][SQL][TESTS][FOLLOWUP] Fix failed test in `ParquetTypeWideningSuite` when `SPARK_ANSI_SQL_MODE` is set to true
This is an automated email from the ASF dual-hosted git repository. dongjoon pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new c1888cdf5361 [SPARK-40876][SQL][TESTS][FOLLOWUP] Fix failed test in `ParquetTypeWideningSuite` when `SPARK_ANSI_SQL_MODE` is set to true c1888cdf5361 is described below commit c1888cdf53610909af996c7f41ee0cd7ee0691db Author: yangjie01 AuthorDate: Mon Dec 25 15:42:13 2023 -0800 [SPARK-40876][SQL][TESTS][FOLLOWUP] Fix failed test in `ParquetTypeWideningSuite` when `SPARK_ANSI_SQL_MODE` is set to true ### What changes were proposed in this pull request? This pr aims to change the test inputs in `ParquetTypeWideningSuite` to valid int to fix failed test in `ParquetTypeWideningSuite` when `SPARK_ANSI_SQL_MODE` is set to true ### Why are the changes needed? Fix the daily test failure when `SPARK_ANSI_SQL_MODE` is set to true. - https://github.com/apache/spark/actions/runs/7318074558/job/19934321639 - https://github.com/apache/spark/actions/runs/7305312703/job/19908735746 - https://github.com/apache/spark/actions/runs/7311683968/job/19921532402 ``` [info] - unsupported parquet conversion IntegerType -> TimestampType *** FAILED *** (68 milliseconds) [info] org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 261.0 failed 1 times, most recent failure: Lost task 1.0 in stage 261.0 (TID 523) (localhost executor driver): org.apache.spark.SparkNumberFormatException: [CAST_INVALID_INPUT] The value '1.23' of the type "STRING" cannot be cast to "INT" because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead. I [...] 
[info] == DataFrame == [info] "cast" was called from [info] org.apache.spark.sql.execution.datasources.parquet.ParquetTypeWideningSuite.writeParquetFiles(ParquetTypeWideningSuite.scala:113) [info] [info] at org.apache.spark.sql.errors.QueryExecutionErrors$.invalidInputInCastToNumberError(QueryExecutionErrors.scala:145) [info] at org.apache.spark.sql.catalyst.util.UTF8StringUtils$.withException(UTF8StringUtils.scala:51) [info] at org.apache.spark.sql.catalyst.util.UTF8StringUtils$.toIntExact(UTF8StringUtils.scala:34) [info] at org.apache.spark.sql.catalyst.util.UTF8StringUtils.toIntExact(UTF8StringUtils.scala) [info] at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source) [info] at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) [info] at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43) [info] at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeTask(FileFormatWriter.scala:388) [info] at org.apache.spark.sql.execution.datasources.WriteFilesExec.$anonfun$doExecuteWrite$1(WriteFiles.scala:101) [info] at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:891) [info] at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:891) [info] at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) [info] at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365) [info] at org.apache.spark.rdd.RDD.iterator(RDD.scala:329) [info] at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93) [info] at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:171) [info] at org.apache.spark.scheduler.Task.run(Task.scala:141) [info] at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:628) [info] at 
org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64) [info] at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61) [info] at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:96) [info] at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:631) [info] at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136) [info] at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635) [info] at java.base/java.lang.Thread.run(Thread.java:840) ``` ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? - Pass GitHub Actions - Manual check ``` SPARK_ANSI_SQL_MODE=true build/sbt "sql/testOnly org.apache.spark.sql.execution.datasources.parquet.ParquetTypeWideningSuite" ``` **Before** ``` [info] Run completed in 27 seconds, 432 milliseconds. [info] Total numb
(spark) branch master updated: [SPARK-46503][PS][TESTS] Move `test_default_index` to `pyspark.pandas.tests.indexes.*`
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 6e3b9f74229e [SPARK-46503][PS][TESTS] Move `test_default_index` to `pyspark.pandas.tests.indexes.*` 6e3b9f74229e is described below commit 6e3b9f74229ec953371cdfa3e40aee27e7bf1115 Author: Ruifeng Zheng AuthorDate: Tue Dec 26 09:29:42 2023 +0900 [SPARK-46503][PS][TESTS] Move `test_default_index` to `pyspark.pandas.tests.indexes.*` ### What changes were proposed in this pull request? Move `test_default_index` to `pyspark.pandas.tests.indexes.*` ### Why are the changes needed? test code clean up ### Does this PR introduce _any_ user-facing change? no ### How was this patch tested? ci ### Was this patch authored or co-authored using generative AI tooling? no Closes #44482 from zhengruifeng/ps_test_idx_default. Authored-by: Ruifeng Zheng Signed-off-by: Hyukjin Kwon --- dev/sparktestsupport/modules.py | 4 ++-- .../test_parity_default.py} | 8 +--- .../tests/{test_default_index.py => indexes/test_default.py} | 7 +-- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index 6f41b6f0eddf..102d54875504 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -723,7 +723,7 @@ pyspark_pandas = Module( "pyspark.pandas.tests.plot.test_series_plot_plotly", "pyspark.pandas.tests.test_categorical", "pyspark.pandas.tests.test_config", -"pyspark.pandas.tests.test_default_index", +"pyspark.pandas.tests.indexes.test_default", "pyspark.pandas.tests.window.test_expanding", "pyspark.pandas.tests.window.test_expanding_adv", "pyspark.pandas.tests.window.test_expanding_error", @@ -1049,7 +1049,7 @@ pyspark_pandas_connect_part0 = Module( "pyspark.pandas.tests.connect.plot.test_parity_series_plot_plotly", 
"pyspark.pandas.tests.connect.test_parity_categorical", "pyspark.pandas.tests.connect.test_parity_config", -"pyspark.pandas.tests.connect.test_parity_default_index", +"pyspark.pandas.tests.connect.indexes.test_parity_default", "pyspark.pandas.tests.connect.test_parity_extension", "pyspark.pandas.tests.connect.test_parity_frame_spark", "pyspark.pandas.tests.connect.test_parity_generic_functions", diff --git a/python/pyspark/pandas/tests/connect/test_parity_default_index.py b/python/pyspark/pandas/tests/connect/indexes/test_parity_default.py similarity index 85% rename from python/pyspark/pandas/tests/connect/test_parity_default_index.py rename to python/pyspark/pandas/tests/connect/indexes/test_parity_default.py index a249fd7ef915..d6f0cadbf0cd 100644 --- a/python/pyspark/pandas/tests/connect/test_parity_default_index.py +++ b/python/pyspark/pandas/tests/connect/indexes/test_parity_default.py @@ -16,13 +16,15 @@ # import unittest -from pyspark.pandas.tests.test_default_index import DefaultIndexTestsMixin +from pyspark.pandas.tests.indexes.test_default import DefaultIndexTestsMixin from pyspark.testing.connectutils import ReusedConnectTestCase from pyspark.testing.pandasutils import PandasOnSparkTestUtils class DefaultIndexParityTests( -DefaultIndexTestsMixin, PandasOnSparkTestUtils, ReusedConnectTestCase +DefaultIndexTestsMixin, +PandasOnSparkTestUtils, +ReusedConnectTestCase, ): @unittest.skip("Test depends on SparkContext which is not supported from Spark Connect.") def test_index_distributed_sequence_cleanup(self): @@ -30,7 +32,7 @@ class DefaultIndexParityTests( if __name__ == "__main__": -from pyspark.pandas.tests.connect.test_parity_default_index import * # noqa: F401 +from pyspark.pandas.tests.connect.indexes.test_parity_default import * # noqa: F401 try: import xmlrunner # type: ignore[import] diff --git a/python/pyspark/pandas/tests/test_default_index.py b/python/pyspark/pandas/tests/indexes/test_default.py similarity index 96% rename from 
python/pyspark/pandas/tests/test_default_index.py rename to python/pyspark/pandas/tests/indexes/test_default.py index 29e489e81f3c..3d19eb407b42 100644 --- a/python/pyspark/pandas/tests/test_default_index.py +++ b/python/pyspark/pandas/tests/indexes/test_default.py @@ -91,13 +91,16 @@ class DefaultIndexTestsMixin: ) -class DefaultIndexTests(DefaultIndexTestsMixin, PandasOnSparkTestCase): +class DefaultIndexTests( +DefaultIndexTestsMixin, +PandasOnSparkTestCase, +): pass if __name__ == "__main__": import unittest -from pyspark.pandas.tests.test_default_index import * # noqa: F401 +from pys
(spark) branch master updated: [MINOR][DOCS] Python docs also require Pandoc
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 3ee2db8f0bee [MINOR][DOCS] Python docs also require Pandoc 3ee2db8f0bee is described below commit 3ee2db8f0bee91e0947c9e0935dc898cb5884737 Author: Nicholas Chammas AuthorDate: Tue Dec 26 10:33:07 2023 +0900 [MINOR][DOCS] Python docs also require Pandoc ### What changes were proposed in this pull request? Clarify in the README for building docs that the Python API docs also require Pandoc, not just R. ### Why are the changes needed? Attempting to build the Python API docs without Pandoc installed will yield the following error: ``` reading sources... [ 0%] getting_started/quickstart_connect Notebook error: PandocMissing in getting_started/quickstart_connect.ipynb: Pandoc wasn't found. Please check that pandoc is installed: https://pandoc.org/installing.html make: *** [html] Error 2 Jekyll 4.3.2 Please append `--trace` to the `build` command for any additional information or backtrace. .../spark/docs/_plugins/copy_api_dirs.rb:130:in `': Python doc generation failed (RuntimeError) ``` ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Via trying to build the Python docs without Pandoc. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #44484 from nchammas/python-docs-pandoc. Authored-by: Nicholas Chammas Signed-off-by: Hyukjin Kwon --- docs/README.md | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/README.md b/docs/README.md index 99ccf69dbaee..95f4b9ac9e08 100644 --- a/docs/README.md +++ b/docs/README.md @@ -28,8 +28,7 @@ whichever version of Spark you currently have checked out of revision control. ## Prerequisites -The Spark documentation build uses a number of tools to build HTML docs and API docs in Scala, Java, -Python, R and SQL. 
+The Spark documentation build uses a number of tools to build HTML docs and API docs in Scala, Java, Python, R, and SQL. You need to have [Ruby](https://www.ruby-lang.org/en/documentation/installation/) and [Python](https://docs.python.org/2/using/unix.html#getting-and-installing-the-latest-version-of-python) @@ -48,6 +47,8 @@ $ bundle install Note: If you are on a system with both Ruby 1.9 and Ruby 2.0 you may need to replace gem with gem2.0. +To generate the Python or R docs, you'll need to [install Pandoc](https://pandoc.org/installing.html). + ### SQL and Python API Documentation (Optional) To generate SQL and Python API docs, you'll need to install these libraries: @@ -59,8 +60,7 @@ $ pip install --upgrade -r dev/requirements.txt ### R API Documentation (Optional) -If you'd like to generate R API documentation, you'll need to [install Pandoc](https://pandoc.org/installing.html) -and install these libraries: +If you'd like to generate R API documentation, install these libraries: ```sh $ sudo Rscript -e 'install.packages(c("knitr", "devtools", "testthat", "rmarkdown"), repos="https://cloud.r-project.org/";)' - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
(spark) branch master updated (3ee2db8f0bee -> 439ec6b954b4)
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git from 3ee2db8f0bee [MINOR][DOCS] Python docs also require Pandoc add 439ec6b954b4 [SPARK-45600][SQL][PYTHON][FOLLOW-UP] Make Python data source registration session level No new revisions were added by this update. Summary of changes: .../scala/org/apache/spark/sql/execution/datasources/DataSource.scala | 2 -- 1 file changed, 2 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
(spark) branch master updated: [SPARK-46505][CONNECT] Make bytes threshold configurable in `ProtoUtils.abbreviate`
This is an automated email from the ASF dual-hosted git repository. ruifengz pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 4911a5bad4ac [SPARK-46505][CONNECT] Make bytes threshold configurable in `ProtoUtils.abbreviate` 4911a5bad4ac is described below commit 4911a5bad4ac4665772bafbc45ea18cc03e64f3c Author: Ruifeng Zheng AuthorDate: Tue Dec 26 10:52:54 2023 +0800 [SPARK-46505][CONNECT] Make bytes threshold configurable in `ProtoUtils.abbreviate` ### What changes were proposed in this pull request? Make bytes threshold configurable in `ProtoUtils.abbreviate` ### Why are the changes needed? The bytes threshold should also be configurable, like the string type. ### Does this PR introduce _any_ user-facing change? no, this function is only used internally ### How was this patch tested? added ut ### Was this patch authored or co-authored using generative AI tooling? no Closes #44486 from zhengruifeng/connect_ab_config. 
Authored-by: Ruifeng Zheng Signed-off-by: Ruifeng Zheng --- .../spark/sql/connect/common/ProtoUtils.scala | 23 - .../sql/connect/messages/AbbreviateSuite.scala | 30 ++ 2 files changed, 46 insertions(+), 7 deletions(-) diff --git a/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/common/ProtoUtils.scala b/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/common/ProtoUtils.scala index 18739ed54a29..44de2350b9fd 100644 --- a/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/common/ProtoUtils.scala +++ b/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/common/ProtoUtils.scala @@ -24,18 +24,25 @@ import com.google.protobuf.Descriptors.FieldDescriptor private[connect] object ProtoUtils { private val format = java.text.NumberFormat.getInstance() + private val BYTES = "BYTES" + private val STRING = "STRING" private val MAX_BYTES_SIZE = 8 private val MAX_STRING_SIZE = 1024 def abbreviate(message: Message, maxStringSize: Int = MAX_STRING_SIZE): Message = { +abbreviate(message, Map(STRING -> maxStringSize)) + } + + def abbreviate(message: Message, thresholds: Map[String, Int]): Message = { val builder = message.toBuilder message.getAllFields.asScala.iterator.foreach { case (field: FieldDescriptor, string: String) if field.getJavaType == FieldDescriptor.JavaType.STRING && string != null => val size = string.length -if (size > maxStringSize) { - builder.setField(field, createString(string.take(maxStringSize), size)) +val threshold = thresholds.getOrElse(STRING, MAX_STRING_SIZE) +if (size > threshold) { + builder.setField(field, createString(string.take(threshold), size)) } else { builder.setField(field, string) } @@ -43,11 +50,12 @@ private[connect] object ProtoUtils { case (field: FieldDescriptor, byteString: ByteString) if field.getJavaType == FieldDescriptor.JavaType.BYTE_STRING && byteString != null => val size = byteString.size -if (size > MAX_BYTES_SIZE) { +val threshold = 
thresholds.getOrElse(BYTES, MAX_BYTES_SIZE) +if (size > threshold) { builder.setField( field, byteString - .substring(0, MAX_BYTES_SIZE) + .substring(0, threshold) .concat(createTruncatedByteString(size))) } else { builder.setField(field, byteString) @@ -56,11 +64,12 @@ private[connect] object ProtoUtils { case (field: FieldDescriptor, byteArray: Array[Byte]) if field.getJavaType == FieldDescriptor.JavaType.BYTE_STRING && byteArray != null => val size = byteArray.length -if (size > MAX_BYTES_SIZE) { +val threshold = thresholds.getOrElse(BYTES, MAX_BYTES_SIZE) +if (size > threshold) { builder.setField( field, ByteString - .copyFrom(byteArray, 0, MAX_BYTES_SIZE) + .copyFrom(byteArray, 0, threshold) .concat(createTruncatedByteString(size))) } else { builder.setField(field, byteArray) @@ -69,7 +78,7 @@ private[connect] object ProtoUtils { // TODO(SPARK-43117): should also support 1, repeated msg; 2, map case (field: FieldDescriptor, msg: Message) if field.getJavaType == FieldDescriptor.JavaType.MESSAGE && msg != null => -builder.setField(field, abbreviate(msg, maxStringSize)) +builder.setField(field, abbreviate(msg, thresholds)) case (field: FieldDescriptor, value: Any) => builder.setField(field, value) } diff --git a/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/messages/AbbreviateS
(spark) branch master updated (4911a5bad4ac -> 41383e974d86)
This is an automated email from the ASF dual-hosted git repository. ruifengz pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git from 4911a5bad4ac [SPARK-46505][CONNECT] Make bytes threshold configurable in `ProtoUtils.abbreviate` add 41383e974d86 [SPARK-46504][PS][TESTS] Factor slow tests out of `IndexesTests` No new revisions were added by this update. Summary of changes: dev/sparktestsupport/modules.py| 12 + .../tests/connect/indexes/test_parity_asof.py | 41 +++ .../tests/connect/indexes/test_parity_astype.py| 41 +++ .../tests/connect/indexes/test_parity_delete.py| 41 +++ .../tests/connect/indexes/test_parity_diff.py | 41 +++ .../tests/connect/indexes/test_parity_insert.py| 41 +++ .../tests/connect/indexes/test_parity_map.py | 41 +++ python/pyspark/pandas/tests/indexes/test_asof.py | 89 + python/pyspark/pandas/tests/indexes/test_astype.py | 104 ++ python/pyspark/pandas/tests/indexes/test_base.py | 370 + python/pyspark/pandas/tests/indexes/test_delete.py | 87 + python/pyspark/pandas/tests/indexes/test_diff.py | 126 +++ python/pyspark/pandas/tests/indexes/test_insert.py | 116 +++ python/pyspark/pandas/tests/indexes/test_map.py| 118 +++ 14 files changed, 903 insertions(+), 365 deletions(-) create mode 100644 python/pyspark/pandas/tests/connect/indexes/test_parity_asof.py create mode 100644 python/pyspark/pandas/tests/connect/indexes/test_parity_astype.py create mode 100644 python/pyspark/pandas/tests/connect/indexes/test_parity_delete.py create mode 100644 python/pyspark/pandas/tests/connect/indexes/test_parity_diff.py create mode 100644 python/pyspark/pandas/tests/connect/indexes/test_parity_insert.py create mode 100644 python/pyspark/pandas/tests/connect/indexes/test_parity_map.py create mode 100644 python/pyspark/pandas/tests/indexes/test_asof.py create mode 100644 python/pyspark/pandas/tests/indexes/test_astype.py create mode 100644 python/pyspark/pandas/tests/indexes/test_delete.py create mode 100644 
python/pyspark/pandas/tests/indexes/test_diff.py create mode 100644 python/pyspark/pandas/tests/indexes/test_insert.py create mode 100644 python/pyspark/pandas/tests/indexes/test_map.py - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
(spark) branch master updated: [SPARK-45597][PYTHON][SQL][FOLLOW-UP] Minor deduplicate datasource checking logic
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new ca147d2f30ee [SPARK-45597][PYTHON][SQL][FOLLOW-UP] Minor deduplicate datasource checking logic ca147d2f30ee is described below commit ca147d2f30ee1fe4d6fdaa57c6698d151f83262b Author: Hyukjin Kwon AuthorDate: Tue Dec 26 14:52:42 2023 +0900 [SPARK-45597][PYTHON][SQL][FOLLOW-UP] Minor deduplicate datasource checking logic ### What changes were proposed in this pull request? This PR proposes to deduplicate datasource checking logic. ### Why are the changes needed? For better maintenance and readability. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Existing test cases should cover them. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #44489 from HyukjinKwon/SPARK-45597-followup2. 
Authored-by: Hyukjin Kwon Signed-off-by: Hyukjin Kwon --- .../spark/sql/execution/datasources/DataSource.scala | 16 ++-- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala index b400e3799942..decc20c52531 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala @@ -639,6 +639,8 @@ object DataSource extends Logging { val provider2 = s"$provider1.DefaultSource" val loader = Utils.getContextOrSparkClassLoader val serviceLoader = ServiceLoader.load(classOf[DataSourceRegister], loader) +lazy val isUserDefinedDataSource = SparkSession.getActiveSession.exists( + _.sessionState.dataSourceManager.dataSourceExists(provider)) try { serviceLoader.asScala.filter(_.shortName().equalsIgnoreCase(provider1)).toList match { @@ -650,8 +652,6 @@ object DataSource extends Logging { // Found the data source using fully qualified path dataSource case Failure(error) => -val isUserDefinedDataSource = SparkSession.getActiveSession.exists( - _.sessionState.dataSourceManager.dataSourceExists(provider)) if (provider1.startsWith("org.apache.spark.sql.hive.orc")) { throw QueryCompilationErrors.orcNotUsedWithHiveEnabledError() } else if (provider1.toLowerCase(Locale.ROOT) == "avro" || @@ -676,15 +676,11 @@ object DataSource extends Logging { throw e } } +case _ :: Nil if isUserDefinedDataSource => + // There was DSv1 or DSv2 loaded, but the same name source was found + // in user defined data source. 
+ throw QueryCompilationErrors.foundMultipleDataSources(provider) case head :: Nil => - // there is exactly one registered alias - val isUserDefinedDataSource = SparkSession.getActiveSession.exists( -_.sessionState.dataSourceManager.dataSourceExists(provider)) - // The source can be successfully loaded as either a V1 or a V2 data source. - // Check if it is also a user-defined data source. - if (isUserDefinedDataSource) { -throw QueryCompilationErrors.foundMultipleDataSources(provider) - } head.getClass case sources => // There are multiple registered aliases for the input. If there is single datasource - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
(spark) branch master updated: [MINOR][DOCS] Fix rst link in Python API docs for .sql()
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 05c48e7450a9 [MINOR][DOCS] Fix rst link in Python API docs for .sql() 05c48e7450a9 is described below commit 05c48e7450a9d7b0d41035627d2036699ccb8f21 Author: Nicholas Chammas AuthorDate: Tue Dec 26 14:53:14 2023 +0900 [MINOR][DOCS] Fix rst link in Python API docs for .sql() ### What changes were proposed in this pull request? This PR fixes the rst markup for a link in the documentation for `pyspark.sql.SparkSession.sql` and `pyspark.pandas.sql`. ### Why are the changes needed? The current markup is incorrect. Technically, though the markup in this PR is correct, the link target is incorrect. We should be linking to a page relative to the site root, rather than hardcoding a link to `/latest/`. However, I could not figure out how to do that in rst, and building the API docs takes a really long time, and I could not make it build incrementally. ### Does this PR introduce _any_ user-facing change? Yes, the markup goes from looking like this: https://github.com/apache/spark/assets/1039369/077566a3-79df-4aa2-a0f7-d819f608f673 To looking like this: https://github.com/apache/spark/assets/1039369/b1453761-3f9c-435e-89e1-cfd3748cce9c ### How was this patch tested? I built the docs as follows: ``` SKIP_SCALADOC=1 SKIP_RDOC=1 SKIP_SQLDOC=1 bundle exec jekyll serve ``` And reviewed the output in my browser. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #44488 from nchammas/data-types-rst-link. 
Authored-by: Nicholas Chammas Signed-off-by: Hyukjin Kwon --- python/pyspark/pandas/sql_formatter.py | 6 +++--- python/pyspark/sql/session.py | 7 --- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/python/pyspark/pandas/sql_formatter.py b/python/pyspark/pandas/sql_formatter.py index 9800037016c5..7e8263f552f0 100644 --- a/python/pyspark/pandas/sql_formatter.py +++ b/python/pyspark/pandas/sql_formatter.py @@ -109,13 +109,13 @@ def sql( args : dict or list A dictionary of parameter names to Python objects or a list of Python objects that can be converted to SQL literal expressions. See -https://spark.apache.org/docs/latest/sql-ref-datatypes.html";> -Supported Data Types for supported value types in Python. +`Supported Data Types`_ for supported value types in Python. For example, dictionary keys: "rank", "name", "birthdate"; dictionary values: 1, "Steven", datetime.date(2023, 4, 2). A value can be also a `Column` of a literal or collection constructor functions such as `map()`, `array()`, `struct()`, in that case it is taken as is. +.. _Supported Data Types: https://spark.apache.org/docs/latest/sql-ref-datatypes.html .. versionadded:: 3.4.0 @@ -176,7 +176,7 @@ def sql( 1 2 2 3 -And substitude named parameters with the `:` prefix by SQL literals. +And substitute named parameters with the `:` prefix by SQL literals. >>> ps.sql("SELECT * FROM range(10) WHERE id > :bound1", args={"bound1":7}) id diff --git a/python/pyspark/sql/session.py b/python/pyspark/sql/session.py index 7615491a1778..10b56d006dcd 100644 --- a/python/pyspark/sql/session.py +++ b/python/pyspark/sql/session.py @@ -1548,13 +1548,14 @@ class SparkSession(SparkConversionMixin): args : dict or list A dictionary of parameter names to Python objects or a list of Python objects that can be converted to SQL literal expressions. See -https://spark.apache.org/docs/latest/sql-ref-datatypes.html";> -Supported Data Types for supported value types in Python. 
+`Supported Data Types`_ for supported value types in Python. For example, dictionary keys: "rank", "name", "birthdate"; dictionary or list values: 1, "Steven", datetime.date(2023, 4, 2). A value can be also a `Column` of a literal or collection constructor functions such as `map()`, `array()`, `struct()`, in that case it is taken as is. +.. _Supported Data Types: https://spark.apache.org/docs/latest/sql-ref-datatypes.html + .. versionadded:: 3.4.0 kwargs : dict @@ -1631,7 +1632,7 @@ class SparkSession(SparkConversionMixin): | 3| 6| +---+---+ -And substitude named parameters with the `:` prefix by SQL literals. +And substitute named parameters with the `:` prefix by SQL literals. >>> from pyspark.sql.functions import create_map >>> spark.sql( -
(spark) branch master updated: [SPARK-46444][SQL] V2SessionCatalog#createTable should not load the table
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new d8fb91e61352 [SPARK-46444][SQL] V2SessionCatalog#createTable should not load the table d8fb91e61352 is described below commit d8fb91e61352e57e733e7d7e4978c8ce555454b1 Author: Wenchen Fan AuthorDate: Tue Dec 26 15:17:30 2023 +0800 [SPARK-46444][SQL] V2SessionCatalog#createTable should not load the table ### What changes were proposed in this pull request? It's a perf regression in CREATE TABLE if we switch to the v2 command framework, as `V2SessionCatalog#createTable` does an extra table lookup, which does not happen in v1. This PR fixes it by allowing `TableCatalog#createTable` to return null, and Spark will call `loadTable` to get the new table metadata in the case of CTAS. This PR also fixed `alterTable` in the same way. ### Why are the changes needed? fix perf regression in v2. The perf of a single command may not matter, but in a cluster with many Spark applications, it's important to reduce the RPCs to the metastore. ### Does this PR introduce _any_ user-facing change? no ### How was this patch tested? existing tests ### Was this patch authored or co-authored using generative AI tooling? No Closes #44377 from cloud-fan/create-table. 
Authored-by: Wenchen Fan Signed-off-by: Wenchen Fan --- .../src/main/resources/error/error-classes.json| 4 +- docs/sql-error-conditions.md | 4 +- .../spark/sql/connector/catalog/TableCatalog.java | 8 +- .../spark/sql/errors/QueryCompilationErrors.scala | 6 +- .../datasources/v2/V2SessionCatalog.scala | 37 ++--- .../datasources/v2/WriteToDataSourceV2Exec.scala | 8 +- .../spark/sql/connector/DataSourceV2Suite.scala| 12 +- .../datasources/v2/V2SessionCatalogSuite.scala | 181 + 8 files changed, 157 insertions(+), 103 deletions(-) diff --git a/common/utils/src/main/resources/error/error-classes.json b/common/utils/src/main/resources/error/error-classes.json index 8970045d4ab3..700b1ed07513 100644 --- a/common/utils/src/main/resources/error/error-classes.json +++ b/common/utils/src/main/resources/error/error-classes.json @@ -895,7 +895,9 @@ }, "DATA_SOURCE_TABLE_SCHEMA_MISMATCH" : { "message" : [ - "The schema of the data source table does not match the actual schema . If you are using the DataFrameReader.schema API or creating a table, avoid specifying the schema." + "The schema of the data source table does not match the expected schema. If you are using the DataFrameReader.schema API or creating a table, avoid specifying the schema.", + "Data Source schema: ", + "Expected schema: " ], "sqlState" : "42K03" }, diff --git a/docs/sql-error-conditions.md b/docs/sql-error-conditions.md index 0722cae5815e..a8d2b6c894bc 100644 --- a/docs/sql-error-conditions.md +++ b/docs/sql-error-conditions.md @@ -496,7 +496,9 @@ Failed to find the data source: ``. Please find packages at `https://s [SQLSTATE: 42K03](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) -The schema of the data source table `` does not match the actual schema ``. If you are using the DataFrameReader.schema API or creating a table, avoid specifying the schema. +The schema of the data source table does not match the expected schema. 
If you are using the DataFrameReader.schema API or creating a table, avoid specifying the schema. +Data Source schema: `` +Expected schema: `` ### DATETIME_OVERFLOW diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/TableCatalog.java b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/TableCatalog.java index 6642adc33548..74700789dde0 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/TableCatalog.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/TableCatalog.java @@ -187,7 +187,9 @@ public interface TableCatalog extends CatalogPlugin { * @param columns the columns of the new table. * @param partitions transforms to use for partitioning data in the table * @param properties a string map of table properties - * @return metadata for the new table + * @return metadata for the new table. This can be null if getting the metadata for the new table + * is expensive. Spark will call {@link #loadTable(Identifier)} if needed (e.g. CTAS). + * * @throws TableAlreadyExistsException If a table or view already exists for the identifier * @throws UnsupportedOperationException If a requested partition transform is not supported * @throws NoSuchNamespaceException If the identifier namespace does not exist (o
(spark) branch master updated: [SPARK-46507][PS][TESTS] Split `IndexesSlowTests` into multiple tests
This is an automated email from the ASF dual-hosted git repository. ruifengz pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 5ba6e106b858 [SPARK-46507][PS][TESTS] Split `IndexesSlowTests` into multiple tests 5ba6e106b858 is described below commit 5ba6e106b858d2a1ac05bcc5c5ce11d4367cb9a7 Author: Ruifeng Zheng AuthorDate: Tue Dec 26 15:39:43 2023 +0800 [SPARK-46507][PS][TESTS] Split `IndexesSlowTests` into multiple tests ### What changes were proposed in this pull request? Split `IndexesSlowTests` into multiple tests ### Why are the changes needed? for testing parallelism ### Does this PR introduce _any_ user-facing change? no, test-only ### How was this patch tested? ci ### Was this patch authored or co-authored using generative AI tooling? no Closes #44491 from zhengruifeng/ps_test_break_idx_base_slow. Authored-by: Ruifeng Zheng Signed-off-by: Ruifeng Zheng --- dev/sparktestsupport/modules.py| 10 +- ...t_parity_base_slow.py => test_parity_append.py} | 17 +- ...ty_base_slow.py => test_parity_intersection.py} | 17 +- ...arity_base_slow.py => test_parity_monotonic.py} | 17 +- ...st_parity_base_slow.py => test_parity_union.py} | 17 +- python/pyspark/pandas/tests/indexes/test_append.py | 129 + .../pyspark/pandas/tests/indexes/test_base_slow.py | 549 - .../pandas/tests/indexes/test_intersection.py | 167 +++ .../pyspark/pandas/tests/indexes/test_monotonic.py | 184 +++ python/pyspark/pandas/tests/indexes/test_union.py | 208 10 files changed, 728 insertions(+), 587 deletions(-) diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index 19e17efe9064..66ae11886cd4 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -801,7 +801,10 @@ pyspark_pandas_slow = Module( "pyspark.pandas.tests.indexes.test_diff", "pyspark.pandas.tests.indexes.test_insert", "pyspark.pandas.tests.indexes.test_map", 
-"pyspark.pandas.tests.indexes.test_base_slow", +"pyspark.pandas.tests.indexes.test_append", +"pyspark.pandas.tests.indexes.test_intersection", +"pyspark.pandas.tests.indexes.test_monotonic", +"pyspark.pandas.tests.indexes.test_union", "pyspark.pandas.tests.indexes.test_datetime", "pyspark.pandas.tests.indexes.test_datetime_at", "pyspark.pandas.tests.indexes.test_datetime_between", @@ -1183,7 +1186,10 @@ pyspark_pandas_connect_part2 = Module( "pyspark.pandas.tests.connect.computation.test_parity_pivot_table_multi_idx", "pyspark.pandas.tests.connect.computation.test_parity_pivot_table_multi_idx_adv", "pyspark.pandas.tests.connect.computation.test_parity_stats", -"pyspark.pandas.tests.connect.indexes.test_parity_base_slow", +"pyspark.pandas.tests.connect.indexes.test_parity_append", +"pyspark.pandas.tests.connect.indexes.test_parity_intersection", +"pyspark.pandas.tests.connect.indexes.test_parity_monotonic", +"pyspark.pandas.tests.connect.indexes.test_parity_union", "pyspark.pandas.tests.connect.frame.test_parity_interpolate", "pyspark.pandas.tests.connect.frame.test_parity_interpolate_error", "pyspark.pandas.tests.connect.series.test_parity_interpolate", diff --git a/python/pyspark/pandas/tests/connect/indexes/test_parity_base_slow.py b/python/pyspark/pandas/tests/connect/indexes/test_parity_append.py similarity index 71% copy from python/pyspark/pandas/tests/connect/indexes/test_parity_base_slow.py copy to python/pyspark/pandas/tests/connect/indexes/test_parity_append.py index 1b4d187df894..ec81d64c8456 100644 --- a/python/pyspark/pandas/tests/connect/indexes/test_parity_base_slow.py +++ b/python/pyspark/pandas/tests/connect/indexes/test_parity_append.py @@ -16,22 +16,21 @@ # import unittest -from pyspark import pandas as ps -from pyspark.pandas.tests.indexes.test_base_slow import IndexesSlowTestsMixin +from pyspark.pandas.tests.indexes.test_append import AppendMixin from pyspark.testing.connectutils import ReusedConnectTestCase -from pyspark.testing.pandasutils 
import PandasOnSparkTestUtils, TestUtils +from pyspark.testing.pandasutils import PandasOnSparkTestUtils -class IndexesSlowParityTests( -IndexesSlowTestsMixin, PandasOnSparkTestUtils, TestUtils, ReusedConnectTestCase +class AppendParityTests( +AppendMixin, +PandasOnSparkTestUtils, +ReusedConnectTestCase, ): -@property -def psdf(self): -return ps.from_pandas(self.pdf) +pass if __name__ == "__main__": -from pyspark.pandas.tests.connect.indexes.test_parity_base_slow import * # noqa: F401 +from pyspark.pandas.tests.connect.indexes.test_parity_append import * # noqa: F