[GitHub] spark pull request #19872: [SPARK-22274][PYTHON][SQL] User-defined aggregati...
Github user HyukjinKwon commented on a diff in the pull request: https://github.com/apache/spark/pull/19872#discussion_r161499430

--- Diff: python/pyspark/sql/tests.py ---
@@ -4279,6 +4272,386 @@ def test_unsupported_types(self):
             df.groupby('id').apply(f).collect()
 
+@unittest.skipIf(not _have_pandas or not _have_arrow, "Pandas or Arrow not installed")
+class GroupbyAggTests(ReusedSQLTestCase):
+
+    @property
+    def data(self):
+        from pyspark.sql.functions import array, explode, col, lit
+        return self.spark.range(10).toDF('id') \
+            .withColumn("vs", array([lit(i * 1.0) + col('id') for i in range(20, 30)])) \
+            .withColumn("v", explode(col('vs'))) \
+            .drop('vs') \
+            .withColumn('w', lit(1.0))
+
+    @property
+    def plus_one(self):
+        from pyspark.sql.functions import udf
+
+        @udf('double')
+        def plus_one(v):
+            assert isinstance(v, (int, float))
+            return v + 1
+        return plus_one
+
+    @property
+    def plus_two(self):
+        import pandas as pd
+        from pyspark.sql.functions import pandas_udf, PandasUDFType
+
+        @pandas_udf('double', PandasUDFType.SCALAR)
+        def plus_two(v):
+            assert isinstance(v, pd.Series)
+            return v + 2
+        return plus_two
+
+    @property
+    def mean_udf(self):
+        from pyspark.sql.functions import pandas_udf, PandasUDFType
+
+        @pandas_udf('double', PandasUDFType.GROUP_AGG)
+        def mean_udf(v):
+            return v.mean()
+        return mean_udf
+
+    @property
+    def sum_udf(self):
+        from pyspark.sql.functions import pandas_udf, PandasUDFType
+
+        @pandas_udf('double', PandasUDFType.GROUP_AGG)
+        def sum_udf(v):
+            return v.sum()
+        return sum_udf
+
+    @property
+    def weighted_mean_udf(self):
+        import numpy as np
+        from pyspark.sql.functions import pandas_udf, PandasUDFType
+
+        @pandas_udf('double', PandasUDFType.GROUP_AGG)
+        def weighted_mean_udf(v, w):
+            return np.average(v, weights=w)
+        return weighted_mean_udf
+
+    def test_basic(self):
+        from pyspark.sql.functions import col, lit, sum, mean
+
+        df = self.data
+        weighted_mean_udf = self.weighted_mean_udf
+
+        result1 = df.groupby('id').agg(weighted_mean_udf(df.v, lit(1.0))).sort('id')
+        expected1 = df.groupby('id').agg(mean(df.v).alias('weighted_mean_udf(v, 1.0)')).sort('id')
+        self.assertPandasEqual(expected1.toPandas(), result1.toPandas())
+
+        result2 = df.groupby((col('id') + 1)).agg(weighted_mean_udf(df.v, lit(1.0)))\
+            .sort(df.id + 1)
+        expected2 = df.groupby((col('id') + 1))\
+            .agg(mean(df.v).alias('weighted_mean_udf(v, 1.0)')).sort(df.id + 1)
+        self.assertPandasEqual(expected2.toPandas(), result2.toPandas())
+
+        result3 = df.groupby('id').agg(weighted_mean_udf(df.v, df.w)).sort('id')
+        expected3 = df.groupby('id').agg(mean(df.v).alias('weighted_mean_udf(v, w)')).sort('id')
+        self.assertPandasEqual(expected3.toPandas(), result3.toPandas())
+
+        result4 = df.groupby((col('id') + 1).alias('id'))\
+            .agg(weighted_mean_udf(df.v, df.w))\
+            .sort('id')
+        expected4 = df.groupby((col('id') + 1).alias('id'))\
+            .agg(mean(df.v).alias('weighted_mean_udf(v, w)'))\
+            .sort('id')
+        self.assertPandasEqual(expected4.toPandas(), result4.toPandas())
+
+    def test_array(self):
+        from pyspark.sql.types import ArrayType, DoubleType
+        from pyspark.sql.functions import pandas_udf, PandasUDFType
+
+        with QuietTest(self.sc):
+            with self.assertRaisesRegexp(NotImplementedError, 'not supported'):
+                @pandas_udf(ArrayType(DoubleType()), PandasUDFType.GROUP_AGG)
+                def mean_and_std_udf(v):
+                    return [v.mean(), v.std()]
+
+    def test_struct(self):
+        from pyspark.sql.functions import pandas_udf, PandasUDFType
+
+        with QuietTest(self.sc):
+            with self.assertRaisesRegexp(NotImplementedError, 'not supported'):
+                @pandas_udf('mean double, std double', PandasUDFType.GROUP_AGG)
+                def mean_and_std_udf(v):
+                    return (v.mean(), v.std())
+
+    def test_alias(self):
+        from pyspark.sql.functions import mean
+
+        df = self.data
+        mean_udf = self.mean_udf
[GitHub] spark pull request #19872: [SPARK-22274][PYTHON][SQL] User-defined aggregati...
Github user HyukjinKwon commented on a diff in the pull request: https://github.com/apache/spark/pull/19872#discussion_r161496153

--- Diff: python/pyspark/sql/tests.py ---
[... same GroupbyAggTests hunk as above, down to: ...]
+    def test_basic(self):
+        from pyspark.sql.functions import col, lit, sum, mean
+
+        df = self.data
+        weighted_mean_udf = self.weighted_mean_udf
+
+        result1 = df.groupby('id').agg(weighted_mean_udf(df.v, lit(1.0))).sort('id')
--- End diff --

Let's add a comment for each test here. Seems hard to read.
[GitHub] spark pull request #19872: [SPARK-22274][PYTHON][SQL] User-defined aggregati...
Github user HyukjinKwon commented on a diff in the pull request: https://github.com/apache/spark/pull/19872#discussion_r161495144

--- Diff: python/pyspark/sql/group.py ---
@@ -82,6 +91,13 @@ def agg(self, *exprs):
         >>> from pyspark.sql import functions as F
         >>> sorted(gdf.agg(F.min(df.age)).collect())
         [Row(name=u'Alice', min(age)=2), Row(name=u'Bob', min(age)=5)]
+
+        >>> from pyspark.sql.functions import pandas_udf, PandasUDFType
+        >>> @pandas_udf('int', PandasUDFType.GROUP_AGG)
+        ... def min_udf(v):
+        ...     return v.min()
+        >>> sorted(gdf.agg(min_udf(df.age)).collect())  # doctest: +SKIP
--- End diff --

That's fine.
[GitHub] spark pull request #19872: [SPARK-22274][PYTHON][SQL] User-defined aggregati...
Github user HyukjinKwon commented on a diff in the pull request: https://github.com/apache/spark/pull/19872#discussion_r161507851

--- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala ---
@@ -334,34 +339,51 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
   object Aggregation extends Strategy {
     def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
       case PhysicalAggregation(
-          groupingExpressions, aggregateExpressions, resultExpressions, child) =>
-
-        val (functionsWithDistinct, functionsWithoutDistinct) =
-          aggregateExpressions.partition(_.isDistinct)
-        if (functionsWithDistinct.map(_.aggregateFunction.children).distinct.length > 1) {
-          // This is a sanity check. We should not reach here when we have multiple distinct
-          // column sets. Our MultipleDistinctRewriter should take care this case.
-          sys.error("You hit a query analyzer bug. Please report your query to " +
-            "Spark user mailing list.")
-        }
-
-        val aggregateOperator =
-          if (functionsWithDistinct.isEmpty) {
-            aggregate.AggUtils.planAggregateWithoutDistinct(
-              groupingExpressions,
-              aggregateExpressions,
-              resultExpressions,
-              planLater(child))
-          } else {
-            aggregate.AggUtils.planAggregateWithOneDistinct(
-              groupingExpressions,
-              functionsWithDistinct,
-              functionsWithoutDistinct,
-              resultExpressions,
-              planLater(child))
-          }
-
-        aggregateOperator
+          groupingExpressions, aggExpressions, resultExpressions, child) =>
+
+        if (aggExpressions.forall(expr => expr.isInstanceOf[AggregateExpression])) {
+          val aggregateExpressions = aggExpressions.map(expr =>
+            expr.asInstanceOf[AggregateExpression])
+
+          val (functionsWithDistinct, functionsWithoutDistinct) =
+            aggregateExpressions.partition(_.isDistinct)
+          if (functionsWithDistinct.map(_.aggregateFunction.children).distinct.length > 1) {
+            // This is a sanity check. We should not reach here when we have multiple distinct
+            // column sets. Our MultipleDistinctRewriter should take care this case.
+            sys.error("You hit a query analyzer bug. Please report your query to " +
+              "Spark user mailing list.")
+          }
+
+          val aggregateOperator =
+            if (functionsWithDistinct.isEmpty) {
+              aggregate.AggUtils.planAggregateWithoutDistinct(
+                groupingExpressions,
+                aggregateExpressions,
+                resultExpressions,
+                planLater(child))
+            } else {
+              aggregate.AggUtils.planAggregateWithOneDistinct(
+                groupingExpressions,
+                functionsWithDistinct,
+                functionsWithoutDistinct,
+                resultExpressions,
+                planLater(child))
+            }
+
+          aggregateOperator
+        } else if (aggExpressions.forall(expr => expr.isInstanceOf[PythonUDF])) {
+          val udfExpressions = aggExpressions.map(expr => expr.asInstanceOf[PythonUDF])
+
+          Seq(execution.python.AggregateInPandasExec(
+            groupingExpressions,
+            udfExpressions,
+            resultExpressions,
+            planLater(child)))
+        } else {
+          throw new IllegalArgumentException(
--- End diff --

`AnalysisException` too?
[GitHub] spark pull request #19872: [SPARK-22274][PYTHON][SQL] User-defined aggregati...
Github user HyukjinKwon commented on a diff in the pull request: https://github.com/apache/spark/pull/19872#discussion_r161507315

--- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala ---
@@ -288,9 +289,13 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
       case PhysicalAggregation(
           namedGroupingExpressions, aggregateExpressions, rewrittenResultExpressions, child) =>
 
+        require(
--- End diff --

Should we throw `AnalysisException` instead?
[GitHub] spark pull request #19872: [SPARK-22274][PYTHON][SQL] User-defined aggregati...
Github user HyukjinKwon commented on a diff in the pull request: https://github.com/apache/spark/pull/19872#discussion_r161498029

--- Diff: python/pyspark/sql/tests.py ---
@@ -4279,6 +4272,386 @@ def test_unsupported_types(self):
             df.groupby('id').apply(f).collect()
 
+@unittest.skipIf(not _have_pandas or not _have_arrow, "Pandas or Arrow not installed")
+class GroupbyAggTests(ReusedSQLTestCase):
--- End diff --

I think we should have a `Pandas` prefix here, and for the other Pandas UDF cases too ...
[GitHub] spark pull request #19872: [SPARK-22274][PYTHON][SQL] User-defined aggregati...
Github user HyukjinKwon commented on a diff in the pull request: https://github.com/apache/spark/pull/19872#discussion_r161503314

--- Diff: python/pyspark/sql/udf.py ---
@@ -111,6 +111,10 @@ def returnType(self):
                 and not isinstance(self._returnType_placeholder, StructType):
             raise ValueError("Invalid returnType: returnType must be a StructType for "
                              "pandas_udf with function type GROUP_MAP")
+        elif self.evalType == PythonEvalType.SQL_PANDAS_GROUP_AGG_UDF \
+                and isinstance(self._returnType_placeholder, (StructType, ArrayType)):
--- End diff --

Hm .. I think we don't support `MapType` either?
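As a sketch of what the reviewer is suggesting, a standalone version of the check above with `MapType` added to the rejected types (my own illustration; the helper name and error message are hypothetical, not part of the PR):

```python
from pyspark.sql.types import ArrayType, DataType, MapType, StructType


def check_group_agg_return_type(return_type: DataType) -> None:
    # Hypothetical standalone form of the elif in the diff, with MapType added:
    # a group aggregate pandas UDF must return a scalar-compatible type, so
    # composite return types are rejected up front.
    if isinstance(return_type, (StructType, ArrayType, MapType)):
        raise NotImplementedError(
            "%s is not supported for pandas_udf with function type GROUP_AGG"
            % return_type)
```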
[GitHub] spark pull request #19872: [SPARK-22274][PYTHON][SQL] User-defined aggregati...
Github user HyukjinKwon commented on a diff in the pull request: https://github.com/apache/spark/pull/19872#discussion_r161495918

--- Diff: python/pyspark/sql/tests.py ---
[... same GroupbyAggTests hunk as above, at: ...]
+    @property
+    def plus_one(self):
--- End diff --

Could we add a prefix to note the differences, like `udf`, `pandas_udf` and the `functionType`?
[GitHub] spark pull request #20023: [SPARK-22036][SQL] Decimal multiplication with hi...
Github user mgaido91 commented on a diff in the pull request: https://github.com/apache/spark/pull/20023#discussion_r161507743

--- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecision.scala ---
@@ -42,8 +43,10 @@ import org.apache.spark.sql.types._
  * e1 / e2      p1 - s1 + s2 + max(6, s1 + p2 + 1)    max(6, s1 + p2 + 1)
  * e1 % e2      min(p1-s1, p2-s2) + max(s1, s2)       max(s1, s2)
  * e1 union e2  max(s1, s2) + max(p1-s1, p2-s2)       max(s1, s2)
- * sum(e1)      p1 + 10                               s1
- * avg(e1)      p1 + 4                                s1 + 4
+ *
+ * When `spark.sql.decimalOperations.allowTruncat` is set to true, if the precision / scale needed
+ * are out of the range of available values, the scale is reduced up to 6, in order to prevent the
--- End diff --

Hive has it, but it is not explained in the documentation. And in the comments it just references the blog I referenced, but then removed according to @gatorsmile's comment.
[GitHub] spark pull request #20023: [SPARK-22036][SQL] Decimal multiplication with hi...
Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/20023#discussion_r161503031

--- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecision.scala ---
[... same DecimalPrecision hunk as above ...]
--- End diff --

Actually, we already referred to a commercial RDBMS at L33...
[GitHub] spark pull request #20023: [SPARK-22036][SQL] Decimal multiplication with hi...
Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/20023#discussion_r161502866

--- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecision.scala ---
[... same DecimalPrecision hunk as above ...]
--- End diff --

Did any open source RDBMS have this rule?
[GitHub] spark pull request #20023: [SPARK-22036][SQL] Decimal multiplication with hi...
Github user mgaido91 commented on a diff in the pull request: https://github.com/apache/spark/pull/20023#discussion_r161502564

--- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecision.scala ---
[... same DecimalPrecision hunk as above ...]
--- End diff --

I did, but @gatorsmile told me to remove it: https://github.com/apache/spark/pull/20023#discussion_r159117817.
[GitHub] spark pull request #20023: [SPARK-22036][SQL] Decimal multiplication with hi...
Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/20023#discussion_r161501303

--- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecision.scala ---
[... same DecimalPrecision hunk as above ...]
--- End diff --

Sorry, typo: "put some reference"... We can put a link to a document of a mainstream RDBMS and say this rule follows xxx...
[GitHub] spark issue #20259: [SPARK-23066][WEB-UI] Master Page increase master start-...
Github user guoxiaolongzte commented on the issue: https://github.com/apache/spark/pull/20259

OK, I understand your suggestion. Can I expose the startup time as a metric?
[GitHub] spark issue #20268: [SPARK-19550][BUILD][FOLLOW-UP] Remove MaxPermSize for s...
Github user AmplabJenkins commented on the issue: https://github.com/apache/spark/pull/20268

Test PASSed. Refer to this link for build results (access rights to CI server needed): https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder/86131/
Test PASSed.
[GitHub] spark issue #20268: [SPARK-19550][BUILD][FOLLOW-UP] Remove MaxPermSize for s...
Github user AmplabJenkins commented on the issue: https://github.com/apache/spark/pull/20268

Merged build finished. Test PASSed.
[GitHub] spark issue #20268: [SPARK-19550][BUILD][FOLLOW-UP] Remove MaxPermSize for s...
Github user SparkQA commented on the issue: https://github.com/apache/spark/pull/20268

**[Test build #86131 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/86131/testReport)** for PR 20268 at commit [`6770835`](https://github.com/apache/spark/commit/67708359ff19d450a3f3e60548df778fb1588515).
 * This patch passes all tests.
 * This patch merges cleanly.
 * This patch adds no public classes.
[GitHub] spark pull request #20206: [SPARK-19256][SQL] Remove ordering enforcement fr...
Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/20206#discussion_r161496715

--- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala ---
@@ -150,6 +152,10 @@ case class InsertIntoHadoopFsRelationCommand(
       }
     }
 
+    val partitionSet = AttributeSet(partitionColumns)
+    val dataColumns = query.output.filterNot(partitionSet.contains)
--- End diff --

We should use `outputColumns` instead of `query.output`, cc @gengliangwang
[GitHub] spark pull request #20206: [SPARK-19256][SQL] Remove ordering enforcement fr...
Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/20206#discussion_r161496099

--- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala ---
@@ -184,6 +190,43 @@ case class InsertIntoHadoopFsRelationCommand(
     Seq.empty[Row]
   }
 
+  private def getBucketIdExpression(dataColumns: Seq[Attribute]): Option[Expression] = {
+    bucketSpec.map { spec =>
+      val bucketColumns = spec.bucketColumnNames.map(c => dataColumns.find(_.name == c).get)
+      // Use `HashPartitioning.partitionIdExpression` as our bucket id expression, so that we can
+      // guarantee the data distribution is same between shuffle and bucketed data source, which
+      // enables us to only shuffle one side when join a bucketed table and a normal one.
+      HashPartitioning(bucketColumns, spec.numBuckets).partitionIdExpression
+    }
+  }
+
+  /**
+   * How is `requiredOrdering` determined ?
+   *
+   * table type   | requiredOrdering
+   * -------------+------------------
+   * normal table | partition columns
--- End diff --

nit: `non-bucketed table`, a partitioned table is not a normal table...
[GitHub] spark pull request #20206: [SPARK-19256][SQL] Remove ordering enforcement fr...
Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/20206#discussion_r161495402

--- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatWriter.scala ---
@@ -156,40 +145,14 @@ object FileFormatWriter extends Logging {
       statsTrackers = statsTrackers
     )
 
-    // We should first sort by partition columns, then bucket id, and finally sorting columns.
-    val requiredOrdering = partitionColumns ++ bucketIdExpression ++ sortColumns
-    // the sort order doesn't matter
-    val actualOrdering = plan.outputOrdering.map(_.child)
-    val orderingMatched = if (requiredOrdering.length > actualOrdering.length) {
-      false
-    } else {
-      requiredOrdering.zip(actualOrdering).forall {
-        case (requiredOrder, childOutputOrder) =>
-          requiredOrder.semanticEquals(childOutputOrder)
-      }
-    }
-
     SQLExecution.checkSQLExecutionId(sparkSession)
 
     // This call shouldn't be put into the `try` block below because it only initializes and
     // prepares the job, any exception thrown from here shouldn't cause abortJob() to be called.
     committer.setupJob(job)
 
     try {
-      val rdd = if (orderingMatched) {
-        plan.execute()
-      } else {
-        // SPARK-21165: the `requiredOrdering` is based on the attributes from analyzed plan, and
-        // the physical plan may have different attribute ids due to optimizer removing some
-        // aliases. Here we bind the expression ahead to avoid potential attribute ids mismatch.
--- End diff --

This concern is still valid: `DataWritingCommand.requiredChildOrdering` is based on the logical plan's output attribute ids, so how can we safely apply it in `DataWritingCommandExec`?
[GitHub] spark pull request #20023: [SPARK-22036][SQL] Decimal multiplication with hi...
Github user mgaido91 commented on a diff in the pull request: https://github.com/apache/spark/pull/20023#discussion_r161494664

--- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecision.scala ---
[... same DecimalPrecision hunk as above ...]
--- End diff --

No, we would return a type with scale 6, and in `CheckOverflow` all the numbers which don't fit will be translated to `NULL`, since this is the current Spark behavior in such cases. May I kindly ask you to elaborate a bit more on what you mean by "push some reference"? Thanks.
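To make the truncation rule being discussed concrete, a small sketch of the adjustment (my own illustration, assuming Spark's maximum decimal precision of 38 and the minimum adjusted scale of 6 mentioned in the diff; the function name is made up):

```python
MAX_PRECISION = 38
MIN_ADJUSTED_SCALE = 6


def adjust_precision_scale(precision, scale):
    """When the exact result type would not fit in decimal(38, _), keep the
    integral digits and reduce the scale, but never below a floor of 6."""
    if precision <= MAX_PRECISION:
        return precision, scale
    int_digits = precision - scale
    min_scale = min(scale, MIN_ADJUSTED_SCALE)
    adjusted_scale = max(MAX_PRECISION - int_digits, min_scale)
    return MAX_PRECISION, adjusted_scale


# e.g. decimal(38, 10) * decimal(38, 10) needs decimal(77, 20) exactly;
# the rule truncates that to decimal(38, 6). Values that still don't fit
# become NULL in CheckOverflow, as described in the comment above.
print(adjust_precision_scale(77, 20))  # (38, 6)
```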
[GitHub] spark issue #20259: [SPARK-23066][WEB-UI] Master Page increase master start-...
Github user srowen commented on the issue: https://github.com/apache/spark/pull/20259

I don't think this number tells you whether the system is stable or not. If it did, how would it be actionable for users who don't have any access to investigate further? Certainly, you should have proper monitoring for all of these services, and their metrics, if you care. This bit of info just isn't informative compared to the large amount of metrics already exposed. (I could see exposing it as a metric, though, if it isn't already. That's what metrics reporting is for.)

If users who have access can't investigate because important logs disappear, that is certainly a log-policy problem you need to fix. Although it's "just one extra piece of info", I think there's no logic to adding just this piece of info, and only here. I would oppose this change.
[GitHub] spark issue #19247: [Spark-21996][SQL] read files with space in name for str...
Github user AmplabJenkins commented on the issue: https://github.com/apache/spark/pull/19247

Test PASSed. Refer to this link for build results (access rights to CI server needed): https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder/86132/
Test PASSed.
[GitHub] spark issue #19247: [Spark-21996][SQL] read files with space in name for str...
Github user AmplabJenkins commented on the issue: https://github.com/apache/spark/pull/19247

Merged build finished. Test PASSed.
[GitHub] spark issue #19247: [Spark-21996][SQL] read files with space in name for str...
Github user SparkQA commented on the issue: https://github.com/apache/spark/pull/19247

**[Test build #86132 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/86132/testReport)** for PR 19247 at commit [`2542014`](https://github.com/apache/spark/commit/2542014b8769cb3a605ec03e3d1e45ff2ab81576).
 * This patch passes all tests.
 * This patch merges cleanly.
 * This patch adds no public classes.
[GitHub] spark pull request #19872: [SPARK-22274][PYTHON][SQL] User-defined aggregati...
Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/19872#discussion_r161491996

--- Diff: python/pyspark/sql/functions.py ---
@@ -2214,6 +2216,37 @@ def pandas_udf(f=None, returnType=None, functionType=None):
 
        .. seealso:: :meth:`pyspark.sql.GroupedData.apply`
 
+    3. GROUP_AGG
+
+       A group aggregate UDF defines a transformation: One or more `pandas.Series` -> A scalar
+       The returnType should be a primitive data type, e.g, `DoubleType()`.
+       The returned scalar can be either a python primitive type, e.g., `int` or `float`
+       or a numpy data type, e.g., `numpy.int64` or `numpy.float64`.
+
+       StructType and ArrayType are currently not supported.
+
+       Group aggregate UDFs are used with :meth:`pyspark.sql.GroupedData.agg`
+
+       >>> from pyspark.sql.functions import pandas_udf, PandasUDFType
+       >>> df = spark.createDataFrame(
+       ...     [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)],
+       ...     ("id", "v"))
+       >>> @pandas_udf("double", PandasUDFType.GROUP_AGG)
+       ... def mean_udf(v):
--- End diff --

shall we include grouping columns?
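The quoted doctest is cut off at the decorator; a sketch of how it might continue so the grouping column shows up in the result, with the UDF body taken from the GroupbyAggTests suite earlier in this digest (the rendered output table is my own illustration, computed from the data in the diff):

```python
>>> @pandas_udf("double", PandasUDFType.GROUP_AGG)
... def mean_udf(v):
...     return v.mean()
>>> df.groupby("id").agg(mean_udf(df['v'])).show()  # doctest: +SKIP
+---+-----------+
| id|mean_udf(v)|
+---+-----------+
|  1|        1.5|
|  2|        6.0|
+---+-----------+
```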
[GitHub] spark pull request #20249: [SPARK-23057][SPARK-19235][SQL] SET LOCATION shou...
Github user attilapiros commented on a diff in the pull request: https://github.com/apache/spark/pull/20249#discussion_r161487862

--- Diff: sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala ---
@@ -1869,6 +1869,65 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils {
     }
   }
 
+  test("SPARK-23057: SET LOCATION for managed table with partition") {
+    withTable("tbl_partition") {
+      withTempDir { dir =>
+        sql("CREATE TABLE tbl_partition(col1 INT, col2 INT) USING parquet PARTITIONED BY (col1)")
+        sql("INSERT INTO tbl_partition PARTITION(col1=1) SELECT 11")
+        sql("INSERT INTO tbl_partition PARTITION(col1=2) SELECT 22")
+        checkAnswer(spark.table("tbl_partition"), Seq(Row(11, 1), Row(22, 2)))
+        val defaultTablePath = spark.sessionState.catalog
+          .getTableMetadata(TableIdentifier("tbl_partition")).storage.locationUri.get
+        try {
+          // before set location of partition col1 = 1 and 2
+          checkPath(defaultTablePath.toString, Map("col1" -> "1"), "tbl_partition")
+          checkPath(defaultTablePath.toString, Map("col1" -> "2"), "tbl_partition")
+          val path = dir.getCanonicalPath
+
+          // set location of partition col1 = 1
+          sql(s"ALTER TABLE tbl_partition PARTITION (col1='1') SET LOCATION '$path'")
+          checkPath(dir.getCanonicalPath, Map("col1" -> "1"), "tbl_partition")
+          checkPath(defaultTablePath.toString, Map("col1" -> "2"), "tbl_partition")
+
+          // set location of partition col1 = 2
+          sql(s"ALTER TABLE tbl_partition PARTITION (col1='2') SET LOCATION '$path'")
+          checkPath(dir.getCanonicalPath, Map("col1" -> "1"), "tbl_partition")
+          checkPath(dir.getCanonicalPath, Map("col1" -> "2"), "tbl_partition")
+
+          spark.catalog.refreshTable("tbl_partition")
+          // SET LOCATION won't move data from previous table path to new table path.
+          assert(spark.table("tbl_partition").count() == 0)
--- End diff --

To me, this assert states that the old data is still there, which is not consistent with the comment above. I would suggest checking whether the new location dir is empty:
```scala
assert(dir.listFiles().isEmpty)
```
[GitHub] spark pull request #20249: [SPARK-23057][SPARK-19235][SQL] SET LOCATION shou...
Github user attilapiros commented on a diff in the pull request: https://github.com/apache/spark/pull/20249#discussion_r161491069

--- Diff: sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala ---
@@ -751,6 +751,25 @@ class HiveDDLSuite
     }
   }
 
+  test("SPARK-23057: SET LOCATION should change the path of partition in table") {
+    withTable("boxes") {
+      sql("CREATE TABLE boxes (height INT, length INT) PARTITIONED BY (width INT)")
+      sql("INSERT OVERWRITE TABLE boxes PARTITION (width=4) SELECT 4, 4")
+      val expected = "/path/to/part/ways"
+      sql(s"ALTER TABLE boxes PARTITION (width=4) SET LOCATION '$expected'")
+      val catalog = spark.sessionState.catalog
--- End diff --

This is close to the body of the checkPath() method. Is it possible to find a common place for the method and call it from here too, for example by creating an object close to DDLSuite? What is your opinion?
[GitHub] spark pull request #20249: [SPARK-23057][SPARK-19235][SQL] SET LOCATION shou...
Github user attilapiros commented on a diff in the pull request: https://github.com/apache/spark/pull/20249#discussion_r161488850

--- Diff: sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala ---
[... same SPARK-23057 test hunk as above, continuing to: ...]
+  def checkPath(path: String, partSpec: Map[String, String], table: String): Unit = {
+    val catalog = spark.sessionState.catalog
+    val spec = Some(partSpec)
--- End diff --

The `Some` is not needed here, so the `spec.map {}` below can be spared too.
[GitHub] spark pull request #20249: [SPARK-23057][SPARK-19235][SQL] SET LOCATION shou...
Github user attilapiros commented on a diff in the pull request: https://github.com/apache/spark/pull/20249#discussion_r161477649

--- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala ---
@@ -800,6 +802,15 @@ case class AlterTableSetLocationCommand(
     CommandUtils.updateTableStats(sparkSession, table)
     Seq.empty[Row]
   }
+
+  private def updatePathInProps(
+      storage: CatalogStorageFormat,
+      newPath: Option[String]): Map[String, String] = {
--- End diff --

I would suggest using `String` as the type of `newPath` instead of `Option[String]`, since `updatePathInProps` is always called with a `Some`, so the `Option` is not needed here.
[GitHub] spark pull request #20249: [SPARK-23057][SPARK-19235][SQL] SET LOCATION shou...
Github user attilapiros commented on a diff in the pull request: https://github.com/apache/spark/pull/20249#discussion_r161480210

--- Diff: sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala ---
[... same SPARK-23057 test hunk as above, at: ...]
+          // set location of partition col1 = 1
+          sql(s"ALTER TABLE tbl_partition PARTITION (col1='1') SET LOCATION '$path'")
+          checkPath(dir.getCanonicalPath, Map("col1" -> "1"), "tbl_partition")
--- End diff --

Consider using the `path` val here when calling the `checkPath` method, instead of `dir.getCanonicalPath`, to stay as close to your ALTER as possible when checking its effect; alternatively, remove the `path` val and use `dir.getCanonicalPath` everywhere, avoiding the unnecessary indirection. If the reason for introducing `path` was the line-length restriction, you can even use multiline string interpolation.
[GitHub] spark pull request #20249: [SPARK-23057][SPARK-19235][SQL] SET LOCATION shou...
Github user attilapiros commented on a diff in the pull request: https://github.com/apache/spark/pull/20249#discussion_r161488605

--- Diff: sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala ---
[... same SPARK-23057 test hunk as above, continuing to: ...]
+          sql("DROP TABLE tbl_partition")
+          // the new table path will be removed after DROP TABLE.
+          assert(!dir.exists())
+        } finally {
+          Utils.deleteRecursively(new File(defaultTablePath))
--- End diff --

If the test fails before the DROP, the cleanup of this new `dir` would not happen. In the finally, please also call deleteRecursively on the new dir if it still exists.
[GitHub] spark pull request #20249: [SPARK-23057][SPARK-19235][SQL] SET LOCATION shou...
Github user attilapiros commented on a diff in the pull request: https://github.com/apache/spark/pull/20249#discussion_r161488179

--- Diff: sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala ---
[... same SPARK-23057 test hunk as above, at: ...]
+          spark.catalog.refreshTable("tbl_partition")
+          // SET LOCATION won't move data from previous table path to new table path.
+          assert(spark.table("tbl_partition").count() == 0)
+          // the previous table path should be still there.
+          assert(new File(defaultTablePath).exists())
--- End diff --

Move this above the previously suggested `dir.listFiles().isEmpty` assert.
[GitHub] spark pull request #20167: [SPARK-16501] [MESOS] Allow providing Mesos princ...
Github user rvesse commented on a diff in the pull request: https://github.com/apache/spark/pull/20167#discussion_r161484346

--- Diff: resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtils.scala ---
@@ -80,10 +80,27 @@ trait MesosSchedulerUtils extends Logging {
     }
     fwInfoBuilder.setHostname(Option(conf.getenv("SPARK_PUBLIC_DNS")).getOrElse(
       conf.get(DRIVER_HOST_ADDRESS)))
+    conf.getOption("spark.mesos.principal.file")
--- End diff --

We have customers who operate secure multi-tenant environments and who consider even leaking the principals of users from other tenants to be a security issue, i.e. they want to minimise what users from one tenant can learn about users from another.
[GitHub] spark issue #20140: [SPARK-19228][SQL] Introduce tryParseDate method to proc...
Github user sergey-rubtsov commented on the issue: https://github.com/apache/spark/pull/20140

@HyukjinKwon, @gatorsmile could you please help find someone to review this?
[GitHub] spark pull request #19764: [SPARK-22539][SQL] Add second order for rangepart...
Github user caneGuy closed the pull request at: https://github.com/apache/spark/pull/19764
[GitHub] spark pull request #20257: [SPARK-23048][ML] Add OneHotEncoderEstimator docu...
Github user MLnick commented on a diff in the pull request: https://github.com/apache/spark/pull/20257#discussion_r161472191

--- Diff: docs/ml-features.md ---
@@ -775,7 +775,9 @@ for more details on the API.

-## OneHotEncoder
+## OneHotEncoder (Deprecated since 2.3.0)
--- End diff --

I think we should add a little more detail about why it's deprecated. The reason is that, because the existing `OneHotEncoder` is a stateless transformer, it is not usable on new data where the number of categories may differ from the training data. In order to fix this, a new `OneHotEncoderEstimator` was created that produces a `OneHotEncoderModel` when fit. Add a link to the JIRA ticket for more detail (https://issues.apache.org/jira/browse/SPARK-13030).
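To illustrate the estimator/model split the comment describes, a minimal PySpark sketch (API as of Spark 2.3; the data is made up for illustration): fitting records the number of categories per column, so the resulting model encodes new data with a consistent vector width, which the old stateless OneHotEncoder could not guarantee.

```python
from pyspark.ml.feature import OneHotEncoderEstimator
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("ohe-sketch").getOrCreate()

# Category indices as they might come out of StringIndexer.
train = spark.createDataFrame([(0.0,), (1.0,), (2.0,), (1.0,)], ["categoryIndex"])

encoder = OneHotEncoderEstimator(inputCols=["categoryIndex"],
                                 outputCols=["categoryVec"])
# fit() records the category sizes seen during training ...
model = encoder.fit(train)

# ... so transform() encodes new rows with the same vector width,
# even when the new data happens to contain fewer distinct categories.
new_data = spark.createDataFrame([(0.0,), (1.0,)], ["categoryIndex"])
model.transform(new_data).show()
```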
[GitHub] spark pull request #20257: [SPARK-23048][ML] Add OneHotEncoderEstimator docu...
Github user MLnick commented on a diff in the pull request: https://github.com/apache/spark/pull/20257#discussion_r161477464

--- Diff: examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderEstimatorExample.scala ---
[... Apache license header ...]
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.{OneHotEncoderEstimator, StringIndexer}
+// $example off$
+import org.apache.spark.sql.SparkSession
+
+object OneHotEncoderEstimatorExample {
+  def main(args: Array[String]): Unit = {
+    val spark = SparkSession
+      .builder
+      .appName("OneHotEncoderEstimatorExample")
+      .getOrCreate()
+
+    // $example on$
+    val df = spark.createDataFrame(Seq(
--- End diff --

I know the examples are re-creating the existing `OneHotEncoder` examples, but perhaps we should just drop the `StringIndexer` part and show a simplified example transforming the raw label indices to OHE vectors? We could mention in the user guide that it is common to encode categorical features using `StringIndexer` first?
[GitHub] spark pull request #20257: [SPARK-23048][ML] Add OneHotEncoderEstimator docu...
Github user MLnick commented on a diff in the pull request: https://github.com/apache/spark/pull/20257#discussion_r161475879

--- Diff: docs/ml-features.md ---
@@ -775,7 +775,9 @@ for more details on the API.

-## OneHotEncoder
+## OneHotEncoder (Deprecated since 2.3.0)
+
+`OneHotEncoder` will be deprecated in 2.3.0 and removed in 3.0.0. Please use [OneHotEncoderEstimator](ml-features.html#onehotencoderestimator) instead.
--- End diff --

"will be" -> "has been", and then "and will be removed"
[GitHub] spark pull request #20257: [SPARK-23048][ML] Add OneHotEncoderEstimator docu...
Github user MLnick commented on a diff in the pull request: https://github.com/apache/spark/pull/20257#discussion_r161473460

--- Diff: docs/ml-features.md ---
@@ -807,6 +809,36 @@ for more details on the API.

+## OneHotEncoderEstimator
+
+[One-hot encoding](http://en.wikipedia.org/wiki/One-hot) maps a column of label indices to a column of binary vectors, with at most a single one-value. This encoding allows algorithms which expect continuous features, such as Logistic Regression, to use categorical features.
--- End diff --

We should add a note that it can handle multiple columns (and returns a one-hot-encoded output vector column for _each_ input column, rather than merging into one output vector). Also, what about describing the missing / invalid value handling in more detail?
[GitHub] spark pull request #20257: [SPARK-23048][ML] Add OneHotEncoderEstimator docu...
Github user MLnick commented on a diff in the pull request: https://github.com/apache/spark/pull/20257#discussion_r161472954 --- Diff: docs/ml-features.md --- @@ -775,7 +775,9 @@ for more details on the API. -## OneHotEncoder +## OneHotEncoder (Deprecated since 2.3.0) + +`OneHotEncoder` will be deprecated in 2.3.0 and removed in 3.0.0. Please use [OneHotEncoderEstimator](ml-features.html#onehotencoderestimator) instead. --- End diff -- Since it is deprecated - and I think we should be pretty aggressive about moving users to the new estimator - what do folks think about removing the description and examples from this doc and just pointing to the new estimator as done in this sentence here?
[GitHub] spark issue #20229: [SPARK-23045][ML][SparkR] Update RFormula to use OneHotE...
Github user AmplabJenkins commented on the issue: https://github.com/apache/spark/pull/20229 Merged build finished. Test PASSed.
[GitHub] spark issue #20229: [SPARK-23045][ML][SparkR] Update RFormula to use OneHotE...
Github user AmplabJenkins commented on the issue: https://github.com/apache/spark/pull/20229 Test PASSed. Refer to this link for build results (access rights to CI server needed): https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder/86133/ Test PASSed.
[GitHub] spark issue #20229: [SPARK-23045][ML][SparkR] Update RFormula to use OneHotE...
Github user SparkQA commented on the issue: https://github.com/apache/spark/pull/20229 **[Test build #86133 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/86133/testReport)** for PR 20229 at commit [`68d9ba1`](https://github.com/apache/spark/commit/68d9ba1a1fa7648037e61bfb928338b74eb7d669). * This patch passes all tests. * This patch merges cleanly. * This patch adds no public classes.
[GitHub] spark issue #20229: [SPARK-23045][ML][SparkR] Update RFormula to use OneHotE...
Github user MrBago commented on the issue: https://github.com/apache/spark/pull/20229 I've rebased on master, I think that should resolve the issues @viirya raised.
[GitHub] spark issue #20229: [SPARK-23045][ML][SparkR] Update RFormula to use OneHotE...
Github user SparkQA commented on the issue: https://github.com/apache/spark/pull/20229 **[Test build #86133 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/86133/testReport)** for PR 20229 at commit [`68d9ba1`](https://github.com/apache/spark/commit/68d9ba1a1fa7648037e61bfb928338b74eb7d669).
[GitHub] spark pull request #20163: [SPARK-22966][PYTHON][SQL] Python UDFs with retur...
Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/20163#discussion_r161455317 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvaluatePython.scala --- @@ -144,6 +145,7 @@ object EvaluatePython { } case StringType => (obj: Any) => nullSafeConvert(obj) { + case _: Calendar => null case _ => UTF8String.fromString(obj.toString) --- End diff -- looks good
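For context on the one-line diff above, a simplified sketch of the converter pattern it modifies; `nullSafeConvert` is approximated here, and only the `Calendar` arm is new:

```scala
import java.util.Calendar
import org.apache.spark.unsafe.types.UTF8String

// Approximation of EvaluatePython's helper: null stays null, and any value
// the partial function does not cover also becomes null.
def nullSafeConvert(obj: Any)(f: PartialFunction[Any, Any]): Any =
  if (obj == null) null else f.applyOrElse(obj, (_: Any) => null)

// StringType converter: a Calendar (a Python datetime arriving via Py4J)
// does not match the declared string type, so it converts to null instead
// of being stringified.
val toStringType: Any => Any = (obj: Any) => nullSafeConvert(obj) {
  case _: Calendar => null
  case other => UTF8String.fromString(other.toString)
}
```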
[GitHub] spark issue #20184: [SPARK-22987][Core] UnsafeExternalSorter cases OOM when ...
Github user jerryshao commented on the issue: https://github.com/apache/spark/pull/20184 The code here should be fine for the normal case. The problem is that there are so many spill files, which requires maintaining a buffer for every file handle. Lazy buffer allocation could solve this problem, IIUC. It is not related to the queue or anything else.
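A hypothetical sketch of that lazy-allocation idea; the class and method names are illustrative, not Spark's actual spill reader:

```scala
import java.io.{BufferedInputStream, FileInputStream, InputStream}

// Each spilled file gets a reader object up front, but the read buffer is
// only materialized on first use, so N registered spill files no longer
// pin N buffers in memory before they are actually consumed.
class LazySpillReader(path: String, bufferSize: Int = 1024 * 1024) {
  private var in: InputStream = _ // nothing allocated at construction time

  private def stream(): InputStream = {
    if (in == null) {
      in = new BufferedInputStream(new FileInputStream(path), bufferSize)
    }
    in
  }

  def read(dst: Array[Byte]): Int = stream().read(dst)

  def close(): Unit = if (in != null) in.close()
}
```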
[GitHub] spark pull request #20211: [SPARK-23011][PYTHON][SQL] Prepend missing groupi...
Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/20211#discussion_r161453175 --- Diff: python/pyspark/sql/group.py --- @@ -233,6 +233,27 @@ def apply(self, udf): | 2| 1.1094003924504583| +---+---+ +Notes on grouping column: --- End diff -- SGTM, one remaining question is how to define the type of `key`: a row?
[GitHub] spark pull request #20214: [SPARK-23023][SQL] Cast field data to strings in ...
Github user asfgit closed the pull request at: https://github.com/apache/spark/pull/20214
[GitHub] spark pull request #19247: [Spark-21996][SQL] read files with space in name ...
Github user mgaido91 commented on a diff in the pull request: https://github.com/apache/spark/pull/19247#discussion_r161452270 --- Diff: sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala --- @@ -408,6 +420,18 @@ class FileStreamSourceSuite extends FileStreamSourceTest { } } + test("SPARK-21996 read from text files -- file name has space") { --- End diff -- can we run the same test for the other input formats too, i.e. parquet, orc, ...?
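One hypothetical way to do that, assuming the suite's existing `test` helper; the format list and the elided body are illustrative, not from the PR:

```scala
// Parameterize the space-in-file-name test over the other input formats.
Seq("json", "csv", "parquet", "orc").foreach { format =>
  test(s"SPARK-21996 read from $format files -- file name has space") {
    // Write one record, in `format`, to a file whose name contains a space,
    // then read it back through the corresponding streaming source and
    // assert the row round-trips (body elided in this sketch).
  }
}
```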
[GitHub] spark issue #20214: [SPARK-23023][SQL] Cast field data to strings in showStr...
Github user cloud-fan commented on the issue: https://github.com/apache/spark/pull/20214 thanks, merging to master/2.3
[GitHub] spark pull request #20023: [SPARK-22036][SQL] Decimal multiplication with hi...
Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/20023#discussion_r161451478 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecision.scala --- @@ -93,41 +97,76 @@ object DecimalPrecision extends TypeCoercionRule { case e: BinaryArithmetic if e.left.isInstanceOf[PromotePrecision] => e case Add(e1 @ DecimalType.Expression(p1, s1), e2 @ DecimalType.Expression(p2, s2)) => - val dt = DecimalType.bounded(max(s1, s2) + max(p1 - s1, p2 - s2) + 1, max(s1, s2)) - CheckOverflow(Add(promotePrecision(e1, dt), promotePrecision(e2, dt)), dt) + val resultType = if (SQLConf.get.decimalOperationsAllowTruncat) { +val resultScale = max(s1, s2) --- End diff -- we can put this before the `if`
[GitHub] spark pull request #20023: [SPARK-22036][SQL] Decimal multiplication with hi...
Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/20023#discussion_r161451267 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecision.scala --- @@ -42,8 +43,10 @@ import org.apache.spark.sql.types._ * e1 / e2 p1 - s1 + s2 + max(6, s1 + p2 + 1) max(6, s1 + p2 + 1) * e1 % e2 min(p1-s1, p2-s2) + max(s1, s2) max(s1, s2) * e1 union e2 max(s1, s2) + max(p1-s1, p2-s2) max(s1, s2) - * sum(e1) p1 + 10 s1 - * avg(e1) p1 + 4 s1 + 4 + * + * When `spark.sql.decimalOperations.allowTruncat` is set to true, if the precision / scale needed + * are out of the range of available values, the scale is reduced up to 6, in order to prevent the --- End diff -- if scale needs to be less than 6, we would fail the analysis, right?
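As a worked instance of the adjustment described in the diff, assuming the rule keeps the integral digits and retains at least 6 digits of scale:

```latex
\text{DECIMAL}(38,18) + \text{DECIMAL}(38,18):\\
p = \max(s_1, s_2) + \max(p_1 - s_1,\, p_2 - s_2) + 1 = 18 + 20 + 1 = 39, \qquad
s = \max(s_1, s_2) = 18\\
p > 38 \;\Rightarrow\; s_{\mathrm{adj}} = \max\bigl(38 - (p - s),\ \min(s, 6)\bigr)
  = \max(17,\ 6) = 17\\
\Rightarrow\ \text{result type} = \text{DECIMAL}(38, 17)
```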
[GitHub] spark issue #19247: [Spark-21996][SQL] read files with space in name for str...
Github user SparkQA commented on the issue: https://github.com/apache/spark/pull/19247 **[Test build #86132 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/86132/testReport)** for PR 19247 at commit [`2542014`](https://github.com/apache/spark/commit/2542014b8769cb3a605ec03e3d1e45ff2ab81576).
[GitHub] spark pull request #20023: [SPARK-22036][SQL] Decimal multiplication with hi...
Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/20023#discussion_r161450453 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecision.scala --- @@ -42,8 +43,10 @@ import org.apache.spark.sql.types._ * e1 / e2 p1 - s1 + s2 + max(6, s1 + p2 + 1) max(6, s1 + p2 + 1) * e1 % e2 min(p1-s1, p2-s2) + max(s1, s2) max(s1, s2) * e1 union e2 max(s1, s2) + max(p1-s1, p2-s2) max(s1, s2) - * sum(e1) p1 + 10 s1 - * avg(e1) p1 + 4 s1 + 4 + * + * When `spark.sql.decimalOperations.allowTruncat` is set to true, if the precision / scale needed + * are out of the range of available values, the scale is reduced up to 6, in order to prevent the --- End diff -- it would be better to add a reference for where this rule comes from.
[GitHub] spark issue #19247: [Spark-21996][SQL] read files with space in name for str...
Github user HyukjinKwon commented on the issue: https://github.com/apache/spark/pull/19247 retest this please
[GitHub] spark issue #20268: [SPARK-19550][BUILD][FOLLOW-UP] Remove MaxPermSize for s...
Github user SparkQA commented on the issue: https://github.com/apache/spark/pull/20268 **[Test build #86131 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/86131/testReport)** for PR 20268 at commit [`6770835`](https://github.com/apache/spark/commit/67708359ff19d450a3f3e60548df778fb1588515).
[GitHub] spark issue #20268: [SPARK-19550][BUILD][FOLLOW-UP] Remove MaxPermSize for s...
Github user wangyum commented on the issue: https://github.com/apache/spark/pull/20268 retest this please
[GitHub] spark pull request #20023: [SPARK-22036][SQL] Decimal multiplication with hi...
Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/20023#discussion_r161449017 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala --- @@ -1048,6 +1048,16 @@ object SQLConf { .booleanConf .createWithDefault(true) + val DECIMAL_OPERATIONS_ALLOW_TRUNCAT = +buildConf("spark.sql.decimalOperations.allowTruncat") --- End diff -- Sorry that was my typo... `allowPrecisionLoss` SGTM
[GitHub] spark issue #20268: [SPARK-19550][BUILD][FOLLOW-UP] Remove MaxPermSize for s...
Github user AmplabJenkins commented on the issue: https://github.com/apache/spark/pull/20268 Test FAILed. Refer to this link for build results (access rights to CI server needed): https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder/86129/ Test FAILed.
[GitHub] spark issue #20268: [SPARK-19550][BUILD][FOLLOW-UP] Remove MaxPermSize for s...
Github user AmplabJenkins commented on the issue: https://github.com/apache/spark/pull/20268 Merged build finished. Test FAILed.
[GitHub] spark issue #19247: [Spark-21996][SQL] read files with space in name for str...
Github user AmplabJenkins commented on the issue: https://github.com/apache/spark/pull/19247 Merged build finished. Test FAILed.
[GitHub] spark issue #19247: [Spark-21996][SQL] read files with space in name for str...
Github user AmplabJenkins commented on the issue: https://github.com/apache/spark/pull/19247 Test FAILed. Refer to this link for build results (access rights to CI server needed): https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder/86130/ Test FAILed.
[GitHub] spark issue #19247: [Spark-21996][SQL] read files with space in name for str...
Github user SparkQA commented on the issue: https://github.com/apache/spark/pull/19247 **[Test build #86130 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/86130/testReport)** for PR 19247 at commit [`2542014`](https://github.com/apache/spark/commit/2542014b8769cb3a605ec03e3d1e45ff2ab81576). * This patch **fails due to an unknown error code, -9**. * This patch merges cleanly. * This patch adds no public classes.
[GitHub] spark issue #20268: [SPARK-19550][BUILD][FOLLOW-UP] Remove MaxPermSize for s...
Github user SparkQA commented on the issue: https://github.com/apache/spark/pull/20268 **[Test build #86129 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/86129/testReport)** for PR 20268 at commit [`6770835`](https://github.com/apache/spark/commit/67708359ff19d450a3f3e60548df778fb1588515). * This patch **fails due to an unknown error code, -9**. * This patch merges cleanly. * This patch adds no public classes.