Github user viirya commented on a diff in the pull request: https://github.com/apache/spark/pull/19480#discussion_r144683715 --- Diff: sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala --- @@ -2103,4 +2103,35 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { testData2.select(lit(7), 'a, 'b).orderBy(lit(1), lit(2), lit(3)), Seq(Row(7, 1, 1), Row(7, 1, 2), Row(7, 2, 1), Row(7, 2, 2), Row(7, 3, 1), Row(7, 3, 2))) } + + test("SPARK-22226: splitExpressions should not generate codes beyond 64KB") { + val colNumber = 10000 + val input = spark.range(2).rdd.map(_ => Row(1 to colNumber: _*)) + val df = sqlContext.createDataFrame(input, StructType( + (1 to colNumber).map(colIndex => StructField(s"_$colIndex", IntegerType, false)))) + val newCols = (1 to colNumber).flatMap { colIndex => + Seq(expr(s"if(1000 < _$colIndex, 1000, _$colIndex)"), + expr(s"sqrt(_$colIndex)")) + } + df.select(newCols: _*).collect() + } + + test("SPARK-22226: too many splitted expressions should not exceed constant pool limit") { --- End diff -- The following test, which can't pass in the current master branch, passes with your fix. I didn't see the OOM issue or the nested-class constant pool issue. ```scala test("SPARK-22226: too many splitted expressions should not exceed constant pool limit") { val colNumber = 5000 val input = spark.range(2).rdd.map(_ => Row(1 to colNumber: _*)) val df = sqlContext.createDataFrame(input, StructType( (1 to colNumber).map(colIndex => StructField(s"_$colIndex", IntegerType, false)))) val funcs = (1 to colNumber).map { colIndex => val colName = s"_$colIndex" col(colName).cast(LongType) } df.select(funcs: _*).dropDuplicates((1 to 5).map(colIndex => s"_$colIndex")).collect() } ```
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org