[GitHub] spark issue #16119: [SPARK-18687][Pyspark][SQL]Backward compatibility - crea...
Github user vijoshi commented on the issue: https://github.com/apache/spark/pull/16119 @cloud-fan sure, i'll push an update soon
[GitHub] spark pull request #16337: [SPARK-18871][SQL] New test cases for IN/NOT IN s...
Github user gatorsmile commented on a diff in the pull request: https://github.com/apache/spark/pull/16337#discussion_r93824925

--- Diff: sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/simple-in.sql ---
@@ -0,0 +1,92 @@
+-- A test suite for simple IN predicate subquery
+-- It includes correlated cases.
+
+-- tables and data types
+
+CREATE DATABASE indb;
+CREATE TABLE t1(t1a String, t1b Short, t1c Int, t1d Long, t1e float, t1f double, t1g DECIMAL, t1h TIMESTAMP, t1i Date)
+using parquet;
+CREATE TABLE t2(t2a String, t2b Short, t2c Int, t2d Long, t2e float, t2f double, t2g DECIMAL, t2h TIMESTAMP, t2i Date)
+using parquet;
+CREATE TABLE t3(t3a String, t3b Short, t3c Int, t3d Long, t3e float, t3f double, t3g DECIMAL, t3h TIMESTAMP, t3i Date)
+using parquet;
+
+-- insert to tables
+INSERT INTO t1 VALUES
+  ('t1a', 6, 8, 10, 15, 20, 20.00, timestamp(date("2014-04-04")), date("2014-04-04")),
+  ('t1b', 8, 16, 19, 17, 25, 26.00, timestamp(date("2014-05-04")), date("2014-05-04")),
+  ('t1a', 16, 12, 21, 15, 20, 20.00, timestamp(date("2014-06-04")), date("2014-06-04")),
+  ('t1a', 16, 12, 10, 15, 20, 20.00, timestamp(date("2014-07-04")), date("2014-07-04")),
+  ('t1c', 8, 16, 19, 17, 25, 26.00, timestamp(date("2014-05-04")), date("2014-05-05")),
+  ('t1d', null, 16, 22, 17, 25, 26.00, timestamp(date("2014-06-04")), null),
+  ('t1d', null, 16, 19, 17, 25, 26.00, timestamp(date("2014-07-04")), null),
+  ('t1e', 10, null, 25, 17, 25, 26.00, timestamp(date("2014-08-04")), date("2014-08-04")),
+  ('t1e', 10, null, 19, 17, 25, 26.00, timestamp(date("2014-09-04")), date("2014-09-04")),
+  ('t1d', 10, null, 12, 17, 25, 26.00, timestamp(date("2015-05-04")), date("2015-05-04")),
+  ('t1a', 6, 8, 10, 15, 20, 20.00, timestamp(date("2014-04-04")), date("2014-04-04")),
+  ('t1e', 10, null, 19, 17, 25, 26.00, timestamp(date("2014-05-04")), date("2014-05-04"));
+
+INSERT INTO t2 VALUES
+  ('t2a', 6, 12, 14, 15, 20, 20.00, timestamp(date("2014-04-04")), date("2014-04-04")),
+  ('t1b', 10, 12, 19, 17, 25, 26.00, timestamp(date("2014-05-04")), date("2014-05-04")),
+  ('t1b', 8, 16, 119, 17, 25, 26.00, timestamp(date("2015-05-04")), date("2015-05-04")),
+  ('t1c', 12, 16, 219, 17, 25, 26.00, timestamp(date("2016-05-04")), date("2016-05-04")),
+  ('t1b', null, 16, 319, 17, 25, 26.00, timestamp(date("2017-05-04")), null),
+  ('t2e', 8, null, 419, 17, 25, 26.00, timestamp(date("2014-06-04")), date("2014-06-04")),
+  ('t1f', 19, null, 519, 17, 25, 26.00, timestamp(date("2014-05-04")), date("2014-05-04")),
+  ('t1b', 10, 12, 19, 17, 25, 26.00, timestamp(date("2014-06-04")), date("2014-06-04")),
+  ('t1b', 8, 16, 19, 17, 25, 26.00, timestamp(date("2014-07-04")), date("2014-07-04")),
+  ('t1c', 12, 16, 19, 17, 25, 26.00, timestamp(date("2014-08-04")), date("2014-08-05")),
+  ('t1e', 8, null, 19, 17, 25, 26.00, timestamp(date("2014-09-04")), date("2014-09-04")),
+  ('t1f', 19, null, 19, 17, 25, 26.00, timestamp(date("2014-10-04")), date("2014-10-04")),
+  ('t1b', null, 16, 19, 17, 25, 26.00, timestamp(date("2014-05-04")), null);
+
+INSERT INTO t3 VALUES
+  ('t3a', 6, 12, 110, 15, 20, 20.00, timestamp(date("2014-04-04")), date("2014-04-04")),
+  ('t3a', 6, 12, 10, 15, 20, 20.00, timestamp(date("2014-05-04")), date("2014-05-04")),
+  ('t1b', 10, 12, 219, 17, 25, 26.00, timestamp(date("2014-05-04")), date("2014-05-04")),
+  ('t1b', 10, 12, 19, 17, 25, 26.00, timestamp(date("2014-05-04")), date("2014-05-04")),
+  ('t1b', 8, 16, 319, 17, 25, 26.00, timestamp(date("2014-06-04")), date("2014-06-04")),
+  ('t1b', 8, 16, 19, 17, 25, 26.00, timestamp(date("2014-07-04")), date("2014-07-04")),
+  ('t3c', 17, 16, 519, 17, 25, 26.00, timestamp(date("2014-08-04")), date("2014-08-04")),
+  ('t3c', 17, 16, 19, 17, 25, 26.00, timestamp(date("2014-09-04")), date("2014-09-05")),
+  ('t1b', null, 16, 419, 17, 25, 26.00, timestamp(date("2014-10-04")), null),
+  ('t1b', null, 16, 19, 17, 25, 26.00, timestamp(date("2014-11-04")), null),
+  ('t3b', 8, null, 719, 17, 25, 26.00, timestamp(date("2014-05-04")), date("2014-05-04")),
+  ('t3b', 8, null, 19, 17, 25, 26.00, timestamp(date("2015-05-04")), date("2015-05-04"));
+
+-- correlated IN subquery
+-- simple select
+-- TC 01.01
+select * from t1 where t1a in (select t2a from t2);
+-- TC 01.02
+select * from t1 where t1b in (select t2b from t2 where t1a = t2a);
+-- TC 01.03
+select t1a, t1b from t1 where t1c in (select t2b from t2 where t1a != t2a);
+-- TC 01.04
+select t1a, t1b from t1 where t1c in (select t2b from t2 where t1a = t2a or t1b > t2b);
+-- TC 01.05
+select t1a, t1b from t1 where t1c in (select t2b from t2 where t2i in (select t3i from t3 where t2c = t3c));
+-- TC 01.06
+select t1a, t1b
[GitHub] spark pull request #16296: [SPARK-18885][SQL] unify CREATE TABLE syntax for ...
Github user gatorsmile commented on a diff in the pull request: https://github.com/apache/spark/pull/16296#discussion_r93824911

--- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala ---
@@ -342,42 +342,46 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder {
   }

   /**
-   * Create a data source table, returning a [[CreateTable]] logical plan.
+   * Create a table, returning a [[CreateTable]] logical plan.
    *
    * Expected format:
    * {{{
-   *   CREATE [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.]table_name
+   *   CREATE [TEMPORARY] TABLE [IF NOT EXISTS] [db_name.]table_name
    *   USING table_provider
    *   [OPTIONS table_property_list]
    *   [PARTITIONED BY (col_name, col_name, ...)]
    *   [CLUSTERED BY (col_name, col_name, ...)
    *    [SORTED BY (col_name [ASC|DESC], ...)]
    *    INTO num_buckets BUCKETS
    *   ]
+   *   [TBLPROPERTIES (property_name=property_value, ...)]

--- End diff --

Here, we need an update. Removed `TBLPROPERTIES` but added the new `locationSpec`.
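For illustration only, a minimal PySpark sketch of the kind of statement this unified syntax accepts: a data source table declared with `USING` plus an explicit location (the `locationSpec` mentioned above) instead of `TBLPROPERTIES`. The table name and path are made up, this is not code from the PR, and it assumes a Spark build that already includes this change:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("create-table-syntax-sketch").getOrCreate()

# Hypothetical example: a parquet data source table whose files live at an
# explicit LOCATION, rather than carrying the path in TBLPROPERTIES.
spark.sql("""
    CREATE TABLE IF NOT EXISTS my_table (id INT, name STRING)
    USING parquet
    LOCATION '/tmp/my_table'
""")
```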
[GitHub] spark pull request #13909: [SPARK-16213][SQL] Reduce runtime overhead of a p...
Github user kiszk commented on a diff in the pull request: https://github.com/apache/spark/pull/13909#discussion_r93823456

--- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala ---
@@ -56,33 +58,130 @@ case class CreateArray(children: Seq[Expression]) extends Expression {
   }

   override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
-    val arrayClass = classOf[GenericArrayData].getName
-    val values = ctx.freshName("values")
-    ctx.addMutableState("Object[]", values, s"this.$values = null;")
-
-    ev.copy(code = s"""
-      this.$values = new Object[${children.size}];""" +
-      ctx.splitExpressions(
-        ctx.INPUT_ROW,
-        children.zipWithIndex.map { case (e, i) =>
-          val eval = e.genCode(ctx)
-          eval.code + s"""
-            if (${eval.isNull}) {
-              $values[$i] = null;
-            } else {
-              $values[$i] = ${eval.value};
-            }
-          """
-        }) +
-      s"""
-        final ArrayData ${ev.value} = new $arrayClass($values);
-        this.$values = null;
-      """, isNull = "false")
+    val et = dataType.elementType
+    val evals = children.map(e => e.genCode(ctx))
+    val isPrimitiveArray = ctx.isPrimitiveType(et)
+    val (preprocess, postprocess, arrayData, array) =
+      GenArrayData.genCodeToCreateArrayData(ctx, et, children.size, isPrimitiveArray)
+    val assigns = GenArrayData.genCodeToAssignArrayElements(
+      ctx, evals, et, isPrimitiveArray, arrayData, array, true)
+    /*
+      TODO: When we generate simpler code, we have to solve the following exception
+        https://github.com/apache/spark/pull/13909/files#r93813725
+      ev.copy(
+        code = preprocess + ctx.splitExpressions(ctx.INPUT_ROW, assigns) + postprocess
+        value = arrayData,
+        isNull = "false")
+    */
+    ev.copy(
+      code = preprocess + ctx.splitExpressions(ctx.INPUT_ROW, assigns) + postprocess +
+        s"\nfinal ArrayData ${ev.value} = $arrayData;\n",
+      isNull = "false")
   }

   override def prettyName: String = "array"
 }

+private [sql] object GenArrayData {
+  /**
+   * Return Java code pieces based on DataType and isPrimitive to allocate ArrayData class
+   *
+   * @param ctx a [[CodegenContext]]
+   * @param elementType data type of an underlying array
+   * @param numElements the number of array elements
+   * @param isPrimitive Are all of the elements of an underlying array primitive type
+   * @return (code pre-assignments, code post-assignments, underlying array name, arrayData name)
+   */
+  def genCodeToCreateArrayData(
+      ctx: CodegenContext,
+      elementType: DataType,
+      numElements: Int,
+      isPrimitive : Boolean): (String, String, String, String) = {
+    val arrayName = ctx.freshName("array")
+    val arrayDataName = ctx.freshName("arrayData")
+    if (!isPrimitive) {
+      val arrayClass = classOf[GenericArrayData].getName
+      ctx.addMutableState("Object[]", arrayName,
+        s"this.$arrayName = new Object[${numElements}];")
+      ("",
+       s"$arrayClass $arrayDataName = new $arrayClass($arrayName);",
+       arrayDataName,
+       arrayName)
+    } else {
+      val unsafeArrayClass = classOf[UnsafeArrayData].getName
+      val baseObject = ctx.freshName("baseObject")
+      val unsafeArraySizeInBytes =
+        UnsafeArrayData.calculateHeaderPortionInBytes(numElements) +
+        ByteArrayMethods.roundNumberOfBytesToNearestWord(elementType.defaultSize * numElements)
+      val baseOffset = Platform.BYTE_ARRAY_OFFSET
+
+      (s"""
+        byte[] $arrayName = new byte[$unsafeArraySizeInBytes];
+        $unsafeArrayClass $arrayDataName = new $unsafeArrayClass();

--- End diff --

I see. You are right. Finally, I had to do it.
[GitHub] spark pull request #16337: [SPARK-18871][SQL] New test cases for IN/NOT IN s...
Github user kevinyu98 commented on a diff in the pull request: https://github.com/apache/spark/pull/16337#discussion_r93823424

--- Diff: sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/in-group-by.sql ---
@@ -0,0 +1,117 @@
+-- A test suite for GROUP BY in parent side, subquery, and both predicate subquery
+-- It includes correlated cases.
+
+-- tables and data types
+
+CREATE DATABASE indb;

--- End diff --

ok, I will remove it.
[GitHub] spark pull request #16337: [SPARK-18871][SQL] New test cases for IN/NOT IN s...
Github user kevinyu98 commented on a diff in the pull request: https://github.com/apache/spark/pull/16337#discussion_r93823423

--- Diff: sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/in-group-by.sql ---
@@ -0,0 +1,117 @@
+-- A test suite for GROUP BY in parent side, subquery, and both predicate subquery
+-- It includes correlated cases.
+
+-- tables and data types
+
+CREATE DATABASE indb;
+CREATE TABLE t1(t1a String, t1b Short, t1c Int, t1d Long, t1e float, t1f double, t1g DECIMAL, t1h TIMESTAMP, t1i Date)
+using parquet;
+CREATE TABLE t2(t2a String, t2b Short, t2c Int, t2d Long, t2e float, t2f double, t2g DECIMAL, t2h TIMESTAMP, t2i Date)
+using parquet;
+CREATE TABLE t3(t3a String, t3b Short, t3c Int, t3d Long, t3e float, t3f double, t3g DECIMAL, t3h TIMESTAMP, t3i Date)
+using parquet;
+
+-- insert to tables
+INSERT INTO t1 VALUES
+  ('t1a', 6, 8, 10, 15, 20, 20.00, timestamp(date("2014-04-04")), date("2014-04-04")),
+  ('t1b', 8, 16, 19, 17, 25, 26.00, timestamp(date("2014-05-04")), date("2014-05-04")),
+  ('t1a', 16, 12, 21, 15, 20, 20.00, timestamp(date("2014-06-04")), date("2014-06-04")),
+  ('t1a', 16, 12, 10, 15, 20, 20.00, timestamp(date("2014-07-04")), date("2014-07-04")),
+  ('t1c', 8, 16, 19, 17, 25, 26.00, timestamp(date("2014-05-04")), date("2014-05-05")),
+  ('t1d', null, 16, 22, 17, 25, 26.00, timestamp(date("2014-06-04")), null),
+  ('t1d', null, 16, 19, 17, 25, 26.00, timestamp(date("2014-07-04")), null),
+  ('t1e', 10, null, 25, 17, 25, 26.00, timestamp(date("2014-08-04")), date("2014-08-04")),
+  ('t1e', 10, null, 19, 17, 25, 26.00, timestamp(date("2014-09-04")), date("2014-09-04")),
+  ('t1d', 10, null, 12, 17, 25, 26.00, timestamp(date("2015-05-04")), date("2015-05-04")),
+  ('t1a', 6, 8, 10, 15, 20, 20.00, timestamp(date("2014-04-04")), date("2014-04-04")),
+  ('t1e', 10, null, 19, 17, 25, 26.00, timestamp(date("2014-05-04")), date("2014-05-04"));
+
+INSERT INTO t2 VALUES
+  ('t2a', 6, 12, 14, 15, 20, 20.00, timestamp(date("2014-04-04")), date("2014-04-04")),
+  ('t1b', 10, 12, 19, 17, 25, 26.00, timestamp(date("2014-05-04")), date("2014-05-04")),
+  ('t1b', 8, 16, 119, 17, 25, 26.00, timestamp(date("2015-05-04")), date("2015-05-04")),
+  ('t1c', 12, 16, 219, 17, 25, 26.00, timestamp(date("2016-05-04")), date("2016-05-04")),
+  ('t1b', null, 16, 319, 17, 25, 26.00, timestamp(date("2017-05-04")), null),
+  ('t2e', 8, null, 419, 17, 25, 26.00, timestamp(date("2014-06-04")), date("2014-06-04")),
+  ('t1f', 19, null, 519, 17, 25, 26.00, timestamp(date("2014-05-04")), date("2014-05-04")),
+  ('t1b', 10, 12, 19, 17, 25, 26.00, timestamp(date("2014-06-04")), date("2014-06-04")),
+  ('t1b', 8, 16, 19, 17, 25, 26.00, timestamp(date("2014-07-04")), date("2014-07-04")),
+  ('t1c', 12, 16, 19, 17, 25, 26.00, timestamp(date("2014-08-04")), date("2014-08-05")),
+  ('t1e', 8, null, 19, 17, 25, 26.00, timestamp(date("2014-09-04")), date("2014-09-04")),
+  ('t1f', 19, null, 19, 17, 25, 26.00, timestamp(date("2014-10-04")), date("2014-10-04")),
+  ('t1b', null, 16, 19, 17, 25, 26.00, timestamp(date("2014-05-04")), null);
+
+INSERT INTO t3 VALUES
+  ('t3a', 6, 12, 110, 15, 20, 20.00, timestamp(date("2014-04-04")), date("2014-04-04")),
+  ('t3a', 6, 12, 10, 15, 20, 20.00, timestamp(date("2014-05-04")), date("2014-05-04")),
+  ('t1b', 10, 12, 219, 17, 25, 26.00, timestamp(date("2014-05-04")), date("2014-05-04")),
+  ('t1b', 10, 12, 19, 17, 25, 26.00, timestamp(date("2014-05-04")), date("2014-05-04")),
+  ('t1b', 8, 16, 319, 17, 25, 26.00, timestamp(date("2014-06-04")), date("2014-06-04")),
+  ('t1b', 8, 16, 19, 17, 25, 26.00, timestamp(date("2014-07-04")), date("2014-07-04")),
+  ('t3c', 17, 16, 519, 17, 25, 26.00, timestamp(date("2014-08-04")), date("2014-08-04")),
+  ('t3c', 17, 16, 19, 17, 25, 26.00, timestamp(date("2014-09-04")), date("2014-09-05")),
+  ('t1b', null, 16, 419, 17, 25, 26.00, timestamp(date("2014-10-04")), null),
+  ('t1b', null, 16, 19, 17, 25, 26.00, timestamp(date("2014-11-04")), null),
+  ('t3b', 8, null, 719, 17, 25, 26.00, timestamp(date("2014-05-04")), date("2014-05-04")),
+  ('t3b', 8, null, 19, 17, 25, 26.00, timestamp(date("2015-05-04")), date("2015-05-04"));
+
+-- correlated IN subquery
+-- GROUP BY in parent side
+-- TC 01.01
+select t1a, avg(t1b) from t1 where t1a in (select t2a from t2) group by t1a;
+-- TC 01.02
+select t1a, max(t1b) from t1 where t1b in (select t2b from t2 where t1a = t2a) group by t1a, t1d;
+-- TC 01.03
+select t1a, t1b from t1 where t1c in (select t2c from t2 where t1a = t2a) group by t1a, t1b;
+-- TC 01.04
+select t1a, sum(distinct(t1b)) from t1 where t1c in (select t2c from t2 where t1a = t2a) or
+t1c in (select t3c from t3 where t1a =
[GitHub] spark pull request #16337: [SPARK-18871][SQL] New test cases for IN/NOT IN s...
Github user kevinyu98 commented on a diff in the pull request: https://github.com/apache/spark/pull/16337#discussion_r93823420

--- Diff: sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/simple-in.sql ---
[GitHub] spark pull request #16337: [SPARK-18871][SQL] New test cases for IN/NOT IN s...
Github user kevinyu98 commented on a diff in the pull request: https://github.com/apache/spark/pull/16337#discussion_r93823418

--- Diff: sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/simple-in.sql ---
[GitHub] spark pull request #16337: [SPARK-18871][SQL] New test cases for IN/NOT IN s...
Github user kevinyu98 commented on a diff in the pull request: https://github.com/apache/spark/pull/16337#discussion_r93823421

--- Diff: sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/in-group-by.sql ---
[GitHub] spark pull request #16337: [SPARK-18871][SQL] New test cases for IN/NOT IN s...
Github user kevinyu98 commented on a diff in the pull request: https://github.com/apache/spark/pull/16337#discussion_r93823414

--- Diff: sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/in-group-by.sql ---
[GitHub] spark issue #15496: [SPARK-17950] [Python] Match SparseVector behavior with ...
Github user itg-abby commented on the issue: https://github.com/apache/spark/pull/15496 @holdenk Thanks for the review and your availability, and I really appreciate the work you are doing by mentoring me! I didn't realize np.append was making copies, though it made a lot of sense once I thought about what is actually going on there in terms of memory. The appends were also ultimately unnecessary and have been removed in favor of a much simpler, more efficient call that builds the csr matrix directly!
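For illustration, a minimal SciPy sketch of the idea described above (not the PR's actual code; the vector size and contents are made up). A CSR matrix can be built directly from a sparse vector's parallel index/value arrays, avoiding the full-array copy that every `np.append` call makes:

```python
import numpy as np
from scipy.sparse import csr_matrix

# Hypothetical 1 x 10 sparse vector given as parallel arrays.
size = 10
indices = np.array([1, 4, 7])        # sorted positions of the non-zeros
values = np.array([2.0, 3.0, 5.0])   # corresponding non-zero values

# indptr has one entry per row plus one; for a single row it is [0, nnz].
indptr = np.array([0, len(values)])

# Construct the CSR matrix in one call, with no intermediate copies.
mat = csr_matrix((values, indices, indptr), shape=(1, size))
print(mat.toarray())  # [[0. 2. 0. 0. 3. 0. 0. 5. 0. 0.]]
```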
[GitHub] spark issue #15604: [SPARK-18066] [CORE] [TESTS] Add Pool usage policies tes...
Github user erenavsarogullari commented on the issue: https://github.com/apache/spark/pull/15604 Hi @kayousterhout @markhamstra @squito, This PR aims to extend the unit test coverage for Pool usage policies and is ready for review. Thanks in advance; all feedback is welcome ;)
[GitHub] spark pull request #16337: [SPARK-18871][SQL] New test cases for IN/NOT IN s...
Github user nsyca commented on a diff in the pull request: https://github.com/apache/spark/pull/16337#discussion_r93821052

--- Diff: sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/simple-in.sql ---
[GitHub] spark pull request #16337: [SPARK-18871][SQL] New test cases for IN/NOT IN s...
Github user nsyca commented on a diff in the pull request: https://github.com/apache/spark/pull/16337#discussion_r93821009

--- Diff: sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/in-group-by.sql ---
@@ -0,0 +1,117 @@
+-- A test suite for GROUP BY in parent side, subquery, and both predicate subquery
+-- It includes correlated cases.
+
+-- tables and data types
+
+CREATE DATABASE indb;

--- End diff --

If there is no explicit restriction, I would like to keep the CREATE DATABASE/USE DATABASE statements so that the test file is self-contained and can be run in different environments with minimal side effects. I don't have a preference between real tables and temporary views, but variation is good for exercising different code paths; if all the test cases are written homogeneously to certain patterns, it limits the coverage. Again, if there are no explicit rules or guidelines on how to write test cases, I would like to request that this format be kept.
[GitHub] spark pull request #16337: [SPARK-18871][SQL] New test cases for IN/NOT IN s...
Github user dongjoon-hyun commented on a diff in the pull request: https://github.com/apache/spark/pull/16337#discussion_r93820846

--- Diff: sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/in-group-by.sql ---
[GitHub] spark issue #15237: [SPARK-17663] [CORE] SchedulableBuilder should handle in...
Github user AmplabJenkins commented on the issue: https://github.com/apache/spark/pull/15237 Merged build finished. Test PASSed.
[GitHub] spark issue #15237: [SPARK-17663] [CORE] SchedulableBuilder should handle in...
Github user AmplabJenkins commented on the issue: https://github.com/apache/spark/pull/15237 Test PASSed. Refer to this link for build results (access rights to CI server needed): https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder/70571/
[GitHub] spark issue #15237: [SPARK-17663] [CORE] SchedulableBuilder should handle in...
Github user SparkQA commented on the issue: https://github.com/apache/spark/pull/15237

**[Test build #70571 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/70571/testReport)** for PR 15237 at commit [`a4e06b2`](https://github.com/apache/spark/commit/a4e06b2b2baada22ab8b9ca8780fd486b7efa192).
* This patch passes all tests.
* This patch merges cleanly.
* This patch adds no public classes.
[GitHub] spark pull request #16388: [SPARK-18989][SQL] DESC TABLE should not fail wit...
Github user gatorsmile commented on a diff in the pull request: https://github.com/apache/spark/pull/16388#discussion_r93819750

--- Diff: sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala ---
@@ -408,8 +408,8 @@ private[hive] class HiveClientImpl(
         lastAccessTime = h.getLastAccessTime.toLong * 1000,
         storage = CatalogStorageFormat(
           locationUri = shim.getDataLocation(h),
-          inputFormat = Option(h.getInputFormatClass).map(_.getName),
-          outputFormat = Option(h.getOutputFormatClass).map(_.getName),
+          inputFormat = Option(h.getTTable.getSd.getInputFormat),
+          outputFormat = Option(h.getTTable.getSd.getOutputFormat),

--- End diff --

I checked the impl of [getInputFormatClass](https://github.com/apache/hive/blob/master/ql/src/java/org/apache/hadoop/hive/ql/metadata/Table.java#L302-L320) and [getOutputFormatClass](https://github.com/apache/hive/blob/master/ql/src/java/org/apache/hadoop/hive/ql/metadata/Table.java#L322-L342). Both have extra logic that applies when `getTTable.getSd.getInputFormat` / `getTTable.getSd.getOutputFormat` is null. I checked the change history: this extra logic was added in https://issues.apache.org/jira/browse/HIVE-1122, https://issues.apache.org/jira/browse/HIVE-705, and https://issues.apache.org/jira/browse/HIVE-5260.
[GitHub] spark pull request #16337: [SPARK-18871][SQL] New test cases for IN/NOT IN s...
Github user gatorsmile commented on a diff in the pull request: https://github.com/apache/spark/pull/16337#discussion_r93819586

--- Diff: sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/in-group-by.sql ---
@@ -0,0 +1,117 @@
+-- A test suite for GROUP BY in parent side, subquery, and both predicate subquery
+-- It includes correlated cases.
+
+-- tables and data types
+
+CREATE DATABASE indb;

--- End diff --

For each test suite (e.g., in-group-by.sql), we create a dedicated session. See [the code](https://github.com/apache/spark/blob/master/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala#L144-L147). Why not create temporary views? These views will be automatically removed after the session ends.
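As an illustration of this suggestion, a minimal PySpark sketch (hypothetical names and data, not the test file itself) of backing the fixture with a temporary view built from an inline VALUES table, so it needs no database and is dropped with the session:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("temp-view-sketch").getOrCreate()

# A temporary view from an inline table: no CREATE DATABASE, no persistent
# files, and it disappears automatically when the session ends.
spark.sql("""
    CREATE TEMPORARY VIEW t1 AS SELECT * FROM VALUES
      ('t1a', 6, 8),
      ('t1b', 8, 16)
    AS t1(t1a, t1b, t1c)
""")

spark.sql("SELECT t1a FROM t1 WHERE t1b IN (SELECT t1b FROM t1)").show()
```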
[GitHub] spark pull request #16337: [SPARK-18871][SQL] New test cases for IN/NOT IN s...
Github user gatorsmile commented on a diff in the pull request: https://github.com/apache/spark/pull/16337#discussion_r93819526 --- Diff: sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/simple-in.sql --- @@ -0,0 +1,92 @@ +-- A test suite for simple IN predicate subquery +-- It includes correlated cases. + +-- tables and data types + +CREATE DATABASE indb; +CREATE TABLE t1(t1a String, t1b Short, t1c Int, t1d Long, t1e float, t1f double, t1g DECIMAL, t1h TIMESTAMP, t1i Date) +using parquet; +CREATE TABLE t2(t2a String, t2b Short, t2c Int, t2d Long, t2e float, t2f double, t2g DECIMAL, t2h TIMESTAMP, t2i Date) +using parquet; +CREATE TABLE t3(t3a String, t3b Short, t3c Int, t3d Long, t3e float, t3f double, t3g DECIMAL, t3h TIMESTAMP, t3i Date) +using parquet; + +-- insert to tables +INSERT INTO t1 VALUES + ('t1a', 6, 8, 10, 15, 20, 20.00, timestamp(date("2014-04-04")), date("2014-04-04")), + ('t1b', 8, 16, 19, 17, 25, 26.00, timestamp(date("2014-05-04")), date("2014-05-04")), + ('t1a', 16, 12, 21, 15, 20, 20.00, timestamp(date("2014-06-04")), date("2014-06-04")), + ('t1a', 16, 12, 10, 15, 20, 20.00, timestamp(date("2014-07-04")), date("2014-07-04")), + ('t1c', 8, 16, 19, 17, 25, 26.00, timestamp(date("2014-05-04")), date("2014-05-05")), + ('t1d', null, 16, 22, 17, 25, 26.00, timestamp(date("2014-06-04")), null), + ('t1d', null, 16, 19, 17, 25, 26.00, timestamp(date("2014-07-04")), null), + ('t1e', 10, null, 25, 17, 25, 26.00, timestamp(date("2014-08-04")), date("2014-08-04")), + ('t1e', 10, null, 19, 17, 25, 26.00, timestamp(date("2014-09-04")), date("2014-09-04")), + ('t1d', 10, null, 12, 17, 25, 26.00, timestamp(date("2015-05-04")), date("2015-05-04")), + ('t1a', 6, 8, 10, 15, 20, 20.00, timestamp(date("2014-04-04")), date("2014-04-04")), + ('t1e', 10, null, 19, 17, 25, 26.00, timestamp(date("2014-05-04")), date("2014-05-0=4")); + +INSERT INTO t2 VALUES + ('t2a', 6, 12, 14, 15, 20, 20.00, timestamp(date("2014-04-04")), date("2014-04-04")), + ('t1b', 10, 12, 19, 17, 25, 26.00, timestamp(date("2014-05-04")), date("2014-05-04")), + ('t1b', 8, 16, 119, 17, 25, 26.00, timestamp(date("2015-05-04")), date("2015-05-04")), + ('t1c', 12, 16, 219, 17, 25, 26.00, timestamp(date("2016-05-04")), date("2016-05-04")), + ('t1b', null, 16, 319, 17, 25, 26.00, timestamp(date("2017-05-04")), null), + ('t2e', 8, null, 419, 17, 25, 26.00, timestamp(date("2014-06-04")), date("2014-06-04")), + ('t1f', 19, null, 519, 17, 25, 26.00, timestamp(date("2014-05-04")), date("2014-05-04")), + ('t1b', 10, 12, 19, 17, 25, 26.00, timestamp(date("2014-06-04")), date("2014-06-04")), + ('t1b', 8, 16, 19, 17, 25, 26.00, timestamp(date("2014-07-04")), date("2014-07-04")), + ('t1c', 12, 16, 19, 17, 25, 26.00, timestamp(date("2014-08-04")), date("2014-08-05")), + ('t1e', 8, null, 19, 17, 25, 26.00, timestamp(date("2014-09-04")), date("2014-09-04")), + ('t1f', 19, null, 19, 17, 25, 26.00, timestamp(date("2014-10-04")), date("2014-10-04")), + ('t1b', null, 16, 19, 17, 25, 26.00, timestamp(date("2014-05-04")), null); + +INSERT INTO t3 VALUES + ('t3a', 6, 12, 110, 15, 20, 20.00, timestamp(date("2014-04-04")), date("2014-04-04")), + ('t3a', 6, 12, 10, 15, 20, 20.00, timestamp(date("2014-05-04")), date("2014-05-04")), + ('t1b', 10, 12, 219, 17, 25, 26.00, timestamp(date("2014-05-04")), date("2014-05-04")), + ('t1b', 10, 12, 19, 17, 25, 26.00, timestamp(date("2014-05-04")), date("2014-05-04")), + ('t1b', 8, 16, 319, 17, 25, 26.00, timestamp(date("2014-06-04")), date("2014-06-04")), + ('t1b', 8, 16, 19, 17, 25, 26.00, 
timestamp(date("2014-07-04")), date("2014-07-04")), + ('t3c', 17, 16, 519, 17, 25, 26.00, timestamp(date("2014-08-04")), date("2014-08-04")), + ('t3c', 17, 16, 19, 17, 25, 26.00, timestamp(date("2014-09-04")), date("2014-09-05")), + ('t1b', null, 16, 419, 17, 25, 26.00, timestamp(date("2014-10-04")), null), + ('t1b', null, 16, 19, 17, 25, 26.00, timestamp(date("2014-11-04")), null), + ('t3b', 8, null, 719, 17, 25, 26.00, timestamp(date("2014-05-04")), date("2014-05-04")), + ('t3b', 8, null, 19, 17, 25, 26.00, timestamp(date("2015-05-04")), date("2015-05-04")); + +-- correlated IN subquery +-- simple select +-- TC 01.01 +select * from t1 where t1a in (select t2a from t2); +-- TC 01.02 +select * from t1 where t1b in (select t2b from t2 where t1a = t2a); +-- TC 01.03 +select t1a, t1b from t1 where t1c in (select t2b from t2 where t1a != t2a); +-- TC 01.04 +select t1a, t1b from t1 where t1c in (select t2b from t2 where t1a = t2a or t1b > t2b); +-- TC 01.05 +select t1a, t1b from t1 where t1c in (select t2b from t2 where t2i in (select t3i from t3 where t2c = t3c)); +-- TC 01.06 +select t1a, t1b
[GitHub] spark pull request #16337: [SPARK-18871][SQL] New test cases for IN/NOT IN s...
Github user gatorsmile commented on a diff in the pull request: https://github.com/apache/spark/pull/16337#discussion_r93819517 --- Diff: sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/in-group-by.sql --- @@ -0,0 +1,117 @@ +-- A test suite for GROUP BY in parent side, subquery, and both predicate subquery +-- It includes correlated cases. + +-- tables and data types + +CREATE DATABASE indb; +CREATE TABLE t1(t1a String, t1b Short, t1c Int, t1d Long, t1e float, t1f double, t1g DECIMAL, t1h TIMESTAMP, t1i Date) +using parquet; +CREATE TABLE t2(t2a String, t2b Short, t2c Int, t2d Long, t2e float, t2f double, t2g DECIMAL, t2h TIMESTAMP, t2i Date) +using parquet; +CREATE TABLE t3(t3a String, t3b Short, t3c Int, t3d Long, t3e float, t3f double, t3g DECIMAL, t3h TIMESTAMP, t3i Date) +using parquet; + +-- insert to tables +INSERT INTO t1 VALUES + ('t1a', 6, 8, 10, 15, 20, 20.00, timestamp(date("2014-04-04")), date("2014-04-04")), + ('t1b', 8, 16, 19, 17, 25, 26.00, timestamp(date("2014-05-04")), date("2014-05-04")), + ('t1a', 16, 12, 21, 15, 20, 20.00, timestamp(date("2014-06-04")), date("2014-06-04")), + ('t1a', 16, 12, 10, 15, 20, 20.00, timestamp(date("2014-07-04")), date("2014-07-04")), + ('t1c', 8, 16, 19, 17, 25, 26.00, timestamp(date("2014-05-04")), date("2014-05-05")), + ('t1d', null, 16, 22, 17, 25, 26.00, timestamp(date("2014-06-04")), null), + ('t1d', null, 16, 19, 17, 25, 26.00, timestamp(date("2014-07-04")), null), + ('t1e', 10, null, 25, 17, 25, 26.00, timestamp(date("2014-08-04")), date("2014-08-04")), + ('t1e', 10, null, 19, 17, 25, 26.00, timestamp(date("2014-09-04")), date("2014-09-04")), + ('t1d', 10, null, 12, 17, 25, 26.00, timestamp(date("2015-05-04")), date("2015-05-04")), + ('t1a', 6, 8, 10, 15, 20, 20.00, timestamp(date("2014-04-04")), date("2014-04-04")), + ('t1e', 10, null, 19, 17, 25, 26.00, timestamp(date("2014-05-04")), date("2014-05-0=4")); + +INSERT INTO t2 VALUES + ('t2a', 6, 12, 14, 15, 20, 20.00, timestamp(date("2014-04-04")), date("2014-04-04")), + ('t1b', 10, 12, 19, 17, 25, 26.00, timestamp(date("2014-05-04")), date("2014-05-04")), + ('t1b', 8, 16, 119, 17, 25, 26.00, timestamp(date("2015-05-04")), date("2015-05-04")), + ('t1c', 12, 16, 219, 17, 25, 26.00, timestamp(date("2016-05-04")), date("2016-05-04")), + ('t1b', null, 16, 319, 17, 25, 26.00, timestamp(date("2017-05-04")), null), + ('t2e', 8, null, 419, 17, 25, 26.00, timestamp(date("2014-06-04")), date("2014-06-04")), + ('t1f', 19, null, 519, 17, 25, 26.00, timestamp(date("2014-05-04")), date("2014-05-04")), + ('t1b', 10, 12, 19, 17, 25, 26.00, timestamp(date("2014-06-04")), date("2014-06-04")), + ('t1b', 8, 16, 19, 17, 25, 26.00, timestamp(date("2014-07-04")), date("2014-07-04")), + ('t1c', 12, 16, 19, 17, 25, 26.00, timestamp(date("2014-08-04")), date("2014-08-05")), + ('t1e', 8, null, 19, 17, 25, 26.00, timestamp(date("2014-09-04")), date("2014-09-04")), + ('t1f', 19, null, 19, 17, 25, 26.00, timestamp(date("2014-10-04")), date("2014-10-04")), + ('t1b', null, 16, 19, 17, 25, 26.00, timestamp(date("2014-05-04")), null); + +INSERT INTO t3 VALUES + ('t3a', 6, 12, 110, 15, 20, 20.00, timestamp(date("2014-04-04")), date("2014-04-04")), + ('t3a', 6, 12, 10, 15, 20, 20.00, timestamp(date("2014-05-04")), date("2014-05-04")), + ('t1b', 10, 12, 219, 17, 25, 26.00, timestamp(date("2014-05-04")), date("2014-05-04")), + ('t1b', 10, 12, 19, 17, 25, 26.00, timestamp(date("2014-05-04")), date("2014-05-04")), + ('t1b', 8, 16, 319, 17, 25, 26.00, timestamp(date("2014-06-04")), date("2014-06-04")), + 
('t1b', 8, 16, 19, 17, 25, 26.00, timestamp(date("2014-07-04")), date("2014-07-04")), + ('t3c', 17, 16, 519, 17, 25, 26.00, timestamp(date("2014-08-04")), date("2014-08-04")), + ('t3c', 17, 16, 19, 17, 25, 26.00, timestamp(date("2014-09-04")), date("2014-09-05")), + ('t1b', null, 16, 419, 17, 25, 26.00, timestamp(date("2014-10-04")), null), + ('t1b', null, 16, 19, 17, 25, 26.00, timestamp(date("2014-11-04")), null), + ('t3b', 8, null, 719, 17, 25, 26.00, timestamp(date("2014-05-04")), date("2014-05-04")), + ('t3b', 8, null, 19, 17, 25, 26.00, timestamp(date("2015-05-04")), date("2015-05-04")); + +-- correlated IN subquery +-- GROUP BY in parent side +-- TC 01.01 +select t1a, avg(t1b) from t1 where t1a in (select t2a from t2) group by t1a; +-- TC 01.02 +select t1a, max(t1b) from t1 where t1b in (select t2b from t2 where t1a = t2a) group by t1a, t1d; +-- TC 01.03 +select t1a, t1b from t1 where t1c in (select t2c from t2 where t1a = t2a) group by t1a, t1b; +-- TC 01.04 +select t1a, sum(distinct(t1b)) from t1 where t1c in (select t2c from t2 where t1a = t2a) or +t1c in (select t3c from t3 where t1a =
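The GROUP BY variants follow the same shape; reusing the hypothetical t1/t2 temp views from the sketch above, the TC 01.01 pattern applies the IN filter first and aggregates the surviving rows:

```scala
// The IN predicate filters rows, then the survivors are grouped and averaged.
spark.sql(
  "select t1a, avg(t1b) from t1 where t1a in (select t2a from t2) group by t1a"
).show()
```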
[GitHub] spark issue #16392: [SPARK-18992] [SQL] Move spark.sql.hive.thriftServer.sin...
Github user gatorsmile commented on the issue: https://github.com/apache/spark/pull/16392 cc @yhuai @cloud-fan @liancheng
[GitHub] spark pull request #15237: [SPARK-17663] [CORE] SchedulableBuilder should ha...
Github user erenavsarogullari commented on a diff in the pull request: https://github.com/apache/spark/pull/15237#discussion_r93819357 --- Diff: core/src/main/scala/org/apache/spark/scheduler/SchedulableBuilder.scala --- @@ -102,38 +105,55 @@ private[spark] class FairSchedulableBuilder(val rootPool: Pool, conf: SparkConf) for (poolNode <- (xml \\ POOLS_PROPERTY)) { val poolName = (poolNode \ POOL_NAME_PROPERTY).text - var schedulingMode = DEFAULT_SCHEDULING_MODE - var minShare = DEFAULT_MINIMUM_SHARE - var weight = DEFAULT_WEIGHT val xmlSchedulingMode = (poolNode \ SCHEDULING_MODE_PROPERTY).text - if (xmlSchedulingMode != "") { -try { - schedulingMode = SchedulingMode.withName(xmlSchedulingMode) -} catch { - case e: NoSuchElementException => -logWarning(s"Unsupported schedulingMode: $xmlSchedulingMode, " + - s"using the default schedulingMode: $schedulingMode") -} - } + val schedulingMode = getSchedulingModeValue(xmlSchedulingMode, DEFAULT_SCHEDULING_MODE) val xmlMinShare = (poolNode \ MINIMUM_SHARES_PROPERTY).text - if (xmlMinShare != "") { -minShare = xmlMinShare.toInt - } + val minShare = getIntValue(MINIMUM_SHARES_PROPERTY, xmlMinShare, DEFAULT_MINIMUM_SHARE) val xmlWeight = (poolNode \ WEIGHT_PROPERTY).text - if (xmlWeight != "") { -weight = xmlWeight.toInt - } + val weight = getIntValue(WEIGHT_PROPERTY, xmlWeight, DEFAULT_WEIGHT) + + rootPool.addSchedulable(new Pool(poolName, schedulingMode, minShare, weight)) - val pool = new Pool(poolName, schedulingMode, minShare, weight) - rootPool.addSchedulable(pool) logInfo("Created pool %s, schedulingMode: %s, minShare: %d, weight: %d".format( poolName, schedulingMode, minShare, weight)) } } + private def getSchedulingModeValue(data: String, defaultValue: SchedulingMode): SchedulingMode = { --- End diff -- Sure, `trim` and `toUpperCase` functions help to cover for blank and invalid `schedulingMode` cases. Also `schedulingMode = none/NONE` case needs to be checked. This case is valid but unsupported. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
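A sketch of the helper under discussion, covering the cases named in the comment (an assumed shape, not necessarily the merged code): trim and uppercase the raw XML text, treat NONE as valid but unsupported, and fall back to the default on anything unparsable.

```scala
import org.apache.spark.scheduler.SchedulingMode
import org.apache.spark.scheduler.SchedulingMode.SchedulingMode

// Blank input, lowercase input, unknown names, and the valid-but-unsupported
// NONE mode all resolve to defaultValue.
def getSchedulingModeValue(
    data: String,
    defaultValue: SchedulingMode): SchedulingMode = {
  val normalized = data.trim.toUpperCase
  if (normalized.isEmpty || normalized == SchedulingMode.NONE.toString) {
    defaultValue
  } else {
    try {
      SchedulingMode.withName(normalized)
    } catch {
      case _: NoSuchElementException => defaultValue
    }
  }
}
```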
[GitHub] spark pull request #15237: [SPARK-17663] [CORE] SchedulableBuilder should ha...
Github user erenavsarogullari commented on a diff in the pull request: https://github.com/apache/spark/pull/15237#discussion_r93819365 --- Diff: core/src/test/scala/org/apache/spark/scheduler/PoolSuite.scala --- @@ -178,4 +177,36 @@ class PoolSuite extends SparkFunSuite with LocalSparkContext { scheduleTaskAndVerifyId(2, rootPool, 6) scheduleTaskAndVerifyId(3, rootPool, 2) } + + test("FairSchedulableBuilder sets default values for blank or invalid datas") { +val xmlPath = getClass.getClassLoader.getResource("fairscheduler-with-invalid-data.xml") .getFile() +val conf = new SparkConf().set(SCHEDULER_ALLOCATION_FILE_PROPERTY, xmlPath) + +val rootPool = new Pool("", FAIR, 0, 0) +val schedulableBuilder = new FairSchedulableBuilder(rootPool, conf) +schedulableBuilder.buildPools() + +verifyPool(rootPool, schedulableBuilder.DEFAULT_POOL_NAME, 0, 1, FIFO) +verifyPool(rootPool, "pool_with_invalid_min_share", 0, 2, FAIR) +verifyPool(rootPool, "pool_with_invalid_weight", 1, 1, FAIR) +verifyPool(rootPool, "pool_with_invalid_scheduling_mode", 3, 2, FIFO) +verifyPool(rootPool, "pool_with_non_uppercase_scheduling_mode", 2, 1, FAIR) +verifyPool(rootPool, "pool_with_NONE_scheduling_mode", 1, 2, FIFO) +verifyPool(rootPool, "pool_with_whitespace_min_share", 0, 2, FAIR) +verifyPool(rootPool, "pool_with_whitespace_weight", 1, 1, FAIR) +verifyPool(rootPool, "pool_with_whitespace_scheduling_mode", 3, 2, FIFO) +verifyPool(rootPool, "pool_with_empty_min_share", 0, 3, FAIR) +verifyPool(rootPool, "pool_with_empty_weight", 2, 1, FAIR) +verifyPool(rootPool, "pool_with_empty_scheduling_mode", 2, 2, FIFO) --- End diff -- Addressed ;)
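The `verifyPool` helper these assertions rely on is defined elsewhere in the suite; a plausible reconstruction (hypothetical, with the argument order inferred from the calls above) would be:

```scala
// Look the pool up by name on the root pool and compare the three
// properties the test asserts on: minShare, weight, and scheduling mode.
def verifyPool(rootPool: Pool, poolName: String, expectedMinShare: Int,
    expectedWeight: Int, expectedMode: SchedulingMode): Unit = {
  val pool = rootPool.getSchedulableByName(poolName)
  assert(pool != null)
  assert(pool.minShare == expectedMinShare)
  assert(pool.weight == expectedWeight)
  assert(pool.schedulingMode == expectedMode)
}
```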
[GitHub] spark issue #15237: [SPARK-17663] [CORE] SchedulableBuilder should handle in...
Github user erenavsarogullari commented on the issue: https://github.com/apache/spark/pull/15237 Hi @squito, many thanks again for the review. All comments have been addressed in a new commit, and the PR is ready for re-review / merge ;)
[GitHub] spark issue #14452: [SPARK-16849][SQL] Improve subquery execution by dedupli...
Github user davies commented on the issue: https://github.com/apache/spark/pull/14452 @viirya For a duplicated CTE, without some optimization (pushing down different predicates in different positions), the physical plans should be identical. So I'm wondering whether some aggressive push-down causes the problem for some queries (IsNotNull(xxx)). That is why I asked.
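To make the concern concrete, a local illustration (not a query from the PR; assumes a SparkSession named `spark`): two references to the same CTE stop being physically identical once different predicates are pushed into each copy.

```scala
// The optimizer pushes `k = 1` into one branch and `k is not null and
// id > 5` into the other, so the two physical subtrees differ and a naive
// plan-equality check no longer deduplicates them.
spark.range(10).selectExpr("id", "id % 3 as k").createOrReplaceTempView("base")
spark.sql(
  """with cte as (select * from base)
    |select * from cte where k = 1
    |union all
    |select * from cte where k is not null and id > 5""".stripMargin
).explain()
```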
[GitHub] spark issue #13909: [SPARK-16213][SQL] Reduce runtime overhead of a program ...
Github user AmplabJenkins commented on the issue: https://github.com/apache/spark/pull/13909 Merged build finished. Test PASSed.
[GitHub] spark issue #13909: [SPARK-16213][SQL] Reduce runtime overhead of a program ...
Github user AmplabJenkins commented on the issue: https://github.com/apache/spark/pull/13909 Test PASSed. Refer to this link for build results (access rights to CI server needed): https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder/70570/
[GitHub] spark issue #13909: [SPARK-16213][SQL] Reduce runtime overhead of a program ...
Github user SparkQA commented on the issue: https://github.com/apache/spark/pull/13909 **[Test build #70570 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/70570/testReport)** for PR 13909 at commit [`28df09f`](https://github.com/apache/spark/commit/28df09fb569149470dcdb45e36e2d8b993c99a17). * This patch passes all tests. * This patch merges cleanly. * This patch adds no public classes.
[GitHub] spark issue #16308: [SPARK-18936][SQL] Infrastructure for session local time...
Github user AmplabJenkins commented on the issue: https://github.com/apache/spark/pull/16308 Test PASSed. Refer to this link for build results (access rights to CI server needed): https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder/70569/
[GitHub] spark issue #16308: [SPARK-18936][SQL] Infrastructure for session local time...
Github user AmplabJenkins commented on the issue: https://github.com/apache/spark/pull/16308 Merged build finished. Test PASSed.
[GitHub] spark issue #16308: [SPARK-18936][SQL] Infrastructure for session local time...
Github user SparkQA commented on the issue: https://github.com/apache/spark/pull/16308 **[Test build #70569 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/70569/testReport)** for PR 16308 at commit [`a00d52f`](https://github.com/apache/spark/commit/a00d52f2df41bf13b82c5644d5d5ffa3532335cd). * This patch passes all tests. * This patch merges cleanly. * This patch adds no public classes.
[GitHub] spark issue #15237: [SPARK-17663] [CORE] SchedulableBuilder should handle in...
Github user SparkQA commented on the issue: https://github.com/apache/spark/pull/15237 **[Test build #70571 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/70571/testReport)** for PR 15237 at commit [`a4e06b2`](https://github.com/apache/spark/commit/a4e06b2b2baada22ab8b9ca8780fd486b7efa192).
[GitHub] spark issue #16389: [SPARK-18981][Core]The job hang problem when speculation...
Github user zhaorongsheng commented on the issue: https://github.com/apache/spark/pull/16389 Yes, I have checked it.
[GitHub] spark pull request #16391: [SPARK-18990][SQL] make DatasetBenchmark fairer f...
Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/16391#discussion_r93818360 --- Diff: sql/core/src/test/scala/org/apache/spark/sql/DatasetBenchmark.scala --- @@ -170,36 +176,39 @@ object DatasetBenchmark { val benchmark3 = aggregate(spark, numRows) /* -OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 3.10.0-327.18.2.el7.x86_64 -Intel Xeon E3-12xx v2 (Ivy Bridge) +Java HotSpot(TM) 64-Bit Server VM 1.8.0_60-b27 on Mac OS X 10.12.1 +Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz + back-to-back map: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative -RDD 3448 / 3646 29.0 34.5 1.0X -DataFrame 2647 / 3116 37.8 26.5 1.3X -Dataset 4781 / 5155 20.9 47.8 0.7X +RDD 3963 / 3976 25.2 39.6 1.0X +DataFrame 826 / 834 121.1 8.3 4.8X +Dataset 5178 / 5198 19.3 51.8 0.8X --- End diff -- the method signature in `Dataset` is `def map[U : Encoder](f: T => U)`; unless we create primitive-specialized methods, e.g. `def map(f: T => Long)`, I can't think of an easy way to get the concrete signature. BTW, I think the best solution is to analyze the bytecode (class file) of the lambda function and turn it into expressions.
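A small sketch of the signature issue (assumes a SparkSession named `spark` with `spark.implicits._` imported; the primitive-specialized overload mentioned in the comment does not exist in the API and appears only in a comment):

```scala
// The generic signature hides the primitive result type behind Encoder[U]:
//   def map[U : Encoder](f: T => U): Dataset[U]
// so a lambda like `_ + 1` round-trips through boxed values. A hypothetical
// specialized overload, e.g. `def map(f: T => Long)`, would pin the type at
// the call site. The expression path sidesteps the opaque lambda entirely:
val ds = spark.range(100)              // Dataset[java.lang.Long]
val viaMap = ds.map(_ + 1)             // typed lambda: goes through the encoder
val viaExpr = ds.selectExpr("id + 1")  // expression: codegen sees `id + 1`
```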
[GitHub] spark issue #16351: [SPARK-18943][SQL] Avoid per-record type dispatch in CSV...
Github user HyukjinKwon commented on the issue: https://github.com/apache/spark/pull/16351 @cloud-fan Thank you!
[GitHub] spark pull request #16351: [SPARK-18943][SQL] Avoid per-record type dispatch...
Github user asfgit closed the pull request at: https://github.com/apache/spark/pull/16351
[GitHub] spark issue #16351: [SPARK-18943][SQL] Avoid per-record type dispatch in CSV...
Github user cloud-fan commented on the issue: https://github.com/apache/spark/pull/16351 thanks, merging to master!
[GitHub] spark pull request #13909: [SPARK-16213][SQL] Reduce runtime overhead of a p...
Github user ueshin commented on a diff in the pull request: https://github.com/apache/spark/pull/13909#discussion_r93817737 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala --- @@ -56,33 +58,130 @@ case class CreateArray(children: Seq[Expression]) extends Expression { } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { -val arrayClass = classOf[GenericArrayData].getName -val values = ctx.freshName("values") -ctx.addMutableState("Object[]", values, s"this.$values = null;") - -ev.copy(code = s""" - this.$values = new Object[${children.size}];""" + - ctx.splitExpressions( -ctx.INPUT_ROW, -children.zipWithIndex.map { case (e, i) => - val eval = e.genCode(ctx) - eval.code + s""" -if (${eval.isNull}) { - $values[$i] = null; -} else { - $values[$i] = ${eval.value}; -} - """ -}) + - s""" -final ArrayData ${ev.value} = new $arrayClass($values); -this.$values = null; - """, isNull = "false") +val et = dataType.elementType +val evals = children.map(e => e.genCode(ctx)) +val isPrimitiveArray = ctx.isPrimitiveType(et) +val (preprocess, postprocess, arrayData, array) = + GenArrayData.genCodeToCreateArrayData(ctx, et, children.size, isPrimitiveArray) +val assigns = GenArrayData.genCodeToAssignArrayElements( + ctx, evals, et, isPrimitiveArray, arrayData, array, true) +/* + TODO: When we generate simpler code, we have to solve the following exception +https://github.com/apache/spark/pull/13909/files#r93813725 + ev.copy( +code = preprocess + ctx.splitExpressions(ctx.INPUT_ROW, assigns) + postprocess +value = arrayData, +isNull = "false") +*/ +ev.copy( + code = preprocess + ctx.splitExpressions(ctx.INPUT_ROW, assigns) + postprocess + +s"\nfinal ArrayData ${ev.value} = $arrayData;\n", + isNull = "false") } override def prettyName: String = "array" } +private [sql] object GenArrayData { + /** + * Return Java code pieces based on DataType and isPrimitive to allocate ArrayData class + * + * @param ctx a [[CodegenContext]] + * @param elementType data type of an underlying array + * @param numElements the number of array elements + * @param isPrimitive Are all of the elements of an underlying array primitive type + * @return (code pre-assignments, code post-assignments, underlying array name, arrayData name) + */ + def genCodeToCreateArrayData( + ctx: CodegenContext, + elementType: DataType, + numElements: Int, + isPrimitive : Boolean): (String, String, String, String) = { +val arrayName = ctx.freshName("array") +val arrayDataName = ctx.freshName("arrayData") +if (!isPrimitive) { + val arrayClass = classOf[GenericArrayData].getName + ctx.addMutableState("Object[]", arrayName, +s"this.$arrayName = new Object[${numElements}];") + ("", + s"$arrayClass $arrayDataName = new $arrayClass($arrayName);", + arrayDataName, + arrayName) +} else { + val unsafeArrayClass = classOf[UnsafeArrayData].getName + val baseObject = ctx.freshName("baseObject") + val unsafeArraySizeInBytes = +UnsafeArrayData.calculateHeaderPortionInBytes(numElements) + + ByteArrayMethods.roundNumberOfBytesToNearestWord(elementType.defaultSize * numElements) + val baseOffset = Platform.BYTE_ARRAY_OFFSET + + (s""" +byte[] $arrayName = new byte[$unsafeArraySizeInBytes]; +$unsafeArrayClass $arrayDataName = new $unsafeArrayClass(); --- End diff -- @viirya Thank you for your support and sorry for my poor English. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. 
[GitHub] spark issue #13909: [SPARK-16213][SQL] Reduce runtime overhead of a program ...
Github user SparkQA commented on the issue: https://github.com/apache/spark/pull/13909 **[Test build #70570 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/70570/testReport)** for PR 13909 at commit [`28df09f`](https://github.com/apache/spark/commit/28df09fb569149470dcdb45e36e2d8b993c99a17).
[GitHub] spark pull request #13909: [SPARK-16213][SQL] Reduce runtime overhead of a p...
Github user viirya commented on a diff in the pull request: https://github.com/apache/spark/pull/13909#discussion_r93817572 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala --- @@ -56,33 +58,130 @@ case class CreateArray(children: Seq[Expression]) extends Expression { } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { -val arrayClass = classOf[GenericArrayData].getName -val values = ctx.freshName("values") -ctx.addMutableState("Object[]", values, s"this.$values = null;") - -ev.copy(code = s""" - this.$values = new Object[${children.size}];""" + - ctx.splitExpressions( -ctx.INPUT_ROW, -children.zipWithIndex.map { case (e, i) => - val eval = e.genCode(ctx) - eval.code + s""" -if (${eval.isNull}) { - $values[$i] = null; -} else { - $values[$i] = ${eval.value}; -} - """ -}) + - s""" -final ArrayData ${ev.value} = new $arrayClass($values); -this.$values = null; - """, isNull = "false") +val et = dataType.elementType +val evals = children.map(e => e.genCode(ctx)) +val isPrimitiveArray = ctx.isPrimitiveType(et) +val (preprocess, postprocess, arrayData, array) = + GenArrayData.genCodeToCreateArrayData(ctx, et, children.size, isPrimitiveArray) +val assigns = GenArrayData.genCodeToAssignArrayElements( + ctx, evals, et, isPrimitiveArray, arrayData, array, true) +/* + TODO: When we generate simpler code, we have to solve the following exception +https://github.com/apache/spark/pull/13909/files#r93813725 + ev.copy( +code = preprocess + ctx.splitExpressions(ctx.INPUT_ROW, assigns) + postprocess +value = arrayData, +isNull = "false") +*/ +ev.copy( + code = preprocess + ctx.splitExpressions(ctx.INPUT_ROW, assigns) + postprocess + +s"\nfinal ArrayData ${ev.value} = $arrayData;\n", + isNull = "false") } override def prettyName: String = "array" } +private [sql] object GenArrayData { + /** + * Return Java code pieces based on DataType and isPrimitive to allocate ArrayData class + * + * @param ctx a [[CodegenContext]] + * @param elementType data type of an underlying array + * @param numElements the number of array elements + * @param isPrimitive Are all of the elements of an underlying array primitive type + * @return (code pre-assignments, code post-assignments, underlying array name, arrayData name) + */ + def genCodeToCreateArrayData( + ctx: CodegenContext, + elementType: DataType, + numElements: Int, + isPrimitive : Boolean): (String, String, String, String) = { +val arrayName = ctx.freshName("array") +val arrayDataName = ctx.freshName("arrayData") +if (!isPrimitive) { + val arrayClass = classOf[GenericArrayData].getName + ctx.addMutableState("Object[]", arrayName, +s"this.$arrayName = new Object[${numElements}];") + ("", + s"$arrayClass $arrayDataName = new $arrayClass($arrayName);", + arrayDataName, + arrayName) +} else { + val unsafeArrayClass = classOf[UnsafeArrayData].getName + val baseObject = ctx.freshName("baseObject") + val unsafeArraySizeInBytes = +UnsafeArrayData.calculateHeaderPortionInBytes(numElements) + + ByteArrayMethods.roundNumberOfBytesToNearestWord(elementType.defaultSize * numElements) + val baseOffset = Platform.BYTE_ARRAY_OFFSET + + (s""" +byte[] $arrayName = new byte[$unsafeArraySizeInBytes]; +$unsafeArrayClass $arrayDataName = new $unsafeArrayClass(); --- End diff -- I think @ueshin meant the split element assignment codes won't access this local `arrayDataName`. So you might need to use `addMutableState` to add them in. 
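In codegen terms, the fix being pointed at looks roughly like this (a fragment under assumed names, using the `ctx` from the surrounding `doGenCode`; not the final patch):

```scala
// A local like `UnsafeArrayData arrayData1 = ...` exists only in the calling
// method, so assignments that ctx.splitExpressions moves into separate
// private methods cannot see it and the generated Java fails to compile.
// Promoting the buffer to an instance field makes it reachable everywhere:
val arrayDataName = ctx.freshName("arrayData")
ctx.addMutableState("UnsafeArrayData", arrayDataName,
  s"this.$arrayDataName = new UnsafeArrayData();")
```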
[GitHub] spark issue #16308: [SPARK-18936][SQL] Infrastructure for session local time...
Github user SparkQA commented on the issue: https://github.com/apache/spark/pull/16308 **[Test build #70569 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/70569/testReport)** for PR 16308 at commit [`a00d52f`](https://github.com/apache/spark/commit/a00d52f2df41bf13b82c5644d5d5ffa3532335cd).
[GitHub] spark issue #16232: [SPARK-18800][SQL] Correct the assert in UnsafeKVExterna...
Github user viirya commented on the issue: https://github.com/apache/spark/pull/16232 Thanks @davies @srowen
[GitHub] spark pull request #16337: [SPARK-18871][SQL] New test cases for IN/NOT IN s...
Github user nsyca commented on a diff in the pull request: https://github.com/apache/spark/pull/16337#discussion_r93817299 --- Diff: sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/in-group-by.sql --- @@ -0,0 +1,117 @@ +-- A test suite for GROUP BY in parent side, subquery, and both predicate subquery +-- It includes correlated cases. + +-- tables and data types + +CREATE DATABASE indb; --- End diff -- I guess what's missing here is a `USE indb` after the `CREATE DATABASE`, to direct all the objects created after this statement into the new database/schema. My bad: I thought `CREATE DATABASE` implicitly switched to the new database/schema.
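In other words, the setup would gain one statement and the teardown another (a sketch; database name as in the test file):

```scala
// CREATE DATABASE does not switch the current database, so an explicit USE
// is needed for the t1/t2/t3 tables to land in indb rather than in default.
spark.sql("CREATE DATABASE indb")
spark.sql("USE indb")
// ... the CREATE TABLE / INSERT statements from the test file ...
spark.sql("DROP DATABASE indb CASCADE") // teardown keeps later suites isolated
```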
[GitHub] spark pull request #13909: [SPARK-16213][SQL] Reduce runtime overhead of a p...
Github user kiszk commented on a diff in the pull request: https://github.com/apache/spark/pull/13909#discussion_r93817255 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala --- @@ -58,77 +58,126 @@ case class CreateArray(children: Seq[Expression]) extends Expression { } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { -val array = ctx.freshName("array") - val et = dataType.elementType val evals = children.map(e => e.genCode(ctx)) val isPrimitiveArray = ctx.isPrimitiveType(et) -val (preprocess, arrayData) = - GenArrayData.getCodeArrayData(ctx, et, children.size, isPrimitiveArray, array) - -val assigns = if (isPrimitiveArray) { - val primitiveTypeName = ctx.primitiveTypeName(et) - evals.zipWithIndex.map { case (eval, i) => -eval.code + s""" - if (${eval.isNull}) { - $arrayData.setNullAt($i); - } else { - $arrayData.set$primitiveTypeName($i, ${eval.value}); - } - """ - } -} else { - evals.zipWithIndex.map { case (eval, i) => -eval.code + s""" - if (${eval.isNull}) { - $array[$i] = null; - } else { - $array[$i] = ${eval.value}; - } - """ - } -} -ev.copy(code = - preprocess + - ctx.splitExpressions(ctx.INPUT_ROW, assigns) + - s"\nfinal ArrayData ${ev.value} = $arrayData;\n", +val (preprocess, postprocess, arrayData, array) = + GenArrayData.genCodeToCreateArrayData(ctx, et, children.size, isPrimitiveArray) +val assigns = GenArrayData.genCodeToAssignArrayElements( + ctx, evals, et, isPrimitiveArray, arrayData, array, true) +/* + TODO: When we generate simpler code, we have to solve the following exception +https://github.com/apache/spark/pull/13909/files#r93813725 + ev.copy( +code = preprocess + ctx.splitExpressions(ctx.INPUT_ROW, assigns) + postprocess +value = arrayData, --- End diff -- I made a mistake. To create `ArrayData array = new GenericArrayData(...)` can avoid this exception. ` --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #16337: [SPARK-18871][SQL] New test cases for IN/NOT IN s...
Github user nsyca commented on a diff in the pull request: https://github.com/apache/spark/pull/16337#discussion_r93817238 --- Diff: sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/in-group-by.sql --- @@ -0,0 +1,117 @@ +-- A test suite for GROUP BY in parent side, subquery, and both predicate subquery +-- It includes correlated cases. + +-- tables and data types + +CREATE DATABASE indb; +CREATE TABLE t1(t1a String, t1b Short, t1c Int, t1d Long, t1e float, t1f double, t1g DECIMAL, t1h TIMESTAMP, t1i Date) +using parquet; +CREATE TABLE t2(t2a String, t2b Short, t2c Int, t2d Long, t2e float, t2f double, t2g DECIMAL, t2h TIMESTAMP, t2i Date) +using parquet; +CREATE TABLE t3(t3a String, t3b Short, t3c Int, t3d Long, t3e float, t3f double, t3g DECIMAL, t3h TIMESTAMP, t3i Date) +using parquet; + +-- insert to tables +INSERT INTO t1 VALUES + ('t1a', 6, 8, 10, 15, 20, 20.00, timestamp(date("2014-04-04")), date("2014-04-04")), + ('t1b', 8, 16, 19, 17, 25, 26.00, timestamp(date("2014-05-04")), date("2014-05-04")), + ('t1a', 16, 12, 21, 15, 20, 20.00, timestamp(date("2014-06-04")), date("2014-06-04")), + ('t1a', 16, 12, 10, 15, 20, 20.00, timestamp(date("2014-07-04")), date("2014-07-04")), + ('t1c', 8, 16, 19, 17, 25, 26.00, timestamp(date("2014-05-04")), date("2014-05-05")), + ('t1d', null, 16, 22, 17, 25, 26.00, timestamp(date("2014-06-04")), null), + ('t1d', null, 16, 19, 17, 25, 26.00, timestamp(date("2014-07-04")), null), + ('t1e', 10, null, 25, 17, 25, 26.00, timestamp(date("2014-08-04")), date("2014-08-04")), + ('t1e', 10, null, 19, 17, 25, 26.00, timestamp(date("2014-09-04")), date("2014-09-04")), + ('t1d', 10, null, 12, 17, 25, 26.00, timestamp(date("2015-05-04")), date("2015-05-04")), + ('t1a', 6, 8, 10, 15, 20, 20.00, timestamp(date("2014-04-04")), date("2014-04-04")), + ('t1e', 10, null, 19, 17, 25, 26.00, timestamp(date("2014-05-04")), date("2014-05-0=4")); + +INSERT INTO t2 VALUES + ('t2a', 6, 12, 14, 15, 20, 20.00, timestamp(date("2014-04-04")), date("2014-04-04")), + ('t1b', 10, 12, 19, 17, 25, 26.00, timestamp(date("2014-05-04")), date("2014-05-04")), + ('t1b', 8, 16, 119, 17, 25, 26.00, timestamp(date("2015-05-04")), date("2015-05-04")), + ('t1c', 12, 16, 219, 17, 25, 26.00, timestamp(date("2016-05-04")), date("2016-05-04")), + ('t1b', null, 16, 319, 17, 25, 26.00, timestamp(date("2017-05-04")), null), + ('t2e', 8, null, 419, 17, 25, 26.00, timestamp(date("2014-06-04")), date("2014-06-04")), + ('t1f', 19, null, 519, 17, 25, 26.00, timestamp(date("2014-05-04")), date("2014-05-04")), + ('t1b', 10, 12, 19, 17, 25, 26.00, timestamp(date("2014-06-04")), date("2014-06-04")), + ('t1b', 8, 16, 19, 17, 25, 26.00, timestamp(date("2014-07-04")), date("2014-07-04")), + ('t1c', 12, 16, 19, 17, 25, 26.00, timestamp(date("2014-08-04")), date("2014-08-05")), + ('t1e', 8, null, 19, 17, 25, 26.00, timestamp(date("2014-09-04")), date("2014-09-04")), + ('t1f', 19, null, 19, 17, 25, 26.00, timestamp(date("2014-10-04")), date("2014-10-04")), + ('t1b', null, 16, 19, 17, 25, 26.00, timestamp(date("2014-05-04")), null); + +INSERT INTO t3 VALUES + ('t3a', 6, 12, 110, 15, 20, 20.00, timestamp(date("2014-04-04")), date("2014-04-04")), + ('t3a', 6, 12, 10, 15, 20, 20.00, timestamp(date("2014-05-04")), date("2014-05-04")), + ('t1b', 10, 12, 219, 17, 25, 26.00, timestamp(date("2014-05-04")), date("2014-05-04")), + ('t1b', 10, 12, 19, 17, 25, 26.00, timestamp(date("2014-05-04")), date("2014-05-04")), + ('t1b', 8, 16, 319, 17, 25, 26.00, timestamp(date("2014-06-04")), date("2014-06-04")), + ('t1b', 
8, 16, 19, 17, 25, 26.00, timestamp(date("2014-07-04")), date("2014-07-04")), + ('t3c', 17, 16, 519, 17, 25, 26.00, timestamp(date("2014-08-04")), date("2014-08-04")), + ('t3c', 17, 16, 19, 17, 25, 26.00, timestamp(date("2014-09-04")), date("2014-09-05")), + ('t1b', null, 16, 419, 17, 25, 26.00, timestamp(date("2014-10-04")), null), + ('t1b', null, 16, 19, 17, 25, 26.00, timestamp(date("2014-11-04")), null), + ('t3b', 8, null, 719, 17, 25, 26.00, timestamp(date("2014-05-04")), date("2014-05-04")), + ('t3b', 8, null, 19, 17, 25, 26.00, timestamp(date("2015-05-04")), date("2015-05-04")); + +-- correlated IN subquery +-- GROUP BY in parent side +-- TC 01.01 +select t1a, avg(t1b) from t1 where t1a in (select t2a from t2) group by t1a; +-- TC 01.02 +select t1a, max(t1b) from t1 where t1b in (select t2b from t2 where t1a = t2a) group by t1a, t1d; +-- TC 01.03 +select t1a, t1b from t1 where t1c in (select t2c from t2 where t1a = t2a) group by t1a, t1b; +-- TC 01.04 +select t1a, sum(distinct(t1b)) from t1 where t1c in (select t2c from t2 where t1a = t2a) or +t1c in (select t3c from t3 where t1a = t3a)
[GitHub] spark pull request #13909: [SPARK-16213][SQL] Reduce runtime overhead of a p...
Github user ueshin commented on a diff in the pull request: https://github.com/apache/spark/pull/13909#discussion_r93817231 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala --- @@ -56,33 +58,130 @@ case class CreateArray(children: Seq[Expression]) extends Expression { } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { -val arrayClass = classOf[GenericArrayData].getName -val values = ctx.freshName("values") -ctx.addMutableState("Object[]", values, s"this.$values = null;") - -ev.copy(code = s""" - this.$values = new Object[${children.size}];""" + - ctx.splitExpressions( -ctx.INPUT_ROW, -children.zipWithIndex.map { case (e, i) => - val eval = e.genCode(ctx) - eval.code + s""" -if (${eval.isNull}) { - $values[$i] = null; -} else { - $values[$i] = ${eval.value}; -} - """ -}) + - s""" -final ArrayData ${ev.value} = new $arrayClass($values); -this.$values = null; - """, isNull = "false") +val et = dataType.elementType +val evals = children.map(e => e.genCode(ctx)) +val isPrimitiveArray = ctx.isPrimitiveType(et) +val (preprocess, postprocess, arrayData, array) = + GenArrayData.genCodeToCreateArrayData(ctx, et, children.size, isPrimitiveArray) +val assigns = GenArrayData.genCodeToAssignArrayElements( + ctx, evals, et, isPrimitiveArray, arrayData, array, true) +/* + TODO: When we generate simpler code, we have to solve the following exception +https://github.com/apache/spark/pull/13909/files#r93813725 + ev.copy( +code = preprocess + ctx.splitExpressions(ctx.INPUT_ROW, assigns) + postprocess +value = arrayData, +isNull = "false") +*/ +ev.copy( + code = preprocess + ctx.splitExpressions(ctx.INPUT_ROW, assigns) + postprocess + +s"\nfinal ArrayData ${ev.value} = $arrayData;\n", + isNull = "false") } override def prettyName: String = "array" } +private [sql] object GenArrayData { + /** + * Return Java code pieces based on DataType and isPrimitive to allocate ArrayData class + * + * @param ctx a [[CodegenContext]] + * @param elementType data type of an underlying array + * @param numElements the number of array elements + * @param isPrimitive Are all of the elements of an underlying array primitive type + * @return (code pre-assignments, code post-assignments, underlying array name, arrayData name) + */ + def genCodeToCreateArrayData( + ctx: CodegenContext, + elementType: DataType, + numElements: Int, + isPrimitive : Boolean): (String, String, String, String) = { +val arrayName = ctx.freshName("array") +val arrayDataName = ctx.freshName("arrayData") +if (!isPrimitive) { + val arrayClass = classOf[GenericArrayData].getName + ctx.addMutableState("Object[]", arrayName, +s"this.$arrayName = new Object[${numElements}];") + ("", + s"$arrayClass $arrayDataName = new $arrayClass($arrayName);", + arrayDataName, + arrayName) +} else { + val unsafeArrayClass = classOf[UnsafeArrayData].getName + val baseObject = ctx.freshName("baseObject") + val unsafeArraySizeInBytes = +UnsafeArrayData.calculateHeaderPortionInBytes(numElements) + + ByteArrayMethods.roundNumberOfBytesToNearestWord(elementType.defaultSize * numElements) + val baseOffset = Platform.BYTE_ARRAY_OFFSET + + (s""" +byte[] $arrayName = new byte[$unsafeArraySizeInBytes]; +$unsafeArrayClass $arrayDataName = new $unsafeArrayClass(); --- End diff -- Sorry, I think we need to make this mutable state. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. 
[GitHub] spark pull request #13909: [SPARK-16213][SQL] Reduce runtime overhead of a p...
Github user kiszk commented on a diff in the pull request: https://github.com/apache/spark/pull/13909#discussion_r93817216 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala --- @@ -56,33 +58,130 @@ case class CreateArray(children: Seq[Expression]) extends Expression { } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { -val arrayClass = classOf[GenericArrayData].getName -val values = ctx.freshName("values") -ctx.addMutableState("Object[]", values, s"this.$values = null;") - -ev.copy(code = s""" - this.$values = new Object[${children.size}];""" + - ctx.splitExpressions( -ctx.INPUT_ROW, -children.zipWithIndex.map { case (e, i) => - val eval = e.genCode(ctx) - eval.code + s""" -if (${eval.isNull}) { - $values[$i] = null; -} else { - $values[$i] = ${eval.value}; -} - """ -}) + - s""" -final ArrayData ${ev.value} = new $arrayClass($values); -this.$values = null; - """, isNull = "false") +val et = dataType.elementType +val evals = children.map(e => e.genCode(ctx)) +val isPrimitiveArray = ctx.isPrimitiveType(et) +val (preprocess, postprocess, arrayData, array) = + GenArrayData.genCodeToCreateArrayData(ctx, et, children.size, isPrimitiveArray) +val assigns = GenArrayData.genCodeToAssignArrayElements( + ctx, evals, et, isPrimitiveArray, arrayData, array, true) +/* + TODO: When we generate simpler code, we have to solve the following exception +https://github.com/apache/spark/pull/13909/files#r93813725 + ev.copy( +code = preprocess + ctx.splitExpressions(ctx.INPUT_ROW, assigns) + postprocess +value = arrayData, +isNull = "false") +*/ +ev.copy( + code = preprocess + ctx.splitExpressions(ctx.INPUT_ROW, assigns) + postprocess + +s"\nfinal ArrayData ${ev.value} = $arrayData;\n", + isNull = "false") } override def prettyName: String = "array" } +private [sql] object GenArrayData { + /** + * Return Java code pieces based on DataType and isPrimitive to allocate ArrayData class + * + * @param ctx a [[CodegenContext]] + * @param elementType data type of an underlying array + * @param numElements the number of array elements + * @param isPrimitive Are all of the elements of an underlying array primitive type + * @return (code pre-assignments, code post-assignments, underlying array name, arrayData name) + */ + def genCodeToCreateArrayData( + ctx: CodegenContext, + elementType: DataType, + numElements: Int, + isPrimitive : Boolean): (String, String, String, String) = { +val arrayName = ctx.freshName("array") +val arrayDataName = ctx.freshName("arrayData") +if (!isPrimitive) { + val arrayClass = classOf[GenericArrayData].getName + ctx.addMutableState("Object[]", arrayName, +s"this.$arrayName = new Object[${numElements}];") + ("", + s"$arrayClass $arrayDataName = new $arrayClass($arrayName);", + arrayDataName, + arrayName) +} else { + val unsafeArrayClass = classOf[UnsafeArrayData].getName + val baseObject = ctx.freshName("baseObject") + val unsafeArraySizeInBytes = +UnsafeArrayData.calculateHeaderPortionInBytes(numElements) + + ByteArrayMethods.roundNumberOfBytesToNearestWord(elementType.defaultSize * numElements) + val baseOffset = Platform.BYTE_ARRAY_OFFSET + + (s""" +byte[] $arrayName = new byte[$unsafeArraySizeInBytes]; +$unsafeArrayClass $arrayDataName = new $unsafeArrayClass(); --- End diff -- I will declare this as `final`. Is this what you want to do? --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. 
[GitHub] spark pull request #16337: [SPARK-18871][SQL] New test cases for IN/NOT IN s...
Github user nsyca commented on a diff in the pull request: https://github.com/apache/spark/pull/16337#discussion_r93817218 --- Diff: sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/simple-in.sql --- @@ -0,0 +1,92 @@ +-- A test suite for simple IN predicate subquery +-- It includes correlated cases. + +-- tables and data types + +CREATE DATABASE indb; +CREATE TABLE t1(t1a String, t1b Short, t1c Int, t1d Long, t1e float, t1f double, t1g DECIMAL, t1h TIMESTAMP, t1i Date) +using parquet; +CREATE TABLE t2(t2a String, t2b Short, t2c Int, t2d Long, t2e float, t2f double, t2g DECIMAL, t2h TIMESTAMP, t2i Date) +using parquet; +CREATE TABLE t3(t3a String, t3b Short, t3c Int, t3d Long, t3e float, t3f double, t3g DECIMAL, t3h TIMESTAMP, t3i Date) +using parquet; + +-- insert to tables +INSERT INTO t1 VALUES + ('t1a', 6, 8, 10, 15, 20, 20.00, timestamp(date("2014-04-04")), date("2014-04-04")), + ('t1b', 8, 16, 19, 17, 25, 26.00, timestamp(date("2014-05-04")), date("2014-05-04")), + ('t1a', 16, 12, 21, 15, 20, 20.00, timestamp(date("2014-06-04")), date("2014-06-04")), + ('t1a', 16, 12, 10, 15, 20, 20.00, timestamp(date("2014-07-04")), date("2014-07-04")), + ('t1c', 8, 16, 19, 17, 25, 26.00, timestamp(date("2014-05-04")), date("2014-05-05")), + ('t1d', null, 16, 22, 17, 25, 26.00, timestamp(date("2014-06-04")), null), + ('t1d', null, 16, 19, 17, 25, 26.00, timestamp(date("2014-07-04")), null), + ('t1e', 10, null, 25, 17, 25, 26.00, timestamp(date("2014-08-04")), date("2014-08-04")), + ('t1e', 10, null, 19, 17, 25, 26.00, timestamp(date("2014-09-04")), date("2014-09-04")), + ('t1d', 10, null, 12, 17, 25, 26.00, timestamp(date("2015-05-04")), date("2015-05-04")), + ('t1a', 6, 8, 10, 15, 20, 20.00, timestamp(date("2014-04-04")), date("2014-04-04")), + ('t1e', 10, null, 19, 17, 25, 26.00, timestamp(date("2014-05-04")), date("2014-05-0=4")); + +INSERT INTO t2 VALUES + ('t2a', 6, 12, 14, 15, 20, 20.00, timestamp(date("2014-04-04")), date("2014-04-04")), + ('t1b', 10, 12, 19, 17, 25, 26.00, timestamp(date("2014-05-04")), date("2014-05-04")), + ('t1b', 8, 16, 119, 17, 25, 26.00, timestamp(date("2015-05-04")), date("2015-05-04")), + ('t1c', 12, 16, 219, 17, 25, 26.00, timestamp(date("2016-05-04")), date("2016-05-04")), + ('t1b', null, 16, 319, 17, 25, 26.00, timestamp(date("2017-05-04")), null), + ('t2e', 8, null, 419, 17, 25, 26.00, timestamp(date("2014-06-04")), date("2014-06-04")), + ('t1f', 19, null, 519, 17, 25, 26.00, timestamp(date("2014-05-04")), date("2014-05-04")), + ('t1b', 10, 12, 19, 17, 25, 26.00, timestamp(date("2014-06-04")), date("2014-06-04")), + ('t1b', 8, 16, 19, 17, 25, 26.00, timestamp(date("2014-07-04")), date("2014-07-04")), + ('t1c', 12, 16, 19, 17, 25, 26.00, timestamp(date("2014-08-04")), date("2014-08-05")), + ('t1e', 8, null, 19, 17, 25, 26.00, timestamp(date("2014-09-04")), date("2014-09-04")), + ('t1f', 19, null, 19, 17, 25, 26.00, timestamp(date("2014-10-04")), date("2014-10-04")), + ('t1b', null, 16, 19, 17, 25, 26.00, timestamp(date("2014-05-04")), null); + +INSERT INTO t3 VALUES + ('t3a', 6, 12, 110, 15, 20, 20.00, timestamp(date("2014-04-04")), date("2014-04-04")), + ('t3a', 6, 12, 10, 15, 20, 20.00, timestamp(date("2014-05-04")), date("2014-05-04")), + ('t1b', 10, 12, 219, 17, 25, 26.00, timestamp(date("2014-05-04")), date("2014-05-04")), + ('t1b', 10, 12, 19, 17, 25, 26.00, timestamp(date("2014-05-04")), date("2014-05-04")), + ('t1b', 8, 16, 319, 17, 25, 26.00, timestamp(date("2014-06-04")), date("2014-06-04")), + ('t1b', 8, 16, 19, 17, 25, 26.00, 
timestamp(date("2014-07-04")), date("2014-07-04")), + ('t3c', 17, 16, 519, 17, 25, 26.00, timestamp(date("2014-08-04")), date("2014-08-04")), + ('t3c', 17, 16, 19, 17, 25, 26.00, timestamp(date("2014-09-04")), date("2014-09-05")), + ('t1b', null, 16, 419, 17, 25, 26.00, timestamp(date("2014-10-04")), null), + ('t1b', null, 16, 19, 17, 25, 26.00, timestamp(date("2014-11-04")), null), + ('t3b', 8, null, 719, 17, 25, 26.00, timestamp(date("2014-05-04")), date("2014-05-04")), + ('t3b', 8, null, 19, 17, 25, 26.00, timestamp(date("2015-05-04")), date("2015-05-04")); + +-- correlated IN subquery +-- simple select +-- TC 01.01 +select * from t1 where t1a in (select t2a from t2); +-- TC 01.02 +select * from t1 where t1b in (select t2b from t2 where t1a = t2a); +-- TC 01.03 +select t1a, t1b from t1 where t1c in (select t2b from t2 where t1a != t2a); +-- TC 01.04 +select t1a, t1b from t1 where t1c in (select t2b from t2 where t1a = t2a or t1b > t2b); +-- TC 01.05 +select t1a, t1b from t1 where t1c in (select t2b from t2 where t2i in (select t3i from t3 where t2c = t3c)); +-- TC 01.06 +select t1a, t1b from t1
[GitHub] spark pull request #16337: [SPARK-18871][SQL] New test cases for IN/NOT IN s...
Github user nsyca commented on a diff in the pull request: https://github.com/apache/spark/pull/16337#discussion_r93817165 --- Diff: sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/in-group-by.sql --- @@ -0,0 +1,117 @@ +-- A test suite for GROUP BY in parent side, subquery, and both predicate subquery +-- It includes correlated cases. + +-- tables and data types + +CREATE DATABASE indb; --- End diff -- The CREATE DATABASE/DROP DATABASE statements are used to isolate all the objects created in this test file in their own database/schema. The purpose is to protect against name collisions with other test files that leave behind objects of the same names without properly dropping them. Using a separate database/schema mitigates this problem, as the chance of two test files using the same database/schema name is low.
[GitHub] spark pull request #16338: [SPARK-18837][WEBUI] Very long stage descriptions...
Github user asfgit closed the pull request at: https://github.com/apache/spark/pull/16338
[GitHub] spark issue #16338: [SPARK-18837][WEBUI] Very long stage descriptions do not...
Github user srowen commented on the issue: https://github.com/apache/spark/pull/16338 Merged to master/2.1
[GitHub] spark pull request #16240: [SPARK-16792][SQL] Dataset containing a Case Clas...
Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/16240#discussion_r93816848 --- Diff: sql/core/src/test/scala/org/apache/spark/sql/DatasetPrimitiveSuite.scala --- @@ -130,6 +130,30 @@ class DatasetPrimitiveSuite extends QueryTest with SharedSQLContext { checkDataset(Seq(Array(Tuple1(1))).toDS(), Array(Tuple1(1))) } + test("arbitrary sequences") { --- End diff -- Let's also test nested sequences, e.g. `List(Queue(1))`, and sequences inside a product, e.g. `List(1) -> Queue(1)`; a sketch of these cases follows below.
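The requested cases might look like this (a sketch assuming the suite's `checkDataset` helper, `spark.implicits._`, and the encoders from this patch in scope):

```scala
import scala.collection.immutable.Queue

test("nested sequences") {
  // A Seq whose elements are themselves a different Seq subtype.
  checkDataset(Seq(List(Queue(1))).toDS(), List(Queue(1)))
}

test("sequences inside product") {
  // Seq subtypes nested inside a tuple, exercising the product path.
  checkDataset(Seq(List(1) -> Queue(1)).toDS(), List(1) -> Queue(1))
}
```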
[GitHub] spark issue #16393: [SPARK-18993] [Build] Revert Split test-tags into test-J...
Github user srowen commented on the issue: https://github.com/apache/spark/pull/16393 I can reproduce this now on a clean build. I don't think the whole change needs to be reverted; I think it's narrower than that. test-tags no longer has any dependency on Scala libs but does reference a few Scala classes. I think that could be the issue. I'm investigating now.
[GitHub] spark pull request #16232: [SPARK-18800][SQL] Correct the assert in UnsafeKV...
Github user asfgit closed the pull request at: https://github.com/apache/spark/pull/16232
[GitHub] spark issue #16232: [SPARK-18800][SQL] Correct the assert in UnsafeKVExterna...
Github user srowen commented on the issue: https://github.com/apache/spark/pull/16232 Merged to master
[GitHub] spark pull request #16240: [SPARK-16792][SQL] Dataset containing a Case Clas...
Github user michalsenkyr commented on a diff in the pull request: https://github.com/apache/spark/pull/16240#discussion_r93816269 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/SQLImplicits.scala --- @@ -100,31 +97,36 @@ abstract class SQLImplicits { // Seqs /** @since 1.6.1 */ - implicit def newIntSeqEncoder: Encoder[Seq[Int]] = ExpressionEncoder() + implicit def newIntSeqEncoder[T <: Seq[Int] : TypeTag]: Encoder[T] = ExpressionEncoder() /** @since 1.6.1 */ - implicit def newLongSeqEncoder: Encoder[Seq[Long]] = ExpressionEncoder() + implicit def newLongSeqEncoder[T <: Seq[Long] : TypeTag]: Encoder[T] = ExpressionEncoder() /** @since 1.6.1 */ - implicit def newDoubleSeqEncoder: Encoder[Seq[Double]] = ExpressionEncoder() + implicit def newDoubleSeqEncoder[T <: Seq[Double] : TypeTag]: Encoder[T] = ExpressionEncoder() /** @since 1.6.1 */ - implicit def newFloatSeqEncoder: Encoder[Seq[Float]] = ExpressionEncoder() + implicit def newFloatSeqEncoder[T <: Seq[Float] : TypeTag]: Encoder[T] = ExpressionEncoder() /** @since 1.6.1 */ - implicit def newByteSeqEncoder: Encoder[Seq[Byte]] = ExpressionEncoder() + implicit def newByteSeqEncoder[T <: Seq[Byte] : TypeTag]: Encoder[T] = ExpressionEncoder() /** @since 1.6.1 */ - implicit def newShortSeqEncoder: Encoder[Seq[Short]] = ExpressionEncoder() + implicit def newShortSeqEncoder[T <: Seq[Short] : TypeTag]: Encoder[T] = ExpressionEncoder() /** @since 1.6.1 */ - implicit def newBooleanSeqEncoder: Encoder[Seq[Boolean]] = ExpressionEncoder() + implicit def newBooleanSeqEncoder[T <: Seq[Boolean] : TypeTag]: Encoder[T] = ExpressionEncoder() /** @since 1.6.1 */ - implicit def newStringSeqEncoder: Encoder[Seq[String]] = ExpressionEncoder() + implicit def newStringSeqEncoder[T <: Seq[String] : TypeTag]: Encoder[T] = ExpressionEncoder() /** @since 1.6.1 */ - implicit def newProductSeqEncoder[A <: Product : TypeTag]: Encoder[Seq[A]] = ExpressionEncoder() + implicit def newProductSeqEncoder[A <: Product, T <: Seq[A] : TypeTag]: Encoder[T] = --- End diff -- You are right. `Seq` is declared covariant in its element type, so it works and solves all the problems I was having. Thanks
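A small illustration of the covariance point, under the assumption that it is Seq's element-type parameter (declared `Seq[+A]`) that does the work here:

```scala
// Because Seq is covariant in A, a List of a concrete Product subtype is
// also a Seq[Product]; a single implicit bounded by T <: Seq[A] with
// A <: Product can therefore serve every concrete Seq subclass.
val pairs: List[(Int, Int)] = List(1 -> 2, 3 -> 4)
val asProducts: Seq[Product] = pairs // compiles only because Seq[+A] is covariant
```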
[GitHub] spark issue #16392: [SPARK-18992] [SQL] Move spark.sql.hive.thriftServer.sin...
Github user AmplabJenkins commented on the issue: https://github.com/apache/spark/pull/16392 Test PASSed. Refer to this link for build results (access rights to CI server needed): https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder/70567/ Test PASSed.
[GitHub] spark issue #16392: [SPARK-18992] [SQL] Move spark.sql.hive.thriftServer.sin...
Github user AmplabJenkins commented on the issue: https://github.com/apache/spark/pull/16392 Merged build finished. Test PASSed.
[GitHub] spark issue #16392: [SPARK-18992] [SQL] Move spark.sql.hive.thriftServer.sin...
Github user SparkQA commented on the issue: https://github.com/apache/spark/pull/16392 **[Test build #70567 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/70567/testReport)** for PR 16392 at commit [`5221494`](https://github.com/apache/spark/commit/52214945fdc7705f65ce6522c2d05b1a79e69c78). * This patch passes all tests. * This patch merges cleanly. * This patch adds no public classes.
[GitHub] spark issue #13909: [SPARK-16213][SQL] Reduce runtime overhead of a program ...
Github user AmplabJenkins commented on the issue: https://github.com/apache/spark/pull/13909 Merged build finished. Test FAILed.
[GitHub] spark issue #13909: [SPARK-16213][SQL] Reduce runtime overhead of a program ...
Github user AmplabJenkins commented on the issue: https://github.com/apache/spark/pull/13909 Test FAILed. Refer to this link for build results (access rights to CI server needed): https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder/70568/ Test FAILed.
[GitHub] spark issue #13909: [SPARK-16213][SQL] Reduce runtime overhead of a program ...
Github user SparkQA commented on the issue: https://github.com/apache/spark/pull/13909 **[Test build #70568 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/70568/testReport)** for PR 13909 at commit [`327c8ac`](https://github.com/apache/spark/commit/327c8acc3045b3a96b893ad221e8379d0403b3a9). * This patch **fails Spark unit tests**. * This patch merges cleanly. * This patch adds no public classes.
[GitHub] spark issue #16351: [SPARK-18943][SQL] Avoid per-record type dispatch in CSV...
Github user AmplabJenkins commented on the issue: https://github.com/apache/spark/pull/16351 Merged build finished. Test PASSed.
[GitHub] spark issue #16351: [SPARK-18943][SQL] Avoid per-record type dispatch in CSV...
Github user AmplabJenkins commented on the issue: https://github.com/apache/spark/pull/16351 Test PASSed. Refer to this link for build results (access rights to CI server needed): https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder/70566/ Test PASSed.
[GitHub] spark issue #16351: [SPARK-18943][SQL] Avoid per-record type dispatch in CSV...
Github user SparkQA commented on the issue: https://github.com/apache/spark/pull/16351 **[Test build #70566 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/70566/testReport)** for PR 16351 at commit [`a036b7f`](https://github.com/apache/spark/commit/a036b7f6e3b8ef81afdc3e817457feb06b628cf4). * This patch passes all tests. * This patch merges cleanly. * This patch adds no public classes.
[GitHub] spark pull request #13909: [SPARK-16213][SQL] Reduce runtime overhead of a p...
Github user kiszk commented on a diff in the pull request: https://github.com/apache/spark/pull/13909#discussion_r93815064 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala --- @@ -58,77 +58,126 @@ case class CreateArray(children: Seq[Expression]) extends Expression { } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { -val array = ctx.freshName("array") - val et = dataType.elementType val evals = children.map(e => e.genCode(ctx)) val isPrimitiveArray = ctx.isPrimitiveType(et) -val (preprocess, arrayData) = - GenArrayData.getCodeArrayData(ctx, et, children.size, isPrimitiveArray, array) - -val assigns = if (isPrimitiveArray) { - val primitiveTypeName = ctx.primitiveTypeName(et) - evals.zipWithIndex.map { case (eval, i) => -eval.code + s""" - if (${eval.isNull}) { - $arrayData.setNullAt($i); - } else { - $arrayData.set$primitiveTypeName($i, ${eval.value}); - } - """ - } -} else { - evals.zipWithIndex.map { case (eval, i) => -eval.code + s""" - if (${eval.isNull}) { - $array[$i] = null; - } else { - $array[$i] = ${eval.value}; - } - """ - } -} -ev.copy(code = - preprocess + - ctx.splitExpressions(ctx.INPUT_ROW, assigns) + - s"\nfinal ArrayData ${ev.value} = $arrayData;\n", +val (preprocess, postprocess, arrayData, array) = + GenArrayData.genCodeToCreateArrayData(ctx, et, children.size, isPrimitiveArray) +val assigns = GenArrayData.genCodeToAssignArrayElements( + ctx, evals, et, isPrimitiveArray, arrayData, array, true) +/* + TODO: When we generate simpler code, we have to solve the following exception +https://github.com/apache/spark/pull/13909/files#r93813725 + ev.copy( +code = preprocess + ctx.splitExpressions(ctx.INPUT_ROW, assigns) + postprocess +value = arrayData, --- End diff -- Declaring it as `ArrayData` caused the same exception. Janino recognizes that `ArrayData` cannot be assigned to `UnsafeArrayData`. Janino may keep the type of each variable for simple cases.
[GitHub] spark pull request #13909: [SPARK-16213][SQL] Reduce runtime overhead of a p...
Github user ueshin commented on a diff in the pull request: https://github.com/apache/spark/pull/13909#discussion_r93815041 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala --- @@ -56,33 +58,130 @@ case class CreateArray(children: Seq[Expression]) extends Expression { } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { -val arrayClass = classOf[GenericArrayData].getName -val values = ctx.freshName("values") -ctx.addMutableState("Object[]", values, s"this.$values = null;") - -ev.copy(code = s""" - this.$values = new Object[${children.size}];""" + - ctx.splitExpressions( -ctx.INPUT_ROW, -children.zipWithIndex.map { case (e, i) => - val eval = e.genCode(ctx) - eval.code + s""" -if (${eval.isNull}) { - $values[$i] = null; -} else { - $values[$i] = ${eval.value}; -} - """ -}) + - s""" -final ArrayData ${ev.value} = new $arrayClass($values); -this.$values = null; - """, isNull = "false") +val et = dataType.elementType +val evals = children.map(e => e.genCode(ctx)) +val isPrimitiveArray = ctx.isPrimitiveType(et) +val (preprocess, postprocess, arrayData, array) = + GenArrayData.genCodeToCreateArrayData(ctx, et, children.size, isPrimitiveArray) +val assigns = GenArrayData.genCodeToAssignArrayElements( + ctx, evals, et, isPrimitiveArray, arrayData, array, true) +/* + TODO: When we generate simpler code, we have to solve the following exception +https://github.com/apache/spark/pull/13909/files#r93813725 + ev.copy( +code = preprocess + ctx.splitExpressions(ctx.INPUT_ROW, assigns) + postprocess +value = arrayData, +isNull = "false") +*/ +ev.copy( + code = preprocess + ctx.splitExpressions(ctx.INPUT_ROW, assigns) + postprocess + +s"\nfinal ArrayData ${ev.value} = $arrayData;\n", + isNull = "false") } override def prettyName: String = "array" } +private [sql] object GenArrayData { + /** + * Return Java code pieces based on DataType and isPrimitive to allocate ArrayData class + * + * @param ctx a [[CodegenContext]] + * @param elementType data type of an underlying array + * @param numElements the number of array elements + * @param isPrimitive Are all of the elements of an underlying array primitive type + * @return (code pre-assignments, code post-assignments, underlying array name, arrayData name) + */ + def genCodeToCreateArrayData( + ctx: CodegenContext, + elementType: DataType, + numElements: Int, + isPrimitive : Boolean): (String, String, String, String) = { +val arrayName = ctx.freshName("array") +val arrayDataName = ctx.freshName("arrayData") +if (!isPrimitive) { + val arrayClass = classOf[GenericArrayData].getName + ctx.addMutableState("Object[]", arrayName, +s"this.$arrayName = new Object[${numElements}];") + ("", + s"$arrayClass $arrayDataName = new $arrayClass($arrayName);", + arrayDataName, + arrayName) +} else { + val unsafeArrayClass = classOf[UnsafeArrayData].getName + val baseObject = ctx.freshName("baseObject") + val unsafeArraySizeInBytes = +UnsafeArrayData.calculateHeaderPortionInBytes(numElements) + + ByteArrayMethods.roundNumberOfBytesToNearestWord(elementType.defaultSize * numElements) + val baseOffset = Platform.BYTE_ARRAY_OFFSET + + (s""" +byte[] $arrayName = new byte[$unsafeArraySizeInBytes]; +$unsafeArrayClass $arrayDataName = new $unsafeArrayClass(); --- End diff -- Don't we need to make this a mutable state for the case where the assignments are split?
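A sketch of what ueshin's suggestion might look like, reusing the names from the diff above (`ctx`, `unsafeArrayClass`, `arrayDataName`); when `splitExpressions` moves the assignments into separate methods, a field survives where a local variable would not:

```scala
// Register the UnsafeArrayData instance as mutable state on the generated
// class so that split-out methods can still reach it as a field.
ctx.addMutableState(unsafeArrayClass, arrayDataName,
  s"$arrayDataName = new $unsafeArrayClass();")
```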
[GitHub] spark pull request #13909: [SPARK-16213][SQL] Reduce runtime overhead of a p...
Github user kiszk commented on a diff in the pull request: https://github.com/apache/spark/pull/13909#discussion_r93815032 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala --- @@ -56,33 +58,130 @@ case class CreateArray(children: Seq[Expression]) extends Expression { } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { -val arrayClass = classOf[GenericArrayData].getName -val values = ctx.freshName("values") -ctx.addMutableState("Object[]", values, s"this.$values = null;") - -ev.copy(code = s""" - this.$values = new Object[${children.size}];""" + - ctx.splitExpressions( -ctx.INPUT_ROW, -children.zipWithIndex.map { case (e, i) => - val eval = e.genCode(ctx) - eval.code + s""" -if (${eval.isNull}) { - $values[$i] = null; -} else { - $values[$i] = ${eval.value}; -} - """ -}) + - s""" -final ArrayData ${ev.value} = new $arrayClass($values); -this.$values = null; - """, isNull = "false") +val et = dataType.elementType +val evals = children.map(e => e.genCode(ctx)) +val isPrimitiveArray = ctx.isPrimitiveType(et) +val (preprocess, postprocess, arrayData, array) = + GenArrayData.genCodeToCreateArrayData(ctx, et, children.size, isPrimitiveArray) +val assigns = GenArrayData.genCodeToAssignArrayElements( + ctx, evals, et, isPrimitiveArray, arrayData, array, true) +/* + TODO: When we generate simpler code, we have to solve the following exception +https://github.com/apache/spark/pull/13909/files#r93813725 + ev.copy( +code = preprocess + ctx.splitExpressions(ctx.INPUT_ROW, assigns) + postprocess +value = arrayData, +isNull = "false") +*/ +ev.copy( + code = preprocess + ctx.splitExpressions(ctx.INPUT_ROW, assigns) + postprocess + +s"\nfinal ArrayData ${ev.value} = $arrayData;\n", + isNull = "false") } override def prettyName: String = "array" } +private [sql] object GenArrayData { + /** + * Return Java code pieces based on DataType and isPrimitive to allocate ArrayData class + * + * @param ctx a [[CodegenContext]] + * @param elementType data type of an underlying array + * @param numElements the number of array elements + * @param isPrimitive Are all of the elements of an underlying array primitive type + * @return (code pre-assignments, code post-assignments, underlying array name, arrayData name) + */ + def genCodeToCreateArrayData( --- End diff -- I see. I will do it tonight.
[GitHub] spark issue #13758: [SPARK-16043][SQL] Prepare GenericArrayData implementati...
Github user cloud-fan commented on the issue: https://github.com/apache/spark/pull/13758 shall we close it?
[GitHub] spark pull request #13909: [SPARK-16213][SQL] Reduce runtime overhead of a p...
Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/13909#discussion_r93814957 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala --- @@ -56,33 +58,130 @@ case class CreateArray(children: Seq[Expression]) extends Expression { } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { -val arrayClass = classOf[GenericArrayData].getName -val values = ctx.freshName("values") -ctx.addMutableState("Object[]", values, s"this.$values = null;") - -ev.copy(code = s""" - this.$values = new Object[${children.size}];""" + - ctx.splitExpressions( -ctx.INPUT_ROW, -children.zipWithIndex.map { case (e, i) => - val eval = e.genCode(ctx) - eval.code + s""" -if (${eval.isNull}) { - $values[$i] = null; -} else { - $values[$i] = ${eval.value}; -} - """ -}) + - s""" -final ArrayData ${ev.value} = new $arrayClass($values); -this.$values = null; - """, isNull = "false") +val et = dataType.elementType +val evals = children.map(e => e.genCode(ctx)) +val isPrimitiveArray = ctx.isPrimitiveType(et) +val (preprocess, postprocess, arrayData, array) = + GenArrayData.genCodeToCreateArrayData(ctx, et, children.size, isPrimitiveArray) +val assigns = GenArrayData.genCodeToAssignArrayElements( + ctx, evals, et, isPrimitiveArray, arrayData, array, true) +/* + TODO: When we generate simpler code, we have to solve the following exception +https://github.com/apache/spark/pull/13909/files#r93813725 + ev.copy( +code = preprocess + ctx.splitExpressions(ctx.INPUT_ROW, assigns) + postprocess +value = arrayData, +isNull = "false") +*/ +ev.copy( + code = preprocess + ctx.splitExpressions(ctx.INPUT_ROW, assigns) + postprocess + +s"\nfinal ArrayData ${ev.value} = $arrayData;\n", + isNull = "false") } override def prettyName: String = "array" } +private [sql] object GenArrayData { + /** + * Return Java code pieces based on DataType and isPrimitive to allocate ArrayData class + * + * @param ctx a [[CodegenContext]] + * @param elementType data type of an underlying array + * @param numElements the number of array elements + * @param isPrimitive Are all of the elements of an underlying array primitive type + * @return (code pre-assignments, code post-assignments, underlying array name, arrayData name) + */ + def genCodeToCreateArrayData( --- End diff -- Why do we need 2 methods? Can we just write: ``` def genCodeToCreateArrayData( ctx: CodegenContext, elementType: DataType, elementCodes: Seq[ExprCode], allowNull: Boolean): (String, String, String, String) = { val arrayName = ctx.freshName("array") val arrayDataName = ctx.freshName("arrayData") val numElements = elementCodes.length if (ctx.isPrimitiveType(elementType)) { ctx.addMutableState("UnsafeArrayData", arrayDataName, "new UnsafeArrayData();") val unsafeArraySizeInBytes = UnsafeArrayData.calculateHeaderPortionInBytes(numElements) + ByteArrayMethods.roundNumberOfBytesToNearestWord(elementType.defaultSize * numElements) val baseOffset = Platform.BYTE_ARRAY_OFFSET val preprocess = s""" byte[] $arrayName = new byte[$unsafeArraySizeInBytes]; Platform.putLong($arrayName, $baseOffset, $numElements); $arrayDataName.pointTo($arrayName, $baseOffset, $unsafeArraySizeInBytes); """ val primitiveTypeName = ctx.primitiveTypeName(elementType) val assignElements = ctx.splitExpressions(elementCodes.zipWithIndex.map { case (eval, i) => ... }) val createArrayData = "" (preprocess, assignElements, createArrayData, arrayData) } else { ..
} } ```
[GitHub] spark pull request #13909: [SPARK-16213][SQL] Reduce runtime overhead of a p...
Github user ueshin commented on a diff in the pull request: https://github.com/apache/spark/pull/13909#discussion_r93814866 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala --- @@ -58,77 +58,126 @@ case class CreateArray(children: Seq[Expression]) extends Expression { } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { -val array = ctx.freshName("array") - val et = dataType.elementType val evals = children.map(e => e.genCode(ctx)) val isPrimitiveArray = ctx.isPrimitiveType(et) -val (preprocess, arrayData) = - GenArrayData.getCodeArrayData(ctx, et, children.size, isPrimitiveArray, array) - -val assigns = if (isPrimitiveArray) { - val primitiveTypeName = ctx.primitiveTypeName(et) - evals.zipWithIndex.map { case (eval, i) => -eval.code + s""" - if (${eval.isNull}) { - $arrayData.setNullAt($i); - } else { - $arrayData.set$primitiveTypeName($i, ${eval.value}); - } - """ - } -} else { - evals.zipWithIndex.map { case (eval, i) => -eval.code + s""" - if (${eval.isNull}) { - $array[$i] = null; - } else { - $array[$i] = ${eval.value}; - } - """ - } -} -ev.copy(code = - preprocess + - ctx.splitExpressions(ctx.INPUT_ROW, assigns) + - s"\nfinal ArrayData ${ev.value} = $arrayData;\n", +val (preprocess, postprocess, arrayData, array) = + GenArrayData.genCodeToCreateArrayData(ctx, et, children.size, isPrimitiveArray) +val assigns = GenArrayData.genCodeToAssignArrayElements( + ctx, evals, et, isPrimitiveArray, arrayData, array, true) +/* + TODO: When we generate simpler code, we have to solve the following exception +https://github.com/apache/spark/pull/13909/files#r93813725 + ev.copy( +code = preprocess + ctx.splitExpressions(ctx.INPUT_ROW, assigns) + postprocess +value = arrayData, --- End diff -- How about defining the type of `arrayData` as `ArrayData`?
[GitHub] spark pull request #13909: [SPARK-16213][SQL] Reduce runtime overhead of a p...
Github user kiszk commented on a diff in the pull request: https://github.com/apache/spark/pull/13909#discussion_r93814845 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala --- @@ -58,77 +58,126 @@ case class CreateArray(children: Seq[Expression]) extends Expression { } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { -val array = ctx.freshName("array") - val et = dataType.elementType val evals = children.map(e => e.genCode(ctx)) val isPrimitiveArray = ctx.isPrimitiveType(et) -val (preprocess, arrayData) = - GenArrayData.getCodeArrayData(ctx, et, children.size, isPrimitiveArray, array) - -val assigns = if (isPrimitiveArray) { - val primitiveTypeName = ctx.primitiveTypeName(et) - evals.zipWithIndex.map { case (eval, i) => -eval.code + s""" - if (${eval.isNull}) { - $arrayData.setNullAt($i); - } else { - $arrayData.set$primitiveTypeName($i, ${eval.value}); - } - """ - } -} else { - evals.zipWithIndex.map { case (eval, i) => -eval.code + s""" - if (${eval.isNull}) { - $array[$i] = null; - } else { - $array[$i] = ${eval.value}; - } - """ - } -} -ev.copy(code = - preprocess + - ctx.splitExpressions(ctx.INPUT_ROW, assigns) + - s"\nfinal ArrayData ${ev.value} = $arrayData;\n", +val (preprocess, postprocess, arrayData, array) = + GenArrayData.genCodeToCreateArrayData(ctx, et, children.size, isPrimitiveArray) +val assigns = GenArrayData.genCodeToAssignArrayElements( + ctx, evals, et, isPrimitiveArray, arrayData, array, true) +/* + TODO: When we generate simpler code, we have to solve the following exception +https://github.com/apache/spark/pull/13909/files#r93813725 + ev.copy( +code = preprocess + ctx.splitExpressions(ctx.INPUT_ROW, assigns) + postprocess +value = arrayData, --- End diff -- Yes, it can avoid the exception. However, it may introduce a new performance issue. At lines 82 and 86, two casts happen per loop iteration. I would like to avoid these frequent casts. ``` /* 070 */ if (((ArrayData) arrayData1) instanceof UnsafeArrayData) { /* 071 */ /* 072 */ final int sizeInBytes1 = ((UnsafeArrayData) ((ArrayData) arrayData1)).getSizeInBytes(); /* 073 */ // grow the global buffer before writing data. /* 074 */ holder.grow(sizeInBytes1); /* 075 */ ((UnsafeArrayData) ((ArrayData) arrayData1)).writeToMemory(holder.buffer, holder.cursor); /* 076 */ holder.cursor += sizeInBytes1; /* 077 */ /* 078 */ } else { /* 079 */ final int numElements1 = ((ArrayData) arrayData1).numElements(); /* 080 */ arrayWriter1.initialize(holder, numElements1, 8); /* 081 */ /* 082 */ for (int index1 = 0; index1 < numElements1; index1++) { /* 083 */ if (((ArrayData) arrayData1).isNullAt(index1)) { /* 084 */ arrayWriter1.setNullLong(index1); /* 085 */ } else { /* 086 */ final long element1 = ((ArrayData) arrayData1).getLong(index1); /* 087 */ arrayWriter1.write(index1, element1); /* 088 */ } /* 089 */ } /* 090 */ } ```
[GitHub] spark pull request #13909: [SPARK-16213][SQL] Reduce runtime overhead of a p...
Github user kiszk commented on a diff in the pull request: https://github.com/apache/spark/pull/13909#discussion_r93814832 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala --- @@ -58,77 +58,126 @@ case class CreateArray(children: Seq[Expression]) extends Expression { } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { -val array = ctx.freshName("array") - val et = dataType.elementType val evals = children.map(e => e.genCode(ctx)) val isPrimitiveArray = ctx.isPrimitiveType(et) -val (preprocess, arrayData) = - GenArrayData.getCodeArrayData(ctx, et, children.size, isPrimitiveArray, array) - -val assigns = if (isPrimitiveArray) { - val primitiveTypeName = ctx.primitiveTypeName(et) - evals.zipWithIndex.map { case (eval, i) => -eval.code + s""" - if (${eval.isNull}) { - $arrayData.setNullAt($i); - } else { - $arrayData.set$primitiveTypeName($i, ${eval.value}); - } - """ - } -} else { - evals.zipWithIndex.map { case (eval, i) => -eval.code + s""" - if (${eval.isNull}) { - $array[$i] = null; - } else { - $array[$i] = ${eval.value}; - } - """ - } -} -ev.copy(code = - preprocess + - ctx.splitExpressions(ctx.INPUT_ROW, assigns) + - s"\nfinal ArrayData ${ev.value} = $arrayData;\n", +val (preprocess, postprocess, arrayData, array) = + GenArrayData.genCodeToCreateArrayData(ctx, et, children.size, isPrimitiveArray) +val assigns = GenArrayData.genCodeToAssignArrayElements( + ctx, evals, et, isPrimitiveArray, arrayData, array, true) +/* + TODO: When we generate simpler code, we have to solve the following exception +https://github.com/apache/spark/pull/13909/files#r93813725 + ev.copy( +code = preprocess + ctx.splitExpressions(ctx.INPUT_ROW, assigns) + postprocess +value = arrayData, +isNull = "false") +*/ +ev.copy( + code = preprocess + ctx.splitExpressions(ctx.INPUT_ROW, assigns) + postprocess + +s"\nfinal ArrayData ${ev.value} = $arrayData;\n", isNull = "false") } override def prettyName: String = "array" } private [sql] object GenArrayData { - // This function returns Java code pieces based on DataType and isPrimitive - // for allocation of ArrayData class - def getCodeArrayData( + /** + * Return Java code pieces based on DataType and isPrimitive to allocate ArrayData class + * + * @param ctx a [[CodegenContext]] + * @param elementType data type of an underlying array + * @param numElements the number of array elements + * @param isPrimitive Are all of the elements of an underlying array primitive type + * @return (code pre-assignments, code post-assignments, underlying array name, arrayData name) --- End diff -- good catch, thx
[GitHub] spark pull request #13909: [SPARK-16213][SQL] Reduce runtime overhead of a p...
Github user viirya commented on a diff in the pull request: https://github.com/apache/spark/pull/13909#discussion_r93814790 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala --- @@ -58,77 +58,126 @@ case class CreateArray(children: Seq[Expression]) extends Expression { } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { -val array = ctx.freshName("array") - val et = dataType.elementType val evals = children.map(e => e.genCode(ctx)) val isPrimitiveArray = ctx.isPrimitiveType(et) -val (preprocess, arrayData) = - GenArrayData.getCodeArrayData(ctx, et, children.size, isPrimitiveArray, array) - -val assigns = if (isPrimitiveArray) { - val primitiveTypeName = ctx.primitiveTypeName(et) - evals.zipWithIndex.map { case (eval, i) => -eval.code + s""" - if (${eval.isNull}) { - $arrayData.setNullAt($i); - } else { - $arrayData.set$primitiveTypeName($i, ${eval.value}); - } - """ - } -} else { - evals.zipWithIndex.map { case (eval, i) => -eval.code + s""" - if (${eval.isNull}) { - $array[$i] = null; - } else { - $array[$i] = ${eval.value}; - } - """ - } -} -ev.copy(code = - preprocess + - ctx.splitExpressions(ctx.INPUT_ROW, assigns) + - s"\nfinal ArrayData ${ev.value} = $arrayData;\n", +val (preprocess, postprocess, arrayData, array) = + GenArrayData.genCodeToCreateArrayData(ctx, et, children.size, isPrimitiveArray) +val assigns = GenArrayData.genCodeToAssignArrayElements( + ctx, evals, et, isPrimitiveArray, arrayData, array, true) +/* + TODO: When we generate simpler code, we have to solve the following exception +https://github.com/apache/spark/pull/13909/files#r93813725 + ev.copy( +code = preprocess + ctx.splitExpressions(ctx.INPUT_ROW, assigns) + postprocess +value = arrayData, --- End diff -- if we write `value = s"((ArrayData) arrayData)"`, can it bypass the exception?
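For concreteness, a sketch of what viirya's suggestion might look like inside `doGenCode`, reusing the names from the diff above (an illustration, not the code that was merged):

```scala
// Expose the result through the base type so the downstream generated code
// only ever sees an ArrayData; the cast appears once at the use site.
ev.copy(
  code = preprocess + ctx.splitExpressions(ctx.INPUT_ROW, assigns) + postprocess,
  value = s"((ArrayData) $arrayData)",
  isNull = "false")
```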
[GitHub] spark pull request #16391: [SPARK-18990][SQL] make DatasetBenchmark fairer f...
Github user kiszk commented on a diff in the pull request: https://github.com/apache/spark/pull/16391#discussion_r93814766 --- Diff: sql/core/src/test/scala/org/apache/spark/sql/DatasetBenchmark.scala --- @@ -170,36 +176,39 @@ object DatasetBenchmark { val benchmark3 = aggregate(spark, numRows) /* -OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 3.10.0-327.18.2.el7.x86_64 -Intel Xeon E3-12xx v2 (Ivy Bridge) +Java HotSpot(TM) 64-Bit Server VM 1.8.0_60-b27 on Mac OS X 10.12.1 +Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz + back-to-back map: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative -RDD 3448 / 3646 29.0 34.5 1.0X -DataFrame 2647 / 3116 37.8 26.5 1.3X -Dataset 4781 / 5155 20.9 47.8 0.7X +RDD 3963 / 3976 25.2 39.6 1.0X +DataFrame 826 / 834 121.1 8.3 4.8X +Dataset 5178 / 5198 19.3 51.8 0.8X --- End diff -- IIUC, the signature of `apply()` is `apply(Object)`. It also introduces additional boxing overhead.
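An illustrative sketch of the boxing concern, assuming the overhead comes from going through `Function1`'s erased `apply(Object)` rather than the specialized primitive method:

```scala
// Through its precise type, a Long => Long function can use the specialized
// apply method and stay unboxed; through the erased interface every call
// boxes the argument and the result.
val f: Long => Long = _ + 1L
val g = f.asInstanceOf[Any => Any]

f(1L) // may call the specialized apply$mcJJ$sp(long): no boxing
g(1L) // calls apply(Object): 1L is boxed to java.lang.Long, result boxed too
```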
[GitHub] spark pull request #13909: [SPARK-16213][SQL] Reduce runtime overhead of a p...
Github user viirya commented on a diff in the pull request: https://github.com/apache/spark/pull/13909#discussion_r93814750 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala --- @@ -58,77 +58,126 @@ case class CreateArray(children: Seq[Expression]) extends Expression { } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { -val array = ctx.freshName("array") - val et = dataType.elementType val evals = children.map(e => e.genCode(ctx)) val isPrimitiveArray = ctx.isPrimitiveType(et) -val (preprocess, arrayData) = - GenArrayData.getCodeArrayData(ctx, et, children.size, isPrimitiveArray, array) - -val assigns = if (isPrimitiveArray) { - val primitiveTypeName = ctx.primitiveTypeName(et) - evals.zipWithIndex.map { case (eval, i) => -eval.code + s""" - if (${eval.isNull}) { - $arrayData.setNullAt($i); - } else { - $arrayData.set$primitiveTypeName($i, ${eval.value}); - } - """ - } -} else { - evals.zipWithIndex.map { case (eval, i) => -eval.code + s""" - if (${eval.isNull}) { - $array[$i] = null; - } else { - $array[$i] = ${eval.value}; - } - """ - } -} -ev.copy(code = - preprocess + - ctx.splitExpressions(ctx.INPUT_ROW, assigns) + - s"\nfinal ArrayData ${ev.value} = $arrayData;\n", +val (preprocess, postprocess, arrayData, array) = + GenArrayData.genCodeToCreateArrayData(ctx, et, children.size, isPrimitiveArray) +val assigns = GenArrayData.genCodeToAssignArrayElements( + ctx, evals, et, isPrimitiveArray, arrayData, array, true) +/* + TODO: When we generate simpler code, we have to solve the following exception +https://github.com/apache/spark/pull/13909/files#r93813725 + ev.copy( +code = preprocess + ctx.splitExpressions(ctx.INPUT_ROW, assigns) + postprocess +value = arrayData, +isNull = "false") +*/ +ev.copy( + code = preprocess + ctx.splitExpressions(ctx.INPUT_ROW, assigns) + postprocess + +s"\nfinal ArrayData ${ev.value} = $arrayData;\n", isNull = "false") } override def prettyName: String = "array" } private [sql] object GenArrayData { - // This function returns Java code pieces based on DataType and isPrimitive - // for allocation of ArrayData class - def getCodeArrayData( + /** + * Return Java code pieces based on DataType and isPrimitive to allocate ArrayData class + * + * @param ctx a [[CodegenContext]] + * @param elementType data type of an underlying array + * @param numElements the number of array elements + * @param isPrimitive Are all of the elements of an underlying array primitive type + * @return (code pre-assignments, code post-assignments, underlying array name, arrayData name) --- End diff -- You return `(..., ..., arrayDataName, arrayName)` actually.
[GitHub] spark pull request #16391: [SPARK-18990][SQL] make DatasetBenchmark fairer f...
Github user kiszk commented on a diff in the pull request: https://github.com/apache/spark/pull/16391#discussion_r93814744 --- Diff: sql/core/src/test/scala/org/apache/spark/sql/DatasetBenchmark.scala --- @@ -133,24 +134,29 @@ object DatasetBenchmark { def aggregate(spark: SparkSession, numRows: Long): Benchmark = { import spark.implicits._ -val df = spark.range(1, numRows).select($"id".as("l"), $"id".cast(StringType).as("s")) +val rdd = spark.sparkContext.range(0, numRows) +val ds = spark.range(0, numRows) +val df = ds.toDF("l") + val benchmark = new Benchmark("aggregate", numRows) -val rdd = spark.sparkContext.range(1, numRows).map(l => Data(l, l.toString)) benchmark.addCase("RDD sum") { iter => - rdd.aggregate(0L)(_ + _.l, _ + _) + rdd.map(l => (l % 10, l)).reduceByKey(_ + _).foreach(_ => Unit) --- End diff -- I see.
[GitHub] spark pull request #13909: [SPARK-16213][SQL] Reduce runtime overhead of a p...
Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/13909#discussion_r93814713 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala --- @@ -56,33 +58,81 @@ case class CreateArray(children: Seq[Expression]) extends Expression { } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { -val arrayClass = classOf[GenericArrayData].getName -val values = ctx.freshName("values") -ctx.addMutableState("Object[]", values, s"this.$values = null;") - -ev.copy(code = s""" - this.$values = new Object[${children.size}];""" + - ctx.splitExpressions( -ctx.INPUT_ROW, -children.zipWithIndex.map { case (e, i) => - val eval = e.genCode(ctx) - eval.code + s""" -if (${eval.isNull}) { - $values[$i] = null; -} else { - $values[$i] = ${eval.value}; -} - """ -}) + - s""" -final ArrayData ${ev.value} = new $arrayClass($values); -this.$values = null; - """, isNull = "false") +val array = ctx.freshName("array") + +val et = dataType.elementType +val evals = children.map(e => e.genCode(ctx)) +val isPrimitiveArray = ctx.isPrimitiveType(et) +val (preprocess, arrayData) = + GenArrayData.getCodeArrayData(ctx, et, children.size, isPrimitiveArray, array) + +val assigns = if (isPrimitiveArray) { + val primitiveTypeName = ctx.primitiveTypeName(et) + evals.zipWithIndex.map { case (eval, i) => +eval.code + s""" + if (${eval.isNull}) { + $arrayData.setNullAt($i); + } else { + $arrayData.set$primitiveTypeName($i, ${eval.value}); + } + """ + } +} else { + evals.zipWithIndex.map { case (eval, i) => +eval.code + s""" + if (${eval.isNull}) { + $array[$i] = null; + } else { + $array[$i] = ${eval.value}; + } + """ + } +} +ev.copy(code = --- End diff -- let's not bother about it and leave the code as it was
[GitHub] spark issue #13909: [SPARK-16213][SQL] Reduce runtime overhead of a program ...
Github user SparkQA commented on the issue: https://github.com/apache/spark/pull/13909 **[Test build #70568 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/70568/testReport)** for PR 13909 at commit [`327c8ac`](https://github.com/apache/spark/commit/327c8acc3045b3a96b893ad221e8379d0403b3a9).
[GitHub] spark pull request #13909: [SPARK-16213][SQL] Reduce runtime overhead of a p...
Github user kiszk commented on a diff in the pull request: https://github.com/apache/spark/pull/13909#discussion_r93814699 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala --- @@ -56,33 +58,81 @@ case class CreateArray(children: Seq[Expression]) extends Expression { } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { -val arrayClass = classOf[GenericArrayData].getName -val values = ctx.freshName("values") -ctx.addMutableState("Object[]", values, s"this.$values = null;") - -ev.copy(code = s""" - this.$values = new Object[${children.size}];""" + - ctx.splitExpressions( -ctx.INPUT_ROW, -children.zipWithIndex.map { case (e, i) => - val eval = e.genCode(ctx) - eval.code + s""" -if (${eval.isNull}) { - $values[$i] = null; -} else { - $values[$i] = ${eval.value}; -} - """ -}) + - s""" -final ArrayData ${ev.value} = new $arrayClass($values); -this.$values = null; - """, isNull = "false") +val array = ctx.freshName("array") + +val et = dataType.elementType +val evals = children.map(e => e.genCode(ctx)) +val isPrimitiveArray = ctx.isPrimitiveType(et) +val (preprocess, arrayData) = + GenArrayData.getCodeArrayData(ctx, et, children.size, isPrimitiveArray, array) + +val assigns = if (isPrimitiveArray) { + val primitiveTypeName = ctx.primitiveTypeName(et) + evals.zipWithIndex.map { case (eval, i) => +eval.code + s""" + if (${eval.isNull}) { + $arrayData.setNullAt($i); + } else { + $arrayData.set$primitiveTypeName($i, ${eval.value}); + } + """ + } +} else { + evals.zipWithIndex.map { case (eval, i) => +eval.code + s""" + if (${eval.isNull}) { + $array[$i] = null; + } else { + $array[$i] = ${eval.value}; + } + """ + } +} +ev.copy(code = --- End diff -- For now, I did not apply this change; I left a TODO comment instead.
[GitHub] spark issue #16233: [SPARK-18801][SQL] Add `View` operator to help resolve a...
Github user gatorsmile commented on the issue: https://github.com/apache/spark/pull/16233 I also like the last option, which is based on local states/variables, especially when we support `spark.sql.hive.thriftServer.singleSession`. Here, the disadvantages of session/analyzer-scoped variables/states outweigh the convenience.
[GitHub] spark pull request #13909: [SPARK-16213][SQL] Reduce runtime overhead of a p...
Github user kiszk commented on a diff in the pull request: https://github.com/apache/spark/pull/13909#discussion_r93814696 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala --- @@ -56,33 +58,81 @@ case class CreateArray(children: Seq[Expression]) extends Expression { } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { -val arrayClass = classOf[GenericArrayData].getName -val values = ctx.freshName("values") -ctx.addMutableState("Object[]", values, s"this.$values = null;") - -ev.copy(code = s""" - this.$values = new Object[${children.size}];""" + - ctx.splitExpressions( -ctx.INPUT_ROW, -children.zipWithIndex.map { case (e, i) => - val eval = e.genCode(ctx) - eval.code + s""" -if (${eval.isNull}) { - $values[$i] = null; -} else { - $values[$i] = ${eval.value}; -} - """ -}) + - s""" -final ArrayData ${ev.value} = new $arrayClass($values); -this.$values = null; - """, isNull = "false") +val array = ctx.freshName("array") + +val et = dataType.elementType +val evals = children.map(e => e.genCode(ctx)) +val isPrimitiveArray = ctx.isPrimitiveType(et) +val (preprocess, arrayData) = + GenArrayData.getCodeArrayData(ctx, et, children.size, isPrimitiveArray, array) + +val assigns = if (isPrimitiveArray) { + val primitiveTypeName = ctx.primitiveTypeName(et) + evals.zipWithIndex.map { case (eval, i) => +eval.code + s""" + if (${eval.isNull}) { + $arrayData.setNullAt($i); + } else { + $arrayData.set$primitiveTypeName($i, ${eval.value}); + } + """ + } +} else { + evals.zipWithIndex.map { case (eval, i) => +eval.code + s""" + if (${eval.isNull}) { + $array[$i] = null; + } else { + $array[$i] = ${eval.value}; + } + """ + } +} +ev.copy(code = --- End diff -- @cloud-fan I love this idea. However, when I implemented it, I hit the following problem: Janino throws an exception. IMHO, [this part](https://github.com/janino-compiler/janino/blob/janino_3.0.0/janino/src/org/codehaus/janino/UnitCompiler.java#L4331-L4348) should optimize this case without throwing an exception. We have some options: 1. remove the `if (... instanceof ...)` for projection in Spark 2. submit a PR to Janino so it does not throw this exception, and wait with this change until a new Janino containing that PR is available 3. submit a PR to Janino so it does not throw this exception, and postpone this change until later 4. others. What do you think? ```java ... /* 037 */ UTF8String value7 = (UTF8String) obj5; /* 038 */ if (false) { /* 039 */ array1[0] = null; /* 040 */ } else { /* 041 */ array1[0] = value7; /* 042 */ } ... /* 068 */ Object obj9 = ((Expression) references[9]).eval(null); /* 069 */ UTF8String value11 = (UTF8String) obj9; /* 070 */ if (false) { /* 071 */ array1[4] = null; /* 072 */ } else { /* 073 */ array1[4] = value11; /* 074 */ } /* 075 */ org.apache.spark.sql.catalyst.util.GenericArrayData arrayData1 = new org.apache.spark.sql.catalyst.util.GenericArrayData(array1); /* 076 */ // Remember the current cursor so that we can calculate how many bytes are /* 077 */ // written later. /* 078 */ final int tmpCursor2 = holder.cursor; /* 079 */ /* 080 */ if (arrayData1 instanceof UnsafeArrayData) { /* 081 */ /* 082 */ final int sizeInBytes1 = ((UnsafeArrayData) arrayData1).getSizeInBytes(); /* 083 */ // grow the global buffer before writing data. /* 084 */ holder.grow(sizeInBytes1); /* 085 */ ((UnsafeArrayData) arrayData1).writeToMemory(holder.buffer, holder.cursor); /* 086 */ holder.cursor += sizeInBytes1; /* 087 */ /* 088 */ } else { ...
org.codehaus.commons.compiler.CompileException: File 'generated.java', Line 80, Column 26: "org.apache.spark.sql.catalyst.util.GenericArrayData" can never be an instance of "org.apache.spark.sql.catalyst.expressions.UnsafeArrayData" at org.codehaus.janino.UnitCompiler.compileError(UnitCompiler.java:11004) at org.codehaus.janino.UnitCompiler.compileGet2(UnitCompiler.java:4345) at org.codehaus.janino.UnitCompiler.access$7400(UnitCompiler.java:206) at org.codehaus.janino.UnitCompiler$12.visitInstanceof(UnitCompiler.java:3773)
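A hypothetical sketch of option 1: instead of emitting a runtime `instanceof` test that Janino can prove is always false, the writer could branch at code-generation time when the input's concrete class is statically known (the names here are illustrative, not Spark's actual codegen):

```scala
// Decide at codegen time which path to emit; the generated Java then
// contains no instanceof test for Janino to reject.
val writeArrayCode =
  if (inputIsStaticallyUnsafe) {
    s"""
       |final int sizeInBytes = $input.getSizeInBytes();
       |holder.grow(sizeInBytes);
       |$input.writeToMemory(holder.buffer, holder.cursor);
       |holder.cursor += sizeInBytes;
     """.stripMargin
  } else {
    s"// fall back to the element-by-element ArrayWriter loop"
  }
```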
[GitHub] spark pull request #13909: [SPARK-16213][SQL] Reduce runtime overhead of a p...
Github user kiszk commented on a diff in the pull request: https://github.com/apache/spark/pull/13909#discussion_r93814681 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala --- @@ -56,33 +58,81 @@ case class CreateArray(children: Seq[Expression]) extends Expression { } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { -val arrayClass = classOf[GenericArrayData].getName -val values = ctx.freshName("values") -ctx.addMutableState("Object[]", values, s"this.$values = null;") - -ev.copy(code = s""" - this.$values = new Object[${children.size}];""" + - ctx.splitExpressions( -ctx.INPUT_ROW, -children.zipWithIndex.map { case (e, i) => - val eval = e.genCode(ctx) - eval.code + s""" -if (${eval.isNull}) { - $values[$i] = null; -} else { - $values[$i] = ${eval.value}; -} - """ -}) + - s""" -final ArrayData ${ev.value} = new $arrayClass($values); -this.$values = null; - """, isNull = "false") +val array = ctx.freshName("array") + +val et = dataType.elementType +val evals = children.map(e => e.genCode(ctx)) +val isPrimitiveArray = ctx.isPrimitiveType(et) +val (preprocess, arrayData) = + GenArrayData.getCodeArrayData(ctx, et, children.size, isPrimitiveArray, array) + +val assigns = if (isPrimitiveArray) { + val primitiveTypeName = ctx.primitiveTypeName(et) + evals.zipWithIndex.map { case (eval, i) => +eval.code + s""" + if (${eval.isNull}) { + $arrayData.setNullAt($i); + } else { + $arrayData.set$primitiveTypeName($i, ${eval.value}); + } + """ + } +} else { + evals.zipWithIndex.map { case (eval, i) => +eval.code + s""" + if (${eval.isNull}) { + $array[$i] = null; + } else { + $array[$i] = ${eval.value}; + } + """ + } +} +ev.copy(code = --- End diff -- For now, I did not apply this change; I am leaving a TODO comment instead.
[GitHub] spark issue #16392: [SPARK-18992] [SQL] Move spark.sql.hive.thriftServer.sin...
Github user SparkQA commented on the issue: https://github.com/apache/spark/pull/16392 **[Test build #70567 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/70567/testReport)** for PR 16392 at commit [`5221494`](https://github.com/apache/spark/commit/52214945fdc7705f65ce6522c2d05b1a79e69c78).
[GitHub] spark issue #16392: [SPARK-18992] [SQL] Move spark.sql.hive.thriftServer.sin...
Github user gatorsmile commented on the issue: https://github.com/apache/spark/pull/16392

retest this please
[GitHub] spark issue #16351: [SPARK-18943][SQL] Avoid per-record type dispatch in CSV...
Github user SparkQA commented on the issue: https://github.com/apache/spark/pull/16351

**[Test build #70566 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/70566/testReport)** for PR 16351 at commit [`a036b7f`](https://github.com/apache/spark/commit/a036b7f6e3b8ef81afdc3e817457feb06b628cf4).
[GitHub] spark pull request #16351: [SPARK-18943][SQL] Avoid per-record type dispatch...
Github user HyukjinKwon commented on a diff in the pull request: https://github.com/apache/spark/pull/16351#discussion_r93814412
--- Diff: sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVTypeCastSuite.scala ---
@@ -66,144 +66,138 @@ class CSVTypeCastSuite extends SparkFunSuite {
   }

   test("Nullable types are handled") {
-    assertNull(
-      CSVTypeCast.castTo("-", "_1", ByteType, nullable = true, CSVOptions("nullValue", "-")))
-    assertNull(
-      CSVTypeCast.castTo("-", "_1", ShortType, nullable = true, CSVOptions("nullValue", "-")))
-    assertNull(
-      CSVTypeCast.castTo("-", "_1", IntegerType, nullable = true, CSVOptions("nullValue", "-")))
-    assertNull(
-      CSVTypeCast.castTo("-", "_1", LongType, nullable = true, CSVOptions("nullValue", "-")))
-    assertNull(
-      CSVTypeCast.castTo("-", "_1", FloatType, nullable = true, CSVOptions("nullValue", "-")))
-    assertNull(
-      CSVTypeCast.castTo("-", "_1", DoubleType, nullable = true, CSVOptions("nullValue", "-")))
-    assertNull(
-      CSVTypeCast.castTo("-", "_1", BooleanType, nullable = true, CSVOptions("nullValue", "-")))
-    assertNull(
-      CSVTypeCast.castTo("-", "_1", DecimalType.DoubleDecimal, true, CSVOptions("nullValue", "-")))
-    assertNull(
-      CSVTypeCast.castTo("-", "_1", TimestampType, nullable = true, CSVOptions("nullValue", "-")))
-    assertNull(
-      CSVTypeCast.castTo("-", "_1", DateType, nullable = true, CSVOptions("nullValue", "-")))
-    assertNull(
-      CSVTypeCast.castTo("-", "_1", StringType, nullable = true, CSVOptions("nullValue", "-")))
-    assertNull(
-      CSVTypeCast.castTo(null, "_1", IntegerType, nullable = true, CSVOptions("nullValue", "-")))
-
-    // casting a null to a non-nullable field should throw an exception.
-    var message = intercept[RuntimeException] {
-      CSVTypeCast.castTo(null, "_1", IntegerType, nullable = false, CSVOptions("nullValue", "-"))
-    }.getMessage
-    assert(message.contains("null value found but field _1 is not nullable."))
-
-    message = intercept[RuntimeException] {
-      CSVTypeCast.castTo("-", "_1", StringType, nullable = false, CSVOptions("nullValue", "-"))
-    }.getMessage
-    assert(message.contains("null value found but field _1 is not nullable."))
-  }
-
-  test("String type should also respect `nullValue`") {
--- End diff --

Some tests in ``String type should also respect `nullValue` `` were duplicates, and the others were folded into `Nullable types are handled`.
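The consolidation described above amounts to iterating over the types once instead of hand-writing one `assertNull` block per type. A sketch of that folded form, reusing the names from the test file quoted in the diff; the exact shape of the merged test may differ, and `assertNull` is inlined here so the snippet stands alone:

```scala
// Sketch of the folded test body. CSVTypeCast and CSVOptions are private to
// this package, which is why the original suite (and this sketch) live in it.
package org.apache.spark.sql.execution.datasources.csv

import org.apache.spark.sql.types._

object NullableTypesSketch {
  def assertNull(v: Any): Unit = assert(v == null)

  def main(args: Array[String]): Unit = {
    val types: Seq[DataType] = Seq(
      ByteType, ShortType, IntegerType, LongType, FloatType, DoubleType,
      BooleanType, DecimalType.DoubleDecimal, TimestampType, DateType, StringType)

    // One pass over all nullable types instead of one hand-written block each.
    types.foreach { t =>
      assertNull(
        CSVTypeCast.castTo("-", "_1", t, nullable = true, CSVOptions("nullValue", "-")))
    }
  }
}
```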
[GitHub] spark issue #16383: [SPARK-18980][SQL] implement Aggregator with TypedImpera...
Github user viirya commented on the issue: https://github.com/apache/spark/pull/16383

LGTM. This is a cool improvement.
[GitHub] spark pull request #16383: [SPARK-18980][SQL] implement Aggregator with Type...
Github user viirya commented on a diff in the pull request: https://github.com/apache/spark/pull/16383#discussion_r93814369
--- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TypedAggregateExpression.scala ---
@@ -143,15 +197,96 @@ case class TypedAggregateExpression(
     }
   }

-  override def toString: String = {
-    val input = inputDeserializer match {
-      case Some(UnresolvedDeserializer(deserializer, _)) => deserializer.dataType.simpleString
-      case Some(deserializer) => deserializer.dataType.simpleString
-      case _ => "unknown"
+  override def withInputInfo(
+      deser: Expression,
+      cls: Class[_],
+      schema: StructType): TypedAggregateExpression = {
+    copy(inputDeserializer = Some(deser), inputClass = Some(cls), inputSchema = Some(schema))
+  }
+}
+
+case class ComplexTypedAggregateExpression(
+    aggregator: Aggregator[Any, Any, Any],
+    inputDeserializer: Option[Expression],
+    inputClass: Option[Class[_]],
+    inputSchema: Option[StructType],
+    bufferSerializer: Seq[NamedExpression],
+    bufferDeserializer: Expression,
+    outputSerializer: Seq[Expression],
+    dataType: DataType,
+    nullable: Boolean,
+    mutableAggBufferOffset: Int = 0,
+    inputAggBufferOffset: Int = 0)
+  extends TypedImperativeAggregate[Any] with TypedAggregateExpression with NonSQLExpression {
+
+  override def deterministic: Boolean = true
+
+  override def children: Seq[Expression] = inputDeserializer.toSeq
+
+  override lazy val resolved: Boolean = inputDeserializer.isDefined && childrenResolved
+
+  override def references: AttributeSet = AttributeSet(inputDeserializer.toSeq)
+
+  override def createAggregationBuffer(): Any = aggregator.zero
+
+  private lazy val inputRowToObj = GenerateSafeProjection.generate(inputDeserializer.get :: Nil)
+
+  override def update(buffer: Any, input: InternalRow): Any = {
+    val inputObj = inputRowToObj(input).get(0, ObjectType(classOf[Any]))
+    if (inputObj != null) {
+      aggregator.reduce(buffer, inputObj)
+    } else {
+      buffer
+    }
+  }
+
+  override def merge(buffer: Any, input: Any): Any = {
+    aggregator.merge(buffer, input)
+  }
+
+  private lazy val resultObjToRow = dataType match {
+    case _: StructType =>
+      UnsafeProjection.create(CreateStruct(outputSerializer))
+    case _ =>
+      assert(outputSerializer.length == 1)
+      UnsafeProjection.create(outputSerializer.head)
+  }
+
+  override def eval(buffer: Any): Any = {
+    val resultObj = aggregator.finish(buffer)
+    if (resultObj == null) {
+      null
+    } else {
+      resultObjToRow(InternalRow(resultObj)).get(0, dataType)
     }
+  }

-    s"$nodeName($input)"
+  private lazy val bufferObjToRow = UnsafeProjection.create(bufferSerializer)
+
+  override def serialize(buffer: Any): Array[Byte] = {
+    bufferObjToRow(InternalRow(buffer)).getBytes
   }

-  override def nodeName: String = aggregator.getClass.getSimpleName.stripSuffix("$")
+  private lazy val bufferRow = new UnsafeRow(bufferSerializer.length)
+  private lazy val bufferRowToObject = GenerateSafeProjection.generate(bufferDeserializer :: Nil)
+
+  override def deserialize(storageFormat: Array[Byte]): Any = {
+    bufferRow.pointTo(storageFormat, storageFormat.length)
+    bufferRowToObject(bufferRow).get(0, ObjectType(classOf[Any]))
+  }
+
+  override def withNewMutableAggBufferOffset(
+      newMutableAggBufferOffset: Int): ComplexTypedAggregateExpression =
+    copy(mutableAggBufferOffset = newMutableAggBufferOffset)
+
+  override def withNewInputAggBufferOffset(
+      newInputAggBufferOffset: Int): ComplexTypedAggregateExpression =
+    copy(inputAggBufferOffset = newInputAggBufferOffset)
+
+  override def withInputInfo(
--- End diff --

Oh, right. Never mind.
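For readers who have not followed SPARK-18980: the point of `ComplexTypedAggregateExpression` is to let an `Aggregator` keep an arbitrary object as its buffer, round-tripping it through the `serialize`/`deserialize` pair above between partial aggregations. A minimal aggregator of that kind, written against the public API; the aggregator itself is illustrative and is not taken from the PR:

```scala
import org.apache.spark.sql.Encoder
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.expressions.Aggregator

// Keeps the three smallest values seen. The Seq[Int] buffer has no fixed-width
// mutable-row representation, so each partial result goes through the buffer
// serializer/deserializer pair shown in the diff above.
object SmallestThree extends Aggregator[Int, Seq[Int], Seq[Int]] {
  def zero: Seq[Int] = Nil
  def reduce(buf: Seq[Int], a: Int): Seq[Int] = (a +: buf).sorted.take(3)
  def merge(b1: Seq[Int], b2: Seq[Int]): Seq[Int] = (b1 ++ b2).sorted.take(3)
  def finish(buf: Seq[Int]): Seq[Int] = buf
  def bufferEncoder: Encoder[Seq[Int]] = ExpressionEncoder()
  def outputEncoder: Encoder[Seq[Int]] = ExpressionEncoder()
}

// Usage, assuming a SparkSession named `spark`:
//   import spark.implicits._
//   Seq(5, 1, 4, 2, 3).toDS().select(SmallestThree.toColumn).show()
```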