This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new a7a02a8 [SPARK-28392][SQL][TESTS] Add traits for UDF and PostgreSQL tests to share initialization a7a02a8 is described below commit a7a02a86adafd3808051d843cf7e70176a7c4099 Author: HyukjinKwon <gurwls...@apache.org> AuthorDate: Mon Jul 15 16:20:09 2019 +0900 [SPARK-28392][SQL][TESTS] Add traits for UDF and PostgreSQL tests to share initialization ## What changes were proposed in this pull request? This PR adds some traits so that we can deduplicate initialization stuff for each type of test case. For instance, see [SPARK-28343](https://issues.apache.org/jira/browse/SPARK-28343). It's a little bit overkill but I think it will make adding test cases easier and cause less confusions. This PR adds both: ``` private trait PgSQLTest private trait UDFTest ``` To indicate and share the logics related to each combination of test types. ## How was this patch tested? Manually tested. Closes #25155 from HyukjinKwon/SPARK-28392. Authored-by: HyukjinKwon <gurwls...@apache.org> Signed-off-by: HyukjinKwon <gurwls...@apache.org> --- .../sql-tests/inputs/udf/pgSQL/udf-case.sql | 5 - .../sql-tests/results/udf/pgSQL/udf-case.sql.out | 190 ++++++++++----------- .../org/apache/spark/sql/SQLQueryTestSuite.scala | 56 ++++-- 3 files changed, 129 insertions(+), 122 deletions(-) diff --git a/sql/core/src/test/resources/sql-tests/inputs/udf/pgSQL/udf-case.sql b/sql/core/src/test/resources/sql-tests/inputs/udf/pgSQL/udf-case.sql index b05c21d..a2aab79 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/udf/pgSQL/udf-case.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/udf/pgSQL/udf-case.sql @@ -6,14 +6,10 @@ -- https://github.com/postgres/postgres/blob/REL_12_BETA2/src/test/regress/sql/case.sql -- Test the CASE statement -- --- This test suite contains two Cartesian products without using explicit CROSS JOIN syntax. --- Thus, we set spark.sql.crossJoin.enabled to true. - -- This test file was converted from pgSQL/case.sql. -- Note that currently registered UDF returns a string. So there are some differences, for instance -- in string cast within UDF in Scala and Python. -set spark.sql.crossJoin.enabled=true; CREATE TABLE CASE_TBL ( i integer, f double @@ -269,4 +265,3 @@ SELECT CASE DROP TABLE CASE_TBL; DROP TABLE CASE2_TBL; -set spark.sql.crossJoin.enabled=false; diff --git a/sql/core/src/test/resources/sql-tests/results/udf/pgSQL/udf-case.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/pgSQL/udf-case.sql.out index 55bef64..6bb7a78 100644 --- a/sql/core/src/test/resources/sql-tests/results/udf/pgSQL/udf-case.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/udf/pgSQL/udf-case.sql.out @@ -1,19 +1,22 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 37 +-- Number of queries: 35 -- !query 0 -set spark.sql.crossJoin.enabled=true +CREATE TABLE CASE_TBL ( + i integer, + f double +) USING parquet -- !query 0 schema -struct<key:string,value:string> +struct<> -- !query 0 output -spark.sql.crossJoin.enabled true + -- !query 1 -CREATE TABLE CASE_TBL ( +CREATE TABLE CASE2_TBL ( i integer, - f double + j integer ) USING parquet -- !query 1 schema struct<> @@ -22,10 +25,7 @@ struct<> -- !query 2 -CREATE TABLE CASE2_TBL ( - i integer, - j integer -) USING parquet +INSERT INTO CASE_TBL VALUES (1, 10.1) -- !query 2 schema struct<> -- !query 2 output @@ -33,7 +33,7 @@ struct<> -- !query 3 -INSERT INTO CASE_TBL VALUES (1, 10.1) +INSERT INTO CASE_TBL VALUES (2, 20.2) -- !query 3 schema struct<> -- !query 3 output @@ -41,7 +41,7 @@ struct<> -- !query 4 -INSERT INTO CASE_TBL VALUES (2, 20.2) +INSERT INTO CASE_TBL VALUES (3, -30.3) -- !query 4 schema struct<> -- !query 4 output @@ -49,7 +49,7 @@ struct<> -- !query 5 -INSERT INTO CASE_TBL VALUES (3, -30.3) +INSERT INTO CASE_TBL VALUES (4, NULL) -- !query 5 schema struct<> -- !query 5 output @@ -57,7 +57,7 @@ struct<> -- !query 6 -INSERT INTO CASE_TBL VALUES (4, NULL) +INSERT INTO CASE2_TBL VALUES (1, -1) -- !query 6 schema struct<> -- !query 6 output @@ -65,7 +65,7 @@ struct<> -- !query 7 -INSERT INTO CASE2_TBL VALUES (1, -1) +INSERT INTO CASE2_TBL VALUES (2, -2) -- !query 7 schema struct<> -- !query 7 output @@ -73,7 +73,7 @@ struct<> -- !query 8 -INSERT INTO CASE2_TBL VALUES (2, -2) +INSERT INTO CASE2_TBL VALUES (3, -3) -- !query 8 schema struct<> -- !query 8 output @@ -81,7 +81,7 @@ struct<> -- !query 9 -INSERT INTO CASE2_TBL VALUES (3, -3) +INSERT INTO CASE2_TBL VALUES (2, -4) -- !query 9 schema struct<> -- !query 9 output @@ -89,7 +89,7 @@ struct<> -- !query 10 -INSERT INTO CASE2_TBL VALUES (2, -4) +INSERT INTO CASE2_TBL VALUES (1, NULL) -- !query 10 schema struct<> -- !query 10 output @@ -97,7 +97,7 @@ struct<> -- !query 11 -INSERT INTO CASE2_TBL VALUES (1, NULL) +INSERT INTO CASE2_TBL VALUES (NULL, -6) -- !query 11 schema struct<> -- !query 11 output @@ -105,148 +105,140 @@ struct<> -- !query 12 -INSERT INTO CASE2_TBL VALUES (NULL, -6) --- !query 12 schema -struct<> --- !query 12 output - - - --- !query 13 SELECT '3' AS `One`, CASE WHEN CAST(udf(1 < 2) AS boolean) THEN 3 END AS `Simple WHEN` --- !query 13 schema +-- !query 12 schema struct<One:string,Simple WHEN:int> --- !query 13 output +-- !query 12 output 3 3 --- !query 14 +-- !query 13 SELECT '<NULL>' AS `One`, CASE WHEN 1 > 2 THEN udf(3) END AS `Simple default` --- !query 14 schema +-- !query 13 schema struct<One:string,Simple default:string> --- !query 14 output +-- !query 13 output <NULL> NULL --- !query 15 +-- !query 14 SELECT '3' AS `One`, CASE WHEN udf(1) < 2 THEN udf(3) ELSE udf(4) END AS `Simple ELSE` --- !query 15 schema +-- !query 14 schema struct<One:string,Simple ELSE:string> --- !query 15 output +-- !query 14 output 3 3 --- !query 16 +-- !query 15 SELECT udf('4') AS `One`, CASE WHEN 1 > 2 THEN 3 ELSE 4 END AS `ELSE default` --- !query 16 schema +-- !query 15 schema struct<One:string,ELSE default:int> --- !query 16 output +-- !query 15 output 4 4 --- !query 17 +-- !query 16 SELECT udf('6') AS `One`, CASE WHEN CAST(udf(1 > 2) AS boolean) THEN 3 WHEN udf(4) < 5 THEN 6 ELSE 7 END AS `Two WHEN with default` --- !query 17 schema +-- !query 16 schema struct<One:string,Two WHEN with default:int> --- !query 17 output +-- !query 16 output 6 6 --- !query 18 +-- !query 17 SELECT '7' AS `None`, CASE WHEN rand() < udf(0) THEN 1 END AS `NULL on no matches` --- !query 18 schema +-- !query 17 schema struct<None:string,NULL on no matches:int> --- !query 18 output +-- !query 17 output 7 NULL --- !query 19 +-- !query 18 SELECT CASE WHEN CAST(udf(1=0) AS boolean) THEN 1/0 WHEN 1=1 THEN 1 ELSE 2/0 END --- !query 19 schema +-- !query 18 schema struct<CASE WHEN CAST(udf((1 = 0)) AS BOOLEAN) THEN (CAST(1 AS DOUBLE) / CAST(0 AS DOUBLE)) WHEN (1 = 1) THEN CAST(1 AS DOUBLE) ELSE (CAST(2 AS DOUBLE) / CAST(0 AS DOUBLE)) END:double> --- !query 19 output +-- !query 18 output 1.0 --- !query 20 +-- !query 19 SELECT CASE 1 WHEN 0 THEN 1/udf(0) WHEN 1 THEN 1 ELSE 2/0 END --- !query 20 schema +-- !query 19 schema struct<CASE WHEN (1 = 0) THEN (CAST(1 AS DOUBLE) / CAST(CAST(udf(0) AS DOUBLE) AS DOUBLE)) WHEN (1 = 1) THEN CAST(1 AS DOUBLE) ELSE (CAST(2 AS DOUBLE) / CAST(0 AS DOUBLE)) END:double> --- !query 20 output +-- !query 19 output 1.0 --- !query 21 +-- !query 20 SELECT CASE WHEN i > 100 THEN udf(1/0) ELSE udf(0) END FROM case_tbl --- !query 21 schema +-- !query 20 schema struct<CASE WHEN (i > 100) THEN udf((cast(1 as double) / cast(0 as double))) ELSE udf(0) END:string> --- !query 21 output +-- !query 20 output 0 0 0 0 --- !query 22 +-- !query 21 SELECT CASE 'a' WHEN 'a' THEN udf(1) ELSE udf(2) END --- !query 22 schema +-- !query 21 schema struct<CASE WHEN (a = a) THEN udf(1) ELSE udf(2) END:string> --- !query 22 output +-- !query 21 output 1 --- !query 23 +-- !query 22 SELECT '' AS `Five`, CASE WHEN i >= 3 THEN i END AS `>= 3 or Null` FROM CASE_TBL --- !query 23 schema +-- !query 22 schema struct<Five:string,>= 3 or Null:int> --- !query 23 output +-- !query 22 output 3 4 NULL NULL --- !query 24 +-- !query 23 SELECT '' AS `Five`, CASE WHEN i >= 3 THEN (i + i) ELSE i END AS `Simplest Math` FROM CASE_TBL --- !query 24 schema +-- !query 23 schema struct<Five:string,Simplest Math:int> --- !query 24 output +-- !query 23 output 1 2 6 8 --- !query 25 +-- !query 24 SELECT '' AS `Five`, i AS `Value`, CASE WHEN (i < 0) THEN 'small' WHEN (i = 0) THEN 'zero' @@ -255,16 +247,16 @@ SELECT '' AS `Five`, i AS `Value`, ELSE 'big' END AS `Category` FROM CASE_TBL --- !query 25 schema +-- !query 24 schema struct<Five:string,Value:int,Category:string> --- !query 25 output +-- !query 24 output 1 one 2 two 3 big 4 big --- !query 26 +-- !query 25 SELECT '' AS `Five`, CASE WHEN ((i < 0) or (i < 0)) THEN 'small' WHEN ((i = 0) or (i = 0)) THEN 'zero' @@ -273,37 +265,37 @@ SELECT '' AS `Five`, ELSE 'big' END AS `Category` FROM CASE_TBL --- !query 26 schema +-- !query 25 schema struct<Five:string,Category:string> --- !query 26 output +-- !query 25 output big big one two --- !query 27 +-- !query 26 SELECT * FROM CASE_TBL WHERE udf(COALESCE(f,i)) = 4 --- !query 27 schema +-- !query 26 schema struct<i:int,f:double> --- !query 27 output +-- !query 26 output 4 NULL --- !query 28 +-- !query 27 SELECT * FROM CASE_TBL WHERE udf(NULLIF(f,i)) = 2 --- !query 28 schema +-- !query 27 schema struct<i:int,f:double> --- !query 28 output +-- !query 27 output --- !query 29 +-- !query 28 SELECT udf(COALESCE(a.f, b.i, b.j)) FROM CASE_TBL a, CASE2_TBL b --- !query 29 schema +-- !query 28 schema struct<udf(coalesce(f, cast(i as double), cast(j as double))):string> --- !query 29 output +-- !query 28 output -30.3 -30.3 -30.3 @@ -330,24 +322,24 @@ struct<udf(coalesce(f, cast(i as double), cast(j as double))):string> 3.0 --- !query 30 +-- !query 29 SELECT * FROM CASE_TBL a, CASE2_TBL b WHERE udf(COALESCE(a.f, b.i, b.j)) = 2 --- !query 30 schema +-- !query 29 schema struct<i:int,f:double,i:int,j:int> --- !query 30 output +-- !query 29 output 4 NULL 2 -2 4 NULL 2 -4 --- !query 31 +-- !query 30 SELECT udf('') AS Five, NULLIF(a.i,b.i) AS `NULLIF(a.i,b.i)`, NULLIF(b.i, 4) AS `NULLIF(b.i,4)` FROM CASE_TBL a, CASE2_TBL b --- !query 31 schema +-- !query 30 schema struct<Five:string,NULLIF(a.i,b.i):int,NULLIF(b.i,4):int> --- !query 31 output +-- !query 30 output 1 2 1 2 1 3 @@ -374,18 +366,18 @@ struct<Five:string,NULLIF(a.i,b.i):int,NULLIF(b.i,4):int> NULL 3 --- !query 32 +-- !query 31 SELECT '' AS `Two`, * FROM CASE_TBL a, CASE2_TBL b WHERE CAST(udf(COALESCE(f,b.i) = 2) AS boolean) --- !query 32 schema +-- !query 31 schema struct<Two:string,i:int,f:double,i:int,j:int> --- !query 32 output +-- !query 31 output 4 NULL 2 -2 4 NULL 2 -4 --- !query 33 +-- !query 32 SELECT CASE (CASE vol('bar') WHEN udf('foo') THEN 'it was foo!' @@ -395,31 +387,23 @@ SELECT CASE WHEN udf('it was foo!') THEN 'foo recognized' WHEN 'it was bar!' THEN udf('bar recognized') ELSE 'unrecognized' END AS col --- !query 33 schema +-- !query 32 schema struct<col:string> --- !query 33 output +-- !query 32 output bar recognized --- !query 34 +-- !query 33 DROP TABLE CASE_TBL --- !query 34 schema +-- !query 33 schema struct<> --- !query 34 output +-- !query 33 output --- !query 35 +-- !query 34 DROP TABLE CASE2_TBL --- !query 35 schema +-- !query 34 schema struct<> --- !query 35 output - - +-- !query 34 output --- !query 36 -set spark.sql.crossJoin.enabled=false --- !query 36 schema -struct<key:string,value:string> --- !query 36 output -spark.sql.crossJoin.enabled false diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala index 6e40fcf..1085f03 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala @@ -151,17 +151,37 @@ class SQLQueryTestSuite extends QueryTest with SharedSQLContext { val resultFile: String } + /** + * traits that indicate UDF or PgSQL to trigger the code path specific to each. For instance, + * PgSQL tests require to register some UDF functions. + */ + private trait PgSQLTest + + private trait UDFTest { + val udf: TestUDF + } + /** A regular test case. */ private case class RegularTestCase( name: String, inputFile: String, resultFile: String) extends TestCase /** A PostgreSQL test case. */ private case class PgSQLTestCase( - name: String, inputFile: String, resultFile: String) extends TestCase + name: String, inputFile: String, resultFile: String) extends TestCase with PgSQLTest /** A UDF test case. */ private case class UDFTestCase( - name: String, inputFile: String, resultFile: String, udf: TestUDF) extends TestCase + name: String, + inputFile: String, + resultFile: String, + udf: TestUDF) extends TestCase with UDFTest + + /** A UDF PostgreSQL test case. */ + private case class UDFPgSQLTestCase( + name: String, + inputFile: String, + resultFile: String, + udf: TestUDF) extends TestCase with UDFTest with PgSQLTest private def createScalaTestCase(testCase: TestCase): Unit = { if (blackList.exists(t => @@ -169,12 +189,14 @@ class SQLQueryTestSuite extends QueryTest with SharedSQLContext { // Create a test case to ignore this case. ignore(testCase.name) { /* Do nothing */ } } else testCase match { - case UDFTestCase(_, _, _, udf: TestPythonUDF) if !shouldTestPythonUDFs => + case udfTestCase: UDFTest + if udfTestCase.udf.isInstanceOf[TestPythonUDF] && !shouldTestPythonUDFs => ignore(s"${testCase.name} is skipped because " + s"[$pythonExec] and/or pyspark were not available.") { /* Do nothing */ } - case UDFTestCase(_, _, _, udf: TestScalarPandasUDF) if !shouldTestScalarPandasUDFs => + case udfTestCase: UDFTest + if udfTestCase.udf.isInstanceOf[TestScalarPandasUDF] && !shouldTestScalarPandasUDFs => ignore(s"${testCase.name} is skipped because pyspark," + s"pandas and/or pyarrow were not available in [$pythonExec].") { /* Do nothing */ @@ -254,12 +276,15 @@ class SQLQueryTestSuite extends QueryTest with SharedSQLContext { // This does not isolate catalog changes. val localSparkSession = spark.newSession() loadTestData(localSparkSession) + testCase match { - case udfTestCase: UDFTestCase => - // vol used by udf-case.sql. - localSparkSession.udf.register("vol", (s: String) => s) + case udfTestCase: UDFTest => registerTestUDF(udfTestCase.udf, localSparkSession) - case _: PgSQLTestCase => + case _ => + } + + testCase match { + case _: PgSQLTest => // booleq/boolne used by boolean.sql localSparkSession.udf.register("booleq", (b1: Boolean, b2: Boolean) => b1 == b2) localSparkSession.udf.register("boolne", (b1: Boolean, b2: Boolean) => b1 != b2) @@ -268,7 +293,7 @@ class SQLQueryTestSuite extends QueryTest with SharedSQLContext { // PostgreSQL enabled cartesian product by default. localSparkSession.conf.set(SQLConf.CROSS_JOINS_ENABLED.key, true) localSparkSession.conf.set(SQLConf.ANSI_SQL_PARSER.key, true) - case _ => // Don't add UDFs in Regular tests. + case _ => } if (configSet.isDefined) { @@ -388,13 +413,16 @@ class SQLQueryTestSuite extends QueryTest with SharedSQLContext { val absPath = file.getAbsolutePath val testCaseName = absPath.stripPrefix(inputFilePath).stripPrefix(File.separator) - if (file.getAbsolutePath.startsWith(s"$inputFilePath${File.separator}udf")) { + if (file.getAbsolutePath.startsWith( + s"$inputFilePath${File.separator}udf${File.separator}pgSQL")) { + Seq(TestScalaUDF("udf"), TestPythonUDF("udf"), TestScalarPandasUDF("udf")).map { udf => + UDFPgSQLTestCase( + s"$testCaseName - ${udf.prettyName}", absPath, resultFile, udf) + } + } else if (file.getAbsolutePath.startsWith(s"$inputFilePath${File.separator}udf")) { Seq(TestScalaUDF("udf"), TestPythonUDF("udf"), TestScalarPandasUDF("udf")).map { udf => UDFTestCase( - s"$testCaseName - ${udf.prettyName}", - absPath, - resultFile, - udf) + s"$testCaseName - ${udf.prettyName}", absPath, resultFile, udf) } } else if (file.getAbsolutePath.startsWith(s"$inputFilePath${File.separator}pgSQL")) { PgSQLTestCase(testCaseName, absPath, resultFile) :: Nil --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org