[GitHub] spark pull request #19060: [WIP][SQL] Add DataSourceSuite validating data so...

gatorsmile Tue, 05 Sep 2017 11:10:54 -0700

Github user gatorsmile commented on a diff in the pull request:

    https://github.com/apache/spark/pull/19060#discussion_r137071948
  
    --- Diff: 
sql/hive/src/test/scala/org/apache/spark/sql/sources/DataSourceSuite.scala ---
    @@ -0,0 +1,125 @@
    +/*
    + * Licensed to the Apache Software Foundation (ASF) under one or more
    + * contributor license agreements.  See the NOTICE file distributed with
    + * this work for additional information regarding copyright ownership.
    + * The ASF licenses this file to You under the Apache License, Version 2.0
    + * (the "License"); you may not use this file except in compliance with
    + * the License.  You may obtain a copy of the License at
    + *
    + *    http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +
    +package org.apache.spark.sql.sources
    +
    +import java.sql.{Date, Timestamp}
    +
    +import org.apache.orc.OrcConf
    +
    +import org.apache.spark.sql.{Dataset, QueryTest, Row}
    +import org.apache.spark.sql.hive.test.TestHiveSingleton
    +import org.apache.spark.sql.internal.SQLConf
    +import org.apache.spark.sql.test.SQLTestUtils
    +
    +/**
    + * Data Source qualification as Apache Spark Data Sources.
    + * - Apache Spark Data Type Value Limits: CSV, JSON, ORC, Parquet
    + * - Predicate Push Down: ORC
    + */
    +class DataSourceSuite
    +  extends QueryTest
    +  with SQLTestUtils
    +  with TestHiveSingleton {
    +
    +  import testImplicits._
    +
    +  var df: Dataset[Row] = _
    +
    +  override def beforeAll(): Unit = {
    +    super.beforeAll()
    +    spark.conf.set("spark.sql.session.timeZone", "GMT")
    +
    +    df = ((
    +      false,
    +      true,
    +      Byte.MinValue,
    +      Byte.MaxValue,
    +      Short.MinValue,
    +      Short.MaxValue,
    +      Int.MinValue,
    +      Int.MaxValue,
    +      Long.MinValue,
    +      Long.MaxValue,
    +      Float.MinValue,
    +      Float.MaxValue,
    +      Double.MinValue,
    +      Double.MaxValue,
    +      Date.valueOf("0001-01-01"),
    +      Date.valueOf("9999-12-31"),
    +      new Timestamp(-62135769600000L), // 0001-01-01 00:00:00.000
    +      new Timestamp(253402300799999L)  // 9999-12-31 23:59:59.999
    +    ) :: Nil).toDF()
    +  }
    +
    +  override def afterAll(): Unit = {
    +    try {
    +      spark.conf.unset("spark.sql.session.timeZone")
    +    } finally {
    +      super.afterAll()
    +    }
    +  }
    +
    +  Seq("parquet", "orc", "json", "csv").foreach { dataSource =>
    +    test(s"$dataSource - data type value limit") {
    +      withTempPath { dir =>
    +        df.write.format(dataSource).save(dir.getCanonicalPath)
    +
    +        // Use the same schema for saving/loading
    +        checkAnswer(
    +          
spark.read.format(dataSource).schema(df.schema).load(dir.getCanonicalPath),
    +          df)
    +
    +        // Use schema inference, but skip text-based format due to its 
limitation
    +        if (Seq("parquet", "orc").contains(dataSource)) {
    +          withTable("tab1") {
    +            sql(s"CREATE TABLE tab1 USING $dataSource LOCATION 
'${dir.toURI}'")
    +            checkAnswer(sql(s"SELECT ${df.schema.fieldNames.mkString(",")} 
FROM tab1"), df)
    +          }
    +        }
    +      }
    +    }
    +  }
    +
    +  Seq("orc").foreach { dataSource =>
    +    test(s"$dataSource - predicate push down") {
    +      withSQLConf(
    +        SQLConf.ORC_FILTER_PUSHDOWN_ENABLED.key -> "true",
    +        SQLConf.PARQUET_FILTER_PUSHDOWN_ENABLED.key -> "true") {
    +        withTempPath { dir =>
    +          // write 4000 rows with the integer and the string in a single 
orc file with stride 1000
    +          spark
    +            .range(4000)
    +            .map(i => (i, s"$i"))
    +            .toDF("i", "s")
    +            .repartition(1)
    +            .write
    +            .option(OrcConf.ROW_INDEX_STRIDE.getAttribute, 1000)
    +            // TODO: Add Parquet option, too.
    +            .format(dataSource)
    +            .save(dir.getCanonicalPath)
    +
    +          val df = spark.read.format(dataSource).load(dir.getCanonicalPath)
    +            .where(s"i BETWEEN 1500 AND 1999")
    --- End diff --
    
    So far, Parquet and ORC are the only built-in sources that support PPD
(predicate pushdown). From a software development point of view, we should make
the test framework easily extensible.
    
    Sorry, I do not think it covers what we need. First, to verify whether the
underlying data source works properly, we need the corresponding (boundary)
data (which is inserted into the sources) and the predicates (which use
boundary values), and we also need to ensure these predicates are pushed down
and not evaluated by Spark.



---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

[GitHub] spark pull request #19060: [WIP][SQL] Add DataSourceSuite validating data so...

Reply via email to