Repository: spark Updated Branches: refs/heads/master 55d6fdf22 -> 819b0ea02
[SPARK-13381][SQL] Support for loading CSV with a single function call https://issues.apache.org/jira/browse/SPARK-13381 This PR adds the support to load CSV data directly by a single call with given paths. Also, I corrected this to refer to all paths rather than the first path in schema inference, which the JSON datasource does. Several unit tests were added for each functionality. Author: hyukjinkwon <gurwls...@gmail.com> Closes #11262 from HyukjinKwon/SPARK-13381. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/819b0ea0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/819b0ea0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/819b0ea0 Branch: refs/heads/master Commit: 819b0ea029d6ef51e661a59e7672480f57322cbb Parents: 55d6fdf Author: hyukjinkwon <gurwls...@gmail.com> Authored: Sun Feb 21 19:11:03 2016 -0800 Committer: Reynold Xin <r...@databricks.com> Committed: Sun Feb 21 19:11:03 2016 -0800 ---------------------------------------------------------------------- .../scala/org/apache/spark/sql/DataFrameReader.scala | 11 +++++++++++ .../sql/execution/datasources/csv/CSVRelation.scala | 6 +++--- .../spark/sql/execution/datasources/csv/CSVSuite.scala | 9 +++++++++ 3 files changed, 23 insertions(+), 3 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/819b0ea0/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index 962fdad..20c861d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -345,6 +345,17 @@ class DataFrameReader private[sql](sqlContext: SQLContext) extends Logging { } 
/** + * Loads a CSV file and returns the result as a [[DataFrame]]. + * + * This function goes through the input once to determine the input schema. To avoid going + * through the entire data once, specify the schema explicitly using [[schema]]. + * + * @since 2.0.0 + */ + @scala.annotation.varargs + def csv(paths: String*): DataFrame = format("csv").load(paths : _*) + + /** * Loads a Parquet file, returning the result as a [[DataFrame]]. This function returns an empty * [[DataFrame]] if no paths are passed in. * http://git-wip-us.apache.org/repos/asf/spark/blob/819b0ea0/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVRelation.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVRelation.scala index 471ed0d..da945c4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVRelation.scala @@ -37,9 +37,9 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.sources._ import org.apache.spark.sql.types._ -private[csv] class CSVRelation( +private[sql] class CSVRelation( private val inputRDD: Option[RDD[String]], - override val paths: Array[String], + override val paths: Array[String] = Array.empty[String], private val maybeDataSchema: Option[StructType], override val userDefinedPartitionColumns: Option[StructType], private val parameters: Map[String, String]) @@ -127,7 +127,7 @@ private[csv] class CSVRelation( } private def inferSchema(paths: Array[String]): StructType = { - val rdd = baseRdd(Array(paths.head)) + val rdd = baseRdd(paths) val firstLine = findFirstLine(rdd) val firstRow = new LineCsvReader(params).parseLine(firstLine) 
http://git-wip-us.apache.org/repos/asf/spark/blob/819b0ea0/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index 9d1f456..7671bc1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -91,6 +91,15 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils { verifyCars(cars, withHeader = false, checkTypes = false) } + test("simple csv test with calling another function to load") { + val cars = sqlContext + .read + .option("header", "false") + .csv(testFile(carsFile)) + + verifyCars(cars, withHeader = false, checkTypes = false) + } + test("simple csv test with type inference") { val cars = sqlContext .read --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org