This is an automated email from the ASF dual-hosted git repository. srowen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new c0b9db1 [SPARK-26339][SQL] Throws better exception when reading files that start with underscore c0b9db1 is described below commit c0b9db120d4c2ad0b5b99b9152549e94ef8f5a2d Author: Hirobe Keiichi <keiichi_hir...@forcia.com> AuthorDate: Mon Dec 31 10:15:14 2018 -0600 [SPARK-26339][SQL] Throws better exception when reading files that start with underscore ## What changes were proposed in this pull request? As described in SPARK-26339, the behavior of spark.read is very confusing when reading files whose names start with an underscore. Fix this by throwing an exception whose message is "Path does not exist". ## How was this patch tested? Manual tests. Both of the snippets below throw an exception whose message is "Path does not exist". ``` spark.read.csv("/home/forcia/work/spark/_test.csv") spark.read.schema("test STRING, number INT").csv("/home/forcia/work/spark/_test.csv") ``` Closes #23288 from KeiichiHirobe/SPARK-26339. Authored-by: Hirobe Keiichi <keiichi_hir...@forcia.com> Signed-off-by: Sean Owen <sean.o...@databricks.com> --- .../spark/sql/execution/datasources/DataSource.scala | 17 ++++++++++++++++- sql/core/src/test/resources/test-data/_cars.csv | 7 +++++++ .../sql/execution/datasources/csv/CSVSuite.scala | 20 ++++++++++++++++++++ 3 files changed, 43 insertions(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala index fefff68..517e043 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala @@ -543,7 +543,7 @@ case class DataSource( checkFilesExist: Boolean): Seq[Path] = { val allPaths = caseInsensitiveOptions.get("path") ++ paths val hadoopConf = sparkSession.sessionState.newHadoopConf() - allPaths.flatMap { path => + val allGlobPath 
= allPaths.flatMap { path => val hdfsPath = new Path(path) val fs = hdfsPath.getFileSystem(hadoopConf) val qualified = hdfsPath.makeQualified(fs.getUri, fs.getWorkingDirectory) @@ -560,6 +560,21 @@ case class DataSource( } globPath }.toSeq + + val (filteredOut, filteredIn) = allGlobPath.partition { path => + InMemoryFileIndex.shouldFilterOut(path.getName) + } + if (filteredOut.nonEmpty) { + if (filteredIn.isEmpty) { + throw new AnalysisException( + s"All paths were ignored:\n${filteredOut.mkString("\n ")}") + } else { + logDebug( + s"Some paths were ignored:\n${filteredOut.mkString("\n ")}") + } + } + + allGlobPath } } diff --git a/sql/core/src/test/resources/test-data/_cars.csv b/sql/core/src/test/resources/test-data/_cars.csv new file mode 100644 index 0000000..40ded57 --- /dev/null +++ b/sql/core/src/test/resources/test-data/_cars.csv @@ -0,0 +1,7 @@ + +year,make,model,comment,blank +"2012","Tesla","S","No comment", + +1997,Ford,E350,"Go get one now they are going fast", +2015,Chevy,Volt + diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index d9e5d7a..fb1bedf 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -53,6 +53,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te private val carsEmptyValueFile = "test-data/cars-empty-value.csv" private val carsBlankColName = "test-data/cars-blank-column-name.csv" private val carsCrlf = "test-data/cars-crlf.csv" + private val carsFilteredOutFile = "test-data/_cars.csv" private val emptyFile = "test-data/empty.csv" private val commentsFile = "test-data/comments.csv" private val disableCommentsFile = "test-data/disable_comments.csv" @@ -346,6 +347,25 @@ class CSVSuite extends QueryTest with SharedSQLContext with 
SQLTestUtils with Te assert(result.schema.fieldNames.size === 1) } + test("SPARK-26339 Not throw an exception if some of specified paths are filtered in") { + val cars = spark + .read + .option("header", "false") + .csv(testFile(carsFile), testFile(carsFilteredOutFile)) + + verifyCars(cars, withHeader = false, checkTypes = false) + } + + test("SPARK-26339 Throw an exception only if all of the specified paths are filtered out") { + val e = intercept[AnalysisException] { + val cars = spark + .read + .option("header", "false") + .csv(testFile(carsFilteredOutFile)) + }.getMessage + assert(e.contains("All paths were ignored:")) + } + test("DDL test with empty file") { withView("carsTable") { spark.sql( --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org