This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.0 by this push: new 8c0b9cb [SPARK-32815][ML][3.0] Fix LibSVM data source loading error on file paths with glob metacharacters 8c0b9cb is described below commit 8c0b9cbf68693db22314637a75f28e5aa954aff8 Author: Max Gekk <max.g...@gmail.com> AuthorDate: Tue Sep 8 14:16:13 2020 +0000 [SPARK-32815][ML][3.0] Fix LibSVM data source loading error on file paths with glob metacharacters ### What changes were proposed in this pull request? In the PR, I propose to fix an issue with LibSVM datasource when both of the following are true: * no user specified schema * some file paths contain escaped glob metacharacters, such as `[``]`, `{``}`, `*` etc. The fix is a backport of https://github.com/apache/spark/pull/29670, and it is based on another bug fix for CSV/JSON datasources https://github.com/apache/spark/pull/29659. ### Why are the changes needed? To fix the issue when the following query tries to read from the path `[abc]`: ```scala spark.read.format("libsvm").load("""/tmp/\[abc\].csv""").show ``` but would end up hitting an exception: ``` Path does not exist: file:/private/var/folders/p3/dfs6mf655d7fnjrsjvldh0tc0000gn/T/spark-6ef0ae5e-ff9f-4c4f-9ff4-0db3ee1f6a82/[abc]/part-00000-26406ab9-4e56-45fd-a25a-491c18a05e76-c000.libsvm; org.apache.spark.sql.AnalysisException: Path does not exist: file:/private/var/folders/p3/dfs6mf655d7fnjrsjvldh0tc0000gn/T/spark-6ef0ae5e-ff9f-4c4f-9ff4-0db3ee1f6a82/[abc]/part-00000-26406ab9-4e56-45fd-a25a-491c18a05e76-c000.libsvm; at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$checkAndGlobPathIfNecessary$3(DataSource.scala:770) at org.apache.spark.util.ThreadUtils$.$anonfun$parmap$2(ThreadUtils.scala:373) at scala.concurrent.Future$.$anonfun$apply$1(Future.scala:659) at scala.util.Success.$anonfun$map$1(Try.scala:255) at scala.util.Success.map(Try.scala:213) ``` ### Does this PR introduce _any_ user-facing change? Yes ### How was this patch tested? 
Added UT to `LibSVMRelationSuite`. Closes #29675 from MaxGekk/globbing-paths-when-inferring-schema-ml-3.0. Authored-by: Max Gekk <max.g...@gmail.com> Signed-off-by: Wenchen Fan <wenc...@databricks.com> --- .../spark/ml/source/libsvm/LibSVMRelation.scala | 2 +- .../scala/org/apache/spark/mllib/util/MLUtils.scala | 3 ++- .../spark/ml/source/libsvm/LibSVMRelationSuite.scala | 20 ++++++++++++++++++++ 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala index da8f3a24f..11be1d8 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala @@ -100,7 +100,7 @@ private[libsvm] class LibSVMFileFormat "though the input. If you know the number in advance, please specify it via " + "'numFeatures' option to avoid the extra scan.") - val paths = files.map(_.getPath.toUri.toString) + val paths = files.map(_.getPath.toString) val parsed = MLUtils.parseLibSVMFile(sparkSession, paths) MLUtils.computeNumFeatures(parsed) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala index 9198334..2411300 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala @@ -110,7 +110,8 @@ object MLUtils extends Logging { DataSource.apply( sparkSession, paths = paths, - className = classOf[TextFileFormat].getName + className = classOf[TextFileFormat].getName, + options = Map(DataSource.GLOB_PATHS_KEY -> "false") ).resolveRelation(checkFilesExist = false)) .select("value") diff --git a/mllib/src/test/scala/org/apache/spark/ml/source/libsvm/LibSVMRelationSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/source/libsvm/LibSVMRelationSuite.scala index 
263ad26..0999892 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/source/libsvm/LibSVMRelationSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/source/libsvm/LibSVMRelationSuite.scala @@ -191,4 +191,24 @@ class LibSVMRelationSuite extends SparkFunSuite with MLlibTestSparkContext { spark.sql("DROP TABLE IF EXISTS libsvmTable") } } + + test("SPARK-32815: Test LibSVM data source on file paths with glob metacharacters") { + withTempDir { dir => + val basePath = dir.getCanonicalPath + // test libsvm writer / reader without specifying schema + val svmFileName = "[abc]" + val escapedSvmFileName = "\\[abc\\]" + val rawData = new java.util.ArrayList[Row]() + rawData.add(Row(1.0, Vectors.sparse(2, Seq((0, 2.0), (1, 3.0))))) + val struct = new StructType() + .add("labelFoo", DoubleType, false) + .add("featuresBar", VectorType, false) + val df = spark.createDataFrame(rawData, struct) + df.write.format("libsvm").save(s"$basePath/$svmFileName") + val df2 = spark.read.format("libsvm").load(s"$basePath/$escapedSvmFileName") + val row1 = df2.first() + val v = row1.getAs[SparseVector](1) + assert(v == Vectors.sparse(2, Seq((0, 2.0), (1, 3.0)))) + } + } } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org