Github user xuanyuanking commented on a diff in the pull request: https://github.com/apache/spark/pull/17702#discussion_r163156332 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala --- @@ -668,4 +672,31 @@ object DataSource extends Logging { } globPath } + + /** + * Return all paths represented by the wildcard string. + * Follow [[InMemoryFileIndex]].bulkListLeafFile and reuse the conf. + */ + private def getGlobbedPaths( + sparkSession: SparkSession, + fs: FileSystem, + hadoopConf: SerializableConfiguration, + qualified: Path): Seq[Path] = { + val paths = SparkHadoopUtil.get.expandGlobPath(fs, qualified) + if (paths.size <= sparkSession.sessionState.conf.parallelPartitionDiscoveryThreshold) { + SparkHadoopUtil.get.globPathIfNecessary(fs, qualified) + } else { + val parallelPartitionDiscoveryParallelism = + sparkSession.sessionState.conf.parallelPartitionDiscoveryParallelism + val numParallelism = Math.min(paths.size, parallelPartitionDiscoveryParallelism) + val expanded = sparkSession.sparkContext --- End diff -- Sorry for the late reply; this was addressed in the next commit.
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org