Github user habren commented on a diff in the pull request: https://github.com/apache/spark/pull/22018#discussion_r208784609 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala --- @@ -297,7 +297,7 @@ object InMemoryFileIndex extends Logging { val missingFiles = mutable.ArrayBuffer.empty[String] val filteredLeafStatuses = allLeafStatuses.filterNot( status => shouldFilterOut(status.getPath.getName)) - val resolvedLeafStatuses = filteredLeafStatuses.flatMap { + val resolvedLeafStatuses = filteredLeafStatuses.par.flatMap { --- End diff -- Thanks @viirya for the feedback. Yes, this method can be called on executors, as shown below. Do you think it's not thread-safe? Each partition will have its own hadoopConf and therefore its own fs, so nothing is shared in this method: sparkContext .parallelize(serializedPaths, numParallelism) .mapPartitions { pathStrings => val hadoopConf = serializableConfiguration.value pathStrings.map(new Path(_)).toSeq.map { path => (path, listLeafFiles(path, hadoopConf, filter, None)) }.iterator }.map { case (path, statuses) =>
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org