GitHub user barrenlake commented on a diff in the pull request: https://github.com/apache/spark/pull/17176#discussion_r154575331 --- Diff: sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala --- @@ -159,36 +159,11 @@ class HadoopTableReader( def verifyPartitionPath( partitionToDeserializer: Map[HivePartition, Class[_ <: Deserializer]]): Map[HivePartition, Class[_ <: Deserializer]] = { - if (!sparkSession.sessionState.conf.verifyPartitionPath) { - partitionToDeserializer - } else { - var existPathSet = collection.mutable.Set[String]() - var pathPatternSet = collection.mutable.Set[String]() - partitionToDeserializer.filter { - case (partition, partDeserializer) => - def updateExistPathSetByPathPattern(pathPatternStr: String) { - val pathPattern = new Path(pathPatternStr) - val fs = pathPattern.getFileSystem(hadoopConf) - val matches = fs.globStatus(pathPattern) - matches.foreach(fileStatus => existPathSet += fileStatus.getPath.toString) - } - // convert /demo/data/year/month/day to /demo/data/*/*/*/ - def getPathPatternByPath(parNum: Int, tempPath: Path): String = { - var path = tempPath - for (i <- (1 to parNum)) path = path.getParent - val tails = (1 to parNum).map(_ => "*").mkString("/", "/", "/") - path.toString + tails - } - - val partPath = partition.getDataLocation - val partNum = Utilities.getPartitionDesc(partition).getPartSpec.size(); - var pathPatternStr = getPathPatternByPath(partNum, partPath) - if (!pathPatternSet.contains(pathPatternStr)) { - pathPatternSet += pathPatternStr - updateExistPathSetByPathPattern(pathPatternStr) - } - existPathSet.contains(partPath.toString) - } + partitionToDeserializer.filter { + case (partition, partDeserializer) => + val partPath = partition.getDataLocation + val fs = partPath.getFileSystem(hadoopConf) + fs.exists(partPath) --- End diff -- With this change, each partition sends its own RPC request (`fs.exists`) to the NameNode, which can result in poor performance when a table has many partitions. The removed code avoided this by issuing a single `globStatus` call per distinct path pattern.
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org