This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 7ba70f02b54 [SPARK-40765][SQL] Optimize redundant fs operation in `CommandUtils#calculateSingleLocationSize#getPathSize` method 7ba70f02b54 is described below commit 7ba70f02b5417364985af7bbfdcde6ebeca84357 Author: yangjie01 <yangji...@baidu.com> AuthorDate: Thu Oct 13 11:21:22 2022 +0900 [SPARK-40765][SQL] Optimize redundant fs operation in `CommandUtils#calculateSingleLocationSize#getPathSize` method ### What changes were proposed in this pull request? This PR changes the second parameter of `getPathSize` from `Path` to `FileStatus` to avoid a redundant `fs.getFileStatus(path)` call in each recursive call. ### Why are the changes needed? `fs.listStatus` already returns a `FileStatus` for every child entry, so passing that status into the recursive call saves one DFS operation per call. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Pass GitHub Actions Closes #38214 from LuciferYang/opt-getPathSize. Authored-by: yangjie01 <yangji...@baidu.com> Signed-off-by: Hyukjin Kwon <gurwls...@apache.org> --- .../org/apache/spark/sql/execution/command/CommandUtils.scala | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala index 41f60bfa2ff..6883f93523b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala @@ -22,7 +22,7 @@ import java.net.URI import scala.collection.mutable import scala.util.control.NonFatal -import org.apache.hadoop.fs.{FileSystem, Path, PathFilter} +import org.apache.hadoop.fs.{FileStatus, FileSystem, Path, PathFilter} import org.apache.spark.internal.Logging import org.apache.spark.sql.SparkSession @@ -113,13 +113,12 @@ object CommandUtils extends Logging { // countFileSize to count the table size. 
val stagingDir = sessionState.conf.getConfString("hive.exec.stagingdir", ".hive-staging") - def getPathSize(fs: FileSystem, path: Path): Long = { - val fileStatus = fs.getFileStatus(path) + def getPathSize(fs: FileSystem, fileStatus: FileStatus): Long = { val size = if (fileStatus.isDirectory) { - fs.listStatus(path) + fs.listStatus(fileStatus.getPath) .map { status => if (isDataPath(status.getPath, stagingDir)) { - getPathSize(fs, status.getPath) + getPathSize(fs, status) } else { 0L } @@ -136,7 +135,7 @@ object CommandUtils extends Logging { val path = new Path(p) try { val fs = path.getFileSystem(sessionState.newHadoopConf()) - getPathSize(fs, path) + getPathSize(fs, fs.getFileStatus(path)) } catch { case NonFatal(e) => logWarning( --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org