Github user srowen commented on a diff in the pull request: https://github.com/apache/spark/pull/21456#discussion_r192204172 --- Diff: common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalShuffleBlockResolver.java --- @@ -272,6 +273,57 @@ void close() { } } + /** + * This method is needed to avoid the situation when multiple File instances for the + * same pathname "foo/bar" are created, each with a separate copy of the "foo/bar" String. + * According to measurements, in some scenarios such duplicate strings may waste a lot + * of memory (~ 10% of the heap). To avoid that, we intern the pathname, and before that + * we make sure that it's in a normalized form (contains no "//", "///" etc.) Otherwise, + * the internal code in java.io.File would normalize it later, creating a new "foo/bar" + * String copy. Unfortunately, we cannot just reuse the normalization code that java.io.File + * uses, since it is in the package-private class java.io.FileSystem. + */ + private static String createNormalizedInternedPathname(String dir1, String dir2, String fname) { + String pathname = dir1 + File.separator + dir2 + File.separator + fname; + pathname = normalizePathname(pathname); + return pathname.intern(); + } + + @VisibleForTesting + static String normalizePathname(String pathname) { + int len = pathname.length(); + char prevChar = 0; + for (int i = 0; i < len; i++) { + char c = pathname.charAt(i); + if (c == '/' && prevChar == '/') { + return normalizePathnameRemainder(pathname, i - 1); + } + prevChar = c; + } + if (prevChar == '/') return normalizePathnameRemainder(pathname, len - 1); --- End diff -- Style note: we put the body of all if/while/for statements on a new line with braces. What about something like just `path.replaceAll("/{2,}", "/")`? does that cover a lot of the logic here? (Or something more efficient with Pattern and Matcher, but equivalent)
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org