ppkarwasz commented on code in PR #428: URL: https://github.com/apache/commons-codec/pull/428#discussion_r3060659364
########## src/main/java/org/apache/commons/codec/digest/GitIdentifiers.java: ########## @@ -0,0 +1,494 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.commons.codec.digest; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.StandardCharsets; +import java.nio.file.DirectoryStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.security.MessageDigest; +import java.util.HashMap; +import java.util.Map; +import java.util.Objects; +import java.util.Set; +import java.util.TreeSet; + +/** + * Operations for computing Git object identifiers and their generalizations described by the + * <a href="https://www.swhid.org/swhid-specification/">SWHID specification</a>. + * + * <p>When the hash algorithm is SHA-1, the identifiers produced by this class are identical to those used by Git. + * Other hash algorithms produce generalized identifiers as described by the SWHID specification.</p> + * + * <p>This class is immutable and thread-safe. 
However, the {@link MessageDigest} instances passed to it generally won't be.</p> + * + * @see <a href="https://git-scm.com/book/en/v2/Git-Internals-Git-Objects">Git Internals – Git Objects</a> + * @see <a href="https://www.swhid.org/swhid-specification/">SWHID Specification</a> + * @since 1.22.0 + */ +public class GitIdentifiers { + + /** + * The type of a Git tree entry, which maps to a Unix file-mode string. + * + * <p>Git encodes the file type and permission bits as an ASCII octal string that precedes the entry name in the binary tree format. The values defined here + * cover the four entry types that Git itself produces.</p> + */ + public enum FileMode { + + /** + * A sub-directory (Git sub-tree). + */ + DIRECTORY("40000"), + + /** + * An executable file. + */ + EXECUTABLE("100755"), + + /** + * A regular (non-executable) file. + */ + REGULAR("100644"), + + /** + * A symbolic link. + */ + SYMBOLIC_LINK("120000"); + + /** + * The octal mode as used by Git. + */ + private final String mode; + + /** + * Serialized {@code mode}: since this is mutable, it must remain private. + */ + private final byte[] modeBytes; + + FileMode(final String mode) { + this.mode = mode; + this.modeBytes = mode.getBytes(StandardCharsets.US_ASCII); + } + + /** + * Gets the octal mode as used by Git. + * + * @return The octal mode + */ + public String getMode() { + return mode; + } + } + + /** + * Represents a single entry in a Git tree object. + * + * <p>A Git tree object encodes a directory snapshot. Each entry holds:</p> + * <ul> + * <li>a {@link FileMode} that determines the Unix file mode (e.g. 
{@code 100644} for a regular file),</li> + * <li>the entry name (file or directory name, without a path separator),</li> + * <li>the raw object id of the referenced blob or sub-tree.</li> + * </ul> + * + * <p>Entries are ordered by {@link #compareTo} using Git's tree-sort rule: directory names are compared as if they ended with {@code '/'}, so that {@code foo/} + * sorts after {@code foobar}.</p> + * + * @see <a href="https://git-scm.com/book/en/v2/Git-Internals-Git-Objects">Git Internals – Git Objects</a> + * @see <a href="https://www.swhid.org/swhid-specification/v1.2/5.Core_identifiers/#53-directories">SWHID Directory Identifier</a> + */ + static class DirectoryEntry implements Comparable<DirectoryEntry> { + + /** + * The entry name (file or directory name, no path separator). + */ + private final String name; + /** + * The raw object id of the referenced blob or sub-tree. + */ + private final byte[] rawObjectId; + /** + * The key used for ordering entries within a tree object. + * + * <p>>Git appends {@code '/'} to directory names before comparing.</p> + */ + private final String sortKey; + /** + * The Git object type, which determines the Unix file-mode prefix. + */ + private final FileMode type; + + /** + * Creates an entry. + * + * @param name The name of the entry + * @param type The type of the entry + * @param rawObjectId The id of the entry + */ + DirectoryEntry(final String name, final FileMode type, final byte[] rawObjectId) { + if (Objects.requireNonNull(name).indexOf('/') >= 0) { + throw new IllegalArgumentException("Entry name must not contain '/': " + name); + } + this.name = name; + this.type = Objects.requireNonNull(type); + this.sortKey = type == FileMode.DIRECTORY ? 
name + "/" : name; + this.rawObjectId = Objects.requireNonNull(rawObjectId); + } + + @Override + public int compareTo(final DirectoryEntry o) { + return sortKey.compareTo(o.sortKey); + } + + @Override + public boolean equals(final Object obj) { + if (obj == this) { + return true; + } + if (!(obj instanceof DirectoryEntry)) { + return false; + } + final DirectoryEntry other = (DirectoryEntry) obj; + return name.equals(other.name); + } + + @Override + public int hashCode() { + return name.hashCode(); + } + + } + + /** + * Builds a Git tree identifier for a virtual directory structure, such as the contents of + * an archive. + */ + public static final class TreeIdBuilder { + + /** + * A supplier of a blob identifier that may throw {@link IOException}. + */ + @FunctionalInterface + private interface BlobIdSupplier { + byte[] get() throws IOException; + } + + private static void checkPathComponent(String name) { + if (".".equals(name) || "..".equals(name)) { + throw new IllegalArgumentException("Path component not allowed: " + name); + } + } Review Comment: This is probably worth discussing: in `TreeIdBuilder` I don't really want to deal with arbitrary paths, so I require the paths to be normalized. Arbitrary paths would require keeping track of the parent `TreeIdBuilder` just to handle `..` segments. I don't know if this is a legitimate case, and it mostly occurs in ZIP Slip attempts. On the other hand, we could allow `.` and silently omit it, which is what I am doing with empty path segments (`//`). -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
