garydgregory commented on code in PR #428:
URL: https://github.com/apache/commons-codec/pull/428#discussion_r3060393518


##########
src/main/java/org/apache/commons/codec/digest/GitIdentifiers.java:
##########
@@ -0,0 +1,489 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.commons.codec.digest;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.DirectoryStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.security.MessageDigest;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Objects;
+import java.util.Set;
+import java.util.TreeSet;
+
+/**
+ * Operations for computing Git object identifiers and their generalizations described by the
+ * <a href="https://www.swhid.org/swhid-specification/">SWHID specification</a>.
+ *
+ * <p>When the hash algorithm is SHA-1, the identifiers produced by this class 
are identical to those used by Git.
+ * Other hash algorithms produce generalized identifiers as described by the 
SWHID specification.</p>
+ *
+ * <p>This class is immutable and thread-safe. However, the {@link 
MessageDigest} instances passed to it generally won't be.</p>
+ *
+ * @see <a href="https://git-scm.com/book/en/v2/Git-Internals-Git-Objects">Git Internals – Git Objects</a>
+ * @see <a href="https://www.swhid.org/swhid-specification/">SWHID Specification</a>
+ * @since 1.22.0
+ */
+public class GitIdentifiers {
+
+    /**
+     * The type of a Git tree entry, which maps to a Unix file-mode string.
+     *
+     * <p>Git encodes the file type and permission bits as an ASCII octal 
string that precedes the entry name in the binary tree format. The values 
defined here
+     * cover the four entry types that Git itself produces.</p>
+     */
+    public enum FileMode {
+
+        /**
+         * A sub-directory (Git sub-tree).
+         */
+        DIRECTORY("40000"),
+
+        /**
+         * An executable file.
+         */
+        EXECUTABLE("100755"),
+
+        /**
+         * A regular (non-executable) file.
+         */
+        REGULAR("100644"),
+
+        /**
+         * A symbolic link.
+         */
+        SYMBOLIC_LINK("120000");
+
+        /**
+         * The octal mode as used by Git.
+         */
+        private final String mode;
+
+        /**
+         * Serialized {@code mode}: since this is mutable, it must remain 
private.
+         */
+        private final byte[] modeBytes;
+
+        FileMode(final String mode) {
+            this.mode = mode;
+            this.modeBytes = mode.getBytes(StandardCharsets.US_ASCII);
+        }
+
+        /**
+         * Gets the octal mode as used by Git.
+         *
+         * @return The octal mode
+         */
+        public String getMode() {
+            return mode;
+        }
+    }
+
+    /**
+     * Represents a single entry in a Git tree object.
+     *
+     * <p>A Git tree object encodes a directory snapshot. Each entry holds:</p>
+     * <ul>
+     *   <li>a {@link FileMode} that determines the Unix file mode (e.g. 
{@code 100644} for a regular file),</li>
+     *   <li>the entry name (file or directory name, without a path 
separator),</li>
+     *   <li>the raw object id of the referenced blob or sub-tree.</li>
+     * </ul>
+     *
+     * <p>Entries are ordered by {@link #compareTo} using Git's tree-sort 
rule: directory names are compared as if they ended with {@code '/'}, so that 
{@code foo/}
+     * sorts after {@code foobar}.</p>
+     *
+     * @see <a href="https://git-scm.com/book/en/v2/Git-Internals-Git-Objects">Git Internals – Git Objects</a>
+     * @see <a href="https://www.swhid.org/swhid-specification/v1.2/5.Core_identifiers/#53-directories">SWHID Directory Identifier</a>
+     */
+    static class DirectoryEntry implements Comparable<DirectoryEntry> {
+
+        /**
+         * The entry name (file or directory name, no path separator).
+         */
+        private final String name;
+        /**
+         * The raw object id of the referenced blob or sub-tree.
+         */
+        private final byte[] rawObjectId;
+        /**
+         * The key used for ordering entries within a tree object.
+         *
+         * <p>Git appends {@code '/'} to directory names before comparing.</p>
+         */
+        private final String sortKey;
+        /**
+         * The Git object type, which determines the Unix file-mode prefix.
+         */
+        private final FileMode type;
+
+        /**
+         * Creates an entry.
+         *
+         * @param name The name of the entry
+         * @param type The type of the entry
+         * @param rawObjectId The id of the entry
+         */
+        DirectoryEntry(final String name, final FileMode type, final byte[] 
rawObjectId) {
+            if (Objects.requireNonNull(name).indexOf('/') >= 0) {
+                throw new IllegalArgumentException("Entry name must not 
contain '/': " + name);
+            }
+            this.name = name;
+            this.type = Objects.requireNonNull(type);
+            this.sortKey = type == FileMode.DIRECTORY ? name + "/" : name;
+            this.rawObjectId = Objects.requireNonNull(rawObjectId);
+        }
+
+        @Override
+        public int compareTo(final DirectoryEntry o) {
+            return sortKey.compareTo(o.sortKey);
+        }
+
+        @Override
+        public boolean equals(final Object obj) {
+            if (obj == this) {
+                return true;
+            }
+            if (!(obj instanceof DirectoryEntry)) {
+                return false;
+            }
+            final DirectoryEntry other = (DirectoryEntry) obj;
+            return name.equals(other.name);
+        }
+
+        @Override
+        public int hashCode() {
+            return name.hashCode();
+        }
+
+    }
+
+    /**
+     * Builds a Git tree identifier for a virtual directory structure, such as 
the contents of
+     * an archive.
+     */
+    public static class TreeIdBuilder {
+
+        /**
+         * A supplier of a blob identifier that may throw {@link IOException}.
+         */
+        @FunctionalInterface
+        private interface BlobIdSupplier {

Review Comment:
   This could just be a Commons IO `IOSupplier`.



##########
src/main/java/org/apache/commons/codec/digest/GitIdentifiers.java:
##########
@@ -0,0 +1,489 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.commons.codec.digest;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.DirectoryStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.security.MessageDigest;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Objects;
+import java.util.Set;
+import java.util.TreeSet;
+
+/**
+ * Operations for computing Git object identifiers and their generalizations described by the
+ * <a href="https://www.swhid.org/swhid-specification/">SWHID specification</a>.
+ *
+ * <p>When the hash algorithm is SHA-1, the identifiers produced by this class 
are identical to those used by Git.
+ * Other hash algorithms produce generalized identifiers as described by the 
SWHID specification.</p>
+ *
+ * <p>This class is immutable and thread-safe. However, the {@link 
MessageDigest} instances passed to it generally won't be.</p>
+ *
+ * @see <a href="https://git-scm.com/book/en/v2/Git-Internals-Git-Objects">Git Internals – Git Objects</a>
+ * @see <a href="https://www.swhid.org/swhid-specification/">SWHID Specification</a>
+ * @since 1.22.0
+ */
+public class GitIdentifiers {
+
+    /**
+     * The type of a Git tree entry, which maps to a Unix file-mode string.
+     *
+     * <p>Git encodes the file type and permission bits as an ASCII octal 
string that precedes the entry name in the binary tree format. The values 
defined here
+     * cover the four entry types that Git itself produces.</p>
+     */
+    public enum FileMode {
+
+        /**
+         * A sub-directory (Git sub-tree).
+         */
+        DIRECTORY("40000"),
+
+        /**
+         * An executable file.
+         */
+        EXECUTABLE("100755"),
+
+        /**
+         * A regular (non-executable) file.
+         */
+        REGULAR("100644"),
+
+        /**
+         * A symbolic link.
+         */
+        SYMBOLIC_LINK("120000");
+
+        /**
+         * The octal mode as used by Git.
+         */
+        private final String mode;
+
+        /**
+         * Serialized {@code mode}: since this is mutable, it must remain 
private.
+         */
+        private final byte[] modeBytes;
+
+        FileMode(final String mode) {
+            this.mode = mode;
+            this.modeBytes = mode.getBytes(StandardCharsets.US_ASCII);
+        }
+
+        /**
+         * Gets the octal mode as used by Git.
+         *
+         * @return The octal mode
+         */
+        public String getMode() {
+            return mode;
+        }
+    }
+
+    /**
+     * Represents a single entry in a Git tree object.
+     *
+     * <p>A Git tree object encodes a directory snapshot. Each entry holds:</p>
+     * <ul>
+     *   <li>a {@link FileMode} that determines the Unix file mode (e.g. 
{@code 100644} for a regular file),</li>
+     *   <li>the entry name (file or directory name, without a path 
separator),</li>
+     *   <li>the raw object id of the referenced blob or sub-tree.</li>
+     * </ul>
+     *
+     * <p>Entries are ordered by {@link #compareTo} using Git's tree-sort 
rule: directory names are compared as if they ended with {@code '/'}, so that 
{@code foo/}
+     * sorts after {@code foobar}.</p>
+     *
+     * @see <a href="https://git-scm.com/book/en/v2/Git-Internals-Git-Objects">Git Internals – Git Objects</a>
+     * @see <a href="https://www.swhid.org/swhid-specification/v1.2/5.Core_identifiers/#53-directories">SWHID Directory Identifier</a>
+     */
+    static class DirectoryEntry implements Comparable<DirectoryEntry> {
+
+        /**
+         * The entry name (file or directory name, no path separator).
+         */
+        private final String name;
+        /**
+         * The raw object id of the referenced blob or sub-tree.
+         */
+        private final byte[] rawObjectId;
+        /**
+         * The key used for ordering entries within a tree object.
+         *
+         * <p>Git appends {@code '/'} to directory names before comparing.</p>
+         */
+        private final String sortKey;
+        /**
+         * The Git object type, which determines the Unix file-mode prefix.
+         */
+        private final FileMode type;
+
+        /**
+         * Creates an entry.
+         *
+         * @param name The name of the entry
+         * @param type The type of the entry
+         * @param rawObjectId The id of the entry
+         */
+        DirectoryEntry(final String name, final FileMode type, final byte[] 
rawObjectId) {
+            if (Objects.requireNonNull(name).indexOf('/') >= 0) {
+                throw new IllegalArgumentException("Entry name must not 
contain '/': " + name);
+            }
+            this.name = name;
+            this.type = Objects.requireNonNull(type);
+            this.sortKey = type == FileMode.DIRECTORY ? name + "/" : name;
+            this.rawObjectId = Objects.requireNonNull(rawObjectId);
+        }
+
+        @Override
+        public int compareTo(final DirectoryEntry o) {
+            return sortKey.compareTo(o.sortKey);
+        }
+
+        @Override
+        public boolean equals(final Object obj) {
+            if (obj == this) {
+                return true;
+            }
+            if (!(obj instanceof DirectoryEntry)) {
+                return false;
+            }
+            final DirectoryEntry other = (DirectoryEntry) obj;
+            return name.equals(other.name);
+        }
+
+        @Override
+        public int hashCode() {
+            return name.hashCode();
+        }
+
+    }
+
+    /**
+     * Builds a Git tree identifier for a virtual directory structure, such as 
the contents of
+     * an archive.
+     */
+    public static class TreeIdBuilder {
+
+        /**
+         * A supplier of a blob identifier that may throw {@link IOException}.
+         */
+        @FunctionalInterface
+        private interface BlobIdSupplier {
+            byte[] get() throws IOException;
+        }
+
+        private static void checkPathComponent(String name) {
+            if (".".equals(name) || "..".equals(name)) {
+                throw new IllegalArgumentException("Path component not 
allowed: " + name);
+            }
+        }
+        private final Map<String, TreeIdBuilder> dirEntries = new HashMap<>();
+        private final Map<String, DirectoryEntry> fileEntries = new 
HashMap<>();
+        private final MessageDigest messageDigest;
+
+        TreeIdBuilder(final MessageDigest messageDigest) {
+            this.messageDigest = Objects.requireNonNull(messageDigest);
+        }
+
+        /**
+         * Returns the {@link TreeIdBuilder} for the named subdirectory, 
creating it if absent.
+         *
+         * @param name The relative path of the subdirectory in normalized 
form (may contain {@code '/'}).
+         * @return The {@link TreeIdBuilder} for the subdirectory.
+         * @throws IllegalArgumentException If any path component is {@code 
"."} or {@code ".."}.
+         */
+        public TreeIdBuilder addDirectory(final String name) {
+            TreeIdBuilder current = this;
+            for (final String component : name.split("/", -1)) {
+                if (component.isEmpty()) {
+                    continue;
+                }
+                checkPathComponent(component);
+                current = current.dirEntries.computeIfAbsent(component, k -> 
new TreeIdBuilder(messageDigest));
+            }
+            return current;
+        }
+
+        /**
+         * Adds a file entry at the given path within this tree.
+         *
+         * <p>If {@code name} contains {@code '/'}, intermediate 
subdirectories are created automatically.</p>
+         *
+         * <p>The stream is eagerly drained.</p>
+         *
+         * <p>If the size of the stream is known in advance, consider using 
{@link #addFile(FileMode, String, long, InputStream)} instead.</p>
+         *
+         * @param mode The file mode (e.g. {@link FileMode#REGULAR}).
+         * @param name The relative path of the entry in normalized form (may contain {@code '/'}).
+         * @param data The file content.
+         * @throws IOException If the stream cannot be read.
+         * @throws IllegalArgumentException If any path component is {@code 
"."} or {@code ".."}.
+         */
+        public void addFile(final FileMode mode, final String name, final 
InputStream data) throws IOException {
+            addFile(mode, name, () -> blobId(messageDigest, 
readAllBytes(data)));
+        }
+
+        /**
+         * Adds a file entry at the given path within this tree, streaming 
content without buffering.
+         *
+         * <p>If {@code name} contains {@code '/'}, intermediate 
subdirectories are created automatically.</p>
+         *
+         * <p>The stream is eagerly drained.</p>
+         *
+         * @param mode     The file mode (e.g. {@link FileMode#REGULAR}).
+         * @param name The relative path of the entry in normalized form (may contain {@code '/'}).
+         * @param dataSize The exact number of bytes in {@code data}.
+         * @param data     The file content.
+         * @throws IOException If the stream cannot be read.
+         * @throws IllegalArgumentException If any path component is {@code 
"."} or {@code ".."}.
+         */
+        public void addFile(final FileMode mode, final String name, final long 
dataSize, final InputStream data) throws IOException {
+            addFile(mode, name, () -> blobId(messageDigest, dataSize, data));
+        }
+
+        private void addFile(final FileMode mode, final String name, final 
BlobIdSupplier blobId) throws IOException {
+            final int slash = name.indexOf('/');
+            if (slash < 0) {
+                checkPathComponent(name);
+                fileEntries.put(name, new DirectoryEntry(name, mode, 
blobId.get()));
+            } else {
+                addDirectory(name.substring(0, slash)).addFile(mode, 
name.substring(slash + 1), blobId);
+            }
+        }
+
+        /**
+         * Adds a file entry at the given path within this tree.
+         *
+         * <p>If {@code name} contains {@code '/'}, intermediate 
subdirectories are created automatically.</p>
+         *
+         * @param mode The file mode (e.g. {@link FileMode#REGULAR}).
+         * @param name The relative path of the entry in normalized form (may contain {@code '/'}).
+         * @param data The file content.
+         * @throws IOException If an I/O error occurs.
+         * @throws IllegalArgumentException If any path component is {@code 
"."} or {@code ".."}.
+         */
+        public void addFile(final FileMode mode, final String name, final 
byte[] data) throws IOException {
+            addFile(mode, name, () -> blobId(messageDigest, data));
+        }
+
+        /**
+         * Computes the Git tree identifier for this directory and all its 
descendants.
+         *
+         * @return The raw tree identifier bytes.
+         * @throws IOException If a digest operation fails.
+         */
+        public byte[] build() throws IOException {
+            final Set<DirectoryEntry> entries = new 
TreeSet<>(fileEntries.values());
+            for (final Map.Entry<String, TreeIdBuilder> e : 
dirEntries.entrySet()) {
+                entries.add(new DirectoryEntry(e.getKey(), FileMode.DIRECTORY, 
e.getValue().build()));
+            }
+            final ByteArrayOutputStream baos = new ByteArrayOutputStream();
+            for (final DirectoryEntry entry : entries) {
+                baos.write(entry.type.modeBytes);
+                baos.write(' ');
+                baos.write(entry.name.getBytes(StandardCharsets.UTF_8));
+                baos.write('\0');
+                baos.write(entry.rawObjectId);
+            }
+            messageDigest.reset();
+            DigestUtils.updateDigest(messageDigest, 
getGitTreePrefix(baos.size()));
+            return DigestUtils.updateDigest(messageDigest, 
baos.toByteArray()).digest();
+        }
+    }
+
+    /**
+     * Reads through a byte array and returns a generalized Git blob 
identifier.
+     *
+     * <p>The identifier is computed in the way described by the
+     * <a href="https://www.swhid.org/swhid-specification/v1.2/5.Core_identifiers/#52-contents">SWHID contents identifier</a>, but it can use any hash
+     * algorithm.</p>
+     *
+     * <p>When the hash algorithm is SHA-1, the identifier is identical to Git 
blob identifier and SWHID contents identifier.</p>
+     *
+     * @param messageDigest The MessageDigest to use (for example SHA-1).
+     * @param data          Data to digest.
+     * @return A generalized Git blob identifier.
+     */
+    public static byte[] blobId(final MessageDigest messageDigest, final 
byte[] data) {
+        messageDigest.reset();
+        DigestUtils.updateDigest(messageDigest, getGitBlobPrefix(data.length));
+        return DigestUtils.digest(messageDigest, data);
+    }
+
+    /**
+     * Reads through a stream and returns a generalized Git blob identifier.
+     *
+     * <p>The stream is drained and its contents are buffered to determine the 
size before hashing. To avoid
+     * buffering, use {@link #blobId(MessageDigest, long, InputStream)} when 
the size is known in advance.</p>
+     *
+     * <p>When the hash algorithm is SHA-1, the identifier is identical to Git 
blob identifier and SWHID contents identifier.</p>
+     *
+     * @param messageDigest The MessageDigest to use (for example SHA-1).
+     * @param data          Stream to digest.
+     * @return A generalized Git blob identifier.
+     * @throws IOException On error reading the stream.
+     */
+    public static byte[] blobId(final MessageDigest messageDigest, final 
InputStream data) throws IOException {
+        return blobId(messageDigest, readAllBytes(data));
+    }
+
+    /**
+     * Reads through a stream of known size and returns a generalized Git blob 
identifier, without buffering.
+     *
+     * <p>When the size of the content is known in advance, this overload 
streams {@code data} directly through
+     * the digest without buffering the full content in memory.</p>
+     *
+     * <p>When the hash algorithm is SHA-1, the identifier is identical to Git 
blob identifier and SWHID contents identifier.</p>
+     *
+     * @param messageDigest The MessageDigest to use (for example SHA-1).
+     * @param dataSize      The exact number of bytes in {@code data}.
+     * @param data          Stream to digest.
+     * @return A generalized Git blob identifier.
+     * @throws IOException On error reading the stream.
+     */
+    public static byte[] blobId(final MessageDigest messageDigest, final long 
dataSize, final InputStream data) throws IOException {
+        messageDigest.reset();
+        DigestUtils.updateDigest(messageDigest, getGitBlobPrefix(dataSize));
+        return DigestUtils.updateDigest(messageDigest, data).digest();
+    }
+
+    /**
+     * Reads through a file and returns a generalized Git blob identifier.
+     *
+     * <p>The identifier is computed in the way described by the
+     * <a href="https://www.swhid.org/swhid-specification/v1.2/5.Core_identifiers/#52-contents">SWHID contents identifier</a>, but it can use any hash
+     * algorithm.</p>
+     *
+     * <p>When the hash algorithm is SHA-1, the identifier is identical to Git 
blob identifier and SWHID contents identifier.</p>
+     *
+     * @param messageDigest The MessageDigest to use (for example SHA-1).
+     * @param data          Path to the file to digest.
+     * @return A generalized Git blob identifier.
+     * @throws IOException On error accessing the file.
+     */
+    public static byte[] blobId(final MessageDigest messageDigest, final Path 
data) throws IOException {
+        messageDigest.reset();
+        if (Files.isSymbolicLink(data)) {
+            final byte[] linkTarget = 
Files.readSymbolicLink(data).toString().getBytes(StandardCharsets.UTF_8);
+            DigestUtils.updateDigest(messageDigest, 
getGitBlobPrefix(linkTarget.length));
+            return DigestUtils.digest(messageDigest, linkTarget);
+        }
+        DigestUtils.updateDigest(messageDigest, 
getGitBlobPrefix(Files.size(data)));
+        return DigestUtils.updateDigest(messageDigest, data).digest();
+    }
+
+    private static FileMode getGitDirectoryEntryType(final Path path) {
+        // Symbolic links first
+        if (Files.isSymbolicLink(path)) {
+            return FileMode.SYMBOLIC_LINK;
+        }
+        if (Files.isDirectory(path)) {
+            return FileMode.DIRECTORY;
+        }
+        if (Files.isExecutable(path)) {
+            return FileMode.EXECUTABLE;
+        }
+        return FileMode.REGULAR;
+    }
+
+    private static byte[] getGitBlobPrefix(final long dataSize) {

Review Comment:
   This re-introduces the copy-paste duplication I fixed with `DigestUtils.gitPrefix(String, long)`.



##########
src/main/java/org/apache/commons/codec/digest/GitIdentifiers.java:
##########
@@ -0,0 +1,489 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.commons.codec.digest;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.DirectoryStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.security.MessageDigest;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Objects;
+import java.util.Set;
+import java.util.TreeSet;
+
+/**
+ * Operations for computing Git object identifiers and their generalizations described by the
+ * <a href="https://www.swhid.org/swhid-specification/">SWHID specification</a>.
+ *
+ * <p>When the hash algorithm is SHA-1, the identifiers produced by this class 
are identical to those used by Git.
+ * Other hash algorithms produce generalized identifiers as described by the 
SWHID specification.</p>
+ *
+ * <p>This class is immutable and thread-safe. However, the {@link 
MessageDigest} instances passed to it generally won't be.</p>
+ *
+ * @see <a href="https://git-scm.com/book/en/v2/Git-Internals-Git-Objects">Git Internals – Git Objects</a>
+ * @see <a href="https://www.swhid.org/swhid-specification/">SWHID Specification</a>
+ * @since 1.22.0
+ */
+public class GitIdentifiers {
+
+    /**
+     * The type of a Git tree entry, which maps to a Unix file-mode string.
+     *
+     * <p>Git encodes the file type and permission bits as an ASCII octal 
string that precedes the entry name in the binary tree format. The values 
defined here
+     * cover the four entry types that Git itself produces.</p>
+     */
+    public enum FileMode {
+
+        /**
+         * A sub-directory (Git sub-tree).
+         */
+        DIRECTORY("40000"),
+
+        /**
+         * An executable file.
+         */
+        EXECUTABLE("100755"),
+
+        /**
+         * A regular (non-executable) file.
+         */
+        REGULAR("100644"),
+
+        /**
+         * A symbolic link.
+         */
+        SYMBOLIC_LINK("120000");
+
+        /**
+         * The octal mode as used by Git.
+         */
+        private final String mode;
+
+        /**
+         * Serialized {@code mode}: since this is mutable, it must remain 
private.
+         */
+        private final byte[] modeBytes;
+
+        FileMode(final String mode) {
+            this.mode = mode;
+            this.modeBytes = mode.getBytes(StandardCharsets.US_ASCII);
+        }
+
+        /**
+         * Gets the octal mode as used by Git.
+         *
+         * @return The octal mode
+         */
+        public String getMode() {
+            return mode;
+        }
+    }
+
+    /**
+     * Represents a single entry in a Git tree object.
+     *
+     * <p>A Git tree object encodes a directory snapshot. Each entry holds:</p>
+     * <ul>
+     *   <li>a {@link FileMode} that determines the Unix file mode (e.g. 
{@code 100644} for a regular file),</li>
+     *   <li>the entry name (file or directory name, without a path 
separator),</li>
+     *   <li>the raw object id of the referenced blob or sub-tree.</li>
+     * </ul>
+     *
+     * <p>Entries are ordered by {@link #compareTo} using Git's tree-sort 
rule: directory names are compared as if they ended with {@code '/'}, so that 
{@code foo/}
+     * sorts after {@code foobar}.</p>
+     *
+     * @see <a href="https://git-scm.com/book/en/v2/Git-Internals-Git-Objects">Git Internals – Git Objects</a>
+     * @see <a href="https://www.swhid.org/swhid-specification/v1.2/5.Core_identifiers/#53-directories">SWHID Directory Identifier</a>
+     */
+    static class DirectoryEntry implements Comparable<DirectoryEntry> {
+
+        /**
+         * The entry name (file or directory name, no path separator).
+         */
+        private final String name;
+        /**
+         * The raw object id of the referenced blob or sub-tree.
+         */
+        private final byte[] rawObjectId;
+        /**
+         * The key used for ordering entries within a tree object.
+         *
+         * <p>Git appends {@code '/'} to directory names before comparing.</p>
+         */
+        private final String sortKey;
+        /**
+         * The Git object type, which determines the Unix file-mode prefix.
+         */
+        private final FileMode type;
+
+        /**
+         * Creates an entry.
+         *
+         * @param name The name of the entry
+         * @param type The type of the entry
+         * @param rawObjectId The id of the entry
+         */
+        DirectoryEntry(final String name, final FileMode type, final byte[] 
rawObjectId) {
+            if (Objects.requireNonNull(name).indexOf('/') >= 0) {
+                throw new IllegalArgumentException("Entry name must not 
contain '/': " + name);
+            }
+            this.name = name;
+            this.type = Objects.requireNonNull(type);
+            this.sortKey = type == FileMode.DIRECTORY ? name + "/" : name;
+            this.rawObjectId = Objects.requireNonNull(rawObjectId);
+        }
+
+        @Override
+        public int compareTo(final DirectoryEntry o) {
+            return sortKey.compareTo(o.sortKey);
+        }
+
+        @Override
+        public boolean equals(final Object obj) {
+            if (obj == this) {
+                return true;
+            }
+            if (!(obj instanceof DirectoryEntry)) {
+                return false;
+            }
+            final DirectoryEntry other = (DirectoryEntry) obj;
+            return name.equals(other.name);
+        }
+
+        @Override
+        public int hashCode() {
+            return name.hashCode();
+        }
+
+    }
+
+    /**
+     * Builds a Git tree identifier for a virtual directory structure, such as the contents of
+     * an archive.
+     *
+     * <p>Entries are added with {@link #addDirectory(String)} and the {@code addFile} overloads; the identifier
+     * of the resulting tree is computed by {@link #build()}.</p>
+     */
+    public static class TreeIdBuilder {
+
+        /**
+         * A supplier of a blob identifier that may throw {@link IOException}.
+         */
+        @FunctionalInterface
+        private interface BlobIdSupplier {
+            byte[] get() throws IOException;
+        }
+
+        /**
+         * Rejects the path components {@code "."} and {@code ".."}.
+         *
+         * @param name The path component to check.
+         * @throws IllegalArgumentException If {@code name} is {@code "."} or {@code ".."}.
+         */
+        private static void checkPathComponent(final String name) {
+            if (".".equals(name) || "..".equals(name)) {
+                throw new IllegalArgumentException("Path component not allowed: " + name);
+            }
+        }
+
+        /**
+         * Splits a path on {@code '/'} and validates every component before anything is modified.
+         *
+         * @param path The relative path.
+         * @return The components of {@code path}, including empty ones; never an empty array.
+         * @throws IllegalArgumentException If any component is {@code "."} or {@code ".."}.
+         */
+        private static String[] splitPath(final String path) {
+            final String[] components = path.split("/", -1);
+            for (final String component : components) {
+                checkPathComponent(component);
+            }
+            return components;
+        }
+
+        private final Map<String, TreeIdBuilder> dirEntries = new HashMap<>();
+        private final Map<String, DirectoryEntry> fileEntries = new HashMap<>();
+        private final MessageDigest messageDigest;
+
+        TreeIdBuilder(final MessageDigest messageDigest) {
+            this.messageDigest = Objects.requireNonNull(messageDigest);
+        }
+
+        /**
+         * Returns the builder reached by following the first {@code count} components, creating missing
+         * subdirectories on the way. Empty components are skipped.
+         *
+         * @param components The already validated path components.
+         * @param count      The number of leading components to follow.
+         * @return The builder of the addressed subdirectory, or this builder if no component was followed.
+         */
+        private TreeIdBuilder resolveDirectory(final String[] components, final int count) {
+            TreeIdBuilder current = this;
+            for (int i = 0; i < count; i++) {
+                final String component = components[i];
+                if (!component.isEmpty()) {
+                    current = current.dirEntries.computeIfAbsent(component, k -> new TreeIdBuilder(messageDigest));
+                }
+            }
+            return current;
+        }
+
+        /**
+         * Returns the {@link TreeIdBuilder} for the named subdirectory, creating it if absent.
+         *
+         * <p>Empty path components (leading, trailing or repeated {@code '/'}) are ignored. The path is
+         * validated as a whole before any subdirectory is created.</p>
+         *
+         * @param name The relative path of the subdirectory in normalized form (may contain {@code '/'}).
+         * @return The {@link TreeIdBuilder} for the subdirectory.
+         * @throws IllegalArgumentException If any path component is {@code "."} or {@code ".."}.
+         */
+        public TreeIdBuilder addDirectory(final String name) {
+            final String[] components = splitPath(name);
+            return resolveDirectory(components, components.length);
+        }
+
+        /**
+         * Adds a file entry at the given path within this tree.
+         *
+         * <p>If {@code name} contains {@code '/'}, intermediate subdirectories are created automatically.</p>
+         *
+         * <p>The stream is eagerly drained.</p>
+         *
+         * <p>If the size of the stream is known in advance, consider using {@link #addFile(FileMode, String, long, InputStream)} instead.</p>
+         *
+         * @param mode The file mode (e.g. {@link FileMode#REGULAR}).
+         * @param name The relative path of the entry in normalized form (may contain {@code '/'}).
+         * @param data The file content.
+         * @throws IOException If the stream cannot be read.
+         * @throws IllegalArgumentException If any path component is {@code "."} or {@code ".."}, or the file name is empty.
+         */
+        public void addFile(final FileMode mode, final String name, final InputStream data) throws IOException {
+            addFile(mode, name, () -> blobId(messageDigest, readAllBytes(data)));
+        }
+
+        /**
+         * Adds a file entry at the given path within this tree, for content whose size is known in advance.
+         *
+         * <p>If {@code name} contains {@code '/'}, intermediate subdirectories are created automatically.</p>
+         *
+         * <p>The stream is eagerly drained.</p>
+         *
+         * @param mode     The file mode (e.g. {@link FileMode#REGULAR}).
+         * @param name     The relative path of the entry in normalized form (may contain {@code '/'}).
+         * @param dataSize The exact number of bytes in {@code data}.
+         * @param data     The file content.
+         * @throws IOException If the stream cannot be read.
+         * @throws IllegalArgumentException If any path component is {@code "."} or {@code ".."}, or the file name is empty.
+         */
+        public void addFile(final FileMode mode, final String name, final long dataSize, final InputStream data) throws IOException {
+            addFile(mode, name, () -> blobId(messageDigest, dataSize, data));
+        }
+
+        /**
+         * Adds a file entry at the given path within this tree.
+         *
+         * <p>If {@code name} contains {@code '/'}, intermediate subdirectories are created automatically.</p>
+         *
+         * @param mode The file mode (e.g. {@link FileMode#REGULAR}).
+         * @param name The relative path of the entry in normalized form (may contain {@code '/'}).
+         * @param data The file content.
+         * @throws IOException If an I/O error occurs.
+         * @throws IllegalArgumentException If any path component is {@code "."} or {@code ".."}, or the file name is empty.
+         */
+        public void addFile(final FileMode mode, final String name, final byte[] data) throws IOException {
+            addFile(mode, name, () -> blobId(messageDigest, data));
+        }
+
+        /**
+         * Validates the path, computes the blob identifier and only then registers the entry, so that a
+         * rejected path or a failed read leaves this builder unchanged.
+         *
+         * @param mode   The file mode.
+         * @param name   The relative path of the entry.
+         * @param blobId Supplies the blob identifier of the content.
+         * @throws IOException If the blob identifier cannot be computed.
+         * @throws IllegalArgumentException If any path component is {@code "."} or {@code ".."}, or the file name is empty.
+         */
+        private void addFile(final FileMode mode, final String name, final BlobIdSupplier blobId) throws IOException {
+            final String[] components = splitPath(name);
+            final int last = components.length - 1;
+            final String fileName = components[last];
+            if (fileName.isEmpty()) {
+                // An empty or '/'-terminated path would otherwise yield a tree entry without a name.
+                throw new IllegalArgumentException("File name must not be empty: " + name);
+            }
+            final DirectoryEntry entry = new DirectoryEntry(fileName, mode, blobId.get());
+            resolveDirectory(components, last).fileEntries.put(fileName, entry);
+        }
+
+        /**
+         * Computes the Git tree identifier for this directory and all its descendants.
+         *
+         * <p>Each entry is serialized as its mode, a space, its UTF-8 encoded name, a NUL byte and its raw
+         * object identifier; the serialized entries are hashed after a {@code "tree <size>\0"} header.</p>
+         *
+         * @return The raw tree identifier bytes.
+         * @throws IOException If a digest operation fails.
+         */
+        public byte[] build() throws IOException {
+            final Set<DirectoryEntry> entries = new TreeSet<>(fileEntries.values());
+            for (final Map.Entry<String, TreeIdBuilder> e : dirEntries.entrySet()) {
+                // Subtrees are hashed first; the shared digest is reset again below before it is reused.
+                entries.add(new DirectoryEntry(e.getKey(), FileMode.DIRECTORY, e.getValue().build()));
+            }
+            final ByteArrayOutputStream baos = new ByteArrayOutputStream();
+            for (final DirectoryEntry entry : entries) {
+                baos.write(entry.type.modeBytes);
+                baos.write(' ');
+                baos.write(entry.name.getBytes(StandardCharsets.UTF_8));
+                baos.write('\0');
+                baos.write(entry.rawObjectId);
+            }
+            messageDigest.reset();
+            DigestUtils.updateDigest(messageDigest, getGitTreePrefix(baos.size()));
+            return DigestUtils.updateDigest(messageDigest, baos.toByteArray()).digest();
+        }
+    }
+
+    /**
+     * Computes a generalized Git blob identifier for the content of a byte array.
+     *
+     * <p>The identifier is computed in the way described by the
+     * <a href="https://www.swhid.org/swhid-specification/v1.2/5.Core_identifiers/#52-contents">SWHID contents identifier</a>,
+     * but it can use any hash algorithm.</p>
+     *
+     * <p>When the hash algorithm is SHA-1, the identifier is identical to Git blob identifier and SWHID contents identifier.</p>
+     *
+     * @param messageDigest The MessageDigest to use (for example SHA-1); it is reset before use.
+     * @param data          Data to digest.
+     * @return A generalized Git blob identifier.
+     */
+    public static byte[] blobId(final MessageDigest messageDigest, final byte[] data) {
+        messageDigest.reset();
+        // Hash the "blob <length>\0" header first, then the content itself.
+        final MessageDigest primed = DigestUtils.updateDigest(messageDigest, getGitBlobPrefix(data.length));
+        return DigestUtils.digest(primed, data);
+    }
+
+    /**
+     * Computes a generalized Git blob identifier for the content of a stream of unknown size.
+     *
+     * <p>The stream is drained and its contents are buffered to determine the size before hashing. To avoid
+     * buffering, use {@link #blobId(MessageDigest, long, InputStream)} when the size is known in advance.</p>
+     *
+     * <p>When the hash algorithm is SHA-1, the identifier is identical to Git blob identifier and SWHID contents identifier.</p>
+     *
+     * @param messageDigest The MessageDigest to use (for example SHA-1).
+     * @param data          Stream to digest.
+     * @return A generalized Git blob identifier.
+     * @throws IOException On error reading the stream.
+     */
+    public static byte[] blobId(final MessageDigest messageDigest, final InputStream data) throws IOException {
+        final byte[] content = readAllBytes(data);
+        return blobId(messageDigest, content);
+    }
+
+    /**
+     * Computes a generalized Git blob identifier for a stream whose size is known, without buffering.
+     *
+     * <p>Because the size of the content is supplied by the caller, {@code data} is streamed directly through
+     * the digest instead of being collected in memory first.</p>
+     *
+     * <p>When the hash algorithm is SHA-1, the identifier is identical to Git blob identifier and SWHID contents identifier.</p>
+     *
+     * @param messageDigest The MessageDigest to use (for example SHA-1); it is reset before use.
+     * @param dataSize      The exact number of bytes in {@code data}.
+     * @param data          Stream to digest.
+     * @return A generalized Git blob identifier.
+     * @throws IOException On error reading the stream.
+     */
+    public static byte[] blobId(final MessageDigest messageDigest, final long dataSize, final InputStream data) throws IOException {
+        messageDigest.reset();
+        final byte[] header = getGitBlobPrefix(dataSize);
+        DigestUtils.updateDigest(messageDigest, header);
+        final MessageDigest fed = DigestUtils.updateDigest(messageDigest, data);
+        return fed.digest();
+    }
+
+    /**
+     * Computes a generalized Git blob identifier for a file.
+     *
+     * <p>The identifier is computed in the way described by the
+     * <a href="https://www.swhid.org/swhid-specification/v1.2/5.Core_identifiers/#52-contents">SWHID contents identifier</a>,
+     * but it can use any hash algorithm.</p>
+     *
+     * <p>If {@code data} is a symbolic link, the UTF-8 encoded link target is hashed rather than the content
+     * of the file the link points to.</p>
+     *
+     * <p>When the hash algorithm is SHA-1, the identifier is identical to Git blob identifier and SWHID contents identifier.</p>
+     *
+     * @param messageDigest The MessageDigest to use (for example SHA-1); it is reset before use.
+     * @param data          Path to the file to digest.
+     * @return A generalized Git blob identifier.
+     * @throws IOException On error accessing the file.
+     */
+    public static byte[] blobId(final MessageDigest messageDigest, final Path data) throws IOException {
+        messageDigest.reset();
+        if (!Files.isSymbolicLink(data)) {
+            final byte[] header = getGitBlobPrefix(Files.size(data));
+            DigestUtils.updateDigest(messageDigest, header);
+            return DigestUtils.updateDigest(messageDigest, data).digest();
+        }
+        final String target = Files.readSymbolicLink(data).toString();
+        final byte[] linkTarget = target.getBytes(StandardCharsets.UTF_8);
+        DigestUtils.updateDigest(messageDigest, getGitBlobPrefix(linkTarget.length));
+        return DigestUtils.digest(messageDigest, linkTarget);
+    }
+
+    private static FileMode getGitDirectoryEntryType(final Path path) {
+        // Symbolic links first
+        if (Files.isSymbolicLink(path)) {
+            return FileMode.SYMBOLIC_LINK;
+        }
+        if (Files.isDirectory(path)) {
+            return FileMode.DIRECTORY;
+        }
+        if (Files.isExecutable(path)) {
+            return FileMode.EXECUTABLE;
+        }
+        return FileMode.REGULAR;
+    }
+
+    private static byte[] getGitBlobPrefix(final long dataSize) {
+        return ("blob " + dataSize + "\0").getBytes(StandardCharsets.UTF_8);
+    }
+
+    private static byte[] getGitTreePrefix(final long dataSize) {
+        return ("tree " + dataSize + "\0").getBytes(StandardCharsets.UTF_8);
+    }
+
+    /**
+     * Adds every entry of a file system directory to a builder, descending into subdirectories.
+     *
+     * <p>Symbolic links are added with their UTF-8 encoded link target as content; all other non-directory
+     * entries are added with the content of the file.</p>
+     *
+     * @param builder   The builder that receives the entries of {@code directory}.
+     * @param directory The directory to read.
+     * @throws IOException If the directory or one of its entries cannot be read.
+     */
+    private static void populateFromPath(final TreeIdBuilder builder, final Path directory) throws IOException {
+        try (DirectoryStream<Path> children = Files.newDirectoryStream(directory)) {
+            for (final Path child : children) {
+                final String childName = child.getFileName().toString();
+                final FileMode childMode = getGitDirectoryEntryType(child);
+                if (childMode == FileMode.DIRECTORY) {
+                    populateFromPath(builder.addDirectory(childName), child);
+                } else if (childMode == FileMode.SYMBOLIC_LINK) {
+                    final String target = Files.readSymbolicLink(child).toString();
+                    builder.addFile(FileMode.SYMBOLIC_LINK, childName, target.getBytes(StandardCharsets.UTF_8));
+                } else {
+                    try (InputStream content = Files.newInputStream(child)) {
+                        builder.addFile(childMode, childName, Files.size(child), content);
+                    }
+                }
+            }
+        }
+    }
+
+    private static byte[] readAllBytes(final InputStream in) throws 
IOException {

Review Comment:
   This duplicates Apache Commons IO's `IOUtils.toByteArray(InputStream)`.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to