ppkarwasz commented on code in PR #428: URL: https://github.com/apache/commons-codec/pull/428#discussion_r3068384417
########## src/main/java/org/apache/commons/codec/digest/GitIdentifiers.java: ########## @@ -0,0 +1,489 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.commons.codec.digest; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.StandardCharsets; +import java.nio.file.DirectoryStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.security.MessageDigest; +import java.util.HashMap; +import java.util.Map; +import java.util.Objects; +import java.util.Set; +import java.util.TreeSet; + +/** + * Operations for computing Git object identifiers and their generalizations described by the + * <a href="https://www.swhid.org/swhid-specification/">SWHID specification</a>. + * + * <p>When the hash algorithm is SHA-1, the identifiers produced by this class are identical to those used by Git. + * Other hash algorithms produce generalized identifiers as described by the SWHID specification.</p> + * + * <p>This class is immutable and thread-safe. 
However, the {@link MessageDigest} instances passed to it generally won't be.</p> + * + * @see <a href="https://git-scm.com/book/en/v2/Git-Internals-Git-Objects">Git Internals – Git Objects</a> + * @see <a href="https://www.swhid.org/swhid-specification/">SWHID Specification</a> + * @since 1.22.0 + */ +public class GitIdentifiers { + + /** + * The type of a Git tree entry, which maps to a Unix file-mode string. + * + * <p>Git encodes the file type and permission bits as an ASCII octal string that precedes the entry name in the binary tree format. The values defined here + * cover the four entry types that Git itself produces.</p> + */ + public enum FileMode { + + /** + * A sub-directory (Git sub-tree). + */ + DIRECTORY("40000"), + + /** + * An executable file. + */ + EXECUTABLE("100755"), + + /** + * A regular (non-executable) file. + */ + REGULAR("100644"), + + /** + * A symbolic link. + */ + SYMBOLIC_LINK("120000"); + + /** + * The octal mode as used by Git. + */ + private final String mode; + + /** + * Serialized {@code mode}: since this is mutable, it must remain private. + */ + private final byte[] modeBytes; + + FileMode(final String mode) { + this.mode = mode; + this.modeBytes = mode.getBytes(StandardCharsets.US_ASCII); + } + + /** + * Gets the octal mode as used by Git. + * + * @return The octal mode + */ + public String getMode() { + return mode; + } + } + + /** + * Represents a single entry in a Git tree object. + * + * <p>A Git tree object encodes a directory snapshot. Each entry holds:</p> + * <ul> + * <li>a {@link FileMode} that determines the Unix file mode (e.g. 
{@code 100644} for a regular file),</li> + * <li>the entry name (file or directory name, without a path separator),</li> + * <li>the raw object id of the referenced blob or sub-tree.</li> + * </ul> + * + * <p>Entries are ordered by {@link #compareTo} using Git's tree-sort rule: directory names are compared as if they ended with {@code '/'}, so that {@code foo/} + * sorts after {@code foo.bar}.</p> + * + * @see <a href="https://git-scm.com/book/en/v2/Git-Internals-Git-Objects">Git Internals – Git Objects</a> + * @see <a href="https://www.swhid.org/swhid-specification/v1.2/5.Core_identifiers/#53-directories">SWHID Directory Identifier</a> + */ + static class DirectoryEntry implements Comparable<DirectoryEntry> { + + /** + * The entry name (file or directory name, no path separator). + */ + private final String name; + /** + * The raw object id of the referenced blob or sub-tree. + */ + private final byte[] rawObjectId; + /** + * The key used for ordering entries within a tree object. + * + * <p>Git appends {@code '/'} to directory names before comparing.</p> + */ + private final String sortKey; + /** + * The Git object type, which determines the Unix file-mode prefix. + */ + private final FileMode type; + + /** + * Creates an entry. + * + * @param name The name of the entry + * @param type The type of the entry + * @param rawObjectId The id of the entry + */ + DirectoryEntry(final String name, final FileMode type, final byte[] rawObjectId) { + if (Objects.requireNonNull(name).indexOf('/') >= 0) { + throw new IllegalArgumentException("Entry name must not contain '/': " + name); + } + this.name = name; + this.type = Objects.requireNonNull(type); + this.sortKey = type == FileMode.DIRECTORY ? 
name + "/" : name; + this.rawObjectId = Objects.requireNonNull(rawObjectId); + } + + @Override + public int compareTo(final DirectoryEntry o) { + return sortKey.compareTo(o.sortKey); + } + + @Override + public boolean equals(final Object obj) { + if (obj == this) { + return true; + } + if (!(obj instanceof DirectoryEntry)) { + return false; + } + final DirectoryEntry other = (DirectoryEntry) obj; + return name.equals(other.name); + } + + @Override + public int hashCode() { + return name.hashCode(); + } + + } + + /** + * Builds a Git tree identifier for a virtual directory structure, such as the contents of + * an archive. + */ + public static class TreeIdBuilder { + + /** + * A supplier of a blob identifier that may throw {@link IOException}. + */ + @FunctionalInterface + private interface BlobIdSupplier { + byte[] get() throws IOException; + } + + private static void checkPathComponent(String name) { + if (".".equals(name) || "..".equals(name)) { + throw new IllegalArgumentException("Path component not allowed: " + name); + } + } + private final Map<String, TreeIdBuilder> dirEntries = new HashMap<>(); + private final Map<String, DirectoryEntry> fileEntries = new HashMap<>(); + private final MessageDigest messageDigest; + + TreeIdBuilder(final MessageDigest messageDigest) { + this.messageDigest = Objects.requireNonNull(messageDigest); + } + + /** + * Returns the {@link TreeIdBuilder} for the named subdirectory, creating it if absent. + * + * @param name The relative path of the subdirectory in normalized form (may contain {@code '/'}). + * @return The {@link TreeIdBuilder} for the subdirectory. + * @throws IllegalArgumentException If any path component is {@code "."} or {@code ".."}. 
+ */ + public TreeIdBuilder addDirectory(final String name) { + TreeIdBuilder current = this; + for (final String component : name.split("/", -1)) { + if (component.isEmpty()) { + continue; + } + checkPathComponent(component); + current = current.dirEntries.computeIfAbsent(component, k -> new TreeIdBuilder(messageDigest)); + } + return current; + } + + /** + * Adds a file entry at the given path within this tree. + * + * <p>If {@code name} contains {@code '/'}, intermediate subdirectories are created automatically.</p> + * + * <p>The stream is eagerly drained.</p> + * + * <p>If the size of the stream is known in advance, consider using {@link #addFile(FileMode, String, long, InputStream)} instead.</p> + * + * @param mode The file mode (e.g. {@link FileMode#REGULAR}). + * @param name The relative path of the entry in normalized form(may contain {@code '/'}). + * @param data The file content. + * @throws IOException If the stream cannot be read. + * @throws IllegalArgumentException If any path component is {@code "."} or {@code ".."}. + */ + public void addFile(final FileMode mode, final String name, final InputStream data) throws IOException { + addFile(mode, name, () -> blobId(messageDigest, readAllBytes(data))); + } + + /** + * Adds a file entry at the given path within this tree, streaming content without buffering. + * + * <p>If {@code name} contains {@code '/'}, intermediate subdirectories are created automatically.</p> + * + * <p>The stream is eagerly drained.</p> + * + * @param mode The file mode (e.g. {@link FileMode#REGULAR}). + * @param name The relative path of the entry in normalized form(may contain {@code '/'}). + * @param dataSize The exact number of bytes in {@code data}. + * @param data The file content. + * @throws IOException If the stream cannot be read. + * @throws IllegalArgumentException If any path component is {@code "."} or {@code ".."}. 
+ */ + public void addFile(final FileMode mode, final String name, final long dataSize, final InputStream data) throws IOException { + addFile(mode, name, () -> blobId(messageDigest, dataSize, data)); + } + + private void addFile(final FileMode mode, final String name, final BlobIdSupplier blobId) throws IOException { + final int slash = name.indexOf('/'); + if (slash < 0) { + checkPathComponent(name); + fileEntries.put(name, new DirectoryEntry(name, mode, blobId.get())); + } else { + addDirectory(name.substring(0, slash)).addFile(mode, name.substring(slash + 1), blobId); + } + } + + /** + * Adds a file entry at the given path within this tree. + * + * <p>If {@code name} contains {@code '/'}, intermediate subdirectories are created automatically.</p> + * + * @param mode The file mode (e.g. {@link FileMode#REGULAR}). + * @param name The relative path of the entry in normalized form(may contain {@code '/'}). + * @param data The file content. + * @throws IOException If an I/O error occurs. + * @throws IllegalArgumentException If any path component is {@code "."} or {@code ".."}. + */ + public void addFile(final FileMode mode, final String name, final byte[] data) throws IOException { + addFile(mode, name, () -> blobId(messageDigest, data)); + } + + /** + * Computes the Git tree identifier for this directory and all its descendants. + * + * @return The raw tree identifier bytes. + * @throws IOException If a digest operation fails. 
+ */ + public byte[] build() throws IOException { + final Set<DirectoryEntry> entries = new TreeSet<>(fileEntries.values()); + for (final Map.Entry<String, TreeIdBuilder> e : dirEntries.entrySet()) { + entries.add(new DirectoryEntry(e.getKey(), FileMode.DIRECTORY, e.getValue().build())); + } + final ByteArrayOutputStream baos = new ByteArrayOutputStream(); + for (final DirectoryEntry entry : entries) { + baos.write(entry.type.modeBytes); + baos.write(' '); + baos.write(entry.name.getBytes(StandardCharsets.UTF_8)); + baos.write('\0'); + baos.write(entry.rawObjectId); + } + messageDigest.reset(); + DigestUtils.updateDigest(messageDigest, getGitTreePrefix(baos.size())); + return DigestUtils.updateDigest(messageDigest, baos.toByteArray()).digest(); + } + } + + /** + * Reads through a byte array and returns a generalized Git blob identifier. + * + * <p>The identifier is computed in the way described by the + * <a href="https://www.swhid.org/swhid-specification/v1.2/5.Core_identifiers/#52-contents">SWHID contents identifier</a>, but it can use any hash + * algorithm.</p> + * + * <p>When the hash algorithm is SHA-1, the identifier is identical to Git blob identifier and SWHID contents identifier.</p> + * + * @param messageDigest The MessageDigest to use (for example SHA-1). + * @param data Data to digest. + * @return A generalized Git blob identifier. + */ + public static byte[] blobId(final MessageDigest messageDigest, final byte[] data) { + messageDigest.reset(); + DigestUtils.updateDigest(messageDigest, getGitBlobPrefix(data.length)); + return DigestUtils.digest(messageDigest, data); + } + + /** + * Reads through a stream and returns a generalized Git blob identifier. + * + * <p>The stream is drained and its contents are buffered to determine the size before hashing. 
To avoid + * buffering, use {@link #blobId(MessageDigest, long, InputStream)} when the size is known in advance.</p> + * + * <p>When the hash algorithm is SHA-1, the identifier is identical to Git blob identifier and SWHID contents identifier.</p> + * + * @param messageDigest The MessageDigest to use (for example SHA-1). + * @param data Stream to digest. + * @return A generalized Git blob identifier. + * @throws IOException On error reading the stream. + */ + public static byte[] blobId(final MessageDigest messageDigest, final InputStream data) throws IOException { + return blobId(messageDigest, readAllBytes(data)); + } + + /** + * Reads through a stream of known size and returns a generalized Git blob identifier, without buffering. + * + * <p>When the size of the content is known in advance, this overload streams {@code data} directly through + * the digest without buffering the full content in memory.</p> + * + * <p>When the hash algorithm is SHA-1, the identifier is identical to Git blob identifier and SWHID contents identifier.</p> + * + * @param messageDigest The MessageDigest to use (for example SHA-1). + * @param dataSize The exact number of bytes in {@code data}. + * @param data Stream to digest. + * @return A generalized Git blob identifier. + * @throws IOException On error reading the stream. + */ + public static byte[] blobId(final MessageDigest messageDigest, final long dataSize, final InputStream data) throws IOException { + messageDigest.reset(); + DigestUtils.updateDigest(messageDigest, getGitBlobPrefix(dataSize)); + return DigestUtils.updateDigest(messageDigest, data).digest(); + } + + /** + * Reads through a file and returns a generalized Git blob identifier. 
+ * + * <p>The identifier is computed in the way described by the + * <a href="https://www.swhid.org/swhid-specification/v1.2/5.Core_identifiers/#52-contents">SWHID contents identifier</a>, but it can use any hash + * algorithm.</p> + * + * <p>When the hash algorithm is SHA-1, the identifier is identical to Git blob identifier and SWHID contents identifier.</p> + * + * @param messageDigest The MessageDigest to use (for example SHA-1). + * @param data Path to the file to digest. + * @return A generalized Git blob identifier. + * @throws IOException On error accessing the file. + */ + public static byte[] blobId(final MessageDigest messageDigest, final Path data) throws IOException { + messageDigest.reset(); + if (Files.isSymbolicLink(data)) { + final byte[] linkTarget = Files.readSymbolicLink(data).toString().getBytes(StandardCharsets.UTF_8); + DigestUtils.updateDigest(messageDigest, getGitBlobPrefix(linkTarget.length)); + return DigestUtils.digest(messageDigest, linkTarget); + } + DigestUtils.updateDigest(messageDigest, getGitBlobPrefix(Files.size(data))); + return DigestUtils.updateDigest(messageDigest, data).digest(); + } + + private static FileMode getGitDirectoryEntryType(final Path path) { + // Symbolic links first + if (Files.isSymbolicLink(path)) { + return FileMode.SYMBOLIC_LINK; + } + if (Files.isDirectory(path)) { + return FileMode.DIRECTORY; + } + if (Files.isExecutable(path)) { + return FileMode.EXECUTABLE; + } + return FileMode.REGULAR; + } + + private static byte[] getGitBlobPrefix(final long dataSize) { + return ("blob " + dataSize + "\0").getBytes(StandardCharsets.UTF_8); + } + + private static byte[] getGitTreePrefix(final long dataSize) { + return ("tree " + dataSize + "\0").getBytes(StandardCharsets.UTF_8); + } + + private static void populateFromPath(final TreeIdBuilder builder, final Path directory) throws IOException { + try (DirectoryStream<Path> files = Files.newDirectoryStream(directory)) { + for (final Path path : files) { + final String 
name = path.getFileName().toString(); + final FileMode mode = getGitDirectoryEntryType(path); + switch (mode) { + case DIRECTORY: + populateFromPath(builder.addDirectory(name), path); + break; + case SYMBOLIC_LINK: + final byte[] linkTarget = Files.readSymbolicLink(path).toString().getBytes(StandardCharsets.UTF_8); + builder.addFile(FileMode.SYMBOLIC_LINK, name, linkTarget); + break; + default: + try (InputStream is = Files.newInputStream(path)) { + builder.addFile(mode, name, Files.size(path), is); + } + break; + } + } + } + } + + private static byte[] readAllBytes(final InputStream in) throws IOException { Review Comment: Removed in https://github.com/apache/commons-codec/pull/428/commits/9064aa23dce73af3c0eba956e5480e85192116de -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
