From a7f54b5893863126ca8bb703b45cf399c74d2f5a Mon Sep 17 00:00:00 2001 From: "Piotr P. Karwasz" Date: Thu, 9 Apr 2026 13:11:32 +0200 Subject: [PATCH 01/14] feat: Refactor Git-related methods in `GitIdentifiers` This change moves `gitBlob` and `gitTree` from `DigestUtils` into a separate utility class, to prepare for an enhancement of the provided API. The git tree identifier can be computed for many objects: the most natural is a directory in a filesystem, but we can also compute the identifier on an archive containing this directory. Additional usages will require expanding the API, beyond what can be reasonably contained in `DigestUtils`. --- .../commons/codec/digest/DigestUtils.java | 144 -------------- .../commons/codec/digest/GitIdentifiers.java | 181 ++++++++++++++++++ .../commons/codec/digest/DigestUtilsTest.java | 92 --------- .../codec/digest/GitIdentifiersTest.java | 130 +++++++++++++ 4 files changed, 311 insertions(+), 236 deletions(-) create mode 100644 src/main/java/org/apache/commons/codec/digest/GitIdentifiers.java create mode 100644 src/test/java/org/apache/commons/codec/digest/GitIdentifiersTest.java diff --git a/src/main/java/org/apache/commons/codec/digest/DigestUtils.java b/src/main/java/org/apache/commons/codec/digest/DigestUtils.java index 7c84b0b021..8970a03dbc 100644 --- a/src/main/java/org/apache/commons/codec/digest/DigestUtils.java +++ b/src/main/java/org/apache/commons/codec/digest/DigestUtils.java @@ -18,24 +18,17 @@ package org.apache.commons.codec.digest; import java.io.BufferedInputStream; -import java.io.ByteArrayOutputStream; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.io.RandomAccessFile; import java.nio.ByteBuffer; import java.nio.channels.FileChannel; -import java.nio.charset.StandardCharsets; -import java.nio.file.DirectoryStream; import java.nio.file.Files; import java.nio.file.OpenOption; import java.nio.file.Path; import java.security.MessageDigest; import 
java.security.NoSuchAlgorithmException; -import java.util.ArrayList; -import java.util.Collection; -import java.util.List; -import java.util.TreeSet; import org.apache.commons.codec.binary.Hex; import org.apache.commons.codec.binary.StringUtils; @@ -191,26 +184,6 @@ public static MessageDigest getDigest(final String algorithm, final MessageDiges } } - /** - * Returns the {@link GitDirectoryEntry.Type} of a file. - * - * @param path The file to check. - * @return A {@link GitDirectoryEntry.Type} - */ - private static GitDirectoryEntry.Type getGitDirectoryEntryType(final Path path) { - // Symbolic links first - if (Files.isSymbolicLink(path)) { - return GitDirectoryEntry.Type.SYMBOLIC_LINK; - } - if (Files.isDirectory(path)) { - return GitDirectoryEntry.Type.DIRECTORY; - } - if (Files.isExecutable(path)) { - return GitDirectoryEntry.Type.EXECUTABLE; - } - return GitDirectoryEntry.Type.REGULAR; - } - /** * Gets an MD2 MessageDigest. * @@ -407,123 +380,6 @@ public static MessageDigest getShake256_512Digest() { return getDigest(MessageDigestAlgorithms.SHAKE256_512); } - /** - * Reads through a byte array and return a generalized Git blob identifier. - * - *

The identifier is computed in the way described by the - * SWHID contents identifier, but it can use any hash - * algorithm.

- * - *

When the hash algorithm is SHA-1, the identifier is identical to Git blob identifier and SWHID contents identifier.

- * - * @param messageDigest The MessageDigest to use (for example SHA-1). - * @param data Data to digest. - * @return A generalized Git blob identifier. - * @since 1.22.0 - */ - public static byte[] gitBlob(final MessageDigest messageDigest, final byte[] data) { - messageDigest.reset(); - updateDigest(messageDigest, gitBlobPrefix(data.length)); - return digest(messageDigest, data); - } - - /** - * Reads through a byte array and return a generalized Git blob identifier. - * - *

The identifier is computed in the way described by the - * SWHID contents identifier, but it can use any hash - * algorithm.

- * - *

When the hash algorithm is SHA-1, the identifier is identical to Git blob identifier and SWHID contents identifier.

- * - * @param messageDigest The MessageDigest to use (for example SHA-1). - * @param data Data to digest. - * @param options Options how to open the file. - * @return A generalized Git blob identifier. - * @throws IOException On error accessing the file. - * @since 1.22.0 - */ - public static byte[] gitBlob(final MessageDigest messageDigest, final Path data, final OpenOption... options) throws IOException { - messageDigest.reset(); - if (Files.isSymbolicLink(data)) { - final byte[] linkTarget = Files.readSymbolicLink(data).toString().getBytes(StandardCharsets.UTF_8); - updateDigest(messageDigest, gitBlobPrefix(linkTarget.length)); - return digest(messageDigest, linkTarget); - } - updateDigest(messageDigest, gitBlobPrefix(Files.size(data))); - return updateDigest(messageDigest, data, options).digest(); - } - - private static byte[] gitBlobPrefix(final long dataSize) { - return gitPrefix("blob ", dataSize); - } - - private static byte[] gitPrefix(final String prefix, final long dataSize) { - return (prefix + dataSize + "\0").getBytes(StandardCharsets.UTF_8); - } - - /** - * Returns a generalized Git tree identifier. - * - *

The identifier is computed in the way described by the - * SWHID directory identifier, but it can use any hash - * algorithm.

- * - *

When the hash algorithm is SHA-1, the identifier is identical to Git tree identifier and SWHID directory identifier.

- * - * @param messageDigest The MessageDigest to use (for example SHA-1). - * @param entries The directory entries. - * @return A generalized Git tree identifier. - */ - static byte[] gitTree(final MessageDigest messageDigest, final Collection entries) { - final TreeSet treeSet = new TreeSet<>(entries); - final ByteArrayOutputStream baos = new ByteArrayOutputStream(); - for (final GitDirectoryEntry entry : treeSet) { - final byte[] treeEntryBytes = entry.toTreeEntryBytes(); - baos.write(treeEntryBytes, 0, treeEntryBytes.length); - } - messageDigest.reset(); - updateDigest(messageDigest, gitTreePrefix(baos.size())); - return updateDigest(messageDigest, baos.toByteArray()).digest(); - } - - /** - * Reads through a byte array and return a generalized Git tree identifier. - * - *

The identifier is computed in the way described by the - * SWHID directory identifier, but it can use any hash - * algorithm.

- * - *

When the hash algorithm is SHA-1, the identifier is identical to Git tree identifier and SWHID directory identifier.

- * - * @param messageDigest The MessageDigest to use (for example SHA-1). - * @param data Data to digest. - * @param options Options how to open the file. - * @return A generalized Git tree identifier. - * @throws IOException On error accessing the file. - * @since 1.22.0 - */ - public static byte[] gitTree(final MessageDigest messageDigest, final Path data, final OpenOption... options) throws IOException { - final List entries = new ArrayList<>(); - try (DirectoryStream files = Files.newDirectoryStream(data)) { - for (final Path path : files) { - final GitDirectoryEntry.Type type = getGitDirectoryEntryType(path); - final byte[] rawObjectId; - if (type == GitDirectoryEntry.Type.DIRECTORY) { - rawObjectId = gitTree(messageDigest, path, options); - } else { - rawObjectId = gitBlob(messageDigest, path, options); - } - entries.add(new GitDirectoryEntry(path, type, rawObjectId)); - } - } - return gitTree(messageDigest, entries); - } - - private static byte[] gitTreePrefix(final long dataSize) { - return gitPrefix("tree ", dataSize); - } - /** * Test whether the algorithm is supported. * diff --git a/src/main/java/org/apache/commons/codec/digest/GitIdentifiers.java b/src/main/java/org/apache/commons/codec/digest/GitIdentifiers.java new file mode 100644 index 0000000000..3cbf48b8ea --- /dev/null +++ b/src/main/java/org/apache/commons/codec/digest/GitIdentifiers.java @@ -0,0 +1,181 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.commons.codec.digest; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.nio.file.DirectoryStream; +import java.nio.file.Files; +import java.nio.file.OpenOption; +import java.nio.file.Path; +import java.security.MessageDigest; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; +import java.util.TreeSet; + +/** + * Operations for computing Git object identifiers and their generalizations described by the + * SWHID specification. + * + *

When the hash algorithm is SHA-1, the identifiers produced by this class are identical to those used by Git. + * Other hash algorithms produce generalized identifiers as described by the SWHID specification.

+ * + *

This class is immutable and thread-safe. However, the {@link MessageDigest} instances passed to it generally won't be.

+ * + * @see Git Internals – Git Objects + * @see SWHID Specification + * @since 1.22.0 + */ +public class GitIdentifiers { + + private static GitDirectoryEntry.Type getGitDirectoryEntryType(final Path path) { + // Symbolic links first + if (Files.isSymbolicLink(path)) { + return GitDirectoryEntry.Type.SYMBOLIC_LINK; + } + if (Files.isDirectory(path)) { + return GitDirectoryEntry.Type.DIRECTORY; + } + if (Files.isExecutable(path)) { + return GitDirectoryEntry.Type.EXECUTABLE; + } + return GitDirectoryEntry.Type.REGULAR; + } + + /** + * Reads through a byte array and returns a generalized Git blob identifier. + * + *

The identifier is computed in the way described by the + * SWHID contents identifier, but it can use any hash + * algorithm.

+ * + *

When the hash algorithm is SHA-1, the identifier is identical to Git blob identifier and SWHID contents identifier.

+ * + * @param messageDigest The MessageDigest to use (for example SHA-1). + * @param data Data to digest. + * @return A generalized Git blob identifier. + */ + public static byte[] blobId(final MessageDigest messageDigest, final byte[] data) { + messageDigest.reset(); + DigestUtils.updateDigest(messageDigest, gitBlobPrefix(data.length)); + return DigestUtils.digest(messageDigest, data); + } + + /** + * Reads through a file and returns a generalized Git blob identifier. + * + *

The identifier is computed in the way described by the + * SWHID contents identifier, but it can use any hash + * algorithm.

+ * + *

When the hash algorithm is SHA-1, the identifier is identical to Git blob identifier and SWHID contents identifier.

+ * + * @param messageDigest The MessageDigest to use (for example SHA-1). + * @param data Path to the file to digest. + * @param options Options how to open the file. + * @return A generalized Git blob identifier. + * @throws IOException On error accessing the file. + * @since 1.22.0 + */ + public static byte[] blobId(final MessageDigest messageDigest, final Path data, final OpenOption... options) throws IOException { + messageDigest.reset(); + if (Files.isSymbolicLink(data)) { + final byte[] linkTarget = Files.readSymbolicLink(data).toString().getBytes(StandardCharsets.UTF_8); + DigestUtils.updateDigest(messageDigest, gitBlobPrefix(linkTarget.length)); + return DigestUtils.digest(messageDigest, linkTarget); + } + DigestUtils.updateDigest(messageDigest, gitBlobPrefix(Files.size(data))); + return DigestUtils.updateDigest(messageDigest, data, options).digest(); + } + + private static byte[] gitBlobPrefix(final long dataSize) { + return gitPrefix("blob ", dataSize); + } + + private static byte[] gitPrefix(final String prefix, final long dataSize) { + return (prefix + dataSize + "\0").getBytes(StandardCharsets.UTF_8); + } + + /** + * Returns a generalized Git tree identifier for a collection of directory entries. + * + *

The identifier is computed in the way described by the + * SWHID directory identifier, but it can use any hash + * algorithm.

+ * + *

When the hash algorithm is SHA-1, the identifier is identical to Git tree identifier and SWHID directory identifier.

+ * + * @param messageDigest The MessageDigest to use (for example SHA-1). + * @param entries The directory entries. + * @return A generalized Git tree identifier. + */ + static byte[] treeId(final MessageDigest messageDigest, final Collection entries) { + final TreeSet treeSet = new TreeSet<>(entries); + final ByteArrayOutputStream baos = new ByteArrayOutputStream(); + for (final GitDirectoryEntry entry : treeSet) { + final byte[] treeEntryBytes = entry.toTreeEntryBytes(); + baos.write(treeEntryBytes, 0, treeEntryBytes.length); + } + messageDigest.reset(); + DigestUtils.updateDigest(messageDigest, gitTreePrefix(baos.size())); + return DigestUtils.updateDigest(messageDigest, baos.toByteArray()).digest(); + } + + /** + * Reads through a directory and returns a generalized Git tree identifier. + * + *

The identifier is computed in the way described by the + * SWHID directory identifier, but it can use any hash + * algorithm.

+ * + *

When the hash algorithm is SHA-1, the identifier is identical to Git tree identifier and SWHID directory identifier.

+ * + * @param messageDigest The MessageDigest to use (for example SHA-1). + * @param data Path to the directory to digest. + * @param options Options how to open files within the directory. + * @return A generalized Git tree identifier. + * @throws IOException On error accessing the directory or its contents. + * @since 1.22.0 + */ + public static byte[] treeId(final MessageDigest messageDigest, final Path data, final OpenOption... options) throws IOException { + final List entries = new ArrayList<>(); + try (DirectoryStream files = Files.newDirectoryStream(data)) { + for (final Path path : files) { + final GitDirectoryEntry.Type type = getGitDirectoryEntryType(path); + final byte[] rawObjectId; + if (type == GitDirectoryEntry.Type.DIRECTORY) { + rawObjectId = treeId(messageDigest, path, options); + } else { + rawObjectId = blobId(messageDigest, path, options); + } + entries.add(new GitDirectoryEntry(path, type, rawObjectId)); + } + } + return treeId(messageDigest, entries); + } + + private static byte[] gitTreePrefix(final long dataSize) { + return gitPrefix("tree ", dataSize); + } + + private GitIdentifiers() { + // utility class + } +} diff --git a/src/test/java/org/apache/commons/codec/digest/DigestUtilsTest.java b/src/test/java/org/apache/commons/codec/digest/DigestUtilsTest.java index 7d1e72b0b8..6f7160baa7 100644 --- a/src/test/java/org/apache/commons/codec/digest/DigestUtilsTest.java +++ b/src/test/java/org/apache/commons/codec/digest/DigestUtilsTest.java @@ -32,14 +32,11 @@ import java.io.OutputStream; import java.io.RandomAccessFile; import java.nio.ByteBuffer; -import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.security.MessageDigest; -import java.util.ArrayList; import java.util.Arrays; -import java.util.List; import java.util.Locale; import java.util.Random; import java.util.stream.Stream; @@ -50,14 +47,11 @@ import org.apache.commons.lang3.StringUtils; import 
org.apache.commons.lang3.SystemUtils; import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.Assumptions; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.io.TempDir; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.Arguments; import org.junit.jupiter.params.provider.MethodSource; -import org.junit.jupiter.params.provider.ValueSource; /** * Tests {@link DigestUtils}. @@ -244,31 +238,6 @@ class DigestUtilsTest { "CA 92 BF 0B E5 61 5E 96 95 9D 76 71 97 A0 BE EB"; // @formatter:on - /** - * Binary body of the test tree object used in {@link #testGitTreeCollection}. - * - *

Each entry has the format {@code <mode> SP <name> NUL <20-byte-object-id>}.

- */ - private static final String TREE_BODY_HEX = - // 100644 hello.txt\0 + objectId - "3130303634342068656c6c6f2e74787400" + "a1b2c3d4e5f6a7b8c9d0e1f2a3b4c5d6e7f8a9b0" + - // 120000 link.txt\0 + objectId - "313230303030206c696e6b2e74787400" + "1234567890abcdef1234567890abcdef12345678" + - // 100755 run.sh\0 + objectId - "3130303735352072756e2e736800" + "f0e1d2c3b4a5f6e7d8c9b0a1f2e3d4c5b6a7f8e9" + - // 40000 src\0 + objectId - "34303030302073726300" + "deadbeefdeadbeefdeadbeefdeadbeefdeadbeef"; - - static Stream gitBlobProvider() { - return Stream.of(Arguments.of("DigestUtilsTest/hello.txt", "5f4a83288e67f1be2d6fcdad84165a86c6a970d7"), - Arguments.of("DigestUtilsTest/greetings.txt", "6cf4f797455661e61d1ee6913fc29344f5897243"), - Arguments.of("DigestUtilsTest/subdir/nested.txt", "07a392ddb4dbff06a373a7617939f30b2dcfe719")); - } - - private static Path resourcePath(final String resourceName) throws Exception { - return Paths.get(DigestUtilsTest.class.getClassLoader().getResource(resourceName).toURI()); - } - static Stream testShake128_256() { // @formatter:off return Stream.of( @@ -506,67 +475,6 @@ void testGetMessageDigest() { assertEquals(MessageDigestAlgorithms.MD5, digestUtils.getMessageDigest().getAlgorithm()); } - @ParameterizedTest - @MethodSource("gitBlobProvider") - void testGitBlobByteArray(final String resourceName, final String expectedSha1Hex) throws Exception { - final byte[] data = Files.readAllBytes(resourcePath(resourceName)); - assertArrayEquals(Hex.decodeHex(expectedSha1Hex), DigestUtils.gitBlob(DigestUtils.getSha1Digest(), data)); - } - - @ParameterizedTest - @MethodSource("gitBlobProvider") - void testGitBlobPath(final String resourceName, final String expectedSha1Hex) throws Exception { - assertArrayEquals(Hex.decodeHex(expectedSha1Hex), DigestUtils.gitBlob(DigestUtils.getSha1Digest(), resourcePath(resourceName))); - } - - @Test - void testGitBlobSymlink(@TempDir final Path tempDir) throws Exception { - final Path subDir = 
Files.createDirectory(tempDir.resolve("subdir")); - Files.write(subDir.resolve("file.txt"), "hello".getBytes(StandardCharsets.UTF_8)); - final Path linkToDir; - final Path linkToFile; - try { - linkToDir = Files.createSymbolicLink(tempDir.resolve("link-to-dir"), Paths.get("subdir")); - linkToFile = Files.createSymbolicLink(tempDir.resolve("link-to-file"), Paths.get("subdir/file.txt")); - } catch (final UnsupportedOperationException e) { - Assumptions.assumeTrue(false, "Symbolic links not supported on this filesystem"); - return; - } - final MessageDigest sha1 = DigestUtils.getSha1Digest(); - assertArrayEquals(Hex.decodeHex("8bbe8a53790056316b23b7c270f10ab6bf6bb1b4"), DigestUtils.gitBlob(sha1, linkToDir)); - assertArrayEquals(Hex.decodeHex("dfe6ef8392ae13a11ff85419b4fd906d997b6cb7"), DigestUtils.gitBlob(sha1, linkToFile)); - } - - @ParameterizedTest - @ValueSource(strings = {MessageDigestAlgorithms.SHA_1, MessageDigestAlgorithms.SHA_256}) - void testGitTreeCollection(final String algorithm) throws Exception { - final byte[] helloId = Hex.decodeHex("a1b2c3d4e5f6a7b8c9d0e1f2a3b4c5d6e7f8a9b0"); - final byte[] runId = Hex.decodeHex("f0e1d2c3b4a5f6e7d8c9b0a1f2e3d4c5b6a7f8e9"); - final byte[] linkId = Hex.decodeHex("1234567890abcdef1234567890abcdef12345678"); - final byte[] srcId = Hex.decodeHex("deadbeefdeadbeefdeadbeefdeadbeefdeadbeef"); - - // Entries are supplied out of order to verify that the method sorts them correctly. 
- final List entries = new ArrayList<>(); - entries.add(new GitDirectoryEntry(Paths.get("src"), GitDirectoryEntry.Type.DIRECTORY, srcId)); - entries.add(new GitDirectoryEntry(Paths.get("run.sh"), GitDirectoryEntry.Type.EXECUTABLE, runId)); - entries.add(new GitDirectoryEntry(Paths.get("hello.txt"), GitDirectoryEntry.Type.REGULAR, helloId)); - entries.add(new GitDirectoryEntry(Paths.get("link.txt"), GitDirectoryEntry.Type.SYMBOLIC_LINK, linkId)); - - // Compute expected value - final byte[] treeBody = Hex.decodeHex(TREE_BODY_HEX); - final MessageDigest md = DigestUtils.getDigest(algorithm); - DigestUtils.updateDigest(md, ("tree " + treeBody.length + "\0").getBytes(StandardCharsets.UTF_8)); - final byte[] expected = DigestUtils.updateDigest(md, treeBody).digest(); - - assertArrayEquals(expected, DigestUtils.gitTree(md, entries)); - } - - @Test - void testGitTreePath() throws Exception { - assertArrayEquals(Hex.decodeHex("e4b21f6d78ceba6eb7c211ac15e3337ec4614e8a"), - DigestUtils.gitTree(DigestUtils.getSha1Digest(), resourcePath("DigestUtilsTest"))); - } - @Test void testInternalNoSuchAlgorithmException() { assertThrows(IllegalArgumentException.class, () -> DigestUtils.getDigest("Bogus Bogus")); diff --git a/src/test/java/org/apache/commons/codec/digest/GitIdentifiersTest.java b/src/test/java/org/apache/commons/codec/digest/GitIdentifiersTest.java new file mode 100644 index 0000000000..075e08239a --- /dev/null +++ b/src/test/java/org/apache/commons/codec/digest/GitIdentifiersTest.java @@ -0,0 +1,130 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.commons.codec.digest; + +import static org.junit.jupiter.api.Assertions.assertArrayEquals; + +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.security.MessageDigest; +import java.util.ArrayList; +import java.util.List; +import java.util.stream.Stream; + +import org.apache.commons.codec.binary.Hex; +import org.junit.jupiter.api.Assumptions; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; +import org.junit.jupiter.params.provider.ValueSource; + +/** + * Tests {@link GitIdentifiers}. + */ +class GitIdentifiersTest { + + /** + * Binary body of the test tree object used in {@link #testTreeIdCollection}. + * + *

Each entry has the format {@code <mode> SP <name> NUL <20-byte-object-id>}.

+ */ + private static final String TREE_BODY_HEX = + // 100644 hello.txt\0 + objectId + "3130303634342068656c6c6f2e74787400" + "a1b2c3d4e5f6a7b8c9d0e1f2a3b4c5d6e7f8a9b0" + + // 120000 link.txt\0 + objectId + "313230303030206c696e6b2e74787400" + "1234567890abcdef1234567890abcdef12345678" + + // 100755 run.sh\0 + objectId + "3130303735352072756e2e736800" + "f0e1d2c3b4a5f6e7d8c9b0a1f2e3d4c5b6a7f8e9" + + // 40000 src\0 + objectId + "34303030302073726300" + "deadbeefdeadbeefdeadbeefdeadbeefdeadbeef"; + + static Stream blobIdProvider() { + return Stream.of(Arguments.of("DigestUtilsTest/hello.txt", "5f4a83288e67f1be2d6fcdad84165a86c6a970d7"), + Arguments.of("DigestUtilsTest/greetings.txt", "6cf4f797455661e61d1ee6913fc29344f5897243"), + Arguments.of("DigestUtilsTest/subdir/nested.txt", "07a392ddb4dbff06a373a7617939f30b2dcfe719")); + } + + private static Path resourcePath(final String resourceName) throws Exception { + return Paths.get(GitIdentifiersTest.class.getClassLoader().getResource(resourceName).toURI()); + } + + @ParameterizedTest + @MethodSource("blobIdProvider") + void testBlobIdByteArray(final String resourceName, final String expectedSha1Hex) throws Exception { + final byte[] data = Files.readAllBytes(resourcePath(resourceName)); + assertArrayEquals(Hex.decodeHex(expectedSha1Hex), GitIdentifiers.blobId(DigestUtils.getSha1Digest(), data)); + } + + @ParameterizedTest + @MethodSource("blobIdProvider") + void testBlobIdPath(final String resourceName, final String expectedSha1Hex) throws Exception { + assertArrayEquals(Hex.decodeHex(expectedSha1Hex), GitIdentifiers.blobId(DigestUtils.getSha1Digest(), resourcePath(resourceName))); + } + + @Test + void testBlobIdSymlink(@TempDir final Path tempDir) throws Exception { + final Path subDir = Files.createDirectory(tempDir.resolve("subdir")); + Files.write(subDir.resolve("file.txt"), "hello".getBytes(StandardCharsets.UTF_8)); + final Path linkToDir; + final Path linkToFile; + try { + linkToDir = 
Files.createSymbolicLink(tempDir.resolve("link-to-dir"), Paths.get("subdir")); + linkToFile = Files.createSymbolicLink(tempDir.resolve("link-to-file"), Paths.get("subdir/file.txt")); + } catch (final UnsupportedOperationException e) { + Assumptions.assumeTrue(false, "Symbolic links not supported on this filesystem"); + return; + } + final MessageDigest sha1 = DigestUtils.getSha1Digest(); + assertArrayEquals(Hex.decodeHex("8bbe8a53790056316b23b7c270f10ab6bf6bb1b4"), GitIdentifiers.blobId(sha1, linkToDir)); + assertArrayEquals(Hex.decodeHex("dfe6ef8392ae13a11ff85419b4fd906d997b6cb7"), GitIdentifiers.blobId(sha1, linkToFile)); + } + + @ParameterizedTest + @ValueSource(strings = {MessageDigestAlgorithms.SHA_1, MessageDigestAlgorithms.SHA_256}) + void testTreeIdCollection(final String algorithm) throws Exception { + final byte[] helloId = Hex.decodeHex("a1b2c3d4e5f6a7b8c9d0e1f2a3b4c5d6e7f8a9b0"); + final byte[] runId = Hex.decodeHex("f0e1d2c3b4a5f6e7d8c9b0a1f2e3d4c5b6a7f8e9"); + final byte[] linkId = Hex.decodeHex("1234567890abcdef1234567890abcdef12345678"); + final byte[] srcId = Hex.decodeHex("deadbeefdeadbeefdeadbeefdeadbeefdeadbeef"); + + // Entries are supplied out of order to verify that the method sorts them correctly. 
+ final List entries = new ArrayList<>(); + entries.add(new GitDirectoryEntry(Paths.get("src"), GitDirectoryEntry.Type.DIRECTORY, srcId)); + entries.add(new GitDirectoryEntry(Paths.get("run.sh"), GitDirectoryEntry.Type.EXECUTABLE, runId)); + entries.add(new GitDirectoryEntry(Paths.get("hello.txt"), GitDirectoryEntry.Type.REGULAR, helloId)); + entries.add(new GitDirectoryEntry(Paths.get("link.txt"), GitDirectoryEntry.Type.SYMBOLIC_LINK, linkId)); + + // Compute expected value + final byte[] treeBody = Hex.decodeHex(TREE_BODY_HEX); + final MessageDigest md = DigestUtils.getDigest(algorithm); + DigestUtils.updateDigest(md, ("tree " + treeBody.length + "\0").getBytes(StandardCharsets.UTF_8)); + final byte[] expected = DigestUtils.updateDigest(md, treeBody).digest(); + + assertArrayEquals(expected, GitIdentifiers.treeId(md, entries)); + } + + @Test + void testTreeIdPath() throws Exception { + assertArrayEquals(Hex.decodeHex("e4b21f6d78ceba6eb7c211ac15e3337ec4614e8a"), + GitIdentifiers.treeId(DigestUtils.getSha1Digest(), resourcePath("DigestUtilsTest"))); + } +} From 940ef4c2c2e6bf855f35a1623e515c80af0416f7 Mon Sep 17 00:00:00 2001 From: "Piotr P. Karwasz" Date: Thu, 9 Apr 2026 14:00:29 +0200 Subject: [PATCH 02/14] fix: remove `OpenOption` parameters The `OpenOption` parameters are not very useful, since files are usually opened read-only. 
--- .../apache/commons/codec/digest/GitIdentifiers.java | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/main/java/org/apache/commons/codec/digest/GitIdentifiers.java b/src/main/java/org/apache/commons/codec/digest/GitIdentifiers.java index 3cbf48b8ea..caeeeeaf73 100644 --- a/src/main/java/org/apache/commons/codec/digest/GitIdentifiers.java +++ b/src/main/java/org/apache/commons/codec/digest/GitIdentifiers.java @@ -22,7 +22,6 @@ import java.nio.charset.StandardCharsets; import java.nio.file.DirectoryStream; import java.nio.file.Files; -import java.nio.file.OpenOption; import java.nio.file.Path; import java.security.MessageDigest; import java.util.ArrayList; @@ -89,12 +88,11 @@ public static byte[] blobId(final MessageDigest messageDigest, final byte[] data * * @param messageDigest The MessageDigest to use (for example SHA-1). * @param data Path to the file to digest. - * @param options Options how to open the file. * @return A generalized Git blob identifier. * @throws IOException On error accessing the file. * @since 1.22.0 */ - public static byte[] blobId(final MessageDigest messageDigest, final Path data, final OpenOption... 
options) throws IOException { + public static byte[] blobId(final MessageDigest messageDigest, final Path data) throws IOException { messageDigest.reset(); if (Files.isSymbolicLink(data)) { final byte[] linkTarget = Files.readSymbolicLink(data).toString().getBytes(StandardCharsets.UTF_8); @@ -102,7 +100,7 @@ public static byte[] blobId(final MessageDigest messageDigest, final Path data, return DigestUtils.digest(messageDigest, linkTarget); } DigestUtils.updateDigest(messageDigest, gitBlobPrefix(Files.size(data))); - return DigestUtils.updateDigest(messageDigest, data, options).digest(); + return DigestUtils.updateDigest(messageDigest, data).digest(); } private static byte[] gitBlobPrefix(final long dataSize) { @@ -149,21 +147,20 @@ static byte[] treeId(final MessageDigest messageDigest, final Collection entries = new ArrayList<>(); try (DirectoryStream files = Files.newDirectoryStream(data)) { for (final Path path : files) { final GitDirectoryEntry.Type type = getGitDirectoryEntryType(path); final byte[] rawObjectId; if (type == GitDirectoryEntry.Type.DIRECTORY) { - rawObjectId = treeId(messageDigest, path, options); + rawObjectId = treeId(messageDigest, path); } else { - rawObjectId = blobId(messageDigest, path, options); + rawObjectId = blobId(messageDigest, path); } entries.add(new GitDirectoryEntry(path, type, rawObjectId)); } From 1d44d87388e9fab0cdba26557aee9b64de4567c0 Mon Sep 17 00:00:00 2001 From: "Piotr P. 
Karwasz" Date: Thu, 9 Apr 2026 14:03:19 +0200 Subject: [PATCH 03/14] fix: remove useless @since annotations --- .../java/org/apache/commons/codec/digest/GitIdentifiers.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/main/java/org/apache/commons/codec/digest/GitIdentifiers.java b/src/main/java/org/apache/commons/codec/digest/GitIdentifiers.java index caeeeeaf73..5a22760a92 100644 --- a/src/main/java/org/apache/commons/codec/digest/GitIdentifiers.java +++ b/src/main/java/org/apache/commons/codec/digest/GitIdentifiers.java @@ -90,7 +90,6 @@ public static byte[] blobId(final MessageDigest messageDigest, final byte[] data * @param data Path to the file to digest. * @return A generalized Git blob identifier. * @throws IOException On error accessing the file. - * @since 1.22.0 */ public static byte[] blobId(final MessageDigest messageDigest, final Path data) throws IOException { messageDigest.reset(); @@ -149,7 +148,6 @@ static byte[] treeId(final MessageDigest messageDigest, final Collection entries = new ArrayList<>(); From 92bf278d18f2ce35f21bc9367e14d11bc1ba0860 Mon Sep 17 00:00:00 2001 From: "Piotr P. 
Karwasz" Date: Thu, 9 Apr 2026 14:09:06 +0200 Subject: [PATCH 04/14] fix: make GitDirectoryEntry internal --- .../codec/digest/GitDirectoryEntry.java | 183 ----------------- .../commons/codec/digest/GitIdentifiers.java | 186 ++++++++++++++++-- .../codec/digest/GitDirectoryEntryTest.java | 95 --------- .../codec/digest/GitIdentifiersTest.java | 79 +++++++- 4 files changed, 248 insertions(+), 295 deletions(-) delete mode 100644 src/main/java/org/apache/commons/codec/digest/GitDirectoryEntry.java delete mode 100644 src/test/java/org/apache/commons/codec/digest/GitDirectoryEntryTest.java diff --git a/src/main/java/org/apache/commons/codec/digest/GitDirectoryEntry.java b/src/main/java/org/apache/commons/codec/digest/GitDirectoryEntry.java deleted file mode 100644 index e41400e4b7..0000000000 --- a/src/main/java/org/apache/commons/codec/digest/GitDirectoryEntry.java +++ /dev/null @@ -1,183 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.commons.codec.digest; - -import java.nio.charset.StandardCharsets; -import java.nio.file.Path; -import java.util.Objects; - -/** - * Represents a single entry in a Git tree object. - * - *

A Git tree object encodes a directory snapshot. Each entry holds:

- *
    - *
  • a {@link Type} that determines the Unix file mode (e.g. {@code 100644} for a regular file),
  • - *
  • the entry name (file or directory name, without a path separator),
  • - *
  • the raw object id of the referenced blob or sub-tree.
  • - *
- * - *

Entries are ordered by {@link #compareTo} using Git's tree-sort rule: directory names are compared as if they ended with {@code '/'}, so that {@code foo/} - * sorts after {@code foobar}.

- * - *

Call {@link #toTreeEntryBytes()} to obtain the binary encoding that Git feeds to its hash function when computing the tree object identifier.

- * - * @see Git Internals – Git Objects - * @see SWHID Directory Identifier - */ -class GitDirectoryEntry implements Comparable { - - /** - * The type of a Git tree entry, which maps to a Unix file-mode string. - * - *

Git encodes the file type and permission bits as an ASCII octal string that precedes the entry name in the binary tree format. The values defined here - * cover the four entry types that Git itself produces.

- * - *

This enum is package-private. If it were made public, {@link #mode} would need to be wrapped in an immutable copy to prevent external mutation.

- */ - enum Type { - - /** - * A sub-directory (Git sub-tree). - */ - DIRECTORY("40000"), - - /** - * An executable file. - */ - EXECUTABLE("100755"), - - /** - * A regular (non-executable) file. - */ - REGULAR("100644"), - - /** - * A symbolic link. - */ - SYMBOLIC_LINK("120000"); - - /** - * The ASCII-encoded octal mode string as it appears in the binary tree entry. - */ - private final byte[] mode; - - Type(final String mode) { - this.mode = mode.getBytes(StandardCharsets.US_ASCII); - } - } - - private static String getFileName(final Path path) { - final Path fileName = path.getFileName(); - if (fileName == null) { - throw new IllegalArgumentException(path.toString()); - } - return fileName.toString(); - } - - /** - * The entry name (file or directory name, no path separator). - */ - private final String name; - - /** - * The key used for ordering entries within a tree object. - * - *

>Git appends {@code '/'} to directory names before comparing.

- */ - private final String sortKey; - - /** - * The Git object type, which determines the Unix file-mode prefix. - */ - private final Type type; - - /** - * The raw object id of the referenced blob or sub-tree. - */ - private final byte[] rawObjectId; - - /** - * Creates an entry. - * - * @param path The path of the entry; must not be an empty path. - * @param type The type of the entry. - * @param rawObjectId The id of the entry. - * @throws IllegalArgumentException If the path is empty. - * @throws NullPointerException If any argument is {@code null}. - */ - GitDirectoryEntry(final Path path, final Type type, final byte[] rawObjectId) { - this(getFileName(path), type, rawObjectId); - } - - /** - * Creates an entry. - * - * @param name The name of the entry - * @param type The type of the entry - * @param rawObjectId The id of the entry - */ - private GitDirectoryEntry(final String name, final Type type, final byte[] rawObjectId) { - this.name = name; - this.type = Objects.requireNonNull(type); - this.sortKey = type == Type.DIRECTORY ? name + "/" : name; - this.rawObjectId = Objects.requireNonNull(rawObjectId); - } - - @Override - public int compareTo(final GitDirectoryEntry o) { - return sortKey.compareTo(o.sortKey); - } - - @Override - public boolean equals(final Object obj) { - if (obj == this) { - return true; - } - if (!(obj instanceof GitDirectoryEntry)) { - return false; - } - final GitDirectoryEntry other = (GitDirectoryEntry) obj; - return name.equals(other.name); - } - - @Override - public int hashCode() { - return name.hashCode(); - } - - /** - * Returns the binary encoding of this entry as it appears inside a Git tree object. - * - *

The format follows the Git tree entry layout:

- *
-     *   <mode> SP <name> NUL <20-byte-object-id>
-     * 
- * - * @return the binary tree-entry encoding; never {@code null}. - */ - byte[] toTreeEntryBytes() { - final byte[] nameBytes = name.getBytes(StandardCharsets.UTF_8); - final byte[] result = new byte[type.mode.length + nameBytes.length + rawObjectId.length + 2]; - System.arraycopy(type.mode, 0, result, 0, type.mode.length); - result[type.mode.length] = ' '; - System.arraycopy(nameBytes, 0, result, type.mode.length + 1, nameBytes.length); - result[type.mode.length + nameBytes.length + 1] = '\0'; - System.arraycopy(rawObjectId, 0, result, type.mode.length + nameBytes.length + 2, rawObjectId.length); - return result; - } -} diff --git a/src/main/java/org/apache/commons/codec/digest/GitIdentifiers.java b/src/main/java/org/apache/commons/codec/digest/GitIdentifiers.java index 5a22760a92..39deccf13c 100644 --- a/src/main/java/org/apache/commons/codec/digest/GitIdentifiers.java +++ b/src/main/java/org/apache/commons/codec/digest/GitIdentifiers.java @@ -27,6 +27,7 @@ import java.util.ArrayList; import java.util.Collection; import java.util.List; +import java.util.Objects; import java.util.TreeSet; /** @@ -44,18 +45,18 @@ */ public class GitIdentifiers { - private static GitDirectoryEntry.Type getGitDirectoryEntryType(final Path path) { + private static DirectoryEntry.Type getGitDirectoryEntryType(final Path path) { // Symbolic links first if (Files.isSymbolicLink(path)) { - return GitDirectoryEntry.Type.SYMBOLIC_LINK; + return DirectoryEntry.Type.SYMBOLIC_LINK; } if (Files.isDirectory(path)) { - return GitDirectoryEntry.Type.DIRECTORY; + return DirectoryEntry.Type.DIRECTORY; } if (Files.isExecutable(path)) { - return GitDirectoryEntry.Type.EXECUTABLE; + return DirectoryEntry.Type.EXECUTABLE; } - return GitDirectoryEntry.Type.REGULAR; + return DirectoryEntry.Type.REGULAR; } /** @@ -123,10 +124,10 @@ private static byte[] gitPrefix(final String prefix, final long dataSize) { * @param entries The directory entries. * @return A generalized Git tree identifier. 
*/ - static byte[] treeId(final MessageDigest messageDigest, final Collection entries) { - final TreeSet treeSet = new TreeSet<>(entries); + static byte[] treeId(final MessageDigest messageDigest, final Collection entries) { + final TreeSet treeSet = new TreeSet<>(entries); final ByteArrayOutputStream baos = new ByteArrayOutputStream(); - for (final GitDirectoryEntry entry : treeSet) { + for (final DirectoryEntry entry : treeSet) { final byte[] treeEntryBytes = entry.toTreeEntryBytes(); baos.write(treeEntryBytes, 0, treeEntryBytes.length); } @@ -150,17 +151,17 @@ static byte[] treeId(final MessageDigest messageDigest, final Collection entries = new ArrayList<>(); + final List entries = new ArrayList<>(); try (DirectoryStream files = Files.newDirectoryStream(data)) { for (final Path path : files) { - final GitDirectoryEntry.Type type = getGitDirectoryEntryType(path); + final DirectoryEntry.Type type = getGitDirectoryEntryType(path); final byte[] rawObjectId; - if (type == GitDirectoryEntry.Type.DIRECTORY) { + if (type == DirectoryEntry.Type.DIRECTORY) { rawObjectId = treeId(messageDigest, path); } else { rawObjectId = blobId(messageDigest, path); } - entries.add(new GitDirectoryEntry(path, type, rawObjectId)); + entries.add(new DirectoryEntry(path, type, rawObjectId)); } } return treeId(messageDigest, entries); @@ -173,4 +174,165 @@ private static byte[] gitTreePrefix(final long dataSize) { private GitIdentifiers() { // utility class } + + /** + * Represents a single entry in a Git tree object. + * + *

A Git tree object encodes a directory snapshot. Each entry holds:

+ *
    + *
  • a {@link Type} that determines the Unix file mode (e.g. {@code 100644} for a regular file),
  • + *
  • the entry name (file or directory name, without a path separator),
  • + *
  • the raw object id of the referenced blob or sub-tree.
  • + *
+ * + *

Entries are ordered by {@link #compareTo} using Git's tree-sort rule: directory names are compared as if they ended with {@code '/'}, so that {@code foo/} + * sorts after {@code foo.txt} but before {@code foobar}.

+ * + *

Call {@link #toTreeEntryBytes()} to obtain the binary encoding that Git feeds to its hash function when computing the tree object identifier.

+ * + * @see Git Internals – Git Objects + * @see SWHID Directory Identifier + */ + static class DirectoryEntry implements Comparable { + + /** + * The type of a Git tree entry, which maps to a Unix file-mode string. + * + *

Git encodes the file type and permission bits as an ASCII octal string that precedes the entry name in the binary tree format. The values defined here + * cover the four entry types that Git itself produces.

+ * + *

This enum is package-private. If it were made public, {@link #mode} would need to be wrapped in an immutable copy to prevent external mutation.

+ */ + enum Type { + + /** + * A sub-directory (Git sub-tree). + */ + DIRECTORY("40000"), + + /** + * An executable file. + */ + EXECUTABLE("100755"), + + /** + * A regular (non-executable) file. + */ + REGULAR("100644"), + + /** + * A symbolic link. + */ + SYMBOLIC_LINK("120000"); + + /** + * The ASCII-encoded octal mode string as it appears in the binary tree entry. + */ + private final byte[] mode; + + Type(final String mode) { + this.mode = mode.getBytes(StandardCharsets.US_ASCII); + } + } + + private static String getFileName(final Path path) { + final Path fileName = path.getFileName(); + if (fileName == null) { + throw new IllegalArgumentException(path.toString()); + } + return fileName.toString(); + } + + /** + * The entry name (file or directory name, no path separator). + */ + private final String name; + + /** + * The key used for ordering entries within a tree object. + * + *

Git appends {@code '/'} to directory names before comparing.

+ */ + private final String sortKey; + + /** + * The Git object type, which determines the Unix file-mode prefix. + */ + private final Type type; + + /** + * The raw object id of the referenced blob or sub-tree. + */ + private final byte[] rawObjectId; + + /** + * Creates an entry. + * + * @param path The path of the entry; must not be an empty path. + * @param type The type of the entry. + * @param rawObjectId The id of the entry. + * @throws IllegalArgumentException If the path is empty. + * @throws NullPointerException If any argument is {@code null}. + */ + DirectoryEntry(final Path path, final Type type, final byte[] rawObjectId) { + this(getFileName(path), type, rawObjectId); + } + + /** + * Creates an entry. + * + * @param name The name of the entry + * @param type The type of the entry + * @param rawObjectId The id of the entry + */ + private DirectoryEntry(final String name, final Type type, final byte[] rawObjectId) { + this.name = name; + this.type = Objects.requireNonNull(type); + this.sortKey = type == Type.DIRECTORY ? name + "/" : name; + this.rawObjectId = Objects.requireNonNull(rawObjectId); + } + + @Override + public int compareTo(final DirectoryEntry o) { + return sortKey.compareTo(o.sortKey); + } + + @Override + public boolean equals(final Object obj) { + if (obj == this) { + return true; + } + if (!(obj instanceof DirectoryEntry)) { + return false; + } + final DirectoryEntry other = (DirectoryEntry) obj; + return name.equals(other.name); + } + + @Override + public int hashCode() { + return name.hashCode(); + } + + /** + * Returns the binary encoding of this entry as it appears inside a Git tree object. + * + *

The format follows the Git tree entry layout:

+ *
+         *   <mode> SP <name> NUL <20-byte-object-id>
+         * 
+ * + * @return the binary tree-entry encoding; never {@code null}. + */ + byte[] toTreeEntryBytes() { + final byte[] nameBytes = name.getBytes(StandardCharsets.UTF_8); + final byte[] result = new byte[type.mode.length + nameBytes.length + rawObjectId.length + 2]; + System.arraycopy(type.mode, 0, result, 0, type.mode.length); + result[type.mode.length] = ' '; + System.arraycopy(nameBytes, 0, result, type.mode.length + 1, nameBytes.length); + result[type.mode.length + nameBytes.length + 1] = '\0'; + System.arraycopy(rawObjectId, 0, result, type.mode.length + nameBytes.length + 2, rawObjectId.length); + return result; + } + } } diff --git a/src/test/java/org/apache/commons/codec/digest/GitDirectoryEntryTest.java b/src/test/java/org/apache/commons/codec/digest/GitDirectoryEntryTest.java deleted file mode 100644 index 8dd22ce4aa..0000000000 --- a/src/test/java/org/apache/commons/codec/digest/GitDirectoryEntryTest.java +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.commons.codec.digest; - -import static org.junit.jupiter.api.Assertions.assertArrayEquals; -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertNotEquals; -import static org.junit.jupiter.api.Assertions.assertThrows; - -import java.nio.file.Paths; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - -import org.junit.jupiter.api.Test; - -class GitDirectoryEntryTest { - - private static final byte[] ZERO_ID = new byte[20]; - - @Test - void testConstructor() { - assertThrows(NullPointerException.class, () -> new GitDirectoryEntry(null, GitDirectoryEntry.Type.REGULAR, ZERO_ID)); - assertThrows(NullPointerException.class, () -> new GitDirectoryEntry(Paths.get("hello.txt"), null, ZERO_ID)); - assertThrows(NullPointerException.class, () -> new GitDirectoryEntry(Paths.get("hello.txt"), GitDirectoryEntry.Type.REGULAR, null)); - assertThrows(IllegalArgumentException.class, () -> new GitDirectoryEntry(Paths.get("/"), GitDirectoryEntry.Type.REGULAR, ZERO_ID)); - } - - /** - * Equality and hash code are based solely on the entry name. 
- */ - @Test - void testEqualityBasedOnNameOnly() { - final byte[] otherId = new byte[20]; - Arrays.fill(otherId, (byte) 0xff); - final GitDirectoryEntry regular = new GitDirectoryEntry(Paths.get("foo"), GitDirectoryEntry.Type.REGULAR, ZERO_ID); - final GitDirectoryEntry executable = new GitDirectoryEntry(Paths.get("foo"), GitDirectoryEntry.Type.EXECUTABLE, otherId); - // Same name, different type and object id -> equal - assertEquals(regular, executable); - assertEquals(regular.hashCode(), executable.hashCode()); - // Different name -> not equal - assertNotEquals(regular, new GitDirectoryEntry(Paths.get("bar"), GitDirectoryEntry.Type.REGULAR, ZERO_ID)); - // Same reference -> equal - assertEquals(regular, regular); - // Not equal to null or unrelated type - assertNotEquals(regular, null); - assertNotEquals(regular, "foo"); - } - - /** - * The Path constructor must extract the filename component. - */ - @Test - void testPathConstructorUsesFilename() { - final GitDirectoryEntry fromLabel = new GitDirectoryEntry(Paths.get("hello.txt"), GitDirectoryEntry.Type.REGULAR, ZERO_ID); - final GitDirectoryEntry fromRelative = new GitDirectoryEntry(Paths.get("subdir/hello.txt"), GitDirectoryEntry.Type.REGULAR, ZERO_ID); - final GitDirectoryEntry fromAbsolute = new GitDirectoryEntry(Paths.get("hello.txt").toAbsolutePath(), GitDirectoryEntry.Type.REGULAR, ZERO_ID); - assertEquals(fromLabel, fromRelative); - assertEquals(fromLabel, fromAbsolute); - assertArrayEquals(fromLabel.toTreeEntryBytes(), fromRelative.toTreeEntryBytes()); - assertArrayEquals(fromLabel.toTreeEntryBytes(), fromAbsolute.toTreeEntryBytes()); - } - - /** - * Entries should be sorted by Git sort rule. - * - *

Git compares the names of the entries, but adds a {@code /} at the end of directory entries.

- */ - @Test - void testSortOrder() { - final GitDirectoryEntry alpha = new GitDirectoryEntry(Paths.get("alpha.txt"), GitDirectoryEntry.Type.REGULAR, ZERO_ID); - final GitDirectoryEntry fooTxt = new GitDirectoryEntry(Paths.get("foo.txt"), GitDirectoryEntry.Type.REGULAR, ZERO_ID); - final GitDirectoryEntry fooDir = new GitDirectoryEntry(Paths.get("foo"), GitDirectoryEntry.Type.DIRECTORY, ZERO_ID); - final GitDirectoryEntry foobar = new GitDirectoryEntry(Paths.get("foobar"), GitDirectoryEntry.Type.REGULAR, ZERO_ID); - final GitDirectoryEntry zeta = new GitDirectoryEntry(Paths.get("zeta.txt"), GitDirectoryEntry.Type.REGULAR, ZERO_ID); - final List entries = new ArrayList<>(Arrays.asList(zeta, foobar, fooDir, alpha, fooTxt)); - entries.sort(GitDirectoryEntry::compareTo); - assertEquals(Arrays.asList(alpha, fooTxt, fooDir, foobar, zeta), entries); - } -} diff --git a/src/test/java/org/apache/commons/codec/digest/GitIdentifiersTest.java b/src/test/java/org/apache/commons/codec/digest/GitIdentifiersTest.java index 075e08239a..063ac6829d 100644 --- a/src/test/java/org/apache/commons/codec/digest/GitIdentifiersTest.java +++ b/src/test/java/org/apache/commons/codec/digest/GitIdentifiersTest.java @@ -18,6 +18,9 @@ package org.apache.commons.codec.digest; import static org.junit.jupiter.api.Assertions.assertArrayEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; import java.nio.charset.StandardCharsets; import java.nio.file.Files; @@ -25,10 +28,12 @@ import java.nio.file.Paths; import java.security.MessageDigest; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import java.util.stream.Stream; import org.apache.commons.codec.binary.Hex; +import org.apache.commons.codec.digest.GitIdentifiers.DirectoryEntry; import org.junit.jupiter.api.Assumptions; import org.junit.jupiter.api.Test; import 
org.junit.jupiter.api.io.TempDir; @@ -80,6 +85,69 @@ void testBlobIdPath(final String resourceName, final String expectedSha1Hex) thr assertArrayEquals(Hex.decodeHex(expectedSha1Hex), GitIdentifiers.blobId(DigestUtils.getSha1Digest(), resourcePath(resourceName))); } + + private static final byte[] ZERO_ID = new byte[20]; + + @Test + void testDirectoryEntryConstructor() { + assertThrows(NullPointerException.class, () -> new DirectoryEntry(null, DirectoryEntry.Type.REGULAR, ZERO_ID)); + assertThrows(NullPointerException.class, () -> new DirectoryEntry(Paths.get("hello.txt"), null, ZERO_ID)); + assertThrows(NullPointerException.class, () -> new DirectoryEntry(Paths.get("hello.txt"), DirectoryEntry.Type.REGULAR, null)); + assertThrows(IllegalArgumentException.class, () -> new DirectoryEntry(Paths.get("/"), DirectoryEntry.Type.REGULAR, ZERO_ID)); + } + + /** + * Equality and hash code are based solely on the entry name. + */ + @Test + void testDirectoryEntryEqualityBasedOnNameOnly() { + final byte[] otherId = new byte[20]; + Arrays.fill(otherId, (byte) 0xff); + final DirectoryEntry regular = new DirectoryEntry(Paths.get("foo"), DirectoryEntry.Type.REGULAR, ZERO_ID); + final DirectoryEntry executable = new DirectoryEntry(Paths.get("foo"), DirectoryEntry.Type.EXECUTABLE, otherId); + // Same name, different type and object id -> equal + assertEquals(regular, executable); + assertEquals(regular.hashCode(), executable.hashCode()); + // Different name -> not equal + assertNotEquals(regular, new DirectoryEntry(Paths.get("bar"), DirectoryEntry.Type.REGULAR, ZERO_ID)); + // Same reference -> equal + assertEquals(regular, regular); + // Not equal to null or unrelated type + assertNotEquals(null, regular); + assertNotEquals("foo", regular); + } + + /** + * The Path constructor must extract the filename component. 
+ */ + @Test + void testDirectoryEntryPathConstructorUsesFilename() { + final DirectoryEntry fromLabel = new DirectoryEntry(Paths.get("hello.txt"), DirectoryEntry.Type.REGULAR, ZERO_ID); + final DirectoryEntry fromRelative = new DirectoryEntry(Paths.get("subdir/hello.txt"), DirectoryEntry.Type.REGULAR, ZERO_ID); + final DirectoryEntry fromAbsolute = new DirectoryEntry(Paths.get("hello.txt").toAbsolutePath(), DirectoryEntry.Type.REGULAR, ZERO_ID); + assertEquals(fromLabel, fromRelative); + assertEquals(fromLabel, fromAbsolute); + assertArrayEquals(fromLabel.toTreeEntryBytes(), fromRelative.toTreeEntryBytes()); + assertArrayEquals(fromLabel.toTreeEntryBytes(), fromAbsolute.toTreeEntryBytes()); + } + + /** + * Entries should be sorted by Git sort rule. + * + *

Git compares the names of the entries, but adds a {@code /} at the end of directory entries.

+ */ + @Test + void testDirectoryEntrySortOrder() { + final DirectoryEntry alpha = new DirectoryEntry(Paths.get("alpha.txt"), DirectoryEntry.Type.REGULAR, ZERO_ID); + final DirectoryEntry fooTxt = new DirectoryEntry(Paths.get("foo.txt"), DirectoryEntry.Type.REGULAR, ZERO_ID); + final DirectoryEntry fooDir = new DirectoryEntry(Paths.get("foo"), DirectoryEntry.Type.DIRECTORY, ZERO_ID); + final DirectoryEntry foobar = new DirectoryEntry(Paths.get("foobar"), DirectoryEntry.Type.REGULAR, ZERO_ID); + final DirectoryEntry zeta = new DirectoryEntry(Paths.get("zeta.txt"), DirectoryEntry.Type.REGULAR, ZERO_ID); + final List entries = new ArrayList<>(Arrays.asList(zeta, foobar, fooDir, alpha, fooTxt)); + entries.sort(DirectoryEntry::compareTo); + assertEquals(Arrays.asList(alpha, fooTxt, fooDir, foobar, zeta), entries); + } + @Test void testBlobIdSymlink(@TempDir final Path tempDir) throws Exception { final Path subDir = Files.createDirectory(tempDir.resolve("subdir")); @@ -107,11 +175,11 @@ void testTreeIdCollection(final String algorithm) throws Exception { final byte[] srcId = Hex.decodeHex("deadbeefdeadbeefdeadbeefdeadbeefdeadbeef"); // Entries are supplied out of order to verify that the method sorts them correctly. 
- final List entries = new ArrayList<>(); - entries.add(new GitDirectoryEntry(Paths.get("src"), GitDirectoryEntry.Type.DIRECTORY, srcId)); - entries.add(new GitDirectoryEntry(Paths.get("run.sh"), GitDirectoryEntry.Type.EXECUTABLE, runId)); - entries.add(new GitDirectoryEntry(Paths.get("hello.txt"), GitDirectoryEntry.Type.REGULAR, helloId)); - entries.add(new GitDirectoryEntry(Paths.get("link.txt"), GitDirectoryEntry.Type.SYMBOLIC_LINK, linkId)); + final List entries = new ArrayList<>(); + entries.add(new DirectoryEntry(Paths.get("src"), DirectoryEntry.Type.DIRECTORY, srcId)); + entries.add(new DirectoryEntry(Paths.get("run.sh"), DirectoryEntry.Type.EXECUTABLE, runId)); + entries.add(new DirectoryEntry(Paths.get("hello.txt"), DirectoryEntry.Type.REGULAR, helloId)); + entries.add(new DirectoryEntry(Paths.get("link.txt"), DirectoryEntry.Type.SYMBOLIC_LINK, linkId)); // Compute expected value final byte[] treeBody = Hex.decodeHex(TREE_BODY_HEX); @@ -127,4 +195,5 @@ void testTreeIdPath() throws Exception { assertArrayEquals(Hex.decodeHex("e4b21f6d78ceba6eb7c211ac15e3337ec4614e8a"), GitIdentifiers.treeId(DigestUtils.getSha1Digest(), resourcePath("DigestUtilsTest"))); } + } From e2554aa5455f730eb15b61e5f7f467db9111d9bc Mon Sep 17 00:00:00 2001 From: "Piotr P. Karwasz" Date: Thu, 9 Apr 2026 21:34:07 +0200 Subject: [PATCH 05/14] feat: add `GitIdBuilder` This change adds a `GitIdentifiers.TreeIdBuilder` class to allow the computation of a SWHID identifier from an archive. 
--- .../commons/codec/digest/GitIdentifiers.java | 571 +++++++++++------- .../codec/digest/GitIdentifiersTest.java | 240 +++++--- 2 files changed, 522 insertions(+), 289 deletions(-) diff --git a/src/main/java/org/apache/commons/codec/digest/GitIdentifiers.java b/src/main/java/org/apache/commons/codec/digest/GitIdentifiers.java index 39deccf13c..72bba0a7dc 100644 --- a/src/main/java/org/apache/commons/codec/digest/GitIdentifiers.java +++ b/src/main/java/org/apache/commons/codec/digest/GitIdentifiers.java @@ -19,15 +19,16 @@ import java.io.ByteArrayOutputStream; import java.io.IOException; +import java.io.InputStream; import java.nio.charset.StandardCharsets; import java.nio.file.DirectoryStream; import java.nio.file.Files; import java.nio.file.Path; import java.security.MessageDigest; -import java.util.ArrayList; -import java.util.Collection; -import java.util.List; +import java.util.HashMap; +import java.util.Map; import java.util.Objects; +import java.util.Set; import java.util.TreeSet; /** @@ -45,134 +46,57 @@ */ public class GitIdentifiers { - private static DirectoryEntry.Type getGitDirectoryEntryType(final Path path) { - // Symbolic links first - if (Files.isSymbolicLink(path)) { - return DirectoryEntry.Type.SYMBOLIC_LINK; - } - if (Files.isDirectory(path)) { - return DirectoryEntry.Type.DIRECTORY; - } - if (Files.isExecutable(path)) { - return DirectoryEntry.Type.EXECUTABLE; - } - return DirectoryEntry.Type.REGULAR; - } - /** - * Reads through a byte array and returns a generalized Git blob identifier. + * The type of a Git tree entry, which maps to a Unix file-mode string. * - *

The identifier is computed in the way described by the - * SWHID contents identifier, but it can use any hash - * algorithm.

- * - *

When the hash algorithm is SHA-1, the identifier is identical to Git blob identifier and SWHID contents identifier.

- * - * @param messageDigest The MessageDigest to use (for example SHA-1). - * @param data Data to digest. - * @return A generalized Git blob identifier. + *

Git encodes the file type and permission bits as an ASCII octal string that precedes the entry name in the binary tree format. The values defined here + * cover the four entry types that Git itself produces.

*/ - public static byte[] blobId(final MessageDigest messageDigest, final byte[] data) { - messageDigest.reset(); - DigestUtils.updateDigest(messageDigest, gitBlobPrefix(data.length)); - return DigestUtils.digest(messageDigest, data); - } + public enum FileMode { - /** - * Reads through a file and returns a generalized Git blob identifier. - * - *

The identifier is computed in the way described by the - * SWHID contents identifier, but it can use any hash - * algorithm.

- * - *

When the hash algorithm is SHA-1, the identifier is identical to Git blob identifier and SWHID contents identifier.

- * - * @param messageDigest The MessageDigest to use (for example SHA-1). - * @param data Path to the file to digest. - * @return A generalized Git blob identifier. - * @throws IOException On error accessing the file. - */ - public static byte[] blobId(final MessageDigest messageDigest, final Path data) throws IOException { - messageDigest.reset(); - if (Files.isSymbolicLink(data)) { - final byte[] linkTarget = Files.readSymbolicLink(data).toString().getBytes(StandardCharsets.UTF_8); - DigestUtils.updateDigest(messageDigest, gitBlobPrefix(linkTarget.length)); - return DigestUtils.digest(messageDigest, linkTarget); - } - DigestUtils.updateDigest(messageDigest, gitBlobPrefix(Files.size(data))); - return DigestUtils.updateDigest(messageDigest, data).digest(); - } + /** + * A sub-directory (Git sub-tree). + */ + DIRECTORY("40000"), - private static byte[] gitBlobPrefix(final long dataSize) { - return gitPrefix("blob ", dataSize); - } + /** + * An executable file. + */ + EXECUTABLE("100755"), - private static byte[] gitPrefix(final String prefix, final long dataSize) { - return (prefix + dataSize + "\0").getBytes(StandardCharsets.UTF_8); - } + /** + * A regular (non-executable) file. + */ + REGULAR("100644"), - /** - * Returns a generalized Git tree identifier for a collection of directory entries. - * - *

The identifier is computed in the way described by the - * SWHID directory identifier, but it can use any hash - * algorithm.

- * - *

When the hash algorithm is SHA-1, the identifier is identical to Git tree identifier and SWHID directory identifier.

- * - * @param messageDigest The MessageDigest to use (for example SHA-1). - * @param entries The directory entries. - * @return A generalized Git tree identifier. - */ - static byte[] treeId(final MessageDigest messageDigest, final Collection entries) { - final TreeSet treeSet = new TreeSet<>(entries); - final ByteArrayOutputStream baos = new ByteArrayOutputStream(); - for (final DirectoryEntry entry : treeSet) { - final byte[] treeEntryBytes = entry.toTreeEntryBytes(); - baos.write(treeEntryBytes, 0, treeEntryBytes.length); - } - messageDigest.reset(); - DigestUtils.updateDigest(messageDigest, gitTreePrefix(baos.size())); - return DigestUtils.updateDigest(messageDigest, baos.toByteArray()).digest(); - } + /** + * A symbolic link. + */ + SYMBOLIC_LINK("120000"); - /** - * Reads through a directory and returns a generalized Git tree identifier. - * - *

The identifier is computed in the way described by the - * SWHID directory identifier, but it can use any hash - * algorithm.

- * - *

When the hash algorithm is SHA-1, the identifier is identical to Git tree identifier and SWHID directory identifier.

- * - * @param messageDigest The MessageDigest to use (for example SHA-1). - * @param data Path to the directory to digest. - * @return A generalized Git tree identifier. - * @throws IOException On error accessing the directory or its contents. - */ - public static byte[] treeId(final MessageDigest messageDigest, final Path data) throws IOException { - final List entries = new ArrayList<>(); - try (DirectoryStream files = Files.newDirectoryStream(data)) { - for (final Path path : files) { - final DirectoryEntry.Type type = getGitDirectoryEntryType(path); - final byte[] rawObjectId; - if (type == DirectoryEntry.Type.DIRECTORY) { - rawObjectId = treeId(messageDigest, path); - } else { - rawObjectId = blobId(messageDigest, path); - } - entries.add(new DirectoryEntry(path, type, rawObjectId)); - } - } - return treeId(messageDigest, entries); - } + /** + * The octal mode as used by Git. + */ + private final String mode; - private static byte[] gitTreePrefix(final long dataSize) { - return gitPrefix("tree ", dataSize); - } + /** + * Serialized {@code mode}: since this is mutable, it must remain private. + */ + private final byte[] modeBytes; - private GitIdentifiers() { - // utility class + FileMode(final String mode) { + this.mode = mode; + this.modeBytes = mode.getBytes(StandardCharsets.US_ASCII); + } + + /** + * Gets the octal mode as used by Git. + * + * @return The octal mode + */ + public String getMode() { + return mode; + } } /** @@ -180,7 +104,7 @@ private GitIdentifiers() { * *

A Git tree object encodes a directory snapshot. Each entry holds:

*
    - *
  • a {@link Type} that determines the Unix file mode (e.g. {@code 100644} for a regular file),
  • + *
  • a {@link FileMode} that determines the Unix file mode (e.g. {@code 100644} for a regular file),
  • *
  • the entry name (file or directory name, without a path separator),
  • *
  • the raw object id of the referenced blob or sub-tree.
  • *
@@ -188,95 +112,29 @@ private GitIdentifiers() { *

Entries are ordered by {@link #compareTo} using Git's tree-sort rule: directory names are compared as if they ended with {@code '/'}, so that {@code foo/} sorts after {@code foobar}.

* - *

Call {@link #toTreeEntryBytes()} to obtain the binary encoding that Git feeds to its hash function when computing the tree object identifier.

- * * @see Git Internals – Git Objects * @see SWHID Directory Identifier */ static class DirectoryEntry implements Comparable { - /** - * The type of a Git tree entry, which maps to a Unix file-mode string. - * - *

Git encodes the file type and permission bits as an ASCII octal string that precedes the entry name in the binary tree format. The values defined here cover the four entry types that Git itself produces.

- * - *

This enum is package-private. If it were made public, {@link #mode} would need to be wrapped in an immutable copy to prevent external mutation.

- */ - enum Type { - - /** - * A sub-directory (Git sub-tree). - */ - DIRECTORY("40000"), - - /** - * An executable file. - */ - EXECUTABLE("100755"), - - /** - * A regular (non-executable) file. - */ - REGULAR("100644"), - - /** - * A symbolic link. - */ - SYMBOLIC_LINK("120000"); - - /** - * The ASCII-encoded octal mode string as it appears in the binary tree entry. - */ - private final byte[] mode; - - Type(final String mode) { - this.mode = mode.getBytes(StandardCharsets.US_ASCII); - } - } - - private static String getFileName(final Path path) { - final Path fileName = path.getFileName(); - if (fileName == null) { - throw new IllegalArgumentException(path.toString()); - } - return fileName.toString(); - } - /** * The entry name (file or directory name, no path separator). */ private final String name; - + /** + * The raw object id of the referenced blob or sub-tree. + */ + private final byte[] rawObjectId; /** * The key used for ordering entries within a tree object. * *

Git appends {@code '/'} to directory names before comparing.

*/ private final String sortKey; - /** * The Git object type, which determines the Unix file-mode prefix. */ - private final Type type; - - /** - * The raw object id of the referenced blob or sub-tree. - */ - private final byte[] rawObjectId; - - /** - * Creates an entry. - * - * @param path The path of the entry; must not be an empty path. - * @param type The type of the entry. - * @param rawObjectId The id of the entry. - * @throws IllegalArgumentException If the path is empty. - * @throws NullPointerException If any argument is {@code null}. - */ - DirectoryEntry(final Path path, final Type type, final byte[] rawObjectId) { - this(getFileName(path), type, rawObjectId); - } + private final FileMode type; /** * Creates an entry. @@ -285,10 +143,13 @@ private static String getFileName(final Path path) { * @param type The type of the entry * @param rawObjectId The id of the entry */ - private DirectoryEntry(final String name, final Type type, final byte[] rawObjectId) { + DirectoryEntry(final String name, final FileMode type, final byte[] rawObjectId) { + if (Objects.requireNonNull(name).indexOf('/') >= 0) { + throw new IllegalArgumentException("Entry name must not contain '/': " + name); + } this.name = name; this.type = Objects.requireNonNull(type); - this.sortKey = type == Type.DIRECTORY ? name + "/" : name; + this.sortKey = type == FileMode.DIRECTORY ? name + "/" : name; this.rawObjectId = Objects.requireNonNull(rawObjectId); } @@ -314,25 +175,315 @@ public int hashCode() { return name.hashCode(); } + } + + /** + * Builds a Git tree identifier for a virtual directory structure, such as the contents of + * an archive. + */ + public static class TreeIdBuilder { + + /** + * A supplier of a blob identifier that may throw {@link IOException}. 
+ */ + @FunctionalInterface + private interface BlobIdSupplier { + byte[] get() throws IOException; + } + + private static void checkPathComponent(String name) { + if (".".equals(name) || "..".equals(name)) { + throw new IllegalArgumentException("Path component not allowed: " + name); + } + } + private final Map dirEntries = new HashMap<>(); + private final Map fileEntries = new HashMap<>(); + private final MessageDigest messageDigest; + + TreeIdBuilder(final MessageDigest messageDigest) { + this.messageDigest = Objects.requireNonNull(messageDigest); + } + /** - * Returns the binary encoding of this entry as it appears inside a Git tree object. + * Returns the {@link TreeIdBuilder} for the named subdirectory, creating it if absent. * - *

The format follows the Git tree entry layout:

- *
-         *   <mode> SP <name> NUL <20-byte-object-id>
-         * 
+ * @param name The relative path of the subdirectory in normalized form (may contain {@code '/'}). + * @return The {@link TreeIdBuilder} for the subdirectory. + * @throws IllegalArgumentException If any path component is {@code "."} or {@code ".."}. + */ + public TreeIdBuilder addDirectory(final String name) { + TreeIdBuilder current = this; + for (final String component : name.split("/", -1)) { + if (component.isEmpty()) { + continue; + } + checkPathComponent(component); + current = current.dirEntries.computeIfAbsent(component, k -> new TreeIdBuilder(messageDigest)); + } + return current; + } + + /** + * Adds a file entry at the given path within this tree. + * + *

If {@code name} contains {@code '/'}, intermediate subdirectories are created automatically.

+ * + *

The stream is eagerly drained.

* - * @return the binary tree-entry encoding; never {@code null}. + *

If the size of the stream is known in advance, consider using {@link #addFile(FileMode, String, long, InputStream)} instead.

+ * + * @param mode The file mode (e.g. {@link FileMode#REGULAR}). + * @param name The relative path of the entry in normalized form(may contain {@code '/'}). + * @param data The file content. + * @throws IOException If the stream cannot be read. + * @throws IllegalArgumentException If any path component is {@code "."} or {@code ".."}. */ - byte[] toTreeEntryBytes() { - final byte[] nameBytes = name.getBytes(StandardCharsets.UTF_8); - final byte[] result = new byte[type.mode.length + nameBytes.length + rawObjectId.length + 2]; - System.arraycopy(type.mode, 0, result, 0, type.mode.length); - result[type.mode.length] = ' '; - System.arraycopy(nameBytes, 0, result, type.mode.length + 1, nameBytes.length); - result[type.mode.length + nameBytes.length + 1] = '\0'; - System.arraycopy(rawObjectId, 0, result, type.mode.length + nameBytes.length + 2, rawObjectId.length); - return result; + public void addFile(final FileMode mode, final String name, final InputStream data) throws IOException { + addFile(mode, name, () -> blobId(messageDigest, readAllBytes(data))); } + + /** + * Adds a file entry at the given path within this tree, streaming content without buffering. + * + *

If {@code name} contains {@code '/'}, intermediate subdirectories are created automatically.

+ * + *

The stream is eagerly drained.

+ * + * @param mode The file mode (e.g. {@link FileMode#REGULAR}). + * @param name The relative path of the entry in normalized form(may contain {@code '/'}). + * @param dataSize The exact number of bytes in {@code data}. + * @param data The file content. + * @throws IOException If the stream cannot be read. + * @throws IllegalArgumentException If any path component is {@code "."} or {@code ".."}. + */ + public void addFile(final FileMode mode, final String name, final long dataSize, final InputStream data) throws IOException { + addFile(mode, name, () -> blobId(messageDigest, dataSize, data)); + } + + private void addFile(final FileMode mode, final String name, final BlobIdSupplier blobId) throws IOException { + final int slash = name.indexOf('/'); + if (slash < 0) { + checkPathComponent(name); + fileEntries.put(name, new DirectoryEntry(name, mode, blobId.get())); + } else { + addDirectory(name.substring(0, slash)).addFile(mode, name.substring(slash + 1), blobId); + } + } + + /** + * Adds a file entry at the given path within this tree. + * + *

If {@code name} contains {@code '/'}, intermediate subdirectories are created automatically.

+ * + * @param mode The file mode (e.g. {@link FileMode#REGULAR}). + * @param name The relative path of the entry in normalized form(may contain {@code '/'}). + * @param data The file content. + * @throws IOException If an I/O error occurs. + * @throws IllegalArgumentException If any path component is {@code "."} or {@code ".."}. + */ + public void addFile(final FileMode mode, final String name, final byte[] data) throws IOException { + addFile(mode, name, () -> blobId(messageDigest, data)); + } + + /** + * Computes the Git tree identifier for this directory and all its descendants. + * + * @return The raw tree identifier bytes. + * @throws IOException If a digest operation fails. + */ + public byte[] build() throws IOException { + final Set entries = new TreeSet<>(fileEntries.values()); + for (final Map.Entry e : dirEntries.entrySet()) { + entries.add(new DirectoryEntry(e.getKey(), FileMode.DIRECTORY, e.getValue().build())); + } + final ByteArrayOutputStream baos = new ByteArrayOutputStream(); + for (final DirectoryEntry entry : entries) { + baos.write(entry.type.modeBytes); + baos.write(' '); + baos.write(entry.name.getBytes(StandardCharsets.UTF_8)); + baos.write('\0'); + baos.write(entry.rawObjectId); + } + messageDigest.reset(); + DigestUtils.updateDigest(messageDigest, getGitTreePrefix(baos.size())); + return DigestUtils.updateDigest(messageDigest, baos.toByteArray()).digest(); + } + } + + /** + * Reads through a byte array and returns a generalized Git blob identifier. + * + *

The identifier is computed in the way described by the SWHID contents identifier, but it can use any hash algorithm.

+ * + *

When the hash algorithm is SHA-1, the identifier is identical to Git blob identifier and SWHID contents identifier.

+ * + * @param messageDigest The MessageDigest to use (for example SHA-1). + * @param data Data to digest. + * @return A generalized Git blob identifier. + */ + public static byte[] blobId(final MessageDigest messageDigest, final byte[] data) { + messageDigest.reset(); + DigestUtils.updateDigest(messageDigest, getGitBlobPrefix(data.length)); + return DigestUtils.digest(messageDigest, data); + } + + /** + * Reads through a stream and returns a generalized Git blob identifier. + * + *

The stream is drained and its contents are buffered to determine the size before hashing. To avoid buffering, use {@link #blobId(MessageDigest, long, InputStream)} when the size is known in advance.

+ * + *

When the hash algorithm is SHA-1, the identifier is identical to Git blob identifier and SWHID contents identifier.

+ * + * @param messageDigest The MessageDigest to use (for example SHA-1). + * @param data Stream to digest. + * @return A generalized Git blob identifier. + * @throws IOException On error reading the stream. + */ + public static byte[] blobId(final MessageDigest messageDigest, final InputStream data) throws IOException { + return blobId(messageDigest, readAllBytes(data)); + } + + /** + * Reads through a stream of known size and returns a generalized Git blob identifier, without buffering. + * + *

When the size of the content is known in advance, this overload streams {@code data} directly through the digest without buffering the full content in memory.

+ * + *

When the hash algorithm is SHA-1, the identifier is identical to Git blob identifier and SWHID contents identifier.

+ * + * @param messageDigest The MessageDigest to use (for example SHA-1). + * @param dataSize The exact number of bytes in {@code data}. + * @param data Stream to digest. + * @return A generalized Git blob identifier. + * @throws IOException On error reading the stream. + */ + public static byte[] blobId(final MessageDigest messageDigest, final long dataSize, final InputStream data) throws IOException { + messageDigest.reset(); + DigestUtils.updateDigest(messageDigest, getGitBlobPrefix(dataSize)); + return DigestUtils.updateDigest(messageDigest, data).digest(); + } + + /** + * Reads through a file and returns a generalized Git blob identifier. + * + *

The identifier is computed in the way described by the SWHID contents identifier, but it can use any hash algorithm.

+ * + *

When the hash algorithm is SHA-1, the identifier is identical to Git blob identifier and SWHID contents identifier.

+ * + * @param messageDigest The MessageDigest to use (for example SHA-1). + * @param data Path to the file to digest. + * @return A generalized Git blob identifier. + * @throws IOException On error accessing the file. + */ + public static byte[] blobId(final MessageDigest messageDigest, final Path data) throws IOException { + messageDigest.reset(); + if (Files.isSymbolicLink(data)) { + final byte[] linkTarget = Files.readSymbolicLink(data).toString().getBytes(StandardCharsets.UTF_8); + DigestUtils.updateDigest(messageDigest, getGitBlobPrefix(linkTarget.length)); + return DigestUtils.digest(messageDigest, linkTarget); + } + DigestUtils.updateDigest(messageDigest, getGitBlobPrefix(Files.size(data))); + return DigestUtils.updateDigest(messageDigest, data).digest(); + } + + private static FileMode getGitDirectoryEntryType(final Path path) { + // Symbolic links first + if (Files.isSymbolicLink(path)) { + return FileMode.SYMBOLIC_LINK; + } + if (Files.isDirectory(path)) { + return FileMode.DIRECTORY; + } + if (Files.isExecutable(path)) { + return FileMode.EXECUTABLE; + } + return FileMode.REGULAR; + } + + private static byte[] getGitBlobPrefix(final long dataSize) { + return ("blob " + dataSize + "\0").getBytes(StandardCharsets.UTF_8); + } + + private static byte[] getGitTreePrefix(final long dataSize) { + return ("tree " + dataSize + "\0").getBytes(StandardCharsets.UTF_8); + } + + private static void populateFromPath(final TreeIdBuilder builder, final Path directory) throws IOException { + try (DirectoryStream files = Files.newDirectoryStream(directory)) { + for (final Path path : files) { + final String name = path.getFileName().toString(); + final FileMode mode = getGitDirectoryEntryType(path); + switch (mode) { + case DIRECTORY: + populateFromPath(builder.addDirectory(name), path); + break; + case SYMBOLIC_LINK: + final byte[] linkTarget = Files.readSymbolicLink(path).toString().getBytes(StandardCharsets.UTF_8); + builder.addFile(FileMode.SYMBOLIC_LINK, name, 
linkTarget); + break; + default: + try (InputStream is = Files.newInputStream(path)) { + builder.addFile(mode, name, Files.size(path), is); + } + break; + } + } + } + } + + private static byte[] readAllBytes(final InputStream in) throws IOException { + final ByteArrayOutputStream out = new ByteArrayOutputStream(); + final byte[] buf = new byte[DigestUtils.BUFFER_SIZE]; + int n; + while ((n = in.read(buf)) != -1) { + out.write(buf, 0, n); + } + return out.toByteArray(); + } + + /** + * Reads through a directory and returns a generalized Git tree identifier. + * + *

The identifier is computed in the way described by the SWHID directory identifier, but it can use any hash algorithm.

+ * + *

When the hash algorithm is SHA-1, the identifier is identical to Git tree identifier and SWHID directory identifier.

+ * + * @param messageDigest The MessageDigest to use (for example SHA-1). + * @param data Path to the directory to digest. + * @return A generalized Git tree identifier. + * @throws IOException On error accessing the directory or its contents. + */ + public static byte[] treeId(final MessageDigest messageDigest, final Path data) throws IOException { + final TreeIdBuilder builder = treeIdBuilder(messageDigest); + populateFromPath(builder, data); + return builder.build(); + } + + /** + * Returns a new {@link TreeIdBuilder} for constructing a generalized Git tree identifier from a virtual directory + * structure, such as the contents of an archive. + * + *

The identifier is computed in the way described by the SWHID directory identifier, but it can use any hash algorithm.

+ * + *

When the hash algorithm is SHA-1, the identifier is identical to Git tree identifier and SWHID directory identifier.

+ * + * @param messageDigest The MessageDigest to use (for example SHA-1). + * @return A new {@link TreeIdBuilder}. + */ + public static TreeIdBuilder treeIdBuilder(final MessageDigest messageDigest) { + return new TreeIdBuilder(messageDigest); + } + + private GitIdentifiers() { + // utility class } } diff --git a/src/test/java/org/apache/commons/codec/digest/GitIdentifiersTest.java b/src/test/java/org/apache/commons/codec/digest/GitIdentifiersTest.java index 063ac6829d..a1604b6faa 100644 --- a/src/test/java/org/apache/commons/codec/digest/GitIdentifiersTest.java +++ b/src/test/java/org/apache/commons/codec/digest/GitIdentifiersTest.java @@ -19,9 +19,11 @@ import static org.junit.jupiter.api.Assertions.assertArrayEquals; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertNotEquals; import static org.junit.jupiter.api.Assertions.assertThrows; +import java.io.ByteArrayInputStream; import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; @@ -47,20 +49,8 @@ */ class GitIdentifiersTest { - /** - * Binary body of the test tree object used in {@link #testTreeIdCollection}. - * - *

Each entry has the format {@code <mode> SP <name> NUL <20-byte-object-id>}.

- */ - private static final String TREE_BODY_HEX = - // 100644 hello.txt\0 + objectId - "3130303634342068656c6c6f2e74787400" + "a1b2c3d4e5f6a7b8c9d0e1f2a3b4c5d6e7f8a9b0" + - // 120000 link.txt\0 + objectId - "313230303030206c696e6b2e74787400" + "1234567890abcdef1234567890abcdef12345678" + - // 100755 run.sh\0 + objectId - "3130303735352072756e2e736800" + "f0e1d2c3b4a5f6e7d8c9b0a1f2e3d4c5b6a7f8e9" + - // 40000 src\0 + objectId - "34303030302073726300" + "deadbeefdeadbeefdeadbeefdeadbeefdeadbeef"; + + private static final byte[] ZERO_ID = new byte[20]; static Stream blobIdProvider() { return Stream.of(Arguments.of("DigestUtilsTest/hello.txt", "5f4a83288e67f1be2d6fcdad84165a86c6a970d7"), @@ -72,6 +62,20 @@ private static Path resourcePath(final String resourceName) throws Exception { return Paths.get(GitIdentifiersTest.class.getClassLoader().getResource(resourceName).toURI()); } + static Stream testTreeIdBuilder() { + return Stream.of( + Arguments.of(MessageDigestAlgorithms.SHA_1, + "ce013625030ba8dba906f756967f9e9ca394464a", // blob id of "hello\n" + "8bbe8a53790056316b23b7c270f10ab6bf6bb1b4", // blob id of "subdir" + "1a2485251c33a70432394c93fb89330ef214bfc9", // blob id of "#!/bin/sh\n" + "4b825dc642cb6eb9a060e54bf8d69288fbee4904"), // tree id of empty directory + Arguments.of(MessageDigestAlgorithms.SHA_256, + "2cf8d83d9ee29543b34a87727421fdecb7e3f3a183d337639025de576db9ebb4", + "33910dae80b0db75dbad7fa521dbbf1885a07edfab1228871c41a2e94ccd7edb", + "1249034e3cf9007362d695b09b1fbdb4c578903bf10b665749b94743f8177ce1", + "6ef19b41225c5369f1c104d45d8d85efa9b057b53b14b4b9b939dd74decc5321")); + } + @ParameterizedTest @MethodSource("blobIdProvider") void testBlobIdByteArray(final String resourceName, final String expectedSha1Hex) throws Exception { @@ -79,21 +83,49 @@ void testBlobIdByteArray(final String resourceName, final String expectedSha1Hex assertArrayEquals(Hex.decodeHex(expectedSha1Hex), GitIdentifiers.blobId(DigestUtils.getSha1Digest(), data)); } + 
@ParameterizedTest + @MethodSource("blobIdProvider") + void testBlobIdInputStream(final String resourceName, final String expectedSha1Hex) throws Exception { + final byte[] data = Files.readAllBytes(resourcePath(resourceName)); + assertArrayEquals(Hex.decodeHex(expectedSha1Hex), + GitIdentifiers.blobId(DigestUtils.getSha1Digest(), new ByteArrayInputStream(data))); + } + + @ParameterizedTest + @MethodSource("blobIdProvider") + void testBlobIdInputStreamWithSize(final String resourceName, final String expectedSha1Hex) throws Exception { + final byte[] data = Files.readAllBytes(resourcePath(resourceName)); + assertArrayEquals(Hex.decodeHex(expectedSha1Hex), + GitIdentifiers.blobId(DigestUtils.getSha1Digest(), data.length, new ByteArrayInputStream(data))); + } + @ParameterizedTest @MethodSource("blobIdProvider") void testBlobIdPath(final String resourceName, final String expectedSha1Hex) throws Exception { assertArrayEquals(Hex.decodeHex(expectedSha1Hex), GitIdentifiers.blobId(DigestUtils.getSha1Digest(), resourcePath(resourceName))); } - - private static final byte[] ZERO_ID = new byte[20]; + @Test + void testBlobIdSymlink(@TempDir final Path tempDir) throws Exception { + final Path subDir = Files.createDirectory(tempDir.resolve("subdir")); + Files.write(subDir.resolve("file.txt"), "hello".getBytes(StandardCharsets.UTF_8)); + try { + final Path linkToDir = Files.createSymbolicLink(tempDir.resolve("link-to-dir"), Paths.get("subdir")); + final Path linkToFile = Files.createSymbolicLink(tempDir.resolve("link-to-file"), Paths.get("subdir/file.txt")); + final MessageDigest sha1 = DigestUtils.getSha1Digest(); + assertArrayEquals(Hex.decodeHex("8bbe8a53790056316b23b7c270f10ab6bf6bb1b4"), GitIdentifiers.blobId(sha1, linkToDir)); + assertArrayEquals(Hex.decodeHex("dfe6ef8392ae13a11ff85419b4fd906d997b6cb7"), GitIdentifiers.blobId(sha1, linkToFile)); + } catch (final UnsupportedOperationException e) { + Assumptions.abort("Symbolic links not supported on this filesystem"); + } + 
} @Test void testDirectoryEntryConstructor() { - assertThrows(NullPointerException.class, () -> new DirectoryEntry(null, DirectoryEntry.Type.REGULAR, ZERO_ID)); - assertThrows(NullPointerException.class, () -> new DirectoryEntry(Paths.get("hello.txt"), null, ZERO_ID)); - assertThrows(NullPointerException.class, () -> new DirectoryEntry(Paths.get("hello.txt"), DirectoryEntry.Type.REGULAR, null)); - assertThrows(IllegalArgumentException.class, () -> new DirectoryEntry(Paths.get("/"), DirectoryEntry.Type.REGULAR, ZERO_ID)); + assertThrows(NullPointerException.class, () -> new DirectoryEntry(null, GitIdentifiers.FileMode.REGULAR, ZERO_ID)); + assertThrows(NullPointerException.class, () -> new DirectoryEntry("hello.txt", null, ZERO_ID)); + assertThrows(NullPointerException.class, () -> new DirectoryEntry("hello.txt", GitIdentifiers.FileMode.REGULAR, null)); + assertThrows(IllegalArgumentException.class, () -> new DirectoryEntry("/", GitIdentifiers.FileMode.REGULAR, ZERO_ID)); } /** @@ -103,32 +135,18 @@ void testDirectoryEntryConstructor() { void testDirectoryEntryEqualityBasedOnNameOnly() { final byte[] otherId = new byte[20]; Arrays.fill(otherId, (byte) 0xff); - final DirectoryEntry regular = new DirectoryEntry(Paths.get("foo"), DirectoryEntry.Type.REGULAR, ZERO_ID); - final DirectoryEntry executable = new DirectoryEntry(Paths.get("foo"), DirectoryEntry.Type.EXECUTABLE, otherId); + final DirectoryEntry regular = new DirectoryEntry("foo", GitIdentifiers.FileMode.REGULAR, ZERO_ID); + final DirectoryEntry executable = new DirectoryEntry("foo", GitIdentifiers.FileMode.EXECUTABLE, otherId); // Same name, different type and object id -> equal assertEquals(regular, executable); assertEquals(regular.hashCode(), executable.hashCode()); // Different name -> not equal - assertNotEquals(regular, new DirectoryEntry(Paths.get("bar"), DirectoryEntry.Type.REGULAR, ZERO_ID)); + assertNotEquals(regular, new DirectoryEntry("bar", GitIdentifiers.FileMode.REGULAR, ZERO_ID)); // Same 
reference -> equal assertEquals(regular, regular); // Not equal to null or unrelated type - assertNotEquals(null, regular); - assertNotEquals("foo", regular); - } - - /** - * The Path constructor must extract the filename component. - */ - @Test - void testDirectoryEntryPathConstructorUsesFilename() { - final DirectoryEntry fromLabel = new DirectoryEntry(Paths.get("hello.txt"), DirectoryEntry.Type.REGULAR, ZERO_ID); - final DirectoryEntry fromRelative = new DirectoryEntry(Paths.get("subdir/hello.txt"), DirectoryEntry.Type.REGULAR, ZERO_ID); - final DirectoryEntry fromAbsolute = new DirectoryEntry(Paths.get("hello.txt").toAbsolutePath(), DirectoryEntry.Type.REGULAR, ZERO_ID); - assertEquals(fromLabel, fromRelative); - assertEquals(fromLabel, fromAbsolute); - assertArrayEquals(fromLabel.toTreeEntryBytes(), fromRelative.toTreeEntryBytes()); - assertArrayEquals(fromLabel.toTreeEntryBytes(), fromAbsolute.toTreeEntryBytes()); + assertFalse(regular.equals(null)); + assertFalse(regular.equals("foo")); } /** @@ -138,56 +156,120 @@ void testDirectoryEntryPathConstructorUsesFilename() { */ @Test void testDirectoryEntrySortOrder() { - final DirectoryEntry alpha = new DirectoryEntry(Paths.get("alpha.txt"), DirectoryEntry.Type.REGULAR, ZERO_ID); - final DirectoryEntry fooTxt = new DirectoryEntry(Paths.get("foo.txt"), DirectoryEntry.Type.REGULAR, ZERO_ID); - final DirectoryEntry fooDir = new DirectoryEntry(Paths.get("foo"), DirectoryEntry.Type.DIRECTORY, ZERO_ID); - final DirectoryEntry foobar = new DirectoryEntry(Paths.get("foobar"), DirectoryEntry.Type.REGULAR, ZERO_ID); - final DirectoryEntry zeta = new DirectoryEntry(Paths.get("zeta.txt"), DirectoryEntry.Type.REGULAR, ZERO_ID); + final DirectoryEntry alpha = new DirectoryEntry("alpha.txt", GitIdentifiers.FileMode.REGULAR, ZERO_ID); + final DirectoryEntry fooTxt = new DirectoryEntry("foo.txt", GitIdentifiers.FileMode.REGULAR, ZERO_ID); + final DirectoryEntry fooDir = new DirectoryEntry("foo", GitIdentifiers.FileMode.DIRECTORY, 
ZERO_ID); + final DirectoryEntry foobar = new DirectoryEntry("foobar", GitIdentifiers.FileMode.REGULAR, ZERO_ID); + final DirectoryEntry zeta = new DirectoryEntry("zeta.txt", GitIdentifiers.FileMode.REGULAR, ZERO_ID); final List entries = new ArrayList<>(Arrays.asList(zeta, foobar, fooDir, alpha, fooTxt)); entries.sort(DirectoryEntry::compareTo); assertEquals(Arrays.asList(alpha, fooTxt, fooDir, foobar, zeta), entries); } + @ParameterizedTest + @MethodSource + void testTreeIdBuilder(final String algorithm, final String helloHex, final String linkHex, final String runHex, final String srcHex) throws Exception { + final byte[] helloContent = "hello\n".getBytes(StandardCharsets.UTF_8); + final byte[] runContent = "#!/bin/sh\n".getBytes(StandardCharsets.UTF_8); + final byte[] linkTarget = "subdir".getBytes(StandardCharsets.UTF_8); + final MessageDigest md = DigestUtils.getDigest(algorithm); + + // Verify individual blob IDs against pre-computed constants. + assertArrayEquals(Hex.decodeHex(helloHex), GitIdentifiers.blobId(md, helloContent)); + assertArrayEquals(Hex.decodeHex(linkHex), GitIdentifiers.blobId(md, linkTarget)); + assertArrayEquals(Hex.decodeHex(runHex), GitIdentifiers.blobId(md, runContent)); + + // Entries are supplied out of order to verify that the builder sorts them correctly. + final GitIdentifiers.TreeIdBuilder builder = GitIdentifiers.treeIdBuilder(md); + builder.addDirectory("src"); + builder.addFile(GitIdentifiers.FileMode.EXECUTABLE, "run.sh", runContent); + builder.addFile(GitIdentifiers.FileMode.REGULAR, "hello.txt", helloContent); + builder.addFile(GitIdentifiers.FileMode.SYMBOLIC_LINK, "link.txt", linkTarget); + + // Expected tree body: entries in Git sort order (hello.txt, link.txt, run.sh, src/). + // Each entry: hex-encoded " \0" followed by the object id. 
+ final byte[] treeBody = Hex.decodeHex("3130303634342068656c6c6f2e74787400" + helloHex + // 100644 hello.txt\0 + "313230303030206c696e6b2e74787400" + linkHex + // 120000 link.txt\0 + "3130303735352072756e2e736800" + runHex + // 100755 run.sh\0 + "34303030302073726300" + srcHex); // 40000 src\0 + md.reset(); + DigestUtils.updateDigest(md, ("tree " + treeBody.length + "\0").getBytes(StandardCharsets.UTF_8)); + assertArrayEquals(DigestUtils.updateDigest(md, treeBody).digest(), builder.build()); + } + @Test - void testBlobIdSymlink(@TempDir final Path tempDir) throws Exception { - final Path subDir = Files.createDirectory(tempDir.resolve("subdir")); - Files.write(subDir.resolve("file.txt"), "hello".getBytes(StandardCharsets.UTF_8)); - final Path linkToDir; - final Path linkToFile; - try { - linkToDir = Files.createSymbolicLink(tempDir.resolve("link-to-dir"), Paths.get("subdir")); - linkToFile = Files.createSymbolicLink(tempDir.resolve("link-to-file"), Paths.get("subdir/file.txt")); - } catch (final UnsupportedOperationException e) { - Assumptions.assumeTrue(false, "Symbolic links not supported on this filesystem"); - return; - } - final MessageDigest sha1 = DigestUtils.getSha1Digest(); - assertArrayEquals(Hex.decodeHex("8bbe8a53790056316b23b7c270f10ab6bf6bb1b4"), GitIdentifiers.blobId(sha1, linkToDir)); - assertArrayEquals(Hex.decodeHex("dfe6ef8392ae13a11ff85419b4fd906d997b6cb7"), GitIdentifiers.blobId(sha1, linkToFile)); + void testTreeIdBuilderAddFileInputStream() throws Exception { + final MessageDigest md = DigestUtils.getSha1Digest(); + final byte[] content = "Hello, World!\n".getBytes(StandardCharsets.UTF_8); + + final GitIdentifiers.TreeIdBuilder byteArrayBuilder = GitIdentifiers.treeIdBuilder(md); + byteArrayBuilder.addFile(GitIdentifiers.FileMode.REGULAR, "file.txt", content); + final byte[] expected = byteArrayBuilder.build(); + + final GitIdentifiers.TreeIdBuilder streamBuilder = GitIdentifiers.treeIdBuilder(md); + 
streamBuilder.addFile(GitIdentifiers.FileMode.REGULAR, "file.txt", new ByteArrayInputStream(content)); + assertArrayEquals(expected, streamBuilder.build()); + + final GitIdentifiers.TreeIdBuilder sizedStreamBuilder = GitIdentifiers.treeIdBuilder(md); + sizedStreamBuilder.addFile(GitIdentifiers.FileMode.REGULAR, "file.txt", content.length, new ByteArrayInputStream(content)); + assertArrayEquals(expected, sizedStreamBuilder.build()); + } + + @Test + void testTreeIdBuilderEmptyPathSegments() throws Exception { + final MessageDigest md = DigestUtils.getSha1Digest(); + final byte[] content = "hello\n".getBytes(StandardCharsets.UTF_8); + + // Canonical form + final GitIdentifiers.TreeIdBuilder canonical = GitIdentifiers.treeIdBuilder(md); + canonical.addFile(GitIdentifiers.FileMode.REGULAR, "subdir/file.txt", content); + final byte[] expected = canonical.build(); + + // Leading slash + final GitIdentifiers.TreeIdBuilder withLeading = GitIdentifiers.treeIdBuilder(md); + withLeading.addFile(GitIdentifiers.FileMode.REGULAR, "/subdir/file.txt", content); + assertArrayEquals(expected, withLeading.build()); + + // Consecutive slashes + final GitIdentifiers.TreeIdBuilder withDouble = GitIdentifiers.treeIdBuilder(md); + withDouble.addFile(GitIdentifiers.FileMode.REGULAR, "subdir//file.txt", content); + assertArrayEquals(expected, withDouble.build()); + + // addDirectory with leading/trailing slashes + final GitIdentifiers.TreeIdBuilder viaDirectory = GitIdentifiers.treeIdBuilder(md); + viaDirectory.addDirectory("/subdir/").addFile(GitIdentifiers.FileMode.REGULAR, "file.txt", content); + assertArrayEquals(expected, viaDirectory.build()); } @ParameterizedTest - @ValueSource(strings = {MessageDigestAlgorithms.SHA_1, MessageDigestAlgorithms.SHA_256}) - void testTreeIdCollection(final String algorithm) throws Exception { - final byte[] helloId = Hex.decodeHex("a1b2c3d4e5f6a7b8c9d0e1f2a3b4c5d6e7f8a9b0"); - final byte[] runId = Hex.decodeHex("f0e1d2c3b4a5f6e7d8c9b0a1f2e3d4c5b6a7f8e9"); 
- final byte[] linkId = Hex.decodeHex("1234567890abcdef1234567890abcdef12345678"); - final byte[] srcId = Hex.decodeHex("deadbeefdeadbeefdeadbeefdeadbeefdeadbeef"); - - // Entries are supplied out of order to verify that the method sorts them correctly. - final List entries = new ArrayList<>(); - entries.add(new DirectoryEntry(Paths.get("src"), DirectoryEntry.Type.DIRECTORY, srcId)); - entries.add(new DirectoryEntry(Paths.get("run.sh"), DirectoryEntry.Type.EXECUTABLE, runId)); - entries.add(new DirectoryEntry(Paths.get("hello.txt"), DirectoryEntry.Type.REGULAR, helloId)); - entries.add(new DirectoryEntry(Paths.get("link.txt"), DirectoryEntry.Type.SYMBOLIC_LINK, linkId)); - - // Compute expected value - final byte[] treeBody = Hex.decodeHex(TREE_BODY_HEX); - final MessageDigest md = DigestUtils.getDigest(algorithm); - DigestUtils.updateDigest(md, ("tree " + treeBody.length + "\0").getBytes(StandardCharsets.UTF_8)); - final byte[] expected = DigestUtils.updateDigest(md, treeBody).digest(); + @ValueSource(strings = {".", ".."}) + void testTreeIdBuilderInvalidPathSegments(final String segment) { + final MessageDigest md = DigestUtils.getSha1Digest(); + final byte[] data = new byte[0]; + // Sole path component + assertThrows(IllegalArgumentException.class, + () -> GitIdentifiers.treeIdBuilder(md).addFile(GitIdentifiers.FileMode.REGULAR, segment, data)); + assertThrows(IllegalArgumentException.class, + () -> GitIdentifiers.treeIdBuilder(md).addDirectory(segment)); + // Embedded in a longer path + assertThrows(IllegalArgumentException.class, + () -> GitIdentifiers.treeIdBuilder(md).addFile(GitIdentifiers.FileMode.REGULAR, "subdir/" + segment + "/file.txt", data)); + assertThrows(IllegalArgumentException.class, + () -> GitIdentifiers.treeIdBuilder(md).addDirectory("subdir/" + segment)); + } + + @Test + void testTreeIdBuilderNestedFileEquivalentToDirectoryAndFile() throws Exception { + final MessageDigest md = DigestUtils.getSha1Digest(); + final byte[] content = 
"hello\n".getBytes(StandardCharsets.UTF_8); + + final GitIdentifiers.TreeIdBuilder direct = GitIdentifiers.treeIdBuilder(md); + direct.addFile(GitIdentifiers.FileMode.REGULAR, "nested/file.txt", content); + + final GitIdentifiers.TreeIdBuilder indirect = GitIdentifiers.treeIdBuilder(md); + indirect.addDirectory("nested").addFile(GitIdentifiers.FileMode.REGULAR, "file.txt", content); - assertArrayEquals(expected, GitIdentifiers.treeId(md, entries)); + assertArrayEquals(direct.build(), indirect.build()); } @Test From 587369c23f36ec62b48af1932675355cd7e058e2 Mon Sep 17 00:00:00 2001 From: "Piotr P. Karwasz" Date: Thu, 9 Apr 2026 22:07:18 +0200 Subject: [PATCH 06/14] fix: SpotBug error --- .../java/org/apache/commons/codec/digest/GitIdentifiers.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/apache/commons/codec/digest/GitIdentifiers.java b/src/main/java/org/apache/commons/codec/digest/GitIdentifiers.java index 72bba0a7dc..d78279aa03 100644 --- a/src/main/java/org/apache/commons/codec/digest/GitIdentifiers.java +++ b/src/main/java/org/apache/commons/codec/digest/GitIdentifiers.java @@ -416,7 +416,7 @@ private static byte[] getGitTreePrefix(final long dataSize) { private static void populateFromPath(final TreeIdBuilder builder, final Path directory) throws IOException { try (DirectoryStream files = Files.newDirectoryStream(directory)) { for (final Path path : files) { - final String name = path.getFileName().toString(); + final String name = Objects.toString(path.getFileName()); final FileMode mode = getGitDirectoryEntryType(path); switch (mode) { case DIRECTORY: From 378e9153519d818fba2420c71f62f769a8524510 Mon Sep 17 00:00:00 2001 From: "Piotr P. 
Karwasz" Date: Thu, 9 Apr 2026 22:29:56 +0200 Subject: [PATCH 07/14] fix: try fix Java 8 javadoc error --- .../java/org/apache/commons/codec/digest/GitIdentifiers.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/apache/commons/codec/digest/GitIdentifiers.java b/src/main/java/org/apache/commons/codec/digest/GitIdentifiers.java index d78279aa03..72b6d8c089 100644 --- a/src/main/java/org/apache/commons/codec/digest/GitIdentifiers.java +++ b/src/main/java/org/apache/commons/codec/digest/GitIdentifiers.java @@ -230,7 +230,7 @@ public TreeIdBuilder addDirectory(final String name) { * *

The stream is eagerly drained.

* - *

If the size of the stream is known in advance, consider using {@link #addFile(FileMode, String, long, InputStream)} instead.

+ *

If the size of the stream is known in advance, consider using {@link TreeIdBuilder#addFile(FileMode, String, long, InputStream)} instead.

* * @param mode The file mode (e.g. {@link FileMode#REGULAR}). * @param name The relative path of the entry in normalized form(may contain {@code '/'}). From 4585d6baad1de4a0fe6673036895e481fb1676e0 Mon Sep 17 00:00:00 2001 From: "Piotr P. Karwasz" Date: Thu, 9 Apr 2026 22:35:24 +0200 Subject: [PATCH 08/14] fix: extract `getGitPrefix` again --- .../org/apache/commons/codec/digest/GitIdentifiers.java | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/apache/commons/codec/digest/GitIdentifiers.java b/src/main/java/org/apache/commons/codec/digest/GitIdentifiers.java index 72b6d8c089..a0ea8748d0 100644 --- a/src/main/java/org/apache/commons/codec/digest/GitIdentifiers.java +++ b/src/main/java/org/apache/commons/codec/digest/GitIdentifiers.java @@ -406,11 +406,15 @@ private static FileMode getGitDirectoryEntryType(final Path path) { } private static byte[] getGitBlobPrefix(final long dataSize) { - return ("blob " + dataSize + "\0").getBytes(StandardCharsets.UTF_8); + return getGitPrefix("blob", dataSize); + } + + private static byte[] getGitPrefix(final String type, final long dataSize) { + return (type + " " + dataSize + "\0").getBytes(StandardCharsets.UTF_8); } private static byte[] getGitTreePrefix(final long dataSize) { - return ("tree " + dataSize + "\0").getBytes(StandardCharsets.UTF_8); + return getGitPrefix("tree", dataSize); } private static void populateFromPath(final TreeIdBuilder builder, final Path directory) throws IOException { From faafda83a468f3ff59ebe3a80a01e70d2b59b4f1 Mon Sep 17 00:00:00 2001 From: "Piotr P. 
Karwasz" Date: Thu, 9 Apr 2026 22:38:52 +0200 Subject: [PATCH 09/14] fix: improve encapsulation --- .../java/org/apache/commons/codec/digest/GitIdentifiers.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/apache/commons/codec/digest/GitIdentifiers.java b/src/main/java/org/apache/commons/codec/digest/GitIdentifiers.java index a0ea8748d0..ba18fd3e5a 100644 --- a/src/main/java/org/apache/commons/codec/digest/GitIdentifiers.java +++ b/src/main/java/org/apache/commons/codec/digest/GitIdentifiers.java @@ -181,7 +181,7 @@ public int hashCode() { * Builds a Git tree identifier for a virtual directory structure, such as the contents of * an archive. */ - public static class TreeIdBuilder { + public static final class TreeIdBuilder { /** * A supplier of a blob identifier that may throw {@link IOException}. @@ -200,7 +200,7 @@ private static void checkPathComponent(String name) { private final Map fileEntries = new HashMap<>(); private final MessageDigest messageDigest; - TreeIdBuilder(final MessageDigest messageDigest) { + private TreeIdBuilder(final MessageDigest messageDigest) { this.messageDigest = Objects.requireNonNull(messageDigest); } From 3eaa9baf5658787fe37c13597fd576c3340a5c12 Mon Sep 17 00:00:00 2001 From: "Piotr P. Karwasz" Date: Thu, 9 Apr 2026 22:41:58 +0200 Subject: [PATCH 10/14] fix: Javadoc on Java 8 --- .../java/org/apache/commons/codec/digest/GitIdentifiers.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/apache/commons/codec/digest/GitIdentifiers.java b/src/main/java/org/apache/commons/codec/digest/GitIdentifiers.java index ba18fd3e5a..439ba61bb4 100644 --- a/src/main/java/org/apache/commons/codec/digest/GitIdentifiers.java +++ b/src/main/java/org/apache/commons/codec/digest/GitIdentifiers.java @@ -230,7 +230,8 @@ public TreeIdBuilder addDirectory(final String name) { * *

The stream is eagerly drained.

* - *

If the size of the stream is known in advance, consider using {@link TreeIdBuilder#addFile(FileMode, String, long, InputStream)} instead.

+ *

If the size of the stream is known in advance, consider using {@link #addFile(GitIdentifiers.FileMode, String, long, InputStream)} + * instead.

* * @param mode The file mode (e.g. {@link FileMode#REGULAR}). * @param name The relative path of the entry in normalized form(may contain {@code '/'}). From 9064aa23dce73af3c0eba956e5480e85192116de Mon Sep 17 00:00:00 2001 From: "Piotr P. Karwasz" Date: Sat, 11 Apr 2026 19:44:58 +0200 Subject: [PATCH 11/14] fix: remove `blobId(MessageDigest, InputStream)` --- .../commons/codec/digest/GitIdentifiers.java | 47 ------------------- .../codec/digest/GitIdentifiersTest.java | 12 ----- 2 files changed, 59 deletions(-) diff --git a/src/main/java/org/apache/commons/codec/digest/GitIdentifiers.java b/src/main/java/org/apache/commons/codec/digest/GitIdentifiers.java index 439ba61bb4..4cd24e346d 100644 --- a/src/main/java/org/apache/commons/codec/digest/GitIdentifiers.java +++ b/src/main/java/org/apache/commons/codec/digest/GitIdentifiers.java @@ -223,26 +223,6 @@ public TreeIdBuilder addDirectory(final String name) { return current; } - /** - * Adds a file entry at the given path within this tree. - * - *

If {@code name} contains {@code '/'}, intermediate subdirectories are created automatically.

- * - *

The stream is eagerly drained.

- * - *

If the size of the stream is known in advance, consider using {@link #addFile(GitIdentifiers.FileMode, String, long, InputStream)} - * instead.

- * - * @param mode The file mode (e.g. {@link FileMode#REGULAR}). - * @param name The relative path of the entry in normalized form(may contain {@code '/'}). - * @param data The file content. - * @throws IOException If the stream cannot be read. - * @throws IllegalArgumentException If any path component is {@code "."} or {@code ".."}. - */ - public void addFile(final FileMode mode, final String name, final InputStream data) throws IOException { - addFile(mode, name, () -> blobId(messageDigest, readAllBytes(data))); - } - /** * Adds a file entry at the given path within this tree, streaming content without buffering. * @@ -330,23 +310,6 @@ public static byte[] blobId(final MessageDigest messageDigest, final byte[] data return DigestUtils.digest(messageDigest, data); } - /** - * Reads through a stream and returns a generalized Git blob identifier. - * - *

The stream is drained and its contents are buffered to determine the size before hashing. To avoid - * buffering, use {@link #blobId(MessageDigest, long, InputStream)} when the size is known in advance.

- * - *

When the hash algorithm is SHA-1, the identifier is identical to Git blob identifier and SWHID contents identifier.

- * - * @param messageDigest The MessageDigest to use (for example SHA-1). - * @param data Stream to digest. - * @return A generalized Git blob identifier. - * @throws IOException On error reading the stream. - */ - public static byte[] blobId(final MessageDigest messageDigest, final InputStream data) throws IOException { - return blobId(messageDigest, readAllBytes(data)); - } - /** * Reads through a stream of known size and returns a generalized Git blob identifier, without buffering. * @@ -441,16 +404,6 @@ private static void populateFromPath(final TreeIdBuilder builder, final Path dir } } - private static byte[] readAllBytes(final InputStream in) throws IOException { - final ByteArrayOutputStream out = new ByteArrayOutputStream(); - final byte[] buf = new byte[DigestUtils.BUFFER_SIZE]; - int n; - while ((n = in.read(buf)) != -1) { - out.write(buf, 0, n); - } - return out.toByteArray(); - } - /** * Reads through a directory and returns a generalized Git tree identifier. * diff --git a/src/test/java/org/apache/commons/codec/digest/GitIdentifiersTest.java b/src/test/java/org/apache/commons/codec/digest/GitIdentifiersTest.java index a1604b6faa..76349ca094 100644 --- a/src/test/java/org/apache/commons/codec/digest/GitIdentifiersTest.java +++ b/src/test/java/org/apache/commons/codec/digest/GitIdentifiersTest.java @@ -83,14 +83,6 @@ void testBlobIdByteArray(final String resourceName, final String expectedSha1Hex assertArrayEquals(Hex.decodeHex(expectedSha1Hex), GitIdentifiers.blobId(DigestUtils.getSha1Digest(), data)); } - @ParameterizedTest - @MethodSource("blobIdProvider") - void testBlobIdInputStream(final String resourceName, final String expectedSha1Hex) throws Exception { - final byte[] data = Files.readAllBytes(resourcePath(resourceName)); - assertArrayEquals(Hex.decodeHex(expectedSha1Hex), - GitIdentifiers.blobId(DigestUtils.getSha1Digest(), new ByteArrayInputStream(data))); - } - @ParameterizedTest @MethodSource("blobIdProvider") void 
testBlobIdInputStreamWithSize(final String resourceName, final String expectedSha1Hex) throws Exception { @@ -206,10 +198,6 @@ void testTreeIdBuilderAddFileInputStream() throws Exception { byteArrayBuilder.addFile(GitIdentifiers.FileMode.REGULAR, "file.txt", content); final byte[] expected = byteArrayBuilder.build(); - final GitIdentifiers.TreeIdBuilder streamBuilder = GitIdentifiers.treeIdBuilder(md); - streamBuilder.addFile(GitIdentifiers.FileMode.REGULAR, "file.txt", new ByteArrayInputStream(content)); - assertArrayEquals(expected, streamBuilder.build()); - final GitIdentifiers.TreeIdBuilder sizedStreamBuilder = GitIdentifiers.treeIdBuilder(md); sizedStreamBuilder.addFile(GitIdentifiers.FileMode.REGULAR, "file.txt", content.length, new ByteArrayInputStream(content)); assertArrayEquals(expected, sizedStreamBuilder.build()); From 235142d69798ed61117078eb08df6585b92fc341 Mon Sep 17 00:00:00 2001 From: "Piotr P. Karwasz" Date: Sat, 11 Apr 2026 19:59:11 +0200 Subject: [PATCH 12/14] fix: add `addSymbolicLink` helper --- .../commons/codec/digest/GitIdentifiers.java | 14 ++++++++++++++ .../commons/codec/digest/GitIdentifiersTest.java | 6 +++--- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/src/main/java/org/apache/commons/codec/digest/GitIdentifiers.java b/src/main/java/org/apache/commons/codec/digest/GitIdentifiers.java index 4cd24e346d..07653568c3 100644 --- a/src/main/java/org/apache/commons/codec/digest/GitIdentifiers.java +++ b/src/main/java/org/apache/commons/codec/digest/GitIdentifiers.java @@ -266,6 +266,20 @@ public void addFile(final FileMode mode, final String name, final byte[] data) t addFile(mode, name, () -> blobId(messageDigest, data)); } + /** + * Adds a symbolic link entry at the give path within this tree. + * + *

If {@code name} contains {@code '/'}, intermediate subdirectories are created automatically.

+ * + * @param name The relative path of the entry in normalized form(may contain {@code '/'}). + * @param target The target of the symbolic link. + * @throws IOException If an I/O error occurs. + * @throws IllegalArgumentException If any path component is {@code "."} or {@code ".."}. + */ + public void addSymbolicLink(final String name, final String target) throws IOException { + addFile(FileMode.SYMBOLIC_LINK, name, target.getBytes(StandardCharsets.UTF_8)); + } + /** * Computes the Git tree identifier for this directory and all its descendants. * diff --git a/src/test/java/org/apache/commons/codec/digest/GitIdentifiersTest.java b/src/test/java/org/apache/commons/codec/digest/GitIdentifiersTest.java index 76349ca094..df54287c5b 100644 --- a/src/test/java/org/apache/commons/codec/digest/GitIdentifiersTest.java +++ b/src/test/java/org/apache/commons/codec/digest/GitIdentifiersTest.java @@ -163,12 +163,12 @@ void testDirectoryEntrySortOrder() { void testTreeIdBuilder(final String algorithm, final String helloHex, final String linkHex, final String runHex, final String srcHex) throws Exception { final byte[] helloContent = "hello\n".getBytes(StandardCharsets.UTF_8); final byte[] runContent = "#!/bin/sh\n".getBytes(StandardCharsets.UTF_8); - final byte[] linkTarget = "subdir".getBytes(StandardCharsets.UTF_8); + final String linkTarget = "subdir"; final MessageDigest md = DigestUtils.getDigest(algorithm); // Verify individual blob IDs against pre-computed constants. assertArrayEquals(Hex.decodeHex(helloHex), GitIdentifiers.blobId(md, helloContent)); - assertArrayEquals(Hex.decodeHex(linkHex), GitIdentifiers.blobId(md, linkTarget)); + assertArrayEquals(Hex.decodeHex(linkHex), GitIdentifiers.blobId(md, linkTarget.getBytes(StandardCharsets.UTF_8))); assertArrayEquals(Hex.decodeHex(runHex), GitIdentifiers.blobId(md, runContent)); // Entries are supplied out of order to verify that the builder sorts them correctly. 
@@ -176,7 +176,7 @@ void testTreeIdBuilder(final String algorithm, final String helloHex, final Stri builder.addDirectory("src"); builder.addFile(GitIdentifiers.FileMode.EXECUTABLE, "run.sh", runContent); builder.addFile(GitIdentifiers.FileMode.REGULAR, "hello.txt", helloContent); - builder.addFile(GitIdentifiers.FileMode.SYMBOLIC_LINK, "link.txt", linkTarget); + builder.addSymbolicLink("link.txt", linkTarget); // Expected tree body: entries in Git sort order (hello.txt, link.txt, run.sh, src/). // Each entry: hex-encoded " \0" followed by the object id. From 31a27923c47a654d81e787ac2059d6b7073a66f5 Mon Sep 17 00:00:00 2001 From: "Piotr P. Karwasz" Date: Sat, 11 Apr 2026 20:11:57 +0200 Subject: [PATCH 13/14] fix: simplify `populateFromPath` and `blobId` --- .../commons/codec/digest/GitIdentifiers.java | 22 +++++-------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/src/main/java/org/apache/commons/codec/digest/GitIdentifiers.java b/src/main/java/org/apache/commons/codec/digest/GitIdentifiers.java index 07653568c3..7cf646aceb 100644 --- a/src/main/java/org/apache/commons/codec/digest/GitIdentifiers.java +++ b/src/main/java/org/apache/commons/codec/digest/GitIdentifiers.java @@ -359,12 +359,11 @@ public static byte[] blobId(final MessageDigest messageDigest, final long dataSi * @throws IOException On error accessing the file. 
*/ public static byte[] blobId(final MessageDigest messageDigest, final Path data) throws IOException { - messageDigest.reset(); if (Files.isSymbolicLink(data)) { final byte[] linkTarget = Files.readSymbolicLink(data).toString().getBytes(StandardCharsets.UTF_8); - DigestUtils.updateDigest(messageDigest, getGitBlobPrefix(linkTarget.length)); - return DigestUtils.digest(messageDigest, linkTarget); + return blobId(messageDigest, linkTarget); } + messageDigest.reset(); DigestUtils.updateDigest(messageDigest, getGitBlobPrefix(Files.size(data))); return DigestUtils.updateDigest(messageDigest, data).digest(); } @@ -400,19 +399,10 @@ private static void populateFromPath(final TreeIdBuilder builder, final Path dir for (final Path path : files) { final String name = Objects.toString(path.getFileName()); final FileMode mode = getGitDirectoryEntryType(path); - switch (mode) { - case DIRECTORY: - populateFromPath(builder.addDirectory(name), path); - break; - case SYMBOLIC_LINK: - final byte[] linkTarget = Files.readSymbolicLink(path).toString().getBytes(StandardCharsets.UTF_8); - builder.addFile(FileMode.SYMBOLIC_LINK, name, linkTarget); - break; - default: - try (InputStream is = Files.newInputStream(path)) { - builder.addFile(mode, name, Files.size(path), is); - } - break; + if (mode == FileMode.DIRECTORY) { + populateFromPath(builder.addDirectory(name), path); + } else { + builder.addFile(mode, name, () -> blobId(builder.messageDigest, path)); } } } From b30589571bdab54cbf54f8a3a433c5644d43ac5e Mon Sep 17 00:00:00 2001 From: "Piotr P. 
Karwasz" Date: Sat, 11 Apr 2026 22:59:07 +0200 Subject: [PATCH 14/14] fix: ignore `.` path segment --- .../commons/codec/digest/GitIdentifiers.java | 25 ++++---- .../codec/digest/GitIdentifiersTest.java | 64 +++++++++---------- 2 files changed, 45 insertions(+), 44 deletions(-) diff --git a/src/main/java/org/apache/commons/codec/digest/GitIdentifiers.java b/src/main/java/org/apache/commons/codec/digest/GitIdentifiers.java index 7cf646aceb..6b4c0ccf2f 100644 --- a/src/main/java/org/apache/commons/codec/digest/GitIdentifiers.java +++ b/src/main/java/org/apache/commons/codec/digest/GitIdentifiers.java @@ -191,11 +191,13 @@ private interface BlobIdSupplier { byte[] get() throws IOException; } - private static void checkPathComponent(String name) { - if (".".equals(name) || "..".equals(name)) { + private static String requireNoParentTraversal(String name) { + if ("..".equals(name)) { throw new IllegalArgumentException("Path component not allowed: " + name); } + return name; } + private final Map dirEntries = new HashMap<>(); private final Map fileEntries = new HashMap<>(); private final MessageDigest messageDigest; @@ -209,16 +211,16 @@ private TreeIdBuilder(final MessageDigest messageDigest) { * * @param name The relative path of the subdirectory in normalized form (may contain {@code '/'}). * @return The {@link TreeIdBuilder} for the subdirectory. - * @throws IllegalArgumentException If any path component is {@code "."} or {@code ".."}. + * @throws IllegalArgumentException If any path component is {@code ".."}. 
*/ public TreeIdBuilder addDirectory(final String name) { TreeIdBuilder current = this; for (final String component : name.split("/", -1)) { - if (component.isEmpty()) { + // Noop segments + if (component.isEmpty() || ".".equals(component)) { continue; } - checkPathComponent(component); - current = current.dirEntries.computeIfAbsent(component, k -> new TreeIdBuilder(messageDigest)); + current = current.dirEntries.computeIfAbsent(requireNoParentTraversal(component), k -> new TreeIdBuilder(messageDigest)); } return current; } @@ -235,17 +237,16 @@ public TreeIdBuilder addDirectory(final String name) { * @param dataSize The exact number of bytes in {@code data}. * @param data The file content. * @throws IOException If the stream cannot be read. - * @throws IllegalArgumentException If any path component is {@code "."} or {@code ".."}. + * @throws IllegalArgumentException If any path component is {@code ".."}. */ public void addFile(final FileMode mode, final String name, final long dataSize, final InputStream data) throws IOException { addFile(mode, name, () -> blobId(messageDigest, dataSize, data)); } private void addFile(final FileMode mode, final String name, final BlobIdSupplier blobId) throws IOException { - final int slash = name.indexOf('/'); + final int slash = name.lastIndexOf('/'); if (slash < 0) { - checkPathComponent(name); - fileEntries.put(name, new DirectoryEntry(name, mode, blobId.get())); + fileEntries.put(name, new DirectoryEntry(requireNoParentTraversal(name), mode, blobId.get())); } else { addDirectory(name.substring(0, slash)).addFile(mode, name.substring(slash + 1), blobId); } @@ -260,7 +261,7 @@ private void addFile(final FileMode mode, final String name, final BlobIdSupplie * @param name The relative path of the entry in normalized form(may contain {@code '/'}). * @param data The file content. * @throws IOException If an I/O error occurs. - * @throws IllegalArgumentException If any path component is {@code "."} or {@code ".."}. 
+ * @throws IllegalArgumentException If any path component is {@code ".."}. */ public void addFile(final FileMode mode, final String name, final byte[] data) throws IOException { addFile(mode, name, () -> blobId(messageDigest, data)); @@ -274,7 +275,7 @@ public void addFile(final FileMode mode, final String name, final byte[] data) t * @param name The relative path of the entry in normalized form(may contain {@code '/'}). * @param target The target of the symbolic link. * @throws IOException If an I/O error occurs. - * @throws IllegalArgumentException If any path component is {@code "."} or {@code ".."}. + * @throws IllegalArgumentException If any path component is {@code ".."}. */ public void addSymbolicLink(final String name, final String target) throws IOException { addFile(FileMode.SYMBOLIC_LINK, name, target.getBytes(StandardCharsets.UTF_8)); diff --git a/src/test/java/org/apache/commons/codec/digest/GitIdentifiersTest.java b/src/test/java/org/apache/commons/codec/digest/GitIdentifiersTest.java index df54287c5b..8c8b7c45b0 100644 --- a/src/test/java/org/apache/commons/codec/digest/GitIdentifiersTest.java +++ b/src/test/java/org/apache/commons/codec/digest/GitIdentifiersTest.java @@ -204,46 +204,19 @@ void testTreeIdBuilderAddFileInputStream() throws Exception { } @Test - void testTreeIdBuilderEmptyPathSegments() throws Exception { - final MessageDigest md = DigestUtils.getSha1Digest(); - final byte[] content = "hello\n".getBytes(StandardCharsets.UTF_8); - - // Canonical form - final GitIdentifiers.TreeIdBuilder canonical = GitIdentifiers.treeIdBuilder(md); - canonical.addFile(GitIdentifiers.FileMode.REGULAR, "subdir/file.txt", content); - final byte[] expected = canonical.build(); - - // Leading slash - final GitIdentifiers.TreeIdBuilder withLeading = GitIdentifiers.treeIdBuilder(md); - withLeading.addFile(GitIdentifiers.FileMode.REGULAR, "/subdir/file.txt", content); - assertArrayEquals(expected, withLeading.build()); - - // Consecutive slashes - final 
GitIdentifiers.TreeIdBuilder withDouble = GitIdentifiers.treeIdBuilder(md); - withDouble.addFile(GitIdentifiers.FileMode.REGULAR, "subdir//file.txt", content); - assertArrayEquals(expected, withDouble.build()); - - // addDirectory with leading/trailing slashes - final GitIdentifiers.TreeIdBuilder viaDirectory = GitIdentifiers.treeIdBuilder(md); - viaDirectory.addDirectory("/subdir/").addFile(GitIdentifiers.FileMode.REGULAR, "file.txt", content); - assertArrayEquals(expected, viaDirectory.build()); - } - - @ParameterizedTest - @ValueSource(strings = {".", ".."}) - void testTreeIdBuilderInvalidPathSegments(final String segment) { + void testTreeIdBuilderInvalidPathSegments() { final MessageDigest md = DigestUtils.getSha1Digest(); final byte[] data = new byte[0]; // Sole path component assertThrows(IllegalArgumentException.class, - () -> GitIdentifiers.treeIdBuilder(md).addFile(GitIdentifiers.FileMode.REGULAR, segment, data)); + () -> GitIdentifiers.treeIdBuilder(md).addFile(GitIdentifiers.FileMode.REGULAR, "..", data)); assertThrows(IllegalArgumentException.class, - () -> GitIdentifiers.treeIdBuilder(md).addDirectory(segment)); + () -> GitIdentifiers.treeIdBuilder(md).addDirectory("..")); // Embedded in a longer path assertThrows(IllegalArgumentException.class, - () -> GitIdentifiers.treeIdBuilder(md).addFile(GitIdentifiers.FileMode.REGULAR, "subdir/" + segment + "/file.txt", data)); + () -> GitIdentifiers.treeIdBuilder(md).addFile(GitIdentifiers.FileMode.REGULAR, "subdir/../file.txt", data)); assertThrows(IllegalArgumentException.class, - () -> GitIdentifiers.treeIdBuilder(md).addDirectory("subdir/" + segment)); + () -> GitIdentifiers.treeIdBuilder(md).addDirectory("subdir/..")); } @Test @@ -260,6 +233,33 @@ void testTreeIdBuilderNestedFileEquivalentToDirectoryAndFile() throws Exception assertArrayEquals(direct.build(), indirect.build()); } + @ParameterizedTest + @ValueSource(strings = {"", "."}) + void testTreeIdBuilderNoopPathSegments(String segment) throws 
Exception { + final MessageDigest md = DigestUtils.getSha1Digest(); + final byte[] content = "hello\n".getBytes(StandardCharsets.UTF_8); + + // Canonical form + final GitIdentifiers.TreeIdBuilder canonical = GitIdentifiers.treeIdBuilder(md); + canonical.addFile(GitIdentifiers.FileMode.REGULAR, "subdir/file.txt", content); + final byte[] expected = canonical.build(); + + // Leading segment + final GitIdentifiers.TreeIdBuilder withLeading = GitIdentifiers.treeIdBuilder(md); + withLeading.addFile(GitIdentifiers.FileMode.REGULAR, segment + "/subdir/file.txt", content); + assertArrayEquals(expected, withLeading.build()); + + // Intermediate segment + final GitIdentifiers.TreeIdBuilder withIntermediate = GitIdentifiers.treeIdBuilder(md); + withIntermediate.addFile(GitIdentifiers.FileMode.REGULAR, "subdir/" + segment + "/file.txt", content); + assertArrayEquals(expected, withIntermediate.build()); + + // addDirectory with leading/trailing segments + final GitIdentifiers.TreeIdBuilder viaDirectory = GitIdentifiers.treeIdBuilder(md); + viaDirectory.addDirectory(segment + "/subdir/" + segment).addFile(GitIdentifiers.FileMode.REGULAR, "file.txt", content); + assertArrayEquals(expected, viaDirectory.build()); + } + @Test void testTreeIdPath() throws Exception { assertArrayEquals(Hex.decodeHex("e4b21f6d78ceba6eb7c211ac15e3337ec4614e8a"),