theobisproject commented on pull request #113: URL: https://github.com/apache/commons-compress/pull/113#issuecomment-730541671
The benchmark was added to the https://github.com/bodewig/commons-compress-benchmarks repository for easier setup. ```java package de.samaflost.commons_compress.tar; import org.apache.commons.compress.archivers.tar.TarArchiveEntry; import org.apache.commons.compress.archivers.tar.TarArchiveInputStream; import org.apache.commons.compress.archivers.tar.TarFile; import org.apache.commons.compress.utils.IOUtils; import org.openjdk.jmh.annotations.Benchmark; import org.openjdk.jmh.annotations.Fork; import org.openjdk.jmh.annotations.Measurement; import org.openjdk.jmh.annotations.OutputTimeUnit; import org.openjdk.jmh.annotations.Scope; import org.openjdk.jmh.annotations.State; import org.openjdk.jmh.annotations.Warmup; import org.openjdk.jmh.util.NullOutputStream; import java.io.IOException; import java.io.InputStream; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.util.concurrent.TimeUnit; @State(Scope.Benchmark) @Fork(1) @OutputTimeUnit(TimeUnit.SECONDS) @Measurement(iterations = 10) @Warmup(iterations = 10) public class ReadLargeTarBenchmark { private InputStream getLargeTarStream() throws IOException { return Files.newInputStream(getLargeTarPath()); } private Path getLargeTarPath() { return Paths.get("path-to-file"); } private void readEntry(final InputStream inputStream) throws IOException { IOUtils.copy(inputStream, new NullOutputStream(), 8192); } @Benchmark public void readAllEntries_tarFile() throws IOException { try (TarFile tarFile = new TarFile(getLargeTarPath())) { for (TarArchiveEntry entry : tarFile.getEntries()) { try (InputStream entryInput = tarFile.getInputStream(entry)) { readEntry(entryInput); } } } } @Benchmark public void readAllEntries_tarStream() throws IOException { try (TarArchiveInputStream tarIn = new TarArchiveInputStream(getLargeTarStream())) { TarArchiveEntry entry; while ((entry = tarIn.getNextTarEntry()) != null) { readEntry(tarIn); } } } @Benchmark public void readFirstEntry_tarFile() 
throws IOException { try (TarFile tarFile = new TarFile(getLargeTarPath())) { try (InputStream entryInput = tarFile.getInputStream(tarFile.getEntries().get(0))) { readEntry(entryInput); } } } @Benchmark public void readFirstEntry_tarStream() throws IOException { try (TarArchiveInputStream tarIn = new TarArchiveInputStream(getLargeTarStream())) { tarIn.getNextTarEntry(); readEntry(tarIn); } } @Benchmark public void readSecondEntry_tarFile() throws IOException { try (TarFile tarFile = new TarFile(getLargeTarPath())) { try (InputStream entryInput = tarFile.getInputStream(tarFile.getEntries().get(1))) { readEntry(entryInput); } } } @Benchmark public void readSecondEntry_tarStream() throws IOException { try (TarArchiveInputStream tarIn = new TarArchiveInputStream(getLargeTarStream())) { tarIn.getNextTarEntry(); tarIn.getNextTarEntry(); readEntry(tarIn); } } @Benchmark public void readLastEntry_tarFile() throws IOException { try (TarFile tarFile = new TarFile(getLargeTarPath())) { try (InputStream entryInput = tarFile.getInputStream(tarFile.getEntries().get(tarFile.getEntries().size() - 1))) { readEntry(entryInput); } } } @Benchmark public void readLastEntry_tarStream() throws IOException { try (TarArchiveInputStream tarIn = new TarArchiveInputStream(getLargeTarStream())) { tarIn.getNextTarEntry(); tarIn.getNextTarEntry(); tarIn.getNextTarEntry(); readEntry(tarIn); } } } ``` This is the result on my machine (Windows 10, Java 11.0.9, File on SSD) with the testfile containing 3 times the Ubuntu 20.04 Desktop image. 
``` Benchmark Mode Cnt Score Error Units ReadLargeTarBenchmark.readAllEntries_tarFile thrpt 10 0,228 ± 0,003 ops/s ReadLargeTarBenchmark.readAllEntries_tarStream thrpt 10 0,256 ± 0,002 ops/s ReadLargeTarBenchmark.readFirstEntry_tarFile thrpt 10 0,680 ± 0,010 ops/s ReadLargeTarBenchmark.readFirstEntry_tarStream thrpt 10 0,766 ± 0,006 ops/s ReadLargeTarBenchmark.readLastEntry_tarFile thrpt 10 0,689 ± 0,006 ops/s ReadLargeTarBenchmark.readLastEntry_tarStream thrpt 10 0,127 ± 0,001 ops/s ReadLargeTarBenchmark.readSecondEntry_tarFile thrpt 10 0,690 ± 0,012 ops/s ReadLargeTarBenchmark.readSecondEntry_tarStream thrpt 10 0,218 ± 0,003 ops/s ``` I think this demonstrates the strengths and weaknesses of the implementation. **Strengths** - Performance of random access is constant - Extracting all entries is close to the performance of `TarArchiveInputStream` **Weaknesses** - Extracting only the first entry is slower because of the extra work that is done for random access The performance gap should be smaller when the archive is smaller. Feel free to experiment with different tar contents and numbers of files and report back your results. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org