Jakob Sultan Ericsson created COMPRESS-501:
----------------------------------------------

             Summary: Possibility to introduce a fast Zip open with some caveats
                 Key: COMPRESS-501
                 URL: https://issues.apache.org/jira/browse/COMPRESS-501
             Project: Commons Compress
          Issue Type: Improvement
          Components: Archivers
    Affects Versions: 1.19
         Environment: OSX 10.14.6 and Linux
            Reporter: Jakob Sultan Ericsson
         Attachments: zipfile-speed-improvements.diff

About a year ago I created an improvement 
(https://issues.apache.org/jira/browse/COMPRESS-466) to speed up some things in 
commons-compress for Zip-files. This helped us quite a lot but we wanted it to 
be even faster so I optimised away some stuff that I thought was not that 
important for us.
I was able to improve opening of a 34GB zip file from ~12s to ~2s.

Now to my question, do you think it would be possible to introduce some of my 
fixes (diff included) into master?

Yes, I know that I shortcut some features for some specific zip files and don't 
expose everything anymore.

I haven't really made a good switchable solution for it because we just use our 
own build locally with this path.

But with some hints from you I might be able to do it somehow. I'm happy to 
help and would love to get this speed open into master (it is always cumbersome 
with custom changes to public libraries). 

{code:java}
diff --git 
a/src/main/java/org/apache/commons/compress/archivers/zip/ZipArchiveEntry.java 
b/src/main/java/org/apache/commons/compress/archivers/zip/ZipArchiveEntry.java
index 767f615d..d441b12d 100644
--- 
a/src/main/java/org/apache/commons/compress/archivers/zip/ZipArchiveEntry.java
+++ 
b/src/main/java/org/apache/commons/compress/archivers/zip/ZipArchiveEntry.java
@@ -146,6 +146,7 @@
     private boolean isStreamContiguous = false;
     private NameSource nameSource = NameSource.NAME;
     private CommentSource commentSource = CommentSource.COMMENT;
+    private byte[] cdExtraData = null;
 
 
     /**
@@ -397,6 +398,14 @@ public void setAlignment(int alignment) {
         this.alignment = alignment;
     }
 
+    public void setRawCentralDirectoryExtra(byte[] cdExtraData) {
+        this.cdExtraData = cdExtraData;
+    }
+
+    public byte[] getRawCentralDirectoryExtra() {
+        return this.cdExtraData;
+    }
+
     /**
      * Replaces all currently attached extra fields with the new array.
      * @param fields an array of extra fields
diff --git 
a/src/main/java/org/apache/commons/compress/archivers/zip/ZipFile.java 
b/src/main/java/org/apache/commons/compress/archivers/zip/ZipFile.java
index 152272b5..bb33b50f 100644
--- a/src/main/java/org/apache/commons/compress/archivers/zip/ZipFile.java
+++ b/src/main/java/org/apache/commons/compress/archivers/zip/ZipFile.java
@@ -691,10 +691,10 @@ protected void finalize() throws Throwable {
         final HashMap<ZipArchiveEntry, NameAndComment> noUTF8Flag =
             new HashMap<>();
 
-        positionAtCentralDirectory();
+        ByteBuffer ceDir = positionAtCentralDirectory();
 
         wordBbuf.rewind();
-        IOUtils.readFully(archive, wordBbuf);
+        ceDir.get(wordBuf);
         long sig = ZipLong.getValue(wordBuf);
 
         if (sig != CFH_SIG && startsWithLocalFileHeader()) {
@@ -703,9 +703,12 @@ protected void finalize() throws Throwable {
         }
 
         while (sig == CFH_SIG) {
-            readCentralDirectoryEntry(noUTF8Flag);
+            readCentralDirectoryEntry(ceDir, noUTF8Flag);
             wordBbuf.rewind();
-            IOUtils.readFully(archive, wordBbuf);
+            if (ceDir.remaining() == 0) {
+                break;
+            }
+            ceDir.get(wordBuf);
             sig = ZipLong.getValue(wordBuf);
         }
         return noUTF8Flag;
@@ -721,10 +724,10 @@ protected void finalize() throws Throwable {
      * added to this map.
      */
     private void
-        readCentralDirectoryEntry(final Map<ZipArchiveEntry, NameAndComment> 
noUTF8Flag)
+        readCentralDirectoryEntry(ByteBuffer ceDir, final Map<ZipArchiveEntry, 
NameAndComment> noUTF8Flag)
         throws IOException {
         cfhBbuf.rewind();
-        IOUtils.readFully(archive, cfhBbuf);
+        ceDir.get(cfhBuf);
         int off = 0;
         final Entry ze = new Entry();
 
@@ -752,8 +755,9 @@ protected void finalize() throws Throwable {
         ze.setMethod(ZipShort.getValue(cfhBuf, off));
         off += SHORT;
 
-        final long time = ZipUtil.dosToJavaTime(ZipLong.getValue(cfhBuf, off));
-        ze.setTime(time);
+        //long ts = ZipLong.getValue(cfhBuf, off);
+        //final long time = ZipUtil.dosToJavaTime(ts);
+        //ze.setTime(time);
         off += WORD;
 
         ze.setCrc(ZipLong.getValue(cfhBuf, off));
@@ -784,7 +788,7 @@ protected void finalize() throws Throwable {
         off += WORD;
 
         final byte[] fileName = new byte[fileNameLen];
-        IOUtils.readFully(archive, ByteBuffer.wrap(fileName));
+        ceDir.get(fileName);
         ze.setName(entryEncoding.decode(fileName), fileName);
 
         // LFH offset,
@@ -792,19 +796,22 @@ protected void finalize() throws Throwable {
         // data offset will be filled later
         entries.add(ze);
 
+//        ceDir.position(ceDir.position() + extraLen + commentLen);
         final byte[] cdExtraData = new byte[extraLen];
-        IOUtils.readFully(archive, ByteBuffer.wrap(cdExtraData));
-        ze.setCentralDirectoryExtra(cdExtraData);
+        ceDir.get(cdExtraData);
+        ze.setRawCentralDirectoryExtra(cdExtraData);
+//        ze.setCentralDirectoryExtra(cdExtraData);
 
-        setSizesAndOffsetFromZip64Extra(ze, diskStart);
+//        setSizesAndOffsetFromZip64Extra(ze, diskStart);
 
-        final byte[] comment = new byte[commentLen];
-        IOUtils.readFully(archive, ByteBuffer.wrap(comment));
-        ze.setComment(entryEncoding.decode(comment));
-
-        if (!hasUTF8Flag && useUnicodeExtraFields) {
-            noUTF8Flag.put(ze, new NameAndComment(fileName, comment));
-        }
+        ceDir.position(ceDir.position() + commentLen);
+//        final byte[] comment = new byte[commentLen];
+//        ceDir.get(comment);
+//        ze.setComment(entryEncoding.decode(comment));
+//
+//        if (!hasUTF8Flag && useUnicodeExtraFields) {
+//            noUTF8Flag.put(ze, new NameAndComment(fileName, comment));
+//        }
 
         ze.setStreamContiguous(true);
     }
@@ -953,9 +960,10 @@ private void setSizesAndOffsetFromZip64Extra(final 
ZipArchiveEntry ze,
      * it and positions the stream at the first central directory
      * record.
      */
-    private void positionAtCentralDirectory()
+    private ByteBuffer positionAtCentralDirectory()
         throws IOException {
         positionAtEndOfCentralDirectoryRecord();
+        long end = archive.position();
         boolean found = false;
         final boolean searchedForZip64EOCD =
             archive.position() > ZIP64_EOCDL_LENGTH;
@@ -975,6 +983,12 @@ private void positionAtCentralDirectory()
         } else {
             positionAtCentralDirectory64();
         }
+        long start = archive.position();
+        ByteBuffer bigb = ByteBuffer.allocateDirect((int)(end - start));
+        archive.read(bigb);
+        archive.position(start);
+        bigb.rewind();
+        return bigb;
     }
 
     /**
@@ -1168,6 +1182,8 @@ private void fillNameMap() {
     private long getDataOffset(ZipArchiveEntry ze) throws IOException {
         long s = ze.getDataOffset();
         if (s == EntryStreamOffsets.OFFSET_UNKNOWN) {
+            ze.setCentralDirectoryExtra(ze.getRawCentralDirectoryExtra());
+            setSizesAndOffsetFromZip64Extra(ze, 0);
             setDataOffset(ze);
             return ze.getDataOffset();
         }
{code}




--
This message was sent by Atlassian Jira
(v8.3.4#803005)

Reply via email to