Request for review: Optimize common-case UTF8 path for native ZipFile.getEntry calls

Neil Richards Mon, 25 Apr 2011 09:46:40 -0700

Hi,
Changeset 1aaeb8fbe705 [1] implemented a good set of enhancements, to
allow the zip code to properly support zip files containing names and
comments either encoded in non-UTF-8 codesets, or which contain
supplementary characters.


This support is achieved by doing the character conversion in Java code
(using the code in ZipCoder.getBytes()), then feeding the resulting
byte[] through to native code.

This approach is more costly, both in terms of memory allocation and
amount of data copying, than the preceding one, which did the conversion
directly into a native byte[] using JNI's 'GetStringUTFRegion', which
assumes a codeset of "modified UTF-8".

The conversion performed for "modified UTF-8" is the same as that for
standard UTF-8 for all codepoints except those for supplementary
characters or for the character '\u0000' [2].

As the "common-case" is likely to be dealing with zip files with UTF-8
encoded names and comments containing neither supplementary characters
nor '\u0000', the increase in cost of the conversion in this case is
unfortunate.

By introducing an extra check to determine if a UTF-8 encoded string may
be safely converted using "modified UTF-8" (ie. to check that it has not
any supplementary codepoints or '\u0000' in it), and using the
aforementioned JNI routine to do the conversion if the check succeeds,
I've noticed a decent performance improvement can be achieved by
catering for this common-case.

This benefit is particularly significant for applications which do a lot
of rummaging around in zip files, or those involving lots of jar files
and/or jar files with lots of entries in them.

Please find below a changeset which implements the check I describe
above ("isSafeToUseModifiedUTF8"), and uses it to choose whether to call
a new modified-UTF8-specific variant of the native getEntry() method
("getEntryByModifiedUTF8"), which makes use of the JNI
'GetStringUTFRegion' method for the conversion.

Please let me know if you have and comments, criticism of suggestions on
this,
Thanks,
Neil

[1] http://hg.openjdk.java.net/jdk7/jdk7/jdk/rev/1aaeb8fbe705
[2] 
http://java.sun.com/developer/technicalArticles/Intl/Supplementary/#Modified_UTF-8

-- 
Unless stated above:
IBM email: neil_richards at uk.ibm.com
IBM United Kingdom Limited - Registered in England and Wales with number 741598.
Registered office: PO Box 41, North Harbour, Portsmouth, Hampshire PO6 3AU


# HG changeset patch
# User Neil Richards <neil.richa...@ngmr.net>, <neil_richa...@uk.ibm.com>
# Date 1303394058 -3600
# Branch ojdk-121
# Node ID e5c8d8da677bce826b6bafc379d67c5d7775e7f6
# Parent  60d3d55dcc9c31a30ced9caa6ef5c0dcd7db031d
Summary: Optimize name conversion when it is safe to use modified UTF-8 (common 
case)
Contributed-by: <neil.richa...@ngmr.net>

diff -r 60d3d55dcc9c -r e5c8d8da677b make/java/zip/mapfile-vers
--- a/make/java/zip/mapfile-vers        Wed Apr 13 16:56:16 2011 -0700
+++ b/make/java/zip/mapfile-vers        Thu Apr 21 14:54:18 2011 +0100
@@ -54,6 +54,7 @@
                Java_java_util_zip_ZipFile_getCommentBytes;
                Java_java_util_zip_ZipFile_freeEntry;
                Java_java_util_zip_ZipFile_getEntry;
+               Java_java_util_zip_ZipFile_getEntryByModifiedUTF8;
                Java_java_util_zip_ZipFile_getEntryBytes;
                Java_java_util_zip_ZipFile_getEntryCrc;
                Java_java_util_zip_ZipFile_getEntryCSize;
diff -r 60d3d55dcc9c -r e5c8d8da677b 
src/share/classes/java/util/zip/ZipFile.java
--- a/src/share/classes/java/util/zip/ZipFile.java      Wed Apr 13 16:56:16 
2011 -0700
+++ b/src/share/classes/java/util/zip/ZipFile.java      Thu Apr 21 14:54:18 
2011 +0100
@@ -283,6 +283,24 @@
         }
     }
 
+    private static boolean isSafeToUseModifiedUTF8(String s) {
+        final int length = s.length();
+
+        int codepoint;
+        // It's okay to step through the string one char at a time, as any 
+        // codepoint encoded to more than one char will be greater or equal to
+        // Character.MIN_SUPPLEMENTARY_CODE_POINT, and so will have caused this
+        // method to return 'false'.
+        for (int i = 0; i < length; i++) {
+            codepoint = s.codePointAt(i);
+            if ((0 == codepoint) || 
+                    (codepoint >= Character.MIN_SUPPLEMENTARY_CODE_POINT)) {
+                return false;
+            }
+        }
+        return true;
+    }
+
     /**
      * Returns the zip file entry for the specified name, or null
      * if not found.
@@ -298,7 +316,13 @@
         long jzentry = 0;
         synchronized (this) {
             ensureOpen();
-            jzentry = getEntry(jzfile, zc.getBytes(name), true);
+            // Optimize if 'name' can be converted safely using 
+            // "modified UTF-8" (common case)
+            if (zc.isUTF8() && isSafeToUseModifiedUTF8(name)) {
+                jzentry = getEntryByModifiedUTF8(jzfile, name, true);
+            } else {
+                jzentry = getEntry(jzfile, zc.getBytes(name), true);
+            }
             if (jzentry != 0) {
                 ZipEntry ze = getZipEntry(name, jzentry);
                 freeEntry(jzfile, jzentry);
@@ -308,6 +332,9 @@
         return null;
     }
 
+    private static native long getEntryByModifiedUTF8(long jzfile, String name,
+                                        boolean addSlash);
+
     private static native long getEntry(long jzfile, byte[] name,
                                         boolean addSlash);
 
@@ -339,8 +366,15 @@
         ZipFileInputStream in = null;
         synchronized (this) {
             ensureOpen();
-            if (!zc.isUTF8() && (entry.flag & EFS) != 0) {
-                jzentry = getEntry(jzfile, zc.getBytesUTF8(entry.name), false);
+            if (zc.isUTF8() || ((entry.flag & EFS) != 0)) {
+                // Optimize if 'entry.name' can be converted safely using 
+                // "modified UTF-8" (common case)
+                if (isSafeToUseModifiedUTF8(entry.name)) {
+                    jzentry = getEntryByModifiedUTF8(jzfile, entry.name, 
false);
+                } else {
+                    jzentry = 
+                        getEntry(jzfile, zc.getBytesUTF8(entry.name), false);
+                }
             } else {
                 jzentry = getEntry(jzfile, zc.getBytes(entry.name), false);
             }
diff -r 60d3d55dcc9c -r e5c8d8da677b src/share/native/java/util/zip/ZipFile.c
--- a/src/share/native/java/util/zip/ZipFile.c  Wed Apr 13 16:56:16 2011 -0700
+++ b/src/share/native/java/util/zip/ZipFile.c  Thu Apr 21 14:54:18 2011 +0100
@@ -142,11 +142,12 @@
     ZIP_Close(jlong_to_ptr(zfile));
 }
 
+#define MAXNAME 1024
+
 JNIEXPORT jlong JNICALL
 Java_java_util_zip_ZipFile_getEntry(JNIEnv *env, jclass cls, jlong zfile,
                                     jbyteArray name, jboolean addSlash)
 {
-#define MAXNAME 1024
     jzfile *zip = jlong_to_ptr(zfile);
     jsize ulen = (*env)->GetArrayLength(env, name);
     char buf[MAXNAME+2], *path;
@@ -170,6 +171,40 @@
     }
     if (path != buf) {
         free(path);
+        path = NULL;
+    }
+    return ptr_to_jlong(ze);
+}
+
+JNIEXPORT jlong JNICALL
+Java_java_util_zip_ZipFile_getEntryByModifiedUTF8(JNIEnv *env, jclass cls, 
+        jlong zfile, jstring name, jboolean addSlash)
+{
+    jzfile *zip = jlong_to_ptr(zfile);
+    jsize slen = (*env)->GetStringLength(env, name);
+    jsize ulen = (*env)->GetStringUTFLength(env, name);
+    char buf[MAXNAME+2], *path;
+    jzentry *ze;
+
+    if (ulen > MAXNAME) {
+        path = malloc(ulen + 2);
+        if (path == 0) {
+            JNU_ThrowOutOfMemoryError(env, 0);
+            return 0;
+        }
+    } else {
+        path = buf;
+    }
+    (*env)->GetStringUTFRegion(env, name, 0, slen, path);
+    path[ulen] = '\0';
+    if (addSlash == JNI_FALSE) {
+        ze = ZIP_GetEntry(zip, path, 0);
+    } else {
+        ze = ZIP_GetEntry(zip, path, (jint)ulen);
+    }
+    if (path != buf) {
+        free(path);
+        path = NULL;
     }
     return ptr_to_jlong(ze);
 }

Request for review: Optimize common-case UTF8 path for native ZipFile.getEntry calls

Reply via email to