Hi, Changeset 1aaeb8fbe705 [1] implemented a good set of enhancements, to allow the zip code to properly support zip files containing names and comments either encoded in non-UTF-8 codesets, or which contain supplementary characters.
This support is achieved by doing the character conversion in Java code (using the code in ZipCoder.getBytes()), then feeding the resulting byte[] through to native code. This approach is more costly, both in terms of memory allocation and amount of data copying, than the preceding one, which did the conversion directly into a native byte[] using JNI's 'GetStringUTFRegion', which assumes a codeset of "modified UTF-8". The conversion performed for "modified UTF-8" is the same as that for standard UTF-8 for all codepoints except those for supplementary characters or for the character '\u0000' [2]. As the "common-case" is likely to be dealing with zip files with UTF-8 encoded names and comments containing neither supplementary characters nor '\u0000', the increase in cost of the conversion in this case is unfortunate. By introducing an extra check to determine if a UTF-8 encoded string may be safely converted using "modified UTF-8" (ie. to check that it has not any supplementary codepoints or '\u0000' in it), and using the aforementioned JNI routine to do the conversion if the check succeeds, I've noticed a decent performance improvement can be achieved by catering for this common-case. This benefit is particularly significant for applications which do a lot of rummaging around in zip files, or those involving lots of jar files and/or jar files with lots of entries in them. Please find below a changeset which implements the check I describe above ("isSafeToUseModifiedUTF8"), and uses it to choose whether to call a new modified-UTF8-specific variant of the native getEntry() method ("getEntryByModifiedUTF8"), which makes use of the JNI 'GetStringUTFRegion' method for the conversion. Please let me know if you have and comments, criticism of suggestions on this, Thanks, Neil [1] http://hg.openjdk.java.net/jdk7/jdk7/jdk/rev/1aaeb8fbe705 [2] http://java.sun.com/developer/technicalArticles/Intl/Supplementary/#Modified_UTF-8 -- Unless stated above: IBM email: neil_richards at uk.ibm.com IBM United Kingdom Limited - Registered in England and Wales with number 741598. Registered office: PO Box 41, North Harbour, Portsmouth, Hampshire PO6 3AU # HG changeset patch # User Neil Richards <neil.richa...@ngmr.net>, <neil_richa...@uk.ibm.com> # Date 1303394058 -3600 # Branch ojdk-121 # Node ID e5c8d8da677bce826b6bafc379d67c5d7775e7f6 # Parent 60d3d55dcc9c31a30ced9caa6ef5c0dcd7db031d Summary: Optimize name conversion when it is safe to use modified UTF-8 (common case) Contributed-by: <neil.richa...@ngmr.net> diff -r 60d3d55dcc9c -r e5c8d8da677b make/java/zip/mapfile-vers --- a/make/java/zip/mapfile-vers Wed Apr 13 16:56:16 2011 -0700 +++ b/make/java/zip/mapfile-vers Thu Apr 21 14:54:18 2011 +0100 @@ -54,6 +54,7 @@ Java_java_util_zip_ZipFile_getCommentBytes; Java_java_util_zip_ZipFile_freeEntry; Java_java_util_zip_ZipFile_getEntry; + Java_java_util_zip_ZipFile_getEntryByModifiedUTF8; Java_java_util_zip_ZipFile_getEntryBytes; Java_java_util_zip_ZipFile_getEntryCrc; Java_java_util_zip_ZipFile_getEntryCSize; diff -r 60d3d55dcc9c -r e5c8d8da677b src/share/classes/java/util/zip/ZipFile.java --- a/src/share/classes/java/util/zip/ZipFile.java Wed Apr 13 16:56:16 2011 -0700 +++ b/src/share/classes/java/util/zip/ZipFile.java Thu Apr 21 14:54:18 2011 +0100 @@ -283,6 +283,24 @@ } } + private static boolean isSafeToUseModifiedUTF8(String s) { + final int length = s.length(); + + int codepoint; + // It's okay to step through the string one char at a time, as any + // codepoint encoded to more than one char will be greater or equal to + // Character.MIN_SUPPLEMENTARY_CODE_POINT, and so will have caused this + // method to return 'false'. + for (int i = 0; i < length; i++) { + codepoint = s.codePointAt(i); + if ((0 == codepoint) || + (codepoint >= Character.MIN_SUPPLEMENTARY_CODE_POINT)) { + return false; + } + } + return true; + } + /** * Returns the zip file entry for the specified name, or null * if not found. @@ -298,7 +316,13 @@ long jzentry = 0; synchronized (this) { ensureOpen(); - jzentry = getEntry(jzfile, zc.getBytes(name), true); + // Optimize if 'name' can be converted safely using + // "modified UTF-8" (common case) + if (zc.isUTF8() && isSafeToUseModifiedUTF8(name)) { + jzentry = getEntryByModifiedUTF8(jzfile, name, true); + } else { + jzentry = getEntry(jzfile, zc.getBytes(name), true); + } if (jzentry != 0) { ZipEntry ze = getZipEntry(name, jzentry); freeEntry(jzfile, jzentry); @@ -308,6 +332,9 @@ return null; } + private static native long getEntryByModifiedUTF8(long jzfile, String name, + boolean addSlash); + private static native long getEntry(long jzfile, byte[] name, boolean addSlash); @@ -339,8 +366,15 @@ ZipFileInputStream in = null; synchronized (this) { ensureOpen(); - if (!zc.isUTF8() && (entry.flag & EFS) != 0) { - jzentry = getEntry(jzfile, zc.getBytesUTF8(entry.name), false); + if (zc.isUTF8() || ((entry.flag & EFS) != 0)) { + // Optimize if 'entry.name' can be converted safely using + // "modified UTF-8" (common case) + if (isSafeToUseModifiedUTF8(entry.name)) { + jzentry = getEntryByModifiedUTF8(jzfile, entry.name, false); + } else { + jzentry = + getEntry(jzfile, zc.getBytesUTF8(entry.name), false); + } } else { jzentry = getEntry(jzfile, zc.getBytes(entry.name), false); } diff -r 60d3d55dcc9c -r e5c8d8da677b src/share/native/java/util/zip/ZipFile.c --- a/src/share/native/java/util/zip/ZipFile.c Wed Apr 13 16:56:16 2011 -0700 +++ b/src/share/native/java/util/zip/ZipFile.c Thu Apr 21 14:54:18 2011 +0100 @@ -142,11 +142,12 @@ ZIP_Close(jlong_to_ptr(zfile)); } +#define MAXNAME 1024 + JNIEXPORT jlong JNICALL Java_java_util_zip_ZipFile_getEntry(JNIEnv *env, jclass cls, jlong zfile, jbyteArray name, jboolean addSlash) { -#define MAXNAME 1024 jzfile *zip = jlong_to_ptr(zfile); jsize ulen = (*env)->GetArrayLength(env, name); char buf[MAXNAME+2], *path; @@ -170,6 +171,40 @@ } if (path != buf) { free(path); + path = NULL; + } + return ptr_to_jlong(ze); +} + +JNIEXPORT jlong JNICALL +Java_java_util_zip_ZipFile_getEntryByModifiedUTF8(JNIEnv *env, jclass cls, + jlong zfile, jstring name, jboolean addSlash) +{ + jzfile *zip = jlong_to_ptr(zfile); + jsize slen = (*env)->GetStringLength(env, name); + jsize ulen = (*env)->GetStringUTFLength(env, name); + char buf[MAXNAME+2], *path; + jzentry *ze; + + if (ulen > MAXNAME) { + path = malloc(ulen + 2); + if (path == 0) { + JNU_ThrowOutOfMemoryError(env, 0); + return 0; + } + } else { + path = buf; + } + (*env)->GetStringUTFRegion(env, name, 0, slen, path); + path[ulen] = '\0'; + if (addSlash == JNI_FALSE) { + ze = ZIP_GetEntry(zip, path, 0); + } else { + ze = ZIP_GetEntry(zip, path, (jint)ulen); + } + if (path != buf) { + free(path); + path = NULL; } return ptr_to_jlong(ze); }