This is an automated email from the ASF dual-hosted git repository. rcordier pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/james-mime4j.git
The following commit(s) were added to refs/heads/master by this push: new bdb2264a Fixes parsing headers containing UTF-8 characters (#103) bdb2264a is described below commit bdb2264abd17badf70e27f7e32907bccc82d005e Author: Thomas <thomas.weinl...@ul.com> AuthorDate: Fri Apr 26 04:51:34 2024 +0200 Fixes parsing headers containing UTF-8 characters (#103) --- .../apache/james/mime4j/stream/RawFieldParser.java | 46 +++++++--------------- .../org/apache/james/mime4j/util/CharsetUtil.java | 21 ++++++++++ .../org/apache/james/mime4j/util/ContentUtil.java | 7 +++- .../james/mime4j/stream/RawFieldParserTest.java | 28 +++++++++++++ .../field/LenientContentDispositionFieldTest.java | 23 +++-------- 5 files changed, 75 insertions(+), 50 deletions(-) diff --git a/core/src/main/java/org/apache/james/mime4j/stream/RawFieldParser.java b/core/src/main/java/org/apache/james/mime4j/stream/RawFieldParser.java index b546b302..8437927f 100644 --- a/core/src/main/java/org/apache/james/mime4j/stream/RawFieldParser.java +++ b/core/src/main/java/org/apache/james/mime4j/stream/RawFieldParser.java @@ -195,12 +195,6 @@ public class RawFieldParser { * is not delimited by any character. */ public String parseValue(final ByteSequence buf, final ParserCursor cursor, final BitSet delimiters) { - if (!CharsetUtil.isASCII(buf)) { - String value = parseUtf8Filename(buf); - if (value != null) - return value; - } - StringBuilder dst = new StringBuilder(); boolean whitespace = false; while (!cursor.atEnd()) { @@ -229,25 +223,6 @@ public class RawFieldParser { return dst.toString(); } - /** - * Special case for parsing {@code filename} attribute in nonstandard encoding like: - * {@code Content-Disposition: attachment; filename="УПД ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ "СТАНЦИЯ ВИРТУАЛЬНАЯ" 01-05-21.pdf"} - * - * @param buf field raw. - * @return filename value or {@code null}. 
- */ - private String parseUtf8Filename(ByteSequence buf) { - final String value = new String(buf.toByteArray(), StandardCharsets.UTF_8); - - final String prefix = "filename=\""; - final int pos = value.indexOf(prefix); - if (pos > 0) { - return value.substring(pos + prefix.length(), value.length() - 1); - } - - return null; - } - /** * Skips semantically insignificant whitespace characters and moves the cursor to the closest * non-whitespace character. @@ -379,16 +354,22 @@ public class RawFieldParser { int pos = cursor.getPos(); int indexFrom = cursor.getPos(); int indexTo = cursor.getUpperBound(); + + ByteArrayBuffer dstRaw = new ByteArrayBuffer(indexTo - indexFrom); + for (int i = indexFrom; i < indexTo; i++) { - char current = (char) (buf.byteAt(i) & 0xff); + byte currentByte = buf.byteAt(i); + char current = (char) (currentByte & 0xff); if ((delimiters != null && delimiters.get(current)) || CharsetUtil.isWhitespace(current) || current == '(' || current == '\"') { break; } else { pos++; - dst.append(current); + dstRaw.append(currentByte); } } + String decoded = CharsetUtil.isASCII(dstRaw) ? 
ContentUtil.decode(dstRaw) : ContentUtil.decode(StandardCharsets.UTF_8, dstRaw); + dst.append(decoded); cursor.updatePos(pos); } @@ -414,16 +395,17 @@ public class RawFieldParser { pos++; indexFrom++; - ByteArrayBuffer dstRaw = new ByteArrayBuffer(200); + ByteArrayBuffer dstRaw = new ByteArrayBuffer(indexTo - indexFrom); boolean escaped = false; for (int i = indexFrom; i < indexTo; i++, pos++) { - current = (char) (buf.byteAt(i) & 0xff); + byte currentByte = buf.byteAt(i); + current = (char) (currentByte & 0xff); if (escaped) { if (current != '\"' && current != '\\') { dstRaw.append('\\'); } - dstRaw.append(current); + dstRaw.append(currentByte); escaped = false; } else { if (current == '\"') { @@ -433,12 +415,12 @@ public class RawFieldParser { if (current == '\\') { escaped = true; } else if (current != '\r' && current != '\n') { - dstRaw.append(current); + dstRaw.append(currentByte); } } } - String decoded = ContentUtil.decode(dstRaw); + String decoded = CharsetUtil.isASCII(dstRaw) ? ContentUtil.decode(dstRaw) : ContentUtil.decode(StandardCharsets.UTF_8, dstRaw); if (decoded.startsWith("=?")) { decoded = DecoderUtil.decodeEncodedWords(decoded, DecodeMonitor.SILENT); } diff --git a/core/src/main/java/org/apache/james/mime4j/util/CharsetUtil.java b/core/src/main/java/org/apache/james/mime4j/util/CharsetUtil.java index 0a9c983c..4503cbdf 100644 --- a/core/src/main/java/org/apache/james/mime4j/util/CharsetUtil.java +++ b/core/src/main/java/org/apache/james/mime4j/util/CharsetUtil.java @@ -95,6 +95,27 @@ public class CharsetUtil { return true; } + /** + * Returns <code>true</code> if the specified string consists entirely of + * US ASCII characters. + * + * @param s + * string to test. + * @return <code>true</code> if the specified string consists entirely of + * US ASCII characters, <code>false</code> otherwise. 
+ */ + public static boolean isASCII(final CharSequence s) { + if (s == null) { + throw new IllegalArgumentException("String may not be null"); + } + final int len = s.length(); + for (int i = 0; i < len; i++) { + if (!isASCII(s.charAt(i))) { + return false; + } + } + return true; + } /** * Returns <code>true</code> if the specified character is a whitespace * character (CR, LF, SP or HT). diff --git a/core/src/main/java/org/apache/james/mime4j/util/ContentUtil.java b/core/src/main/java/org/apache/james/mime4j/util/ContentUtil.java index 1e078120..b5c365dc 100644 --- a/core/src/main/java/org/apache/james/mime4j/util/ContentUtil.java +++ b/core/src/main/java/org/apache/james/mime4j/util/ContentUtil.java @@ -30,6 +30,7 @@ import java.lang.ref.SoftReference; import java.nio.ByteBuffer; import java.nio.CharBuffer; import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; import org.apache.commons.io.output.UnsynchronizedByteArrayOutputStream; import org.apache.james.mime4j.Charsets; @@ -131,7 +132,8 @@ public class ContentUtil { /** * Encodes the specified string into an immutable sequence of bytes using - * the US-ASCII charset. + * the US-ASCII charset, or UTF-8 if the string contains non-ASCII + * characters. * * @param string * string to encode. 
@@ -141,6 +143,9 @@ public class ContentUtil { if (string == null) { return null; } + if (!CharsetUtil.isASCII(string)) { + return encode(StandardCharsets.UTF_8, string); + } ByteArrayBuffer buf = new ByteArrayBuffer(string.length()); for (int i = 0; i < string.length(); i++) { buf.append((byte) string.charAt(i)); diff --git a/core/src/test/java/org/apache/james/mime4j/stream/RawFieldParserTest.java b/core/src/test/java/org/apache/james/mime4j/stream/RawFieldParserTest.java index dff799e3..ca41198e 100644 --- a/core/src/test/java/org/apache/james/mime4j/stream/RawFieldParserTest.java +++ b/core/src/test/java/org/apache/james/mime4j/stream/RawFieldParserTest.java @@ -461,4 +461,32 @@ public class RawFieldParserTest { org.junit.Assert.assertEquals("simple boundary", params.get(0).getValue()); } + @Test + public void testRegressionForContentDispositionParsingASCIIonly() { + ByteSequence buf = ContentUtil.encode( + "name=\"filedata\"; filename=\"Sanity a.doc\""); + ParserCursor cursor = new ParserCursor(0, buf.length()); + List<NameValuePair> params = parser.parseParameters(buf, cursor); + + org.junit.Assert.assertEquals(2, params.size()); + org.junit.Assert.assertEquals("name", params.get(0).getName()); + org.junit.Assert.assertEquals("filedata", params.get(0).getValue()); + org.junit.Assert.assertEquals("filename", params.get(1).getName()); + org.junit.Assert.assertEquals("Sanity a.doc", params.get(1).getValue()); + } + + @Test + public void testRegressionForContentDispositionParsingUTF8() { + ByteSequence buf = ContentUtil.encode("name=\"filedata\"; filename=\"Sanity ä.doc\""); + ParserCursor cursor = new ParserCursor(0, buf.length()); + List<NameValuePair> params = parser.parseParameters(buf, cursor); + + org.junit.Assert.assertEquals(2, params.size()); + org.junit.Assert.assertEquals("name", params.get(0).getName()); + org.junit.Assert.assertEquals("filedata", params.get(0).getValue()); + org.junit.Assert.assertEquals("filename", params.get(1).getName()); + 
org.junit.Assert.assertEquals("Sanity ä.doc", params.get(1).getValue()); + } + + } diff --git a/dom/src/test/java/org/apache/james/mime4j/field/LenientContentDispositionFieldTest.java b/dom/src/test/java/org/apache/james/mime4j/field/LenientContentDispositionFieldTest.java index 5978f3b2..eb563e28 100644 --- a/dom/src/test/java/org/apache/james/mime4j/field/LenientContentDispositionFieldTest.java +++ b/dom/src/test/java/org/apache/james/mime4j/field/LenientContentDispositionFieldTest.java @@ -19,14 +19,12 @@ package org.apache.james.mime4j.field; -import java.nio.charset.StandardCharsets; import java.util.Date; import org.apache.james.mime4j.MimeException; import org.apache.james.mime4j.dom.field.ContentDispositionField; import org.apache.james.mime4j.stream.RawField; import org.apache.james.mime4j.stream.RawFieldParser; -import org.apache.james.mime4j.util.ByteArrayBuffer; import org.apache.james.mime4j.util.ByteSequence; import org.apache.james.mime4j.util.ContentUtil; import org.junit.Assert; @@ -40,11 +38,6 @@ public class LenientContentDispositionFieldTest { return ContentDispositionFieldLenientImpl.PARSER.parse(rawField, null); } - static ContentDispositionField parse(final byte[] raw) throws MimeException { - RawField rawField = RawFieldParser.DEFAULT.parseField(new ByteArrayBuffer(raw, true)); - return ContentDispositionFieldLenientImpl.PARSER.parse(rawField, null); - } - @Test public void testDispositionTypeWithSemiColonNoParams() throws Exception { ContentDispositionField f = parse("Content-Disposition: inline;"); @@ -120,10 +113,9 @@ public class LenientContentDispositionFieldTest { @Test public void testGetFilenameEncoded() throws Exception { - byte[] data = ("Content-Disposition: attachment;\n" + + String data = "Content-Disposition: attachment;\n" + " FileName=\"=?WINDOWS-1251?Q?3244659=5F=C0=EA=F2_=E7=E0_=C8=FE=EB=FC_?=\n" + - " =?WINDOWS-1251?Q?2020.pdf?=\"") - .getBytes(StandardCharsets.UTF_8); + " =?WINDOWS-1251?Q?2020.pdf?=\""; 
ContentDispositionField f = parse(data); @@ -132,10 +124,8 @@ public class LenientContentDispositionFieldTest { @Test public void testGetFilenameUtf8() throws Exception { - byte[] data = - "Content-Disposition: attachment; filename=\"УПД ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ \"СТАНЦИЯ ВИРТУАЛЬНАЯ\" 01-05-21.pdf\"" - .getBytes(StandardCharsets.UTF_8); - + String data = + "Content-Disposition: attachment; filename=\"УПД ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ \\\"СТАНЦИЯ ВИРТУАЛЬНАЯ\\\" 01-05-21.pdf\""; ContentDispositionField f = parse(data); Assert.assertEquals("UTF8 encoded filename", "УПД ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ \"СТАНЦИЯ ВИРТУАЛЬНАЯ\" 01-05-21.pdf", f.getFilename()); @@ -143,10 +133,9 @@ public class LenientContentDispositionFieldTest { @Test public void testGetFilenameMultipartUtf8() throws Exception { - byte[] data = ("Content-Disposition: attachment;\n" + + String data = "Content-Disposition: attachment;\n" + " filename*0*=\"UTF-8''%D0%A0%D0%BE%D1%81%D1%82%D0%B5%D0%BB%D0%B5%D0%BA%D0%BE\";\n" + - " filename*1*=\"%D0%BC%2E%78%6C%73%78\"\n") - .getBytes(StandardCharsets.UTF_8); + " filename*1*=\"%D0%BC%2E%78%6C%73%78\"\n"; ContentDispositionField f = parse(data); Assert.assertEquals("Ростелеком.xlsx", f.getFilename()); --------------------------------------------------------------------- To unsubscribe, e-mail: server-dev-unsubscr...@james.apache.org For additional commands, e-mail: server-dev-h...@james.apache.org