[CALCITE-2619] Reduce string literal creation cost by deferring and caching charset conversion (Ted Xu)
Close apache/calcite#911 Project: http://git-wip-us.apache.org/repos/asf/calcite/repo Commit: http://git-wip-us.apache.org/repos/asf/calcite/commit/36bd5fb2 Tree: http://git-wip-us.apache.org/repos/asf/calcite/tree/36bd5fb2 Diff: http://git-wip-us.apache.org/repos/asf/calcite/diff/36bd5fb2 Branch: refs/heads/master Commit: 36bd5fb2e6db955a581de5f55b00a089c4bc1389 Parents: 96605a8 Author: Ted Xu <[email protected]> Authored: Sat Nov 10 14:46:24 2018 +0800 Committer: Julian Hyde <[email protected]> Committed: Sat Dec 1 14:42:49 2018 -0800 ---------------------------------------------------------------------- .../java/org/apache/calcite/rex/RexBuilder.java | 17 ++ .../java/org/apache/calcite/sql/SqlUtil.java | 65 +++++++- .../java/org/apache/calcite/util/NlsString.java | 155 +++++++++++++------ .../org/apache/calcite/rex/RexBuilderTest.java | 50 ++++++ .../calcite/sql/parser/SqlParserTest.java | 1 + 5 files changed, 234 insertions(+), 54 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/calcite/blob/36bd5fb2/core/src/main/java/org/apache/calcite/rex/RexBuilder.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/calcite/rex/RexBuilder.java b/core/src/main/java/org/apache/calcite/rex/RexBuilder.java index 5741a11..d346ec7 100644 --- a/core/src/main/java/org/apache/calcite/rex/RexBuilder.java +++ b/core/src/main/java/org/apache/calcite/rex/RexBuilder.java @@ -28,6 +28,7 @@ import org.apache.calcite.rel.type.RelDataTypeFactory; import org.apache.calcite.rel.type.RelDataTypeField; import org.apache.calcite.runtime.FlatLists; import org.apache.calcite.sql.SqlAggFunction; +import org.apache.calcite.sql.SqlCollation; import org.apache.calcite.sql.SqlIntervalQualifier; import org.apache.calcite.sql.SqlKind; import org.apache.calcite.sql.SqlOperator; @@ -1034,6 +1035,22 @@ public class RexBuilder { } /** + * Creates a character string literal with type CHAR. + * + * @param value String value in bytes + * @param charsetName SQL-level charset name + * @param collation Sql collation + * @return String literal + */ + protected RexLiteral makePreciseStringLiteral(ByteString value, + String charsetName, SqlCollation collation) { + return makeLiteral( + new NlsString(value, charsetName, collation), + typeFactory.createSqlType(SqlTypeName.CHAR), + SqlTypeName.CHAR); + } + + /** * Ensures expression is interpreted as a specified type. The returned * expression may be wrapped with a cast. * http://git-wip-us.apache.org/repos/asf/calcite/blob/36bd5fb2/core/src/main/java/org/apache/calcite/sql/SqlUtil.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/calcite/sql/SqlUtil.java b/core/src/main/java/org/apache/calcite/sql/SqlUtil.java index 52ba9a4..461b7ed 100644 --- a/core/src/main/java/org/apache/calcite/sql/SqlUtil.java +++ b/core/src/main/java/org/apache/calcite/sql/SqlUtil.java @@ -16,6 +16,7 @@ */ package org.apache.calcite.sql; +import org.apache.calcite.avatica.util.ByteString; import org.apache.calcite.linq4j.Ord; import org.apache.calcite.linq4j.function.Functions; import org.apache.calcite.rel.type.RelDataType; @@ -38,11 +39,14 @@ import org.apache.calcite.util.Pair; import org.apache.calcite.util.Util; import com.google.common.base.Predicates; +import com.google.common.base.Utf8; import com.google.common.collect.ImmutableList; import com.google.common.collect.Iterators; import com.google.common.collect.Lists; import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.nio.charset.UnsupportedCharsetException; import java.sql.DatabaseMetaData; import java.sql.SQLException; import java.text.MessageFormat; @@ -836,18 +840,63 @@ public abstract class SqlUtil { * @return Java-level name, or null if SQL-level name is unknown */ public static String translateCharacterSetName(String name) { - if (name.equals("LATIN1")) { + switch (name) { + case "BIG5": + return "Big5"; + case "LATIN1": return "ISO-8859-1"; - } else if (name.equals("UTF16")) { - return ConversionUtil.NATIVE_UTF16_CHARSET_NAME; - } else if (name.equals(ConversionUtil.NATIVE_UTF16_CHARSET_NAME)) { - // no translation needed + case "GB2312": + case "GBK": return name; - } else if (name.equals("ISO-8859-1")) { - // no translation needed + case "UTF8": + return "UTF-8"; + case "UTF16": + return ConversionUtil.NATIVE_UTF16_CHARSET_NAME; + case "UTF-16BE": + case "UTF-16LE": + case "ISO-8859-1": + case "UTF-8": return name; + default: + return null; + } + } + + /** + * Returns the Java-level {@link Charset} based on given SQL-level name. + * + * @param charsetName Sql charset name, must not be null. + * @return charset, or default charset if charsetName is null. + * @throws UnsupportedCharsetException If no support for the named charset + * is available in this instance of the Java virtual machine + */ + public static Charset getCharset(String charsetName) { + assert charsetName != null; + charsetName = charsetName.toUpperCase(Locale.ROOT); + String javaCharsetName = translateCharacterSetName(charsetName); + if (javaCharsetName == null) { + throw new UnsupportedCharsetException(charsetName); + } + return Charset.forName(javaCharsetName); + } + + /** + * Validate if value can be decoded by given charset. + * + * @param value nls string in byte array + * @param charset charset + * @throws RuntimeException If the given value cannot be represented in the + * given charset + */ + public static void validateCharset(ByteString value, Charset charset) { + if (charset == StandardCharsets.UTF_8) { + final byte[] bytes = value.getBytes(); + if (!Utf8.isWellFormed(bytes)) { + //CHECKSTYLE: IGNORE 1 + final String string = new String(bytes, charset); + throw RESOURCE.charsetEncoding(string, charset.name()).ex(); + } } - return null; } /** If a node is "AS", returns the underlying expression; otherwise returns http://git-wip-us.apache.org/repos/asf/calcite/blob/36bd5fb2/core/src/main/java/org/apache/calcite/util/NlsString.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/calcite/util/NlsString.java b/core/src/main/java/org/apache/calcite/util/NlsString.java index fbb9386..fd41811 100644 --- a/core/src/main/java/org/apache/calcite/util/NlsString.java +++ b/core/src/main/java/org/apache/calcite/util/NlsString.java @@ -16,19 +16,25 @@ */ package org.apache.calcite.util; +import org.apache.calcite.avatica.util.ByteString; import org.apache.calcite.runtime.SqlFunctions; import org.apache.calcite.sql.SqlCollation; import org.apache.calcite.sql.SqlUtil; -import java.nio.CharBuffer; +import com.google.common.cache.CacheBuilder; +import com.google.common.cache.CacheLoader; +import com.google.common.cache.LoadingCache; + +import java.nio.ByteBuffer; import java.nio.charset.CharacterCodingException; import java.nio.charset.Charset; -import java.nio.charset.CharsetEncoder; +import java.nio.charset.CharsetDecoder; import java.nio.charset.IllegalCharsetNameException; import java.nio.charset.UnsupportedCharsetException; import java.util.List; import java.util.Locale; import java.util.Objects; +import javax.annotation.Nonnull; import static org.apache.calcite.util.Static.RESOURCE; @@ -39,8 +45,31 @@ import static org.apache.calcite.util.Static.RESOURCE; public class NlsString implements Comparable<NlsString>, Cloneable { //~ Instance fields -------------------------------------------------------- + private static final LoadingCache<Pair<ByteString, Charset>, String> + DECODE_MAP = + CacheBuilder.newBuilder() + .softValues() + .build( + new CacheLoader<Pair<ByteString, Charset>, String>() { + public String load(@Nonnull Pair<ByteString, Charset> key) { + final Charset charset = key.right; + final CharsetDecoder decoder = charset.newDecoder(); + final byte[] bytes = key.left.getBytes(); + final ByteBuffer buffer = ByteBuffer.wrap(bytes); + try { + return decoder.decode(buffer).toString(); + } catch (CharacterCodingException ex) { + throw RESOURCE.charsetEncoding( + //CHECKSTYLE: IGNORE 1 + new String(bytes, Charset.defaultCharset()), + charset.name()).ex(); + } + } + }); + + private final String stringValue; + private final ByteString bytesValue; private final String charsetName; - private final String value; private final Charset charset; private final SqlCollation collation; @@ -49,43 +78,71 @@ public class NlsString implements Comparable<NlsString>, Cloneable { /** * Creates a string in a specified character set. * - * @param value String constant, must not be null - * @param charsetName Name of the character set, may be null + * @param bytesValue Byte array constant, must not be null + * @param charsetName Name of the character set, must not be null * @param collation Collation, may be null + * * @throws IllegalCharsetNameException If the given charset name is illegal * @throws UnsupportedCharsetException If no support for the named charset * is available in this instance of the Java virtual machine * @throws RuntimeException If the given value cannot be represented in the * given charset */ - public NlsString( - String value, - String charsetName, + public NlsString(ByteString bytesValue, String charsetName, SqlCollation collation) { - assert value != null; - if (null != charsetName) { - charsetName = charsetName.toUpperCase(Locale.ROOT); - this.charsetName = charsetName; - String javaCharsetName = - SqlUtil.translateCharacterSetName(charsetName); - if (javaCharsetName == null) { - throw new UnsupportedCharsetException(charsetName); - } - this.charset = Charset.forName(javaCharsetName); - CharsetEncoder encoder = charset.newEncoder(); - - // dry run to see if encoding hits any problems - try { - encoder.encode(CharBuffer.wrap(value)); - } catch (CharacterCodingException ex) { - throw RESOURCE.charsetEncoding(value, javaCharsetName).ex(); - } + this(null, Objects.requireNonNull(bytesValue), + Objects.requireNonNull(charsetName), collation); + } + + /** + * Easy constructor for Java string. + * + * @param stringValue String constant, must not be null + * @param charsetName Name of the character set, may be null + * @param collation Collation, may be null + * + * @throws IllegalCharsetNameException If the given charset name is illegal + * @throws UnsupportedCharsetException If no support for the named charset + * is available in this instance of the Java virtual machine + * @throws RuntimeException If the given value cannot be represented in the + * given charset + */ + public NlsString(String stringValue, String charsetName, + SqlCollation collation) { + this(Objects.requireNonNull(stringValue), null, charsetName, collation); + } + + /** Internal constructor; other constructors must call it. */ + private NlsString(String stringValue, ByteString bytesValue, + String charsetName, SqlCollation collation) { + if (charsetName != null) { + this.charsetName = charsetName.toUpperCase(Locale.ROOT); + this.charset = SqlUtil.getCharset(charsetName); } else { this.charsetName = null; this.charset = null; } + if ((stringValue != null) == (bytesValue != null)) { + throw new IllegalArgumentException("Specify stringValue or bytesValue"); + } + if (bytesValue != null) { + if (charsetName == null) { + throw new IllegalArgumentException("Bytes value requires charset"); + } + SqlUtil.validateCharset(bytesValue, charset); + } else { + // Java string can be malformed if LATIN1 is required. + if (this.charsetName != null + && (this.charsetName.equals("LATIN1") + || this.charsetName.equals("ISO-8859-1"))) { + if (!charset.newEncoder().canEncode(stringValue)) { + throw RESOURCE.charsetEncoding(stringValue, charset.name()).ex(); + } + } + } this.collation = collation; - this.value = value; + this.stringValue = stringValue; + this.bytesValue = bytesValue; } //~ Methods ---------------------------------------------------------------- @@ -99,25 +156,22 @@ public class NlsString implements Comparable<NlsString>, Cloneable { } public int hashCode() { - return Objects.hash(value, charsetName, collation); + return Objects.hash(stringValue, bytesValue, charsetName, collation); } public boolean equals(Object obj) { - if (!(obj instanceof NlsString)) { - return false; - } - NlsString that = (NlsString) obj; - return Objects.equals(value, that.value) - && Objects.equals(charsetName, that.charsetName) - && Objects.equals(collation, that.collation); + return this == obj + || obj instanceof NlsString + && Objects.equals(stringValue, ((NlsString) obj).stringValue) + && Objects.equals(bytesValue, ((NlsString) obj).bytesValue) + && Objects.equals(charsetName, ((NlsString) obj).charsetName) + && Objects.equals(collation, ((NlsString) obj).collation); } - // implement Comparable - public int compareTo(NlsString other) { + @Override public int compareTo(NlsString other) { // TODO jvs 18-Jan-2006: Actual collation support. This just uses // the default collation. - - return value.compareTo(other.value); + return getValue().compareTo(other.getValue()); } public String getCharsetName() { @@ -133,7 +187,11 @@ public class NlsString implements Comparable<NlsString>, Cloneable { } public String getValue() { - return value; + if (stringValue == null) { + assert bytesValue != null; + return DECODE_MAP.getUnchecked(Pair.of(bytesValue, charset)); + } + return stringValue; } /** @@ -141,8 +199,8 @@ public class NlsString implements Comparable<NlsString>, Cloneable { * right. */ public NlsString rtrim() { - String trimmed = SqlFunctions.rtrim(value); - if (!trimmed.equals(value)) { + String trimmed = SqlFunctions.rtrim(getValue()); + if (!trimmed.equals(getValue())) { return new NlsString(trimmed, charsetName, collation); } return this; @@ -165,7 +223,7 @@ public class NlsString implements Comparable<NlsString>, Cloneable { ret.append(charsetName); } ret.append("'"); - ret.append(Util.replace(value, "'", "''")); + ret.append(Util.replace(getValue(), "'", "''")); ret.append("'"); // NOTE jvs 3-Feb-2005: see FRG-78 for why this should go away @@ -200,12 +258,12 @@ public class NlsString implements Comparable<NlsString>, Cloneable { } String charSetName = args.get(0).charsetName; SqlCollation collation = args.get(0).collation; - int length = args.get(0).value.length(); + int length = args.get(0).getValue().length(); // sum string lengths and validate for (int i = 1; i < args.size(); i++) { final NlsString arg = args.get(i); - length += arg.value.length(); + length += arg.getValue().length(); if (!((arg.charsetName == null) || arg.charsetName.equals(charSetName))) { throw new IllegalArgumentException("mismatched charsets"); @@ -218,7 +276,7 @@ public class NlsString implements Comparable<NlsString>, Cloneable { StringBuilder sb = new StringBuilder(length); for (NlsString arg : args) { - sb.append(arg.value); + sb.append(arg.getValue()); } return new NlsString( sb.toString(), @@ -231,6 +289,11 @@ public class NlsString implements Comparable<NlsString>, Cloneable { public NlsString copy(String value) { return new NlsString(value, charsetName, collation); } + + /** Returns the value as a {@link ByteString}. */ + public ByteString getValueBytes() { + return bytesValue; + } } // End NlsString.java http://git-wip-us.apache.org/repos/asf/calcite/blob/36bd5fb2/core/src/test/java/org/apache/calcite/rex/RexBuilderTest.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/org/apache/calcite/rex/RexBuilderTest.java b/core/src/test/java/org/apache/calcite/rex/RexBuilderTest.java index f959a76..1445e95 100644 --- a/core/src/test/java/org/apache/calcite/rex/RexBuilderTest.java +++ b/core/src/test/java/org/apache/calcite/rex/RexBuilderTest.java @@ -16,12 +16,15 @@ */ package org.apache.calcite.rex; +import org.apache.calcite.avatica.util.ByteString; import org.apache.calcite.rel.type.RelDataType; import org.apache.calcite.rel.type.RelDataTypeFactory; import org.apache.calcite.rel.type.RelDataTypeSystem; +import org.apache.calcite.sql.SqlCollation; import org.apache.calcite.sql.type.SqlTypeFactoryImpl; import org.apache.calcite.sql.type.SqlTypeName; import org.apache.calcite.util.DateString; +import org.apache.calcite.util.NlsString; import org.apache.calcite.util.TimeString; import org.apache.calcite.util.TimestampString; import org.apache.calcite.util.TimestampWithTimeZoneString; @@ -29,6 +32,7 @@ import org.apache.calcite.util.Util; import org.junit.Test; +import java.nio.charset.StandardCharsets; import java.util.Calendar; import java.util.TimeZone; @@ -489,6 +493,52 @@ public class RexBuilderTest { } } + /** + * Test string literal encoding. + */ + @Test public void testStringLiteral() { + final RelDataTypeFactory typeFactory = + new SqlTypeFactoryImpl(RelDataTypeSystem.DEFAULT); + final RelDataType varchar = + typeFactory.createSqlType(SqlTypeName.VARCHAR); + final RexBuilder builder = new RexBuilder(typeFactory); + + final NlsString latin1 = new NlsString("foobar", "LATIN1", SqlCollation.IMPLICIT); + final NlsString utf8 = new NlsString("foobar", "UTF8", SqlCollation.IMPLICIT); + + RexNode literal = builder.makePreciseStringLiteral("foobar"); + assertEquals("'foobar'", literal.toString()); + literal = builder.makePreciseStringLiteral( + new ByteString(new byte[] { 'f', 'o', 'o', 'b', 'a', 'r'}), + "UTF8", + SqlCollation.IMPLICIT); + assertEquals("_UTF8'foobar'", literal.toString()); + literal = builder.makePreciseStringLiteral( + new ByteString("\u82f1\u56fd".getBytes(StandardCharsets.UTF_8)), + "UTF8", + SqlCollation.IMPLICIT); + assertEquals("_UTF8'\u82f1\u56fd'", literal.toString()); + // Test again to check decode cache. + literal = builder.makePreciseStringLiteral( + new ByteString("\u82f1".getBytes(StandardCharsets.UTF_8)), + "UTF8", + SqlCollation.IMPLICIT); + assertEquals("_UTF8'\u82f1'", literal.toString()); + try { + literal = builder.makePreciseStringLiteral( + new ByteString("\u82f1\u56fd".getBytes(StandardCharsets.UTF_8)), + "GB2312", + SqlCollation.IMPLICIT); + fail("expected exception, got " + literal); + } catch (RuntimeException e) { + assertThat(e.getMessage(), containsString("Failed to encode")); + } + literal = builder.makeLiteral(latin1, varchar, false); + assertEquals("_LATIN1'foobar'", literal.toString()); + literal = builder.makeLiteral(utf8, varchar, false); + assertEquals("_UTF8'foobar'", literal.toString()); + } + } // End RexBuilderTest.java http://git-wip-us.apache.org/repos/asf/calcite/blob/36bd5fb2/core/src/test/java/org/apache/calcite/sql/parser/SqlParserTest.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/org/apache/calcite/sql/parser/SqlParserTest.java b/core/src/test/java/org/apache/calcite/sql/parser/SqlParserTest.java index a7a1233..af49b2f 100644 --- a/core/src/test/java/org/apache/calcite/sql/parser/SqlParserTest.java +++ b/core/src/test/java/org/apache/calcite/sql/parser/SqlParserTest.java @@ -3727,6 +3727,7 @@ public class SqlParserTest { checkExp( "_iso-8859-1'bye' \n\n--\n-- this is a comment\n' bye'", "_ISO-8859-1'bye'\n' bye'"); + checkExp("_utf8'hi'", "_UTF8'hi'"); // newline in string literal checkExp("'foo\rbar'", "'foo\rbar'");
