This is an automated email from the ASF dual-hosted git repository. chaokunyang pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/fury.git
The following commit(s) were added to refs/heads/main by this push: new 09fda94a refactor(java): move latin language checker method from string serializer to string util (#1708) 09fda94a is described below commit 09fda94ab7f476da4dc3b6752825b7249bda6ac2 Author: Anagh Mehran <anag...@gmail.com> AuthorDate: Sat Jun 29 06:59:56 2024 -0400 refactor(java): move latin language checker method from string serializer to string util (#1708) ## What does this PR do? <!-- Describe the purpose of this PR. --> This PR decouples and moves the `isLatin([])` method from `StringSerializer` class to `StringUtils`. ## Related issues <!-- Is there any related issue? Please attach here. - #1703 - #xxxx1 - #xxxx2 --> #1703 ## Does this PR introduce any user-facing change? <!-- If any user-facing interface changes, please [open an issue](https://github.com/apache/fury/issues/new/choose) describing the need to do so and update the document if necessary. --> - [ ] Does this PR introduce any public API change? - [ ] Does this PR introduce any binary protocol compatibility change? ## Benchmark <!-- When the PR has an impact on performance (if you don't know whether the PR will have an impact on performance, you can submit the PR first, and if it will have impact on performance, the code reviewer will explain it), be sure to attach a benchmark data here. --> --------- Co-authored-by: Shawn Yang <chaokuny...@apache.org> --- .../apache/fury/benchmark/CompressStringSuite.java | 3 +- .../org/apache/fury/meta/MetaStringEncoder.java | 6 +- .../org/apache/fury/serializer/Serializers.java | 3 +- .../apache/fury/serializer/StringSerializer.java | 42 +---------- .../java/org/apache/fury/util/StringUtils.java | 42 +++++++++++ .../fury-core/native-image.properties | 3 +- .../fury/serializer/StringSerializerTest.java | 83 --------------------- .../java/org/apache/fury/util/StringUtilsTest.java | 85 +++++++++++++++++++++- 8 files changed, 137 insertions(+), 130 deletions(-) diff --git a/java/benchmark/src/main/java/org/apache/fury/benchmark/CompressStringSuite.java b/java/benchmark/src/main/java/org/apache/fury/benchmark/CompressStringSuite.java index 63979c11..bc09fa20 100644 --- a/java/benchmark/src/main/java/org/apache/fury/benchmark/CompressStringSuite.java +++ b/java/benchmark/src/main/java/org/apache/fury/benchmark/CompressStringSuite.java @@ -22,7 +22,6 @@ package org.apache.fury.benchmark; import java.nio.ByteBuffer; import org.apache.fury.memory.MemoryBuffer; import org.apache.fury.memory.Platform; -import org.apache.fury.serializer.StringSerializer; import org.apache.fury.util.StringUtils; import org.openjdk.jmh.Main; import org.openjdk.jmh.annotations.Benchmark; @@ -102,7 +101,7 @@ public class CompressStringSuite { @Benchmark public Object latinSuperWordCheck() { - return StringSerializer.isLatin(latinStrChars); + return StringUtils.isLatin(latinStrChars); } public static void main(String[] args) throws Exception { diff --git a/java/fury-core/src/main/java/org/apache/fury/meta/MetaStringEncoder.java b/java/fury-core/src/main/java/org/apache/fury/meta/MetaStringEncoder.java index b6a0a58b..90298e8e 100644 --- a/java/fury-core/src/main/java/org/apache/fury/meta/MetaStringEncoder.java +++ b/java/fury-core/src/main/java/org/apache/fury/meta/MetaStringEncoder.java @@ -23,8 +23,8 @@ import java.nio.charset.StandardCharsets; import java.util.HashSet; import org.apache.fury.collection.Collections; import org.apache.fury.meta.MetaString.Encoding; -import org.apache.fury.serializer.StringSerializer; import org.apache.fury.util.Preconditions; +import org.apache.fury.util.StringUtils; /** Encodes plain text strings into MetaString objects with specified encoding mechanisms. */ public class MetaStringEncoder { @@ -57,7 +57,7 @@ public class MetaStringEncoder { if (input.isEmpty()) { return new MetaString(input, Encoding.UTF_8, specialChar1, specialChar2, new byte[0]); } - if (!StringSerializer.isLatin(input.toCharArray())) { + if (!StringUtils.isLatin(input.toCharArray())) { return new MetaString( input, Encoding.UTF_8, @@ -79,7 +79,7 @@ public class MetaStringEncoder { public MetaString encode(String input, Encoding encoding) { Preconditions.checkArgument( input.length() < Short.MAX_VALUE, "Long meta string than 32767 is not allowed"); - if (encoding != Encoding.UTF_8 && !StringSerializer.isLatin(input.toCharArray())) { + if (encoding != Encoding.UTF_8 && !StringUtils.isLatin(input.toCharArray())) { throw new IllegalArgumentException("Non-ASCII characters in meta string are not allowed"); } if (input.isEmpty()) { diff --git a/java/fury-core/src/main/java/org/apache/fury/serializer/Serializers.java b/java/fury-core/src/main/java/org/apache/fury/serializer/Serializers.java index 3dbd70eb..63d0099c 100644 --- a/java/fury-core/src/main/java/org/apache/fury/serializer/Serializers.java +++ b/java/fury-core/src/main/java/org/apache/fury/serializer/Serializers.java @@ -51,6 +51,7 @@ import org.apache.fury.resolver.ClassResolver; import org.apache.fury.type.Type; import org.apache.fury.util.ExceptionUtils; import org.apache.fury.util.GraalvmSupport; +import org.apache.fury.util.StringUtils; import org.apache.fury.util.unsafe._JDKAccess; /** Serialization utils and common serializers. */ @@ -257,7 +258,7 @@ public class Serializers { buffer.writeBytes(v, 0, bytesLen); } else { char[] v = (char[]) GET_VALUE.apply(value); - if (StringSerializer.isLatin(v)) { + if (StringUtils.isLatin(v)) { stringSerializer.writeCharsLatin(buffer, v, value.length()); } else { stringSerializer.writeCharsUTF16(buffer, v, value.length()); diff --git a/java/fury-core/src/main/java/org/apache/fury/serializer/StringSerializer.java b/java/fury-core/src/main/java/org/apache/fury/serializer/StringSerializer.java index 2e26096e..0c77bca4 100644 --- a/java/fury-core/src/main/java/org/apache/fury/serializer/StringSerializer.java +++ b/java/fury-core/src/main/java/org/apache/fury/serializer/StringSerializer.java @@ -43,6 +43,7 @@ import org.apache.fury.reflect.ReflectionUtils; import org.apache.fury.type.Type; import org.apache.fury.util.MathUtils; import org.apache.fury.util.Preconditions; +import org.apache.fury.util.StringUtils; import org.apache.fury.util.unsafe._JDKAccess; /** @@ -63,8 +64,6 @@ public final class StringSerializer extends Serializer<String> { private static final Byte UTF16_BOXED = UTF16; private static final byte UTF8 = 2; private static final int DEFAULT_BUFFER_SIZE = 1024; - // A long mask used to clear all-higher bits of char in a super-word way. - private static final long MULTI_CHARS_NON_LATIN_MASK; // Make offset compatible with graalvm native image. private static final long STRING_VALUE_FIELD_OFFSET; @@ -103,15 +102,6 @@ public final class StringSerializer extends Serializer<String> { Preconditions.checkArgument( ReflectionUtils.getFieldNullable(String.class, "offset") == null, "Current jdk not supported"); - if (Platform.IS_LITTLE_ENDIAN) { - // latin chars will be 0xXX,0x00;0xXX,0x00 in byte order; - // Using 0x00,0xff(0xff00) to clear latin bits. - MULTI_CHARS_NON_LATIN_MASK = 0xff00ff00ff00ff00L; - } else { - // latin chars will be 0x00,0xXX;0x00,0xXX in byte order; - // Using 0x00,0xff(0x00ff) to clear latin bits. - MULTI_CHARS_NON_LATIN_MASK = 0x00ff00ff00ff00ffL; - } } private final boolean compressString; @@ -178,7 +168,7 @@ public final class StringSerializer extends Serializer<String> { // Invoked by jit public void writeCharsStringCompressed(MemoryBuffer buffer, String value) { final char[] chars = (char[]) Platform.getObject(value, STRING_VALUE_FIELD_OFFSET); - if (isLatin(chars)) { + if (StringUtils.isLatin(chars)) { writeCharsLatin(buffer, chars, chars.length); } else { writeCharsUTF16(buffer, chars, chars.length); @@ -288,7 +278,7 @@ public final class StringSerializer extends Serializer<String> { assert STRING_VALUE_FIELD_IS_CHARS; final char[] chars = (char[]) Platform.getObject(value, STRING_VALUE_FIELD_OFFSET); if (compressString) { - if (isLatin(chars)) { + if (StringUtils.isLatin(chars)) { writeCharsLatin(buffer, chars, chars.length); } else { writeCharsUTF16(buffer, chars, chars.length); @@ -300,32 +290,6 @@ public final class StringSerializer extends Serializer<String> { } } - public static boolean isLatin(char[] chars) { - int numChars = chars.length; - int vectorizedLen = numChars >> 2; - int vectorizedChars = vectorizedLen << 2; - int endOffset = Platform.CHAR_ARRAY_OFFSET + (vectorizedChars << 1); - boolean isLatin = true; - for (int offset = Platform.CHAR_ARRAY_OFFSET; offset < endOffset; offset += 8) { - // check 4 chars in a vectorized way, 4 times faster than scalar check loop. - // See benchmark in CompressStringSuite.latinSuperWordCheck. - long multiChars = Platform.getLong(chars, offset); - if ((multiChars & MULTI_CHARS_NON_LATIN_MASK) != 0) { - isLatin = false; - break; - } - } - if (isLatin) { - for (int i = vectorizedChars; i < numChars; i++) { - if (chars[i] > 0xFF) { - isLatin = false; - break; - } - } - } - return isLatin; - } - // Invoked by fury JIT public String readJavaString(MemoryBuffer buffer) { if (STRING_VALUE_FIELD_IS_BYTES) { diff --git a/java/fury-core/src/main/java/org/apache/fury/util/StringUtils.java b/java/fury-core/src/main/java/org/apache/fury/util/StringUtils.java index 338073fc..cc892bef 100644 --- a/java/fury-core/src/main/java/org/apache/fury/util/StringUtils.java +++ b/java/fury-core/src/main/java/org/apache/fury/util/StringUtils.java @@ -22,12 +22,28 @@ package org.apache.fury.util; import java.util.HashMap; import java.util.Map; import java.util.Random; +import org.apache.fury.memory.Platform; public class StringUtils { + // A long mask used to clear all-higher bits of char in a super-word way. + private static final long MULTI_CHARS_NON_LATIN_MASK; + private static final char[] BASE16_CHARS2 = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' }; + static { + if (Platform.IS_LITTLE_ENDIAN) { + // latin chars will be 0xXX,0x00;0xXX,0x00 in byte order; + // Using 0x00,0xff(0xff00) to clear latin bits. + MULTI_CHARS_NON_LATIN_MASK = 0xff00ff00ff00ff00L; + } else { + // latin chars will be 0x00,0xXX;0x00,0xXX in byte order; + // Using 0x00,0xff(0x00ff) to clear latin bits. + MULTI_CHARS_NON_LATIN_MASK = 0x00ff00ff00ff00ffL; + } + } + /** Converts a bytes array into a hexadecimal string. */ public static String encodeHexString(final byte[] data) { StringBuilder result = new StringBuilder(data.length * 2); @@ -249,4 +265,30 @@ public class StringUtils { return builder.toString(); } + + public static boolean isLatin(char[] chars) { + int numChars = chars.length; + int vectorizedLen = numChars >> 2; + int vectorizedChars = vectorizedLen << 2; + int endOffset = Platform.CHAR_ARRAY_OFFSET + (vectorizedChars << 1); + boolean isLatin = true; + for (int offset = Platform.CHAR_ARRAY_OFFSET; offset < endOffset; offset += 8) { + // check 4 chars in a vectorized way, 4 times faster than scalar check loop. + // See benchmark in CompressStringSuite.latinSuperWordCheck. + long multiChars = Platform.getLong(chars, offset); + if ((multiChars & MULTI_CHARS_NON_LATIN_MASK) != 0) { + isLatin = false; + break; + } + } + if (isLatin) { + for (int i = vectorizedChars; i < numChars; i++) { + if (chars[i] > 0xFF) { + isLatin = false; + break; + } + } + } + return isLatin; + } } diff --git a/java/fury-core/src/main/resources/META-INF/native-image/org.apache.fury/fury-core/native-image.properties b/java/fury-core/src/main/resources/META-INF/native-image/org.apache.fury/fury-core/native-image.properties index 80986a2b..3b6e47de 100644 --- a/java/fury-core/src/main/resources/META-INF/native-image/org.apache.fury/fury-core/native-image.properties +++ b/java/fury-core/src/main/resources/META-INF/native-image/org.apache.fury/fury-core/native-image.properties @@ -172,4 +172,5 @@ Args=--initialize-at-build-time=org.apache.fury.memory.MemoryBuffer,\ org.apache.fury.reflect.Types$ClassOwnership,\ org.apache.fury.reflect.Types$ClassOwnership$1,\ org.apache.fury.reflect.Types$ClassOwnership$2,\ - org.apache.fury.resolver.DisallowedList + org.apache.fury.resolver.DisallowedList,\ + org.apache.fury.util.StringUtils diff --git a/java/fury-core/src/test/java/org/apache/fury/serializer/StringSerializerTest.java b/java/fury-core/src/test/java/org/apache/fury/serializer/StringSerializerTest.java index 51779c8c..84cc69ed 100644 --- a/java/fury-core/src/test/java/org/apache/fury/serializer/StringSerializerTest.java +++ b/java/fury-core/src/test/java/org/apache/fury/serializer/StringSerializerTest.java @@ -21,8 +21,6 @@ package org.apache.fury.serializer; import static org.apache.fury.serializer.StringSerializer.newBytesStringZeroCopy; import static org.testng.Assert.assertEquals; -import static org.testng.Assert.assertFalse; -import static org.testng.Assert.assertTrue; import java.lang.reflect.Field; import java.nio.ByteBuffer; @@ -290,87 +288,6 @@ public class StringSerializerTest extends FuryTestBase { } } - @Test(dataProvider = "endian") - public void testVectorizedLatinCheckAlgorithm(boolean endian) { - // assertTrue(isLatin("Fury".toCharArray(), endian)); - // assertTrue(isLatin(StringUtils.random(8 * 10).toCharArray(), endian)); - // test unaligned - assertTrue(isLatin((StringUtils.random(8 * 10) + "1").toCharArray(), endian)); - assertTrue(isLatin((StringUtils.random(8 * 10) + "12").toCharArray(), endian)); - assertTrue(isLatin((StringUtils.random(8 * 10) + "123").toCharArray(), endian)); - assertFalse(isLatin("你好, Fury".toCharArray(), endian)); - assertFalse(isLatin((StringUtils.random(8 * 10) + "你好").toCharArray(), endian)); - assertFalse(isLatin((StringUtils.random(8 * 10) + "1你好").toCharArray(), endian)); - } - - private boolean isLatin(char[] chars, boolean isLittle) { - boolean reverseBytes = - (Platform.IS_LITTLE_ENDIAN && !isLittle) || (!Platform.IS_LITTLE_ENDIAN && !isLittle); - if (reverseBytes) { - for (int i = 0; i < chars.length; i++) { - chars[i] = Character.reverseBytes(chars[i]); - } - } - long mask; - if (isLittle) { - // latin chars will be 0xXX,0x00;0xXX,0x00 in byte order; - // Using 0x00,0xff(0xff00) to clear latin bits. - mask = 0xff00ff00ff00ff00L; - } else { - // latin chars will be 0x00,0xXX;0x00,0xXX in byte order; - // Using 0x00,0xff(0x00ff) to clear latin bits. - mask = 0x00ff00ff00ff00ffL; - } - int numChars = chars.length; - int vectorizedLen = numChars >> 2; - int vectorizedChars = vectorizedLen << 2; - int endOffset = Platform.CHAR_ARRAY_OFFSET + (vectorizedChars << 1); - boolean isLatin = true; - for (int offset = Platform.CHAR_ARRAY_OFFSET; offset < endOffset; offset += 8) { - // check 4 chars in a vectorized way, 4 times faster than scalar check loop. - long multiChars = Platform.getLong(chars, offset); - if ((multiChars & mask) != 0) { - isLatin = false; - break; - } - } - if (isLatin) { - for (int i = vectorizedChars; i < numChars; i++) { - char c = chars[i]; - if (reverseBytes) { - c = Character.reverseBytes(c); - } - if (c > 0xFF) { - isLatin = false; - break; - } - } - } - return isLatin; - } - - @Test - public void testLatinCheck() { - assertTrue(StringSerializer.isLatin("Fury".toCharArray())); - assertTrue(StringSerializer.isLatin(StringUtils.random(8 * 10).toCharArray())); - // test unaligned - assertTrue(StringSerializer.isLatin((StringUtils.random(8 * 10) + "1").toCharArray())); - assertTrue(StringSerializer.isLatin((StringUtils.random(8 * 10) + "12").toCharArray())); - assertTrue(StringSerializer.isLatin((StringUtils.random(8 * 10) + "123").toCharArray())); - assertFalse(StringSerializer.isLatin("你好, Fury".toCharArray())); - assertFalse(StringSerializer.isLatin((StringUtils.random(8 * 10) + "你好").toCharArray())); - assertFalse(StringSerializer.isLatin((StringUtils.random(8 * 10) + "1你好").toCharArray())); - assertFalse(StringSerializer.isLatin((StringUtils.random(11) + "你").toCharArray())); - assertFalse(StringSerializer.isLatin((StringUtils.random(10) + "你好").toCharArray())); - assertFalse(StringSerializer.isLatin((StringUtils.random(9) + "性能好").toCharArray())); - assertFalse(StringSerializer.isLatin("\u1234".toCharArray())); - assertFalse(StringSerializer.isLatin("a\u1234".toCharArray())); - assertFalse(StringSerializer.isLatin("ab\u1234".toCharArray())); - assertFalse(StringSerializer.isLatin("abc\u1234".toCharArray())); - assertFalse(StringSerializer.isLatin("abcd\u1234".toCharArray())); - assertFalse(StringSerializer.isLatin("Javaone Keynote\u1234".toCharArray())); - } - @Test public void testReadUtf8String() { Fury fury = getJavaFury(); diff --git a/java/fury-core/src/test/java/org/apache/fury/util/StringUtilsTest.java b/java/fury-core/src/test/java/org/apache/fury/util/StringUtilsTest.java index dea41788..dec17d63 100644 --- a/java/fury-core/src/test/java/org/apache/fury/util/StringUtilsTest.java +++ b/java/fury-core/src/test/java/org/apache/fury/util/StringUtilsTest.java @@ -23,9 +23,11 @@ import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertFalse; import static org.testng.Assert.assertTrue; +import org.apache.fury.FuryTestBase; +import org.apache.fury.memory.Platform; import org.testng.annotations.Test; -public class StringUtilsTest { +public class StringUtilsTest extends FuryTestBase { @Test public void testEncodeHexString() { @@ -87,4 +89,85 @@ public class StringUtilsTest { assertEquals(StringUtils.lowerCamelToLowerUnderscore("some123variable"), "some123variable"); assertEquals(StringUtils.lowerCamelToLowerUnderscore("someVariable123"), "some_variable123"); } + + @Test(dataProvider = "endian") + public void testVectorizedLatinCheckAlgorithm(boolean endian) { + // assertTrue(isLatin("Fury".toCharArray(), endian)); + // assertTrue(isLatin(StringUtils.random(8 * 10).toCharArray(), endian)); + // test unaligned + assertTrue(isLatin((StringUtils.random(8 * 10) + "1").toCharArray(), endian)); + assertTrue(isLatin((StringUtils.random(8 * 10) + "12").toCharArray(), endian)); + assertTrue(isLatin((StringUtils.random(8 * 10) + "123").toCharArray(), endian)); + assertFalse(isLatin("你好, Fury".toCharArray(), endian)); + assertFalse(isLatin((StringUtils.random(8 * 10) + "你好").toCharArray(), endian)); + assertFalse(isLatin((StringUtils.random(8 * 10) + "1你好").toCharArray(), endian)); + } + + private boolean isLatin(char[] chars, boolean isLittle) { + boolean reverseBytes = + (Platform.IS_LITTLE_ENDIAN && !isLittle) || (!Platform.IS_LITTLE_ENDIAN && !isLittle); + if (reverseBytes) { + for (int i = 0; i < chars.length; i++) { + chars[i] = Character.reverseBytes(chars[i]); + } + } + long mask; + if (isLittle) { + // latin chars will be 0xXX,0x00;0xXX,0x00 in byte order; + // Using 0x00,0xff(0xff00) to clear latin bits. + mask = 0xff00ff00ff00ff00L; + } else { + // latin chars will be 0x00,0xXX;0x00,0xXX in byte order; + // Using 0x00,0xff(0x00ff) to clear latin bits. + mask = 0x00ff00ff00ff00ffL; + } + int numChars = chars.length; + int vectorizedLen = numChars >> 2; + int vectorizedChars = vectorizedLen << 2; + int endOffset = Platform.CHAR_ARRAY_OFFSET + (vectorizedChars << 1); + boolean isLatin = true; + for (int offset = Platform.CHAR_ARRAY_OFFSET; offset < endOffset; offset += 8) { + // check 4 chars in a vectorized way, 4 times faster than scalar check loop. + long multiChars = Platform.getLong(chars, offset); + if ((multiChars & mask) != 0) { + isLatin = false; + break; + } + } + if (isLatin) { + for (int i = vectorizedChars; i < numChars; i++) { + char c = chars[i]; + if (reverseBytes) { + c = Character.reverseBytes(c); + } + if (c > 0xFF) { + isLatin = false; + break; + } + } + } + return isLatin; + } + + @Test + public void testLatinCheck() { + assertTrue(StringUtils.isLatin("Fury".toCharArray())); + assertTrue(StringUtils.isLatin(StringUtils.random(8 * 10).toCharArray())); + // test unaligned + assertTrue(StringUtils.isLatin((StringUtils.random(8 * 10) + "1").toCharArray())); + assertTrue(StringUtils.isLatin((StringUtils.random(8 * 10) + "12").toCharArray())); + assertTrue(StringUtils.isLatin((StringUtils.random(8 * 10) + "123").toCharArray())); + assertFalse(StringUtils.isLatin("你好, Fury".toCharArray())); + assertFalse(StringUtils.isLatin((StringUtils.random(8 * 10) + "你好").toCharArray())); + assertFalse(StringUtils.isLatin((StringUtils.random(8 * 10) + "1你好").toCharArray())); + assertFalse(StringUtils.isLatin((StringUtils.random(11) + "你").toCharArray())); + assertFalse(StringUtils.isLatin((StringUtils.random(10) + "你好").toCharArray())); + assertFalse(StringUtils.isLatin((StringUtils.random(9) + "性能好").toCharArray())); + assertFalse(StringUtils.isLatin("\u1234".toCharArray())); + assertFalse(StringUtils.isLatin("a\u1234".toCharArray())); + assertFalse(StringUtils.isLatin("ab\u1234".toCharArray())); + assertFalse(StringUtils.isLatin("abc\u1234".toCharArray())); + assertFalse(StringUtils.isLatin("abcd\u1234".toCharArray())); + assertFalse(StringUtils.isLatin("Javaone Keynote\u1234".toCharArray())); + } } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@fury.apache.org For additional commands, e-mail: commits-h...@fury.apache.org