(fury) branch main updated: feat(c++): add simd accelerated c++ ascii/latin1 check funcion (#1999)

chaokunyang Thu, 09 Jan 2025 00:34:16 -0800

This is an automated email from the ASF dual-hosted git repository.

chaokunyang pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/fury.git



The following commit(s) were added to refs/heads/main by this push:
     new 36da3f82 feat(c++): add simd accelerated c++ ascii/latin1 check 
funcion (#1999)
36da3f82 is described below

commit 36da3f82bbaebbdd28afb327d30bfc3e8e06bc24
Author: Shawn Yang <[email protected]>
AuthorDate: Thu Jan 9 16:16:09 2025 +0800

    feat(c++): add simd accelerated c++ ascii/latin1 check funcion (#1999)
    
    ## What does this PR do?
    
    add simd accelerated c++ ascii/latin1 check funcion
    
    ## Related issues
    
    <!--
    Is there any related issue? Please attach here.
    
    - #xxxx0
    - #xxxx1
    - #xxxx2
    -->
    
    ## Does this PR introduce any user-facing change?
    
    <!--
    If any user-facing interface changes, please [open an
    issue](https://github.com/apache/fury/issues/new/choose) describing the
    need to do so and update the document if necessary.
    -->
    
    - [ ] Does this PR introduce any public API change?
    - [ ] Does this PR introduce any binary protocol compatibility change?
    
    ## Benchmark
    
    <!--
    When the PR has an impact on performance (if you don't know whether the
    PR will have an impact on performance, you can submit the PR first, and
    if it will have impact on performance, the code reviewer will explain
    it), be sure to attach a benchmark data here.
    -->
---
 cpp/fury/util/string_util.cc      |  79 ----------------------------
 cpp/fury/util/string_util.h       | 105 +++++++++++++++++++++++++++++++++++++-
 cpp/fury/util/string_util_test.cc |  76 +++++++++++++++++++--------
 3 files changed, 158 insertions(+), 102 deletions(-)

diff --git a/cpp/fury/util/string_util.cc b/cpp/fury/util/string_util.cc
index 3c28ac79..df2737c2 100644
--- a/cpp/fury/util/string_util.cc
+++ b/cpp/fury/util/string_util.cc
@@ -183,30 +183,6 @@ std::u16string utf8ToUtf16SIMD(const std::string &utf8, 
bool is_little_endian) {
 
 #if defined(__x86_64__) || defined(_M_X64)
 
-bool isLatin(const std::string &str) {
-  const char *data = str.data();
-  size_t len = str.size();
-
-  size_t i = 0;
-  __m256i latin_mask = _mm256_set1_epi8(0x80);
-  for (; i + 32 <= len; i += 32) {
-    __m256i chars =
-        _mm256_loadu_si256(reinterpret_cast<const __m256i *>(data + i));
-    __m256i result = _mm256_and_si256(chars, latin_mask);
-    if (!_mm256_testz_si256(result, result)) {
-      return false;
-    }
-  }
-
-  for (; i < len; ++i) {
-    if (static_cast<unsigned char>(data[i]) >= 128) {
-      return false;
-    }
-  }
-
-  return true;
-}
-
 std::string utf16ToUtf8(const std::u16string &utf16, bool is_little_endian) {
   std::string utf8;
   utf8.reserve(utf16.size() *
@@ -297,29 +273,6 @@ std::u16string utf8ToUtf16(const std::string &utf8, bool 
is_little_endian) {
 
 #elif defined(__ARM_NEON) || defined(__ARM_NEON__)
 
-bool isLatin(const std::string &str) {
-  const char *data = str.data();
-  size_t len = str.size();
-
-  size_t i = 0;
-  uint8x16_t latin_mask = vdupq_n_u8(0x80);
-  for (; i + 16 <= len; i += 16) {
-    uint8x16_t chars = vld1q_u8(reinterpret_cast<const uint8_t *>(data + i));
-    uint8x16_t result = vandq_u8(chars, latin_mask);
-    if (vmaxvq_u8(result) != 0) {
-      return false;
-    }
-  }
-
-  for (; i < len; ++i) {
-    if (static_cast<unsigned char>(data[i]) >= 128) {
-      return false;
-    }
-  }
-
-  return true;
-}
-
 std::string utf16ToUtf8(const std::u16string &utf16, bool is_little_endian) {
   std::string utf8;
   utf8.reserve(utf16.size() * 3);
@@ -397,29 +350,6 @@ std::u16string utf8ToUtf16(const std::string &utf8, bool 
is_little_endian) {
 
 #elif defined(__riscv) && __riscv_vector
 
-bool isLatin(const std::string &str) {
-  const char *data = str.data();
-  size_t len = str.size();
-
-  size_t i = 0;
-  auto latin_mask = vmv_v_x_u8m1(0x80, 16);
-  for (; i + 16 <= len; i += 16) {
-    auto chars = vle8_v_u8m1(reinterpret_cast<const uint8_t *>(data + i), 16);
-    auto result = vand_vv_u8m1(chars, latin_mask, 16);
-    if (vfirst_m_b8(vmsne_vx_u8m1_b8(result, 0, 16))) {
-      return false;
-    }
-  }
-
-  for (; i < len; ++i) {
-    if (static_cast<unsigned char>(data[i]) >= 128) {
-      return false;
-    }
-  }
-
-  return true;
-}
-
 std::string utf16ToUtf8(const std::u16string &utf16, bool is_little_endian) {
   std::string utf8;
   utf8.reserve(utf16.size() * 3);
@@ -502,15 +432,6 @@ std::u16string utf8ToUtf16(const std::string &utf8, bool 
is_little_endian) {
 
 #else
 
-bool isLatin(const std::string &str) {
-  for (char c : str) {
-    if (static_cast<unsigned char>(c) >= 128) {
-      return false;
-    }
-  }
-  return true;
-}
-
 // Fallback implementation without SIMD acceleration
 std::string utf16ToUtf8(const std::u16string &utf16, bool is_little_endian) {
   std::string utf8;
diff --git a/cpp/fury/util/string_util.h b/cpp/fury/util/string_util.h
index 35254eab..2336615e 100644
--- a/cpp/fury/util/string_util.h
+++ b/cpp/fury/util/string_util.h
@@ -33,7 +33,34 @@
 
 namespace fury {
 
-bool isLatin(const std::string &str);
+static inline bool isAsciiFallback(const char *data, size_t size) {
+  size_t i = 0;
+  // Loop through 8-byte chunks
+  for (; i + 7 < size; i += 8) {
+    // Load 8 bytes from the string
+    uint64_t chunk = *reinterpret_cast<const uint64_t *>(data + i);
+    // Check if any byte in the 64-bit chunk is >= 128
+    // This checks if any of the top bits of each byte are set
+    if (chunk & 0x8080808080808080ULL) {
+      return false;
+    }
+  }
+  for (; i < size; ++i) {
+    if (static_cast<unsigned char>(data[i]) >= 128) {
+      return false;
+    }
+  }
+  return true;
+}
+
+static inline bool isLatin1Fallback(const uint16_t *data, size_t size) {
+  for (size_t i = 0; i < size; ++i) {
+    if (data[i] > 0xFF) {
+      return false;
+    }
+  }
+  return true;
+}
 
 static inline bool hasSurrogatePairFallback(const uint16_t *data, size_t size) 
{
   for (size_t i = 0; i < size; ++i) {
@@ -46,6 +73,34 @@ static inline bool hasSurrogatePairFallback(const uint16_t 
*data, size_t size) {
 }
 
 #if defined(USE_NEON_SIMD)
+inline bool isAscii(const char *data, size_t length) {
+  size_t i = 0;
+  uint8x16_t mostSignificantBit = vdupq_n_u8(0x80);
+  for (; i + 15 < length; i += 16) {
+    uint8x16_t chunk = vld1q_u8(reinterpret_cast<const uint8_t *>(&data[i]));
+    uint8x16_t result = vandq_u8(chunk, mostSignificantBit);
+    if (vmaxvq_u8(result) != 0) {
+      return false;
+    }
+  }
+  // Check the remaining characters
+  return isAsciiFallback(data + i, length - i);
+}
+
+inline bool isLatin1(const uint16_t *data, size_t length) {
+  size_t i = 0;
+  uint16x8_t maxAllowed = vdupq_n_u16(0xFF);
+  for (; i + 7 < length; i += 8) {
+    uint16x8_t chunk = vld1q_u16(&data[i]);
+    uint16x8_t cmp = vcgtq_u16(chunk, maxAllowed);
+    if (vmaxvq_u16(cmp) != 0) {
+      return false;
+    }
+  }
+  // Check the remaining elements
+  return isLatin1Fallback(data + i, length - i);
+}
+
 inline bool utf16HasSurrogatePairs(const uint16_t *data, size_t length) {
   size_t i = 0;
   uint16x8_t lower_bound = vdupq_n_u16(0xD800);
@@ -61,6 +116,36 @@ inline bool utf16HasSurrogatePairs(const uint16_t *data, 
size_t length) {
   return hasSurrogatePairFallback(data + i, length - i);
 }
 #elif defined(USE_SSE2_SIMD)
+inline bool isAscii(const char *data, size_t length) {
+  const __m128i mostSignificantBit = _mm_set1_epi8(static_cast<char>(0x80));
+  size_t i = 0;
+  for (; i + 15 < length; i += 16) {
+    __m128i chunk =
+        _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i]));
+    __m128i result = _mm_and_si128(chunk, mostSignificantBit);
+    if (_mm_movemask_epi8(result) != 0) {
+      return false;
+    }
+  }
+  // Check the remaining characters
+  return isAsciiFallback(data + i, length - i);
+}
+
+inline bool isLatin1(const uint16_t *data, size_t length) {
+  const __m128i maxAllowed = _mm_set1_epi16(0xFF);
+  size_t i = 0;
+  for (; i + 7 < length; i += 8) {
+    __m128i chunk =
+        _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i]));
+    __m128i cmp = _mm_cmpgt_epi16(chunk, maxAllowed);
+    if (_mm_movemask_epi8(cmp) != 0) {
+      return false;
+    }
+  }
+  // Check the remaining elements
+  return isLatin1Fallback(data + i, length - i);
+}
+
 inline bool utf16HasSurrogatePairs(const uint16_t *data, size_t length) {
   size_t i = 0;
   __m128i lower_bound = _mm_set1_epi16(0xd7ff);
@@ -77,11 +162,29 @@ inline bool utf16HasSurrogatePairs(const uint16_t *data, 
size_t length) {
   return hasSurrogatePairFallback(data + i, length - i);
 }
 #else
+inline bool isAscii(const char *data, size_t length) {
+  return isAsciiFallback(data, length);
+}
+
+inline bool isLatin1(const uint16_t *data, size_t length) {
+  return isLatin1Fallback(data, length);
+}
+
 inline bool utf16HasSurrogatePairs(const uint16_t *data, size_t length) {
   return hasSurrogatePairFallback(data, length);
 }
 #endif
 
+inline bool isAscii(const std::string &str) {
+  return isAscii(str.data(), str.size());
+}
+
+inline bool isLatin1(const std::u16string &str) {
+  const std::uint16_t *data =
+      reinterpret_cast<const std::uint16_t *>(str.data());
+  return isLatin1(data, str.size());
+}
+
 inline bool utf16HasSurrogatePairs(const std::u16string &str) {
   // Get the data pointer
   const std::uint16_t *data =
diff --git a/cpp/fury/util/string_util_test.cc 
b/cpp/fury/util/string_util_test.cc
index 5fe800dd..c95b0ecb 100644
--- a/cpp/fury/util/string_util_test.cc
+++ b/cpp/fury/util/string_util_test.cc
@@ -45,7 +45,7 @@ std::string generateRandomString(size_t length) {
   return result;
 }
 
-bool isLatin_BaseLine(const std::string &str) {
+bool isAscii_BaseLine(const std::string &str) {
   for (char c : str) {
     if (static_cast<unsigned char>(c) >= 128) {
       return false;
@@ -54,10 +54,10 @@ bool isLatin_BaseLine(const std::string &str) {
   return true;
 }
 
-TEST(StringUtilTest, TestIsLatinFunctions) {
+TEST(StringUtilTest, TestisAsciiFunctions) {
   std::string testStr = generateRandomString(100000);
   auto start_time = std::chrono::high_resolution_clock::now();
-  bool result = isLatin_BaseLine(testStr);
+  bool result = isAscii_BaseLine(testStr);
   auto end_time = std::chrono::high_resolution_clock::now();
   auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(
                       end_time - start_time)
@@ -65,7 +65,7 @@ TEST(StringUtilTest, TestIsLatinFunctions) {
   FURY_LOG(INFO) << "BaseLine Running Time: " << duration << " ns.";
 
   start_time = std::chrono::high_resolution_clock::now();
-  result = isLatin(testStr);
+  result = isAscii(testStr);
   end_time = std::chrono::high_resolution_clock::now();
   duration = std::chrono::duration_cast<std::chrono::nanoseconds>(end_time -
                                                                   start_time)
@@ -75,29 +75,61 @@ TEST(StringUtilTest, TestIsLatinFunctions) {
   EXPECT_TRUE(result);
 }
 
-TEST(StringUtilTest, TestIsLatinLogic) {
+TEST(StringUtilTest, TestisAsciiLogic) {
   // Test strings with only Latin characters
-  EXPECT_TRUE(isLatin("Fury"));
-  EXPECT_TRUE(isLatin(generateRandomString(80)));
+  EXPECT_TRUE(isAscii("Fury"));
+  EXPECT_TRUE(isAscii(generateRandomString(80)));
 
   // Test unaligned strings with only Latin characters
-  EXPECT_TRUE(isLatin(generateRandomString(80) + "1"));
-  EXPECT_TRUE(isLatin(generateRandomString(80) + "12"));
-  EXPECT_TRUE(isLatin(generateRandomString(80) + "123"));
+  EXPECT_TRUE(isAscii(generateRandomString(80) + "1"));
+  EXPECT_TRUE(isAscii(generateRandomString(80) + "12"));
+  EXPECT_TRUE(isAscii(generateRandomString(80) + "123"));
 
   // Test strings with non-Latin characters
-  EXPECT_FALSE(isLatin("你好, Fury"));
-  EXPECT_FALSE(isLatin(generateRandomString(80) + "你好"));
-  EXPECT_FALSE(isLatin(generateRandomString(80) + "1你好"));
-  EXPECT_FALSE(isLatin(generateRandomString(11) + "你"));
-  EXPECT_FALSE(isLatin(generateRandomString(10) + "你好"));
-  EXPECT_FALSE(isLatin(generateRandomString(9) + "性能好"));
-  EXPECT_FALSE(isLatin("\u1234"));
-  EXPECT_FALSE(isLatin("a\u1234"));
-  EXPECT_FALSE(isLatin("ab\u1234"));
-  EXPECT_FALSE(isLatin("abc\u1234"));
-  EXPECT_FALSE(isLatin("abcd\u1234"));
-  EXPECT_FALSE(isLatin("Javaone Keynote\u1234"));
+  EXPECT_FALSE(isAscii("你好, Fury"));
+  EXPECT_FALSE(isAscii(generateRandomString(80) + "你好"));
+  EXPECT_FALSE(isAscii(generateRandomString(80) + "1你好"));
+  EXPECT_FALSE(isAscii(generateRandomString(11) + "你"));
+  EXPECT_FALSE(isAscii(generateRandomString(10) + "你好"));
+  EXPECT_FALSE(isAscii(generateRandomString(9) + "性能好"));
+  EXPECT_FALSE(isAscii("\u1234"));
+  EXPECT_FALSE(isAscii("a\u1234"));
+  EXPECT_FALSE(isAscii("ab\u1234"));
+  EXPECT_FALSE(isAscii("abc\u1234"));
+  EXPECT_FALSE(isAscii("abcd\u1234"));
+  EXPECT_FALSE(isAscii("Javaone Keynote\u1234"));
+
+  for (size_t i = 1; i < 256; i++) {
+    EXPECT_TRUE(isAscii(std::string(i, '.') + "Fury"));
+    EXPECT_FALSE(isAscii(std::string(i, '.') + "序列化"));
+  }
+}
+
+TEST(StringUtilTest, TestisLatin1) {
+  // Test strings with only Latin characters
+  EXPECT_TRUE(isLatin1(u"Fury"));
+  EXPECT_TRUE(isLatin1(u"\xE9")); // é in Latin-1
+  EXPECT_TRUE(isLatin1(u"\xF1")); // ñ in Latin-1
+  // Test strings with non-Latin characters
+  EXPECT_FALSE(isLatin1(u"你好, Fury"));
+  EXPECT_FALSE(isLatin1(u"a\u1234"));
+  EXPECT_FALSE(isLatin1(u"ab\u1234"));
+  EXPECT_FALSE(isLatin1(u"abc\u1234"));
+  EXPECT_FALSE(isLatin1(u"abcd\u1234"));
+  EXPECT_FALSE(isLatin1(u"Javaone Keynote\u1234"));
+  EXPECT_TRUE(isLatin1(u"a\xFF")); // ÿ in Latin-1
+  EXPECT_TRUE(isLatin1(u"\x80"));  //  in Latin-1
+  const uint16_t str[] = {256, 256};
+  EXPECT_FALSE(isLatin1(str, 2)); // Ā (not in Latin-1)
+
+  for (size_t i = 1; i < 256; i++) {
+    EXPECT_TRUE(isLatin1(std::u16string(i, '.') + u"Fury"));
+    EXPECT_FALSE(isLatin1(std::u16string(i, '.') + u"序列化"));
+    EXPECT_TRUE(isLatin1(std::u16string(i, '.') + u"a\xFF")); // ÿ in Latin-1
+    EXPECT_TRUE(isLatin1(std::u16string(i, '.') + u"\x80"));  //  in Latin-1
+    EXPECT_FALSE(isLatin1(std::u16string(i, '.') +
+                          std::u16string({256}))); // Ā (not in Latin-1)
+  }
 }
 
 // Generate random UTF-16 string ensuring valid surrogate pairs


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(fury) branch main updated: feat(c++): add simd accelerated c++ ascii/latin1 check funcion (#1999)

Reply via email to