This is an automated email from the ASF dual-hosted git repository.
chaokunyang pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/fury.git
The following commit(s) were added to refs/heads/main by this push:
new 1e63705e feat(c++): BenchMark will be a Module (#2028)
1e63705e is described below
commit 1e63705eef4f0c684e6d0f267f3e21d0491b3195
Author: PAN <[email protected]>
AuthorDate: Mon Jan 27 18:53:40 2025 +0800
feat(c++): BenchMark will be a Module (#2028)
<!--
**Thanks for contributing to Fury.**
**If this is your first time opening a PR on fury, you can refer to
[CONTRIBUTING.md](https://github.com/apache/fury/blob/main/CONTRIBUTING.md).**
Contribution Checklist
- The **Apache Fury (incubating)** community has restrictions on the
naming of pr titles. You can also find instructions in
[CONTRIBUTING.md](https://github.com/apache/fury/blob/main/CONTRIBUTING.md).
- Fury has a strong focus on performance. If the PR you submit will have
an impact on performance, please benchmark it first and provide the
benchmark result here.
-->
## What does this PR do?
benchmark will be a module rather than a file
<img width="431" alt="image"
src="https://github.com/user-attachments/assets/59cad85d-3ee9-46ff-9d0f-b0109614cbce"
/>
<!-- Describe the purpose of this PR. -->
## Related issues
close #2022
relate #2023
<!--
Is there any related issue? Please attach here.
- #xxxx0
- #xxxx1
- #xxxx2
-->
## Does this PR introduce any user-facing change?
<!--
If any user-facing interface changes, please [open an
issue](https://github.com/apache/fury/issues/new/choose) describing the
need to do so and update the document if necessary.
-->
- [x] Does this PR introduce any public API change?
- [ ] Does this PR introduce any binary protocol compatibility change?
## Benchmark
<!--
When the PR has an impact on performance (if you don't know whether the
PR will have an impact on performance, you can submit the PR first, and
if it will have impact on performance, the code reviewer will explain
it), be sure to attach a benchmark data here.
-->
---
cpp/README.md | 20 +-
cpp/fury/benchmark/BUILD | 24 ++
cpp/fury/benchmark/benchmark_string_util.cc | 510 ++++++++++++++++++++++++++++
cpp/fury/util/BUILD | 12 +-
cpp/fury/util/benchmark.cc | 119 -------
cpp/fury/util/string_util_test.cc | 203 +----------
6 files changed, 569 insertions(+), 319 deletions(-)
diff --git a/cpp/README.md b/cpp/README.md
index 089f2159..3a44aabf 100644
--- a/cpp/README.md
+++ b/cpp/README.md
@@ -1,8 +1,26 @@
# Apache Fury™ C++
-## How to build
+Fury is a blazingly-fast multi-language serialization framework powered by
just-in-time compilation and zero-copy.
+
+## Build Fury C++
```bash
+# Build all projects
bazel build //:all
+# Run all tests
bazel test //:all
```
+
+## Environment
+
+- Bazel version: 6.3.2
+
+## Benchmark
+
+```bash
+bazel build //cpp/fury/benchmark:all
+bazel test //cpp/fury/benchmark:all
+# You can also run a single benchmark to see how efficient it is.
+# For example
+bazel run //cpp/fury/benchmark:benchmark_string_util
+```
diff --git a/cpp/fury/benchmark/BUILD b/cpp/fury/benchmark/BUILD
new file mode 100644
index 00000000..f6e3f697
--- /dev/null
+++ b/cpp/fury/benchmark/BUILD
@@ -0,0 +1,24 @@
+load("@rules_cc//cc:defs.bzl", "cc_library", "cc_test")
+
+cc_library(
+ name = "fury_benchmark",
+ srcs = glob(["*.cc"], exclude=["benchmark*.cc"]),
+ hdrs = glob(["*.h"]),
+ strip_include_prefix = "/cpp",
+ alwayslink = True,
+ linkstatic = True,
+ deps = [
+ "//cpp/fury/util:fury_util",
+ "@com_google_benchmark//:benchmark",
+ ],
+ visibility = ["//visibility:public"],
+)
+
+
+cc_test(
+ name = "benchmark_string_util",
+ srcs = ["benchmark_string_util.cc"],
+ deps = [
+ ":fury_benchmark",
+ ],
+)
diff --git a/cpp/fury/benchmark/benchmark_string_util.cc
b/cpp/fury/benchmark/benchmark_string_util.cc
new file mode 100644
index 00000000..851cd3e5
--- /dev/null
+++ b/cpp/fury/benchmark/benchmark_string_util.cc
@@ -0,0 +1,510 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <benchmark/benchmark.h>
+
+#include <codecvt>
+#include <locale>
+#include <random>
+
+#include "fury/util/string_util.h"
+
+#include <cstring>
+#include <string>
+
+/*
+ * TEST
+ */
+
+// Generate random bytes (0x00 to 0xFF)
+std::string generateRandom(size_t length) {
+ std::string result;
+ result.reserve(length);
+
+ std::mt19937 generator(std::random_device{}());
+ std::uniform_int_distribution<unsigned short> distribution(0x00, 0xFF);
+
+ for (size_t i = 0; i < length; ++i) {
+ result.push_back(static_cast<char>(distribution(generator)));
+ }
+ return result;
+}
+
+// Generate ASCII string (0x00 to 0x7F)
+std::string generateAscii(size_t length) {
+ const char charset[] =
+ "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
+ std::default_random_engine rng(std::random_device{}());
+ std::uniform_int_distribution<> dist(0, sizeof(charset) - 2);
+
+ std::string result;
+ result.reserve(length);
+ for (size_t i = 0; i < length; ++i) {
+ result += charset[dist(rng)];
+ }
+ return result;
+}
+
+// Generate Latin-1 string (0x00 to 0xFF) as std::u16string
+std::u16string generateLatin1(size_t length) {
+ std::u16string result;
+ result.reserve(length);
+
+ std::mt19937 generator(std::random_device{}());
+ std::uniform_int_distribution<uint16_t> distribution(0x00, 0xFF);
+
+ for (size_t i = 0; i < length; ++i) {
+ result.push_back(static_cast<char16_t>(distribution(generator)));
+ }
+ return result;
+}
+
+// Generate UTF-8 string (valid Unicode code points)
+std::string generateUtf8(size_t length) {
+ std::string result;
+ result.reserve(length);
+
+ std::mt19937 generator(std::random_device{}());
+ std::uniform_int_distribution<uint32_t> distribution(0, 0x10FFFF);
+
+ while (result.size() < length) {
+ uint32_t code_point = distribution(generator);
+
+ // Skip surrogate pairs (0xD800 to 0xDFFF) and invalid Unicode code points
+ if ((code_point >= 0xD800 && code_point <= 0xDFFF) ||
+ code_point > 0x10FFFF) {
+ continue;
+ }
+
+ if (code_point <= 0x7F) {
+ result.push_back(static_cast<char>(code_point));
+ } else if (code_point <= 0x7FF) {
+ result.push_back(0xC0 | (code_point >> 6));
+ result.push_back(0x80 | (code_point & 0x3F));
+ } else if (code_point <= 0xFFFF) {
+ result.push_back(0xE0 | (code_point >> 12));
+ result.push_back(0x80 | ((code_point >> 6) & 0x3F));
+ result.push_back(0x80 | (code_point & 0x3F));
+ } else {
+ result.push_back(0xF0 | (code_point >> 18));
+ result.push_back(0x80 | ((code_point >> 12) & 0x3F));
+ result.push_back(0x80 | ((code_point >> 6) & 0x3F));
+ result.push_back(0x80 | (code_point & 0x3F));
+ }
+ }
+ return result;
+}
+
+// Generate UTF-16 string (valid Unicode code points)
+std::u16string generateUtf16(size_t length) {
+ std::u16string result;
+ result.reserve(length);
+
+ std::mt19937 generator(std::random_device{}());
+ std::uniform_int_distribution<uint32_t> distribution(0, 0x10FFFF);
+
+ while (result.size() < length) {
+ uint32_t code_point = distribution(generator);
+
+ // Skip surrogate pairs (0xD800 to 0xDFFF) and invalid Unicode code points
+ if ((code_point >= 0xD800 && code_point <= 0xDFFF) ||
+ code_point > 0x10FFFF) {
+ continue;
+ }
+
+ if (code_point <= 0xFFFF) {
+ result.push_back(static_cast<char16_t>(code_point));
+ } else {
+ // Handle code points greater than 0xFFFF (requires surrogate pairs)
+ code_point -= 0x10000;
+ char16_t high_surrogate = 0xD800 | ((code_point >> 10) & 0x3FF);
+ char16_t low_surrogate = 0xDC00 | (code_point & 0x3FF);
+ result.push_back(high_surrogate);
+ result.push_back(low_surrogate);
+ }
+ }
+ return result;
+}
+
+/*
+ * TEST NUM
+ */
+const size_t num_tests = 1000;
+const size_t string_length = 1000;
+
+/*
+ * TEST Strings
+ */
+// Generate a vector of Ascii strings for testing
+std::vector<std::string> generateAsciiString(size_t num_tests,
+ size_t string_length) {
+ std::vector<std::string> test_strings;
+ for (size_t i = 0; i < string_length; ++i) {
+ test_strings.push_back(generateUtf8(num_tests));
+ }
+ return test_strings;
+}
+
+const std::vector<std::string> test_ascii_strings =
+ generateAsciiString(num_tests, string_length);
+
+// Generate a vector of Latin-1 strings for testing
+std::vector<std::u16string> generateLatin1String(size_t num_tests,
+ size_t string_length) {
+ std::vector<std::u16string> test_strings;
+ for (size_t i = 0; i < num_tests; ++i) {
+ test_strings.push_back(generateLatin1(string_length));
+ }
+ return test_strings;
+}
+
+const std::vector<std::u16string> test_latin1_strings =
+ generateLatin1String(num_tests, string_length);
+
+// Generate random UTF-16 string
+std::vector<std::u16string> generateUTF16String(size_t num_tests,
+ size_t string_length) {
+ std::vector<std::u16string> test_strings;
+ for (size_t i = 0; i < string_length; ++i) {
+ test_strings.push_back(generateUtf16(num_tests));
+ }
+ return test_strings;
+}
+
+const std::vector<std::u16string> test_utf16_strings =
+ generateUTF16String(num_tests, string_length);
+
+// Generate random UTF-8 string
+std::vector<std::string> generateUTF8String(size_t num_tests,
+ size_t string_length) {
+ std::vector<std::string> test_strings;
+ for (size_t i = 0; i < string_length; ++i) {
+ test_strings.push_back(generateUtf8(num_tests));
+ }
+ return test_strings;
+}
+
+const std::vector<std::string> test_utf8_strings =
+ generateUTF8String(num_tests, string_length);
+
+/*
+ * TEST IsAscii
+ */
+
+// Check if a string is ASCII (all characters <= 0x7F)
+bool isAscii_BaseLine(const std::string &str) {
+ for (char c : str) {
+ if (static_cast<uint8_t>(c) > 0x7F) {
+ return false;
+ }
+ }
+ return true;
+}
+
+// Benchmark function for Baseline ASCII check
+static void BM_IsAscii_BaseLine(benchmark::State &state) {
+ for (auto _ : state) {
+ for (const auto &str : test_ascii_strings) {
+ bool result = isAscii_BaseLine(str);
+ benchmark::DoNotOptimize(result); // Prevent compiler optimization
+ }
+ }
+}
+
+// Benchmark function for SIMD ASCII check
+static void BM_IsAscii_SIMD(benchmark::State &state) {
+ for (auto _ : state) {
+ for (const auto &str : test_ascii_strings) {
+ bool result = fury::isAscii(str);
+ benchmark::DoNotOptimize(result); // Prevent compiler optimization
+ }
+ }
+}
+
+BENCHMARK(BM_IsAscii_BaseLine);
+BENCHMARK(BM_IsAscii_SIMD);
+
+// Baseline implementation to check if a string is Latin-1
+bool isLatin1_BaseLine(const std::u16string &str) {
+ const std::uint16_t *data =
+ reinterpret_cast<const std::uint16_t *>(str.data());
+ size_t size = str.size();
+
+ for (size_t i = 0; i < size; ++i) {
+ if (data[i] > 0xFF) {
+ return false;
+ }
+ }
+ return true;
+}
+
+// Benchmark function for Baseline Latin-1 check
+static void BM_IsLatin1_BaseLine(benchmark::State &state) {
+ for (auto _ : state) {
+ for (const auto &str : test_latin1_strings) {
+ bool result = isLatin1_BaseLine(str);
+ benchmark::DoNotOptimize(result); // Prevent compiler optimization
+ }
+ }
+}
+
+// Benchmark function for Optimized Latin-1 check
+static void BM_IsLatin1_SIMD(benchmark::State &state) {
+ for (auto _ : state) {
+ for (const auto &str : test_latin1_strings) {
+ bool result = fury::isLatin1(str);
+ benchmark::DoNotOptimize(result); // Prevent compiler optimization
+ }
+ }
+}
+
+BENCHMARK(BM_IsLatin1_BaseLine);
+BENCHMARK(BM_IsLatin1_SIMD);
+
+/*
+ * TEST Utf16HasSurrogatePairs
+ */
+// Check if a UTF-16 string contains surrogate pairs
+bool utf16HasSurrogatePairs_BaseLine(const std::u16string &str) {
+ for (size_t i = 0; i < str.size(); ++i) {
+ char16_t c = str[i];
+ if (c >= 0xD800 && c <= 0xDFFF) {
+ return true;
+ }
+ }
+ return false;
+}
+
+// Benchmark function for checking if a UTF-16 string contains surrogate pairs
+static void BM_Utf16HasSurrogatePairs_BaseLine(benchmark::State &state) {
+ for (auto _ : state) {
+ for (const auto &str : test_utf16_strings) {
+ bool result = utf16HasSurrogatePairs_BaseLine(str);
+ benchmark::DoNotOptimize(result); // Prevent compiler optimization
+ }
+ }
+}
+
+// Benchmark function for checking if a UTF-16 string contains surrogate pairs
+// with SIMD
+static void BM_Utf16HasSurrogatePairs_SIMD(benchmark::State &state) {
+ for (auto _ : state) {
+ for (const auto &str : test_utf16_strings) {
+ bool result = fury::utf16HasSurrogatePairs(str);
+ benchmark::DoNotOptimize(result); // Prevent compiler optimization
+ }
+ }
+}
+BENCHMARK(BM_Utf16HasSurrogatePairs_BaseLine);
+BENCHMARK(BM_Utf16HasSurrogatePairs_SIMD);
+
+/*
+ * TEST Utf16ToUtf8
+ */
+
+// UTF16 to UTF8 using the standard library
+std::string utf16ToUtf8StandardLibrary(const std::u16string &utf16) {
+ std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t> convert;
+ return convert.to_bytes(utf16);
+}
+
+// UTF16 to UTF8 baseline conversion (without SIMD)
+std::string utf16ToUtf8BaseLine(const std::u16string &utf16,
+ bool is_little_endian = true) {
+ size_t utf16_length = utf16.length();
+ size_t utf8_length = utf16_length * 3;
+ std::string utf8_result(utf8_length, '\0');
+
+ size_t i = 0, j = 0;
+ while (i < utf16_length) {
+ char16_t utf16_char = utf16[i++];
+ if (utf16_char < 0x80) {
+ utf8_result[j++] = static_cast<char>(utf16_char);
+ } else if (utf16_char < 0x800) {
+ utf8_result[j++] = static_cast<char>(0xC0 | (utf16_char >> 6));
+ utf8_result[j++] = static_cast<char>(0x80 | (utf16_char & 0x3F));
+ } else {
+ utf8_result[j++] = static_cast<char>(0xE0 | (utf16_char >> 12));
+ utf8_result[j++] = static_cast<char>(0x80 | ((utf16_char >> 6) & 0x3F));
+ utf8_result[j++] = static_cast<char>(0x80 | (utf16_char & 0x3F));
+ }
+ }
+
+ utf8_result.resize(j);
+ return utf8_result;
+}
+
+// Benchmark function for Standard Library UTF-16 to UTF-8 conversion
+static void BM_Utf16ToUtf8_StandardLibrary(benchmark::State &state) {
+ for (auto _ : state) {
+ for (const auto &str : test_utf16_strings) {
+ std::string utf8 = utf16ToUtf8StandardLibrary(str);
+ benchmark::DoNotOptimize(
+ utf8); // Prevents the compiler from optimizing away unused variables
+ }
+ }
+}
+
+// Benchmark function for Baseline UTF-16 to UTF-8 conversion
+static void BM_Utf16ToUtf8_BaseLine(benchmark::State &state) {
+ for (auto _ : state) {
+ for (const auto &str : test_utf16_strings) {
+ std::string utf8 = utf16ToUtf8BaseLine(str, true);
+ benchmark::DoNotOptimize(
+ utf8); // Prevents the compiler from optimizing away unused variables
+ }
+ }
+}
+
+// Benchmark function for SIMD-based UTF-16 to UTF-8 conversion
+static void BM_Utf16ToUtf8_SIMD(benchmark::State &state) {
+ for (auto _ : state) {
+ for (const auto &str : test_utf16_strings) {
+ std::string utf8 = fury::utf16ToUtf8(str, true);
+ benchmark::DoNotOptimize(
+ utf8); // Prevents the compiler from optimizing away unused variables
+ }
+ }
+}
+
+BENCHMARK(BM_Utf16ToUtf8_StandardLibrary);
+BENCHMARK(BM_Utf16ToUtf8_BaseLine);
+BENCHMARK(BM_Utf16ToUtf8_SIMD);
+
+/*
+ * TEST Utf8ToUtf16
+ */
+
+// UTF8 to UTF16 using the standard library
+std::u16string utf8ToUtf16StandardLibrary(const std::string &utf8) {
+ std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t> convert;
+ return convert.from_bytes(utf8);
+}
+
+// UTF8 to UTF16 baseline conversion (without SIMD)
+std::u16string utf8ToUtf16BaseLine(const std::string &utf8,
+ bool is_little_endian) {
+ std::u16string utf16; // Resulting UTF-16 string
+ size_t i = 0; // Index for traversing the UTF-8 string
+ size_t n = utf8.size(); // Total length of the UTF-8 string
+
+ // Loop through each byte of the UTF-8 string
+ while (i < n) {
+ uint32_t code_point = 0; // The Unicode code point
+ unsigned char c = utf8[i]; // Current byte of the UTF-8 string
+
+ // Determine the number of bytes for this character based on its first byte
+ if ((c & 0x80) == 0) {
+ // 1-byte character (ASCII)
+ code_point = c;
+ ++i;
+ } else if ((c & 0xE0) == 0xC0) {
+ // 2-byte character
+ code_point = c & 0x1F;
+ code_point = (code_point << 6) | (utf8[i + 1] & 0x3F);
+ i += 2;
+ } else if ((c & 0xF0) == 0xE0) {
+ // 3-byte character
+ code_point = c & 0x0F;
+ code_point = (code_point << 6) | (utf8[i + 1] & 0x3F);
+ code_point = (code_point << 6) | (utf8[i + 2] & 0x3F);
+ i += 3;
+ } else if ((c & 0xF8) == 0xF0) {
+ // 4-byte character
+ code_point = c & 0x07;
+ code_point = (code_point << 6) | (utf8[i + 1] & 0x3F);
+ code_point = (code_point << 6) | (utf8[i + 2] & 0x3F);
+ code_point = (code_point << 6) | (utf8[i + 3] & 0x3F);
+ i += 4;
+ } else {
+ // Invalid UTF-8 byte sequence
+ throw std::invalid_argument("Invalid UTF-8 encoding.");
+ }
+
+ // If the code point is beyond the BMP range, use surrogate pairs
+ if (code_point >= 0x10000) {
+ code_point -= 0x10000; // Subtract 0x10000 to get the surrogate pair
+ uint16_t high_surrogate = 0xD800 + (code_point >> 10); // High surrogate
+ uint16_t low_surrogate = 0xDC00 + (code_point & 0x3FF); // Low surrogate
+
+ // If not little-endian, swap bytes of the surrogates
+ if (!is_little_endian) {
+ high_surrogate = (high_surrogate >> 8) | (high_surrogate << 8);
+ low_surrogate = (low_surrogate >> 8) | (low_surrogate << 8);
+ }
+
+ // Add both high and low surrogates to the UTF-16 string
+ utf16.push_back(high_surrogate);
+ utf16.push_back(low_surrogate);
+ } else {
+ // For code points within the BMP range, directly store as a 16-bit value
+ uint16_t utf16_char = static_cast<uint16_t>(code_point);
+
+ // If not little-endian, swap the bytes of the 16-bit character
+ if (!is_little_endian) {
+ utf16_char = (utf16_char >> 8) | (utf16_char << 8);
+ }
+
+ // Add the UTF-16 character to the string
+ utf16.push_back(utf16_char);
+ }
+ }
+
+ // Return the resulting UTF-16 string
+ return utf16;
+}
+
+// Benchmark function for Standard Library UTF-8 to UTF-16 conversion
+static void BM_Utf8ToUtf16_StandardLibrary(benchmark::State &state) {
+ for (auto _ : state) {
+ for (const auto &str : test_utf8_strings) {
+ std::u16string utf16 = utf8ToUtf16StandardLibrary(str);
+ benchmark::DoNotOptimize(
+ utf16); // Prevents the compiler from optimizing away unused
variables
+ }
+ }
+}
+
+// Benchmark function for Baseline UTF-8 to UTF-16 conversion
+static void BM_Utf8ToUtf16_BaseLine(benchmark::State &state) {
+ for (auto _ : state) {
+ for (const auto &str : test_utf8_strings) {
+ std::u16string utf16 = utf8ToUtf16BaseLine(str, true);
+ benchmark::DoNotOptimize(
+ utf16); // Prevents the compiler from optimizing away unused
variables
+ }
+ }
+}
+
+// Benchmark function for SIMD-based UTF-8 to UTF-16 conversion
+static void BM_Utf8ToUtf16_SIMD(benchmark::State &state) {
+ for (auto _ : state) {
+ for (const auto &str : test_utf8_strings) {
+ std::u16string utf16 = fury::utf8ToUtf16(str, true);
+ benchmark::DoNotOptimize(
+ utf16); // Prevents the compiler from optimizing away unused
variables
+ }
+ }
+}
+
+BENCHMARK(BM_Utf8ToUtf16_StandardLibrary);
+BENCHMARK(BM_Utf8ToUtf16_BaseLine);
+BENCHMARK(BM_Utf8ToUtf16_SIMD);
+
+BENCHMARK_MAIN();
diff --git a/cpp/fury/util/BUILD b/cpp/fury/util/BUILD
index 073c0e06..36fe126b 100644
--- a/cpp/fury/util/BUILD
+++ b/cpp/fury/util/BUILD
@@ -2,7 +2,7 @@ load("@rules_cc//cc:defs.bzl", "cc_library", "cc_test")
cc_library(
name = "fury_util",
- srcs = glob(["*.cc"], exclude=["*test.cc", "benchmark.cc"]),
+ srcs = glob(["*.cc"], exclude=["*test.cc"]),
hdrs = glob(["*.h"]),
copts = ["-mavx2"], # Enable AVX2 support
linkopts = ["-mavx2"], # Ensure linker also knows about AVX2
@@ -14,7 +14,6 @@ cc_library(
"@com_google_absl//absl/debugging:failure_signal_handler",
"@com_google_absl//absl/debugging:stacktrace",
"@com_google_absl//absl/debugging:symbolize",
- "@com_google_benchmark//:benchmark",
],
visibility = ["//visibility:public"],
)
@@ -64,12 +63,3 @@ cc_test(
"@com_google_googletest//:gtest",
],
)
-
-cc_test(
- name = "benchmark",
- srcs = ["benchmark.cc"],
- deps = [
- ":fury_util",
- "@com_google_benchmark//:benchmark",
- ],
-)
\ No newline at end of file
diff --git a/cpp/fury/util/benchmark.cc b/cpp/fury/util/benchmark.cc
deleted file mode 100644
index e911a70c..00000000
--- a/cpp/fury/util/benchmark.cc
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#include <benchmark/benchmark.h>
-
-#include <codecvt>
-#include <locale>
-#include <random>
-
-#include "string_util.h"
-
-#include <cstring>
-#include <string>
-
-// Function to generate a random UTF-16 string
-std::u16string generateRandomUTF16String(size_t length) {
- const char charset[] =
- "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
- std::default_random_engine rng(std::random_device{}());
- std::uniform_int_distribution<> dist(0, sizeof(charset) - 2);
-
- std::u16string result;
- result.reserve(length);
- for (size_t i = 0; i < length; ++i) {
- result += static_cast<char16_t>(charset[dist(rng)]);
- }
-
- return result;
-}
-
-std::vector<std::u16string> generateUTF16String(size_t num_tests) {
- std::vector<std::u16string> test_strings;
- for (size_t i = 0; i < num_tests; ++i) {
- test_strings.push_back(generateRandomUTF16String(num_tests));
- }
- return test_strings;
-}
-
-const std::vector<std::u16string> test_strings = generateUTF16String(1000);
-
-// UTF16 to UTF8 using the standard library
-std::string utf16ToUtf8StandardLibrary(const std::u16string &utf16) {
- std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t> convert;
- return convert.to_bytes(utf16);
-}
-
-// UTF16 to UTF8 baseline conversion (without SIMD)
-std::string utf16ToUtf8BaseLine(const std::u16string &utf16,
- bool is_little_endian = true) {
- size_t utf16_length = utf16.length();
- size_t utf8_length = utf16_length * 3;
- std::string utf8_result(utf8_length, '\0');
-
- size_t i = 0, j = 0;
- while (i < utf16_length) {
- char16_t utf16_char = utf16[i++];
- if (utf16_char < 0x80) {
- utf8_result[j++] = static_cast<char>(utf16_char);
- } else if (utf16_char < 0x800) {
- utf8_result[j++] = static_cast<char>(0xC0 | (utf16_char >> 6));
- utf8_result[j++] = static_cast<char>(0x80 | (utf16_char & 0x3F));
- } else {
- utf8_result[j++] = static_cast<char>(0xE0 | (utf16_char >> 12));
- utf8_result[j++] = static_cast<char>(0x80 | ((utf16_char >> 6) & 0x3F));
- utf8_result[j++] = static_cast<char>(0x80 | (utf16_char & 0x3F));
- }
- }
-
- utf8_result.resize(j);
- return utf8_result;
-}
-
-// Benchmark function for Standard Library UTF-16 to UTF-8 conversion
-static void BM_StandardLibrary(benchmark::State &state) {
- for (auto _ : state) {
- for (const auto &str : test_strings) {
- std::string utf8 = utf16ToUtf8StandardLibrary(str);
- }
- }
-}
-
-// Benchmark function for Baseline UTF-16 to UTF-8 conversion
-static void BM_BaseLine(benchmark::State &state) {
- for (auto _ : state) {
- for (const auto &str : test_strings) {
- std::string utf8 = utf16ToUtf8BaseLine(str, true);
- }
- }
-}
-
-// Benchmark function for SIMD-based UTF-16 to UTF-8 conversion
-static void BM_SIMD(benchmark::State &state) {
- for (auto _ : state) {
- for (const auto &str : test_strings) {
- std::string utf8 = fury::utf16ToUtf8(str, true);
- }
- }
-}
-
-BENCHMARK(BM_StandardLibrary);
-BENCHMARK(BM_BaseLine);
-BENCHMARK(BM_SIMD);
-BENCHMARK_MAIN();
diff --git a/cpp/fury/util/string_util_test.cc
b/cpp/fury/util/string_util_test.cc
index f2a2fdab..2f267e14 100644
--- a/cpp/fury/util/string_util_test.cc
+++ b/cpp/fury/util/string_util_test.cc
@@ -17,7 +17,6 @@
* under the License.
*/
-#include <chrono>
#include <codecvt>
#include <iostream>
#include <locale>
@@ -30,8 +29,8 @@
namespace fury {
-// Function to generate a random string
-std::string generateRandomString(size_t length) {
+// Generate ASCII string
+std::string generateAscii(size_t length) {
const char charset[] =
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
std::default_random_engine rng(std::random_device{}());
@@ -42,57 +41,26 @@ std::string generateRandomString(size_t length) {
for (size_t i = 0; i < length; ++i) {
result += charset[dist(rng)];
}
-
return result;
}
-bool isAscii_BaseLine(const std::string &str) {
- for (char c : str) {
- if (static_cast<unsigned char>(c) >= 128) {
- return false;
- }
- }
- return true;
-}
-
-TEST(StringUtilTest, TestisAsciiFunctions) {
- std::string testStr = generateRandomString(100000);
- auto start_time = std::chrono::high_resolution_clock::now();
- bool result = isAscii_BaseLine(testStr);
- auto end_time = std::chrono::high_resolution_clock::now();
- auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(
- end_time - start_time)
- .count();
- FURY_LOG(FURY_INFO) << "BaseLine Running Time: " << duration << " ns.";
-
- start_time = std::chrono::high_resolution_clock::now();
- result = isAscii(testStr);
- end_time = std::chrono::high_resolution_clock::now();
- duration = std::chrono::duration_cast<std::chrono::nanoseconds>(end_time -
- start_time)
- .count();
- FURY_LOG(FURY_INFO) << "Optimized Running Time: " << duration << " ns.";
-
- EXPECT_TRUE(result);
-}
-
TEST(StringUtilTest, TestisAsciiLogic) {
- // Test strings with only Latin characters
+ // Test strings with only Ascii characters
EXPECT_TRUE(isAscii("Fury"));
- EXPECT_TRUE(isAscii(generateRandomString(80)));
+ EXPECT_TRUE(isAscii(generateAscii(80)));
- // Test unaligned strings with only Latin characters
- EXPECT_TRUE(isAscii(generateRandomString(80) + "1"));
- EXPECT_TRUE(isAscii(generateRandomString(80) + "12"));
- EXPECT_TRUE(isAscii(generateRandomString(80) + "123"));
+ // Test unaligned strings with only Ascii characters
+ EXPECT_TRUE(isAscii(generateAscii(80) + "1"));
+ EXPECT_TRUE(isAscii(generateAscii(80) + "12"));
+ EXPECT_TRUE(isAscii(generateAscii(80) + "123"));
- // Test strings with non-Latin characters
+ // Test strings with non-Ascii characters
EXPECT_FALSE(isAscii("你好, Fury"));
- EXPECT_FALSE(isAscii(generateRandomString(80) + "你好"));
- EXPECT_FALSE(isAscii(generateRandomString(80) + "1你好"));
- EXPECT_FALSE(isAscii(generateRandomString(11) + "你"));
- EXPECT_FALSE(isAscii(generateRandomString(10) + "你好"));
- EXPECT_FALSE(isAscii(generateRandomString(9) + "性能好"));
+ EXPECT_FALSE(isAscii(generateAscii(80) + "你好"));
+ EXPECT_FALSE(isAscii(generateAscii(80) + "1你好"));
+ EXPECT_FALSE(isAscii(generateAscii(11) + "你"));
+ EXPECT_FALSE(isAscii(generateAscii(10) + "你好"));
+ EXPECT_FALSE(isAscii(generateAscii(9) + "性能好"));
EXPECT_FALSE(isAscii("\u1234"));
EXPECT_FALSE(isAscii("a\u1234"));
EXPECT_FALSE(isAscii("ab\u1234"));
@@ -249,7 +217,7 @@ std::string generateRandomUTF8String(size_t length) {
str.push_back(0xE0 | (code_point >> 12));
str.push_back(0x80 | ((code_point >> 6) & 0x3F));
str.push_back(0x80 | (code_point & 0x3F));
- } else if (code_point <= 0x10FFFF) {
+ } else {
str.push_back(0xF0 | (code_point >> 18));
str.push_back(0x80 | ((code_point >> 12) & 0x3F));
str.push_back(0x80 | ((code_point >> 6) & 0x3F));
@@ -260,78 +228,6 @@ std::string generateRandomUTF8String(size_t length) {
return str;
}
-std::u16string utf8ToUtf16BaseLine(const std::string &utf8,
- bool is_little_endian) {
- std::u16string utf16; // Resulting UTF-16 string
- size_t i = 0; // Index for traversing the UTF-8 string
- size_t n = utf8.size(); // Total length of the UTF-8 string
-
- // Loop through each byte of the UTF-8 string
- while (i < n) {
- uint32_t code_point = 0; // The Unicode code point
- unsigned char c = utf8[i]; // Current byte of the UTF-8 string
-
- // Determine the number of bytes for this character based on its first byte
- if ((c & 0x80) == 0) {
- // 1-byte character (ASCII)
- code_point = c;
- ++i;
- } else if ((c & 0xE0) == 0xC0) {
- // 2-byte character
- code_point = c & 0x1F;
- code_point = (code_point << 6) | (utf8[i + 1] & 0x3F);
- i += 2;
- } else if ((c & 0xF0) == 0xE0) {
- // 3-byte character
- code_point = c & 0x0F;
- code_point = (code_point << 6) | (utf8[i + 1] & 0x3F);
- code_point = (code_point << 6) | (utf8[i + 2] & 0x3F);
- i += 3;
- } else if ((c & 0xF8) == 0xF0) {
- // 4-byte character
- code_point = c & 0x07;
- code_point = (code_point << 6) | (utf8[i + 1] & 0x3F);
- code_point = (code_point << 6) | (utf8[i + 2] & 0x3F);
- code_point = (code_point << 6) | (utf8[i + 3] & 0x3F);
- i += 4;
- } else {
- // Invalid UTF-8 byte sequence
- throw std::invalid_argument("Invalid UTF-8 encoding.");
- }
-
- // If the code point is beyond the BMP range, use surrogate pairs
- if (code_point >= 0x10000) {
- code_point -= 0x10000; // Subtract 0x10000 to get the surrogate pair
- uint16_t high_surrogate = 0xD800 + (code_point >> 10); // High surrogate
- uint16_t low_surrogate = 0xDC00 + (code_point & 0x3FF); // Low surrogate
-
- // If not little-endian, swap bytes of the surrogates
- if (!is_little_endian) {
- high_surrogate = (high_surrogate >> 8) | (high_surrogate << 8);
- low_surrogate = (low_surrogate >> 8) | (low_surrogate << 8);
- }
-
- // Add both high and low surrogates to the UTF-16 string
- utf16.push_back(high_surrogate);
- utf16.push_back(low_surrogate);
- } else {
- // For code points within the BMP range, directly store as a 16-bit value
- uint16_t utf16_char = static_cast<uint16_t>(code_point);
-
- // If not little-endian, swap the bytes of the 16-bit character
- if (!is_little_endian) {
- utf16_char = (utf16_char >> 8) | (utf16_char << 8);
- }
-
- // Add the UTF-16 character to the string
- utf16.push_back(utf16_char);
- }
- }
-
- // Return the resulting UTF-16 string
- return utf16;
-}
-
// Testing Basic Logic
TEST(UTF8ToUTF16Test, BasicConversion) {
std::string utf8 = u8"Hello, 世界!";
@@ -396,75 +292,6 @@ TEST(UTF8ToUTF16Test, RoundTripConversion) {
ASSERT_EQ(original_utf8, utf8_converted_back);
}
-// Testing Performance
-TEST(UTF8ToUTF16Test, PerformanceTest) {
- const size_t num_tests = 1000;
- const size_t string_length = 1000;
- // Default little_endian
- bool is_little_endian = true;
-
- // Random UTF-8
- std::vector<std::string> test_strings;
- for (size_t i = 0; i < num_tests; ++i) {
- test_strings.push_back(generateRandomUTF8String(string_length));
- }
-
- // Standard Library
- try {
- auto start_time = std::chrono::high_resolution_clock::now();
- std::wstring_convert<std::codecvt_utf8<wchar_t>, wchar_t> convert;
- // Loop through test strings and convert each UTF-8 string to UTF-16
- for (const auto &str : test_strings) {
- std::wstring wide_str = convert.from_bytes(str);
- std::u16string utf16;
- for (wchar_t wc : wide_str) {
- utf16.push_back(static_cast<char16_t>(wc));
- }
- }
- auto end_time = std::chrono::high_resolution_clock::now();
- auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(
- end_time - start_time)
- .count();
- FURY_LOG(FURY_INFO) << "Standard Library Running Time: " << duration
- << " ns";
- } catch (const std::exception &e) {
- FURY_LOG(FURY_FATAL) << "Caught exception in standard library conversion: "
- << e.what();
- }
-
- // BaseLine
- try {
- auto start_time = std::chrono::high_resolution_clock::now();
- for (const auto &str : test_strings) {
- std::u16string utf16 = utf8ToUtf16BaseLine(str, is_little_endian);
- }
- auto end_time = std::chrono::high_resolution_clock::now();
- auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(
- end_time - start_time)
- .count();
- FURY_LOG(FURY_INFO) << "BaseLine Running Time: " << duration << " ns";
- } catch (const std::exception &e) {
- FURY_LOG(FURY_FATAL) << "Caught exception in baseline conversion: "
- << e.what();
- }
-
- // Optimized (SIMD)
- try {
- auto start_time = std::chrono::high_resolution_clock::now();
- for (const auto &str : test_strings) {
- std::u16string utf16 = fury::utf8ToUtf16(str, is_little_endian);
- }
- auto end_time = std::chrono::high_resolution_clock::now();
- auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(
- end_time - start_time)
- .count();
- FURY_LOG(FURY_INFO) << "SIMD Optimized Running Time: " << duration << "
ns";
- } catch (const std::exception &e) {
- FURY_LOG(FURY_FATAL) << "Caught exception in SIMD optimized conversion: "
- << e.what();
- }
-}
-
} // namespace fury
int main(int argc, char **argv) {
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]