This is an automated email from the ASF dual-hosted git repository.
fokko pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-cpp.git
The following commit(s) were added to refs/heads/main by this push:
new a89b101 feat: add UUID representation (#242)
a89b101 is described below
commit a89b10105112942445daf6b63f1dcd9ec2f2e41a
Author: Junwang Zhao <[email protected]>
AuthorDate: Mon Sep 29 23:28:44 2025 +0800
feat: add UUID representation (#242)
UUID representation along with utilities such as generators for v4 and
v7.
---
src/iceberg/CMakeLists.txt | 3 +-
src/iceberg/util/uuid.cc | 220 +++++++++++++++++++++++++++++++++++++++++++++
src/iceberg/util/uuid.h | 85 ++++++++++++++++++
test/CMakeLists.txt | 1 +
test/uuid_test.cc | 117 ++++++++++++++++++++++++
5 files changed, 425 insertions(+), 1 deletion(-)
diff --git a/src/iceberg/CMakeLists.txt b/src/iceberg/CMakeLists.txt
index c8fb077..747d4c4 100644
--- a/src/iceberg/CMakeLists.txt
+++ b/src/iceberg/CMakeLists.txt
@@ -52,7 +52,8 @@ set(ICEBERG_SOURCES
util/decimal.cc
util/murmurhash3_internal.cc
util/timepoint.cc
- util/gzip_internal.cc)
+ util/gzip_internal.cc
+ util/uuid.cc)
set(ICEBERG_STATIC_BUILD_INTERFACE_LIBS)
set(ICEBERG_SHARED_BUILD_INTERFACE_LIBS)
diff --git a/src/iceberg/util/uuid.cc b/src/iceberg/util/uuid.cc
new file mode 100644
index 0000000..1425675
--- /dev/null
+++ b/src/iceberg/util/uuid.cc
@@ -0,0 +1,220 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "iceberg/util/uuid.h"
+
+#include <array>
+#include <chrono>
+#include <cstdint>
+#include <cstring>
+#include <format>
+#include <limits>
+#include <random>
+#include <string>
+
+#include "iceberg/exception.h"
+#include "iceberg/result.h"
+#include "iceberg/util/formatter.h" // IWYU pragma: keep
+#include "iceberg/util/int128.h"
+#include "iceberg/util/macros.h"
+
+namespace iceberg {
+
+namespace {
+
+// Builds a 256-entry lookup table indexed by character code. Entries for
+// '0'-'9', 'a'-'f' and 'A'-'F' hold the digit value 0-15; every other entry
+// holds the sentinel 0xFF.
+constexpr std::array<uint8_t, 256> BuildHexTable() {
+  std::array<uint8_t, 256> table{};
+  // Start with every code marked as "not a hex digit".
+  for (auto& entry : table) {
+    entry = 0xFF;
+  }
+  // Then overwrite the three digit ranges with their numeric values.
+  for (int32_t c = '0'; c <= '9'; ++c) {
+    table[c] = static_cast<uint8_t>(c - '0');
+  }
+  for (int32_t c = 'a'; c <= 'f'; ++c) {
+    table[c] = static_cast<uint8_t>(c - 'a' + 10);
+  }
+  for (int32_t c = 'A'; c <= 'F'; ++c) {
+    table[c] = static_cast<uint8_t>(c - 'A' + 10);
+  }
+  return table;
+}
+
+// Builds a 256-entry table whose i-th entry is the low eight bits of (i << 4),
+// so a digit value can be moved into the high nibble with a single lookup.
+constexpr std::array<uint8_t, 256> BuildShl4Table() {
+  std::array<uint8_t, 256> table{};
+  int32_t value = 0;
+  for (auto& entry : table) {
+    entry = static_cast<uint8_t>(value << 4);
+    ++value;
+  }
+  return table;
+}
+
+// Lookup tables evaluated at compile time: kHexTable maps a character code to
+// its hex digit value (0xFF when it is not a hex digit), and kShl4Table maps a
+// byte to that byte shifted left by four bits.
+constexpr auto kHexTable = BuildHexTable();
+constexpr auto kShl4Table = BuildShl4Table();
+
+// Decodes a 32-character UUID written without separators, for example
+// "67e5504410b1426f9247bb680e5fe0c8". Each pair of hex digits becomes one
+// byte; a non-hex character produces an InvalidArgument error.
+inline Result<Uuid> ParseSimple(std::string_view s) {
+  ICEBERG_DCHECK(s.size() == 32, "s must be 32 characters long");
+
+  std::array<uint8_t, 16> bytes{};
+  size_t pos = 0;
+  for (auto& byte : bytes) {
+    const uint8_t hi = kHexTable[static_cast<uint8_t>(s[pos])];
+    const uint8_t lo = kHexTable[static_cast<uint8_t>(s[pos + 1])];
+    pos += 2;
+
+    // Valid digits never exceed 0x0F, so the OR is 0xFF only when at least one
+    // lookup returned the invalid-character sentinel.
+    if ((hi | lo) == 0xFF) [[unlikely]] {
+      return InvalidArgument("Invalid UUID string: {}", s);
+    }
+
+    byte = static_cast<uint8_t>(kShl4Table[hi] | lo);
+  }
+  return Uuid(std::move(bytes));
+}
+
+// Decodes a 36-character UUID in the 8-4-4-4-12 layout, for example
+// "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx". Misplaced dashes or non-hex digits
+// produce an InvalidArgument error.
+inline Result<Uuid> ParseHyphenated(std::string_view s) {
+  ICEBERG_DCHECK(s.size() == 36, "s must be 36 characters long");
+
+  // The four separators must sit at these fixed offsets.
+  const bool dashes_ok =
+      s[8] == '-' && s[13] == '-' && s[18] == '-' && s[23] == '-';
+  if (!dashes_ok) [[unlikely]] {
+    return InvalidArgument("Invalid UUID string: {}", s);
+  }
+
+  // Start offset of each run of four hex digits (two output bytes); the
+  // offsets step over the dash positions.
+  constexpr std::array<size_t, 8> kQuadStarts = {0, 4, 9, 14, 19, 24, 28, 32};
+  std::array<uint8_t, 16> bytes{};
+
+  size_t out = 0;
+  for (const size_t start : kQuadStarts) {
+    const uint8_t d0 = kHexTable[static_cast<uint8_t>(s[start])];
+    const uint8_t d1 = kHexTable[static_cast<uint8_t>(s[start + 1])];
+    const uint8_t d2 = kHexTable[static_cast<uint8_t>(s[start + 2])];
+    const uint8_t d3 = kHexTable[static_cast<uint8_t>(s[start + 3])];
+
+    // Any invalid digit contributes the 0xFF sentinel to the OR.
+    if ((d0 | d1 | d2 | d3) == 0xFF) [[unlikely]] {
+      return InvalidArgument("Invalid UUID string: {}", s);
+    }
+
+    bytes[out] = static_cast<uint8_t>(kShl4Table[d0] | d1);
+    bytes[out + 1] = static_cast<uint8_t>(kShl4Table[d2] | d3);
+    out += 2;
+  }
+
+  return Uuid(std::move(bytes));
+}
+
+} // namespace
+
+// Stores the given bytes unchanged; the constructor performs no validation of
+// the version or variant fields.
+Uuid::Uuid(std::array<uint8_t, kLength> data) : data_{data} {}
+
+// Generates a version 4 UUID: random bytes with the version and variant
+// fields overwritten as required by RFC 9562.
+//
+// The engine is thread_local, so concurrent callers never touch the same
+// generator state, and it is seeded through std::seed_seq from eight
+// std::random_device words instead of a single 32-bit value.
+Uuid Uuid::GenerateV4() {
+  thread_local std::mt19937_64 gen = [] {
+    std::random_device rd;
+    std::seed_seq seed{rd(), rd(), rd(), rd(), rd(), rd(), rd(), rd()};
+    return std::mt19937_64(seed);
+  }();
+
+  // Each draw from mt19937_64 is a uniformly distributed 64-bit value, so two
+  // draws cover all 128 bits.
+  const uint64_t high_bits = gen();
+  const uint64_t low_bits = gen();
+
+  // Lay the two words out most-significant byte first, independent of the
+  // host byte order.
+  std::array<uint8_t, kLength> uuid{};
+  for (size_t i = 0; i < 8; ++i) {
+    const size_t shift = 8 * (7 - i);
+    uuid[i] = static_cast<uint8_t>(high_bits >> shift);
+    uuid[i + 8] = static_cast<uint8_t>(low_bits >> shift);
+  }
+
+  // Set the version field to 4 (high nibble of byte 6),
+  // see https://datatracker.ietf.org/doc/html/rfc9562#name-uuid-version-4
+  uuid[6] = static_cast<uint8_t>((uuid[6] & 0x0F) | 0x40);
+  // Set the variant field: the top two bits of byte 8 are 1, 0.
+  uuid[8] = static_cast<uint8_t>((uuid[8] & 0x3F) | 0x80);
+
+  return Uuid(uuid);
+}
+
+// Generates a version 7 UUID stamped with the current system_clock time,
+// truncated to whole milliseconds since the Unix epoch.
+Uuid Uuid::GenerateV7() {
+  using std::chrono::duration_cast;
+  using std::chrono::milliseconds;
+  using std::chrono::system_clock;
+
+  const auto elapsed = system_clock::now().time_since_epoch();
+  const auto unix_ts_ms = duration_cast<milliseconds>(elapsed).count();
+
+  return GenerateV7(static_cast<uint64_t>(unix_ts_ms));
+}
+
+// Generates a version 7 UUID from the given millisecond timestamp.
+//
+// Bytes 0-5 hold the low 48 bits of unix_ts_ms, most significant byte first;
+// any higher bits of the argument are discarded. Bytes 6-15 are random, with
+// the version and variant fields overwritten afterwards.
+//
+// The engine is thread_local, so concurrent callers never touch the same
+// generator state, and it is seeded through std::seed_seq from eight
+// std::random_device words instead of a single 32-bit value.
+Uuid Uuid::GenerateV7(uint64_t unix_ts_ms) {
+  std::array<uint8_t, kLength> uuid{};
+
+  // Set the timestamp (in milliseconds since Unix epoch), big-endian.
+  for (size_t i = 0; i < 6; ++i) {
+    uuid[i] = static_cast<uint8_t>(unix_ts_ms >> (8 * (5 - i)));
+  }
+
+  thread_local std::mt19937_64 gen = [] {
+    std::random_device rd;
+    std::seed_seq seed{rd(), rd(), rd(), rd(), rd(), rd(), rd(), rd()};
+    return std::mt19937_64(seed);
+  }();
+
+  // Ten random bytes are needed: eight come from the first draw and two from
+  // the second.
+  const uint64_t first = gen();
+  const uint64_t second = gen();
+  for (size_t i = 0; i < 8; ++i) {
+    uuid[6 + i] = static_cast<uint8_t>(first >> (8 * i));
+  }
+  uuid[14] = static_cast<uint8_t>(second);
+  uuid[15] = static_cast<uint8_t>(second >> 8);
+
+  // Set the version field to 7 (high nibble of byte 6),
+  // see https://www.rfc-editor.org/rfc/rfc9562#name-version-field
+  uuid[6] = static_cast<uint8_t>((uuid[6] & 0x0F) | 0x70);
+  // Set the variant field: the top two bits of byte 8 are 1, 0.
+  uuid[8] = static_cast<uint8_t>((uuid[8] & 0x3F) | 0x80);
+
+  return Uuid(uuid);
+}
+
+// Parses a UUID from text. A 32-character input is treated as the undashed
+// form and a 36-character input as the dashed form; any other length is an
+// InvalidArgument error.
+Result<Uuid> Uuid::FromString(std::string_view str) {
+  switch (str.size()) {
+    case 32:
+      return ParseSimple(str);
+    case 36:
+      return ParseHyphenated(str);
+    default:
+      return InvalidArgument("Invalid UUID string: {}", str);
+  }
+}
+
+// Builds a UUID by copying exactly kLength bytes from the span; any other
+// span size is an InvalidArgument error.
+Result<Uuid> Uuid::FromBytes(std::span<const uint8_t> bytes) {
+  if (bytes.size() == kLength) [[likely]] {
+    std::array<uint8_t, kLength> copied;
+    std::memcpy(copied.data(), bytes.data(), kLength);
+    return Uuid(std::move(copied));
+  }
+  return InvalidArgument("UUID byte array must be exactly {} bytes, was {}", kLength,
+                         bytes.size());
+}
+
+// Returns the byte stored at `index`. The ICEBERG_CHECK on the first line
+// enforces index < kLength before data_ is read.
+// NOTE(review): the macro is defined outside this file; it presumably raises
+// the IcebergError mentioned in the header documentation - confirm.
+uint8_t Uuid::operator[](size_t index) const {
+ ICEBERG_CHECK(index < kLength, "UUID index out of range: {}", index);
+ return data_[index];
+}
+
+// Formats the UUID as lowercase hex in the 8-4-4-4-12 layout, with every byte
+// printed as two zero-padded digits.
+std::string Uuid::ToString() const {
+  const auto& b = data_;
+  // One literal per dash-separated group; adjacent literals concatenate into a
+  // single format string.
+  return std::format(
+      "{:02x}{:02x}{:02x}{:02x}-"
+      "{:02x}{:02x}-"
+      "{:02x}{:02x}-"
+      "{:02x}{:02x}-"
+      "{:02x}{:02x}{:02x}{:02x}{:02x}{:02x}",
+      b[0], b[1], b[2], b[3], b[4], b[5], b[6], b[7], b[8], b[9], b[10], b[11],
+      b[12], b[13], b[14], b[15]);
+}
+
+} // namespace iceberg
diff --git a/src/iceberg/util/uuid.h b/src/iceberg/util/uuid.h
new file mode 100644
index 0000000..64db7c5
--- /dev/null
+++ b/src/iceberg/util/uuid.h
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+#include <array>
+#include <cstdint>
+#include <span>
+#include <string_view>
+
+#include "iceberg/iceberg_export.h"
+#include "iceberg/result.h"
+#include "iceberg/util/formattable.h"
+
+/// \file iceberg/util/uuid.h
+/// \brief UUID (Universally Unique Identifier) representation.
+
+namespace iceberg {
+
+/// \brief A UUID held as 16 raw bytes.
+class ICEBERG_EXPORT Uuid : public util::Formattable {
+ public:
+  /// \brief Number of bytes in a UUID.
+  static constexpr size_t kLength = 16;
+
+  /// A Uuid is always built from explicit bytes, so there is no default
+  /// constructor.
+  Uuid() = delete;
+
+  /// \brief Construct a UUID from kLength raw bytes.
+  explicit Uuid(std::array<uint8_t, kLength> data);
+
+  /// \brief Generate a random (version 4) UUID.
+  static Uuid GenerateV4();
+
+  /// \brief Generate a version 7 UUID per RFC 9562 using the current time.
+  static Uuid GenerateV7();
+
+  /// \brief Generate a version 7 UUID per RFC 9562 using the given time.
+  ///
+  /// A version 7 UUID is made of a 48-bit Unix timestamp in milliseconds
+  /// followed by 74 random bits, not counting the mandatory version and
+  /// variant bits.
+  ///
+  /// \param unix_ts_ms milliseconds elapsed since the start of the UNIX epoch
+  ///
+  /// \note The RFC does not allow a negative unix_ts_ms.
+  static Uuid GenerateV7(uint64_t unix_ts_ms);
+
+  /// \brief Parse a UUID from its standard textual form.
+  static Result<Uuid> FromString(std::string_view str);
+
+  /// \brief Build a UUID from a 16-byte sequence.
+  static Result<Uuid> FromBytes(std::span<const uint8_t> bytes);
+
+  /// \brief View of the raw bytes; the span refers to storage inside this
+  /// object.
+  std::span<const uint8_t> bytes() const { return data_; }
+
+  /// \brief Read a single byte of the UUID.
+  /// \param index Position of the byte to read (0-15).
+  /// \return The byte stored at that position.
+  /// \throw IcebergError if index is out of bounds.
+  uint8_t operator[](size_t index) const;
+
+  /// \brief Render the UUID in its standard textual form.
+  std::string ToString() const override;
+
+  /// \brief Two UUIDs are equal when all of their bytes are equal.
+  friend bool operator==(const Uuid& lhs, const Uuid& rhs) {
+    return lhs.data_ == rhs.data_;
+  }
+
+ private:
+  std::array<uint8_t, kLength> data_;
+};
+
+} // namespace iceberg
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index d70b4f8..3c74735 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -91,6 +91,7 @@ add_iceberg_test(util_test
endian_test.cc
formatter_test.cc
string_util_test.cc
+ uuid_test.cc
visit_type_test.cc)
add_iceberg_test(roaring_test SOURCES roaring_test.cc)
diff --git a/test/uuid_test.cc b/test/uuid_test.cc
new file mode 100644
index 0000000..3dbe573
--- /dev/null
+++ b/test/uuid_test.cc
@@ -0,0 +1,117 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "iceberg/util/uuid.h"
+
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include "matchers.h"
+
+namespace iceberg {
+
+// Smoke test for GenerateV4: checks the size, version nibble and variant bits.
+TEST(UUIDUtilTest, GenerateV4) {
+  const auto generated = Uuid::GenerateV4();
+  // The call succeeds and exposes the full byte width.
+  EXPECT_EQ(generated.bytes().size(), Uuid::kLength);
+  // The version lives in the high nibble of the 7th byte and must be 4.
+  const auto version = (generated[6] >> 4) & 0x0F;
+  EXPECT_EQ(version, 4);
+  // The variant lives in the top two bits of the 9th byte and must be 10.
+  const auto variant = (generated[8] >> 6) & 0x03;
+  EXPECT_EQ(variant, 0b10);
+}
+
+// Smoke test for GenerateV7: checks the size, version nibble and variant bits.
+TEST(UUIDUtilTest, GenerateV7) {
+  const auto generated = Uuid::GenerateV7();
+  // The call succeeds and exposes the full byte width.
+  EXPECT_EQ(generated.bytes().size(), 16);
+  // The version lives in the high nibble of the 7th byte and must be 7.
+  const auto version = (generated[6] >> 4) & 0x0F;
+  EXPECT_EQ(version, 7);
+  // The variant lives in the top two bits of the 9th byte and must be 10.
+  const auto variant = (generated[8] >> 6) & 0x03;
+  EXPECT_EQ(variant, 0b10);
+}
+
+// Round-trips valid UUID strings through FromString and ToString, covering the
+// dashed form and the undashed form (including upper-case digits).
+TEST(UUIDUtilTest, FromString) {
+  std::vector<std::string> uuid_strings = {
+      "123e4567-e89b-12d3-a456-426614174000",
+      "550e8400-e29b-41d4-a716-446655440000",
+      "f47ac10b-58cc-4372-a567-0e02b2c3d479",
+  };
+
+  for (const auto& uuid_str : uuid_strings) {
+    auto result = Uuid::FromString(uuid_str);
+    // Fatal assertion: value() must not be reached when parsing failed.
+    ASSERT_THAT(result, IsOk());
+    auto uuid = result.value();
+    EXPECT_EQ(uuid.ToString(), uuid_str);
+  }
+
+  // Undashed input paired with the dashed, lower-case text ToString produces.
+  std::vector<std::pair<std::string, std::string>> uuid_string_pairs = {
+      {"123e4567e89b12d3a456426614174000", "123e4567-e89b-12d3-a456-426614174000"},
+      {"550E8400E29B41D4A716446655440000", "550e8400-e29b-41d4-a716-446655440000"},
+      {"F47AC10B58CC4372A5670E02B2C3D479", "f47ac10b-58cc-4372-a567-0e02b2c3d479"},
+  };
+
+  for (const auto& [input_str, expected_str] : uuid_string_pairs) {
+    auto result = Uuid::FromString(input_str);
+    // Fatal assertion: value() must not be reached when parsing failed.
+    ASSERT_THAT(result, IsOk());
+    auto uuid = result.value();
+    EXPECT_EQ(uuid.ToString(), expected_str);
+  }
+}
+
+// Every malformed input must be rejected with kInvalidArgument and the
+// "Invalid UUID string" message.
+TEST(UUIDUtilTest, FromStringInvalid) {
+  const std::vector<std::string> malformed_inputs = {
+      "123e4567-e89b-12d3-a456-42661417400",    // too short
+      "123e4567-e89b-12d3-a456-4266141740000",  // too long
+      "g23e4567-e89b-12d3-a456-426614174000",   // invalid character
+      "123e4567e89b12d3a45642661417400",        // too short without dashes
+      "123e4567e89b12d3a4564266141740000",      // too long without dashes
+      "550e8400-e29b-41d4-a716-44665544000Z",   // invalid character at end
+      "550e8400-e29b-41d4-a716-44665544000-",   // invalid character at end
+      "550e8400-e29b-41d4-a716-4466554400",     // too short
+  };
+
+  for (const std::string& candidate : malformed_inputs) {
+    auto parsed = Uuid::FromString(candidate);
+    EXPECT_THAT(parsed, IsError(ErrorKind::kInvalidArgument));
+    EXPECT_THAT(parsed, HasErrorMessage("Invalid UUID string"));
+  }
+}
+
+// FromBytes on a 16-byte array must yield the same UUID as the constructor and
+// the expected textual form.
+TEST(UUIDUtilTest, FromBytes) {
+  std::array<uint8_t, Uuid::kLength> bytes = {0x12, 0x3e, 0x45, 0x67, 0xe8, 0x9b,
+                                              0x12, 0xd3, 0xa4, 0x56, 0x42, 0x66,
+                                              0x14, 0x17, 0x40, 0x00};
+  auto result = Uuid::FromBytes(bytes);
+  // Fatal assertion: value() must not be reached when construction failed.
+  ASSERT_THAT(result, IsOk());
+  auto uuid = result.value();
+  EXPECT_EQ(uuid.ToString(), "123e4567-e89b-12d3-a456-426614174000");
+  EXPECT_EQ(uuid, Uuid(bytes));
+}
+
+// A span one byte shorter than kLength must be rejected with kInvalidArgument.
+TEST(UUIDUtilTest, FromBytesInvalid) {
+  std::array<uint8_t, Uuid::kLength - 1> truncated = {
+      0x12, 0x3e, 0x45, 0x67, 0xe8, 0x9b, 0x12, 0xd3,
+      0xa4, 0x56, 0x42, 0x66, 0x14, 0x17, 0x40};
+  auto outcome = Uuid::FromBytes(truncated);
+  EXPECT_THAT(outcome, IsError(ErrorKind::kInvalidArgument));
+  EXPECT_THAT(outcome, HasErrorMessage("UUID byte array must be exactly 16 bytes"));
+}
+
+} // namespace iceberg