This is an automated email from the ASF dual-hosted git repository.

fokko pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-cpp.git


The following commit(s) were added to refs/heads/main by this push:
     new a89b101  feat: add UUID representation (#242)
a89b101 is described below

commit a89b10105112942445daf6b63f1dcd9ec2f2e41a
Author: Junwang Zhao <[email protected]>
AuthorDate: Mon Sep 29 23:28:44 2025 +0800

    feat: add UUID representation (#242)
    
    UUID representation along with utilities such as generators for v4 and
    v7.
---
 src/iceberg/CMakeLists.txt |   3 +-
 src/iceberg/util/uuid.cc   | 220 +++++++++++++++++++++++++++++++++++++++++++++
 src/iceberg/util/uuid.h    |  85 ++++++++++++++++++
 test/CMakeLists.txt        |   1 +
 test/uuid_test.cc          | 117 ++++++++++++++++++++++++
 5 files changed, 425 insertions(+), 1 deletion(-)

diff --git a/src/iceberg/CMakeLists.txt b/src/iceberg/CMakeLists.txt
index c8fb077..747d4c4 100644
--- a/src/iceberg/CMakeLists.txt
+++ b/src/iceberg/CMakeLists.txt
@@ -52,7 +52,8 @@ set(ICEBERG_SOURCES
     util/decimal.cc
     util/murmurhash3_internal.cc
     util/timepoint.cc
-    util/gzip_internal.cc)
+    util/gzip_internal.cc
+    util/uuid.cc)
 
 set(ICEBERG_STATIC_BUILD_INTERFACE_LIBS)
 set(ICEBERG_SHARED_BUILD_INTERFACE_LIBS)
diff --git a/src/iceberg/util/uuid.cc b/src/iceberg/util/uuid.cc
new file mode 100644
index 0000000..1425675
--- /dev/null
+++ b/src/iceberg/util/uuid.cc
@@ -0,0 +1,220 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "iceberg/util/uuid.h"
+
+#include <chrono>
+#include <cstdint>
+#include <cstring>
+#include <random>
+#include <string>
+
+#include "iceberg/exception.h"
+#include "iceberg/result.h"
+#include "iceberg/util/formatter.h"  // IWYU pragma: keep
+#include "iceberg/util/int128.h"
+#include "iceberg/util/macros.h"
+
+namespace iceberg {
+
+namespace {
+
+// Marker stored in kHexTable for every byte that is not a hexadecimal digit.
+constexpr uint8_t kNotHex = 0xFF;
+
+// Builds a 256-entry table that maps a character code to its hex nibble value
+// (0-15, either letter case) and every other byte to kNotHex.
+constexpr std::array<uint8_t, 256> BuildHexTable() {
+  std::array<uint8_t, 256> table{};
+  for (auto& entry : table) {
+    entry = kNotHex;
+  }
+  for (int32_t c = '0'; c <= '9'; ++c) {
+    table[c] = static_cast<uint8_t>(c - '0');
+  }
+  for (int32_t c = 'a'; c <= 'f'; ++c) {
+    table[c] = static_cast<uint8_t>(c - 'a' + 10);
+  }
+  for (int32_t c = 'A'; c <= 'F'; ++c) {
+    table[c] = static_cast<uint8_t>(c - 'A' + 10);
+  }
+  return table;
+}
+
+// Builds a table whose entry i holds the low eight bits of (i << 4), i.e. a
+// nibble moved into the high half of a byte.
+constexpr std::array<uint8_t, 256> BuildShl4Table() {
+  std::array<uint8_t, 256> table{};
+  int32_t value = 0;
+  for (auto& entry : table) {
+    entry = static_cast<uint8_t>(value << 4);
+    ++value;
+  }
+  return table;
+}
+
+constexpr auto kHexTable = BuildHexTable();
+constexpr auto kShl4Table = BuildShl4Table();
+
+// Parses the 32-character form without dashes, e.g.
+// "67e5504410b1426f9247bb680e5fe0c8". The caller guarantees the length.
+// Returns InvalidArgument if any character is not a hex digit.
+inline Result<Uuid> ParseSimple(std::string_view s) {
+  ICEBERG_DCHECK(s.size() == 32, "s must be 32 characters long");
+
+  std::array<uint8_t, 16> bytes{};
+  size_t pos = 0;
+  for (auto& byte : bytes) {
+    const uint8_t hi = kHexTable[static_cast<uint8_t>(s[pos])];
+    const uint8_t lo = kHexTable[static_cast<uint8_t>(s[pos + 1])];
+    pos += 2;
+
+    // Valid nibbles never exceed 0x0F, so the OR equals kNotHex exactly when
+    // at least one of the two lookups failed.
+    if ((hi | lo) == kNotHex) [[unlikely]] {
+      return InvalidArgument("Invalid UUID string: {}", s);
+    }
+
+    byte = static_cast<uint8_t>(kShl4Table[hi] | lo);
+  }
+  return Uuid(bytes);
+}
+
+// Parses the 36-character form "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx". The
+// caller guarantees the length. Returns InvalidArgument if a dash is missing
+// from positions 8, 13, 18 or 23, or if any other character is not a hex digit.
+inline Result<Uuid> ParseHyphenated(std::string_view s) {
+  ICEBERG_DCHECK(s.size() == 36, "s must be 36 characters long");
+
+  if (s[8] != '-' || s[13] != '-' || s[18] != '-' || s[23] != '-') [[unlikely]] {
+    return InvalidArgument("Invalid UUID string: {}", s);
+  }
+
+  // Start offset of each run of four hex digits (two output bytes); the
+  // offsets step over the dash positions checked above.
+  constexpr std::array<size_t, 8> kGroupOffsets = {0, 4, 9, 14, 19, 24, 28, 32};
+  std::array<uint8_t, 16> bytes{};
+  size_t out = 0;
+
+  for (const size_t offset : kGroupOffsets) {
+    const uint8_t n0 = kHexTable[static_cast<uint8_t>(s[offset])];
+    const uint8_t n1 = kHexTable[static_cast<uint8_t>(s[offset + 1])];
+    const uint8_t n2 = kHexTable[static_cast<uint8_t>(s[offset + 2])];
+    const uint8_t n3 = kHexTable[static_cast<uint8_t>(s[offset + 3])];
+
+    if ((n0 | n1 | n2 | n3) == kNotHex) [[unlikely]] {
+      return InvalidArgument("Invalid UUID string: {}", s);
+    }
+
+    bytes[out++] = static_cast<uint8_t>(kShl4Table[n0] | n1);
+    bytes[out++] = static_cast<uint8_t>(kShl4Table[n2] | n3);
+  }
+
+  return Uuid(bytes);
+}
+
+}  // namespace
+
+// Stores the given 16 bytes verbatim; no version or variant check is made.
+Uuid::Uuid(std::array<uint8_t, kLength> data) : data_(data) {}
+
+// Generates a version 4 UUID: 122 pseudo-random bits plus the fixed version
+// and variant fields.
+Uuid Uuid::GenerateV4() {
+  // One engine per thread: a Mersenne Twister must not be advanced from two
+  // threads at once, and a process-wide static engine would allow exactly that.
+  // The engine is seeded through a seed_seq fed by several random_device
+  // words, because seeding from a single 32-bit value would restrict it to
+  // 2^32 possible output streams.
+  thread_local std::mt19937_64 gen = [] {
+    std::random_device rd;
+    std::seed_seq seq{rd(), rd(), rd(), rd(), rd(), rd(), rd(), rd()};
+    return std::mt19937_64(seq);
+  }();
+  // A default-constructed distribution covers the full range [0, max].
+  std::uniform_int_distribution<uint64_t> distrib;
+
+  // Fill all 16 bytes from two random 64-bit words.
+  const uint64_t first_half = distrib(gen);
+  const uint64_t second_half = distrib(gen);
+  std::array<uint8_t, kLength> uuid{};
+  std::memcpy(uuid.data(), &first_half, sizeof(first_half));
+  std::memcpy(uuid.data() + sizeof(first_half), &second_half, sizeof(second_half));
+
+  // Set magic numbers for a "version 4" (pseudorandom) UUID and variant,
+  // see https://datatracker.ietf.org/doc/html/rfc9562#name-uuid-version-4
+  uuid[6] = static_cast<uint8_t>((uuid[6] & 0x0F) | 0x40);
+  // Set variant field, top two bits are 1, 0
+  uuid[8] = static_cast<uint8_t>((uuid[8] & 0x3F) | 0x80);
+
+  return Uuid(uuid);
+}
+
+// Generates a version 7 UUID stamped with the current system_clock time.
+Uuid Uuid::GenerateV7() {
+  using std::chrono::duration_cast;
+  using std::chrono::milliseconds;
+  using std::chrono::system_clock;
+
+  // Milliseconds elapsed since the system_clock epoch.
+  const auto elapsed = duration_cast<milliseconds>(system_clock::now().time_since_epoch());
+  return GenerateV7(static_cast<uint64_t>(elapsed.count()));
+}
+
+// Generates a version 7 UUID for the given timestamp.
+//
+// Bytes 0-5 hold the low 48 bits of unix_ts_ms in big-endian order; any higher
+// bits of the argument are dropped. Bytes 6-15 are pseudo-random except for
+// the version nibble in byte 6 and the two variant bits in byte 8.
+Uuid Uuid::GenerateV7(uint64_t unix_ts_ms) {
+  std::array<uint8_t, kLength> uuid{};
+
+  // Set the timestamp (in milliseconds since Unix epoch), most significant
+  // byte first.
+  for (size_t i = 0; i < 6; ++i) {
+    uuid[i] = static_cast<uint8_t>((unix_ts_ms >> (40 - 8 * i)) & 0xFF);
+  }
+
+  // One engine per thread: a Mersenne Twister must not be advanced from two
+  // threads at once, and a process-wide static engine would allow exactly that.
+  // The engine is seeded through a seed_seq fed by several random_device
+  // words, because seeding from a single 32-bit value would restrict it to
+  // 2^32 possible output streams.
+  thread_local std::mt19937_64 gen = [] {
+    std::random_device rd;
+    std::seed_seq seq{rd(), rd(), rd(), rd(), rd(), rd(), rd(), rd()};
+    return std::mt19937_64(seq);
+  }();
+  // A default-constructed distribution covers the full range [0, max].
+  std::uniform_int_distribution<uint64_t> distrib;
+
+  // Generate random bytes for the remaining ten positions (6-15).
+  const uint64_t rand_a = distrib(gen);
+  const uint64_t rand_b = distrib(gen);
+  uuid[6] = static_cast<uint8_t>((rand_a >> 8) & 0xFF);
+  uuid[7] = static_cast<uint8_t>(rand_a & 0xFF);
+  std::memcpy(uuid.data() + 8, &rand_b, sizeof(rand_b));
+
+  // Set magic numbers for a "version 7" (pseudorandom) UUID and variant,
+  // see https://www.rfc-editor.org/rfc/rfc9562#name-version-field
+  uuid[6] = static_cast<uint8_t>((uuid[6] & 0x0F) | 0x70);
+  // set variant field, top two bits are 1, 0
+  uuid[8] = static_cast<uint8_t>((uuid[8] & 0x3F) | 0x80);
+
+  return Uuid(uuid);
+}
+
+// Parses a UUID from text. The input length selects the parser: 32 characters
+// for the form without dashes, 36 for the hyphenated form. Any other length
+// is rejected with InvalidArgument.
+Result<Uuid> Uuid::FromString(std::string_view str) {
+  switch (str.size()) {
+    case 32:
+      return ParseSimple(str);
+    case 36:
+      return ParseHyphenated(str);
+    default:
+      return InvalidArgument("Invalid UUID string: {}", str);
+  }
+}
+
+// Builds a UUID by copying exactly kLength bytes out of the given span.
+// Returns InvalidArgument when the span has any other size.
+Result<Uuid> Uuid::FromBytes(std::span<const uint8_t> bytes) {
+  const size_t actual_size = bytes.size();
+  if (actual_size != kLength) [[unlikely]] {
+    return InvalidArgument("UUID byte array must be exactly {} bytes, was {}", kLength,
+                           actual_size);
+  }
+
+  std::array<uint8_t, kLength> copy{};
+  std::memcpy(copy.data(), bytes.data(), kLength);
+  return Uuid(copy);
+}
+
+// Returns the byte at `index`; the index is validated with ICEBERG_CHECK
+// before the array is touched.
+uint8_t Uuid::operator[](size_t index) const {
+  ICEBERG_CHECK(index < kLength, "UUID index out of range: {}", index);
+  const uint8_t byte = data_[index];
+  return byte;
+}
+
+// Renders the UUID as 36 characters of lowercase hex in the 8-4-4-4-12 layout.
+std::string Uuid::ToString() const {
+  static constexpr char kDigits[] = "0123456789abcdef";
+
+  std::string text;
+  text.reserve(36);
+  for (size_t i = 0; i < kLength; ++i) {
+    // A dash goes in front of bytes 4, 6, 8 and 10.
+    if (i == 4 || i == 6 || i == 8 || i == 10) {
+      text.push_back('-');
+    }
+    text.push_back(kDigits[data_[i] >> 4]);
+    text.push_back(kDigits[data_[i] & 0x0F]);
+  }
+  return text;
+}
+
+}  // namespace iceberg
diff --git a/src/iceberg/util/uuid.h b/src/iceberg/util/uuid.h
new file mode 100644
index 0000000..64db7c5
--- /dev/null
+++ b/src/iceberg/util/uuid.h
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+#include <array>
+#include <cstdint>
+#include <span>
+#include <string_view>
+
+#include "iceberg/iceberg_export.h"
+#include "iceberg/result.h"
+#include "iceberg/util/formattable.h"
+
+/// \file iceberg/util/uuid.h
+/// \brief UUID (Universally Unique Identifier) representation.
+
+namespace iceberg {
+
+/// \brief A UUID held as 16 raw bytes, in the same order as its textual form.
+class ICEBERG_EXPORT Uuid : public util::Formattable {
+ public:
+  /// \brief Number of bytes in a UUID.
+  static constexpr size_t kLength = 16;
+
+  /// A UUID must be generated, parsed, or built from explicit bytes.
+  Uuid() = delete;
+
+  /// \brief Wrap the given bytes as a UUID; the bytes are stored unchanged.
+  explicit Uuid(std::array<uint8_t, kLength> data);
+
+  /// \brief Generate a random UUID (version 4).
+  static Uuid GenerateV4();
+
+  /// \brief Generate UUID version 7 per RFC 9562, with the current timestamp.
+  static Uuid GenerateV7();
+
+  /// \brief Generate UUID version 7 per RFC 9562, with the given timestamp.
+  ///
+  /// UUID version 7 consists of a Unix timestamp in milliseconds (48 bits) and
+  /// 74 random bits, excluding the required version and variant bits.
+  ///
+  /// \param unix_ts_ms number of milliseconds since start of the UNIX epoch
+  ///
+  /// \note unix_ts_ms cannot be negative per RFC.
+  static Uuid GenerateV7(uint64_t unix_ts_ms);
+
+  /// \brief Parse a UUID from text.
+  ///
+  /// Accepts the 36-character hyphenated form and the 32-character form
+  /// without hyphens; hex digits may be upper or lower case.
+  /// \return the UUID, or an InvalidArgument error for any other input.
+  static Result<Uuid> FromString(std::string_view str);
+
+  /// \brief Create a UUID by copying a byte sequence.
+  /// \return the UUID, or an InvalidArgument error unless exactly kLength
+  /// bytes are given.
+  static Result<Uuid> FromBytes(std::span<const uint8_t> bytes);
+
+  /// \brief View the raw bytes; the span refers to storage inside this object.
+  std::span<const uint8_t> bytes() const { return data_; }
+
+  /// \brief Access individual bytes of the UUID.
+  /// \param index The index of the byte to access (0-15).
+  /// \return The byte at the specified index.
+  /// \throw IcebergError if index is out of bounds.
+  uint8_t operator[](size_t index) const;
+
+  /// \brief Format as lowercase hex in the hyphenated 8-4-4-4-12 layout.
+  std::string ToString() const override;
+
+  /// \brief Two UUIDs are equal when all 16 bytes match.
+  friend bool operator==(const Uuid& lhs, const Uuid& rhs) {
+    return lhs.data_ == rhs.data_;
+  }
+
+ private:
+  std::array<uint8_t, kLength> data_;
+};
+
+}  // namespace iceberg
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index d70b4f8..3c74735 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -91,6 +91,7 @@ add_iceberg_test(util_test
                  endian_test.cc
                  formatter_test.cc
                  string_util_test.cc
+                 uuid_test.cc
                  visit_type_test.cc)
 
 add_iceberg_test(roaring_test SOURCES roaring_test.cc)
diff --git a/test/uuid_test.cc b/test/uuid_test.cc
new file mode 100644
index 0000000..3dbe573
--- /dev/null
+++ b/test/uuid_test.cc
@@ -0,0 +1,117 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "iceberg/util/uuid.h"
+
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include "matchers.h"
+
+namespace iceberg {
+
+// A generated v4 UUID has the right size and carries the version and variant
+// markers.
+TEST(UUIDUtilTest, GenerateV4) {
+  const Uuid generated = Uuid::GenerateV4();
+  // Smoke check: generation ran and produced a full-length value.
+  EXPECT_EQ(generated.bytes().size(), Uuid::kLength);
+
+  // The version number (4) is the high nibble of the 7th byte.
+  const int version = (generated[6] >> 4) & 0x0F;
+  EXPECT_EQ(version, 4);
+
+  // The variant is the two most significant bits of the 9th byte: 10.
+  const int variant = (generated[8] >> 6) & 0x03;
+  EXPECT_EQ(variant, 0b10);
+}
+
+// Checks the v7 generators: size, version and variant markers for the
+// current-time overload, and the timestamp layout for the explicit overload.
+TEST(UUIDUtilTest, GenerateV7) {
+  auto uuid = Uuid::GenerateV7();
+  // just ensure it runs and produces a value
+  EXPECT_EQ(uuid.bytes().size(), Uuid::kLength);
+  // Version 7 UUIDs have the version number (7) in the 7th byte
+  EXPECT_EQ((uuid[6] >> 4) & 0x0F, 7);
+  // Variant is in the 9th byte, the two most significant bits should be 10
+  EXPECT_EQ((uuid[8] >> 6) & 0x03, 0b10);
+
+  // With an explicit timestamp, the first six bytes are its low 48 bits in
+  // big-endian order.
+  constexpr uint64_t kTimestampMs = 0x0123456789ABULL;
+  auto stamped = Uuid::GenerateV7(kTimestampMs);
+  EXPECT_EQ(stamped[0], 0x01);
+  EXPECT_EQ(stamped[1], 0x23);
+  EXPECT_EQ(stamped[2], 0x45);
+  EXPECT_EQ(stamped[3], 0x67);
+  EXPECT_EQ(stamped[4], 0x89);
+  EXPECT_EQ(stamped[5], 0xAB);
+  EXPECT_EQ((stamped[6] >> 4) & 0x0F, 7);
+  EXPECT_EQ((stamped[8] >> 6) & 0x03, 0b10);
+}
+
+// Valid hyphenated and non-hyphenated inputs parse and format back to the
+// lowercase hyphenated form.
+TEST(UUIDUtilTest, FromString) {
+  std::vector<std::string> uuid_strings = {
+      "123e4567-e89b-12d3-a456-426614174000",
+      "550e8400-e29b-41d4-a716-446655440000",
+      "f47ac10b-58cc-4372-a567-0e02b2c3d479",
+  };
+
+  for (const auto& uuid_str : uuid_strings) {
+    auto result = Uuid::FromString(uuid_str);
+    // Fatal assertion: value() must not be reached on a failed result.
+    ASSERT_THAT(result, IsOk());
+    auto uuid = result.value();
+    EXPECT_EQ(uuid.ToString(), uuid_str);
+  }
+
+  // Input without dashes (either letter case) paired with the expected output.
+  std::vector<std::pair<std::string, std::string>> uuid_string_pairs = {
+      {"123e4567e89b12d3a456426614174000", "123e4567-e89b-12d3-a456-426614174000"},
+      {"550E8400E29B41D4A716446655440000", "550e8400-e29b-41d4-a716-446655440000"},
+      {"F47AC10B58CC4372A5670E02B2C3D479", "f47ac10b-58cc-4372-a567-0e02b2c3d479"},
+  };
+
+  for (const auto& [input_str, expected_str] : uuid_string_pairs) {
+    auto result = Uuid::FromString(input_str);
+    // Fatal assertion: value() must not be reached on a failed result.
+    ASSERT_THAT(result, IsOk());
+    auto uuid = result.value();
+    EXPECT_EQ(uuid.ToString(), expected_str);
+  }
+}
+
+// Malformed inputs (wrong length or a non-hex character) are rejected with
+// InvalidArgument.
+TEST(UUIDUtilTest, FromStringInvalid) {
+  const char* const kInvalidInputs[] = {
+      "123e4567-e89b-12d3-a456-42661417400",    // too short
+      "123e4567-e89b-12d3-a456-4266141740000",  // too long
+      "g23e4567-e89b-12d3-a456-426614174000",   // invalid character
+      "123e4567e89b12d3a45642661417400",        // too short without dashes
+      "123e4567e89b12d3a4564266141740000",      // too long without dashes
+      "550e8400-e29b-41d4-a716-44665544000Z",   // invalid character at end
+      "550e8400-e29b-41d4-a716-44665544000-",   // invalid character at end
+      "550e8400-e29b-41d4-a716-4466554400",     // too short
+  };
+
+  for (const char* text : kInvalidInputs) {
+    const auto parsed = Uuid::FromString(text);
+    EXPECT_THAT(parsed, IsError(ErrorKind::kInvalidArgument));
+    EXPECT_THAT(parsed, HasErrorMessage("Invalid UUID string"));
+  }
+}
+
+// Sixteen bytes round-trip through FromBytes and match the direct constructor.
+TEST(UUIDUtilTest, FromBytes) {
+  std::array<uint8_t, Uuid::kLength> bytes = {0x12, 0x3e, 0x45, 0x67, 0xe8, 0x9b,
+                                              0x12, 0xd3, 0xa4, 0x56, 0x42, 0x66,
+                                              0x14, 0x17, 0x40, 0x00};
+  auto result = Uuid::FromBytes(bytes);
+  // Fatal assertion: value() must not be reached on a failed result.
+  ASSERT_THAT(result, IsOk());
+  auto uuid = result.value();
+  EXPECT_EQ(uuid.ToString(), "123e4567-e89b-12d3-a456-426614174000");
+  EXPECT_EQ(uuid, Uuid(bytes));
+}
+
+// A 15-byte input is one byte short and must be rejected with InvalidArgument.
+TEST(UUIDUtilTest, FromBytesInvalid) {
+  const std::array<uint8_t, Uuid::kLength - 1> too_short = {
+      0x12, 0x3e, 0x45, 0x67, 0xe8, 0x9b, 0x12, 0xd3,
+      0xa4, 0x56, 0x42, 0x66, 0x14, 0x17, 0x40};
+
+  const auto outcome = Uuid::FromBytes(too_short);
+  EXPECT_THAT(outcome, IsError(ErrorKind::kInvalidArgument));
+  EXPECT_THAT(outcome, HasErrorMessage("UUID byte array must be exactly 16 bytes"));
+}
+
+}  // namespace iceberg

Reply via email to