This is an automated email from the ASF dual-hosted git repository.
fokko pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-cpp.git
The following commit(s) were added to refs/heads/main by this push:
new 046f149 feat: implement literal expressions with binary serialization
support (#185)
046f149 is described below
commit 046f149b76e840761786e3e1a999a7778b17d990
Author: Li Feiyang <[email protected]>
AuthorDate: Fri Oct 10 21:56:17 2025 +0800
feat: implement literal expressions with binary serialization support
(#185)
## Summary
Implements binary serialization and deserialization support for Literal
values, enabling conversion between Literal objects and binary
representations. Adds comprehensive formatting support for date, time,
and timestamp types.
## Changes
- Added `Conversions` utility class
(`src/iceberg/util/conversions.cc/h`) with `ToBytes()` and `FromBytes()`
methods for `Literal` binary serialization/deserialization
- Added literal formatting utilities
(`src/iceberg/util/literal_format.cc/h`) for `date`, `time`,
`timestamp`, and `timestamptz` formatting
- Implemented `Literal` serialization methods: Replaced placeholder
implementations of `Serialize()` and `Deserialize()` with full
functionality
- Enhanced `Literal::ToString()`: Added support for `date`, `time`,
`timestamp`, and `timestamptz` types
- Added `TypeId` string conversion: Implemented `ToString(TypeId)`
utility function for type name lookups
- Updated CMake configuration: Added new util source files to build
system
## Test Plan
- Comprehensive binary round-trip tests for all primitive types
(boolean, int, long, float, double, string, binary)
- Serialization correctness tests verify exact byte representations
match expected formats
- Date/time formatting tests ensure proper ISO 8601-compatible string
output
- Modify existing tests (e.g. manifest_reader_test.cc) to use binary
serialization.
---
src/iceberg/CMakeLists.txt | 3 +-
src/iceberg/expression/literal.cc | 30 +++-
src/iceberg/expression/literal.h | 68 ++++++++-
src/iceberg/test/literal_test.cc | 207 +++++++++++++++++++++++++-
src/iceberg/test/manifest_list_reader_test.cc | 70 ++++-----
src/iceberg/test/manifest_reader_test.cc | 57 ++++---
src/iceberg/type.cc | 42 ++++++
src/iceberg/type.h | 9 ++
src/iceberg/util/conversions.cc | 202 +++++++++++++++++++++++++
src/iceberg/util/conversions.h | 65 ++++++++
10 files changed, 684 insertions(+), 69 deletions(-)
diff --git a/src/iceberg/CMakeLists.txt b/src/iceberg/CMakeLists.txt
index a2a648f..8327b59 100644
--- a/src/iceberg/CMakeLists.txt
+++ b/src/iceberg/CMakeLists.txt
@@ -55,10 +55,11 @@ set(ICEBERG_SOURCES
manifest_reader_internal.cc
manifest_writer.cc
arrow_c_data_guard_internal.cc
+ util/conversions.cc
util/decimal.cc
+ util/gzip_internal.cc
util/murmurhash3_internal.cc
util/timepoint.cc
- util/gzip_internal.cc
util/uuid.cc)
set(ICEBERG_STATIC_BUILD_INTERFACE_LIBS)
diff --git a/src/iceberg/expression/literal.cc
b/src/iceberg/expression/literal.cc
index e3abb6a..adfe535 100644
--- a/src/iceberg/expression/literal.cc
+++ b/src/iceberg/expression/literal.cc
@@ -23,6 +23,8 @@
#include <concepts>
#include "iceberg/exception.h"
+#include "iceberg/util/conversions.h"
+#include "iceberg/util/macros.h"
namespace iceberg {
@@ -149,13 +151,18 @@ Literal Literal::Binary(std::vector<uint8_t> value) {
return {Value{std::move(value)}, binary()};
}
+Literal Literal::Fixed(std::vector<uint8_t> value) {
+ auto length = static_cast<int32_t>(value.size());
+ return {Value{std::move(value)}, fixed(length)};
+}
+
Result<Literal> Literal::Deserialize(std::span<const uint8_t> data,
std::shared_ptr<PrimitiveType> type) {
- return NotImplemented("Deserialization of Literal is not implemented yet");
+ return Conversions::FromBytes(std::move(type), data);
}
Result<std::vector<uint8_t>> Literal::Serialize() const {
- return NotImplemented("Serialization of Literal is not implemented yet");
+ return Conversions::ToBytes(*this);
}
// Getters
@@ -189,7 +196,7 @@ bool Literal::operator==(const Literal& other) const {
return (*this <=> other)
// Three-way comparison operator
std::partial_ordering Literal::operator<=>(const Literal& other) const {
// If types are different, comparison is unordered
- if (type_->type_id() != other.type_->type_id()) {
+ if (*type_ != *other.type_) {
return std::partial_ordering::unordered;
}
@@ -216,6 +223,7 @@ std::partial_ordering Literal::operator<=>(const Literal&
other) const {
}
case TypeId::kLong:
+ case TypeId::kTime:
case TypeId::kTimestamp:
case TypeId::kTimestampTz: {
auto this_val = std::get<int64_t>(value_);
@@ -249,6 +257,12 @@ std::partial_ordering Literal::operator<=>(const Literal&
other) const {
return this_val <=> other_val;
}
+ case TypeId::kFixed: {
+ auto& this_val = std::get<std::vector<uint8_t>>(value_);
+ auto& other_val = std::get<std::vector<uint8_t>>(other.value_);
+ return this_val <=> other_val;
+ }
+
default:
// For unsupported types, return unordered
return std::partial_ordering::unordered;
@@ -294,9 +308,17 @@ std::string Literal::ToString() const {
}
return result;
}
+ case TypeId::kFixed: {
+ const auto& fixed_data = std::get<std::vector<uint8_t>>(value_);
+ std::string result;
+ result.reserve(fixed_data.size() * 2); // 2 chars per byte
+ for (const auto& byte : fixed_data) {
+ std::format_to(std::back_inserter(result), "{:02X}", byte);
+ }
+ return result;
+ }
case TypeId::kDecimal:
case TypeId::kUuid:
- case TypeId::kFixed:
case TypeId::kDate:
case TypeId::kTime:
case TypeId::kTimestamp:
diff --git a/src/iceberg/expression/literal.h b/src/iceberg/expression/literal.h
index 1c16b8e..c11d48f 100644
--- a/src/iceberg/expression/literal.h
+++ b/src/iceberg/expression/literal.h
@@ -72,6 +72,7 @@ class ICEBERG_EXPORT Literal : public util::Formattable {
static Literal Double(double value);
static Literal String(std::string value);
static Literal Binary(std::vector<uint8_t> value);
+ static Literal Fixed(std::vector<uint8_t> value);
/// \brief Create a literal representing a null value.
static Literal Null(std::shared_ptr<PrimitiveType> type) {
@@ -144,11 +145,76 @@ class ICEBERG_EXPORT Literal : public util::Formattable {
private:
Literal(Value value, std::shared_ptr<PrimitiveType> type);
+ friend class Conversions;
friend class LiteralCaster;
- private:
Value value_;
std::shared_ptr<PrimitiveType> type_;
};
+template <TypeId type_id>
+struct LiteralTraits {
+ using ValueType = void;
+};
+
+template <>
+struct LiteralTraits<TypeId::kBoolean> {
+ using ValueType = bool;
+};
+
+template <>
+struct LiteralTraits<TypeId::kInt> {
+ using ValueType = int32_t;
+};
+
+template <>
+struct LiteralTraits<TypeId::kDate> {
+ using ValueType = int32_t;
+};
+
+template <>
+struct LiteralTraits<TypeId::kLong> {
+ using ValueType = int64_t;
+};
+
+template <>
+struct LiteralTraits<TypeId::kTime> {
+ using ValueType = int64_t;
+};
+
+template <>
+struct LiteralTraits<TypeId::kTimestamp> {
+ using ValueType = int64_t;
+};
+
+template <>
+struct LiteralTraits<TypeId::kTimestampTz> {
+ using ValueType = int64_t;
+};
+
+template <>
+struct LiteralTraits<TypeId::kFloat> {
+ using ValueType = float;
+};
+
+template <>
+struct LiteralTraits<TypeId::kDouble> {
+ using ValueType = double;
+};
+
+template <>
+struct LiteralTraits<TypeId::kString> {
+ using ValueType = std::string;
+};
+
+template <>
+struct LiteralTraits<TypeId::kBinary> {
+ using ValueType = std::vector<uint8_t>;
+};
+
+template <>
+struct LiteralTraits<TypeId::kFixed> {
+ using ValueType = std::vector<uint8_t>;
+};
+
} // namespace iceberg
diff --git a/src/iceberg/test/literal_test.cc b/src/iceberg/test/literal_test.cc
index e9ddd47..bd7544b 100644
--- a/src/iceberg/test/literal_test.cc
+++ b/src/iceberg/test/literal_test.cc
@@ -81,7 +81,7 @@ TEST(LiteralTest, IntCastTo) {
auto long_result = int_literal.CastTo(iceberg::int64());
ASSERT_THAT(long_result, IsOk());
EXPECT_EQ(long_result->type()->type_id(), TypeId::kLong);
- EXPECT_EQ(long_result->ToString(), "42");
+ EXPECT_EQ(std::get<int64_t>(long_result->value()), 42L);
// Cast to Float
auto float_result = int_literal.CastTo(iceberg::float32());
@@ -137,7 +137,6 @@ TEST(LiteralTest, LongCastTo) {
}
TEST(LiteralTest, LongCastToIntOverflow) {
- // Test overflow cases
auto max_long =
Literal::Long(static_cast<int64_t>(std::numeric_limits<int32_t>::max())
+ 1);
auto min_long =
@@ -383,4 +382,208 @@ TEST(LiteralTest, DoubleZeroComparison) {
EXPECT_EQ(neg_zero <=> pos_zero, std::partial_ordering::less);
}
+struct LiteralParam {
+ std::string test_name;
+ std::vector<uint8_t> serialized;
+ Literal value;
+ std::shared_ptr<PrimitiveType> type;
+};
+
+class LiteralSerDeParam : public ::testing::TestWithParam<LiteralParam> {};
+
+TEST_P(LiteralSerDeParam, RoundTrip) {
+ const auto& param = GetParam();
+
+ // Deserialize from bytes
+ Result<Literal> literal_result = Literal::Deserialize(param.serialized,
param.type);
+ ASSERT_TRUE(literal_result.has_value())
+ << "Deserialization failed: " << literal_result.error().message;
+
+ // Check type and value
+ EXPECT_EQ(*literal_result, param.value);
+
+ // Serialize back to bytes
+ Result<std::vector<uint8_t>> bytes_result = literal_result->Serialize();
+ ASSERT_TRUE(bytes_result.has_value())
+ << "Serialization failed: " << bytes_result.error().message;
+ EXPECT_EQ(*bytes_result, param.serialized);
+
+ // Deserialize again to verify idempotency
+ Result<Literal> final_literal = Literal::Deserialize(*bytes_result,
param.type);
+ ASSERT_TRUE(final_literal.has_value())
+ << "Final deserialization failed: " << final_literal.error().message;
+ EXPECT_EQ(*final_literal, param.value);
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ BinarySerialization, LiteralSerDeParam,
+ ::testing::Values(
+ // Basic types
+ LiteralParam{"BooleanTrue", {1}, Literal::Boolean(true), boolean()},
+ LiteralParam{"BooleanFalse", {0}, Literal::Boolean(false), boolean()},
+
+ LiteralParam{"Int", {32, 0, 0, 0}, Literal::Int(32), int32()},
+ LiteralParam{
+ "IntMaxValue", {255, 255, 255, 127}, Literal::Int(2147483647),
int32()},
+ LiteralParam{"IntMinValue", {0, 0, 0, 128}, Literal::Int(-2147483648),
int32()},
+ LiteralParam{"NegativeInt", {224, 255, 255, 255}, Literal::Int(-32),
int32()},
+
+ LiteralParam{"Long", {32, 0, 0, 0, 0, 0, 0, 0}, Literal::Long(32),
int64()},
+ LiteralParam{"LongMaxValue",
+ {255, 255, 255, 255, 255, 255, 255, 127},
+ Literal::Long(std::numeric_limits<int64_t>::max()),
+ int64()},
+ LiteralParam{"LongMinValue",
+ {0, 0, 0, 0, 0, 0, 0, 128},
+ Literal::Long(std::numeric_limits<int64_t>::min()),
+ int64()},
+ LiteralParam{"NegativeLong",
+ {224, 255, 255, 255, 255, 255, 255, 255},
+ Literal::Long(-32),
+ int64()},
+
+ LiteralParam{"Float", {0, 0, 128, 63}, Literal::Float(1.0f),
float32()},
+ LiteralParam{"FloatNegativeInfinity",
+ {0, 0, 128, 255},
+ Literal::Float(-std::numeric_limits<float>::infinity()),
+ float32()},
+ LiteralParam{"FloatMaxValue",
+ {255, 255, 127, 127},
+ Literal::Float(std::numeric_limits<float>::max()),
+ float32()},
+ LiteralParam{"FloatMinValue",
+ {255, 255, 127, 255},
+ Literal::Float(std::numeric_limits<float>::lowest()),
+ float32()},
+
+ LiteralParam{
+ "Double", {0, 0, 0, 0, 0, 0, 240, 63}, Literal::Double(1.0),
float64()},
+ LiteralParam{"DoubleNegativeInfinity",
+ {0, 0, 0, 0, 0, 0, 240, 255},
+ Literal::Double(-std::numeric_limits<double>::infinity()),
+ float64()},
+ LiteralParam{"DoubleMaxValue",
+ {255, 255, 255, 255, 255, 255, 239, 127},
+ Literal::Double(std::numeric_limits<double>::max()),
+ float64()},
+ LiteralParam{"DoubleMinValue",
+ {255, 255, 255, 255, 255, 255, 239, 255},
+ Literal::Double(std::numeric_limits<double>::lowest()),
+ float64()},
+
+ LiteralParam{"String",
+ {105, 99, 101, 98, 101, 114, 103},
+ Literal::String("iceberg"),
+ string()},
+ LiteralParam{"StringLong",
+ {65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65,
65, 65},
+ Literal::String("AAAAAAAAAAAAAAAA"),
+ string()},
+
+ LiteralParam{"BinaryData",
+ {0x01, 0x02, 0x03, 0xFF},
+ Literal::Binary({0x01, 0x02, 0x03, 0xFF}),
+ binary()},
+ LiteralParam{"BinarySingleByte", {42}, Literal::Binary({42}),
binary()},
+
+ // Fixed type
+ LiteralParam{"FixedLength4",
+ {0x01, 0x02, 0x03, 0x04},
+ Literal::Fixed({0x01, 0x02, 0x03, 0x04}),
+ fixed(4)},
+ LiteralParam{"FixedLength8",
+ {0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0xFF, 0x00, 0x11},
+ Literal::Fixed({0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0xFF, 0x00,
0x11}),
+ fixed(8)},
+ LiteralParam{"FixedLength16",
+ {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
0x09, 0x0A,
+ 0x0B, 0x0C, 0x0D, 0x0E, 0x0F},
+ Literal::Fixed({0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
0x07, 0x08,
+ 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E,
0x0F}),
+ fixed(16)},
+ LiteralParam{"FixedSingleByte", {0xFF}, Literal::Fixed({0xFF}),
fixed(1)},
+
+ // Temporal types
+ LiteralParam{"DateEpoch", {0, 0, 0, 0}, Literal::Date(0), date()},
+ LiteralParam{"DateNextDay", {1, 0, 0, 0}, Literal::Date(1), date()},
+ LiteralParam{"DateY2K", {205, 42, 0, 0}, Literal::Date(10957), date()},
+ LiteralParam{"DateNegative", {255, 255, 255, 255}, Literal::Date(-1),
date()},
+
+ LiteralParam{"TimeMidnight", {0, 0, 0, 0, 0, 0, 0, 0},
Literal::Time(0), time()},
+ LiteralParam{"TimeNoon",
+ {128, 9, 230, 124, 10, 0, 0, 0},
+ Literal::Time(45045123456),
+ time()},
+ LiteralParam{
+ "TimeOneSecond", {64, 66, 15, 0, 0, 0, 0, 0},
Literal::Time(1000000), time()},
+
+ LiteralParam{"TimestampEpoch",
+ {0, 0, 0, 0, 0, 0, 0, 0},
+ Literal::Timestamp(0),
+ timestamp()},
+ LiteralParam{"TimestampOneSecond",
+ {64, 66, 15, 0, 0, 0, 0, 0},
+ Literal::Timestamp(1000000),
+ timestamp()},
+ LiteralParam{"TimestampNoon2024",
+ {128, 9, 230, 124, 10, 0, 0, 0},
+ Literal::Timestamp(45045123456),
+ timestamp()},
+
+ LiteralParam{"TimestampTzEpoch",
+ {0, 0, 0, 0, 0, 0, 0, 0},
+ Literal::TimestampTz(0),
+ timestamp_tz()},
+ LiteralParam{"TimestampTzOneHour",
+ {0, 164, 147, 214, 0, 0, 0, 0},
+ Literal::TimestampTz(3600000000),
+ timestamp_tz()},
+
+ // Empty values
+ LiteralParam{"EmptyString", {}, Literal::String(""), string()},
+ LiteralParam{"EmptyBinary", {}, Literal::Binary({}), binary()}),
+
+ [](const testing::TestParamInfo<LiteralSerDeParam::ParamType>& info) {
+ return info.param.test_name;
+ });
+
+TEST(LiteralSerDeTest, EmptyString) {
+ auto empty_string = Literal::String("");
+ auto empty_bytes = empty_string.Serialize();
+ ASSERT_TRUE(empty_bytes.has_value());
+ EXPECT_TRUE(empty_bytes->empty());
+
+ auto deserialize_result = Literal::Deserialize(*empty_bytes, string());
+ ASSERT_THAT(deserialize_result, IsOk());
+ EXPECT_TRUE(std::get<std::string>(deserialize_result->value()).empty());
+}
+
+TEST(LiteralSerDeTest, EmptyBinary) {
+ auto empty_binary = Literal::Binary({});
+ auto empty_bytes = empty_binary.Serialize();
+ ASSERT_TRUE(empty_bytes.has_value());
+ EXPECT_TRUE(empty_bytes->empty());
+
+ auto deserialize_result = Literal::Deserialize(*empty_bytes, binary());
+ ASSERT_THAT(deserialize_result, IsOk());
+
EXPECT_TRUE(std::get<std::vector<uint8_t>>(deserialize_result->value()).empty());
+}
+
+// Type promotion tests
+TEST(LiteralSerDeTest, TypePromotion) {
+ // 4-byte int data can be deserialized as long
+ std::vector<uint8_t> int_data = {32, 0, 0, 0};
+ auto long_result = Literal::Deserialize(int_data, int64());
+ ASSERT_TRUE(long_result.has_value());
+ EXPECT_EQ(long_result->type()->type_id(), TypeId::kLong);
+ EXPECT_EQ(std::get<int64_t>(long_result->value()), 32L);
+
+ // 4-byte float data can be deserialized as double
+ std::vector<uint8_t> float_data = {0, 0, 128, 63};
+ auto double_result = Literal::Deserialize(float_data, float64());
+ ASSERT_TRUE(double_result.has_value());
+ EXPECT_EQ(double_result->type()->type_id(), TypeId::kDouble);
+ EXPECT_DOUBLE_EQ(std::get<double>(double_result->value()), 1.0);
+}
+
} // namespace iceberg
diff --git a/src/iceberg/test/manifest_list_reader_test.cc
b/src/iceberg/test/manifest_list_reader_test.cc
index a3c08c3..9fd6e4c 100644
--- a/src/iceberg/test/manifest_list_reader_test.cc
+++ b/src/iceberg/test/manifest_list_reader_test.cc
@@ -23,6 +23,7 @@
#include "iceberg/arrow/arrow_fs_file_io_internal.h"
#include "iceberg/avro/avro_register.h"
+#include "iceberg/expression/literal.h"
#include "iceberg/manifest_list.h"
#include "iceberg/manifest_reader.h"
#include "temp_file_test_base.h"
@@ -76,43 +77,38 @@ class ManifestListReaderV1Test : public
ManifestListReaderTestBase {
std::vector<int64_t> file_size = {6185, 6113};
std::vector<int64_t> snapshot_id = {7532614258660258098,
7532614258660258098};
- std::vector<std::vector<std::uint8_t>> lower_bounds = {
- {0x32, 0x30, 0x32, 0x32, 0x2D, 0x30, 0x32, 0x2D, 0x32, 0x32},
- {0x32, 0x30, 0x32, 0x32, 0x2D, 0x32, 0x2D, 0x32, 0x32}};
-
- std::vector<std::vector<std::uint8_t>> upper_bounds = {
- {0x32, 0x30, 0x32, 0x32, 0x2D, 0x32, 0x2D, 0x32, 0x33},
- {0x32, 0x30, 0x32, 0x32, 0x2D, 0x32, 0x2D, 0x32, 0x33}};
-
- return {{.manifest_path = paths[0],
- .manifest_length = file_size[0],
- .partition_spec_id = 0,
- .added_snapshot_id = snapshot_id[0],
- .added_files_count = 4,
- .existing_files_count = 0,
- .deleted_files_count = 0,
- .added_rows_count = 6,
- .existing_rows_count = 0,
- .deleted_rows_count = 0,
- .partitions = {{.contains_null = false,
- .contains_nan = false,
- .lower_bound = lower_bounds[0],
- .upper_bound = upper_bounds[0]}}},
-
- {.manifest_path = paths[1],
- .manifest_length = file_size[1],
- .partition_spec_id = 0,
- .added_snapshot_id = snapshot_id[1],
- .added_files_count = 0,
- .existing_files_count = 0,
- .deleted_files_count = 2,
- .added_rows_count = 0,
- .existing_rows_count = 0,
- .deleted_rows_count = 6,
- .partitions = {{.contains_null = false,
- .contains_nan = false,
- .lower_bound = lower_bounds[1],
- .upper_bound = upper_bounds[1]}}}};
+ return {
+ {.manifest_path = paths[0],
+ .manifest_length = file_size[0],
+ .partition_spec_id = 0,
+ .added_snapshot_id = snapshot_id[0],
+ .added_files_count = 4,
+ .existing_files_count = 0,
+ .deleted_files_count = 0,
+ .added_rows_count = 6,
+ .existing_rows_count = 0,
+ .deleted_rows_count = 0,
+ .partitions = {{.contains_null = false,
+ .contains_nan = false,
+ .lower_bound =
Literal::String("2022-02-22").Serialize().value(),
+ .upper_bound =
+
Literal::String("2022-2-23").Serialize().value()}}},
+
+ {.manifest_path = paths[1],
+ .manifest_length = file_size[1],
+ .partition_spec_id = 0,
+ .added_snapshot_id = snapshot_id[1],
+ .added_files_count = 0,
+ .existing_files_count = 0,
+ .deleted_files_count = 2,
+ .added_rows_count = 0,
+ .existing_rows_count = 0,
+ .deleted_rows_count = 6,
+ .partitions = {
+ {.contains_null = false,
+ .contains_nan = false,
+ .lower_bound = Literal::String("2022-2-22").Serialize().value(),
+ .upper_bound =
Literal::String("2022-2-23").Serialize().value()}}}};
}
std::vector<ManifestFile> PrepareComplexTypeTestData() {
diff --git a/src/iceberg/test/manifest_reader_test.cc
b/src/iceberg/test/manifest_reader_test.cc
index db703c1..7381b29 100644
--- a/src/iceberg/test/manifest_reader_test.cc
+++ b/src/iceberg/test/manifest_reader_test.cc
@@ -94,24 +94,33 @@ class ManifestReaderV1Test : public ManifestReaderTestBase {
"order_ts_hour=2021-01-26-00/"
"00000-2-d5ae78b7-4449-45ec-adb7-c0e9c0bdb714-0-00004.parquet"};
std::vector<int64_t> partitions = {447696, 473976, 465192, 447672};
+
+ // TODO(Li Feiyang): The Decimal type and its serialization logic are not
yet fully
+ // implemented to support variable-length encoding as required by the
Iceberg
+ // specification. Using Literal::Binary as a temporary substitute to
represent the raw
+ // bytes for the decimal values.
std::vector<std::map<int32_t, std::vector<uint8_t>>> bounds = {
- {{1, {0xd2, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
- {2, {'.', 0x16, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
- {3, {0x12, 0xe2}},
- {4, {0xc0, 'y', 0xe7, 0x98, 0xd6, 0xb9, 0x05, 0x00}}},
- {{1, {0xd2, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
- {2, {'.', 0x16, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
- {3, {0x12, 0xe3}},
- {4, {0xc0, 0x19, '#', '=', 0xe2, 0x0f, 0x06, 0x00}}},
- {{1, {'{', 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
- {2, {0xc8, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
- {3, {0x0e, '"'}},
- {4, {0xc0, 0xd9, '7', 0x93, 0x1f, 0xf3, 0x05, 0x00}}},
- {{1, {'{', 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
- {2, {0xc8, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
- {3, {0x0e, '!'}},
- {4, {0xc0, 0x19, 0x10, '{', 0xc2, 0xb9, 0x05, 0x00}}},
+ {{1, Literal::Long(1234).Serialize().value()},
+ {2, Literal::Long(5678).Serialize().value()},
+ {3, Literal::Binary({0x12, 0xe2}).Serialize().value()},
+
+ {4, Literal::Timestamp(1611706223000000LL).Serialize().value()}},
+ {{1, Literal::Long(1234).Serialize().value()},
+ {2, Literal::Long(5678).Serialize().value()},
+ {3, Literal::Binary({0x12, 0xe3}).Serialize().value()},
+
+ {4, Literal::Timestamp(1706314223000000LL).Serialize().value()}},
+ {{1, Literal::Long(123).Serialize().value()},
+ {2, Literal::Long(456).Serialize().value()},
+ {3, Literal::Binary({0x0e, 0x22}).Serialize().value()},
+
+ {4, Literal::Timestamp(1674691823000000LL).Serialize().value()}},
+ {{1, Literal::Long(123).Serialize().value()},
+ {2, Literal::Long(456).Serialize().value()},
+ {3, Literal::Binary({0x0e, 0x21}).Serialize().value()},
+ {4, Literal::Timestamp(1611619823000000LL).Serialize().value()}},
};
+
for (int i = 0; i < 4; ++i) {
ManifestEntry entry;
entry.status = ManifestStatus::kAdded;
@@ -159,16 +168,16 @@ class ManifestReaderV2Test : public
ManifestReaderTestBase {
std::vector<int64_t> record_counts = {4};
std::vector<std::map<int32_t, std::vector<uint8_t>>> lower_bounds = {
- {{1, {0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
- {2, {'r', 'e', 'c', 'o', 'r', 'd', '_', 'f', 'o', 'u', 'r'}},
- {3, {'d', 'a', 't', 'a', '_', 'c', 'o', 'n', 't', 'e', 'n', 't', '_',
'1'}},
- {4, {0xcd, 0xcc, 0xcc, 0xcc, 0xcc, 0xdc, 0x5e, 0x40}}}};
+ {{1, Literal::Long(1).Serialize().value()},
+ {2, Literal::String("record_four").Serialize().value()},
+ {3, Literal::String("data_content_1").Serialize().value()},
+ {4, Literal::Double(123.45).Serialize().value()}}};
std::vector<std::map<int32_t, std::vector<uint8_t>>> upper_bounds = {
- {{1, {0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
- {2, {'r', 'e', 'c', 'o', 'r', 'd', '_', 't', 'w', 'o'}},
- {3, {'d', 'a', 't', 'a', '_', 'c', 'o', 'n', 't', 'e', 'n', 't', '_',
'4'}},
- {4, {0x14, 0xae, 0x47, 0xe1, 0x7a, 0x8c, 0x7c, 0x40}}}};
+ {{1, Literal::Long(4).Serialize().value()},
+ {2, Literal::String("record_two").Serialize().value()},
+ {3, Literal::String("data_content_4").Serialize().value()},
+ {4, Literal::Double(456.78).Serialize().value()}}};
DataFile data_file{.file_path = test_dir_prefix + paths[0],
.file_format = FileFormatType::kParquet,
diff --git a/src/iceberg/type.cc b/src/iceberg/type.cc
index 7b0f094..ddb3285 100644
--- a/src/iceberg/type.cc
+++ b/src/iceberg/type.cc
@@ -22,6 +22,7 @@
#include <format>
#include <iterator>
#include <memory>
+#include <utility>
#include "iceberg/exception.h"
#include "iceberg/util/formatter.h" // IWYU pragma: keep
@@ -386,4 +387,45 @@ std::shared_ptr<StructType>
struct_(std::vector<SchemaField> fields) {
return std::make_shared<StructType>(std::move(fields));
}
+std::string_view ToString(TypeId id) {
+ switch (id) {
+ case TypeId::kStruct:
+ return "struct";
+ case TypeId::kList:
+ return "list";
+ case TypeId::kMap:
+ return "map";
+ case TypeId::kBoolean:
+ return "boolean";
+ case TypeId::kInt:
+ return "int";
+ case TypeId::kLong:
+ return "long";
+ case TypeId::kFloat:
+ return "float";
+ case TypeId::kDouble:
+ return "double";
+ case TypeId::kDecimal:
+ return "decimal";
+ case TypeId::kDate:
+ return "date";
+ case TypeId::kTime:
+ return "time";
+ case TypeId::kTimestamp:
+ return "timestamp";
+ case TypeId::kTimestampTz:
+ return "timestamptz";
+ case TypeId::kString:
+ return "string";
+ case TypeId::kUuid:
+ return "uuid";
+ case TypeId::kFixed:
+ return "fixed";
+ case TypeId::kBinary:
+ return "binary";
+ }
+
+ std::unreachable();
+}
+
} // namespace iceberg
diff --git a/src/iceberg/type.h b/src/iceberg/type.h
index 01c911d..2565268 100644
--- a/src/iceberg/type.h
+++ b/src/iceberg/type.h
@@ -531,4 +531,13 @@ ICEBERG_EXPORT std::shared_ptr<MapType> map(SchemaField
key, SchemaField value);
/// @}
+/// \brief Get the lowercase string representation of a TypeId.
+///
+/// This returns the same lowercase string as used by Type::ToString() methods.
+/// For example: TypeId::kBoolean -> "boolean", TypeId::kInt -> "int", etc.
+///
+/// \param id The TypeId to convert to string
+/// \return A string_view containing the lowercase type name
+ICEBERG_EXPORT std::string_view ToString(TypeId id);
+
} // namespace iceberg
diff --git a/src/iceberg/util/conversions.cc b/src/iceberg/util/conversions.cc
new file mode 100644
index 0000000..c5dbcf3
--- /dev/null
+++ b/src/iceberg/util/conversions.cc
@@ -0,0 +1,202 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "iceberg/util/conversions.h"
+
+#include <cstring>
+#include <span>
+#include <string>
+
+#include "iceberg/util/endian.h"
+#include "iceberg/util/macros.h"
+
+namespace iceberg {
+
+/// \brief Write a value in little-endian format and return as vector.
+template <EndianConvertible T>
+std::vector<uint8_t> WriteLittleEndian(T value) {
+ value = ToLittleEndian(value);
+ const auto* bytes = reinterpret_cast<const uint8_t*>(&value);
+ std::vector<uint8_t> result;
+ result.insert(result.end(), bytes, bytes + sizeof(T));
+ return result;
+}
+
+/// \brief Read a value in little-endian format from the data.
+template <EndianConvertible T>
+Result<T> ReadLittleEndian(std::span<const uint8_t> data) {
+ if (data.size() != sizeof(T)) [[unlikely]] {
+ return InvalidArgument("Insufficient data to read {} bytes, got {}",
sizeof(T),
+ data.size());
+ }
+
+ T value;
+ std::memcpy(&value, data.data(), sizeof(T));
+ return FromLittleEndian(value);
+}
+
+template <TypeId type_id>
+Result<std::vector<uint8_t>> ToBytesImpl(const Literal::Value& value) {
+ using CppType = typename LiteralTraits<type_id>::ValueType;
+ return WriteLittleEndian(std::get<CppType>(value));
+}
+
+template <>
+Result<std::vector<uint8_t>> ToBytesImpl<TypeId::kBoolean>(const
Literal::Value& value) {
+ return std::vector<uint8_t>{std::get<bool>(value) ?
static_cast<uint8_t>(0x01)
+ :
static_cast<uint8_t>(0x00)};
+}
+
+template <>
+Result<std::vector<uint8_t>> ToBytesImpl<TypeId::kString>(const
Literal::Value& value) {
+ const auto& str = std::get<std::string>(value);
+ return std::vector<uint8_t>(str.begin(), str.end());
+}
+
+template <>
+Result<std::vector<uint8_t>> ToBytesImpl<TypeId::kBinary>(const
Literal::Value& value) {
+ return std::get<std::vector<uint8_t>>(value);
+}
+
+template <>
+Result<std::vector<uint8_t>> ToBytesImpl<TypeId::kFixed>(const Literal::Value&
value) {
+ return std::get<std::vector<uint8_t>>(value);
+}
+
+#define DISPATCH_LITERAL_TO_BYTES(type_id) \
+ case type_id: \
+ return ToBytesImpl<type_id>(value);
+
+Result<std::vector<uint8_t>> Conversions::ToBytes(const PrimitiveType& type,
+ const Literal::Value& value)
{
+ const auto type_id = type.type_id();
+
+ switch (type_id) {
+ DISPATCH_LITERAL_TO_BYTES(TypeId::kInt)
+ DISPATCH_LITERAL_TO_BYTES(TypeId::kDate)
+ DISPATCH_LITERAL_TO_BYTES(TypeId::kLong)
+ DISPATCH_LITERAL_TO_BYTES(TypeId::kTime)
+ DISPATCH_LITERAL_TO_BYTES(TypeId::kTimestamp)
+ DISPATCH_LITERAL_TO_BYTES(TypeId::kTimestampTz)
+ DISPATCH_LITERAL_TO_BYTES(TypeId::kFloat)
+ DISPATCH_LITERAL_TO_BYTES(TypeId::kDouble)
+ DISPATCH_LITERAL_TO_BYTES(TypeId::kBoolean)
+ DISPATCH_LITERAL_TO_BYTES(TypeId::kString)
+ DISPATCH_LITERAL_TO_BYTES(TypeId::kBinary)
+ DISPATCH_LITERAL_TO_BYTES(TypeId::kFixed)
+ // TODO(Li Feiyang): Add support for UUID and Decimal
+
+ default:
+ return NotSupported("Serialization for type {} is not supported",
type.ToString());
+ }
+}
+
+#undef DISPATCH_LITERAL_TO_BYTES
+
+Result<std::vector<uint8_t>> Conversions::ToBytes(const Literal& literal) {
+ // Cannot serialize special values
+ if (literal.IsAboveMax()) {
+ return NotSupported("Cannot serialize AboveMax");
+ }
+ if (literal.IsBelowMin()) {
+ return NotSupported("Cannot serialize BelowMin");
+ }
+ if (literal.IsNull()) {
+ return NotSupported("Cannot serialize null");
+ }
+
+ return ToBytes(*literal.type(), literal.value());
+}
+
+Result<Literal::Value> Conversions::FromBytes(const PrimitiveType& type,
+ std::span<const uint8_t> data) {
+ const auto type_id = type.type_id();
+ switch (type_id) {
+ case TypeId::kBoolean: {
+ ICEBERG_ASSIGN_OR_RAISE(auto value, ReadLittleEndian<uint8_t>(data));
+ return Literal::Value{static_cast<bool>(value != 0x00)};
+ }
+ case TypeId::kInt: {
+ ICEBERG_ASSIGN_OR_RAISE(auto value, ReadLittleEndian<int32_t>(data));
+ return Literal::Value{value};
+ }
+ case TypeId::kDate: {
+ ICEBERG_ASSIGN_OR_RAISE(auto value, ReadLittleEndian<int32_t>(data));
+ return Literal::Value{value};
+ }
+ case TypeId::kLong:
+ case TypeId::kTime:
+ case TypeId::kTimestamp:
+ case TypeId::kTimestampTz: {
+ int64_t value;
+ if (data.size() < 8) {
+ // Type was promoted from int to long
+ ICEBERG_ASSIGN_OR_RAISE(auto int_value,
ReadLittleEndian<int32_t>(data));
+ value = static_cast<int64_t>(int_value);
+ } else {
+ ICEBERG_ASSIGN_OR_RAISE(auto long_value,
ReadLittleEndian<int64_t>(data));
+ value = long_value;
+ }
+ return Literal::Value{value};
+ }
+ case TypeId::kFloat: {
+ ICEBERG_ASSIGN_OR_RAISE(auto value, ReadLittleEndian<float>(data));
+ return Literal::Value{value};
+ }
+ case TypeId::kDouble: {
+ if (data.size() < 8) {
+ // Type was promoted from float to double
+ ICEBERG_ASSIGN_OR_RAISE(auto float_value,
ReadLittleEndian<float>(data));
+ return Literal::Value{static_cast<double>(float_value)};
+ } else {
+ ICEBERG_ASSIGN_OR_RAISE(auto double_value,
ReadLittleEndian<double>(data));
+ return Literal::Value{double_value};
+ }
+ }
+ case TypeId::kString:
+ return Literal::Value{
+ std::string(reinterpret_cast<const char*>(data.data()),
data.size())};
+ case TypeId::kBinary:
+ return Literal::Value{std::vector<uint8_t>(data.begin(), data.end())};
+ case TypeId::kFixed: {
+ const auto& fixed_type = static_cast<const FixedType&>(type);
+ if (data.size() != fixed_type.length()) {
+ return InvalidArgument("Invalid data size for Fixed literal, got size:
{}",
+ data.size());
+ }
+ return Literal::Value{std::vector<uint8_t>(data.begin(), data.end())};
+ }
+ // TODO(Li Feiyang): Add support for UUID and Decimal
+ default:
+ return NotSupported("Deserialization for type {} is not supported",
+ type.ToString());
+ }
+}
+
+Result<Literal> Conversions::FromBytes(std::shared_ptr<PrimitiveType> type,
+ std::span<const uint8_t> data) {
+ if (!type) {
+ return InvalidArgument("Type cannot be null");
+ }
+
+ ICEBERG_ASSIGN_OR_RAISE(auto value, FromBytes(*type, data));
+ return Literal(std::move(value), std::move(type));
+}
+
+} // namespace iceberg
diff --git a/src/iceberg/util/conversions.h b/src/iceberg/util/conversions.h
new file mode 100644
index 0000000..fe383bc
--- /dev/null
+++ b/src/iceberg/util/conversions.h
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+#include <span>
+#include <vector>
+
+#include "iceberg/expression/literal.h"
+#include "iceberg/result.h"
+#include "iceberg/type_fwd.h"
+
+/// \file iceberg/util/conversions.h
+/// \brief Conversion utilities for primitive types
+
+namespace iceberg {
+
+/// \brief Conversion utilities for primitive types
+class ICEBERG_EXPORT Conversions {
+ public:
+ /// \brief Serializes a raw literal value into a byte vector according to
its type.
+ /// \param type The primitive type of the value.
+ /// \param value The std::variant holding the raw literal value to serialize.
+ /// \return A Result containing the serialized value.
+ static Result<std::vector<uint8_t>> ToBytes(const PrimitiveType& type,
+ const Literal::Value& value);
+
+ /// \brief Serializes a complete Literal object into a byte vector.
+ /// \param literal The Literal object to serialize.
+ /// \return A Result containing the serialized value.
+ static Result<std::vector<uint8_t>> ToBytes(const Literal& literal);
+
+ /// \brief Deserializes a span of bytes into a raw literal value based on
the given
+ /// type.
+ /// \param type The target primitive type to interpret the bytes as.
+ /// \param data A std::span of bytes representing the serialized value.
+ /// \return A Result containing the deserialized value.
+ static Result<Literal::Value> FromBytes(const PrimitiveType& type,
+ std::span<const uint8_t> data);
+
+ /// \brief Deserializes a span of bytes into a complete Literal object.
+ /// \param type A shared pointer to the target primitive type.
+ /// \param data A std::span of bytes representing the serialized value.
+ /// \return A Result containing the deserialized value.
+ static Result<Literal> FromBytes(std::shared_ptr<PrimitiveType> type,
+ std::span<const uint8_t> data);
+};
+
+} // namespace iceberg