This is an automated email from the ASF dual-hosted git repository.
gangwu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-cpp.git
The following commit(s) were added to refs/heads/main by this push:
new e759807 feat: add SortOrder::Make and SortOrder::Validate (#300)
e759807 is described below
commit e759807cfd739068d1a90a0d1d26c421c0d5ec70
Author: Junwang Zhao <[email protected]>
AuthorDate: Mon Nov 10 10:08:14 2025 +0800
feat: add SortOrder::Make and SortOrder::Validate (#300)
---
src/iceberg/sort_order.cc | 64 +++++++++++++++++++-
src/iceberg/sort_order.h | 25 ++++++++
src/iceberg/test/schema_field_test.cc | 2 +-
src/iceberg/test/schema_test.cc | 1 -
src/iceberg/test/sort_order_test.cc | 109 ++++++++++++++++++++++++++++++++++
src/iceberg/test/transform_test.cc | 73 +++++++++++++++++++++++
src/iceberg/transform.cc | 69 +++++++++++++++++++++
src/iceberg/transform.h | 5 ++
8 files changed, 345 insertions(+), 3 deletions(-)
diff --git a/src/iceberg/sort_order.cc b/src/iceberg/sort_order.cc
index 48a3066..9e42b15 100644
--- a/src/iceberg/sort_order.cc
+++ b/src/iceberg/sort_order.cc
@@ -20,9 +20,15 @@
#include "iceberg/sort_order.h"
#include <format>
+#include <optional>
#include <ranges>
+#include "iceberg/result.h"
+#include "iceberg/schema.h"
+#include "iceberg/sort_field.h"
+#include "iceberg/transform.h"
#include "iceberg/util/formatter.h" // IWYU pragma: keep
+#include "iceberg/util/macros.h"
namespace iceberg {
@@ -31,7 +37,7 @@ SortOrder::SortOrder(int32_t order_id, std::vector<SortField>
fields)
const std::shared_ptr<SortOrder>& SortOrder::Unsorted() {
static const std::shared_ptr<SortOrder> unsorted =
- std::make_shared<SortOrder>(/*order_id=*/0, std::vector<SortField>{});
+ std::make_shared<SortOrder>(kUnsortedOrderId, std::vector<SortField>{});
return unsorted;
}
@@ -80,4 +86,60 @@ bool SortOrder::Equals(const SortOrder& other) const {
return order_id_ == other.order_id_ && fields_ == other.fields_;
}
+Status SortOrder::Validate(const Schema& schema) const {
+ for (const auto& field : fields_) {
+ auto schema_field = schema.FindFieldById(field.source_id());
+ if (!schema_field.has_value() || schema_field.value() == std::nullopt) {
+ return InvalidArgument("Cannot find source column for sort field: {}",
field);
+ }
+
+ const auto& source_type = schema_field.value().value().get().type();
+
+ if (!field.transform()->CanTransform(*source_type)) {
+ return InvalidArgument("Invalid source type {} for transform {}",
+ source_type->ToString(),
field.transform()->ToString());
+ }
+ }
+ return {};
+}
+
+Result<std::unique_ptr<SortOrder>> SortOrder::Make(const Schema& schema,
int32_t sort_id,
+ std::vector<SortField>
fields) {
+ if (!fields.empty() && sort_id == kUnsortedOrderId) [[unlikely]] {
+ return InvalidArgument("{} is reserved for unsorted sort order",
kUnsortedOrderId);
+ }
+
+ if (fields.empty() && sort_id != kUnsortedOrderId) [[unlikely]] {
+ return InvalidArgument("Sort order must have at least one sort field");
+ }
+
+ for (const auto& field : fields) {
+ ICEBERG_ASSIGN_OR_RAISE(auto schema_field,
schema.FindFieldById(field.source_id()));
+ if (schema_field == std::nullopt) {
+ return InvalidArgument("Cannot find source column for sort field: {}",
field);
+ }
+
+ const auto& source_type = schema_field.value().get().type();
+ if (field.transform()->CanTransform(*source_type) == false) {
+ return InvalidArgument("Invalid source type {} for transform {}",
+ source_type->ToString(),
field.transform()->ToString());
+ }
+ }
+
+ return std::make_unique<SortOrder>(sort_id, std::move(fields));
+}
+
+Result<std::unique_ptr<SortOrder>> SortOrder::Make(int32_t sort_id,
+ std::vector<SortField>
fields) {
+ if (!fields.empty() && sort_id == kUnsortedOrderId) [[unlikely]] {
+ return InvalidArgument("{} is reserved for unsorted sort order",
kUnsortedOrderId);
+ }
+
+ if (fields.empty() && sort_id != kUnsortedOrderId) [[unlikely]] {
+ return InvalidArgument("Sort order must have at least one sort field");
+ }
+
+ return std::make_unique<SortOrder>(sort_id, std::move(fields));
+}
+
} // namespace iceberg
diff --git a/src/iceberg/sort_order.h b/src/iceberg/sort_order.h
index e245a74..4a0b11a 100644
--- a/src/iceberg/sort_order.h
+++ b/src/iceberg/sort_order.h
@@ -20,11 +20,13 @@
#pragma once
#include <cstdint>
+#include <memory>
#include <span>
#include <vector>
#include "iceberg/iceberg_export.h"
#include "iceberg/sort_field.h"
+#include "iceberg/type_fwd.h"
#include "iceberg/util/formattable.h"
namespace iceberg {
@@ -36,6 +38,7 @@ namespace iceberg {
/// applied to the data.
class ICEBERG_EXPORT SortOrder : public util::Formattable {
public:
+ static constexpr int32_t kUnsortedOrderId = 0;
static constexpr int32_t kInitialSortOrderId = 1;
SortOrder(int32_t order_id, std::vector<SortField> fields);
@@ -69,6 +72,28 @@ class ICEBERG_EXPORT SortOrder : public util::Formattable {
return lhs.Equals(rhs);
}
+ /// \brief Validates the sort order against a schema.
+ /// \param schema The schema to validate against.
+ /// \return Error status if the sort order has any invalid transform.
+ Status Validate(const Schema& schema) const;
+
+ /// \brief Create a SortOrder.
+ /// \param schema The schema to bind the sort order to.
+ /// \param sort_id The sort order id.
+ /// \param fields The sort fields.
+ /// \return A Result containing the SortOrder or an error.
+ static Result<std::unique_ptr<SortOrder>> Make(const Schema& schema, int32_t
sort_id,
+ std::vector<SortField>
fields);
+
+ /// \brief Create a SortOrder without binding to a schema.
+ /// \param sort_id The sort order id.
+ /// \param fields The sort fields.
+ /// \return A Result containing the SortOrder or an error.
+ /// \note This method does not check whether the sort fields are valid for
any schema.
+ /// Use IsBoundToSchema to check if the sort order is valid for a given
schema.
+ static Result<std::unique_ptr<SortOrder>> Make(int32_t sort_id,
+ std::vector<SortField>
fields);
+
private:
/// \brief Compare two sort orders for equality.
bool Equals(const SortOrder& other) const;
diff --git a/src/iceberg/test/schema_field_test.cc
b/src/iceberg/test/schema_field_test.cc
index bc0dbbd..1078aad 100644
--- a/src/iceberg/test/schema_field_test.cc
+++ b/src/iceberg/test/schema_field_test.cc
@@ -63,7 +63,7 @@ TEST(SchemaFieldTest, Equality) {
iceberg::SchemaField field1(1, "foo", iceberg::int32(), false);
iceberg::SchemaField field2(2, "foo", iceberg::int32(), false);
iceberg::SchemaField field3(1, "bar", iceberg::int32(), false);
- iceberg::SchemaField field4(1, "foo", std::make_shared<iceberg::LongType>(),
false);
+ iceberg::SchemaField field4(1, "foo", iceberg::int64(), false);
iceberg::SchemaField field5(1, "foo", iceberg::int32(), true);
iceberg::SchemaField field6(1, "foo", iceberg::int32(), false);
diff --git a/src/iceberg/test/schema_test.cc b/src/iceberg/test/schema_test.cc
index d99e9b3..89a8d54 100644
--- a/src/iceberg/test/schema_test.cc
+++ b/src/iceberg/test/schema_test.cc
@@ -25,7 +25,6 @@
#include <gmock/gmock.h>
#include <gtest/gtest.h>
-#include "gtest/gtest.h"
#include "iceberg/result.h"
#include "iceberg/schema_field.h"
#include "iceberg/test/matchers.h"
diff --git a/src/iceberg/test/sort_order_test.cc
b/src/iceberg/test/sort_order_test.cc
index bb407af..86bd525 100644
--- a/src/iceberg/test/sort_order_test.cc
+++ b/src/iceberg/test/sort_order_test.cc
@@ -26,11 +26,40 @@
#include "iceberg/schema.h"
#include "iceberg/sort_field.h"
+#include "iceberg/test/matchers.h"
#include "iceberg/transform.h"
#include "iceberg/util/formatter.h" // IWYU pragma: keep
namespace iceberg {
+class SortOrderMakeTest : public ::testing::Test {
+ protected:
+ void SetUp() override {
+ field1_ = std::make_unique<SchemaField>(1, "x", int32(), true);
+ field2_ = std::make_unique<SchemaField>(2, "y", string(), true);
+ field3_ = std::make_unique<SchemaField>(3, "time", timestamp(), true);
+
+ schema_ = std::make_unique<Schema>(
+ std::vector<SchemaField>{*field1_, *field2_, *field3_}, 1);
+
+ sort_field1_ = std::make_unique<SortField>(
+ 1, Transform::Identity(), SortDirection::kAscending,
NullOrder::kFirst);
+ sort_field2_ = std::make_unique<SortField>(
+ 2, Transform::Bucket(10), SortDirection::kDescending,
NullOrder::kLast);
+ sort_field3_ = std::make_unique<SortField>(
+ 3, Transform::Day(), SortDirection::kAscending, NullOrder::kFirst);
+ }
+
+ std::unique_ptr<Schema> schema_;
+ std::unique_ptr<SchemaField> field1_;
+ std::unique_ptr<SchemaField> field2_;
+ std::unique_ptr<SchemaField> field3_;
+
+ std::unique_ptr<SortField> sort_field1_;
+ std::unique_ptr<SortField> sort_field2_;
+ std::unique_ptr<SortField> sort_field3_;
+};
+
TEST(SortOrderTest, Basics) {
{
SchemaField field1(5, "ts", iceberg::timestamp(), true);
@@ -148,4 +177,84 @@ TEST(SortOrderTest, Satisfies) {
EXPECT_FALSE(sort_order2.Satisfies(sort_order4));
}
+TEST_F(SortOrderMakeTest, MakeValidSortOrder) {
+ ICEBERG_UNWRAP_OR_FAIL(
+ auto sort_order,
+ SortOrder::Make(*schema_, 1, std::vector<SortField>{*sort_field1_,
*sort_field2_}));
+ ASSERT_NE(sort_order, nullptr);
+
+ EXPECT_TRUE(sort_order->is_sorted());
+ ASSERT_EQ(sort_order->fields().size(), 2);
+ EXPECT_EQ(sort_order->fields()[0], *sort_field1_);
+ EXPECT_EQ(sort_order->fields()[1], *sort_field2_);
+}
+
+TEST_F(SortOrderMakeTest, MakeInvalidSortOrderEmptyFields) {
+ auto sort_order = SortOrder::Make(*schema_, 1, std::vector<SortField>{});
+ EXPECT_THAT(sort_order, IsError(ErrorKind::kInvalidArgument));
+ EXPECT_THAT(sort_order,
+ HasErrorMessage("Sort order must have at least one sort field"));
+}
+
+TEST_F(SortOrderMakeTest, MakeInvalidSortOrderUnsortedId) {
+ auto sort_order = SortOrder::Make(*schema_, SortOrder::kUnsortedOrderId,
+ std::vector<SortField>{*sort_field1_});
+ EXPECT_THAT(sort_order, IsError(ErrorKind::kInvalidArgument));
+ EXPECT_THAT(sort_order,
+ HasErrorMessage(std::format("{} is reserved for unsorted sort
order",
+ SortOrder::kUnsortedOrderId)));
+}
+
+TEST_F(SortOrderMakeTest, MakeValidUnsortedSortOrder) {
+ ICEBERG_UNWRAP_OR_FAIL(auto sort_order,
SortOrder::Make(SortOrder::kUnsortedOrderId,
+
std::vector<SortField>{}));
+ ASSERT_NE(sort_order, nullptr);
+
+ EXPECT_TRUE(sort_order->is_unsorted());
+ EXPECT_EQ(sort_order->fields().size(), 0);
+}
+
+TEST_F(SortOrderMakeTest, MakeInvalidSortOrderNonPrimitiveField) {
+ auto struct_field = std::make_unique<SchemaField>(
+ 4, "struct_field",
+ std::make_shared<StructType>(std::vector<SchemaField>{
+ SchemaField::MakeRequired(41, "inner_field", iceberg::int32()),
+ }),
+ true);
+
+ Schema schema_with_struct(
+ std::vector<SchemaField>{*field1_, *field2_, *field3_, *struct_field},
1);
+
+ SortField sort_field_invalid(4, Transform::Identity(),
SortDirection::kAscending,
+ NullOrder::kFirst);
+
+ auto sort_order = SortOrder::Make(
+ schema_with_struct, 1, std::vector<SortField>{*sort_field1_,
sort_field_invalid});
+ EXPECT_THAT(sort_order, IsError(ErrorKind::kInvalidArgument));
+ EXPECT_THAT(sort_order, HasErrorMessage("Invalid source type"));
+}
+
+TEST_F(SortOrderMakeTest, MakeInvalidSortOrderFieldNotInSchema) {
+ SortField sort_field_invalid(999, Transform::Identity(),
SortDirection::kAscending,
+ NullOrder::kFirst);
+
+ auto sort_order = SortOrder::Make(
+ *schema_, 1, std::vector<SortField>{*sort_field1_, sort_field_invalid});
+ EXPECT_THAT(sort_order, IsError(ErrorKind::kInvalidArgument));
+ EXPECT_THAT(sort_order, HasErrorMessage("Cannot find source column for sort
field"));
+}
+
+TEST_F(SortOrderMakeTest, MakeUnboundSortOrder) {
+ SortField sort_field_invalid(999, Transform::Identity(),
SortDirection::kAscending,
+ NullOrder::kFirst);
+
+ auto sort_order =
+ SortOrder::Make(1, std::vector<SortField>{*sort_field1_,
sort_field_invalid});
+ ASSERT_THAT(sort_order, IsOk());
+ auto validate_status = sort_order.value()->Validate(*schema_);
+ EXPECT_THAT(validate_status, IsError(ErrorKind::kInvalidArgument));
+ EXPECT_THAT(validate_status,
+ HasErrorMessage("Cannot find source column for sort field"));
+}
+
} // namespace iceberg
diff --git a/src/iceberg/test/transform_test.cc
b/src/iceberg/test/transform_test.cc
index e4352af..529297b 100644
--- a/src/iceberg/test/transform_test.cc
+++ b/src/iceberg/test/transform_test.cc
@@ -27,6 +27,7 @@
#include <gtest/gtest.h>
#include "iceberg/expression/literal.h"
+#include "iceberg/schema_field.h"
#include "iceberg/test/matchers.h"
#include "iceberg/test/temporal_test_helper.h"
#include "iceberg/type.h"
@@ -841,4 +842,76 @@ TEST(TransformSatisfiesOrderOfTest, SatisfiesOrderOf) {
}
}
+TEST(TransformCanTransformTest, CanTransform) {
+ struct Case {
+ std::string transform_str;
+ std::shared_ptr<Type> source_type;
+ bool expected;
+ };
+
+ const std::vector<Case> cases = {
+ // Identity can transform all primitive types
+ {.transform_str = "identity", .source_type = int32(), .expected = true},
+ {.transform_str = "identity", .source_type = string(), .expected = true},
+ {.transform_str = "identity", .source_type = boolean(), .expected =
true},
+ {.transform_str = "identity",
+ .source_type = list(SchemaField(123, "element", int32(), false)),
+ .expected = false},
+
+ // Void can transform any type
+ {.transform_str = "void", .source_type = iceberg::int32(), .expected =
true},
+ {.transform_str = "void",
+ .source_type = iceberg::map(SchemaField(123, "key", iceberg::string(),
false),
+ SchemaField(124, "value", iceberg::int32(),
true)),
+ .expected = true},
+
+ // Bucket can transform specific types
+ {.transform_str = "bucket[16]", .source_type = iceberg::int32(),
.expected = true},
+ {.transform_str = "bucket[16]", .source_type = iceberg::string(),
.expected = true},
+ {.transform_str = "bucket[16]",
+ .source_type = iceberg::float32(),
+ .expected = false},
+
+ // Truncate can transform specific types
+ {.transform_str = "truncate[32]",
+ .source_type = iceberg::int32(),
+ .expected = true},
+ {.transform_str = "truncate[32]",
+ .source_type = iceberg::string(),
+ .expected = true},
+ {.transform_str = "truncate[32]",
+ .source_type = iceberg::boolean(),
+ .expected = false},
+
+ // Year can transform date and timestamp types
+ {.transform_str = "year", .source_type = iceberg::date(), .expected =
true},
+ {.transform_str = "year", .source_type = iceberg::timestamp(), .expected
= true},
+ {.transform_str = "year", .source_type = iceberg::string(), .expected =
false},
+
+ // Month can transform date and timestamp types
+ {.transform_str = "month", .source_type = iceberg::date(), .expected =
true},
+ {.transform_str = "month", .source_type = iceberg::timestamp(),
.expected = true},
+ {.transform_str = "month", .source_type = iceberg::binary(), .expected =
false},
+
+ // Day can transform date and timestamp types
+ {.transform_str = "day", .source_type = iceberg::date(), .expected =
true},
+ {.transform_str = "day", .source_type = iceberg::timestamp(), .expected
= true},
+ {.transform_str = "day", .source_type = iceberg::uuid(), .expected =
false},
+
+ // Hour can transform timestamp types
+ {.transform_str = "hour", .source_type = iceberg::timestamp(), .expected
= true},
+ {.transform_str = "hour", .source_type = iceberg::timestamp_tz(),
.expected = true},
+ {.transform_str = "hour", .source_type = iceberg::int32(), .expected =
false},
+ };
+
+ for (const auto& c : cases) {
+ auto transform = TransformFromString(c.transform_str);
+ ASSERT_TRUE(transform.has_value()) << "Failed to parse: " <<
c.transform_str;
+
+ EXPECT_EQ(transform.value()->CanTransform(*c.source_type), c.expected)
+ << "Unexpected result for transform: " << c.transform_str
+ << " and source type: " << c.source_type->ToString();
+ }
+}
+
} // namespace iceberg
diff --git a/src/iceberg/transform.cc b/src/iceberg/transform.cc
index 4724cc1..2e39cb0 100644
--- a/src/iceberg/transform.cc
+++ b/src/iceberg/transform.cc
@@ -125,6 +125,75 @@ Result<std::shared_ptr<TransformFunction>> Transform::Bind(
}
}
+bool Transform::CanTransform(const Type& source_type) const {
+ switch (transform_type_) {
+ case TransformType::kIdentity:
+ if (!source_type.is_primitive()) [[unlikely]] {
+ return false;
+ }
+ return true;
+ case TransformType::kVoid:
+ case TransformType::kUnknown:
+ return true;
+ case TransformType::kBucket:
+ switch (source_type.type_id()) {
+ case TypeId::kInt:
+ case TypeId::kLong:
+ case TypeId::kDecimal:
+ case TypeId::kDate:
+ case TypeId::kTime:
+ case TypeId::kTimestamp:
+ case TypeId::kTimestampTz:
+ case TypeId::kString:
+ case TypeId::kUuid:
+ case TypeId::kFixed:
+ case TypeId::kBinary:
+ return true;
+ default:
+ return false;
+ }
+ case TransformType::kTruncate:
+ switch (source_type.type_id()) {
+ case TypeId::kInt:
+ case TypeId::kLong:
+ case TypeId::kString:
+ case TypeId::kBinary:
+ case TypeId::kDecimal:
+ return true;
+ default:
+ return false;
+ }
+ case TransformType::kYear:
+ case TransformType::kMonth:
+ switch (source_type.type_id()) {
+ case TypeId::kDate:
+ case TypeId::kTimestamp:
+ case TypeId::kTimestampTz:
+ return true;
+ default:
+ return false;
+ }
+ case TransformType::kDay:
+ switch (source_type.type_id()) {
+ case TypeId::kDate:
+ case TypeId::kTimestamp:
+ case TypeId::kTimestampTz:
+ return true;
+ default:
+ return false;
+ }
+ case TransformType::kHour:
+ switch (source_type.type_id()) {
+ case TypeId::kTimestamp:
+ case TypeId::kTimestampTz:
+ return true;
+ default:
+ return false;
+ }
+ }
+ std::unreachable();
+}
+
bool Transform::PreservesOrder() const {
switch (transform_type_) {
case TransformType::kUnknown:
diff --git a/src/iceberg/transform.h b/src/iceberg/transform.h
index 21483e0..4f608c4 100644
--- a/src/iceberg/transform.h
+++ b/src/iceberg/transform.h
@@ -150,6 +150,11 @@ class ICEBERG_EXPORT Transform : public util::Formattable {
Result<std::shared_ptr<TransformFunction>> Bind(
const std::shared_ptr<Type>& source_type) const;
+ /// \brief Checks whether this function can be applied to the given Type.
+ /// \param source_type The source type to check.
+ /// \return true if this transform can be applied to the type, false
otherwise
+ bool CanTransform(const Type& source_type) const;
+
/// \brief Whether the transform preserves the order of values (is
monotonic).
bool PreservesOrder() const;