wgtmac commented on code in PR #488:
URL: https://github.com/apache/iceberg-cpp/pull/488#discussion_r2672672757
##########
src/iceberg/metrics_config.h:
##########
@@ -22,24 +22,120 @@
/// \file iceberg/metrics_config.h
/// \brief Metrics configuration for Iceberg tables
+#include <memory>
#include <string>
#include <unordered_map>
+#include <unordered_set>
#include "iceberg/iceberg_export.h"
#include "iceberg/result.h"
#include "iceberg/type_fwd.h"
+#include "iceberg/util/formattable.h"
namespace iceberg {
+class ICEBERG_EXPORT MetricsMode : public util::Formattable {
Review Comment:
Is it overkill to use a class hierarchy for this? Can we do this instead?
```cpp
struct MetricsMode {
enum class Kind : uint8_t { kNone, ... };
Kind kind;
std::variant<std::monostate, int32_t> param;
};
```
##########
src/iceberg/metrics_config.h:
##########
@@ -22,24 +22,120 @@
/// \file iceberg/metrics_config.h
/// \brief Metrics configuration for Iceberg tables
+#include <memory>
#include <string>
#include <unordered_map>
+#include <unordered_set>
#include "iceberg/iceberg_export.h"
#include "iceberg/result.h"
#include "iceberg/type_fwd.h"
+#include "iceberg/util/formattable.h"
namespace iceberg {
+class ICEBERG_EXPORT MetricsMode : public util::Formattable {
+ public:
+ enum class Kind : uint8_t {
+ kNone,
+ kCounts,
+ kTruncate,
+ kFull,
+ };
+
+ static Result<std::shared_ptr<MetricsMode>> FromString(const std::string&
mode);
+
+ static const std::shared_ptr<MetricsMode>& None();
+ static const std::shared_ptr<MetricsMode>& Counts();
+ static const std::shared_ptr<MetricsMode>& Truncate();
Review Comment:
Is it useful to keep this parameterless `Truncate`?
##########
src/iceberg/util/type_util.h:
##########
@@ -122,6 +122,20 @@ class PruneColumnVisitor {
const bool select_full_types_;
};
+/// \brief Visitor for get field IDs which could be used for projection.
+class GetProjectedIdsVisitor {
+ public:
+ explicit GetProjectedIdsVisitor(bool include_struct_ids = false);
+
+ Status Visit(const std::shared_ptr<Type>& type);
Review Comment:
```suggestion
Status Visit(const Type& type);
```
It looks weird that one is shared_ptr but the other is not. Is it better to
use `PrimitiveType` as the type?
##########
src/iceberg/metrics_config.h:
##########
@@ -22,24 +22,120 @@
/// \file iceberg/metrics_config.h
/// \brief Metrics configuration for Iceberg tables
+#include <memory>
#include <string>
#include <unordered_map>
+#include <unordered_set>
#include "iceberg/iceberg_export.h"
#include "iceberg/result.h"
#include "iceberg/type_fwd.h"
+#include "iceberg/util/formattable.h"
namespace iceberg {
+class ICEBERG_EXPORT MetricsMode : public util::Formattable {
+ public:
+ enum class Kind : uint8_t {
+ kNone,
+ kCounts,
+ kTruncate,
+ kFull,
+ };
+
+ static Result<std::shared_ptr<MetricsMode>> FromString(const std::string&
mode);
Review Comment:
```suggestion
static Result<std::shared_ptr<MetricsMode>> FromString(std::string_view
mode);
```
##########
src/iceberg/sort_order.cc:
##########
@@ -132,4 +132,18 @@ Result<std::unique_ptr<SortOrder>> SortOrder::Make(int32_t
sort_id,
return std::unique_ptr<SortOrder>(new SortOrder(sort_id, std::move(fields)));
}
+std::unordered_set<std::string_view> SortOrder::OrderPreservingSortedColumns(
+ const Schema& schema, const SortOrder& order) {
+ return order.fields() | std::views::filter([&schema](const SortField& field)
{
+ return field.transform()->PreservesOrder();
+ }) |
+ std::views::transform([&schema](const SortField& field) {
+ return schema.FindColumnNameById(field.source_id())
+ .value_or(std::nullopt)
Review Comment:
Why are there two `value_or` calls? Should we return an error if
`FindColumnNameById` returns an error instead of swallowing it? (This is not a strong opinion)
##########
src/iceberg/metrics_config.cc:
##########
@@ -19,15 +19,235 @@
#include "iceberg/metrics_config.h"
+#include <charconv>
#include <string>
#include <unordered_map>
#include "iceberg/result.h"
#include "iceberg/schema.h"
+#include "iceberg/sort_order.h"
+#include "iceberg/table.h"
#include "iceberg/table_properties.h"
+#include "iceberg/util/checked_cast.h"
+#include "iceberg/util/type_util.h"
namespace iceberg {
+namespace {
+
+constexpr std::string_view kNoneName = "none";
+constexpr std::string_view kCountsName = "counts";
+constexpr std::string_view kFullName = "full";
+constexpr std::string_view kTruncatePrefix = "truncate(";
+constexpr int32_t kDefaultTruncateLength = 16;
+const std::shared_ptr<MetricsMode> kDefaultMetricsMode =
+ std::make_shared<TruncateMetricsMode>(kDefaultTruncateLength);
+
+std::shared_ptr<MetricsMode> SortedColumnDefaultMode(
+ std::shared_ptr<MetricsMode> default_mode) {
+ if (default_mode->kind() == MetricsMode::Kind::kNone ||
+ default_mode->kind() == MetricsMode::Kind::kCounts) {
+ return kDefaultMetricsMode;
+ } else {
+ return std::move(default_mode);
+ }
+}
+
+int32_t MaxInferredColumns(const TableProperties& properties) {
+ int32_t max_inferred_columns =
+ properties.Get(TableProperties::kMetricsMaxInferredColumnDefaults);
+ if (max_inferred_columns < 0) {
+ // fallback to default
+ return TableProperties::kMetricsMaxInferredColumnDefaults.value();
+ }
+ return max_inferred_columns;
+}
+
+Result<std::shared_ptr<MetricsMode>> ParseMode(const std::string& mode,
+ std::shared_ptr<MetricsMode>
fallback) {
+ if (auto metrics_mode = MetricsMode::FromString(mode);
metrics_mode.has_value()) {
+ return std::move(metrics_mode.value());
+ }
+ return std::move(fallback);
+}
+
+} // namespace
+
+const std::shared_ptr<MetricsMode>& MetricsMode::None() {
+ static const std::shared_ptr<MetricsMode> none =
std::make_shared<NoneMetricsMode>();
+ return none;
+}
+
+const std::shared_ptr<MetricsMode>& MetricsMode::Counts() {
+ static const std::shared_ptr<MetricsMode> counts =
+ std::make_shared<CountsMetricsMode>();
+ return counts;
+}
+
+const std::shared_ptr<MetricsMode>& MetricsMode::Full() {
+ static const std::shared_ptr<MetricsMode> full =
std::make_shared<FullMetricsMode>();
+ return full;
+}
+
+const std::shared_ptr<MetricsMode>& MetricsMode::Truncate() {
+ return kDefaultMetricsMode;
+}
+
+Result<std::shared_ptr<MetricsMode>> MetricsMode::FromString(const
std::string& mode) {
+ if (StringUtils::EqualsIgnoreCase(mode, kNoneName)) {
+ return MetricsMode::None();
+ } else if (StringUtils::EqualsIgnoreCase(mode, kCountsName)) {
+ return MetricsMode::Counts();
+ } else if (StringUtils::EqualsIgnoreCase(mode, kFullName)) {
+ return MetricsMode::Full();
+ }
+
+ if (mode.starts_with(kTruncatePrefix) && mode.ends_with(")")) {
Review Comment:
It should also use case-insensitive comparison.
##########
src/iceberg/metrics_config.h:
##########
@@ -22,24 +22,120 @@
/// \file iceberg/metrics_config.h
/// \brief Metrics configuration for Iceberg tables
+#include <memory>
#include <string>
#include <unordered_map>
+#include <unordered_set>
#include "iceberg/iceberg_export.h"
#include "iceberg/result.h"
#include "iceberg/type_fwd.h"
+#include "iceberg/util/formattable.h"
namespace iceberg {
+class ICEBERG_EXPORT MetricsMode : public util::Formattable {
+ public:
+ enum class Kind : uint8_t {
+ kNone,
+ kCounts,
+ kTruncate,
+ kFull,
+ };
+
+ static Result<std::shared_ptr<MetricsMode>> FromString(const std::string&
mode);
+
+ static const std::shared_ptr<MetricsMode>& None();
+ static const std::shared_ptr<MetricsMode>& Counts();
+ static const std::shared_ptr<MetricsMode>& Truncate();
+ static const std::shared_ptr<MetricsMode>& Full();
+
+ /// \brief Return the kind of this metrics mode.
+ virtual Kind kind() const = 0;
+
+ std::string ToString() const override = 0;
+};
+
+class ICEBERG_EXPORT NoneMetricsMode : public MetricsMode {
+ public:
+ constexpr Kind kind() const override { return Kind::kNone; }
+
+ std::string ToString() const override;
+};
+
+class ICEBERG_EXPORT CountsMetricsMode : public MetricsMode {
+ public:
+ constexpr Kind kind() const override { return Kind::kCounts; }
+
+ std::string ToString() const override;
+};
+
+class ICEBERG_EXPORT TruncateMetricsMode : public MetricsMode {
+ public:
+ explicit TruncateMetricsMode(int32_t length) : length_(length) {}
+
+ constexpr Kind kind() const override { return Kind::kTruncate; }
+
+ std::string ToString() const override;
+
+ static Result<std::shared_ptr<MetricsMode>> Make(int32_t length);
+
+ private:
+ const int32_t length_;
+};
+
+class ICEBERG_EXPORT FullMetricsMode : public MetricsMode {
+ public:
+ constexpr Kind kind() const override { return Kind::kFull; }
+
+ std::string ToString() const override;
+};
+
/// \brief Configuration utilities for table metrics
class ICEBERG_EXPORT MetricsConfig {
public:
+ MetricsConfig(
+ std::unordered_map<std::string, std::shared_ptr<MetricsMode>>
column_modes,
+ std::shared_ptr<MetricsMode> default_mode);
+
+ /// \brief Get the default metrics config.
+ static const std::shared_ptr<MetricsConfig>& Default();
+
+ /// \brief Creates a metrics config from a table.
+ static Result<std::shared_ptr<MetricsConfig>> Make(std::shared_ptr<Table>
table);
Review Comment:
```suggestion
static Result<std::unique_ptr<MetricsConfig>> Make(const Table& table);
```
##########
src/iceberg/metrics_config.h:
##########
@@ -22,24 +22,120 @@
/// \file iceberg/metrics_config.h
/// \brief Metrics configuration for Iceberg tables
+#include <memory>
#include <string>
#include <unordered_map>
+#include <unordered_set>
#include "iceberg/iceberg_export.h"
#include "iceberg/result.h"
#include "iceberg/type_fwd.h"
+#include "iceberg/util/formattable.h"
namespace iceberg {
+class ICEBERG_EXPORT MetricsMode : public util::Formattable {
+ public:
+ enum class Kind : uint8_t {
+ kNone,
+ kCounts,
+ kTruncate,
+ kFull,
+ };
+
+ static Result<std::shared_ptr<MetricsMode>> FromString(const std::string&
mode);
+
+ static const std::shared_ptr<MetricsMode>& None();
+ static const std::shared_ptr<MetricsMode>& Counts();
+ static const std::shared_ptr<MetricsMode>& Truncate();
+ static const std::shared_ptr<MetricsMode>& Full();
+
+ /// \brief Return the kind of this metrics mode.
+ virtual Kind kind() const = 0;
+
+ std::string ToString() const override = 0;
+};
+
+class ICEBERG_EXPORT NoneMetricsMode : public MetricsMode {
+ public:
+ constexpr Kind kind() const override { return Kind::kNone; }
+
+ std::string ToString() const override;
+};
+
+class ICEBERG_EXPORT CountsMetricsMode : public MetricsMode {
+ public:
+ constexpr Kind kind() const override { return Kind::kCounts; }
+
+ std::string ToString() const override;
+};
+
+class ICEBERG_EXPORT TruncateMetricsMode : public MetricsMode {
+ public:
+ explicit TruncateMetricsMode(int32_t length) : length_(length) {}
+
+ constexpr Kind kind() const override { return Kind::kTruncate; }
+
+ std::string ToString() const override;
+
+ static Result<std::shared_ptr<MetricsMode>> Make(int32_t length);
+
+ private:
+ const int32_t length_;
+};
+
+class ICEBERG_EXPORT FullMetricsMode : public MetricsMode {
+ public:
+ constexpr Kind kind() const override { return Kind::kFull; }
+
+ std::string ToString() const override;
+};
+
/// \brief Configuration utilities for table metrics
class ICEBERG_EXPORT MetricsConfig {
public:
+ MetricsConfig(
+ std::unordered_map<std::string, std::shared_ptr<MetricsMode>>
column_modes,
+ std::shared_ptr<MetricsMode> default_mode);
Review Comment:
```suggestion
std::unordered_map<std::string, std::string>> properties);
```
Should we use this as the function signature? I think MetricsConfig is
created directly from the table property map.
##########
src/iceberg/metrics_config.cc:
##########
@@ -19,15 +19,235 @@
#include "iceberg/metrics_config.h"
+#include <charconv>
#include <string>
#include <unordered_map>
#include "iceberg/result.h"
#include "iceberg/schema.h"
+#include "iceberg/sort_order.h"
+#include "iceberg/table.h"
#include "iceberg/table_properties.h"
+#include "iceberg/util/checked_cast.h"
+#include "iceberg/util/type_util.h"
namespace iceberg {
+namespace {
+
+constexpr std::string_view kNoneName = "none";
+constexpr std::string_view kCountsName = "counts";
+constexpr std::string_view kFullName = "full";
+constexpr std::string_view kTruncatePrefix = "truncate(";
+constexpr int32_t kDefaultTruncateLength = 16;
+const std::shared_ptr<MetricsMode> kDefaultMetricsMode =
+ std::make_shared<TruncateMetricsMode>(kDefaultTruncateLength);
+
+std::shared_ptr<MetricsMode> SortedColumnDefaultMode(
+ std::shared_ptr<MetricsMode> default_mode) {
+ if (default_mode->kind() == MetricsMode::Kind::kNone ||
+ default_mode->kind() == MetricsMode::Kind::kCounts) {
+ return kDefaultMetricsMode;
+ } else {
+ return std::move(default_mode);
+ }
+}
+
+int32_t MaxInferredColumns(const TableProperties& properties) {
+ int32_t max_inferred_columns =
+ properties.Get(TableProperties::kMetricsMaxInferredColumnDefaults);
+ if (max_inferred_columns < 0) {
+ // fallback to default
+ return TableProperties::kMetricsMaxInferredColumnDefaults.value();
+ }
+ return max_inferred_columns;
+}
+
+Result<std::shared_ptr<MetricsMode>> ParseMode(const std::string& mode,
+ std::shared_ptr<MetricsMode>
fallback) {
+ if (auto metrics_mode = MetricsMode::FromString(mode);
metrics_mode.has_value()) {
+ return std::move(metrics_mode.value());
+ }
+ return std::move(fallback);
+}
+
+} // namespace
+
+const std::shared_ptr<MetricsMode>& MetricsMode::None() {
+ static const std::shared_ptr<MetricsMode> none =
std::make_shared<NoneMetricsMode>();
+ return none;
+}
+
+const std::shared_ptr<MetricsMode>& MetricsMode::Counts() {
+ static const std::shared_ptr<MetricsMode> counts =
+ std::make_shared<CountsMetricsMode>();
+ return counts;
+}
+
+const std::shared_ptr<MetricsMode>& MetricsMode::Full() {
+ static const std::shared_ptr<MetricsMode> full =
std::make_shared<FullMetricsMode>();
+ return full;
+}
+
+const std::shared_ptr<MetricsMode>& MetricsMode::Truncate() {
+ return kDefaultMetricsMode;
+}
+
+Result<std::shared_ptr<MetricsMode>> MetricsMode::FromString(const
std::string& mode) {
+ if (StringUtils::EqualsIgnoreCase(mode, kNoneName)) {
+ return MetricsMode::None();
+ } else if (StringUtils::EqualsIgnoreCase(mode, kCountsName)) {
+ return MetricsMode::Counts();
+ } else if (StringUtils::EqualsIgnoreCase(mode, kFullName)) {
+ return MetricsMode::Full();
+ }
+
+ if (mode.starts_with(kTruncatePrefix) && mode.ends_with(")")) {
+ int32_t length;
+ auto [ptr, ec] = std::from_chars(mode.data() + 9 /* "truncate(" length */,
+ mode.data() + mode.size() - 1, length);
+ if (ec != std::errc{}) {
+ return InvalidArgument("Invalid truncate mode: {}", mode);
+ }
+ if (length == kDefaultTruncateLength) {
+ return kDefaultMetricsMode;
+ }
+ return TruncateMetricsMode::Make(length);
+ }
+ return InvalidArgument("Invalid metrics mode: {}", mode);
+}
+
+std::string NoneMetricsMode::ToString() const { return std::string(kNoneName);
}
+std::string CountsMetricsMode::ToString() const { return
std::string(kCountsName); }
+std::string FullMetricsMode::ToString() const { return std::string(kFullName);
}
+std::string TruncateMetricsMode::ToString() const {
+ return std::format("truncate({})", length_);
+}
+
+Result<std::shared_ptr<MetricsMode>> TruncateMetricsMode::Make(int32_t length)
{
+ ICEBERG_PRECHECK(length > 0, "Truncate length should be positive.");
+ return std::make_shared<TruncateMetricsMode>(length);
+}
+
+MetricsConfig::MetricsConfig(
+ std::unordered_map<std::string, std::shared_ptr<MetricsMode>> column_modes,
+ std::shared_ptr<MetricsMode> default_mode)
+ : column_modes_(std::move(column_modes)),
default_mode_(std::move(default_mode)) {}
+
+const std::shared_ptr<MetricsConfig>& MetricsConfig::Default() {
+ static const auto default_config = std::make_shared<MetricsConfig>(
+ std::unordered_map<std::string, std::shared_ptr<MetricsMode>>{},
+ kDefaultMetricsMode);
+ return default_config;
+}
+
+Result<std::shared_ptr<MetricsConfig>>
MetricsConfig::Make(std::shared_ptr<Table> table) {
+ ICEBERG_PRECHECK(table != nullptr, "table cannot be null");
+ ICEBERG_ASSIGN_OR_RAISE(auto schema, table->schema());
+
+ auto sort_order = table->sort_order();
+ return MakeInternal(
+ table->properties(), *schema,
+ sort_order.has_value() ? *sort_order.value() : *SortOrder::Unsorted());
+}
+
+Result<std::shared_ptr<MetricsConfig>> MetricsConfig::MakeInternal(
+ const TableProperties& props, const Schema& schema, const SortOrder&
order) {
+ std::unordered_map<std::string, std::shared_ptr<MetricsMode>> column_modes;
+
+ std::shared_ptr<MetricsMode> default_mode = kDefaultMetricsMode;
+ if
(props.configs().contains(TableProperties::kDefaultWriteMetricsMode.key())) {
+ std::string configured_metrics_mode =
+ props.Get(TableProperties::kDefaultWriteMetricsMode);
+ ICEBERG_ASSIGN_OR_RAISE(default_mode,
+ ParseMode(configured_metrics_mode,
kDefaultMetricsMode));
+ } else {
+ int32_t max_inferred_columns = MaxInferredColumns(props);
+ GetProjectedIdsVisitor visitor(true);
+ ICEBERG_RETURN_UNEXPECTED(
+ visitor.Visit(internal::checked_cast<const StructType&>(schema)));
+ int32_t projected_columns = visitor.Finish().size();
+ if (max_inferred_columns < projected_columns) {
+ ICEBERG_ASSIGN_OR_RAISE(auto limit_field_ids,
+ LimitFieldIds(schema, max_inferred_columns));
+ for (auto id : limit_field_ids) {
+ ICEBERG_ASSIGN_OR_RAISE(auto column_name,
schema.FindColumnNameById(id));
+ ICEBERG_CHECK(column_name.has_value(), "Field id {} not found in
schema", id);
+ column_modes[std::string(column_name.value())] = kDefaultMetricsMode;
+ }
+ // All other columns don't use metrics
+ default_mode = MetricsMode::None();
+ }
+ }
+
+ // First set sorted column with sorted column default (can be overridden by
user)
+ auto sorted_col_default_mode = SortedColumnDefaultMode(default_mode);
+ auto sorted_columns = SortOrder::OrderPreservingSortedColumns(schema, order);
+ for (const auto& sc : sorted_columns) {
+ column_modes[std::string(sc)] = sorted_col_default_mode;
+ }
+
+ // Handle user overrides of defaults
+ for (const auto& prop : props.configs()) {
+ if (prop.first.starts_with(TableProperties::kMetricModeColumnConfPrefix)) {
+ std::string column_alias =
+
prop.first.substr(TableProperties::kMetricModeColumnConfPrefix.size());
+ ICEBERG_ASSIGN_OR_RAISE(auto mode, ParseMode(prop.second, default_mode));
+ column_modes[std::move(column_alias)] = mode;
+ }
+ }
+
+ return std::make_shared<MetricsConfig>(std::move(column_modes),
+ std::move(default_mode));
+}
+
+Result<std::unordered_set<int32_t>> MetricsConfig::LimitFieldIds(const Schema&
schema,
+ int32_t
limit) {
+ class Visitor {
+ public:
+ explicit Visitor(int32_t limit) : limit_(limit) {}
+
+ Status Visit(const std::shared_ptr<Type>& type) {
+ if (type->is_nested()) {
+ return Visit(internal::checked_cast<const NestedType&>(*type));
+ }
+ return {};
+ }
+
+ Status Visit(const NestedType& type) {
+ for (auto& field : type.fields()) {
+ if (!ShouldContinue()) {
+ break;
+ }
+ if (field.type()->is_primitive()) {
Review Comment:
Add a TODO comment to support the `variant` type in the future.
##########
src/iceberg/metrics_config.h:
##########
@@ -22,24 +22,120 @@
/// \file iceberg/metrics_config.h
/// \brief Metrics configuration for Iceberg tables
+#include <memory>
#include <string>
#include <unordered_map>
+#include <unordered_set>
#include "iceberg/iceberg_export.h"
#include "iceberg/result.h"
#include "iceberg/type_fwd.h"
+#include "iceberg/util/formattable.h"
namespace iceberg {
+class ICEBERG_EXPORT MetricsMode : public util::Formattable {
+ public:
+ enum class Kind : uint8_t {
+ kNone,
+ kCounts,
+ kTruncate,
+ kFull,
+ };
+
+ static Result<std::shared_ptr<MetricsMode>> FromString(const std::string&
mode);
+
+ static const std::shared_ptr<MetricsMode>& None();
+ static const std::shared_ptr<MetricsMode>& Counts();
+ static const std::shared_ptr<MetricsMode>& Truncate();
+ static const std::shared_ptr<MetricsMode>& Full();
+
+ /// \brief Return the kind of this metrics mode.
+ virtual Kind kind() const = 0;
+
+ std::string ToString() const override = 0;
+};
+
+class ICEBERG_EXPORT NoneMetricsMode : public MetricsMode {
+ public:
+ constexpr Kind kind() const override { return Kind::kNone; }
+
+ std::string ToString() const override;
+};
+
+class ICEBERG_EXPORT CountsMetricsMode : public MetricsMode {
+ public:
+ constexpr Kind kind() const override { return Kind::kCounts; }
+
+ std::string ToString() const override;
+};
+
+class ICEBERG_EXPORT TruncateMetricsMode : public MetricsMode {
+ public:
+ explicit TruncateMetricsMode(int32_t length) : length_(length) {}
+
+ constexpr Kind kind() const override { return Kind::kTruncate; }
+
+ std::string ToString() const override;
+
+ static Result<std::shared_ptr<MetricsMode>> Make(int32_t length);
+
+ private:
+ const int32_t length_;
+};
+
+class ICEBERG_EXPORT FullMetricsMode : public MetricsMode {
+ public:
+ constexpr Kind kind() const override { return Kind::kFull; }
+
+ std::string ToString() const override;
+};
+
/// \brief Configuration utilities for table metrics
class ICEBERG_EXPORT MetricsConfig {
public:
+ MetricsConfig(
+ std::unordered_map<std::string, std::shared_ptr<MetricsMode>>
column_modes,
+ std::shared_ptr<MetricsMode> default_mode);
Review Comment:
BTW, should we make it a private ctor and add a Make function to create one
by validating all inputs?
##########
src/iceberg/metrics_config.cc:
##########
@@ -19,15 +19,235 @@
#include "iceberg/metrics_config.h"
+#include <charconv>
#include <string>
#include <unordered_map>
#include "iceberg/result.h"
#include "iceberg/schema.h"
+#include "iceberg/sort_order.h"
+#include "iceberg/table.h"
#include "iceberg/table_properties.h"
+#include "iceberg/util/checked_cast.h"
+#include "iceberg/util/type_util.h"
namespace iceberg {
+namespace {
+
+constexpr std::string_view kNoneName = "none";
+constexpr std::string_view kCountsName = "counts";
+constexpr std::string_view kFullName = "full";
+constexpr std::string_view kTruncatePrefix = "truncate(";
+constexpr int32_t kDefaultTruncateLength = 16;
+const std::shared_ptr<MetricsMode> kDefaultMetricsMode =
+ std::make_shared<TruncateMetricsMode>(kDefaultTruncateLength);
+
+std::shared_ptr<MetricsMode> SortedColumnDefaultMode(
+ std::shared_ptr<MetricsMode> default_mode) {
+ if (default_mode->kind() == MetricsMode::Kind::kNone ||
+ default_mode->kind() == MetricsMode::Kind::kCounts) {
+ return kDefaultMetricsMode;
+ } else {
+ return std::move(default_mode);
+ }
+}
+
+int32_t MaxInferredColumns(const TableProperties& properties) {
+ int32_t max_inferred_columns =
+ properties.Get(TableProperties::kMetricsMaxInferredColumnDefaults);
+ if (max_inferred_columns < 0) {
+ // fallback to default
+ return TableProperties::kMetricsMaxInferredColumnDefaults.value();
+ }
+ return max_inferred_columns;
+}
+
+Result<std::shared_ptr<MetricsMode>> ParseMode(const std::string& mode,
+ std::shared_ptr<MetricsMode>
fallback) {
+ if (auto metrics_mode = MetricsMode::FromString(mode);
metrics_mode.has_value()) {
+ return std::move(metrics_mode.value());
+ }
+ return std::move(fallback);
+}
+
+} // namespace
+
+const std::shared_ptr<MetricsMode>& MetricsMode::None() {
+ static const std::shared_ptr<MetricsMode> none =
std::make_shared<NoneMetricsMode>();
+ return none;
+}
+
+const std::shared_ptr<MetricsMode>& MetricsMode::Counts() {
+ static const std::shared_ptr<MetricsMode> counts =
+ std::make_shared<CountsMetricsMode>();
+ return counts;
+}
+
+const std::shared_ptr<MetricsMode>& MetricsMode::Full() {
+ static const std::shared_ptr<MetricsMode> full =
std::make_shared<FullMetricsMode>();
+ return full;
+}
+
+const std::shared_ptr<MetricsMode>& MetricsMode::Truncate() {
+ return kDefaultMetricsMode;
+}
+
+Result<std::shared_ptr<MetricsMode>> MetricsMode::FromString(const
std::string& mode) {
+ if (StringUtils::EqualsIgnoreCase(mode, kNoneName)) {
+ return MetricsMode::None();
+ } else if (StringUtils::EqualsIgnoreCase(mode, kCountsName)) {
+ return MetricsMode::Counts();
+ } else if (StringUtils::EqualsIgnoreCase(mode, kFullName)) {
+ return MetricsMode::Full();
+ }
+
+ if (mode.starts_with(kTruncatePrefix) && mode.ends_with(")")) {
+ int32_t length;
+ auto [ptr, ec] = std::from_chars(mode.data() + 9 /* "truncate(" length */,
+ mode.data() + mode.size() - 1, length);
+ if (ec != std::errc{}) {
+ return InvalidArgument("Invalid truncate mode: {}", mode);
+ }
+ if (length == kDefaultTruncateLength) {
+ return kDefaultMetricsMode;
+ }
+ return TruncateMetricsMode::Make(length);
+ }
+ return InvalidArgument("Invalid metrics mode: {}", mode);
+}
+
+std::string NoneMetricsMode::ToString() const { return std::string(kNoneName);
}
+std::string CountsMetricsMode::ToString() const { return
std::string(kCountsName); }
+std::string FullMetricsMode::ToString() const { return std::string(kFullName);
}
+std::string TruncateMetricsMode::ToString() const {
+ return std::format("truncate({})", length_);
+}
+
+Result<std::shared_ptr<MetricsMode>> TruncateMetricsMode::Make(int32_t length)
{
+ ICEBERG_PRECHECK(length > 0, "Truncate length should be positive.");
+ return std::make_shared<TruncateMetricsMode>(length);
+}
+
+MetricsConfig::MetricsConfig(
+ std::unordered_map<std::string, std::shared_ptr<MetricsMode>> column_modes,
+ std::shared_ptr<MetricsMode> default_mode)
+ : column_modes_(std::move(column_modes)),
default_mode_(std::move(default_mode)) {}
+
+const std::shared_ptr<MetricsConfig>& MetricsConfig::Default() {
+ static const auto default_config = std::make_shared<MetricsConfig>(
+ std::unordered_map<std::string, std::shared_ptr<MetricsMode>>{},
+ kDefaultMetricsMode);
+ return default_config;
+}
+
+Result<std::shared_ptr<MetricsConfig>>
MetricsConfig::Make(std::shared_ptr<Table> table) {
+ ICEBERG_PRECHECK(table != nullptr, "table cannot be null");
+ ICEBERG_ASSIGN_OR_RAISE(auto schema, table->schema());
+
+ auto sort_order = table->sort_order();
+ return MakeInternal(
+ table->properties(), *schema,
+ sort_order.has_value() ? *sort_order.value() : *SortOrder::Unsorted());
+}
+
+Result<std::shared_ptr<MetricsConfig>> MetricsConfig::MakeInternal(
+ const TableProperties& props, const Schema& schema, const SortOrder&
order) {
+ std::unordered_map<std::string, std::shared_ptr<MetricsMode>> column_modes;
+
+ std::shared_ptr<MetricsMode> default_mode = kDefaultMetricsMode;
+ if
(props.configs().contains(TableProperties::kDefaultWriteMetricsMode.key())) {
+ std::string configured_metrics_mode =
+ props.Get(TableProperties::kDefaultWriteMetricsMode);
+ ICEBERG_ASSIGN_OR_RAISE(default_mode,
+ ParseMode(configured_metrics_mode,
kDefaultMetricsMode));
+ } else {
+ int32_t max_inferred_columns = MaxInferredColumns(props);
+ GetProjectedIdsVisitor visitor(true);
+ ICEBERG_RETURN_UNEXPECTED(
+ visitor.Visit(internal::checked_cast<const StructType&>(schema)));
+ int32_t projected_columns = visitor.Finish().size();
+ if (max_inferred_columns < projected_columns) {
+ ICEBERG_ASSIGN_OR_RAISE(auto limit_field_ids,
+ LimitFieldIds(schema, max_inferred_columns));
+ for (auto id : limit_field_ids) {
+ ICEBERG_ASSIGN_OR_RAISE(auto column_name,
schema.FindColumnNameById(id));
+ ICEBERG_CHECK(column_name.has_value(), "Field id {} not found in
schema", id);
+ column_modes[std::string(column_name.value())] = kDefaultMetricsMode;
+ }
+ // All other columns don't use metrics
+ default_mode = MetricsMode::None();
+ }
+ }
+
+ // First set sorted column with sorted column default (can be overridden by
user)
+ auto sorted_col_default_mode = SortedColumnDefaultMode(default_mode);
+ auto sorted_columns = SortOrder::OrderPreservingSortedColumns(schema, order);
+ for (const auto& sc : sorted_columns) {
+ column_modes[std::string(sc)] = sorted_col_default_mode;
+ }
+
+ // Handle user overrides of defaults
+ for (const auto& prop : props.configs()) {
+ if (prop.first.starts_with(TableProperties::kMetricModeColumnConfPrefix)) {
+ std::string column_alias =
+
prop.first.substr(TableProperties::kMetricModeColumnConfPrefix.size());
+ ICEBERG_ASSIGN_OR_RAISE(auto mode, ParseMode(prop.second, default_mode));
+ column_modes[std::move(column_alias)] = mode;
+ }
+ }
+
+ return std::make_shared<MetricsConfig>(std::move(column_modes),
+ std::move(default_mode));
+}
+
+Result<std::unordered_set<int32_t>> MetricsConfig::LimitFieldIds(const Schema&
schema,
+ int32_t
limit) {
+ class Visitor {
+ public:
+ explicit Visitor(int32_t limit) : limit_(limit) {}
+
+ Status Visit(const std::shared_ptr<Type>& type) {
Review Comment:
Similarly, it looks weird to use `const std::shared_ptr<Type>&` and `const
NestedType&` inconsistently. Can you fix this? You may find `VisitTypeInline`
useful.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]