This is an automated email from the ASF dual-hosted git repository.
huaxingao pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg.git
The following commit(s) were added to refs/heads/main by this push:
new 17891bc4e3 API, Core: Align offsets of field stats with Design doc /
Spec (#15432)
17891bc4e3 is described below
commit 17891bc4e300c49b3678117be3189864aaf3866b
Author: Eduard Tudenhoefner <[email protected]>
AuthorDate: Wed Feb 25 20:09:20 2026 +0100
API, Core: Align offsets of field stats with Design doc / Spec (#15432)
---
.../org/apache/iceberg/stats/FieldStatistic.java | 92 +++++++++++++---------
.../java/org/apache/iceberg/stats/StatsUtil.java | 2 +-
.../org/apache/iceberg/stats/TestStatsUtil.java | 24 +++---
.../org/apache/iceberg/stats/BaseFieldStats.java | 4 +-
.../org/apache/iceberg/stats/TestContentStats.java | 22 +++---
.../org/apache/iceberg/stats/TestFieldStats.java | 22 +++---
6 files changed, 91 insertions(+), 75 deletions(-)
diff --git a/api/src/main/java/org/apache/iceberg/stats/FieldStatistic.java
b/api/src/main/java/org/apache/iceberg/stats/FieldStatistic.java
index 7715359ea2..8d13ba5567 100644
--- a/api/src/main/java/org/apache/iceberg/stats/FieldStatistic.java
+++ b/api/src/main/java/org/apache/iceberg/stats/FieldStatistic.java
@@ -24,14 +24,14 @@ import org.apache.iceberg.types.Type;
import org.apache.iceberg.types.Types;
public enum FieldStatistic {
- VALUE_COUNT(0, "value_count"),
- NULL_VALUE_COUNT(1, "null_value_count"),
- NAN_VALUE_COUNT(2, "nan_value_count"),
- AVG_VALUE_SIZE(3, "avg_value_size"),
- MAX_VALUE_SIZE(4, "max_value_size"),
- LOWER_BOUND(5, "lower_bound"),
- UPPER_BOUND(6, "upper_bound"),
- EXACT_BOUNDS(7, "exact_bounds");
+ VALUE_COUNT(1, "value_count"),
+ NULL_VALUE_COUNT(2, "null_value_count"),
+ NAN_VALUE_COUNT(3, "nan_value_count"),
+ AVG_VALUE_SIZE(4, "avg_value_size"),
+ MAX_VALUE_SIZE(5, "max_value_size"),
+ LOWER_BOUND(6, "lower_bound"),
+ UPPER_BOUND(7, "upper_bound"),
+ EXACT_BOUNDS(8, "exact_bounds");
private final int offset;
private final String fieldName;
@@ -41,68 +41,84 @@ public enum FieldStatistic {
this.fieldName = fieldName;
}
+ /**
+ * The offset from the field ID of the base stats structure
+ *
+ * @return The offset from the field ID of the base stats structure
+ */
public int offset() {
return offset;
}
+ /**
+ * The ordinal position (0-based) within the stats structure
+ *
+ * @return The ordinal position (0-based) within the stats structure
+ */
+ public int position() {
+ return offset - 1;
+ }
+
+ /**
+ * The field name
+ *
+ * @return The field name
+ */
public String fieldName() {
return fieldName;
}
- public static FieldStatistic fromOffset(int offset) {
- switch (offset) {
- case 0:
- return VALUE_COUNT;
- case 1:
- return NULL_VALUE_COUNT;
- case 2:
- return NAN_VALUE_COUNT;
- case 3:
- return AVG_VALUE_SIZE;
- case 4:
- return MAX_VALUE_SIZE;
- case 5:
- return LOWER_BOUND;
- case 6:
- return UPPER_BOUND;
- case 7:
- return EXACT_BOUNDS;
- default:
- throw new IllegalArgumentException("Invalid statistic offset: " +
offset);
- }
+ /**
+ * Returns the {@link FieldStatistic} from its ordinal position (0-based) in
the stats structure
+ *
+ * @param position The ordinal position (0-based) in the stats structure
+ * @return The {@link FieldStatistic} from its ordinal position (0-based) in
the stats structure
+ */
+ public static FieldStatistic fromPosition(int position) {
+ return switch (position) {
+ case 0 -> VALUE_COUNT;
+ case 1 -> NULL_VALUE_COUNT;
+ case 2 -> NAN_VALUE_COUNT;
+ case 3 -> AVG_VALUE_SIZE;
+ case 4 -> MAX_VALUE_SIZE;
+ case 5 -> LOWER_BOUND;
+ case 6 -> UPPER_BOUND;
+ case 7 -> EXACT_BOUNDS;
+ default -> throw new IllegalArgumentException("Invalid statistic
position: " + position);
+ };
}
- public static Types.StructType fieldStatsFor(Type type, int fieldId) {
+ public static Types.StructType fieldStatsFor(Type type, int baseFieldId) {
return Types.StructType.of(
optional(
- fieldId + VALUE_COUNT.offset(),
+ baseFieldId + VALUE_COUNT.offset(),
VALUE_COUNT.fieldName(),
Types.LongType.get(),
"Total value count, including null and NaN"),
optional(
- fieldId + NULL_VALUE_COUNT.offset(),
+ baseFieldId + NULL_VALUE_COUNT.offset(),
NULL_VALUE_COUNT.fieldName(),
Types.LongType.get(),
"Total null value count"),
optional(
- fieldId + NAN_VALUE_COUNT.offset(),
+ baseFieldId + NAN_VALUE_COUNT.offset(),
NAN_VALUE_COUNT.fieldName(),
Types.LongType.get(),
"Total NaN value count"),
optional(
- fieldId + AVG_VALUE_SIZE.offset(),
+ baseFieldId + AVG_VALUE_SIZE.offset(),
AVG_VALUE_SIZE.fieldName(),
Types.IntegerType.get(),
"Avg value size of variable-length types (String, Binary)"),
optional(
- fieldId + MAX_VALUE_SIZE.offset(),
+ baseFieldId + MAX_VALUE_SIZE.offset(),
MAX_VALUE_SIZE.fieldName(),
Types.IntegerType.get(),
"Max value size of variable-length types (String, Binary)"),
- optional(fieldId + LOWER_BOUND.offset(), LOWER_BOUND.fieldName(),
type, "Lower bound"),
- optional(fieldId + UPPER_BOUND.offset(), UPPER_BOUND.fieldName(),
type, "Upper bound"),
+ optional(baseFieldId + LOWER_BOUND.offset(), LOWER_BOUND.fieldName(),
type, "Lower bound"),
+ optional(baseFieldId + UPPER_BOUND.offset(), UPPER_BOUND.fieldName(),
type, "Upper bound"),
optional(
- fieldId + EXACT_BOUNDS.offset(),
+ baseFieldId + EXACT_BOUNDS.offset(),
EXACT_BOUNDS.fieldName(),
Types.BooleanType.get(),
"Whether the upper/lower bound is exact or not"));
diff --git a/api/src/main/java/org/apache/iceberg/stats/StatsUtil.java
b/api/src/main/java/org/apache/iceberg/stats/StatsUtil.java
index 1e2bef98a7..349f9fe75b 100644
--- a/api/src/main/java/org/apache/iceberg/stats/StatsUtil.java
+++ b/api/src/main/java/org/apache/iceberg/stats/StatsUtil.java
@@ -178,7 +178,7 @@ public class StatsUtil {
int fieldId = StatsUtil.statsFieldIdForField(field.fieldId());
if (fieldId >= 0) {
- Types.StructType structType =
FieldStatistic.fieldStatsFor(field.type(), fieldId + 1);
+ Types.StructType structType =
FieldStatistic.fieldStatsFor(field.type(), fieldId);
return optional(fieldId, Integer.toString(field.fieldId()),
structType);
} else {
skippedFieldIds.add(field.fieldId());
diff --git a/api/src/test/java/org/apache/iceberg/stats/TestStatsUtil.java
b/api/src/test/java/org/apache/iceberg/stats/TestStatsUtil.java
index cf7f2fc3f9..4a17081ab7 100644
--- a/api/src/test/java/org/apache/iceberg/stats/TestStatsUtil.java
+++ b/api/src/test/java/org/apache/iceberg/stats/TestStatsUtil.java
@@ -153,17 +153,17 @@ public class TestStatsUtil {
"content_stats",
Types.StructType.of(
optional(
- 10000, "0",
FieldStatistic.fieldStatsFor(Types.IntegerType.get(), 10001)),
+ 10000, "0",
FieldStatistic.fieldStatsFor(Types.IntegerType.get(), 10000)),
optional(
- 10400, "2",
FieldStatistic.fieldStatsFor(Types.FloatType.get(), 10401)),
+ 10400, "2",
FieldStatistic.fieldStatsFor(Types.FloatType.get(), 10400)),
optional(
- 10800, "4",
FieldStatistic.fieldStatsFor(Types.StringType.get(), 10801)),
+ 10800, "4",
FieldStatistic.fieldStatsFor(Types.StringType.get(), 10800)),
optional(
- 11200, "6",
FieldStatistic.fieldStatsFor(Types.BooleanType.get(), 11201)),
+ 11200, "6",
FieldStatistic.fieldStatsFor(Types.BooleanType.get(), 11200)),
optional(
200010000,
"1000000",
- FieldStatistic.fieldStatsFor(Types.UUIDType.get(),
200010001)))));
+ FieldStatistic.fieldStatsFor(Types.UUIDType.get(),
200010000)))));
Schema statsSchema = new Schema(StatsUtil.contentStatsFor(schema));
assertThat(statsSchema.asStruct()).isEqualTo(expectedStatsSchema.asStruct());
}
@@ -193,21 +193,21 @@ public class TestStatsUtil {
"content_stats",
Types.StructType.of(
optional(
- 10000, "0",
FieldStatistic.fieldStatsFor(Types.IntegerType.get(), 10001)),
+ 10000, "0",
FieldStatistic.fieldStatsFor(Types.IntegerType.get(), 10000)),
optional(
- 10600, "3",
FieldStatistic.fieldStatsFor(Types.IntegerType.get(), 10601)),
+ 10600, "3",
FieldStatistic.fieldStatsFor(Types.IntegerType.get(), 10600)),
optional(
- 11400, "7",
FieldStatistic.fieldStatsFor(Types.IntegerType.get(), 11401)),
+ 11400, "7",
FieldStatistic.fieldStatsFor(Types.IntegerType.get(), 11400)),
optional(
- 11600, "8",
FieldStatistic.fieldStatsFor(Types.StringType.get(), 11601)),
+ 11600, "8",
FieldStatistic.fieldStatsFor(Types.StringType.get(), 11600)),
optional(
- 14400, "22",
FieldStatistic.fieldStatsFor(Types.IntegerType.get(), 14401)),
+ 14400, "22",
FieldStatistic.fieldStatsFor(Types.IntegerType.get(), 14400)),
optional(
- 14800, "24",
FieldStatistic.fieldStatsFor(Types.StringType.get(), 14801)),
+ 14800, "24",
FieldStatistic.fieldStatsFor(Types.StringType.get(), 14800)),
optional(
20010000,
"100000",
- FieldStatistic.fieldStatsFor(Types.UUIDType.get(),
20010001)))));
+ FieldStatistic.fieldStatsFor(Types.UUIDType.get(),
20010000)))));
Schema statsSchema = new Schema(StatsUtil.contentStatsFor(schema));
assertThat(statsSchema.asStruct()).isEqualTo(expectedStatsSchema.asStruct());
}
diff --git a/core/src/main/java/org/apache/iceberg/stats/BaseFieldStats.java
b/core/src/main/java/org/apache/iceberg/stats/BaseFieldStats.java
index 26f26d2a5f..f26294213c 100644
--- a/core/src/main/java/org/apache/iceberg/stats/BaseFieldStats.java
+++ b/core/src/main/java/org/apache/iceberg/stats/BaseFieldStats.java
@@ -145,12 +145,12 @@ public class BaseFieldStats<T> implements FieldStats<T>,
Serializable {
@Override
public int size() {
- return 7;
+ return 8;
}
@Override
public <X> X get(int pos, Class<X> javaClass) {
- return switch (FieldStatistic.fromOffset(pos)) {
+ return switch (FieldStatistic.fromPosition(pos)) {
case VALUE_COUNT -> javaClass.cast(valueCount);
case NULL_VALUE_COUNT -> javaClass.cast(nullValueCount);
case NAN_VALUE_COUNT -> javaClass.cast(nanValueCount);
diff --git a/core/src/test/java/org/apache/iceberg/stats/TestContentStats.java
b/core/src/test/java/org/apache/iceberg/stats/TestContentStats.java
index 7e64b9f11e..d083e73065 100644
--- a/core/src/test/java/org/apache/iceberg/stats/TestContentStats.java
+++ b/core/src/test/java/org/apache/iceberg/stats/TestContentStats.java
@@ -261,14 +261,14 @@ public class TestContentStats {
.hasExactBounds()
.build();
- record.set(VALUE_COUNT.offset(), fieldStats.valueCount());
- record.set(NULL_VALUE_COUNT.offset(), fieldStats.nullValueCount());
- record.set(NAN_VALUE_COUNT.offset(), fieldStats.nanValueCount());
- record.set(AVG_VALUE_SIZE.offset(), fieldStats.avgValueSize());
- record.set(MAX_VALUE_SIZE.offset(), fieldStats.maxValueSize());
- record.set(LOWER_BOUND.offset(), fieldStats.lowerBound());
- record.set(UPPER_BOUND.offset(), fieldStats.upperBound());
- record.set(EXACT_BOUNDS.offset(), fieldStats.hasExactBounds());
+ record.set(VALUE_COUNT.position(), fieldStats.valueCount());
+ record.set(NULL_VALUE_COUNT.position(), fieldStats.nullValueCount());
+ record.set(NAN_VALUE_COUNT.position(), fieldStats.nanValueCount());
+ record.set(AVG_VALUE_SIZE.position(), fieldStats.avgValueSize());
+ record.set(MAX_VALUE_SIZE.position(), fieldStats.maxValueSize());
+ record.set(LOWER_BOUND.position(), fieldStats.lowerBound());
+ record.set(UPPER_BOUND.position(), fieldStats.upperBound());
+ record.set(EXACT_BOUNDS.position(), fieldStats.hasExactBounds());
// this is typically called by Avro reflection code
BaseContentStats stats = new BaseContentStats(rootStatsStruct);
@@ -287,17 +287,17 @@ public class TestContentStats {
BaseContentStats stats = new BaseContentStats(rootStatsStruct);
// invalid lower bound
- record.set(LOWER_BOUND.offset(), 5.0);
+ record.set(LOWER_BOUND.position(), 5.0);
assertThatThrownBy(() -> stats.set(0, record))
.isInstanceOf(IllegalArgumentException.class)
.hasMessage(
"Invalid lower bound type, expected a subtype of class
java.lang.Integer: java.lang.Double");
// set valid lower bound so that upper bound is evaluated
- record.set(LOWER_BOUND.offset(), 5);
+ record.set(LOWER_BOUND.position(), 5);
// invalid upper bound
- record.set(UPPER_BOUND.offset(), "20");
+ record.set(UPPER_BOUND.position(), "20");
assertThatThrownBy(() -> stats.set(0, record))
.isInstanceOf(IllegalArgumentException.class)
.hasMessage(
diff --git a/core/src/test/java/org/apache/iceberg/stats/TestFieldStats.java
b/core/src/test/java/org/apache/iceberg/stats/TestFieldStats.java
index eb56439f85..ffd91efd8a 100644
--- a/core/src/test/java/org/apache/iceberg/stats/TestFieldStats.java
+++ b/core/src/test/java/org/apache/iceberg/stats/TestFieldStats.java
@@ -201,22 +201,22 @@ public class TestFieldStats {
.hasExactBounds()
.build();
- assertThat(fieldStats.get(VALUE_COUNT.offset(),
Long.class)).isEqualTo(10L);
- assertThat(fieldStats.get(NULL_VALUE_COUNT.offset(),
Long.class)).isEqualTo(2L);
- assertThat(fieldStats.get(NAN_VALUE_COUNT.offset(),
Long.class)).isEqualTo(3L);
- assertThat(fieldStats.get(AVG_VALUE_SIZE.offset(),
Integer.class)).isEqualTo(30);
- assertThat(fieldStats.get(MAX_VALUE_SIZE.offset(),
Integer.class)).isEqualTo(70);
- assertThat(fieldStats.get(LOWER_BOUND.offset(),
Integer.class)).isEqualTo(5);
- assertThat(fieldStats.get(UPPER_BOUND.offset(),
Integer.class)).isEqualTo(20);
- assertThat(fieldStats.get(EXACT_BOUNDS.offset(),
Boolean.class)).isEqualTo(true);
+ assertThat(fieldStats.get(VALUE_COUNT.position(),
Long.class)).isEqualTo(10L);
+ assertThat(fieldStats.get(NULL_VALUE_COUNT.position(),
Long.class)).isEqualTo(2L);
+ assertThat(fieldStats.get(NAN_VALUE_COUNT.position(),
Long.class)).isEqualTo(3L);
+ assertThat(fieldStats.get(AVG_VALUE_SIZE.position(),
Integer.class)).isEqualTo(30);
+ assertThat(fieldStats.get(MAX_VALUE_SIZE.position(),
Integer.class)).isEqualTo(70);
+ assertThat(fieldStats.get(LOWER_BOUND.position(),
Integer.class)).isEqualTo(5);
+ assertThat(fieldStats.get(UPPER_BOUND.position(),
Integer.class)).isEqualTo(20);
+ assertThat(fieldStats.get(EXACT_BOUNDS.position(),
Boolean.class)).isEqualTo(true);
assertThatThrownBy(() -> assertThat(fieldStats.get(10, Long.class)))
.isInstanceOf(IllegalArgumentException.class)
- .hasMessage("Invalid statistic offset: 10");
- assertThatThrownBy(() -> assertThat(fieldStats.get(VALUE_COUNT.offset(),
Double.class)))
+ .hasMessage("Invalid statistic position: 10");
+ assertThatThrownBy(() -> assertThat(fieldStats.get(VALUE_COUNT.position(),
Double.class)))
.isInstanceOf(ClassCastException.class)
.hasMessage("Cannot cast java.lang.Long to java.lang.Double");
- assertThatThrownBy(() ->
assertThat(fieldStats.get(AVG_VALUE_SIZE.offset(), Long.class)))
+ assertThatThrownBy(() ->
assertThat(fieldStats.get(AVG_VALUE_SIZE.position(), Long.class)))
.isInstanceOf(ClassCastException.class)
.hasMessage("Cannot cast java.lang.Integer to java.lang.Long");
}