pvary commented on code in PR #8803:
URL: https://github.com/apache/iceberg/pull/8803#discussion_r1372831403
##########
core/src/main/java/org/apache/iceberg/BaseFile.java:
##########
@@ -504,6 +508,27 @@ private static Map<Integer, ByteBuffer>
toReadableByteBufferMap(Map<Integer, Byt
}
}
+ private static <TypeT> Map<Integer, TypeT> filterColumnsStats(
+ Map<Integer, TypeT> map, Set<Integer> columnIds) {
+ if (columnIds == null || columnIds.isEmpty()) {
+ return SerializableMap.copyOf(map);
+ }
+
+ if (map == null) {
+ return null;
+ }
+
+ Map<Integer, TypeT> filtered =
Maps.newHashMapWithExpectedSize(columnIds.size());
+ for (Integer columnId : columnIds) {
+ TypeT value = map.get(columnId);
+ if (value != null) {
Review Comment:
I did some testing with Parquet files:
- Used `TestParquet` to check what `Metrics` we generate when we have
only `null` values:
```
@Test
public void testNullMetricsForEmptyColumns() throws IOException {
Schema schema = new Schema(optional(1, "longCol", Types.LongType.get()),
optional(2, "nullCol", Types.LongType.get()));
File file = createTempFile(temp);
List<GenericData.Record> records = Lists.newArrayListWithCapacity(1);
org.apache.avro.Schema avroSchema =
AvroSchemaUtil.convert(schema.asStruct());
GenericData.Record smallRecord = new GenericData.Record(avroSchema);
smallRecord.put("longCol", 1L);
records.add(smallRecord);
write(
file,
schema,
Collections.emptyMap(),
ParquetAvroWriter::buildWriter,
records.toArray(new GenericData.Record[] {}));
InputFile inputFile = Files.localInput(file);
Metrics metrics = ParquetUtil.fileMetrics(inputFile,
MetricsConfig.getDefault());
assertThat(metrics.nullValueCounts()).hasSize(2);
assertThat(metrics.lowerBounds()).hasSize(1).containsKey(1);
assertThat(metrics.upperBounds()).hasSize(1).containsKey(1);
}
```
Notice, that the size of the `lowerBounds` and the `upperBounds` map is 1.
There is no `null` value for column '2'
- Just to double check, I have tested how we read files with missing column
metrics (like the ones created above). I have used `TestManifestReaderStats`
for this:
```
@Test
public void testReadFileWithMissingStatsIncludesFullStats() throws
IOException {
Metrics missingStats =
new Metrics(
3L, null,
ImmutableMap.of(3, 3L, 4, 4L),
ImmutableMap.of(3, 3L, 4, 0L),
ImmutableMap.of(3, 0L, 4, 1L),
ImmutableMap.of(4,
Conversions.toByteBuffer(Types.StringType.get(), "a")),
ImmutableMap.of(4,
Conversions.toByteBuffer(Types.StringType.get(), "a")));
DataFile fileWithMissingStats =
DataFiles.builder(SPEC)
.withPath(FILE_PATH)
.withFileSizeInBytes(10)
.withPartitionPath("data_bucket=0") // easy way to set partition
data for now
.withRecordCount(3)
.withMetrics(missingStats)
.build();
ManifestFile manifest = writeManifest(1000L, fileWithMissingStats);
try (ManifestReader<DataFile> reader = ManifestFiles.read(manifest,
FILE_IO)) {
CloseableIterable<ManifestEntry<DataFile>> entries = reader.entries();
ManifestEntry<DataFile> entry = entries.iterator().next();
DataFile dataFile = entry.file();
Assert.assertEquals(3, dataFile.recordCount());
assertThat(dataFile.lowerBounds()).hasSize(1).containsKey(4);
assertThat(dataFile.upperBounds()).hasSize(1).containsKey(4);
}
}
```
Notice, that the size of the `lowerBounds` and the `upperBounds` map is 1.
There is no `null` value for column '3'
So I think the actual phrasing of the documentation is just not clean
enough, as the map does not contain `null` values, it just does not contain the
keys. While putting `null` value to a map is certainly possible, it should be
used only in very specific cases, as it is very error-prone (see our discussion
above 😄). If our goal would be to identify the case where all of the values in
the given column are `null`, we can use the other stats to achieve the result.
##########
core/src/main/java/org/apache/iceberg/BaseFile.java:
##########
@@ -504,6 +508,27 @@ private static Map<Integer, ByteBuffer>
toReadableByteBufferMap(Map<Integer, Byt
}
}
+ private static <TypeT> Map<Integer, TypeT> filterColumnsStats(
+ Map<Integer, TypeT> map, Set<Integer> columnIds) {
+ if (columnIds == null || columnIds.isEmpty()) {
+ return SerializableMap.copyOf(map);
+ }
+
+ if (map == null) {
+ return null;
+ }
+
+ Map<Integer, TypeT> filtered =
Maps.newHashMapWithExpectedSize(columnIds.size());
+ for (Integer columnId : columnIds) {
+ TypeT value = map.get(columnId);
+ if (value != null) {
Review Comment:
I did some testing with Parquet files:
- Used `TestParquet` to check what `Metrics` we generate when we have
only `null` values:
```
@Test
public void testNullMetricsForEmptyColumns() throws IOException {
Schema schema = new Schema(optional(1, "longCol", Types.LongType.get()),
optional(2, "nullCol", Types.LongType.get()));
File file = createTempFile(temp);
List<GenericData.Record> records = Lists.newArrayListWithCapacity(1);
org.apache.avro.Schema avroSchema =
AvroSchemaUtil.convert(schema.asStruct());
GenericData.Record smallRecord = new GenericData.Record(avroSchema);
smallRecord.put("longCol", 1L);
records.add(smallRecord);
write(
file,
schema,
Collections.emptyMap(),
ParquetAvroWriter::buildWriter,
records.toArray(new GenericData.Record[] {}));
InputFile inputFile = Files.localInput(file);
Metrics metrics = ParquetUtil.fileMetrics(inputFile,
MetricsConfig.getDefault());
assertThat(metrics.nullValueCounts()).hasSize(2);
assertThat(metrics.lowerBounds()).hasSize(1).containsKey(1);
assertThat(metrics.upperBounds()).hasSize(1).containsKey(1);
}
```
Notice, that the size of the `lowerBounds` and the `upperBounds` map is 1.
There is no `null` value for column '2'
- Just to double check, I have tested how we read files with missing column
metrics (like the ones created above). I have used `TestManifestReaderStats`
for this:
```
@Test
public void testReadFileWithMissingStatsIncludesFullStats() throws
IOException {
Metrics missingStats =
new Metrics(
3L, null,
ImmutableMap.of(3, 3L, 4, 4L),
ImmutableMap.of(3, 3L, 4, 0L),
ImmutableMap.of(3, 0L, 4, 1L),
ImmutableMap.of(4,
Conversions.toByteBuffer(Types.StringType.get(), "a")),
ImmutableMap.of(4,
Conversions.toByteBuffer(Types.StringType.get(), "a")));
DataFile fileWithMissingStats =
DataFiles.builder(SPEC)
.withPath(FILE_PATH)
.withFileSizeInBytes(10)
.withPartitionPath("data_bucket=0") // easy way to set partition
data for now
.withRecordCount(3)
.withMetrics(missingStats)
.build();
ManifestFile manifest = writeManifest(1000L, fileWithMissingStats);
try (ManifestReader<DataFile> reader = ManifestFiles.read(manifest,
FILE_IO)) {
CloseableIterable<ManifestEntry<DataFile>> entries = reader.entries();
ManifestEntry<DataFile> entry = entries.iterator().next();
DataFile dataFile = entry.file();
Assert.assertEquals(3, dataFile.recordCount());
assertThat(dataFile.lowerBounds()).hasSize(1).containsKey(4);
assertThat(dataFile.upperBounds()).hasSize(1).containsKey(4);
}
}
```
Notice, that the size of the `lowerBounds` and the `upperBounds` map is 1.
There is no `null` value for column '3'
So I think the actual phrasing of the documentation is just not clean
enough, as the map does not contain `null` values, it just does not contain the
keys. While putting `null` value to a map is certainly possible, it should be
used only in very specific cases, as it is very error-prone (see our discussion
above 😄). If our goal would be to identify the case where all of the values in
the given column are `null`, we can use the other stats to achieve the result.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]