nssalian commented on code in PR #16370: URL: https://github.com/apache/iceberg/pull/16370#discussion_r3283483664
########## data/src/test/java/org/apache/iceberg/data/TestRecordVariantShreddingAnalyzer.java: ########## @@ -0,0 +1,193 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.data; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.file.Path; +import java.util.List; +import java.util.Map; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Files; +import org.apache.iceberg.InternalTestHelpers; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.encryption.EncryptedFiles; +import org.apache.iceberg.encryption.EncryptedOutputFile; +import org.apache.iceberg.formats.FileWriterBuilder; +import org.apache.iceberg.formats.FormatModelRegistry; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.io.DataWriter; +import org.apache.iceberg.io.OutputFile; +import org.apache.iceberg.parquet.Parquet; +import org.apache.iceberg.parquet.ParquetFileTestUtils; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import 
org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.variants.Variant; +import org.apache.iceberg.variants.VariantMetadata; +import org.apache.iceberg.variants.VariantTestUtil; +import org.apache.iceberg.variants.Variants; +import org.apache.parquet.example.data.Group; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.ParquetReader; +import org.apache.parquet.hadoop.example.GroupReadSupport; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.Type; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +public class TestRecordVariantShreddingAnalyzer { Review Comment: No test covers a record with a `null` variant value. Could you add one? ########## data/src/main/java/org/apache/iceberg/data/RecordVariantShreddingAnalyzer.java: ########## @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.data; + +import java.util.List; +import java.util.Map; +import org.apache.iceberg.Schema; +import org.apache.iceberg.parquet.VariantShreddingAnalyzer; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.types.Types.NestedField; +import org.apache.iceberg.variants.Variant; +import org.apache.iceberg.variants.VariantValue; +import org.apache.parquet.schema.Type; + +/** + * Variant shredding analyzer for generic {@link Record} types. + * + * <p>This analyzer extracts {@link Variant} values from {@link Record} objects and determines + * optimal shredding schemas by analyzing data distributions across buffered rows. The analyzer is + * used by Kafka Connect and other tools that work with generic Record types to enable automatic + * variant shredding for Parquet writes. + * + * <p>Shredding extracts frequently-occurring fields from variant data into typed Parquet columns + * for improved query performance while maintaining the full variant data in the raw value field. + */ +class RecordVariantShreddingAnalyzer extends VariantShreddingAnalyzer<Record, Void> { + + /** + * For generic {@link Record} rows, top-level field order matches {@link Schema#columns()}. {@link + * #resolveColumnIndex} is unused ({@code Void} engine schema); using it always produced {@code + * -1}, so variant columns were never analyzed and Parquet shredding never activated for Kafka + * Connect and other Record-based writers. 
+ */ + @Override + public Map<Integer, Type> analyzeVariantColumns( + List<Record> bufferedRows, Schema icebergSchema, Void engineSchema) { + Map<Integer, Type> shreddedTypes = Maps.newHashMap(); + List<NestedField> cols = icebergSchema.columns(); + for (int rowIndex = 0; rowIndex < cols.size(); rowIndex++) { + NestedField col = cols.get(rowIndex); + if (col.type().isVariantType()) { + Type typed = analyzeAndCreateSchema(bufferedRows, rowIndex); + if (typed != null) { + shreddedTypes.put(col.fieldId(), typed); + } + } + } + + return shreddedTypes; + } + + @Override + protected List<VariantValue> extractVariantValues( Review Comment: `instanceof Variant` silently skips both nulls and non-`Variant` values. Should this throw on non-`Variant` so caller bugs surface instead of silently shrinking the analysis set? Worth referring to the Spark and Flink implementation. ########## data/src/main/java/org/apache/iceberg/data/RecordVariantShreddingAnalyzer.java: ########## @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.data; + +import java.util.List; +import java.util.Map; +import org.apache.iceberg.Schema; +import org.apache.iceberg.parquet.VariantShreddingAnalyzer; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.types.Types.NestedField; +import org.apache.iceberg.variants.Variant; +import org.apache.iceberg.variants.VariantValue; +import org.apache.parquet.schema.Type; + +/** + * Variant shredding analyzer for generic {@link Record} types. + * + * <p>This analyzer extracts {@link Variant} values from {@link Record} objects and determines + * optimal shredding schemas by analyzing data distributions across buffered rows. The analyzer is + * used by Kafka Connect and other tools that work with generic Record types to enable automatic + * variant shredding for Parquet writes. + * + * <p>Shredding extracts frequently-occurring fields from variant data into typed Parquet columns + * for improved query performance while maintaining the full variant data in the raw value field. + */ +class RecordVariantShreddingAnalyzer extends VariantShreddingAnalyzer<Record, Void> { + + /** + * For generic {@link Record} rows, top-level field order matches {@link Schema#columns()}. {@link + * #resolveColumnIndex} is unused ({@code Void} engine schema); using it always produced {@code + * -1}, so variant columns were never analyzed and Parquet shredding never activated for Kafka + * Connect and other Record-based writers. 
+ */ + @Override + public Map<Integer, Type> analyzeVariantColumns( + List<Record> bufferedRows, Schema icebergSchema, Void engineSchema) { + Map<Integer, Type> shreddedTypes = Maps.newHashMap(); + List<NestedField> cols = icebergSchema.columns(); + for (int rowIndex = 0; rowIndex < cols.size(); rowIndex++) { + NestedField col = cols.get(rowIndex); + if (col.type().isVariantType()) { + Type typed = analyzeAndCreateSchema(bufferedRows, rowIndex); + if (typed != null) { + shreddedTypes.put(col.fieldId(), typed); + } + } + } + + return shreddedTypes; + } + + @Override + protected List<VariantValue> extractVariantValues( + List<Record> bufferedRows, int variantFieldIndex) { + List<VariantValue> values = Lists.newArrayList(); + for (Record record : bufferedRows) { + Object fieldValue = record.get(variantFieldIndex); + if (fieldValue instanceof Variant) { + Variant variant = (Variant) fieldValue; + values.add(variant.value()); + } + } + return values; + } + + @Override + protected int resolveColumnIndex(Void engineSchema, String columnName) { Review Comment: Is this getting called? ########## data/src/main/java/org/apache/iceberg/data/RecordVariantShreddingAnalyzer.java: ########## @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.data; + +import java.util.List; +import java.util.Map; +import org.apache.iceberg.Schema; +import org.apache.iceberg.parquet.VariantShreddingAnalyzer; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.types.Types.NestedField; +import org.apache.iceberg.variants.Variant; +import org.apache.iceberg.variants.VariantValue; +import org.apache.parquet.schema.Type; + +/** + * Variant shredding analyzer for generic {@link Record} types. + * + * <p>This analyzer extracts {@link Variant} values from {@link Record} objects and determines + * optimal shredding schemas by analyzing data distributions across buffered rows. The analyzer is + * used by Kafka Connect and other tools that work with generic Record types to enable automatic + * variant shredding for Parquet writes. + * + * <p>Shredding extracts frequently-occurring fields from variant data into typed Parquet columns + * for improved query performance while maintaining the full variant data in the raw value field. + */ +class RecordVariantShreddingAnalyzer extends VariantShreddingAnalyzer<Record, Void> { + + /** + * For generic {@link Record} rows, top-level field order matches {@link Schema#columns()}. {@link + * #resolveColumnIndex} is unused ({@code Void} engine schema); using it always produced {@code + * -1}, so variant columns were never analyzed and Parquet shredding never activated for Kafka + * Connect and other Record-based writers. + */ + @Override Review Comment: This override duplicates the loop from `VariantShreddingAnalyzer.analyzeVariantColumns`. Spark/Flink override only the protected hooks (`resolveColumnIndex`, `extractVariantValues`), not the public template. 
########## data/src/main/java/org/apache/iceberg/data/RecordVariantShreddingAnalyzer.java: ########## @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.data; + +import java.util.List; +import java.util.Map; +import org.apache.iceberg.Schema; +import org.apache.iceberg.parquet.VariantShreddingAnalyzer; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.types.Types.NestedField; +import org.apache.iceberg.variants.Variant; +import org.apache.iceberg.variants.VariantValue; +import org.apache.parquet.schema.Type; + +/** + * Variant shredding analyzer for generic {@link Record} types. + * + * <p>This analyzer extracts {@link Variant} values from {@link Record} objects and determines + * optimal shredding schemas by analyzing data distributions across buffered rows. The analyzer is + * used by Kafka Connect and other tools that work with generic Record types to enable automatic + * variant shredding for Parquet writes. 
+ * + * <p>Shredding extracts frequently-occurring fields from variant data into typed Parquet columns + * for improved query performance while maintaining the full variant data in the raw value field. + */ +class RecordVariantShreddingAnalyzer extends VariantShreddingAnalyzer<Record, Void> { + + /** + * For generic {@link Record} rows, top-level field order matches {@link Schema#columns()}. {@link + * #resolveColumnIndex} is unused ({@code Void} engine schema); using it always produced {@code + * -1}, so variant columns were never analyzed and Parquet shredding never activated for Kafka + * Connect and other Record-based writers. + */ + @Override Review Comment: This javadoc describes pre-fix state ("is unused", "always produced -1", "never activated") rather than what the method does. Suggest dropping it, since `{@inheritDoc}` is the default for overrides; if a javadoc is kept, it should explain the override's behavior instead. ########## data/src/main/java/org/apache/iceberg/data/RecordVariantShreddingAnalyzer.java: ########## @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.data; + +import java.util.List; +import java.util.Map; +import org.apache.iceberg.Schema; +import org.apache.iceberg.parquet.VariantShreddingAnalyzer; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.types.Types.NestedField; +import org.apache.iceberg.variants.Variant; +import org.apache.iceberg.variants.VariantValue; +import org.apache.parquet.schema.Type; + +/** + * Variant shredding analyzer for generic {@link Record} types. + * + * <p>This analyzer extracts {@link Variant} values from {@link Record} objects and determines + * optimal shredding schemas by analyzing data distributions across buffered rows. The analyzer is + * used by Kafka Connect and other tools that work with generic Record types to enable automatic + * variant shredding for Parquet writes. + * + * <p>Shredding extracts frequently-occurring fields from variant data into typed Parquet columns + * for improved query performance while maintaining the full variant data in the raw value field. + */ +class RecordVariantShreddingAnalyzer extends VariantShreddingAnalyzer<Record, Void> { Review Comment: A few issues to note in the doc string here: 1. "used by Kafka Connect and other tools" misframes this class as Connect-specific. PR title needs a change too. 2. The "Shredding extracts frequently-occurring fields..." paragraph describes what shredding is; that belongs on the base class `VariantShreddingAnalyzer`, not the Record-specific subclass. I don't think this is needed here. 3. The doc string doesn't actually describe what's specific about this implementation (positional indexing aligned with `Record.get(int)`). 4. Would be great to be consistent with `SparkVariantShreddingAnalyzer` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
