slessard commented on code in PR #10953:
URL: https://github.com/apache/iceberg/pull/10953#discussion_r1753042474
##########
arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java:
##########
@@ -262,6 +263,89 @@ public void testReadColumnFilter2() throws Exception {
scan, NUM_ROWS_PER_MONTH, 12 * NUM_ROWS_PER_MONTH,
ImmutableList.of("timestamp"));
}
+ @Test
+ public void testReadColumnThatDoesNotExistInParquetSchema() throws Exception {
+ setMaxStackTraceElementsDisplayed(15);
+ rowsWritten = Lists.newArrayList();
+ tables = new HadoopTables();
+
+ Schema schema =
+ new Schema(
+ Types.NestedField.required(1, "a", Types.IntegerType.get()),
+ Types.NestedField.optional(2, "b", Types.IntegerType.get()));
+
+ PartitionSpec spec = PartitionSpec.builderFor(schema).build();
+ Table table1 = tables.create(schema, spec, tableLocation);
+
+ // Add one record to the table
+ GenericRecord rec = GenericRecord.create(schema);
+ rec.setField("a", 1);
+ List<GenericRecord> genericRecords = Lists.newArrayList();
+ genericRecords.add(rec);
+
+ AppendFiles appendFiles = table1.newAppend();
+ appendFiles.appendFile(writeParquetFile(table1, genericRecords));
+ appendFiles.commit();
+
+ // Alter the table schema by adding a new, optional column.
+ // Do not add any data for this new column in the one existing row in the table
+ // and do not insert any new rows into the table.
+ Table table = tables.load(tableLocation);
+ table.updateSchema().addColumn("z", Types.IntegerType.get()).commit();
+
+ // Select all columns, all rows from the table
+ TableScan scan = table.newScan().select("*");
+
+ List<String> columns = ImmutableList.of("a", "b", "z");
+ // Read the data and verify that the returned ColumnarBatches match expected rows.
+ int rowIndex = 0;
+ try (VectorizedTableScanIterable itr = new VectorizedTableScanIterable(scan, 1, false)) {
+ for (ColumnarBatch batch : itr) {
+ List<GenericRecord> expectedRows = rowsWritten.subList(rowIndex, rowIndex + 1);
+
+ Map<String, Integer> columnNameToIndex = Maps.newHashMap();
+ for (int i = 0; i < columns.size(); i++) {
+ columnNameToIndex.put(columns.get(i), i);
+ }
+ Set<String> columnSet = columnNameToIndex.keySet();
+
+ assertThat(batch.numRows()).isEqualTo(1);
+ assertThat(batch.numCols()).isEqualTo(columns.size());
+
+ checkColumnarArrayValues(
+ 1,
+ expectedRows,
+ batch,
+ 0,
+ columnSet,
+ "a",
+ (records, i) -> records.get(i).getField("a"),
+ ColumnVector::getInt);
+ checkColumnarArrayValues(
+ 1,
+ expectedRows,
+ batch,
+ 1,
+ columnSet,
+ "b",
+ (records, i) -> records.get(i).getField("b"),
+ (array, i) -> array.isNullAt(i) ? null : array.getInt(i));
Review Comment:
@nastra This column does exist within the Parquet file and its value is null. Can you tell me whether calling `array.isNullAt(i)` is the correct way to read a column that contains a null value?
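
For context, the check-then-read pattern in that lambda mirrors how nullable slots are read in plain Arrow. Below is a minimal, self-contained sketch against Arrow's `IntVector` directly rather than Iceberg's `ColumnVector` wrapper, so the class and method names here are Arrow's and not necessarily what the test exercises:

```java
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.IntVector;

public class NullableReadSketch {
  public static void main(String[] args) {
    try (BufferAllocator allocator = new RootAllocator();
        IntVector vector = new IntVector("b", allocator)) {
      vector.allocateNew(2);
      vector.set(0, 42); // row 0 holds a value
      vector.setNull(1); // row 1 is an explicit null, like "b" in the test's one row
      vector.setValueCount(2);

      for (int i = 0; i < vector.getValueCount(); i++) {
        // Consult the validity buffer before reading: get(i) on a null slot
        // throws when Arrow's null checking is enabled (the default).
        Integer value = vector.isNull(i) ? null : vector.get(i);
        System.out.println("row " + i + " -> " + value);
      }
    }
  }
}
```

In plain Arrow, checking `isNull(i)` before `get(i)` is the expected pattern, which suggests the lambda above is reasonable for a column whose data is present but null.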
##########
arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java:
##########
@@ -262,6 +263,89 @@ public void testReadColumnFilter2() throws Exception {
scan, NUM_ROWS_PER_MONTH, 12 * NUM_ROWS_PER_MONTH,
ImmutableList.of("timestamp"));
}
+ @Test
+ public void testReadColumnThatDoesNotExistInParquetSchema() throws Exception {
+ setMaxStackTraceElementsDisplayed(15);
+ rowsWritten = Lists.newArrayList();
+ tables = new HadoopTables();
+
+ Schema schema =
+ new Schema(
+ Types.NestedField.required(1, "a", Types.IntegerType.get()),
+ Types.NestedField.optional(2, "b", Types.IntegerType.get()));
+
+ PartitionSpec spec = PartitionSpec.builderFor(schema).build();
+ Table table1 = tables.create(schema, spec, tableLocation);
+
+ // Add one record to the table
+ GenericRecord rec = GenericRecord.create(schema);
+ rec.setField("a", 1);
+ List<GenericRecord> genericRecords = Lists.newArrayList();
+ genericRecords.add(rec);
+
+ AppendFiles appendFiles = table1.newAppend();
+ appendFiles.appendFile(writeParquetFile(table1, genericRecords));
+ appendFiles.commit();
+
+ // Alter the table schema by adding a new, optional column.
+ // Do not add any data for this new column in the one existing row in the table
+ // and do not insert any new rows into the table.
+ Table table = tables.load(tableLocation);
+ table.updateSchema().addColumn("z", Types.IntegerType.get()).commit();
+
+ // Select all columns, all rows from the table
+ TableScan scan = table.newScan().select("*");
+
+ List<String> columns = ImmutableList.of("a", "b", "z");
+ // Read the data and verify that the returned ColumnarBatches match expected rows.
+ int rowIndex = 0;
+ try (VectorizedTableScanIterable itr = new VectorizedTableScanIterable(scan, 1, false)) {
+ for (ColumnarBatch batch : itr) {
+ List<GenericRecord> expectedRows = rowsWritten.subList(rowIndex, rowIndex + 1);
+
+ Map<String, Integer> columnNameToIndex = Maps.newHashMap();
+ for (int i = 0; i < columns.size(); i++) {
+ columnNameToIndex.put(columns.get(i), i);
+ }
+ Set<String> columnSet = columnNameToIndex.keySet();
+
+ assertThat(batch.numRows()).isEqualTo(1);
+ assertThat(batch.numCols()).isEqualTo(columns.size());
+
+ checkColumnarArrayValues(
+ 1,
+ expectedRows,
+ batch,
+ 0,
+ columnSet,
+ "a",
+ (records, i) -> records.get(i).getField("a"),
+ ColumnVector::getInt);
+ checkColumnarArrayValues(
+ 1,
+ expectedRows,
+ batch,
+ 1,
+ columnSet,
+ "b",
+ (records, i) -> records.get(i).getField("b"),
+ (array, i) -> array.isNullAt(i) ? null : array.getInt(i));
+ checkColumnarArrayValues(
+ 1,
+ expectedRows,
+ batch,
+ 2,
+ columnSet,
+ "z",
+ (records, i) -> records.get(i).getField("z"),
+ (array, i) -> array.isNullAt(i) ? null : array.getInt(i));
Review Comment:
@nastra I could use your help with this line in the test. This column does not exist within the Parquet file. (This is the very purpose of this entire pull request.) Currently `array.isNullAt(i)` throws a NullPointerException because internally it calls `nullabilityHolder.isNullAt(rowId)`, and `nullabilityHolder` is `null`. What is the correct way to verify that a null value is being returned for this column?
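
To make the failure mode concrete: a guard like the one sketched below would avoid the NPE by treating a missing nullability holder as "every row is null". This is a hypothetical illustration with stand-in names, not Iceberg's actual `ColumnVector`/`NullabilityHolder` code:

```java
// Hypothetical sketch only: the class and field names stand in for
// Iceberg's internals and are not the real implementation.
final class NullSafeColumnAccessor {
  // May be null when the column is absent from the Parquet file.
  private final NullabilityStub nullabilityHolder;

  NullSafeColumnAccessor(NullabilityStub nullabilityHolder) {
    this.nullabilityHolder = nullabilityHolder;
  }

  boolean isNullAt(int rowId) {
    // A column missing from the file has no nullability data at all,
    // so report every row as null instead of dereferencing a null holder.
    return nullabilityHolder == null || nullabilityHolder.isNullAt(rowId);
  }
}

// Minimal stand-in for a per-batch nullability bitmap.
final class NullabilityStub {
  private final boolean[] nulls;

  NullabilityStub(boolean[] nulls) {
    this.nulls = nulls;
  }

  boolean isNullAt(int rowId) {
    return nulls[rowId];
  }
}
```

The diff below takes a different route to the same end: backing `ConstantVectorHolder` with an Arrow `NullVector`, so the vector itself reports every slot as null.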
##########
arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java:
##########
@@ -140,12 +141,16 @@ public static class ConstantVectorHolder<T> extends VectorHolder {
private final int numRows;
public ConstantVectorHolder(int numRows) {
+ super(new NullVector("_dummy_", numRows), null, null);
this.numRows = numRows;
this.constantValue = null;
}
public ConstantVectorHolder(Types.NestedField icebergField, int numRows, T constantValue) {
- super(icebergField);
+ super(
+ new NullVector(icebergField.name(), numRows),
Review Comment:
I've made the change you suggested.
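
For reviewers of this change: an Arrow `NullVector` allocates no data buffers and reports every slot as null, which is what makes it a plausible backing vector for a column that has no Parquet data. A small standalone sketch using the same `NullVector(String, int)` constructor the diff uses:

```java
import org.apache.arrow.vector.NullVector;

public class NullVectorSketch {
  public static void main(String[] args) {
    // Same constructor shape as in the diff: a named vector with a fixed row count.
    NullVector vector = new NullVector("z", 3);

    System.out.println("value count: " + vector.getValueCount());
    for (int i = 0; i < vector.getValueCount(); i++) {
      // NullVector keeps no buffers; isNull(i) is always true.
      System.out.println("row " + i + " isNull: " + vector.isNull(i));
    }
  }
}
```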