[GitHub] ppadma commented on a change in pull request #1429: DRILL-6676: Add Union, List and Repeated List types to Result Set Loader

GitBox Sat, 25 Aug 2018 07:22:36 -0700

ppadma commented on a change in pull request #1429: DRILL-6676: Add Union, List 
and Repeated List types to Result Set Loader
URL: https://github.com/apache/drill/pull/1429#discussion_r212799480


 ##########
 File path: 
exec/java-exec/src/test/java/org/apache/drill/exec/physical/rowSet/impl/TestResultSetLoaderUnions.java
 ##########
 @@ -0,0 +1,645 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.drill.exec.physical.rowSet.impl;
+
+import static org.apache.drill.test.rowSet.RowSetUtilities.listValue;
+import static org.apache.drill.test.rowSet.RowSetUtilities.mapValue;
+import static org.apache.drill.test.rowSet.RowSetUtilities.strArray;
+import static org.apache.drill.test.rowSet.RowSetUtilities.variantArray;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertSame;
+import static org.junit.Assert.assertTrue;
+
+import java.util.Arrays;
+
+import org.apache.drill.common.types.TypeProtos.DataMode;
+import org.apache.drill.common.types.TypeProtos.MajorType;
+import org.apache.drill.common.types.TypeProtos.MinorType;
+import org.apache.drill.common.types.Types;
+import org.apache.drill.exec.physical.rowSet.ResultSetLoader;
+import org.apache.drill.exec.physical.rowSet.RowSetLoader;
+import org.apache.drill.exec.record.MaterializedField;
+import org.apache.drill.exec.record.metadata.ColumnMetadata;
+import org.apache.drill.exec.record.metadata.MetadataUtils;
+import org.apache.drill.exec.record.metadata.TupleMetadata;
+import org.apache.drill.exec.vector.NullableIntVector;
+import org.apache.drill.exec.vector.NullableVarCharVector;
+import org.apache.drill.exec.vector.ValueVector;
+import org.apache.drill.exec.vector.accessor.ArrayWriter;
+import org.apache.drill.exec.vector.accessor.ObjectType;
+import org.apache.drill.exec.vector.accessor.ObjectWriter;
+import org.apache.drill.exec.vector.accessor.ScalarWriter;
+import org.apache.drill.exec.vector.accessor.TupleWriter;
+import org.apache.drill.exec.vector.accessor.ValueType;
+import org.apache.drill.exec.vector.accessor.VariantWriter;
+import org.apache.drill.exec.vector.accessor.writer.EmptyListShim;
+import org.apache.drill.exec.vector.accessor.writer.ListWriterImpl;
+import org.apache.drill.exec.vector.accessor.writer.SimpleListShim;
+import org.apache.drill.exec.vector.accessor.writer.UnionVectorShim;
+import org.apache.drill.exec.vector.accessor.writer.UnionWriterImpl;
+import org.apache.drill.exec.vector.complex.ListVector;
+import org.apache.drill.exec.vector.complex.UnionVector;
+import org.apache.drill.test.SubOperatorTest;
+import org.apache.drill.test.rowSet.RowSet;
+import org.apache.drill.test.rowSet.RowSet.SingleRowSet;
+import org.apache.drill.test.rowSet.RowSetBuilder;
+import org.apache.drill.test.rowSet.RowSetReader;
+import org.apache.drill.test.rowSet.RowSetUtilities;
+import org.apache.drill.test.rowSet.schema.SchemaBuilder;
+import org.junit.Test;
+
+import com.google.common.base.Charsets;
+
+/**
+ * Tests the result set loader support for union vectors. Union vectors
+ * are only lightly supported in Apache Drill and not supported at all
+ * in commercial versions. They have may problems: both in code and in theory.
+ * Most operators do not support them. But, JSON uses them, so they must
+ * be made to work in the result set loader layer.
+ */
+
+public class TestResultSetLoaderUnions extends SubOperatorTest {
+
+  @Test
+  public void testUnionBasics() {
+    final TupleMetadata schema = new SchemaBuilder()
+        .add("id", MinorType.INT)
+        .addUnion("u")
+          .addType(MinorType.VARCHAR)
+          .addMap()
+            .addNullable("a", MinorType.INT)
+            .addNullable("b", MinorType.VARCHAR)
+            .resumeUnion()
+          .resumeSchema()
+        .buildSchema();
+
+    final ResultSetLoaderImpl.ResultSetOptions options = new OptionBuilder()
+        .setSchema(schema)
+        .build();
+    final ResultSetLoader rsLoader = new 
ResultSetLoaderImpl(fixture.allocator(), options);
+    final RowSetLoader writer = rsLoader.writer();
+
+    // Sanity check of writer structure
+
+    final ObjectWriter wo = writer.column(1);
+    assertEquals(ObjectType.VARIANT, wo.type());
+    final VariantWriter vw = wo.variant();
+
+    assertTrue(vw.hasType(MinorType.VARCHAR));
+    assertNotNull(vw.memberWriter(MinorType.VARCHAR));
+
+    assertTrue(vw.hasType(MinorType.MAP));
+    assertNotNull(vw.memberWriter(MinorType.MAP));
+
+    // Write values
+
+    rsLoader.startBatch();
+    writer
+      .addRow(1, "first")
+      .addRow(2, mapValue(20, "fred"))
+      .addRow(3, null)
+      .addRow(4, mapValue(40, null))
+      .addRow(5, "last");
+
+    // Verify the values.
+    // (Relies on the row set level union tests having passed.)
+
+    final SingleRowSet expected = fixture.rowSetBuilder(schema)
+      .addRow(1, "first")
+      .addRow(2, mapValue(20, "fred"))
+      .addRow(3, null)
+      .addRow(4, mapValue(40, null))
+      .addRow(5, "last")
+      .build();
+
+    RowSetUtilities.verify(expected, fixture.wrap(rsLoader.harvest()));
+  }
+
+  @Test
+  public void testUnionAddTypes() {
+    final ResultSetLoader rsLoader = new 
ResultSetLoaderImpl(fixture.allocator());
+    final RowSetLoader writer = rsLoader.writer();
+
+    rsLoader.startBatch();
+
+    // First row, (1, "first"), create types as we go.
+
+    writer.start();
+    writer.addColumn(SchemaBuilder.columnSchema("id", MinorType.INT, 
DataMode.REQUIRED));
+    writer.scalar("id").setInt(1);
+    writer.addColumn(SchemaBuilder.columnSchema("u", MinorType.UNION, 
DataMode.OPTIONAL));
+    final VariantWriter variant = writer.column("u").variant();
+    variant.member(MinorType.VARCHAR).scalar().setString("first");
+    writer.save();
+
+    // Second row, (2, {20, "fred"}), create types as we go.
+
+    writer.start();
+    writer.scalar("id").setInt(2);
+    final TupleWriter innerMap = variant.member(MinorType.MAP).tuple();
+    innerMap.addColumn(SchemaBuilder.columnSchema("a", MinorType.INT, 
DataMode.OPTIONAL));
+    innerMap.scalar("a").setInt(20);
+    innerMap.addColumn(SchemaBuilder.columnSchema("b", MinorType.VARCHAR, 
DataMode.OPTIONAL));
+    innerMap.scalar("b").setString("fred");
+    writer.save();
+
+    // Write remaining rows using convenient methods, using
+    // schema defined above.
+
+    writer
+      .addRow(3, null)
+      .addRow(4, mapValue(40, null))
+      .addRow(5, "last");
+
+    // Verify the values.
+    // (Relies on the row set level union tests having passed.)
+
+    final TupleMetadata schema = new SchemaBuilder()
+        .add("id", MinorType.INT)
+        .addUnion("u")
+          .addType(MinorType.VARCHAR)
+          .addMap()
+            .addNullable("a", MinorType.INT)
+            .addNullable("b", MinorType.VARCHAR)
+            .resumeUnion()
+          .resumeSchema()
+        .buildSchema();
+
+    final SingleRowSet expected = fixture.rowSetBuilder(schema)
+      .addRow(1, "first")
+      .addRow(2, mapValue(20, "fred"))
+      .addRow(3, null)
+      .addRow(4, mapValue(40, null))
+      .addRow(5, "last")
+      .build();
+
+    final RowSet result = fixture.wrap(rsLoader.harvest());
+    RowSetUtilities.verify(expected, result);
+  }
+
+  @Test
+  public void testUnionOverflow() {
+    final TupleMetadata schema = new SchemaBuilder()
+        .add("id", MinorType.INT)
+        .addUnion("u")
+          .addType(MinorType.INT)
+          .addType(MinorType.VARCHAR)
+          .resumeSchema()
+        .buildSchema();
+
+    final ResultSetLoaderImpl.ResultSetOptions options = new OptionBuilder()
+        .setRowCountLimit(ValueVector.MAX_ROW_COUNT)
+        .setSchema(schema)
+        .build();
+    final ResultSetLoader rsLoader = new 
ResultSetLoaderImpl(fixture.allocator(), options);
+    final RowSetLoader writer = rsLoader.writer();
+
+    // Fill the batch with enough data to cause overflow.
+    // Fill even rows with a Varchar, odd rows with an int.
+    // Data must be large enough to cause overflow before 32K rows
+    // (the half that get strings.
+    // 16 MB / 32 K = 512 bytes
+    // Make a bit bigger to overflow early.
+
+    final int strLength = 600;
+    final byte value[] = new byte[strLength - 6];
+    Arrays.fill(value, (byte) 'X');
+    final String strValue = new String(value, Charsets.UTF_8);
+    int count = 0;
+
+    rsLoader.startBatch();
+    while (! writer.isFull()) {
+      if (count % 2 == 0) {
+        writer.addRow(count, String.format("%s%06d", strValue, count));
+      } else {
+        writer.addRow(count, count * 10);
+      }
+      count++;
+    }
+
+    // Number of rows should be driven by vector size.
+    // Our row count should include the overflow row
+
+    final int expectedCount = ValueVector.MAX_BUFFER_SIZE / strLength * 2;
+    assertEquals(expectedCount + 1, count);
+
+    // Loader's row count should include only "visible" rows
+
+    assertEquals(expectedCount, writer.rowCount());
+
+    // Total count should include invisible and look-ahead rows.
+
+    assertEquals(expectedCount + 1, rsLoader.totalRowCount());
+
+    // Result should exclude the overflow row
+
+    RowSet result = fixture.wrap(rsLoader.harvest());
+    assertEquals(expectedCount, result.rowCount());
+
+    // Verify the data.
+
+    RowSetReader reader = result.reader();
+    int readCount = 0;
+    while (reader.next()) {
+      assertEquals(readCount, reader.scalar(0).getInt());
+      if (readCount % 2 == 0) {
+        assertEquals(String.format("%s%06d", strValue, readCount),
+            reader.variant(1).scalar().getString());
+      } else {
+        assertEquals(readCount * 10, reader.variant(1).scalar().getInt());
+      }
+      readCount++;
+    }
+    assertEquals(readCount, result.rowCount());
+    result.clear();
+
+    // Write a few more rows to verify the overflow row.
+
+    rsLoader.startBatch();
+    for (int i = 0; i < 1000; i++) {
+      if (count % 2 == 0) {
+        writer.addRow(count, String.format("%s%06d", strValue, count));
+      } else {
+        writer.addRow(count, count * 10);
+      }
+      count++;
+    }
+
+    result = fixture.wrap(rsLoader.harvest());
+    assertEquals(1001, result.rowCount());
+
+    final int startCount = readCount;
+    reader = result.reader();
+    while (reader.next()) {
+      assertEquals(readCount, reader.scalar(0).getInt());
+      if (readCount % 2 == 0) {
+        assertEquals(String.format("%s%06d", strValue, readCount),
+            reader.variant(1).scalar().getString());
+      } else {
+        assertEquals(readCount * 10, reader.variant(1).scalar().getInt());
+      }
+      readCount++;
+    }
+    assertEquals(readCount - startCount, result.rowCount());
+    result.clear();
+    rsLoader.close();
+  }
+
+  /**
+   * Test for the case of a list defined to contain exactly one type.
+   * Relies on the row set tests to verify that the single type model
+   * works for lists. Here we test that the ResultSetLoader put the
+   * pieces together correctly.
+   */
+
+  @Test
+  public void testSimpleList() {
+
+    // Schema with a list declared with one type, not expandable
+
+    final TupleMetadata schema = new SchemaBuilder()
+        .add("id", MinorType.INT)
+        .addList("list")
+          .addType(MinorType.VARCHAR)
+          .resumeSchema()
+        .buildSchema();
+
+    schema.metadata("list").variantSchema().becomeSimple();
+
+    final ResultSetLoaderImpl.ResultSetOptions options = new OptionBuilder()
+        .setSchema(schema)
+        .build();
+    final ResultSetLoader rsLoader = new 
ResultSetLoaderImpl(fixture.allocator(), options);
+    final RowSetLoader writer = rsLoader.writer();
+
+    // Sanity check: should be an array of Varchar because we said the
+    // types within the list is not expandable.
+
+    final ArrayWriter arrWriter = writer.array("list");
+    assertEquals(ObjectType.SCALAR, arrWriter.entryType());
+    final ScalarWriter strWriter = arrWriter.scalar();
+    assertEquals(ValueType.STRING, strWriter.valueType());
+
+    // Can write a batch as if this was a repeated Varchar, except
+    // that any value can also be null.
+
+    rsLoader.startBatch();
+    writer
+      .addRow(1, strArray("fred", "barney"))
+      .addRow(2, null)
+      .addRow(3, strArray("wilma", "betty", "pebbles"));
+
+    // Verify
+
+    final SingleRowSet expected = fixture.rowSetBuilder(schema)
+        .addRow(1, strArray("fred", "barney"))
+        .addRow(2, null)
+        .addRow(3, strArray("wilma", "betty", "pebbles"))
+        .build();
+
+    RowSetUtilities.verify(expected, fixture.wrap(rsLoader.harvest()));
+  }
+
+  /**
+   * Test a simple list created dynamically at load time.
+   * The list must include a single type member.
+   */
+
+  @Test
+  public void testSimpleListDynamic() {
+
+    final ResultSetLoader rsLoader = new 
ResultSetLoaderImpl(fixture.allocator());
+    final RowSetLoader writer = rsLoader.writer();
+
+    // Can write a batch as if this was a repeated Varchar, except
+    // that any value can also be null.
+
+    rsLoader.startBatch();
+
+    writer.addColumn(MaterializedField.create("id", 
Types.required(MinorType.INT)));
+
+    final ColumnMetadata colSchema = MetadataUtils.newVariant("list", 
DataMode.REPEATED);
+    colSchema.variantSchema().addType(MinorType.VARCHAR);
+    colSchema.variantSchema().becomeSimple();
+    writer.addColumn(colSchema);
+
+    // Sanity check: should be an array of Varchar because we said the
+    // types within the list is not expandable.
+
+    final ArrayWriter arrWriter = writer.array("list");
+    assertEquals(ObjectType.SCALAR, arrWriter.entryType());
+    final ScalarWriter strWriter = arrWriter.scalar();
+    assertEquals(ValueType.STRING, strWriter.valueType());
+
+    writer
+      .addRow(1, strArray("fred", "barney"))
+      .addRow(2, null)
+      .addRow(3, strArray("wilma", "betty", "pebbles"));
+
+    // Verify
+
+    final TupleMetadata schema = new SchemaBuilder()
+        .add("id", MinorType.INT)
+        .addList("list")
+          .addType(MinorType.VARCHAR)
+          .resumeSchema()
+        .buildSchema();
+    final SingleRowSet expected = fixture.rowSetBuilder(schema)
+        .addRow(1, strArray("fred", "barney"))
+        .addRow(2, null)
+        .addRow(3, strArray("wilma", "betty", "pebbles"))
+        .build();
+
+    RowSetUtilities.verify(expected, fixture.wrap(rsLoader.harvest()));
+  }
+
+  /**
+   * Try to create a simple (non-expandable) list without
+   * giving a member type. Expected to fail.
+   */
+
+  @Test
+  public void testSimpleListNoTypes() {
+
+    // Schema with a list declared with one type, not expandable
+
+    final TupleMetadata schema = new SchemaBuilder()
+        .add("id", MinorType.INT)
+        .addList("list")
+          .resumeSchema()
+        .buildSchema();
+
+    try {
+      schema.metadata("list").variantSchema().becomeSimple();
+    } catch (final IllegalStateException e) {
+      // expected
+    }
 
 Review comment:
   this test passes even if the exception is not thrown

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

[GitHub] ppadma commented on a change in pull request #1429: DRILL-6676: Add Union, List and Repeated List types to Result Set Loader

Reply via email to