shardulm94 commented on a change in pull request #1197:
URL: https://github.com/apache/iceberg/pull/1197#discussion_r454563885



##########
File path: data/src/main/java/org/apache/iceberg/data/orc/GenericOrcWriter.java
##########
@@ -434,175 +144,12 @@ public void addValue(int rowId, BigDecimal data, 
ColumnVector output) {
 
     @Override
     @SuppressWarnings("unchecked")
-    public void addValue(int rowId, Record data, ColumnVector output) {
-      if (data == null) {
-        output.noNulls = false;
-        output.isNull[rowId] = true;
-      } else {
-        output.isNull[rowId] = false;
-        StructColumnVector cv = (StructColumnVector) output;
-        for (int c = 0; c < children.length; ++c) {
-          children[c].addValue(rowId, data.get(c, children[c].getJavaClass()), 
cv.fields[c]);
-        }
-      }
-    }
-  }
-
-  static class ListConverter implements Converter<List> {
-    private final Converter children;
-
-    ListConverter(TypeDescription schema) {
-      this.children = buildConverter(schema.getChildren().get(0));
-    }
-
-    @Override
-    public Class<List> getJavaClass() {
-      return List.class;
-    }
-
-    @Override
-    @SuppressWarnings("unchecked")
-    public void addValue(int rowId, List data, ColumnVector output) {
-      if (data == null) {
-        output.noNulls = false;
-        output.isNull[rowId] = true;
-      } else {
-        output.isNull[rowId] = false;
-        List<Object> value = (List<Object>) data;
-        ListColumnVector cv = (ListColumnVector) output;
-        // record the length and start of the list elements
-        cv.lengths[rowId] = value.size();
-        cv.offsets[rowId] = cv.childCount;
-        cv.childCount += cv.lengths[rowId];
-        // make sure the child is big enough
-        cv.child.ensureSize(cv.childCount, true);
-        // Add each element
-        for (int e = 0; e < cv.lengths[rowId]; ++e) {
-          children.addValue((int) (e + cv.offsets[rowId]), value.get(e), 
cv.child);
-        }
+    public void nonNullWrite(int rowId, Record data, ColumnVector output) {
+      StructColumnVector cv = (StructColumnVector) output;
+      for (int c = 0; c < writers.size(); ++c) {
+        OrcValueWriter child = writers.get(c);
+        child.write(rowId, data.get(c, child.getJavaClass()), cv.fields[c]);
       }
     }
   }
-
-  static class MapConverter implements Converter<Map> {
-    private final Converter keyConverter;
-    private final Converter valueConverter;
-
-    MapConverter(TypeDescription schema) {
-      this.keyConverter = buildConverter(schema.getChildren().get(0));
-      this.valueConverter = buildConverter(schema.getChildren().get(1));
-    }
-
-    @Override
-    public Class<Map> getJavaClass() {
-      return Map.class;
-    }
-
-    @Override
-    @SuppressWarnings("unchecked")
-    public void addValue(int rowId, Map data, ColumnVector output) {
-      if (data == null) {
-        output.noNulls = false;
-        output.isNull[rowId] = true;
-      } else {
-        output.isNull[rowId] = false;
-        Map<Object, Object> map = (Map<Object, Object>) data;
-        List<Object> keys = Lists.newArrayListWithExpectedSize(map.size());
-        List<Object> values = Lists.newArrayListWithExpectedSize(map.size());
-        for (Map.Entry<?, ?> entry : map.entrySet()) {
-          keys.add(entry.getKey());
-          values.add(entry.getValue());
-        }
-        MapColumnVector cv = (MapColumnVector) output;
-        // record the length and start of the list elements
-        cv.lengths[rowId] = map.size();
-        cv.offsets[rowId] = cv.childCount;
-        cv.childCount += cv.lengths[rowId];
-        // make sure the child is big enough
-        cv.keys.ensureSize(cv.childCount, true);
-        cv.values.ensureSize(cv.childCount, true);
-        // Add each element
-        for (int e = 0; e < cv.lengths[rowId]; ++e) {
-          int pos = (int) (e + cv.offsets[rowId]);
-          keyConverter.addValue(pos, keys.get(e), cv.keys);
-          valueConverter.addValue(pos, values.get(e), cv.values);
-        }
-      }
-    }
-  }
-
-  private static Converter buildConverter(TypeDescription schema) {
-    switch (schema.getCategory()) {
-      case BOOLEAN:
-        return new BooleanConverter();
-      case BYTE:
-        return new ByteConverter();
-      case SHORT:
-        return new ShortConverter();
-      case DATE:
-        return new DateConverter();
-      case INT:
-        return new IntConverter();
-      case LONG:
-        String longAttributeValue = 
schema.getAttributeValue(ORCSchemaUtil.ICEBERG_LONG_TYPE_ATTRIBUTE);
-        ORCSchemaUtil.LongType longType = longAttributeValue == null ? 
ORCSchemaUtil.LongType.LONG :
-            ORCSchemaUtil.LongType.valueOf(longAttributeValue);
-        switch (longType) {
-          case TIME:
-            return new TimeConverter();
-          case LONG:
-            return new LongConverter();
-          default:
-            throw new IllegalStateException("Unhandled Long type found in ORC 
type attribute: " + longType);
-        }
-      case FLOAT:
-        return new FloatConverter();
-      case DOUBLE:
-        return new DoubleConverter();
-      case BINARY:
-        String binaryAttributeValue = 
schema.getAttributeValue(ORCSchemaUtil.ICEBERG_BINARY_TYPE_ATTRIBUTE);
-        ORCSchemaUtil.BinaryType binaryType = binaryAttributeValue == null ? 
ORCSchemaUtil.BinaryType.BINARY :
-            ORCSchemaUtil.BinaryType.valueOf(binaryAttributeValue);
-        switch (binaryType) {
-          case UUID:
-            return new UUIDConverter();
-          case FIXED:
-            return new FixedConverter();

Review comment:
       `TypeID#javaClass()` does not define the Java types exposed by the 
`iceberg-data` module. They are mainly used internally in `iceberg-core` when 
serializing and deserializing to/from manifest files and in Expressions as 
literals. Iceberg Generics uses `byte[]` for fixed types. So this looks correct 
and is on par with Parquet and Avro Generics. You can also look at Timestamp 
types. Iceberg Generics uses `LocalDateTime` and `OffsetDateTime` for Timestamp 
Without Zone and Timestamp With Zone respectively. However, both types map to 
longs in `TypeID#javaClass()`.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]



---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to