This is an automated email from the ASF dual-hosted git repository.

omalley pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/orc.git


The following commit(s) were added to refs/heads/master by this push:
     new b1a487f  ORC-522: Add user type annotations.
b1a487f is described below

commit b1a487f992a78a5219a6b662a2e5a535691b9e31
Author: Owen O'Malley <[email protected]>
AuthorDate: Mon Jun 24 22:02:13 2019 -0700

    ORC-522: Add user type annotations.
    
    Fixes #410
    
    Signed-off-by: Owen O'Malley <[email protected]>
---
 java/core/src/java/org/apache/orc/OrcUtils.java    | 393 ++++-----------------
 .../src/java/org/apache/orc/TypeDescription.java   |  34 ++
 .../test/org/apache/orc/TestTypeDescription.java   |  42 +++
 .../java/org/apache/orc/tools/JsonFileDump.java    |   8 +
 proto/orc_proto.proto                              |   6 +
 5 files changed, 161 insertions(+), 322 deletions(-)

diff --git a/java/core/src/java/org/apache/orc/OrcUtils.java b/java/core/src/java/org/apache/orc/OrcUtils.java
index 0ba46dc..8fea086 100644
--- a/java/core/src/java/org/apache/orc/OrcUtils.java
+++ b/java/core/src/java/org/apache/orc/OrcUtils.java
@@ -17,7 +17,6 @@
  */
 package org.apache.orc;
 
-import org.apache.orc.impl.ReaderImpl;
 import org.apache.orc.impl.SchemaEvolution;
 
 import java.io.IOException;
@@ -42,6 +41,7 @@ public class OrcUtils {
    * corresponds to columns a and b. Index 3,4 correspond to column c which is list&lt;string&gt; and
    * index 5 correspond to column d. After flattening list&lt;string&gt; gets 2 columns.
    *
+   * Column names that aren't found are ignored.
    * @param selectedColumns - comma separated list of selected column names
    * @param schema       - object schema
    * @return - boolean array with true value set for the specified column names
@@ -110,6 +110,13 @@ public class OrcUtils {
   private static void appendOrcTypes(List<OrcProto.Type> result, TypeDescription typeDescr) {
     OrcProto.Type.Builder type = OrcProto.Type.newBuilder();
     List<TypeDescription> children = typeDescr.getChildren();
+    // save the attributes
+    for(String key: typeDescr.getAttributeNames()) {
+      type.addAttributes(
+          OrcProto.StringPair.newBuilder()
+              .setKey(key).setValue(typeDescr.getAttributeValue(key))
+              .build());
+    }
     switch (typeDescr.getCategory()) {
     case BOOLEAN:
       type.setKind(OrcProto.Type.Kind.BOOLEAN);
@@ -195,283 +202,6 @@ public class OrcUtils {
   }
 
   /**
-   * NOTE: This method ignores the subtype numbers in the TypeDescription rebuilds the subtype
-   * numbers based on the length of the result list being appended.
-   *
-   * @param result
-   * @param typeDescr
-   */
-  public static void appendOrcTypesRebuildSubtypes(List<OrcProto.Type> result,
-      TypeDescription typeDescr) {
-
-    int subtype = result.size();
-    OrcProto.Type.Builder type = OrcProto.Type.newBuilder();
-    boolean needsAdd = true;
-    List<TypeDescription> children = typeDescr.getChildren();
-    switch (typeDescr.getCategory()) {
-    case BOOLEAN:
-      type.setKind(OrcProto.Type.Kind.BOOLEAN);
-      break;
-    case BYTE:
-      type.setKind(OrcProto.Type.Kind.BYTE);
-      break;
-    case SHORT:
-      type.setKind(OrcProto.Type.Kind.SHORT);
-      break;
-    case INT:
-      type.setKind(OrcProto.Type.Kind.INT);
-      break;
-    case LONG:
-      type.setKind(OrcProto.Type.Kind.LONG);
-      break;
-    case FLOAT:
-      type.setKind(OrcProto.Type.Kind.FLOAT);
-      break;
-    case DOUBLE:
-      type.setKind(OrcProto.Type.Kind.DOUBLE);
-      break;
-    case STRING:
-      type.setKind(OrcProto.Type.Kind.STRING);
-      break;
-    case CHAR:
-      type.setKind(OrcProto.Type.Kind.CHAR);
-      type.setMaximumLength(typeDescr.getMaxLength());
-      break;
-    case VARCHAR:
-      type.setKind(OrcProto.Type.Kind.VARCHAR);
-      type.setMaximumLength(typeDescr.getMaxLength());
-      break;
-    case BINARY:
-      type.setKind(OrcProto.Type.Kind.BINARY);
-      break;
-    case TIMESTAMP:
-      type.setKind(OrcProto.Type.Kind.TIMESTAMP);
-      break;
-    case DATE:
-      type.setKind(OrcProto.Type.Kind.DATE);
-      break;
-    case DECIMAL:
-      type.setKind(OrcProto.Type.Kind.DECIMAL);
-      type.setPrecision(typeDescr.getPrecision());
-      type.setScale(typeDescr.getScale());
-      break;
-    case LIST:
-      type.setKind(OrcProto.Type.Kind.LIST);
-      type.addSubtypes(++subtype);
-      result.add(type.build());
-      needsAdd = false;
-      appendOrcTypesRebuildSubtypes(result, children.get(0));
-      break;
-    case MAP:
-      {
-        // Make room for MAP type.
-        result.add(null);
-
-        // Add MAP type pair in order to determine their subtype values.
-        appendOrcTypesRebuildSubtypes(result, children.get(0));
-        int subtype2 = result.size();
-        appendOrcTypesRebuildSubtypes(result, children.get(1));
-        type.setKind(OrcProto.Type.Kind.MAP);
-        type.addSubtypes(subtype + 1);
-        type.addSubtypes(subtype2);
-        result.set(subtype, type.build());
-        needsAdd = false;
-      }
-      break;
-    case STRUCT:
-      {
-        List<String> fieldNames = typeDescr.getFieldNames();
-
-        // Make room for STRUCT type.
-        result.add(null);
-
-        List<Integer> fieldSubtypes = new ArrayList<Integer>(fieldNames.size());
-        for(TypeDescription child: children) {
-          int fieldSubtype = result.size();
-          fieldSubtypes.add(fieldSubtype);
-          appendOrcTypesRebuildSubtypes(result, child);
-        }
-
-        type.setKind(OrcProto.Type.Kind.STRUCT);
-
-        for (int i = 0 ; i < fieldNames.size(); i++) {
-          type.addSubtypes(fieldSubtypes.get(i));
-          type.addFieldNames(fieldNames.get(i));
-        }
-        result.set(subtype, type.build());
-        needsAdd = false;
-      }
-      break;
-    case UNION:
-      {
-        // Make room for UNION type.
-        result.add(null);
-
-        List<Integer> unionSubtypes = new ArrayList<Integer>(children.size());
-        for(TypeDescription child: children) {
-          int unionSubtype = result.size();
-          unionSubtypes.add(unionSubtype);
-          appendOrcTypesRebuildSubtypes(result, child);
-        }
-
-        type.setKind(OrcProto.Type.Kind.UNION);
-        for (int i = 0 ; i < children.size(); i++) {
-          type.addSubtypes(unionSubtypes.get(i));
-        }
-        result.set(subtype, type.build());
-        needsAdd = false;
-      }
-      break;
-    default:
-      throw new IllegalArgumentException("Unknown category: " + typeDescr.getCategory());
-    }
-    if (needsAdd) {
-      result.add(type.build());
-    }
-  }
-
-  /**
-   * NOTE: This method ignores the subtype numbers in the OrcProto.Type rebuilds the subtype
-   * numbers based on the length of the result list being appended.
-   *
-   * @param result
-   * @param types
-   * @param columnId
-   */
-  public static int appendOrcTypesRebuildSubtypes(List<OrcProto.Type> result,
-      List<OrcProto.Type> types, int columnId) {
-
-    OrcProto.Type oldType = types.get(columnId++);
-
-    int subtype = result.size();
-    OrcProto.Type.Builder builder = OrcProto.Type.newBuilder();
-    boolean needsAdd = true;
-    switch (oldType.getKind()) {
-    case BOOLEAN:
-      builder.setKind(OrcProto.Type.Kind.BOOLEAN);
-      break;
-    case BYTE:
-      builder.setKind(OrcProto.Type.Kind.BYTE);
-      break;
-    case SHORT:
-      builder.setKind(OrcProto.Type.Kind.SHORT);
-      break;
-    case INT:
-      builder.setKind(OrcProto.Type.Kind.INT);
-      break;
-    case LONG:
-      builder.setKind(OrcProto.Type.Kind.LONG);
-      break;
-    case FLOAT:
-      builder.setKind(OrcProto.Type.Kind.FLOAT);
-      break;
-    case DOUBLE:
-      builder.setKind(OrcProto.Type.Kind.DOUBLE);
-      break;
-    case STRING:
-      builder.setKind(OrcProto.Type.Kind.STRING);
-      break;
-    case CHAR:
-      builder.setKind(OrcProto.Type.Kind.CHAR);
-      builder.setMaximumLength(oldType.getMaximumLength());
-      break;
-    case VARCHAR:
-      builder.setKind(OrcProto.Type.Kind.VARCHAR);
-      builder.setMaximumLength(oldType.getMaximumLength());
-      break;
-    case BINARY:
-      builder.setKind(OrcProto.Type.Kind.BINARY);
-      break;
-    case TIMESTAMP:
-      builder.setKind(OrcProto.Type.Kind.TIMESTAMP);
-      break;
-    case DATE:
-      builder.setKind(OrcProto.Type.Kind.DATE);
-      break;
-    case DECIMAL:
-      builder.setKind(OrcProto.Type.Kind.DECIMAL);
-      builder.setPrecision(oldType.getPrecision());
-      builder.setScale(oldType.getScale());
-      break;
-    case LIST:
-      builder.setKind(OrcProto.Type.Kind.LIST);
-      builder.addSubtypes(++subtype);
-      result.add(builder.build());
-      needsAdd = false;
-      columnId = appendOrcTypesRebuildSubtypes(result, types, columnId);
-      break;
-    case MAP:
-      {
-        // Make room for MAP type.
-        result.add(null);
-
-        // Add MAP type pair in order to determine their subtype values.
-        columnId = appendOrcTypesRebuildSubtypes(result, types, columnId);
-        int subtype2 = result.size();
-        columnId = appendOrcTypesRebuildSubtypes(result, types, columnId);
-        builder.setKind(OrcProto.Type.Kind.MAP);
-        builder.addSubtypes(subtype + 1);
-        builder.addSubtypes(subtype2);
-        result.set(subtype, builder.build());
-        needsAdd = false;
-      }
-      break;
-    case STRUCT:
-      {
-        List<String> fieldNames = oldType.getFieldNamesList();
-
-        // Make room for STRUCT type.
-        result.add(null);
-
-        List<Integer> fieldSubtypes = new ArrayList<Integer>(fieldNames.size());
-        for(int i = 0 ; i < fieldNames.size(); i++) {
-          int fieldSubtype = result.size();
-          fieldSubtypes.add(fieldSubtype);
-          columnId = appendOrcTypesRebuildSubtypes(result, types, columnId);
-        }
-
-        builder.setKind(OrcProto.Type.Kind.STRUCT);
-
-        for (int i = 0 ; i < fieldNames.size(); i++) {
-          builder.addSubtypes(fieldSubtypes.get(i));
-          builder.addFieldNames(fieldNames.get(i));
-        }
-        result.set(subtype, builder.build());
-        needsAdd = false;
-      }
-      break;
-    case UNION:
-      {
-        int subtypeCount = oldType.getSubtypesCount();
-
-        // Make room for UNION type.
-        result.add(null);
-
-        List<Integer> unionSubtypes = new ArrayList<Integer>(subtypeCount);
-        for(int i = 0 ; i < subtypeCount; i++) {
-          int unionSubtype = result.size();
-          unionSubtypes.add(unionSubtype);
-          columnId = appendOrcTypesRebuildSubtypes(result, types, columnId);
-        }
-
-        builder.setKind(OrcProto.Type.Kind.UNION);
-        for (int i = 0 ; i < subtypeCount; i++) {
-          builder.addSubtypes(unionSubtypes.get(i));
-        }
-        result.set(subtype, builder.build());
-        needsAdd = false;
-      }
-      break;
-    default:
-      throw new IllegalArgumentException("Unknown category: " + oldType.getKind());
-    }
-    if (needsAdd) {
-      result.add(builder.build());
-    }
-    return columnId;
-  }
-
-  /**
    * Checks whether the list of protobuf types from the file are valid or not.
    * @param types the list of types from the protobuf
    * @param root the top of the tree to check
@@ -533,85 +263,105 @@ public class OrcUtils {
                                                 int rootColumn)
           throws FileFormatException {
     OrcProto.Type type = types.get(rootColumn);
+    TypeDescription result;
     switch (type.getKind()) {
       case BOOLEAN:
-        return TypeDescription.createBoolean();
+        result = TypeDescription.createBoolean();
+        break;
       case BYTE:
-        return TypeDescription.createByte();
+        result = TypeDescription.createByte();
+        break;
       case SHORT:
-        return TypeDescription.createShort();
+        result = TypeDescription.createShort();
+        break;
       case INT:
-        return TypeDescription.createInt();
+        result = TypeDescription.createInt();
+        break;
       case LONG:
-        return TypeDescription.createLong();
+        result = TypeDescription.createLong();
+        break;
       case FLOAT:
-        return TypeDescription.createFloat();
+        result = TypeDescription.createFloat();
+        break;
       case DOUBLE:
-        return TypeDescription.createDouble();
+        result = TypeDescription.createDouble();
+        break;
       case STRING:
-        return TypeDescription.createString();
+        result = TypeDescription.createString();
+        break;
       case CHAR:
       case VARCHAR: {
-        TypeDescription result = type.getKind() == OrcProto.Type.Kind.CHAR ?
-            TypeDescription.createChar() : TypeDescription.createVarchar();
-        if (type.hasMaximumLength()) {
-          result.withMaxLength(type.getMaximumLength());
+          result = type.getKind() == OrcProto.Type.Kind.CHAR ?
+              TypeDescription.createChar() : TypeDescription.createVarchar();
+          if (type.hasMaximumLength()) {
+            result.withMaxLength(type.getMaximumLength());
+          }
         }
-        return result;
-      }
+        break;
       case BINARY:
-        return TypeDescription.createBinary();
+        result = TypeDescription.createBinary();
+        break;
       case TIMESTAMP:
-        return TypeDescription.createTimestamp();
+        result = TypeDescription.createTimestamp();
+        break;
       case DATE:
-        return TypeDescription.createDate();
+        result = TypeDescription.createDate();
+        break;
       case DECIMAL: {
-        TypeDescription result = TypeDescription.createDecimal();
-        if (type.hasScale()) {
-          result.withScale(type.getScale());
-        }
-        if (type.hasPrecision()) {
-          result.withPrecision(type.getPrecision());
+          result = TypeDescription.createDecimal();
+          if (type.hasScale()) {
+            result.withScale(type.getScale());
+          }
+          if (type.hasPrecision()) {
+            result.withPrecision(type.getPrecision());
+          }
         }
-        return result;
-      }
+        break;
       case LIST:
         if (type.getSubtypesCount() != 1) {
           throw new FileFormatException("LIST type should contain exactly " +
                   "one subtype but has " + type.getSubtypesCount());
         }
-        return TypeDescription.createList(
+        result = TypeDescription.createList(
             convertTypeFromProtobuf(types, type.getSubtypes(0)));
+        break;
       case MAP:
         if (type.getSubtypesCount() != 2) {
           throw new FileFormatException("MAP type should contain exactly " +
                   "two subtypes but has " + type.getSubtypesCount());
         }
-        return TypeDescription.createMap(
+        result = TypeDescription.createMap(
             convertTypeFromProtobuf(types, type.getSubtypes(0)),
             convertTypeFromProtobuf(types, type.getSubtypes(1)));
+        break;
       case STRUCT: {
-        TypeDescription result = TypeDescription.createStruct();
-        for(int f=0; f < type.getSubtypesCount(); ++f) {
-          result.addField(type.getFieldNames(f),
-              convertTypeFromProtobuf(types, type.getSubtypes(f)));
+          result = TypeDescription.createStruct();
+          for(int f=0; f < type.getSubtypesCount(); ++f) {
+            result.addField(type.getFieldNames(f),
+                convertTypeFromProtobuf(types, type.getSubtypes(f)));
+          }
         }
-        return result;
-      }
+        break;
       case UNION: {
-        if (type.getSubtypesCount() == 0) {
-          throw new FileFormatException("UNION type should contain at least" +
+          if (type.getSubtypesCount() == 0) {
+            throw new FileFormatException("UNION type should contain at least" +
                   " one subtype but has none");
+          }
+          result = TypeDescription.createUnion();
+          for(int f=0; f < type.getSubtypesCount(); ++f) {
+            result.addUnionChild(
+                convertTypeFromProtobuf(types, type.getSubtypes(f)));
+          }
         }
-        TypeDescription result = TypeDescription.createUnion();
-        for(int f=0; f < type.getSubtypesCount(); ++f) {
-          result.addUnionChild(
-              convertTypeFromProtobuf(types, type.getSubtypes(f)));
-        }
-        return result;
-      }
+        break;
+      default:
+        throw new IllegalArgumentException("Unknown ORC type " + type.getKind());
+    }
+    for(int i = 0; i < type.getAttributesCount(); ++i) {
+      OrcProto.StringPair pair = type.getAttributes(i);
+      result.setAttribute(pair.getKey(), pair.getValue());
     }
-    throw new IllegalArgumentException("Unknown ORC type " + type.getKind());
+    return result;
   }
 
   public static List<StripeInformation> convertProtoStripesToStripes(
@@ -630,5 +380,4 @@ public class OrcUtils {
     }
     return result;
   }
-
 }
diff --git a/java/core/src/java/org/apache/orc/TypeDescription.java b/java/core/src/java/org/apache/orc/TypeDescription.java
index 8372207..9b49ff0 100644
--- a/java/core/src/java/org/apache/orc/TypeDescription.java
+++ b/java/core/src/java/org/apache/orc/TypeDescription.java
@@ -35,7 +35,9 @@ import org.apache.orc.impl.SchemaEvolution;
 import java.io.Serializable;
 import java.util.ArrayList;
 import java.util.Collections;
+import java.util.HashMap;
 import java.util.List;
+import java.util.Map;
 import java.util.regex.Pattern;
 
 /**
@@ -448,6 +450,17 @@ public class TypeDescription
     return this;
   }
 
+  /**
+   * Set an attribute on this type
+   * @param key the attribute name
+   * @param value the attribute value
+   * @return this for method chaining
+   */
+  public TypeDescription setAttribute(String key, String value) {
+    attributes.put(key, value);
+    return this;
+  }
+
   public static TypeDescription createVarchar() {
     return new TypeDescription(Category.VARCHAR);
   }
@@ -773,6 +786,26 @@ public class TypeDescription
   }
 
   /**
+   * Get the list of attribute names defined on this type.
+   * @return a list of sorted attribute names
+   */
+  public List<String> getAttributeNames() {
+    List<String> result = new ArrayList<>(attributes.size());
+    result.addAll(attributes.keySet());
+    Collections.sort(result);
+    return result;
+  }
+
+  /**
+   * Get the value of a given attribute.
+   * @param attributeName the name of the attribute
+   * @return the value of the attribute or null if it isn't set
+   */
+  public String getAttributeValue(String attributeName) {
+    return attributes.get(attributeName);
+  }
+
+  /**
    * Get the subtypes of this type.
    * @return the list of children types
    */
@@ -816,6 +849,7 @@ public class TypeDescription
   private final Category category;
   private final List<TypeDescription> children;
   private final List<String> fieldNames;
+  private final Map<String,String> attributes = new HashMap<>();
   private int maxLength = DEFAULT_LENGTH;
   private int precision = DEFAULT_PRECISION;
   private int scale = DEFAULT_SCALE;
diff --git a/java/core/src/test/org/apache/orc/TestTypeDescription.java b/java/core/src/test/org/apache/orc/TestTypeDescription.java
index 570c7f4..6a48746 100644
--- a/java/core/src/test/org/apache/orc/TestTypeDescription.java
+++ b/java/core/src/test/org/apache/orc/TestTypeDescription.java
@@ -17,13 +17,18 @@
  */
 package org.apache.orc;
 
+import static org.junit.Assert.assertArrayEquals;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertTrue;
 
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
 import org.junit.Rule;
 import org.junit.Test;
 import org.junit.rules.ExpectedException;
 
+import java.io.File;
+import java.io.IOException;
 import java.util.List;
 
 public class TestTypeDescription {
@@ -325,4 +330,41 @@ public class TestTypeDescription {
     results = type.findSubtypes("");
     assertEquals(0, results.size());
   }
+
+  @Test
+  public void testAttributes() throws IOException {
+    TypeDescription schema = TypeDescription.fromString(
+        "struct<" +
+            "name:struct<first:string,last:string>," +
+            "address:struct<street:string,city:string,country:string,post_code:string>," +
+            "credit_cards:array<struct<card_number:string,expire:date,ccv:string>>>");
+    // set some attributes
+    schema.findSubtype("name").setAttribute("iceberg.id", "12");
+    schema.findSubtype("address.street").setAttribute("mask", "nullify")
+        .setAttribute("encrypt", "pii");
+
+    // write a file with those attributes
+    Path path = new Path(System.getProperty("test.tmp.dir",
+        "target" + File.separator + "test" + File.separator + "tmp"), "attribute.orc");
+    Configuration conf = new Configuration();
+    Writer writer = OrcFile.createWriter(path,
+        OrcFile.writerOptions(conf).setSchema(schema).overwrite(true));
+    writer.close();
+
+    // read the file back again
+    Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf));
+    TypeDescription readerSchema = reader.getSchema();
+
+    // make sure that the read types have the attributes
+    TypeDescription nameCol = readerSchema.findSubtype("name");
+    assertArrayEquals(new Object[]{"iceberg.id"},
+        nameCol.getAttributeNames().toArray());
+    assertEquals("12", nameCol.getAttributeValue("iceberg.id"));
+    TypeDescription street = readerSchema.findSubtype("address.street");
+    assertArrayEquals(new Object[]{"encrypt", "mask"},
+        street.getAttributeNames().toArray());
+    assertEquals("pii", street.getAttributeValue("encrypt"));
+    assertEquals("nullify", street.getAttributeValue("mask"));
+    assertEquals(null, street.getAttributeValue("foobar"));
+  }
 }
diff --git a/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java b/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java
index db51fed..9efeac8 100644
--- a/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java
+++ b/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java
@@ -277,6 +277,14 @@ public class JsonFileDump {
       if (type.hasMaximumLength()) {
         writer.key("maxLength").value(type.getMaximumLength());
       }
+
+      if (type.getAttributesCount() > 0) {
+        writer.key("attributes").object();
+        for(OrcProto.StringPair pair: type.getAttributesList()) {
+          writer.key(pair.getKey()).value(pair.getValue());
+        }
+        writer.endObject();
+      }
       writer.endObject();
     }
   }
diff --git a/proto/orc_proto.proto b/proto/orc_proto.proto
index f0c66f1..a2f4c24 100644
--- a/proto/orc_proto.proto
+++ b/proto/orc_proto.proto
@@ -187,6 +187,11 @@ message StripeFooter {
 //   postscript: PostScript
 //   psLen: byte
 
+message StringPair {
+  optional string key = 1;
+  optional string value = 2;
+}
+
 message Type {
   enum Kind {
     BOOLEAN = 0;
@@ -214,6 +219,7 @@ message Type {
   optional uint32 maximumLength = 4;
   optional uint32 precision = 5;
   optional uint32 scale = 6;
+  repeated StringPair attributes = 7;
 }
 
 message StripeInformation {

Reply via email to