This is an automated email from the ASF dual-hosted git repository.
omalley pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/master by this push:
new b1a487f ORC-522: Add user type annotations.
b1a487f is described below
commit b1a487f992a78a5219a6b662a2e5a535691b9e31
Author: Owen O'Malley <[email protected]>
AuthorDate: Mon Jun 24 22:02:13 2019 -0700
ORC-522: Add user type annotations.
Fixes #410
Signed-off-by: Owen O'Malley <[email protected]>
---
java/core/src/java/org/apache/orc/OrcUtils.java | 393 ++++-----------------
.../src/java/org/apache/orc/TypeDescription.java | 34 ++
.../test/org/apache/orc/TestTypeDescription.java | 42 +++
.../java/org/apache/orc/tools/JsonFileDump.java | 8 +
proto/orc_proto.proto | 6 +
5 files changed, 161 insertions(+), 322 deletions(-)
diff --git a/java/core/src/java/org/apache/orc/OrcUtils.java b/java/core/src/java/org/apache/orc/OrcUtils.java
index 0ba46dc..8fea086 100644
--- a/java/core/src/java/org/apache/orc/OrcUtils.java
+++ b/java/core/src/java/org/apache/orc/OrcUtils.java
@@ -17,7 +17,6 @@
*/
package org.apache.orc;
-import org.apache.orc.impl.ReaderImpl;
import org.apache.orc.impl.SchemaEvolution;
import java.io.IOException;
@@ -42,6 +41,7 @@ public class OrcUtils {
* corresponds to columns a and b. Index 3,4 correspond to column c which is list<string> and
* index 5 correspond to column d. After flattening list<string> gets 2 columns.
*
+ * Column names that aren't found are ignored.
* @param selectedColumns - comma separated list of selected column names
* @param schema - object schema
* @return - boolean array with true value set for the specified column names
@@ -110,6 +110,13 @@ public class OrcUtils {
private static void appendOrcTypes(List<OrcProto.Type> result,
TypeDescription typeDescr) {
OrcProto.Type.Builder type = OrcProto.Type.newBuilder();
List<TypeDescription> children = typeDescr.getChildren();
+ // save the attributes
+ for(String key: typeDescr.getAttributeNames()) {
+ type.addAttributes(
+ OrcProto.StringPair.newBuilder()
+ .setKey(key).setValue(typeDescr.getAttributeValue(key))
+ .build());
+ }
switch (typeDescr.getCategory()) {
case BOOLEAN:
type.setKind(OrcProto.Type.Kind.BOOLEAN);
@@ -195,283 +202,6 @@ public class OrcUtils {
}
/**
- * NOTE: This method ignores the subtype numbers in the TypeDescription rebuilds the subtype
- * numbers based on the length of the result list being appended.
- *
- * @param result
- * @param typeDescr
- */
- public static void appendOrcTypesRebuildSubtypes(List<OrcProto.Type> result,
- TypeDescription typeDescr) {
-
- int subtype = result.size();
- OrcProto.Type.Builder type = OrcProto.Type.newBuilder();
- boolean needsAdd = true;
- List<TypeDescription> children = typeDescr.getChildren();
- switch (typeDescr.getCategory()) {
- case BOOLEAN:
- type.setKind(OrcProto.Type.Kind.BOOLEAN);
- break;
- case BYTE:
- type.setKind(OrcProto.Type.Kind.BYTE);
- break;
- case SHORT:
- type.setKind(OrcProto.Type.Kind.SHORT);
- break;
- case INT:
- type.setKind(OrcProto.Type.Kind.INT);
- break;
- case LONG:
- type.setKind(OrcProto.Type.Kind.LONG);
- break;
- case FLOAT:
- type.setKind(OrcProto.Type.Kind.FLOAT);
- break;
- case DOUBLE:
- type.setKind(OrcProto.Type.Kind.DOUBLE);
- break;
- case STRING:
- type.setKind(OrcProto.Type.Kind.STRING);
- break;
- case CHAR:
- type.setKind(OrcProto.Type.Kind.CHAR);
- type.setMaximumLength(typeDescr.getMaxLength());
- break;
- case VARCHAR:
- type.setKind(OrcProto.Type.Kind.VARCHAR);
- type.setMaximumLength(typeDescr.getMaxLength());
- break;
- case BINARY:
- type.setKind(OrcProto.Type.Kind.BINARY);
- break;
- case TIMESTAMP:
- type.setKind(OrcProto.Type.Kind.TIMESTAMP);
- break;
- case DATE:
- type.setKind(OrcProto.Type.Kind.DATE);
- break;
- case DECIMAL:
- type.setKind(OrcProto.Type.Kind.DECIMAL);
- type.setPrecision(typeDescr.getPrecision());
- type.setScale(typeDescr.getScale());
- break;
- case LIST:
- type.setKind(OrcProto.Type.Kind.LIST);
- type.addSubtypes(++subtype);
- result.add(type.build());
- needsAdd = false;
- appendOrcTypesRebuildSubtypes(result, children.get(0));
- break;
- case MAP:
- {
- // Make room for MAP type.
- result.add(null);
-
- // Add MAP type pair in order to determine their subtype values.
- appendOrcTypesRebuildSubtypes(result, children.get(0));
- int subtype2 = result.size();
- appendOrcTypesRebuildSubtypes(result, children.get(1));
- type.setKind(OrcProto.Type.Kind.MAP);
- type.addSubtypes(subtype + 1);
- type.addSubtypes(subtype2);
- result.set(subtype, type.build());
- needsAdd = false;
- }
- break;
- case STRUCT:
- {
- List<String> fieldNames = typeDescr.getFieldNames();
-
- // Make room for STRUCT type.
- result.add(null);
-
- List<Integer> fieldSubtypes = new ArrayList<Integer>(fieldNames.size());
- for(TypeDescription child: children) {
- int fieldSubtype = result.size();
- fieldSubtypes.add(fieldSubtype);
- appendOrcTypesRebuildSubtypes(result, child);
- }
-
- type.setKind(OrcProto.Type.Kind.STRUCT);
-
- for (int i = 0 ; i < fieldNames.size(); i++) {
- type.addSubtypes(fieldSubtypes.get(i));
- type.addFieldNames(fieldNames.get(i));
- }
- result.set(subtype, type.build());
- needsAdd = false;
- }
- break;
- case UNION:
- {
- // Make room for UNION type.
- result.add(null);
-
- List<Integer> unionSubtypes = new ArrayList<Integer>(children.size());
- for(TypeDescription child: children) {
- int unionSubtype = result.size();
- unionSubtypes.add(unionSubtype);
- appendOrcTypesRebuildSubtypes(result, child);
- }
-
- type.setKind(OrcProto.Type.Kind.UNION);
- for (int i = 0 ; i < children.size(); i++) {
- type.addSubtypes(unionSubtypes.get(i));
- }
- result.set(subtype, type.build());
- needsAdd = false;
- }
- break;
- default:
- throw new IllegalArgumentException("Unknown category: " + typeDescr.getCategory());
- }
- if (needsAdd) {
- result.add(type.build());
- }
- }
-
- /**
- * NOTE: This method ignores the subtype numbers in the OrcProto.Type rebuilds the subtype
- * numbers based on the length of the result list being appended.
- *
- * @param result
- * @param types
- * @param columnId
- */
- public static int appendOrcTypesRebuildSubtypes(List<OrcProto.Type> result,
- List<OrcProto.Type> types, int columnId) {
-
- OrcProto.Type oldType = types.get(columnId++);
-
- int subtype = result.size();
- OrcProto.Type.Builder builder = OrcProto.Type.newBuilder();
- boolean needsAdd = true;
- switch (oldType.getKind()) {
- case BOOLEAN:
- builder.setKind(OrcProto.Type.Kind.BOOLEAN);
- break;
- case BYTE:
- builder.setKind(OrcProto.Type.Kind.BYTE);
- break;
- case SHORT:
- builder.setKind(OrcProto.Type.Kind.SHORT);
- break;
- case INT:
- builder.setKind(OrcProto.Type.Kind.INT);
- break;
- case LONG:
- builder.setKind(OrcProto.Type.Kind.LONG);
- break;
- case FLOAT:
- builder.setKind(OrcProto.Type.Kind.FLOAT);
- break;
- case DOUBLE:
- builder.setKind(OrcProto.Type.Kind.DOUBLE);
- break;
- case STRING:
- builder.setKind(OrcProto.Type.Kind.STRING);
- break;
- case CHAR:
- builder.setKind(OrcProto.Type.Kind.CHAR);
- builder.setMaximumLength(oldType.getMaximumLength());
- break;
- case VARCHAR:
- builder.setKind(OrcProto.Type.Kind.VARCHAR);
- builder.setMaximumLength(oldType.getMaximumLength());
- break;
- case BINARY:
- builder.setKind(OrcProto.Type.Kind.BINARY);
- break;
- case TIMESTAMP:
- builder.setKind(OrcProto.Type.Kind.TIMESTAMP);
- break;
- case DATE:
- builder.setKind(OrcProto.Type.Kind.DATE);
- break;
- case DECIMAL:
- builder.setKind(OrcProto.Type.Kind.DECIMAL);
- builder.setPrecision(oldType.getPrecision());
- builder.setScale(oldType.getScale());
- break;
- case LIST:
- builder.setKind(OrcProto.Type.Kind.LIST);
- builder.addSubtypes(++subtype);
- result.add(builder.build());
- needsAdd = false;
- columnId = appendOrcTypesRebuildSubtypes(result, types, columnId);
- break;
- case MAP:
- {
- // Make room for MAP type.
- result.add(null);
-
- // Add MAP type pair in order to determine their subtype values.
- columnId = appendOrcTypesRebuildSubtypes(result, types, columnId);
- int subtype2 = result.size();
- columnId = appendOrcTypesRebuildSubtypes(result, types, columnId);
- builder.setKind(OrcProto.Type.Kind.MAP);
- builder.addSubtypes(subtype + 1);
- builder.addSubtypes(subtype2);
- result.set(subtype, builder.build());
- needsAdd = false;
- }
- break;
- case STRUCT:
- {
- List<String> fieldNames = oldType.getFieldNamesList();
-
- // Make room for STRUCT type.
- result.add(null);
-
- List<Integer> fieldSubtypes = new ArrayList<Integer>(fieldNames.size());
- for(int i = 0 ; i < fieldNames.size(); i++) {
- int fieldSubtype = result.size();
- fieldSubtypes.add(fieldSubtype);
- columnId = appendOrcTypesRebuildSubtypes(result, types, columnId);
- }
-
- builder.setKind(OrcProto.Type.Kind.STRUCT);
-
- for (int i = 0 ; i < fieldNames.size(); i++) {
- builder.addSubtypes(fieldSubtypes.get(i));
- builder.addFieldNames(fieldNames.get(i));
- }
- result.set(subtype, builder.build());
- needsAdd = false;
- }
- break;
- case UNION:
- {
- int subtypeCount = oldType.getSubtypesCount();
-
- // Make room for UNION type.
- result.add(null);
-
- List<Integer> unionSubtypes = new ArrayList<Integer>(subtypeCount);
- for(int i = 0 ; i < subtypeCount; i++) {
- int unionSubtype = result.size();
- unionSubtypes.add(unionSubtype);
- columnId = appendOrcTypesRebuildSubtypes(result, types, columnId);
- }
-
- builder.setKind(OrcProto.Type.Kind.UNION);
- for (int i = 0 ; i < subtypeCount; i++) {
- builder.addSubtypes(unionSubtypes.get(i));
- }
- result.set(subtype, builder.build());
- needsAdd = false;
- }
- break;
- default:
- throw new IllegalArgumentException("Unknown category: " + oldType.getKind());
- }
- if (needsAdd) {
- result.add(builder.build());
- }
- return columnId;
- }
-
- /**
* Checks whether the list of protobuf types from the file are valid or not.
* @param types the list of types from the protobuf
* @param root the top of the tree to check
@@ -533,85 +263,105 @@ public class OrcUtils {
int rootColumn)
throws FileFormatException {
OrcProto.Type type = types.get(rootColumn);
+ TypeDescription result;
switch (type.getKind()) {
case BOOLEAN:
- return TypeDescription.createBoolean();
+ result = TypeDescription.createBoolean();
+ break;
case BYTE:
- return TypeDescription.createByte();
+ result = TypeDescription.createByte();
+ break;
case SHORT:
- return TypeDescription.createShort();
+ result = TypeDescription.createShort();
+ break;
case INT:
- return TypeDescription.createInt();
+ result = TypeDescription.createInt();
+ break;
case LONG:
- return TypeDescription.createLong();
+ result = TypeDescription.createLong();
+ break;
case FLOAT:
- return TypeDescription.createFloat();
+ result = TypeDescription.createFloat();
+ break;
case DOUBLE:
- return TypeDescription.createDouble();
+ result = TypeDescription.createDouble();
+ break;
case STRING:
- return TypeDescription.createString();
+ result = TypeDescription.createString();
+ break;
case CHAR:
case VARCHAR: {
- TypeDescription result = type.getKind() == OrcProto.Type.Kind.CHAR ?
- TypeDescription.createChar() : TypeDescription.createVarchar();
- if (type.hasMaximumLength()) {
- result.withMaxLength(type.getMaximumLength());
+ result = type.getKind() == OrcProto.Type.Kind.CHAR ?
+ TypeDescription.createChar() : TypeDescription.createVarchar();
+ if (type.hasMaximumLength()) {
+ result.withMaxLength(type.getMaximumLength());
+ }
}
- return result;
- }
+ break;
case BINARY:
- return TypeDescription.createBinary();
+ result = TypeDescription.createBinary();
+ break;
case TIMESTAMP:
- return TypeDescription.createTimestamp();
+ result = TypeDescription.createTimestamp();
+ break;
case DATE:
- return TypeDescription.createDate();
+ result = TypeDescription.createDate();
+ break;
case DECIMAL: {
- TypeDescription result = TypeDescription.createDecimal();
- if (type.hasScale()) {
- result.withScale(type.getScale());
- }
- if (type.hasPrecision()) {
- result.withPrecision(type.getPrecision());
+ result = TypeDescription.createDecimal();
+ if (type.hasScale()) {
+ result.withScale(type.getScale());
+ }
+ if (type.hasPrecision()) {
+ result.withPrecision(type.getPrecision());
+ }
}
- return result;
- }
+ break;
case LIST:
if (type.getSubtypesCount() != 1) {
throw new FileFormatException("LIST type should contain exactly " +
"one subtype but has " + type.getSubtypesCount());
}
- return TypeDescription.createList(
+ result = TypeDescription.createList(
convertTypeFromProtobuf(types, type.getSubtypes(0)));
+ break;
case MAP:
if (type.getSubtypesCount() != 2) {
throw new FileFormatException("MAP type should contain exactly " +
"two subtypes but has " + type.getSubtypesCount());
}
- return TypeDescription.createMap(
+ result = TypeDescription.createMap(
convertTypeFromProtobuf(types, type.getSubtypes(0)),
convertTypeFromProtobuf(types, type.getSubtypes(1)));
+ break;
case STRUCT: {
- TypeDescription result = TypeDescription.createStruct();
- for(int f=0; f < type.getSubtypesCount(); ++f) {
- result.addField(type.getFieldNames(f),
- convertTypeFromProtobuf(types, type.getSubtypes(f)));
+ result = TypeDescription.createStruct();
+ for(int f=0; f < type.getSubtypesCount(); ++f) {
+ result.addField(type.getFieldNames(f),
+ convertTypeFromProtobuf(types, type.getSubtypes(f)));
+ }
}
- return result;
- }
+ break;
case UNION: {
- if (type.getSubtypesCount() == 0) {
- throw new FileFormatException("UNION type should contain at least" +
+ if (type.getSubtypesCount() == 0) {
+ throw new FileFormatException("UNION type should contain at least" +
" one subtype but has none");
+ }
+ result = TypeDescription.createUnion();
+ for(int f=0; f < type.getSubtypesCount(); ++f) {
+ result.addUnionChild(
+ convertTypeFromProtobuf(types, type.getSubtypes(f)));
+ }
}
- TypeDescription result = TypeDescription.createUnion();
- for(int f=0; f < type.getSubtypesCount(); ++f) {
- result.addUnionChild(
- convertTypeFromProtobuf(types, type.getSubtypes(f)));
- }
- return result;
- }
+ break;
+ default:
+ throw new IllegalArgumentException("Unknown ORC type " + type.getKind());
+ }
+ for(int i = 0; i < type.getAttributesCount(); ++i) {
+ OrcProto.StringPair pair = type.getAttributes(i);
+ result.setAttribute(pair.getKey(), pair.getValue());
}
- throw new IllegalArgumentException("Unknown ORC type " + type.getKind());
+ return result;
}
public static List<StripeInformation> convertProtoStripesToStripes(
@@ -630,5 +380,4 @@ public class OrcUtils {
}
return result;
}
-
}
diff --git a/java/core/src/java/org/apache/orc/TypeDescription.java b/java/core/src/java/org/apache/orc/TypeDescription.java
index 8372207..9b49ff0 100644
--- a/java/core/src/java/org/apache/orc/TypeDescription.java
+++ b/java/core/src/java/org/apache/orc/TypeDescription.java
@@ -35,7 +35,9 @@ import org.apache.orc.impl.SchemaEvolution;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
+import java.util.HashMap;
import java.util.List;
+import java.util.Map;
import java.util.regex.Pattern;
/**
@@ -448,6 +450,17 @@ public class TypeDescription
return this;
}
+ /**
+ * Set an attribute on this type
+ * @param key the attribute name
+ * @param value the attribute value
+ * @return this for method chaining
+ */
+ public TypeDescription setAttribute(String key, String value) {
+ attributes.put(key, value);
+ return this;
+ }
+
public static TypeDescription createVarchar() {
return new TypeDescription(Category.VARCHAR);
}
@@ -773,6 +786,26 @@ public class TypeDescription
}
/**
+ * Get the list of attribute names defined on this type.
+ * @return a list of sorted attribute names
+ */
+ public List<String> getAttributeNames() {
+ List<String> result = new ArrayList<>(attributes.size());
+ result.addAll(attributes.keySet());
+ Collections.sort(result);
+ return result;
+ }
+
+ /**
+ * Get the value of a given attribute.
+ * @param attributeName the name of the attribute
+ * @return the value of the attribute or null if it isn't set
+ */
+ public String getAttributeValue(String attributeName) {
+ return attributes.get(attributeName);
+ }
+
+ /**
* Get the subtypes of this type.
* @return the list of children types
*/
@@ -816,6 +849,7 @@ public class TypeDescription
private final Category category;
private final List<TypeDescription> children;
private final List<String> fieldNames;
+ private final Map<String,String> attributes = new HashMap<>();
private int maxLength = DEFAULT_LENGTH;
private int precision = DEFAULT_PRECISION;
private int scale = DEFAULT_SCALE;
diff --git a/java/core/src/test/org/apache/orc/TestTypeDescription.java b/java/core/src/test/org/apache/orc/TestTypeDescription.java
index 570c7f4..6a48746 100644
--- a/java/core/src/test/org/apache/orc/TestTypeDescription.java
+++ b/java/core/src/test/org/apache/orc/TestTypeDescription.java
@@ -17,13 +17,18 @@
*/
package org.apache.orc;
+import static org.junit.Assert.assertArrayEquals;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.ExpectedException;
+import java.io.File;
+import java.io.IOException;
import java.util.List;
public class TestTypeDescription {
@@ -325,4 +330,41 @@ public class TestTypeDescription {
results = type.findSubtypes("");
assertEquals(0, results.size());
}
+
+ @Test
+ public void testAttributes() throws IOException {
+ TypeDescription schema = TypeDescription.fromString(
+ "struct<" +
+ "name:struct<first:string,last:string>," +
+ "address:struct<street:string,city:string,country:string,post_code:string>," +
+ "credit_cards:array<struct<card_number:string,expire:date,ccv:string>>>");
+ // set some attributes
+ schema.findSubtype("name").setAttribute("iceberg.id", "12");
+ schema.findSubtype("address.street").setAttribute("mask", "nullify")
+ .setAttribute("encrypt", "pii");
+
+ // write a file with those attributes
+ Path path = new Path(System.getProperty("test.tmp.dir",
+ "target" + File.separator + "test" + File.separator + "tmp"), "attribute.orc");
+ Configuration conf = new Configuration();
+ Writer writer = OrcFile.createWriter(path,
+ OrcFile.writerOptions(conf).setSchema(schema).overwrite(true));
+ writer.close();
+
+ // read the file back again
+ Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf));
+ TypeDescription readerSchema = reader.getSchema();
+
+ // make sure that the read types have the attributes
+ TypeDescription nameCol = readerSchema.findSubtype("name");
+ assertArrayEquals(new Object[]{"iceberg.id"},
+ nameCol.getAttributeNames().toArray());
+ assertEquals("12", nameCol.getAttributeValue("iceberg.id"));
+ TypeDescription street = readerSchema.findSubtype("address.street");
+ assertArrayEquals(new Object[]{"encrypt", "mask"},
+ street.getAttributeNames().toArray());
+ assertEquals("pii", street.getAttributeValue("encrypt"));
+ assertEquals("nullify", street.getAttributeValue("mask"));
+ assertEquals(null, street.getAttributeValue("foobar"));
+ }
}
diff --git a/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java
b/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java
index db51fed..9efeac8 100644
--- a/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java
+++ b/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java
@@ -277,6 +277,14 @@ public class JsonFileDump {
if (type.hasMaximumLength()) {
writer.key("maxLength").value(type.getMaximumLength());
}
+
+ if (type.getAttributesCount() > 0) {
+ writer.key("attributes").object();
+ for(OrcProto.StringPair pair: type.getAttributesList()) {
+ writer.key(pair.getKey()).value(pair.getValue());
+ }
+ writer.endObject();
+ }
writer.endObject();
}
}
diff --git a/proto/orc_proto.proto b/proto/orc_proto.proto
index f0c66f1..a2f4c24 100644
--- a/proto/orc_proto.proto
+++ b/proto/orc_proto.proto
@@ -187,6 +187,11 @@ message StripeFooter {
// postscript: PostScript
// psLen: byte
+message StringPair {
+ optional string key = 1;
+ optional string value = 2;
+}
+
message Type {
enum Kind {
BOOLEAN = 0;
@@ -214,6 +219,7 @@ message Type {
optional uint32 maximumLength = 4;
optional uint32 precision = 5;
optional uint32 scale = 6;
+ repeated StringPair attributes = 7;
}
message StripeInformation {