HIVE-18545: Add UDF to parse complex types from json (Zoltan Haindrich reviewed by Ashutosh Chauhan)
Signed-off-by: Zoltan Haindrich <k...@rxd.hu> Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/1105ef39 Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/1105ef39 Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/1105ef39 Branch: refs/heads/master Commit: 1105ef3974d8a324637d3d35881a739af3aeb382 Parents: 18fb1b3 Author: Zoltan Haindrich <k...@rxd.hu> Authored: Tue Jul 10 16:05:10 2018 +0200 Committer: Zoltan Haindrich <k...@rxd.hu> Committed: Tue Jul 10 16:05:10 2018 +0200 ---------------------------------------------------------------------- .../apache/hive/hcatalog/data/JsonSerDe.java | 612 ++---------------- .../hive/hcatalog/data/TestJsonSerDe.java | 12 +- .../benchmark/udf/json_read/JsonReadBench.java | 83 +++ .../hive/benchmark/udf/json_read/val1.json | 86 +++ .../hive/benchmark/udf/json_read/val1.type | 1 + .../hadoop/hive/ql/exec/FunctionRegistry.java | 1 + .../hive/ql/udf/generic/GenericUDFJsonRead.java | 92 +++ .../ql/udf/generic/TestGenericUDFJsonRead.java | 204 ++++++ .../test/queries/clientpositive/json_serde2.q | 37 ++ .../test/queries/clientpositive/udf_json_read.q | 44 ++ .../results/clientpositive/json_serde2.q.out | 113 ++++ .../results/clientpositive/show_functions.q.out | 1 + .../results/clientpositive/udf_json_read.q.out | 107 +++ .../apache/hadoop/hive/serde2/JsonSerDe.java | 645 ++++++------------- .../hive/serde2/json/HiveJsonStructReader.java | 402 ++++++++++++ .../apache/hive/streaming/StrictJsonWriter.java | 2 +- 16 files changed, 1428 insertions(+), 1014 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/1105ef39/hcatalog/core/src/main/java/org/apache/hive/hcatalog/data/JsonSerDe.java ---------------------------------------------------------------------- diff --git a/hcatalog/core/src/main/java/org/apache/hive/hcatalog/data/JsonSerDe.java 
b/hcatalog/core/src/main/java/org/apache/hive/hcatalog/data/JsonSerDe.java index af80c02..87611ad 100644 --- a/hcatalog/core/src/main/java/org/apache/hive/hcatalog/data/JsonSerDe.java +++ b/hcatalog/core/src/main/java/org/apache/hive/hcatalog/data/JsonSerDe.java @@ -18,125 +18,52 @@ */ package org.apache.hive.hcatalog.data; -import java.io.ByteArrayInputStream; -import java.io.IOException; -import java.nio.charset.CharacterCodingException; import java.util.ArrayList; import java.util.Arrays; -import java.util.Collections; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; +import java.util.Map.Entry; import java.util.Properties; -import java.util.regex.Matcher; -import java.util.regex.Pattern; +import java.util.Set; +import org.apache.commons.lang3.ArrayUtils; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.hive.common.type.Date; -import org.apache.hadoop.hive.common.type.HiveChar; -import org.apache.hadoop.hive.common.type.HiveDecimal; -import org.apache.hadoop.hive.common.type.HiveVarchar; -import org.apache.hadoop.hive.common.type.Timestamp; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.serde.serdeConstants; import org.apache.hadoop.hive.serde2.AbstractSerDe; import org.apache.hadoop.hive.serde2.SerDeException; import org.apache.hadoop.hive.serde2.SerDeSpec; import org.apache.hadoop.hive.serde2.SerDeStats; -import org.apache.hadoop.hive.serde2.SerDeUtils; -import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.StructField; -import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.UnionObjectInspector; -import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.BinaryObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.ByteObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.DateObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.FloatObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveCharObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveDecimalObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveVarcharObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.ShortObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.TimestampObjectInspector; -import org.apache.hadoop.hive.serde2.typeinfo.BaseCharTypeInfo; -import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; -import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; -import org.apache.hive.common.util.HiveStringUtils; -import org.apache.hive.common.util.TimestampParser; import org.apache.hive.hcatalog.common.HCatException; -import org.apache.hive.hcatalog.data.schema.HCatFieldSchema; -import org.apache.hive.hcatalog.data.schema.HCatFieldSchema.Type; 
import org.apache.hive.hcatalog.data.schema.HCatSchema; import org.apache.hive.hcatalog.data.schema.HCatSchemaUtils; -import org.codehaus.jackson.JsonFactory; -import org.codehaus.jackson.JsonParseException; -import org.codehaus.jackson.JsonParser; -import org.codehaus.jackson.JsonToken; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @SerDeSpec(schemaProps = {serdeConstants.LIST_COLUMNS, serdeConstants.LIST_COLUMN_TYPES, serdeConstants.TIMESTAMP_FORMATS}) - public class JsonSerDe extends AbstractSerDe { private static final Logger LOG = LoggerFactory.getLogger(JsonSerDe.class); - private List<String> columnNames; private HCatSchema schema; - private JsonFactory jsonFactory = null; - private HCatRecordObjectInspector cachedObjectInspector; - private TimestampParser tsParser; + private org.apache.hadoop.hive.serde2.JsonSerDe jsonSerde = new org.apache.hadoop.hive.serde2.JsonSerDe(); @Override public void initialize(Configuration conf, Properties tbl) throws SerDeException { - List<TypeInfo> columnTypes; - StructTypeInfo rowTypeInfo; - - LOG.debug("Initializing JsonSerDe: {}", tbl.entrySet()); - - // Get column names and types - String columnNameProperty = tbl.getProperty(serdeConstants.LIST_COLUMNS); - String columnTypeProperty = tbl.getProperty(serdeConstants.LIST_COLUMN_TYPES); - final String columnNameDelimiter = tbl.containsKey(serdeConstants.COLUMN_NAME_DELIMITER) ? 
tbl - .getProperty(serdeConstants.COLUMN_NAME_DELIMITER) : String.valueOf(SerDeUtils.COMMA); - // all table column names - if (columnNameProperty.isEmpty()) { - columnNames = Collections.emptyList(); - } else { - columnNames = Arrays.asList(columnNameProperty.split(columnNameDelimiter)); - } - - // all column types - if (columnTypeProperty.isEmpty()) { - columnTypes = Collections.emptyList(); - } else { - columnTypes = TypeInfoUtils.getTypeInfosFromTypeString(columnTypeProperty); - } - LOG.debug("columns: {}, {}", columnNameProperty, columnNames); - LOG.debug("types: {}, {} ", columnTypeProperty, columnTypes); - - assert (columnNames.size() == columnTypes.size()); - - rowTypeInfo = (StructTypeInfo) TypeInfoFactory.getStructTypeInfo(columnNames, columnTypes); + jsonSerde.initialize(conf, tbl); + jsonSerde.setWriteablesUsage(false); + StructTypeInfo rowTypeInfo = jsonSerde.getTypeInfo(); cachedObjectInspector = HCatRecordObjectInspectorFactory.getHCatRecordObjectInspector(rowTypeInfo); try { schema = HCatSchemaUtils.getHCatSchema(rowTypeInfo).get(0).getStructSubSchema(); @@ -145,10 +72,6 @@ public class JsonSerDe extends AbstractSerDe { } catch (HCatException e) { throw new SerDeException(e); } - - jsonFactory = new JsonFactory(); - tsParser = new TimestampParser( - HiveStringUtils.splitAndUnEscape(tbl.getProperty(serdeConstants.TIMESTAMP_FORMATS))); } /** @@ -160,274 +83,82 @@ public class JsonSerDe extends AbstractSerDe { */ @Override public Object deserialize(Writable blob) throws SerDeException { - - Text t = (Text) blob; - JsonParser p; - List<Object> r = new ArrayList<Object>(Collections.nCopies(columnNames.size(), null)); try { - p = jsonFactory.createJsonParser(new ByteArrayInputStream((t.getBytes()))); - if (p.nextToken() != JsonToken.START_OBJECT) { - throw new IOException("Start token not found where expected"); - } - JsonToken token; - while (((token = p.nextToken()) != JsonToken.END_OBJECT) && (token != null)) { - // iterate through each token, and 
create appropriate object here. - populateRecord(r, token, p, schema); - } - } catch (JsonParseException e) { - LOG.warn("Error [{}] parsing json text [{}].", e, t); - throw new SerDeException(e); - } catch (IOException e) { - LOG.warn("Error [{}] parsing json text [{}].", e, t); + Object row = jsonSerde.deserialize(blob); + List fatRow = fatLand((Object[]) row); + return new DefaultHCatRecord(fatRow); + } catch (Exception e) { throw new SerDeException(e); } - - return new DefaultHCatRecord(r); } - private void populateRecord(List<Object> r, JsonToken token, JsonParser p, HCatSchema s) throws IOException { - if (token != JsonToken.FIELD_NAME) { - throw new IOException("Field name expected"); - } - String fieldName = p.getText(); - Integer fpos = s.getPosition(fieldName); - if (fpos == null) { - fpos = getPositionFromHiveInternalColumnName(fieldName); - LOG.debug("NPE finding position for field [{}] in schema [{}]," - +" attempting to check if it is an internal column name like _col0", fieldName, s); - if (fpos == -1) { - skipValue(p); - return; // unknown field, we return. We'll continue from the next field onwards. - } - // If we get past this, then the column name did match the hive pattern for an internal - // column name, such as _col0, etc, so it *MUST* match the schema for the appropriate column. - // This means people can't use arbitrary column names such as _col0, and expect us to ignore it - // if we find it. 
- if (!fieldName.equalsIgnoreCase(getHiveInternalColumnName(fpos))) { - LOG.error("Hive internal column name {} and position " - + "encoding {} for the column name are at odds", fieldName, fpos); - throw new IOException("Hive internal column name ("+ fieldName - + ") and position encoding (" + fpos - + ") for the column name are at odds"); + @SuppressWarnings({"rawtypes", "unchecked" }) + private static List fatLand(Object[] arr) { + List ret = new ArrayList<>(); + for (Object o : arr) { + if (o != null && o instanceof Map<?, ?>) { + ret.add(fatMap(((Map) o))); + } else if (o != null && o instanceof List<?>) { + ret.add(fatLand(((List) o).toArray())); + } else if (o != null && o.getClass().isArray() && o.getClass().getComponentType() != byte.class) { + Class<?> ct = o.getClass().getComponentType(); + if (ct.isPrimitive()) { + ret.add(primitiveArrayToList(o)); + } else { + ret.add(fatLand((Object[]) o)); + } + } else { + ret.add(o); } - // If we reached here, then we were successful at finding an alternate internal - // column mapping, and we're about to proceed. } - HCatFieldSchema hcatFieldSchema = s.getFields().get(fpos); - Object currField = extractCurrentField(p, hcatFieldSchema, false); - r.set(fpos, currField); + return ret; } - public String getHiveInternalColumnName(int fpos) { - return HiveConf.getColumnInternalName(fpos); - } - - public int getPositionFromHiveInternalColumnName(String internalName) { -// return HiveConf.getPositionFromInternalName(fieldName); - // The above line should have been all the implementation that - // we need, but due to a bug in that impl which recognizes - // only single-digit columns, we need another impl here. 
- Pattern internalPattern = Pattern.compile("_col([0-9]+)"); - Matcher m = internalPattern.matcher(internalName); - if (!m.matches()) { - return -1; - } else { - return Integer.parseInt(m.group(1)); + @SuppressWarnings("rawtypes") + private static Object fatMap(Map<Object, Object> map) { + Map ret = new LinkedHashMap<>(); + Set<Entry<Object, Object>> es = map.entrySet(); + for (Entry<Object, Object> e : es) { + Object oldV = e.getValue(); + Object newV; + if (oldV != null && oldV.getClass().isArray()) { + newV = fatLand((Object[]) oldV); + } else { + newV = oldV; + } + ret.put(e.getKey(), newV); } + return ret; } - /** - * Utility method to extract (and forget) the next value token from the JsonParser, - * as a whole. The reason this function gets called is to yank out the next value altogether, - * because it corresponds to a field name that we do not recognize, and thus, do not have - * a schema/type for. Thus, this field is to be ignored. - * @throws IOException - * @throws JsonParseException - */ - private void skipValue(JsonParser p) throws JsonParseException, IOException { - JsonToken valueToken = p.nextToken(); - - if ((valueToken == JsonToken.START_ARRAY) || (valueToken == JsonToken.START_OBJECT)){ - // if the currently read token is a beginning of an array or object, move stream forward - // skipping any child tokens till we're at the corresponding END_ARRAY or END_OBJECT token - p.skipChildren(); + private static Object primitiveArrayToList(Object arr) { + Class<?> ct = arr.getClass().getComponentType(); + if (int.class.equals(ct)) { + return Arrays.asList(ArrayUtils.toObject((int[]) arr)); } - // At the end of this function, the stream should be pointing to the last token that - // corresponds to the value being skipped. This way, the next call to nextToken - // will advance it to the next field name. 
- } - - /** - * Utility method to extract current expected field from given JsonParser - * - * isTokenCurrent is a boolean variable also passed in, which determines - * if the JsonParser is already at the token we expect to read next, or - * needs advancing to the next before we read. - */ - private Object extractCurrentField(JsonParser p, HCatFieldSchema hcatFieldSchema, - boolean isTokenCurrent) throws IOException { - Object val = null; - JsonToken valueToken; - if (isTokenCurrent) { - valueToken = p.getCurrentToken(); - } else { - valueToken = p.nextToken(); + if (long.class.equals(ct)) { + return Arrays.asList(ArrayUtils.toObject((long[]) arr)); } - switch (hcatFieldSchema.getType()) { - case INT: - val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getIntValue(); - break; - case TINYINT: - val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getByteValue(); - break; - case SMALLINT: - val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getShortValue(); - break; - case BIGINT: - val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getLongValue(); - break; - case BOOLEAN: - String bval = (valueToken == JsonToken.VALUE_NULL) ? null : p.getText(); - if (bval != null) { - val = Boolean.valueOf(bval); - } else { - val = null; - } - break; - case FLOAT: - val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getFloatValue(); - break; - case DOUBLE: - val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getDoubleValue(); - break; - case STRING: - val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getText(); - break; - case BINARY: - String b = (valueToken == JsonToken.VALUE_NULL) ? null : p.getText(); - if (b != null) { - try { - String t = Text.decode(b.getBytes(), 0, b.getBytes().length); - return t.getBytes(); - } catch (CharacterCodingException e) { - LOG.warn("Error generating json binary type from object.", e); - return null; - } - } else { - val = null; - } - break; - case DATE: - val = (valueToken == JsonToken.VALUE_NULL) ? 
null : Date.valueOf(p.getText()); - break; - case TIMESTAMP: - val = (valueToken == JsonToken.VALUE_NULL) ? null : tsParser.parseTimestamp(p.getText()); - break; - case DECIMAL: - val = (valueToken == JsonToken.VALUE_NULL) ? null : HiveDecimal.create(p.getText()); - break; - case VARCHAR: - int vLen = ((BaseCharTypeInfo)hcatFieldSchema.getTypeInfo()).getLength(); - val = (valueToken == JsonToken.VALUE_NULL) ? null : new HiveVarchar(p.getText(), vLen); - break; - case CHAR: - int cLen = ((BaseCharTypeInfo)hcatFieldSchema.getTypeInfo()).getLength(); - val = (valueToken == JsonToken.VALUE_NULL) ? null : new HiveChar(p.getText(), cLen); - break; - case ARRAY: - if (valueToken == JsonToken.VALUE_NULL) { - val = null; - break; - } - if (valueToken != JsonToken.START_ARRAY) { - throw new IOException("Start of Array expected"); - } - List<Object> arr = new ArrayList<Object>(); - while ((valueToken = p.nextToken()) != JsonToken.END_ARRAY) { - arr.add(extractCurrentField(p, hcatFieldSchema.getArrayElementSchema().get(0), true)); - } - val = arr; - break; - case MAP: - if (valueToken == JsonToken.VALUE_NULL) { - val = null; - break; - } - if (valueToken != JsonToken.START_OBJECT) { - throw new IOException("Start of Object expected"); - } - Map<Object, Object> map = new LinkedHashMap<Object, Object>(); - HCatFieldSchema valueSchema = hcatFieldSchema.getMapValueSchema().get(0); - while ((valueToken = p.nextToken()) != JsonToken.END_OBJECT) { - Object k = getObjectOfCorrespondingPrimitiveType(p.getCurrentName(), hcatFieldSchema.getMapKeyTypeInfo()); - Object v = extractCurrentField(p, valueSchema, false); - map.put(k, v); - } - val = map; - break; - case STRUCT: - if (valueToken == JsonToken.VALUE_NULL) { - val = null; - break; - } - if (valueToken != JsonToken.START_OBJECT) { - throw new IOException("Start of Object expected"); - } - HCatSchema subSchema = hcatFieldSchema.getStructSubSchema(); - int sz = subSchema.getFieldNames().size(); - - List<Object> struct = new 
ArrayList<Object>(Collections.nCopies(sz, null)); - while ((valueToken = p.nextToken()) != JsonToken.END_OBJECT) { - populateRecord(struct, valueToken, p, subSchema); - } - val = struct; - break; - default: - LOG.error("Unknown type found: " + hcatFieldSchema.getType()); - return null; + if (char.class.equals(ct)) { + return Arrays.asList(ArrayUtils.toObject((char[]) arr)); + } + if (byte.class.equals(ct)) { + return Arrays.asList(ArrayUtils.toObject((byte[]) arr)); + } + if (short.class.equals(ct)) { + return Arrays.asList(ArrayUtils.toObject((short[]) arr)); } - return val; + if (float.class.equals(ct)) { + return Arrays.asList(ArrayUtils.toObject((float[]) arr)); + } + if (double.class.equals(ct)) { + return Arrays.asList(ArrayUtils.toObject((double[]) arr)); + } + throw new RuntimeException("Unhandled primitiveArrayToList for type: " + ct); } - private Object getObjectOfCorrespondingPrimitiveType(String s, PrimitiveTypeInfo mapKeyType) - throws IOException { - switch (Type.getPrimitiveHType(mapKeyType)) { - case INT: - return Integer.valueOf(s); - case TINYINT: - return Byte.valueOf(s); - case SMALLINT: - return Short.valueOf(s); - case BIGINT: - return Long.valueOf(s); - case BOOLEAN: - return (s.equalsIgnoreCase("true")); - case FLOAT: - return Float.valueOf(s); - case DOUBLE: - return Double.valueOf(s); - case STRING: - return s; - case BINARY: - try { - String t = Text.decode(s.getBytes(), 0, s.getBytes().length); - return t.getBytes(); - } catch (CharacterCodingException e) { - LOG.warn("Error generating json binary type from object.", e); - return null; - } - case DATE: - return Date.valueOf(s); - case TIMESTAMP: - return Timestamp.valueOf(s); - case DECIMAL: - return HiveDecimal.create(s); - case VARCHAR: - return new HiveVarchar(s, ((BaseCharTypeInfo)mapKeyType).getLength()); - case CHAR: - return new HiveChar(s, ((BaseCharTypeInfo)mapKeyType).getLength()); - } - throw new IOException("Could not convert from string to map type " + 
mapKeyType.getTypeName()); + public String getHiveInternalColumnName(int fpos) { + return HiveConf.getColumnInternalName(fpos); } /** @@ -437,217 +168,8 @@ public class JsonSerDe extends AbstractSerDe { @Override public Writable serialize(Object obj, ObjectInspector objInspector) throws SerDeException { - StringBuilder sb = new StringBuilder(); - try { - - StructObjectInspector soi = (StructObjectInspector) objInspector; - List<? extends StructField> structFields = soi.getAllStructFieldRefs(); - assert (columnNames.size() == structFields.size()); - if (obj == null) { - sb.append("null"); - } else { - sb.append(SerDeUtils.LBRACE); - for (int i = 0; i < structFields.size(); i++) { - if (i > 0) { - sb.append(SerDeUtils.COMMA); - } - appendWithQuotes(sb, columnNames.get(i)); - sb.append(SerDeUtils.COLON); - buildJSONString(sb, soi.getStructFieldData(obj, structFields.get(i)), - structFields.get(i).getFieldObjectInspector()); - } - sb.append(SerDeUtils.RBRACE); - } - - } catch (IOException e) { - LOG.warn("Error generating json text from object.", e); - throw new SerDeException(e); - } - return new Text(sb.toString()); - } - private static StringBuilder appendWithQuotes(StringBuilder sb, String value) { - return sb == null ? null : sb.append(SerDeUtils.QUOTE).append(value).append(SerDeUtils.QUOTE); + return jsonSerde.serialize(obj, objInspector); } - // TODO : code section copied over from SerDeUtils because of non-standard json production there - // should use quotes for all field names. We should fix this there, and then remove this copy. - // See http://jackson.codehaus.org/1.7.3/javadoc/org/codehaus/jackson/JsonParser.Feature.html#ALLOW_UNQUOTED_FIELD_NAMES - // for details - trying to enable Jackson to ignore that doesn't seem to work(compilation failure - // when attempting to use that feature, so having to change the production itself. - // Also, throws IOException when Binary is detected. 
- private static void buildJSONString(StringBuilder sb, Object o, ObjectInspector oi) throws IOException { - - switch (oi.getCategory()) { - case PRIMITIVE: { - PrimitiveObjectInspector poi = (PrimitiveObjectInspector) oi; - if (o == null) { - sb.append("null"); - } else { - switch (poi.getPrimitiveCategory()) { - case BOOLEAN: { - boolean b = ((BooleanObjectInspector) poi).get(o); - sb.append(b ? "true" : "false"); - break; - } - case BYTE: { - sb.append(((ByteObjectInspector) poi).get(o)); - break; - } - case SHORT: { - sb.append(((ShortObjectInspector) poi).get(o)); - break; - } - case INT: { - sb.append(((IntObjectInspector) poi).get(o)); - break; - } - case LONG: { - sb.append(((LongObjectInspector) poi).get(o)); - break; - } - case FLOAT: { - sb.append(((FloatObjectInspector) poi).get(o)); - break; - } - case DOUBLE: { - sb.append(((DoubleObjectInspector) poi).get(o)); - break; - } - case STRING: { - String s = - SerDeUtils.escapeString(((StringObjectInspector) poi).getPrimitiveJavaObject(o)); - appendWithQuotes(sb, s); - break; - } - case BINARY: - byte[] b = ((BinaryObjectInspector) oi).getPrimitiveJavaObject(o); - Text txt = new Text(); - txt.set(b, 0, b.length); - appendWithQuotes(sb, SerDeUtils.escapeString(txt.toString())); - break; - case DATE: - Date d = ((DateObjectInspector)poi).getPrimitiveJavaObject(o); - appendWithQuotes(sb, d.toString()); - break; - case TIMESTAMP: { - Timestamp t = ((TimestampObjectInspector) poi).getPrimitiveJavaObject(o); - appendWithQuotes(sb, t.toString()); - break; - } - case DECIMAL: - sb.append(((HiveDecimalObjectInspector)poi).getPrimitiveJavaObject(o)); - break; - case VARCHAR: { - String s = SerDeUtils.escapeString( - ((HiveVarcharObjectInspector) poi).getPrimitiveJavaObject(o).toString()); - appendWithQuotes(sb, s); - break; - } - case CHAR: { - //this should use HiveChar.getPaddedValue() but it's protected; currently (v0.13) - // HiveChar.toString() returns getPaddedValue() - String s = SerDeUtils.escapeString( - 
((HiveCharObjectInspector) poi).getPrimitiveJavaObject(o).toString()); - appendWithQuotes(sb, s); - break; - } - default: - throw new RuntimeException("Unknown primitive type: " + poi.getPrimitiveCategory()); - } - } - break; - } - case LIST: { - ListObjectInspector loi = (ListObjectInspector) oi; - ObjectInspector listElementObjectInspector = loi - .getListElementObjectInspector(); - List<?> olist = loi.getList(o); - if (olist == null) { - sb.append("null"); - } else { - sb.append(SerDeUtils.LBRACKET); - for (int i = 0; i < olist.size(); i++) { - if (i > 0) { - sb.append(SerDeUtils.COMMA); - } - buildJSONString(sb, olist.get(i), listElementObjectInspector); - } - sb.append(SerDeUtils.RBRACKET); - } - break; - } - case MAP: { - MapObjectInspector moi = (MapObjectInspector) oi; - ObjectInspector mapKeyObjectInspector = moi.getMapKeyObjectInspector(); - ObjectInspector mapValueObjectInspector = moi - .getMapValueObjectInspector(); - Map<?, ?> omap = moi.getMap(o); - if (omap == null) { - sb.append("null"); - } else { - sb.append(SerDeUtils.LBRACE); - boolean first = true; - for (Object entry : omap.entrySet()) { - if (first) { - first = false; - } else { - sb.append(SerDeUtils.COMMA); - } - Map.Entry<?, ?> e = (Map.Entry<?, ?>) entry; - StringBuilder keyBuilder = new StringBuilder(); - buildJSONString(keyBuilder, e.getKey(), mapKeyObjectInspector); - String keyString = keyBuilder.toString().trim(); - if((!keyString.isEmpty()) && (keyString.charAt(0) != SerDeUtils.QUOTE)) { - appendWithQuotes(sb, keyString); - } - else { - sb.append(keyString); - } - sb.append(SerDeUtils.COLON); - buildJSONString(sb, e.getValue(), mapValueObjectInspector); - } - sb.append(SerDeUtils.RBRACE); - } - break; - } - case STRUCT: { - StructObjectInspector soi = (StructObjectInspector) oi; - List<? 
extends StructField> structFields = soi.getAllStructFieldRefs(); - if (o == null) { - sb.append("null"); - } else { - sb.append(SerDeUtils.LBRACE); - for (int i = 0; i < structFields.size(); i++) { - if (i > 0) { - sb.append(SerDeUtils.COMMA); - } - appendWithQuotes(sb, structFields.get(i).getFieldName()); - sb.append(SerDeUtils.COLON); - buildJSONString(sb, soi.getStructFieldData(o, structFields.get(i)), - structFields.get(i).getFieldObjectInspector()); - } - sb.append(SerDeUtils.RBRACE); - } - break; - } - case UNION: { - UnionObjectInspector uoi = (UnionObjectInspector) oi; - if (o == null) { - sb.append("null"); - } else { - sb.append(SerDeUtils.LBRACE); - sb.append(uoi.getTag(o)); - sb.append(SerDeUtils.COLON); - buildJSONString(sb, uoi.getField(o), - uoi.getObjectInspectors().get(uoi.getTag(o))); - sb.append(SerDeUtils.RBRACE); - } - break; - } - default: - throw new RuntimeException("Unknown type in ObjectInspector!"); - } - } - /** * Returns an object inspector for the specified schema that http://git-wip-us.apache.org/repos/asf/hive/blob/1105ef39/hcatalog/core/src/test/java/org/apache/hive/hcatalog/data/TestJsonSerDe.java ---------------------------------------------------------------------- diff --git a/hcatalog/core/src/test/java/org/apache/hive/hcatalog/data/TestJsonSerDe.java b/hcatalog/core/src/test/java/org/apache/hive/hcatalog/data/TestJsonSerDe.java index 6770d44..d476b43 100644 --- a/hcatalog/core/src/test/java/org/apache/hive/hcatalog/data/TestJsonSerDe.java +++ b/hcatalog/core/src/test/java/org/apache/hive/hcatalog/data/TestJsonSerDe.java @@ -27,8 +27,6 @@ import java.util.List; import java.util.Map; import java.util.Properties; -import junit.framework.TestCase; - import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.common.type.Date; import org.apache.hadoop.hive.common.type.HiveChar; @@ -43,6 +41,8 @@ import org.apache.hadoop.io.Writable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import 
junit.framework.TestCase; + public class TestJsonSerDe extends TestCase { private static final Logger LOG = LoggerFactory.getLogger(TestJsonSerDe.class); @@ -158,17 +158,17 @@ public class TestJsonSerDe extends TestCase { Writable s = hrsd.serialize(r, hrsd.getObjectInspector()); LOG.info("ONE:{}", s); - Object o1 = hrsd.deserialize(s); + HCatRecord o1 = (HCatRecord) hrsd.deserialize(s); StringBuilder msg = new StringBuilder(); - boolean isEqual = HCatDataCheckUtil.recordsEqual(r, (HCatRecord) o1); + boolean isEqual = HCatDataCheckUtil.recordsEqual(r, o1); assertTrue(msg.toString(), isEqual); Writable s2 = jsde.serialize(o1, hrsd.getObjectInspector()); LOG.info("TWO:{}", s2); - Object o2 = jsde.deserialize(s2); + HCatRecord o2 = (HCatRecord) jsde.deserialize(s2); LOG.info("deserialized TWO : {} ", o2); msg.setLength(0); - isEqual = HCatDataCheckUtil.recordsEqual(r, (HCatRecord) o2, msg); + isEqual = HCatDataCheckUtil.recordsEqual(r, o2, msg); assertTrue(msg.toString(), isEqual); } http://git-wip-us.apache.org/repos/asf/hive/blob/1105ef39/itests/hive-jmh/src/main/java/org/apache/hive/benchmark/udf/json_read/JsonReadBench.java ---------------------------------------------------------------------- diff --git a/itests/hive-jmh/src/main/java/org/apache/hive/benchmark/udf/json_read/JsonReadBench.java b/itests/hive-jmh/src/main/java/org/apache/hive/benchmark/udf/json_read/JsonReadBench.java new file mode 100644 index 0000000..aae247c --- /dev/null +++ b/itests/hive-jmh/src/main/java/org/apache/hive/benchmark/udf/json_read/JsonReadBench.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hive.benchmark.udf.json_read; + +import java.io.IOException; +import java.nio.charset.Charset; + +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFJsonRead; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; +import org.apache.hadoop.io.Text; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.State; + +public class JsonReadBench { + + @State(Scope.Thread) + public static class MyState { + + public final String json; + public final String type; + + public MyState() { + try { + json = getResource("val1.json"); + type = getResource("val1.type").toLowerCase().trim(); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + private String getResource(String fname) throws IOException { + return IOUtils.toString(JsonReadBench.class.getResourceAsStream(fname), Charset.defaultCharset()); + } + } + + public void checkBenchMarkMethod() throws Exception { + benchmarkMethod(new MyState()); + } + + @Benchmark + public void benchmarkMethod(MyState state) throws Exception { + try (GenericUDFJsonRead udf = new GenericUDFJsonRead()) { + ObjectInspector[] arguments = buildArguments(state.type); + 
udf.initialize(arguments); + + udf.evaluate(evalArgs(state.json)); + } + } + + private DeferredObject[] evalArgs(String string) { + return new DeferredObject[] { new GenericUDF.DeferredJavaObject(new Text(string)), null }; + } + + private ObjectInspector[] buildArguments(String typeStr) { + ObjectInspector valueOI = PrimitiveObjectInspectorFactory.writableStringObjectInspector; + ObjectInspector[] arguments = { valueOI, PrimitiveObjectInspectorFactory + .getPrimitiveWritableConstantObjectInspector(TypeInfoFactory.stringTypeInfo, new Text(typeStr)) }; + return arguments; + } + +} http://git-wip-us.apache.org/repos/asf/hive/blob/1105ef39/itests/hive-jmh/src/main/resources/org/apache/hive/benchmark/udf/json_read/val1.json ---------------------------------------------------------------------- diff --git a/itests/hive-jmh/src/main/resources/org/apache/hive/benchmark/udf/json_read/val1.json b/itests/hive-jmh/src/main/resources/org/apache/hive/benchmark/udf/json_read/val1.json new file mode 100644 index 0000000..4466539 --- /dev/null +++ b/itests/hive-jmh/src/main/resources/org/apache/hive/benchmark/udf/json_read/val1.json @@ -0,0 +1,86 @@ +[ +{ + "t0":"2017-08-1414:45:23.522000", + "business_id": "vcNAWiLM4dR7D2nwwJ7nCA", + "hours": { + "Tuesday": { + "close": "17:00", + "open": "08:00" + }, + "Friday": { + "close": "17:00", + "open": "08:00" + } + }, + "open": true, + "categories": [ + "Doctors", + "Health & Medical" + ], + "review_count": 9, + "name": "Eric Goldberg, MD", + "neighborhoods": [], + "attributes": { + "By Appointment Only": true, + "Accepts Credit Cards": true, + "Good For Groups": 1 + }, + "type": "business" +} +, +{ + "business_id": "vcNAWiLM4dR7D2nwwJ7nCA", + "hours": { + "Tuesday": { + "close": "17:00", + "open": "08:00" + }, + "Friday": { + "close": "17:00", + "open": "08:00" + } + }, + "open": true, + "categories": [ + "Doctors", + "Health & Medical" + ], + "review_count": 9, + "name": "Eric Goldberg, MD", + "neighborhoods": [], + "attributes": { + 
"By Appointment Only": true, + "Accepts Credit Cards": true, + "Good For Groups": 1 + }, + "type": "business" +} +, +{ + "business_id": "vcNAWiLM4dR7D2nwwJ7nCA", + "hours": { + "Tuesday": { + "close": "17:00", + "open": "08:00" + }, + "Friday": { + "close": "17:00", + "open": "08:00" + } + }, + "open": true, + "categories": [ + "Doctors", + "Health & Medical" + ], + "review_count": 9, + "name": "Eric Goldberg, MD", + "neighborhoods": [], + "attributes": { + "By Appointment Only": true, + "Accepts Credit Cards": true, + "Good For Groups": 1 + }, + "type": "business" +} +] http://git-wip-us.apache.org/repos/asf/hive/blob/1105ef39/itests/hive-jmh/src/main/resources/org/apache/hive/benchmark/udf/json_read/val1.type ---------------------------------------------------------------------- diff --git a/itests/hive-jmh/src/main/resources/org/apache/hive/benchmark/udf/json_read/val1.type b/itests/hive-jmh/src/main/resources/org/apache/hive/benchmark/udf/json_read/val1.type new file mode 100644 index 0000000..3543223 --- /dev/null +++ b/itests/hive-jmh/src/main/resources/org/apache/hive/benchmark/udf/json_read/val1.type @@ -0,0 +1 @@ +array<struct<t0:timestamp,attributes:struct<accepts credit cards:boolean,by appointment only:boolean,good for groups:int>,business_id:string,categories:array<string>,hours:map<string,struct<close:string,open:string>>,name:string,neighborhoods:array<string>,open:boolean,review_count:int,type:string>> http://git-wip-us.apache.org/repos/asf/hive/blob/1105ef39/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java index 10517ad..0800a10 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java @@ -213,6 +213,7 @@ public final class 
FunctionRegistry { system.registerGenericUDF("ceiling", GenericUDFCeil.class); system.registerUDF("rand", UDFRand.class, false); system.registerGenericUDF("abs", GenericUDFAbs.class); + system.registerGenericUDF("json_read", GenericUDFJsonRead.class); system.registerGenericUDF("sq_count_check", GenericUDFSQCountCheck.class); system.registerGenericUDF("enforce_constraint", GenericUDFEnforceConstraint.class); system.registerGenericUDF("pmod", GenericUDFPosMod.class); http://git-wip-us.apache.org/repos/asf/hive/blob/1105ef39/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFJsonRead.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFJsonRead.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFJsonRead.java new file mode 100644 index 0000000..f5814ed --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFJsonRead.java @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hive.ql.udf.generic; + +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDFArgumentException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.serde2.json.HiveJsonStructReader; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorConverter.TextConverter; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; + +/** + * Parses a json string representation into a Hive struct. + */ +@Description(name = "json_read", value = "_FUNC_(json,type) - " + + "Parses the given json according to the given complex type specification", extended = "" + + "Parsed as null: if the json is null, it is the empty string or if it contains only whitespaces\n" + + "Example:\n" + "select _FUNC_('[]','array<struct<a:string>>')") +public class GenericUDFJsonRead extends GenericUDF { + + private TextConverter inputConverter; + private HiveJsonStructReader jsonReader; + + @Override + public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { + + checkArgsSize(arguments, 2, 2); + checkArgPrimitive(arguments, 0); + checkArgPrimitive(arguments, 1); + if (!ObjectInspectorUtils.isConstantObjectInspector(arguments[1])) { + throw new UDFArgumentTypeException(1, getFuncName() + " argument 2 may only be a constant"); + } + + inputConverter = new TextConverter((PrimitiveObjectInspector) arguments[0]); + String typeStr = getConstantStringValue(arguments, 1); + + try { + TypeInfo t = TypeInfoUtils.getTypeInfoFromTypeString(typeStr); + jsonReader = new HiveJsonStructReader(t); + 
jsonReader.setWritablesUsage(true); + } catch (Exception e) { + throw new UDFArgumentException(getFuncName() + ": Error parsing typestring: " + e.getMessage()); + } + + return jsonReader.getObjectInspector(); + } + + @Override + public Object evaluate(DeferredObject[] arguments) throws HiveException { + Object valObject = arguments[0].get(); + if (valObject == null) { + return null; + } + + try { + String text = inputConverter.convert(valObject).toString(); + if (text.trim().length() == 0) { + return null; + } + return jsonReader.parseStruct(text); + } catch (Exception e) { + throw new HiveException("Error parsing json: " + e.getMessage(), e); + } + } + + @Override + public String getDisplayString(String[] children) { + return getStandardDisplayString("json_read", children); + } + + +} http://git-wip-us.apache.org/repos/asf/hive/blob/1105ef39/ql/src/test/org/apache/hadoop/hive/ql/udf/generic/TestGenericUDFJsonRead.java ---------------------------------------------------------------------- diff --git a/ql/src/test/org/apache/hadoop/hive/ql/udf/generic/TestGenericUDFJsonRead.java b/ql/src/test/org/apache/hadoop/hive/ql/udf/generic/TestGenericUDFJsonRead.java new file mode 100644 index 0000000..3016eaf --- /dev/null +++ b/ql/src/test/org/apache/hadoop/hive/ql/udf/generic/TestGenericUDFJsonRead.java @@ -0,0 +1,204 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.udf.generic; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertTrue; + +import java.util.List; +import java.util.Map; + +import org.apache.hadoop.hive.ql.exec.UDFArgumentException; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredJavaObject; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; +import org.apache.hadoop.io.Text; +import org.junit.Test; + +public class TestGenericUDFJsonRead { + + @Test(expected = UDFArgumentException.class) + public void testArgCnt1() throws Exception { + try (GenericUDFJsonRead udf = new GenericUDFJsonRead()) { + ObjectInspector valueOI = PrimitiveObjectInspectorFactory.writableStringObjectInspector; + ObjectInspector[] arguments = { valueOI }; + udf.initialize(arguments); + } + } + + @Test(expected = UDFArgumentException.class) + public void testArgCnt3() throws Exception { + try (GenericUDFJsonRead udf = new GenericUDFJsonRead()) { + ObjectInspector valueOI = PrimitiveObjectInspectorFactory.writableStringObjectInspector; + // three arguments so the arity check itself (not the non-constant check) rejects the call + ObjectInspector[] arguments = { valueOI, valueOI, valueOI }; + udf.initialize(arguments); + } + } + + @Test(expected = UDFArgumentException.class) + public void
testArgInvalidType() throws Exception { + try (GenericUDFJsonRead udf = new GenericUDFJsonRead()) { + ObjectInspector[] arguments = buildArguments("__invalid__type__"); + udf.initialize(arguments); + } + } + + @Test + public void testList() throws Exception { + try (GenericUDFJsonRead udf = new GenericUDFJsonRead()) { + ObjectInspector[] arguments = buildArguments("array<string>"); + udf.initialize(arguments); + + Object res = udf.evaluate(evalArgs("[\"a\",\"b\",null]")); + assertTrue(res instanceof List<?>); + List<?> l = (List<?>) res; + assertEquals(3, l.size()); + assertEquals(new Text("a"), l.get(0)); + assertEquals(new Text("b"), l.get(1)); + assertEquals(null, l.get(2)); + } + } + + @Test + public void testListNull() throws Exception { + try (GenericUDFJsonRead udf = new GenericUDFJsonRead()) { + ObjectInspector[] arguments = buildArguments("array<string>"); + udf.initialize(arguments); + + Object res = udf.evaluate(evalArgs("null")); + assertNull(res); + } + } + + @Test + public void testSimpleStruct() throws Exception { + try (GenericUDFJsonRead udf = new GenericUDFJsonRead()) { + ObjectInspector[] arguments = buildArguments("struct<a:string>"); + udf.initialize(arguments); + + Object res = udf.evaluate(evalArgs("{\"a\":\"b\"}")); + assertTrue(res instanceof Object[]); + Object o[] = (Object[]) res; + assertEquals(new Text("b"), o[0]); + } + } + + @Test + public void testStructNullField() throws Exception { + try (GenericUDFJsonRead udf = new GenericUDFJsonRead()) { + ObjectInspector[] arguments = buildArguments("struct<a:string>"); + udf.initialize(arguments); + + Object res = udf.evaluate(evalArgs("{\"a\":null}")); + assertTrue(res instanceof Object[]); + Object o[] = (Object[]) res; + assertEquals(null, o[0]); + } + } + + @Test + public void testStructEmptyString() throws Exception { + try (GenericUDFJsonRead udf = new GenericUDFJsonRead()) { + ObjectInspector[] arguments = buildArguments("struct<a:string>"); + udf.initialize(arguments); + + Object res 
= udf.evaluate(evalArgs("")); + assertNull(res); + } + } + + @Test + public void testStructNull() throws Exception { + try (GenericUDFJsonRead udf = new GenericUDFJsonRead()) { + ObjectInspector[] arguments = buildArguments("struct<a:string>"); + udf.initialize(arguments); + + Object res = udf.evaluate(new DeferredObject[] { new DeferredJavaObject(null), null }); + assertNull(res); + } + } + + @Test + public void testStructNullComplexField() throws Exception { + try (GenericUDFJsonRead udf = new GenericUDFJsonRead()) { + ObjectInspector[] arguments = buildArguments("struct<a:struct<x:string>>"); + udf.initialize(arguments); + + Object res = udf.evaluate(evalArgs("{\"a\":null}")); + assertTrue(res instanceof Object[]); + Object o[] = (Object[]) res; + assertEquals(null, o[0]); + } + } + + @Test(expected = HiveException.class) + public void testUndeclaredStructField() throws Exception { + try (GenericUDFJsonRead udf = new GenericUDFJsonRead()) { + ObjectInspector[] arguments = buildArguments("struct<a:int>"); + udf.initialize(arguments); + + Object res = udf.evaluate(evalArgs("{\"b\":null}")); + assertTrue(res instanceof Object[]); + Object o[] = (Object[]) res; + assertEquals(null, o[0]); + } + } + + @Test(expected = HiveException.class) + public void testUnexpectedStruct() throws Exception { + try (GenericUDFJsonRead udf = new GenericUDFJsonRead()) { + ObjectInspector[] arguments = buildArguments("array<int>"); + udf.initialize(arguments); + + Object res = udf.evaluate(evalArgs("[1,22,2,{\"b\":null}]")); + assertTrue(res instanceof Object[]); + Object o[] = (Object[]) res; + assertEquals(null, o[0]); + } + } + + @Test + public void testMap() throws Exception { + try (GenericUDFJsonRead udf = new GenericUDFJsonRead()) { + ObjectInspector[] arguments = buildArguments("map<string,string>"); + udf.initialize(arguments); + + Object res = udf.evaluate(evalArgs("{\"a\":\"v\"}")); + assertTrue(res instanceof Map); + Map o = (Map) res; + assertEquals(1, o.size()); + 
assertEquals(new Text("v"), o.get(new Text("a"))); + } + } + + private DeferredObject[] evalArgs(String string) { + return new DeferredObject[] { new DeferredJavaObject(new Text(string)), null }; + } + + private ObjectInspector[] buildArguments(String typeStr) { + ObjectInspector valueOI = PrimitiveObjectInspectorFactory.writableStringObjectInspector; + ObjectInspector[] arguments = { valueOI, PrimitiveObjectInspectorFactory + .getPrimitiveWritableConstantObjectInspector(TypeInfoFactory.stringTypeInfo, new Text(typeStr)) }; + return arguments; + } + +} http://git-wip-us.apache.org/repos/asf/hive/blob/1105ef39/ql/src/test/queries/clientpositive/json_serde2.q ---------------------------------------------------------------------- diff --git a/ql/src/test/queries/clientpositive/json_serde2.q b/ql/src/test/queries/clientpositive/json_serde2.q new file mode 100644 index 0000000..c6088b7 --- /dev/null +++ b/ql/src/test/queries/clientpositive/json_serde2.q @@ -0,0 +1,37 @@ +--! qt:dataset:src + +add jar ${system:maven.local.repository}/org/apache/hive/hcatalog/hive-hcatalog-core/${system:hive.version}/hive-hcatalog-core-${system:hive.version}.jar; + +drop table if exists json_serde1_1; +drop table if exists json_serde1_2; + +create table json_serde1_1 (a array<string>,b map<string,int>) + row format serde 'org.apache.hadoop.hive.serde2.JsonSerDe'; + +insert into table json_serde1_1 + select array('aaa'),map('aaa',1) from src limit 2; + +select * from json_serde1_1; + +create table json_serde1_2 ( + a array<int>, + b map<int,date>, + c struct<c1:int, c2:string, c3:array<string>, c4:map<string, int>, c5:struct<c5_1:string, c5_2:int>> +) row format serde 'org.apache.hadoop.hive.serde2.JsonSerDe'; + +insert into table json_serde1_2 + select + array(3, 2, 1), + map(1, date '2001-01-01', 2, null), + named_struct( + 'c1', 123456, + 'c2', 'hello', + 'c3', array('aa', 'bb', 'cc'), + 'c4', map('abc', 123, 'xyz', 456), + 'c5', named_struct('c5_1', 'bye', 'c5_2', 88)) + from src limit 
2; + +select * from json_serde1_2; + +drop table json_serde1_1; +drop table json_serde1_2; http://git-wip-us.apache.org/repos/asf/hive/blob/1105ef39/ql/src/test/queries/clientpositive/udf_json_read.q ---------------------------------------------------------------------- diff --git a/ql/src/test/queries/clientpositive/udf_json_read.q b/ql/src/test/queries/clientpositive/udf_json_read.q new file mode 100644 index 0000000..30c297b --- /dev/null +++ b/ql/src/test/queries/clientpositive/udf_json_read.q @@ -0,0 +1,44 @@ +DESCRIBE FUNCTION json_read; +DESCRIBE FUNCTION EXTENDED json_read; + + +select json_read('[{"name":"john","alias":"j","address":{"city":"LA"}},{"name":"kinga","alias":"binga","age":2}]', + 'array<struct<name:string,age:int,alias:string,address:struct<city:string,street:string>>>'); + +create table t (info array<struct<name:string,age:int,alias:string,address:struct<city:string,street:string>>>); + +insert into t + select json_read('[{"name":"john","alias":"j","address":{"city":"LA"}},{"name":"kinga","alias":"binga","age":2}]', + 'array<struct<name:string,age:int,alias:string,address:struct<city:string,street:string>>>'); + + + +select json_read('[ +{ + "business_id": "vcNAWiLM4dR7D2nwwJ7nCA", + "hours": { + "Tuesday": { + "close": "17:00", + "open": "08:00" + }, + "Friday": { + "close": "17:00", + "open": "08:00" + } + }, + "open": true, + "categories": [ + "Doctors", + "Health & Medical" + ], + "review_count": 9, + "name": "Eric Goldberg, MD", + "neighborhoods": [], + "attributes": { + "By Appointment Only": true, + "Accepts Credit Cards": true, + "Good For Groups": 1 + }, + "type": "business" +} +]','array<struct<attributes:struct<accepts credit cards:boolean,by appointment only:boolean,good for groups:int>,business_id:string,categories:array<string>,hours:map<string,struct<close:string,open:string>>,name:string,neighborhoods:array<string>,open:boolean,review_count:int,type:string>>'); 
http://git-wip-us.apache.org/repos/asf/hive/blob/1105ef39/ql/src/test/results/clientpositive/json_serde2.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/json_serde2.q.out b/ql/src/test/results/clientpositive/json_serde2.q.out new file mode 100644 index 0000000..cfdb051 --- /dev/null +++ b/ql/src/test/results/clientpositive/json_serde2.q.out @@ -0,0 +1,113 @@ +PREHOOK: query: drop table if exists json_serde1_1 +PREHOOK: type: DROPTABLE +POSTHOOK: query: drop table if exists json_serde1_1 +POSTHOOK: type: DROPTABLE +PREHOOK: query: drop table if exists json_serde1_2 +PREHOOK: type: DROPTABLE +POSTHOOK: query: drop table if exists json_serde1_2 +POSTHOOK: type: DROPTABLE +PREHOOK: query: create table json_serde1_1 (a array<string>,b map<string,int>) + row format serde 'org.apache.hadoop.hive.serde2.JsonSerDe' +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@json_serde1_1 +POSTHOOK: query: create table json_serde1_1 (a array<string>,b map<string,int>) + row format serde 'org.apache.hadoop.hive.serde2.JsonSerDe' +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@json_serde1_1 +PREHOOK: query: insert into table json_serde1_1 + select array('aaa'),map('aaa',1) from src limit 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@json_serde1_1 +POSTHOOK: query: insert into table json_serde1_1 + select array('aaa'),map('aaa',1) from src limit 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@json_serde1_1 +POSTHOOK: Lineage: json_serde1_1.a EXPRESSION [] +POSTHOOK: Lineage: json_serde1_1.b EXPRESSION [] +PREHOOK: query: select * from json_serde1_1 +PREHOOK: type: QUERY +PREHOOK: Input: default@json_serde1_1 +#### A masked pattern was here #### +POSTHOOK: query: select * from json_serde1_1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@json_serde1_1 +#### A 
masked pattern was here #### +["aaa"] {"aaa":1} +["aaa"] {"aaa":1} +PREHOOK: query: create table json_serde1_2 ( + a array<int>, + b map<int,date>, + c struct<c1:int, c2:string, c3:array<string>, c4:map<string, int>, c5:struct<c5_1:string, c5_2:int>> +) row format serde 'org.apache.hadoop.hive.serde2.JsonSerDe' +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@json_serde1_2 +POSTHOOK: query: create table json_serde1_2 ( + a array<int>, + b map<int,date>, + c struct<c1:int, c2:string, c3:array<string>, c4:map<string, int>, c5:struct<c5_1:string, c5_2:int>> +) row format serde 'org.apache.hadoop.hive.serde2.JsonSerDe' +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@json_serde1_2 +PREHOOK: query: insert into table json_serde1_2 + select + array(3, 2, 1), + map(1, date '2001-01-01', 2, null), + named_struct( + 'c1', 123456, + 'c2', 'hello', + 'c3', array('aa', 'bb', 'cc'), + 'c4', map('abc', 123, 'xyz', 456), + 'c5', named_struct('c5_1', 'bye', 'c5_2', 88)) + from src limit 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@json_serde1_2 +POSTHOOK: query: insert into table json_serde1_2 + select + array(3, 2, 1), + map(1, date '2001-01-01', 2, null), + named_struct( + 'c1', 123456, + 'c2', 'hello', + 'c3', array('aa', 'bb', 'cc'), + 'c4', map('abc', 123, 'xyz', 456), + 'c5', named_struct('c5_1', 'bye', 'c5_2', 88)) + from src limit 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@json_serde1_2 +POSTHOOK: Lineage: json_serde1_2.a EXPRESSION [] +POSTHOOK: Lineage: json_serde1_2.b EXPRESSION [] +POSTHOOK: Lineage: json_serde1_2.c EXPRESSION [] +PREHOOK: query: select * from json_serde1_2 +PREHOOK: type: QUERY +PREHOOK: Input: default@json_serde1_2 +#### A masked pattern was here #### +POSTHOOK: query: select * from json_serde1_2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@json_serde1_2 +#### A masked pattern was here #### 
+[3,2,1] {1:"2001-01-01",2:null} {"c1":123456,"c2":"hello","c3":["aa","bb","cc"],"c4":{"abc":123,"xyz":456},"c5":{"c5_1":"bye","c5_2":88}} +[3,2,1] {1:"2001-01-01",2:null} {"c1":123456,"c2":"hello","c3":["aa","bb","cc"],"c4":{"abc":123,"xyz":456},"c5":{"c5_1":"bye","c5_2":88}} +PREHOOK: query: drop table json_serde1_1 +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@json_serde1_1 +PREHOOK: Output: default@json_serde1_1 +POSTHOOK: query: drop table json_serde1_1 +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@json_serde1_1 +POSTHOOK: Output: default@json_serde1_1 +PREHOOK: query: drop table json_serde1_2 +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@json_serde1_2 +PREHOOK: Output: default@json_serde1_2 +POSTHOOK: query: drop table json_serde1_2 +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@json_serde1_2 +POSTHOOK: Output: default@json_serde1_2 http://git-wip-us.apache.org/repos/asf/hive/blob/1105ef39/ql/src/test/results/clientpositive/show_functions.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/show_functions.q.out b/ql/src/test/results/clientpositive/show_functions.q.out index e91cffd..629781a 100644 --- a/ql/src/test/results/clientpositive/show_functions.q.out +++ b/ql/src/test/results/clientpositive/show_functions.q.out @@ -130,6 +130,7 @@ isnottrue isnull istrue java_method +json_read json_tuple lag last_day http://git-wip-us.apache.org/repos/asf/hive/blob/1105ef39/ql/src/test/results/clientpositive/udf_json_read.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/udf_json_read.q.out b/ql/src/test/results/clientpositive/udf_json_read.q.out new file mode 100644 index 0000000..05b1eb8 --- /dev/null +++ b/ql/src/test/results/clientpositive/udf_json_read.q.out @@ -0,0 +1,107 @@ +PREHOOK: query: DESCRIBE FUNCTION java_read +PREHOOK: type: DESCFUNCTION +POSTHOOK: query: DESCRIBE FUNCTION java_read 
+POSTHOOK: type: DESCFUNCTION +Function 'java_read' does not exist. +PREHOOK: query: DESCRIBE FUNCTION EXTENDED java_read +PREHOOK: type: DESCFUNCTION +POSTHOOK: query: DESCRIBE FUNCTION EXTENDED java_read +POSTHOOK: type: DESCFUNCTION +Function 'java_read' does not exist. +PREHOOK: query: select json_read('[{"name":"john","alias":"j","address":{"city":"LA"}},{"name":"kinga","alias":"binga","age":2}]', + 'array<struct<name:string,age:int,alias:string,address:struct<city:string,street:string>>>') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +#### A masked pattern was here #### +POSTHOOK: query: select json_read('[{"name":"john","alias":"j","address":{"city":"LA"}},{"name":"kinga","alias":"binga","age":2}]', + 'array<struct<name:string,age:int,alias:string,address:struct<city:string,street:string>>>') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +#### A masked pattern was here #### +[{"name":"john","age":null,"alias":"j","address":{"city":"LA","street":null}},{"name":"kinga","age":2,"alias":"binga","address":null}] +PREHOOK: query: create table t (info array<struct<name:string,age:int,alias:string,address:struct<city:string,street:string>>>) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@t +POSTHOOK: query: create table t (info array<struct<name:string,age:int,alias:string,address:struct<city:string,street:string>>>) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@t +PREHOOK: query: insert into t + select json_read('[{"name":"john","alias":"j","address":{"city":"LA"}},{"name":"kinga","alias":"binga","age":2}]', + 'array<struct<name:string,age:int,alias:string,address:struct<city:string,street:string>>>') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@t +POSTHOOK: query: insert into t + select 
json_read('[{"name":"john","alias":"j","address":{"city":"LA"}},{"name":"kinga","alias":"binga","age":2}]', + 'array<struct<name:string,age:int,alias:string,address:struct<city:string,street:string>>>') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@t +POSTHOOK: Lineage: t.info EXPRESSION [] +PREHOOK: query: select json_read('[ +{ + "business_id": "vcNAWiLM4dR7D2nwwJ7nCA", + "hours": { + "Tuesday": { + "close": "17:00", + "open": "08:00" + }, + "Friday": { + "close": "17:00", + "open": "08:00" + } + }, + "open": true, + "categories": [ + "Doctors", + "Health & Medical" + ], + "review_count": 9, + "name": "Eric Goldberg, MD", + "neighborhoods": [], + "attributes": { + "By Appointment Only": true, + "Accepts Credit Cards": true, + "Good For Groups": 1 + }, + "type": "business" +} +]','array<struct<attributes:struct<accepts credit cards:boolean,by appointment only:boolean,good for groups:int>,business_id:string,categories:array<string>,hours:map<string,struct<close:string,open:string>>,name:string,neighborhoods:array<string>,open:boolean,review_count:int,type:string>>') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +#### A masked pattern was here #### +POSTHOOK: query: select json_read('[ +{ + "business_id": "vcNAWiLM4dR7D2nwwJ7nCA", + "hours": { + "Tuesday": { + "close": "17:00", + "open": "08:00" + }, + "Friday": { + "close": "17:00", + "open": "08:00" + } + }, + "open": true, + "categories": [ + "Doctors", + "Health & Medical" + ], + "review_count": 9, + "name": "Eric Goldberg, MD", + "neighborhoods": [], + "attributes": { + "By Appointment Only": true, + "Accepts Credit Cards": true, + "Good For Groups": 1 + }, + "type": "business" +} +]','array<struct<attributes:struct<accepts credit cards:boolean,by appointment only:boolean,good for 
groups:int>,business_id:string,categories:array<string>,hours:map<string,struct<close:string,open:string>>,name:string,neighborhoods:array<string>,open:boolean,review_count:int,type:string>>') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +#### A masked pattern was here #### +[{"attributes":{"accepts credit cards":true,"by appointment only":true,"good for groups":1},"business_id":"vcNAWiLM4dR7D2nwwJ7nCA","categories":["Doctors","Health & Medical"],"hours":{"Tuesday":{"close":"17:00","open":"08:00"},"Friday":{"close":"17:00","open":"08:00"}},"name":"Eric Goldberg, MD","neighborhoods":[],"open":true,"review_count":9,"type":"business"}]