Author: jdere Date: Thu Mar 26 18:12:50 2015 New Revision: 1669394 URL: http://svn.apache.org/r1669394 Log: HIVE-1575: get_json_object does not support JSON array at the root level (Alex Pivovarov via Jason Dere)
Added: hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/udf/TestUDFJson.java Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFJson.java hive/trunk/ql/src/test/queries/clientpositive/udf_get_json_object.q hive/trunk/ql/src/test/results/clientpositive/udf_get_json_object.q.out Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFJson.java URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFJson.java?rev=1669394&r1=1669393&r2=1669394&view=diff ============================================================================== --- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFJson.java (original) +++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFJson.java Thu Mar 26 18:12:50 2015 @@ -69,6 +69,7 @@ public class UDFJson extends UDF { } private static final ObjectMapper MAPPER = new ObjectMapper(JSON_FACTORY); private static final JavaType MAP_TYPE = TypeFactory.fromClass(Map.class); + private static final JavaType LIST_TYPE = TypeFactory.fromClass(List.class); // An LRU cache using a linked hash map static class HashCache<K, V> extends LinkedHashMap<K, V> { @@ -124,10 +125,24 @@ public class UDFJson extends UDF { public Text evaluate(String jsonString, String pathString) { if (jsonString == null || jsonString.isEmpty() || pathString == null - || pathString.isEmpty()) { + || pathString.isEmpty() || pathString.charAt(0) != '$') { return null; } + int pathExprStart = 1; + boolean isRootArray = false; + + if (pathString.length() > 1) { + if (pathString.charAt(1) == '[') { + pathExprStart = 0; + isRootArray = true; + } else if (pathString.charAt(1) == '.') { + isRootArray = pathString.length() > 2 && pathString.charAt(2) == '['; + } else { + return null; + } + } + // Cache pathExpr String[] pathExpr = pathExprCache.get(pathString); if (pathExpr == null) { @@ -135,24 +150,22 @@ public class UDFJson extends UDF { pathExprCache.put(pathString, pathExpr); } - if (!pathExpr[0].equalsIgnoreCase("$")) { - return null; - } // Cache extractObject Object extractObject = extractObjectCache.get(jsonString); if (extractObject == null) { + JavaType javaType = isRootArray ? LIST_TYPE : MAP_TYPE; try { - extractObject = MAPPER.readValue(jsonString, MAP_TYPE); + extractObject = MAPPER.readValue(jsonString, javaType); } catch (Exception e) { return null; } extractObjectCache.put(jsonString, extractObject); } - for (int i = 1; i < pathExpr.length; i++) { + for (int i = pathExprStart; i < pathExpr.length; i++) { if (extractObject == null) { return null; } - extractObject = extract(extractObject, pathExpr[i]); + extractObject = extract(extractObject, pathExpr[i], i == pathExprStart && isRootArray); } if (extractObject instanceof Map || extractObject instanceof List) { try { @@ -168,36 +181,37 @@ public class UDFJson extends UDF { return result; } - private Object extract(Object json, String path) { - - // Cache patternkey.matcher(path).matches() - Matcher mKey = null; - Boolean mKeyMatches = mKeyMatchesCache.get(path); - if (mKeyMatches == null) { - mKey = patternKey.matcher(path); - mKeyMatches = mKey.matches() ? Boolean.TRUE : Boolean.FALSE; - mKeyMatchesCache.put(path, mKeyMatches); - } - if (!mKeyMatches.booleanValue()) { - return null; - } - - // Cache mkey.group(1) - String mKeyGroup1 = mKeyGroup1Cache.get(path); - if (mKeyGroup1 == null) { - if (mKey == null) { + private Object extract(Object json, String path, boolean skipMapProc) { + // skip MAP processing for the first path element if root is array + if (!skipMapProc) { + // Cache patternkey.matcher(path).matches() + Matcher mKey = null; + Boolean mKeyMatches = mKeyMatchesCache.get(path); + if (mKeyMatches == null) { mKey = patternKey.matcher(path); mKeyMatches = mKey.matches() ? Boolean.TRUE : Boolean.FALSE; mKeyMatchesCache.put(path, mKeyMatches); - if (!mKeyMatches.booleanValue()) { - return null; + } + if (!mKeyMatches.booleanValue()) { + return null; + } + + // Cache mkey.group(1) + String mKeyGroup1 = mKeyGroup1Cache.get(path); + if (mKeyGroup1 == null) { + if (mKey == null) { + mKey = patternKey.matcher(path); + mKeyMatches = mKey.matches() ? Boolean.TRUE : Boolean.FALSE; + mKeyMatchesCache.put(path, mKeyMatches); + if (!mKeyMatches.booleanValue()) { + return null; + } } + mKeyGroup1 = mKey.group(1); + mKeyGroup1Cache.put(path, mKeyGroup1); } - mKeyGroup1 = mKey.group(1); - mKeyGroup1Cache.put(path, mKeyGroup1); + json = extract_json_withkey(json, mKeyGroup1); } - json = extract_json_withkey(json, mKeyGroup1); - // Cache indexList ArrayList<String> indexList = indexListCache.get(path); if (indexList == null) { Added: hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/udf/TestUDFJson.java URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/udf/TestUDFJson.java?rev=1669394&view=auto ============================================================================== --- hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/udf/TestUDFJson.java (added) +++ hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/udf/TestUDFJson.java Thu Mar 26 18:12:50 2015 @@ -0,0 +1,104 @@ +package org.apache.hadoop.hive.ql.udf; + +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.io.Text; +import org.junit.Assert; +import org.junit.Test; + +public class TestUDFJson { + + @Test + public void testJson() throws HiveException { + + String book0 = "{\"author\":\"Nigel Rees\",\"title\":\"Sayings of the Century\"" + + ",\"category\":\"reference\",\"price\":8.95}"; + String backet0 = "[1,2,{\"b\":\"y\",\"a\":\"x\"}]"; + String backet = "[" + backet0 + ",[3,4],[5,6]]"; + String backetFlat = backet0.substring(0, backet0.length() - 1) + ",3,4,5,6]"; + + String book = "[" + book0 + ",{\"author\":\"Herman Melville\",\"title\":\"Moby Dick\"," + + "\"category\":\"fiction\",\"price\":8.99" + + ",\"isbn\":\"0-553-21311-3\"},{\"author\":\"J. R. R. Tolkien\"" + + ",\"title\":\"The Lord of the Rings\",\"category\":\"fiction\"" + + ",\"reader\":[{\"age\":25,\"name\":\"bob\"},{\"age\":26,\"name\":\"jack\"}]" + + ",\"price\":22.99,\"isbn\":\"0-395-19395-8\"}]"; + + String json = "{\"store\":{\"fruit\":[{\"weight\":8,\"type\":\"apple\"}," + + "{\"weight\":9,\"type\":\"pear\"}],\"basket\":" + backet + ",\"book\":" + book + + ",\"bicycle\":{\"price\":19.95,\"color\":\"red\"}}" + + ",\"email\":\"amy@only_for_json_udf_test.net\"" + + ",\"owner\":\"amy\",\"zip code\":\"94025\",\"fb:testid\":\"1234\"}"; + + UDFJson udf = new UDFJson(); + + runTest(json, "$.owner", "amy", udf); + runTest(json, "$.store.bicycle", "{\"price\":19.95,\"color\":\"red\"}", udf); + runTest(json, "$.store.book", book, udf); + runTest(json, "$.store.book[0]", book0, udf); + runTest(json, "$.store.book[*]", book, udf); + runTest(json, "$.store.book[0].category", "reference", udf); + runTest(json, "$.store.book[*].category", "[\"reference\",\"fiction\",\"fiction\"]", udf); + runTest(json, "$.store.book[*].reader[0].age", "25", udf); + runTest(json, "$.store.book[*].reader[*].age", "[25,26]", udf); + runTest(json, "$.store.basket[0][1]", "2", udf); + runTest(json, "$.store.basket[*]", backet, udf); + runTest(json, "$.store.basket[*][0]", "[1,3,5]", udf); + runTest(json, "$.store.basket[0][*]", backet0, udf); + runTest(json, "$.store.basket[*][*]", backetFlat, udf); + runTest(json, "$.store.basket[0][2].b", "y", udf); + runTest(json, "$.store.basket[0][*].b", "[\"y\"]", udf); + runTest(json, "$.non_exist_key", null, udf); + runTest(json, "$.store.book[10]", null, udf); + runTest(json, "$.store.book[0].non_exist_key", null, udf); + runTest(json, "$.store.basket[*].non_exist_key", null, udf); + runTest(json, "$.store.basket[0][*].non_exist_key", null, udf); + runTest(json, "$.store.basket[*][*].non_exist_key", null, udf); + runTest(json, "$.zip code", "94025", udf); + runTest(json, "$.fb:testid", "1234", udf); + runTest("{\"a\":\"b\nc\"}", "$.a", "b\nc", udf); + } + + @Test + public void testRootArray() throws HiveException { + UDFJson udf = new UDFJson(); + + runTest("[1,2,3]", "$[0]", "1", udf); + runTest("[1,2,3]", "$.[0]", "1", udf); + runTest("[1,2,3]", "$.[1]", "2", udf); + runTest("[1,2,3]", "$[1]", "2", udf); + + runTest("[1,2,3]", "$[3]", null, udf); + runTest("[1,2,3]", "$.[*]", "[1,2,3]", udf); + runTest("[1,2,3]", "$[*]", "[1,2,3]", udf); + runTest("[1,2,3]", "$", "[1,2,3]", udf); + runTest("[{\"k1\":\"v1\"},{\"k2\":\"v2\"},{\"k3\":\"v3\"}]", "$[2]", "{\"k3\":\"v3\"}", udf); + runTest("[{\"k1\":\"v1\"},{\"k2\":\"v2\"},{\"k3\":\"v3\"}]", "$[2].k3", "v3", udf); + runTest("[{\"k1\":[{\"k11\":[1,2,3]}]}]", "$[0].k1[0].k11[1]", "2", udf); + runTest("[{\"k1\":[{\"k11\":[1,2,3]}]}]", "$[0].k1[0].k11", "[1,2,3]", udf); + runTest("[{\"k1\":[{\"k11\":[1,2,3]}]}]", "$[0].k1[0]", "{\"k11\":[1,2,3]}", udf); + runTest("[{\"k1\":[{\"k11\":[1,2,3]}]}]", "$[0].k1", "[{\"k11\":[1,2,3]}]", udf); + runTest("[{\"k1\":[{\"k11\":[1,2,3]}]}]", "$[0]", "{\"k1\":[{\"k11\":[1,2,3]}]}", udf); + runTest("[[1,2,3],[4,5,6],[7,8,9]]", "$[1]", "[4,5,6]", udf); + runTest("[[1,2,3],[4,5,6],[7,8,9]]", "$[1][0]", "4", udf); + runTest("[\"a\",\"b\"]", "$[1]", "b", udf); + runTest("[[\"a\",\"b\"]]", "$[0][1]", "b", udf); + + runTest("[1,2,3]", "[0]", null, udf); + runTest("[1,2,3]", "$0", null, udf); + runTest("[1,2,3]", "0", null, udf); + runTest("[1,2,3]", "$.", null, udf); + + runTest("[1,2,3]", "$", "[1,2,3]", udf); + runTest("{\"a\":4}", "$", "{\"a\":4}", udf); + } + + protected void runTest(String json, String path, String exp, UDFJson udf) { + Text res = udf.evaluate(json, path); + if (exp == null) { + Assert.assertNull(res); + } else { + Assert.assertNotNull(res); + Assert.assertEquals("get_json_object test", exp, res.toString()); + } + } +} Modified: hive/trunk/ql/src/test/queries/clientpositive/udf_get_json_object.q URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/udf_get_json_object.q?rev=1669394&r1=1669393&r2=1669394&view=diff ============================================================================== --- hive/trunk/ql/src/test/queries/clientpositive/udf_get_json_object.q (original) +++ hive/trunk/ql/src/test/queries/clientpositive/udf_get_json_object.q Thu Mar 26 18:12:50 2015 @@ -42,3 +42,24 @@ INSERT OVERWRITE TABLE dest2 SELECT '{"a SELECT * FROM dest2; SELECT get_json_object(c1, '$.a') FROM dest2; + +--root is array +SELECT +get_json_object('[1,2,3]', '$[0]'), +get_json_object('[1,2,3]', '$.[0]'), +get_json_object('[1,2,3]', '$.[1]'), +get_json_object('[1,2,3]', '$[1]'), +get_json_object('[1,2,3]', '$[2]'), +get_json_object('[1,2,3]', '$[*]'), +get_json_object('[1,2,3]', '$'), +get_json_object('[{"k1":"v1"},{"k2":"v2"},{"k3":"v3"}]', '$[2]'), +get_json_object('[{"k1":"v1"},{"k2":"v2"},{"k3":"v3"}]', '$[2].k3'), +get_json_object('[[1,2,3],[4,5,6],[7,8,9]]', '$[1]'), +get_json_object('[[1,2,3],[4,5,6],[7,8,9]]', '$[1][0]'), +get_json_object('[{"k1":[{"k11":[1,2,3]}]}]', '$[0].k1[0].k11[1]'); + +--null +SELECT +get_json_object('[1,2,3]', '[2]'), +get_json_object('[1,2,3]', '$0'), +get_json_object('[1,2,3]', '$[3]'); \ No newline at end of file Modified: hive/trunk/ql/src/test/results/clientpositive/udf_get_json_object.q.out URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientpositive/udf_get_json_object.q.out?rev=1669394&r1=1669393&r2=1669394&view=diff ============================================================================== --- hive/trunk/ql/src/test/results/clientpositive/udf_get_json_object.q.out (original) +++ hive/trunk/ql/src/test/results/clientpositive/udf_get_json_object.q.out Thu Mar 26 18:12:50 2015 @@ -191,3 +191,55 @@ POSTHOOK: Input: default@dest2 #### A masked pattern was here #### b c +PREHOOK: query: --root is array +SELECT +get_json_object('[1,2,3]', '$[0]'), +get_json_object('[1,2,3]', '$.[0]'), +get_json_object('[1,2,3]', '$.[1]'), +get_json_object('[1,2,3]', '$[1]'), +get_json_object('[1,2,3]', '$[2]'), +get_json_object('[1,2,3]', '$[*]'), +get_json_object('[1,2,3]', '$'), +get_json_object('[{"k1":"v1"},{"k2":"v2"},{"k3":"v3"}]', '$[2]'), +get_json_object('[{"k1":"v1"},{"k2":"v2"},{"k3":"v3"}]', '$[2].k3'), +get_json_object('[[1,2,3],[4,5,6],[7,8,9]]', '$[1]'), +get_json_object('[[1,2,3],[4,5,6],[7,8,9]]', '$[1][0]'), +get_json_object('[{"k1":[{"k11":[1,2,3]}]}]', '$[0].k1[0].k11[1]') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +#### A masked pattern was here #### +POSTHOOK: query: --root is array +SELECT +get_json_object('[1,2,3]', '$[0]'), +get_json_object('[1,2,3]', '$.[0]'), +get_json_object('[1,2,3]', '$.[1]'), +get_json_object('[1,2,3]', '$[1]'), +get_json_object('[1,2,3]', '$[2]'), +get_json_object('[1,2,3]', '$[*]'), +get_json_object('[1,2,3]', '$'), +get_json_object('[{"k1":"v1"},{"k2":"v2"},{"k3":"v3"}]', '$[2]'), +get_json_object('[{"k1":"v1"},{"k2":"v2"},{"k3":"v3"}]', '$[2].k3'), +get_json_object('[[1,2,3],[4,5,6],[7,8,9]]', '$[1]'), +get_json_object('[[1,2,3],[4,5,6],[7,8,9]]', '$[1][0]'), +get_json_object('[{"k1":[{"k11":[1,2,3]}]}]', '$[0].k1[0].k11[1]') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +#### A masked pattern was here #### +1 1 2 2 3 [1,2,3] [1,2,3] {"k3":"v3"} v3 [4,5,6] 4 2 +PREHOOK: query: --null +SELECT +get_json_object('[1,2,3]', '[2]'), +get_json_object('[1,2,3]', '$0'), +get_json_object('[1,2,3]', '$[3]') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +#### A masked pattern was here #### +POSTHOOK: query: --null +SELECT +get_json_object('[1,2,3]', '[2]'), +get_json_object('[1,2,3]', '$0'), +get_json_object('[1,2,3]', '$[3]') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +#### A masked pattern was here #### +NULL NULL NULL