DRILL-1461: repeated_count fails on varchar. This patch also includes implementations of the repeated_contains function for all of the basic JSON types.
A small change was made to the unit tests for JSON, as the leftover ALL_TEXT_MODE option was causing failures if the tests were run in a different order. Project: http://git-wip-us.apache.org/repos/asf/incubator-drill/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-drill/commit/a13fc39b Tree: http://git-wip-us.apache.org/repos/asf/incubator-drill/tree/a13fc39b Diff: http://git-wip-us.apache.org/repos/asf/incubator-drill/diff/a13fc39b Branch: refs/heads/master Commit: a13fc39b05addabb4328b9211aa897b8e0387dd0 Parents: 10e4847 Author: Jason Altekruse <[email protected]> Authored: Mon Sep 29 17:11:05 2014 -0700 Committer: Steven Phillips <[email protected]> Committed: Tue Sep 30 13:44:51 2014 -0700 ---------------------------------------------------------------------- .../expr/fn/impl/SimpleRepeatedFunctions.java | 267 +++++++++++++++++++ .../vector/complex/writer/TestJsonReader.java | 22 ++ .../resources/parquet/alltypes_repeated.json | 110 +++++++- .../store/json/json_basic_repeated_varchar.json | 4 + 4 files changed, 390 insertions(+), 13 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-drill/blob/a13fc39b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/SimpleRepeatedFunctions.java ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/SimpleRepeatedFunctions.java b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/SimpleRepeatedFunctions.java index cc4343e..64de402 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/SimpleRepeatedFunctions.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/SimpleRepeatedFunctions.java @@ -21,11 +21,23 @@ import org.apache.drill.exec.expr.DrillSimpleFunc; import org.apache.drill.exec.expr.annotations.FunctionTemplate; import org.apache.drill.exec.expr.annotations.Output; 
import org.apache.drill.exec.expr.annotations.Param; +import org.apache.drill.exec.expr.annotations.Workspace; import org.apache.drill.exec.expr.holders.BigIntHolder; import org.apache.drill.exec.expr.holders.BitHolder; +import org.apache.drill.exec.expr.holders.Float4Holder; +import org.apache.drill.exec.expr.holders.Float8Holder; import org.apache.drill.exec.expr.holders.IntHolder; import org.apache.drill.exec.expr.holders.RepeatedBigIntHolder; +import org.apache.drill.exec.expr.holders.RepeatedBitHolder; +import org.apache.drill.exec.expr.holders.RepeatedFloat4Holder; +import org.apache.drill.exec.expr.holders.RepeatedFloat8Holder; import org.apache.drill.exec.expr.holders.RepeatedIntHolder; +import org.apache.drill.exec.expr.holders.RepeatedListHolder; +import org.apache.drill.exec.expr.holders.RepeatedMapHolder; +import org.apache.drill.exec.expr.holders.RepeatedTinyIntHolder; +import org.apache.drill.exec.expr.holders.RepeatedVarCharHolder; +import org.apache.drill.exec.expr.holders.TinyIntHolder; +import org.apache.drill.exec.expr.holders.VarCharHolder; import org.apache.drill.exec.record.RecordBatch; public class SimpleRepeatedFunctions { @@ -33,6 +45,10 @@ public class SimpleRepeatedFunctions { private SimpleRepeatedFunctions() { } + // TODO - replace with a freemarker template and fill out the rest of the types + // focused on getting functions defined for JSON types as this is the primary format + // users are extracting repeated data out of currently + @FunctionTemplate(name = "repeated_count", scope = FunctionTemplate.FunctionScope.SIMPLE) public static class RepeatedLengthBigInt implements DrillSimpleFunc { @@ -50,6 +66,104 @@ public class SimpleRepeatedFunctions { } @FunctionTemplate(name = "repeated_count", scope = FunctionTemplate.FunctionScope.SIMPLE) + public static class RepeatedLengthTinyInt implements DrillSimpleFunc { + + @Param + RepeatedTinyIntHolder input; + @Output + IntHolder out; + + public void setup(RecordBatch b) { + } + + public 
void eval() { + out.value = input.end - input.start; + } + } + + // TODO - need to confirm that these work + @FunctionTemplate(name = "repeated_count", scope = FunctionTemplate.FunctionScope.SIMPLE) + public static class RepeatedLengthMap implements DrillSimpleFunc { + + @Param + RepeatedMapHolder input; + @Output + IntHolder out; + + public void setup(RecordBatch b) { + } + + public void eval() { + out.value = input.end - input.start; + } + } + + // TODO - need to confirm that these work + @FunctionTemplate(name = "repeated_count", scope = FunctionTemplate.FunctionScope.SIMPLE) + public static class RepeatedLengthList implements DrillSimpleFunc { + + @Param + RepeatedListHolder input; + @Output + IntHolder out; + + public void setup(RecordBatch b) { + } + + public void eval() { + out.value = input.end - input.start; + } + } + + @FunctionTemplate(name = "repeated_count", scope = FunctionTemplate.FunctionScope.SIMPLE) + public static class RepeatedLengthBit implements DrillSimpleFunc { + + @Param + RepeatedBitHolder input; + @Output + IntHolder out; + + public void setup(RecordBatch b) { + } + + public void eval() { + out.value = input.end - input.start; + } + } + + @FunctionTemplate(name = "repeated_count", scope = FunctionTemplate.FunctionScope.SIMPLE) + public static class RepeatedLengthFloat4 implements DrillSimpleFunc { + + @Param + RepeatedFloat4Holder input; + @Output + IntHolder out; + + public void setup(RecordBatch b) { + } + + public void eval() { + out.value = input.end - input.start; + } + } + + @FunctionTemplate(name = "repeated_count", scope = FunctionTemplate.FunctionScope.SIMPLE) + public static class RepeatedLengthFloat8 implements DrillSimpleFunc { + + @Param + RepeatedFloat8Holder input; + @Output + IntHolder out; + + public void setup(RecordBatch b) { + } + + public void eval() { + out.value = input.end - input.start; + } + } + + @FunctionTemplate(name = "repeated_count", scope = FunctionTemplate.FunctionScope.SIMPLE) public static class 
RepeatedLengthInt implements DrillSimpleFunc { @Param RepeatedIntHolder input; @@ -63,6 +177,20 @@ public class SimpleRepeatedFunctions { } } + @FunctionTemplate(name = "repeated_count", scope = FunctionTemplate.FunctionScope.SIMPLE) + public static class RepeatedLengthVarChar implements DrillSimpleFunc { + + @Param RepeatedVarCharHolder input; + @Output IntHolder out; + + public void setup(RecordBatch b) { + } + + public void eval() { + out.value = input.end - input.start; + } + } + @FunctionTemplate(name = "repeated_contains", scope = FunctionTemplate.FunctionScope.SIMPLE) public static class ContainsBigInt implements DrillSimpleFunc { @@ -74,6 +202,7 @@ public class SimpleRepeatedFunctions { } public void eval() { + out.value = 0; for (int i = listToSearch.start; i < listToSearch.end; i++) { if (listToSearch.vector.getAccessor().get(i) == targetValue.value) { out.value = 1; @@ -83,4 +212,142 @@ public class SimpleRepeatedFunctions { } } + + @FunctionTemplate(name = "repeated_contains", scope = FunctionTemplate.FunctionScope.SIMPLE) + public static class ContainsInt implements DrillSimpleFunc { + + @Param RepeatedIntHolder listToSearch; + @Param IntHolder targetValue; + @Output BitHolder out; + + public void setup(RecordBatch b) { + } + + public void eval() { + out.value = 0; + for (int i = listToSearch.start; i < listToSearch.end; i++) { + if (listToSearch.vector.getAccessor().get(i) == targetValue.value) { + out.value = 1; + break; + } + } + } + + } + + @FunctionTemplate(name = "repeated_contains", scope = FunctionTemplate.FunctionScope.SIMPLE) + public static class ContainsTinyInt implements DrillSimpleFunc { + + @Param RepeatedTinyIntHolder listToSearch; + @Param TinyIntHolder targetValue; + @Output BitHolder out; + + public void setup(RecordBatch b) { + } + + public void eval() { + for (int i = listToSearch.start; i < listToSearch.end; i++) { + if (listToSearch.vector.getAccessor().get(i) == targetValue.value) { + out.value = 1; + break; + } + } + } + + } + 
+ @FunctionTemplate(name = "repeated_contains", scope = FunctionTemplate.FunctionScope.SIMPLE) + public static class ContainsBit implements DrillSimpleFunc { + + @Param RepeatedBitHolder listToSearch; + @Param BitHolder targetValue; + @Output BitHolder out; + + public void setup(RecordBatch b) { + } + + public void eval() { + out.value = 0; + for (int i = listToSearch.start; i < listToSearch.end; i++) { + if (listToSearch.vector.getAccessor().get(i) == targetValue.value) { + out.value = 1; + break; + } + } + } + + } + + @FunctionTemplate(name = "repeated_contains", scope = FunctionTemplate.FunctionScope.SIMPLE) + public static class ContainsFloat4 implements DrillSimpleFunc { + + @Param RepeatedFloat4Holder listToSearch; + @Param Float4Holder targetValue; + @Output BitHolder out; + + public void setup(RecordBatch b) { + } + + public void eval() { + out.value = 0; + for (int i = listToSearch.start; i < listToSearch.end; i++) { + if (listToSearch.vector.getAccessor().get(i) == targetValue.value) { + out.value = 1; + break; + } + } + } + + } + + @FunctionTemplate(name = "repeated_contains", scope = FunctionTemplate.FunctionScope.SIMPLE) + public static class ContainsFloat8 implements DrillSimpleFunc { + + @Param RepeatedFloat8Holder listToSearch; + @Param Float8Holder targetValue; + @Output BitHolder out; + + public void setup(RecordBatch b) { + } + + public void eval() { + out.value = 0; + for (int i = listToSearch.start; i < listToSearch.end; i++) { + if (listToSearch.vector.getAccessor().get(i) == targetValue.value) { + out.value = 1; + break; + } + } + } + + } + + @FunctionTemplate(name = "repeated_contains", scope = FunctionTemplate.FunctionScope.SIMPLE) + public static class ContainsVarChar implements DrillSimpleFunc { + + @Param RepeatedVarCharHolder listToSearch; + @Param VarCharHolder targetValue; + @Workspace VarCharHolder currVal; + + @Output BitHolder out; + + public void setup(RecordBatch b) { + currVal = new VarCharHolder(); + } + + public void eval() { 
+ for (int i = listToSearch.start; i < listToSearch.end; i++) { + out.value = 0; + listToSearch.vector.getAccessor().get(i, currVal); + if (org.apache.drill.exec.expr.fn.impl.ByteFunctionHelpers.compare( + currVal.buffer.memoryAddress(), currVal.start, currVal.end, targetValue.buffer.memoryAddress(), targetValue.start, targetValue.end) == 0 ) { + out.value = 1; + break; + } + } + } + + } + + } \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-drill/blob/a13fc39b/exec/java-exec/src/test/java/org/apache/drill/exec/vector/complex/writer/TestJsonReader.java ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/test/java/org/apache/drill/exec/vector/complex/writer/TestJsonReader.java b/exec/java-exec/src/test/java/org/apache/drill/exec/vector/complex/writer/TestJsonReader.java index fd38bd3..f450e5d 100644 --- a/exec/java-exec/src/test/java/org/apache/drill/exec/vector/complex/writer/TestJsonReader.java +++ b/exec/java-exec/src/test/java/org/apache/drill/exec/vector/complex/writer/TestJsonReader.java @@ -105,6 +105,24 @@ public class TestJsonReader extends BaseTestQuery { runTestsOnFile(filename, UserBitShared.QueryType.SQL, queries, rowCounts); } + @Test + public void testRepeatedCount() throws Exception { + test("select repeated_count(str_list) from cp.`/store/json/json_basic_repeated_varchar.json`"); + test("select repeated_count(INT_col) from cp.`/parquet/alltypes_repeated.json`"); + test("select repeated_count(FLOAT4_col) from cp.`/parquet/alltypes_repeated.json`"); + test("select repeated_count(VARCHAR_col) from cp.`/parquet/alltypes_repeated.json`"); + test("select repeated_count(BIT_col) from cp.`/parquet/alltypes_repeated.json`"); + } + + @Test + public void testRepeatedContains() throws Exception { + test("select repeated_contains(str_list, 'asdf') from cp.`/store/json/json_basic_repeated_varchar.json`"); + test("select repeated_contains(INT_col, -2147483648) from 
cp.`/parquet/alltypes_repeated.json`"); + test("select repeated_contains(FLOAT4_col, -1000000000000.0) from cp.`/parquet/alltypes_repeated.json`"); + test("select repeated_contains(VARCHAR_col, 'qwerty' ) from cp.`/parquet/alltypes_repeated.json`"); + test("select repeated_contains(BIT_col, true) from cp.`/parquet/alltypes_repeated.json`"); + test("select repeated_contains(BIT_col, false) from cp.`/parquet/alltypes_repeated.json`"); + } @Test public void testSingleColumnRead_vector_fill_bug() throws Exception { @@ -122,12 +140,14 @@ public class TestJsonReader extends BaseTestQuery { runTestsOnFile(filename, UserBitShared.QueryType.SQL, queries, rowCounts); } + @Test public void testAllTextMode() throws Exception { test("alter system set `store.json.all_text_mode` = true"); String[] queries = {"select * from cp.`/store/json/schema_change_int_to_string.json`"}; long[] rowCounts = {3}; String filename = "/store/json/schema_change_int_to_string.json"; runTestsOnFile(filename, UserBitShared.QueryType.SQL, queries, rowCounts); + test("alter system set `store.json.all_text_mode` = false"); } @Test @@ -153,6 +173,7 @@ public class TestJsonReader extends BaseTestQuery { long[] rowCounts = {3}; String filename = "/store/json/null_where_list_expected.json"; runTestsOnFile(filename, UserBitShared.QueryType.SQL, queries, rowCounts); + test("alter system set `store.json.all_text_mode` = false"); } @Test @@ -162,6 +183,7 @@ public class TestJsonReader extends BaseTestQuery { long[] rowCounts = {3}; String filename = "/store/json/null_where_map_expected.json"; runTestsOnFile(filename, UserBitShared.QueryType.SQL, queries, rowCounts); + test("alter system set `store.json.all_text_mode` = false"); } // The project pushdown rule is correctly adding the projected columns to the scan, however it is not removing http://git-wip-us.apache.org/repos/asf/incubator-drill/blob/a13fc39b/exec/java-exec/src/test/resources/parquet/alltypes_repeated.json 
---------------------------------------------------------------------- diff --git a/exec/java-exec/src/test/resources/parquet/alltypes_repeated.json b/exec/java-exec/src/test/resources/parquet/alltypes_repeated.json index 927cb52..141c185 100644 --- a/exec/java-exec/src/test/resources/parquet/alltypes_repeated.json +++ b/exec/java-exec/src/test/resources/parquet/alltypes_repeated.json @@ -2,11 +2,11 @@ "TINYINT_col" : [ 1, 2, 3, 4, 5, -1, -2, -3, -4, -5, 10000, -10000 ], "UINT1_col" : [ 1, 2, 3, 4, 5, 10000 ], "UINT2_col" : [ 1, 2, 3, 4, 5, 10000 ], - SMALLINT_col" : [ 1, 2, 3, 4, 5, -1, -2, -3, -4, -5, 100000, -100000 ], + "SMALLINT_col" : [ 1, 2, 3, 4, 5, -1, -2, -3, -4, -5, 100000, -100000 ], "INT_col" : [ 1, 2, 3, 4, 5, -1, -2, -3, -4, -5, 2147483647, -2147483648 ], "UINT4_col" : [ 1, 2, 3, 4, 5, 2147483700 ], "FLOAT4_col" : [ 1.0, 2.0, 3.0, 4.0, 5.0, 1000000000000.0, -1000000000000.0 ], - "TIME_col" : [ "2:30, "11:45", "12:00", 11:59", 23:59" ], + "TIME_col" : [ "2:30", "11:45", "12:00", "11:59", "23:59" ], "DECIMAL9_col" : [ "1.0", "2.0", "3.0", "4.0", "5.0", "100.100", "0.0000001" ], "BIGINT_col" : [ 1, 2, 3, 4, 5, 9223372036854775000, -9223372036854775000], "UINT8_col" : [ "1", "2", "3", "4", "5", "9223372036854778000" ], @@ -14,15 +14,99 @@ "DATE_col": [ "1995-01-01", "1995-01-02", "1995-01-03", "1995-01-04", "1995-01-05" ], "TIMESTAMP_col" : [ "1995-01-01 01:00:00.000","1995-01-01 01:00:00.000", "1995-01-01 01:00:00.000", "1995-01-01 01:00:00.000" ], "DECIMAL18_col" : ["123456789.000000000", "11.123456789", "0.100000000", "-0.100400000", "-987654321.123456789", "-2.030100000"], - "INTERVALYEAR" : - "INTERVALDAY" : - "INTERVAL" : - "DECIMAL28DENSE_col", - "DECIMAL38DENSE_col", - "DECIMAL38SPARSE_col", - "DECIMAL28SPARSE_col", - "VARBINARY_col", - "VARCHAR_col", - "VAR16CHAR_col", - "BIT_col", + "INTERVALYEAR" : false, + "INTERVALDAY" : false, + "INTERVAL" : false, + "DECIMAL28DENSE_col" : false, + "DECIMAL38DENSE_col" : false, + "DECIMAL38SPARSE_col" : 
false, + "DECIMAL28SPARSE_col" : false, + "VARBINARY_col" : false, + "VARCHAR_col" : [ "a string", "asdf", "", "qwerty"], + "VAR16CHAR_col" : false, + "BIT_col" : [ false, true, false, false, true, false, false] } +{ + "TINYINT_col" : [ 1, 2, 3, 4, 5, -1, -2 ], + "UINT1_col" : [ 1, 2, 3 ], + "UINT2_col" : [ 1, 2 ], + "SMALLINT_col" : [ 1, 2], + "INT_col" : [ 1, 2, 3, 4 ], + "UINT4_col" : [ 1], + "FLOAT4_col" : [ 1.0, 2.0, 3.0, 4.0], + "TIME_col" : [ "2:30", "11:45", "12:00"], + "DECIMAL9_col" : [ "1.0", "2.0", "3.0", "4.0", "5.0" ], + "BIGINT_col" : [ 1, 2, 3, 4, 5, 9223372036854775000], + "UINT8_col" : [ "1", "2", "3"], + "FLOAT8_col" : [ 1.0, 2.0, 3.0], + "DATE_col": [ "1995-01-01", "1995-01-02", "1995-01-03"], + "TIMESTAMP_col" : [ "1995-01-01 01:00:00.000","1995-01-01 01:00:00.000"], + "DECIMAL18_col" : ["123456789.000000000", "11.123456789", "0.100000000", "-0.100400000"], + "INTERVALYEAR" : false, + "INTERVALDAY" : false, + "INTERVAL" : false, + "DECIMAL28DENSE_col" : false, + "DECIMAL38DENSE_col" : false, + "DECIMAL38SPARSE_col" : false, + "DECIMAL28SPARSE_col" : false, + "VARBINARY_col" : false, + "VARCHAR_col" : [ "a string", "asdf", ""], + "VAR16CHAR_col" : false, + "BIT_col" : [ false, true, false, false, true, false, false] +} +{ + "TINYINT_col" : [ 1, 2, 3, 4, 5, -1, -2 ], + "UINT1_col" : [ 1, 2, 3 ], + "UINT2_col" : [ 1, 2 ], + "SMALLINT_col" : [ 1, 2], + "INT_col" : [ 1, 2, 3, 4 ], + "UINT4_col" : [ 1], + "FLOAT4_col" : [ 1.0, 2.0, 3.0, 4.0], + "TIME_col" : [ "2:30", "11:45", "12:00"], + "DECIMAL9_col" : [ "1.0", "2.0", "3.0", "4.0", "5.0" ], + "BIGINT_col" : [ 1, 2, 3, 4, 5, 9223372036854775000], + "UINT8_col" : [ "1", "2", "3"], + "FLOAT8_col" : [ 1.0, 2.0, 3.0], + "DATE_col": [ "1995-01-01", "1995-01-02", "1995-01-03"], + "TIMESTAMP_col" : [ "1995-01-01 01:00:00.000","1995-01-01 01:00:00.000"], + "DECIMAL18_col" : ["123456789.000000000", "11.123456789", "0.100000000", "-0.100400000"], + "INTERVALYEAR" : false, + "INTERVALDAY" : false, + "INTERVAL" 
: false, + "DECIMAL28DENSE_col" : false, + "DECIMAL38DENSE_col" : false, + "DECIMAL38SPARSE_col" : false, + "DECIMAL28SPARSE_col" : false, + "VARBINARY_col" : false, + "VARCHAR_col" : [ "a string", "asdf", ""], + "VAR16CHAR_col" : false, + "BIT_col" : [ false, false, false, false, false] +} +{ + "TINYINT_col" : [ 1, 2, 3, 4, 5, -1, -2 ], + "UINT1_col" : [ 1, 2, 3 ], + "UINT2_col" : [ 1, 2 ], + "SMALLINT_col" : [ 1, 2], + "INT_col" : [ 1, 2, 3, 4 ], + "UINT4_col" : [ 1], + "FLOAT4_col" : [ 1.0, 2.0, 3.0, 4.0], + "TIME_col" : [ "2:30", "11:45", "12:00"], + "DECIMAL9_col" : [ "1.0", "2.0", "3.0", "4.0", "5.0" ], + "BIGINT_col" : [ 1, 2, 3, 4, 5, 9223372036854775000], + "UINT8_col" : [ "1", "2", "3"], + "FLOAT8_col" : [ 1.0, 2.0, 3.0], + "DATE_col": [ "1995-01-01", "1995-01-02", "1995-01-03"], + "TIMESTAMP_col" : [ "1995-01-01 01:00:00.000","1995-01-01 01:00:00.000"], + "DECIMAL18_col" : ["123456789.000000000", "11.123456789", "0.100000000", "-0.100400000"], + "INTERVALYEAR" : false, + "INTERVALDAY" : false, + "INTERVAL" : false, + "DECIMAL28DENSE_col" : false, + "DECIMAL38DENSE_col" : false, + "DECIMAL38SPARSE_col" : false, + "DECIMAL28SPARSE_col" : false, + "VARBINARY_col" : false, + "VARCHAR_col" : [ "a string", "asdf", ""], + "VAR16CHAR_col" : false, + "BIT_col" : [ true, true, true] +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-drill/blob/a13fc39b/exec/java-exec/src/test/resources/store/json/json_basic_repeated_varchar.json ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/test/resources/store/json/json_basic_repeated_varchar.json b/exec/java-exec/src/test/resources/store/json/json_basic_repeated_varchar.json new file mode 100644 index 0000000..2b18a5e --- /dev/null +++ b/exec/java-exec/src/test/resources/store/json/json_basic_repeated_varchar.json @@ -0,0 +1,4 @@ +{ "str_list" : ["asdf", "Doctors","Health & Medical","asdf", "1234asdf"] } +{ "str_list" : ["Restaurants"] } +{ 
"str_list" : ["asdf", "American (Traditional)","Restaurants"] } +{ "str_list" : ["Restaurants"] }
