[CARBONDATA-2799][BloomDataMap] Fix bugs in querying with bloom datamap on preagg with dictionary column
For a preaggregate table, if the group-by column is a dictionary column in the parent table, the preaggregate table will inherit the dictionary encoding as well as the dictionary file from the parent table. So for dictionary columns, when querying with a bloom datamap, we need to convert the plain filter value to a dictionary-encoded value based on the parent table's dictionary file. This closes #2580 Project: http://git-wip-us.apache.org/repos/asf/carbondata/repo Commit: http://git-wip-us.apache.org/repos/asf/carbondata/commit/bd6abbbf Tree: http://git-wip-us.apache.org/repos/asf/carbondata/tree/bd6abbbf Diff: http://git-wip-us.apache.org/repos/asf/carbondata/diff/bd6abbbf Branch: refs/heads/external-format Commit: bd6abbbffd36b5ca0aaad9d937d401982d1d60eb Parents: b65bf9b Author: xuchuanyin <xuchuan...@hust.edu.cn> Authored: Mon Jul 30 17:50:51 2018 +0800 Committer: kunal642 <kunalkapoor...@gmail.com> Committed: Thu Aug 2 16:55:59 2018 +0530 ---------------------------------------------------------------------- .../datamap/bloom/BloomCoarseGrainDataMap.java | 21 ++++- .../BloomCoarseGrainDataMapFunctionSuite.scala | 97 ++++++++++++++++++++ 2 files changed, 117 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/carbondata/blob/bd6abbbf/datamap/bloom/src/main/java/org/apache/carbondata/datamap/bloom/BloomCoarseGrainDataMap.java ---------------------------------------------------------------------- diff --git a/datamap/bloom/src/main/java/org/apache/carbondata/datamap/bloom/BloomCoarseGrainDataMap.java b/datamap/bloom/src/main/java/org/apache/carbondata/datamap/bloom/BloomCoarseGrainDataMap.java index be531d6..71b1c55 100644 --- a/datamap/bloom/src/main/java/org/apache/carbondata/datamap/bloom/BloomCoarseGrainDataMap.java +++ b/datamap/bloom/src/main/java/org/apache/carbondata/datamap/bloom/BloomCoarseGrainDataMap.java @@ -47,10 +47,12 @@ import 
org.apache.carbondata.core.devapi.DictionaryGenerationException; import org.apache.carbondata.core.indexstore.Blocklet; import org.apache.carbondata.core.indexstore.PartitionSpec; import org.apache.carbondata.core.metadata.AbsoluteTableIdentifier; +import org.apache.carbondata.core.metadata.CarbonMetadata; import org.apache.carbondata.core.metadata.datatype.DataType; import org.apache.carbondata.core.metadata.datatype.DataTypes; import org.apache.carbondata.core.metadata.encoder.Encoding; import org.apache.carbondata.core.metadata.schema.table.CarbonTable; +import org.apache.carbondata.core.metadata.schema.table.RelationIdentifier; import org.apache.carbondata.core.metadata.schema.table.column.CarbonColumn; import org.apache.carbondata.core.scan.expression.ColumnExpression; import org.apache.carbondata.core.scan.expression.Expression; @@ -108,6 +110,7 @@ public class BloomCoarseGrainDataMap extends CoarseGrainDataMap { for (CarbonColumn col : indexedColumn) { this.name2Col.put(col.getColName(), col); } + String parentTablePath = getAncestorTablePath(carbonTable); try { this.name2Converters = new HashMap<>(indexedColumn.size()); @@ -129,7 +132,7 @@ public class BloomCoarseGrainDataMap extends CoarseGrainDataMap { dataField.setTimestampFormat(tsFormat); FieldConverter fieldConverter = FieldEncoderFactory.getInstance() .createFieldEncoder(dataField, absoluteTableIdentifier, i, nullFormat, null, false, - localCaches[i], false, carbonTable.getTablePath()); + localCaches[i], false, parentTablePath); this.name2Converters.put(indexedColumn.get(i).getColName(), fieldConverter); } } catch (IOException e) { @@ -140,6 +143,22 @@ public class BloomCoarseGrainDataMap extends CoarseGrainDataMap { this.badRecordLogHolder.setLogged(false); } + /** + * recursively find the ancestor's table path. This is used for dictionary scenario + * where preagg will use the dictionary of the parent table. 
+ */ + private String getAncestorTablePath(CarbonTable currentTable) { + if (!currentTable.isChildDataMap()) { + return currentTable.getTablePath(); + } + + RelationIdentifier parentIdentifier = + currentTable.getTableInfo().getParentRelationIdentifiers().get(0); + CarbonTable parentTable = CarbonMetadata.getInstance().getCarbonTable( + parentIdentifier.getDatabaseName(), parentIdentifier.getTableName()); + return getAncestorTablePath(parentTable); + } + @Override public List<Blocklet> prune(FilterResolverIntf filterExp, SegmentProperties segmentProperties, List<PartitionSpec> partitions) throws IOException { http://git-wip-us.apache.org/repos/asf/carbondata/blob/bd6abbbf/integration/spark2/src/test/scala/org/apache/carbondata/datamap/bloom/BloomCoarseGrainDataMapFunctionSuite.scala ---------------------------------------------------------------------- diff --git a/integration/spark2/src/test/scala/org/apache/carbondata/datamap/bloom/BloomCoarseGrainDataMapFunctionSuite.scala b/integration/spark2/src/test/scala/org/apache/carbondata/datamap/bloom/BloomCoarseGrainDataMapFunctionSuite.scala index 496a506..fd1345c 100644 --- a/integration/spark2/src/test/scala/org/apache/carbondata/datamap/bloom/BloomCoarseGrainDataMapFunctionSuite.scala +++ b/integration/spark2/src/test/scala/org/apache/carbondata/datamap/bloom/BloomCoarseGrainDataMapFunctionSuite.scala @@ -832,6 +832,103 @@ class BloomCoarseGrainDataMapFunctionSuite extends QueryTest with BeforeAndAfte CarbonCommonConstants.BLOCKLET_SIZE_DEFAULT_VAL) } + /** + * create bloom and preagg on base table, then create bloom on preagg table, + * index column and group by column is dictionary column. + * note that the test steps are copied from issue. 
+ * In the CI env, sometime it will become timeout, so we ignore the newly added tests + */ + ignore("test bloom datamap: CARBONDATA-2799 bloom datamap on preaggregate") { + sql( + s""" + | CREATE TABLE $normalTable (id int, name string, salary float,dob date) + | STORED BY 'carbondata' + | TBLPROPERTIES('dictionary_include'='id') + """.stripMargin) + sql( + s""" + | CREATE TABLE $bloomDMSampleTable (id int, name string, salary float,dob date) + | STORED BY 'carbondata' + | TBLPROPERTIES('dictionary_include'='id') + """.stripMargin) + (1 to 2).foreach { _ => + sql( + s""" + | INSERT INTO $bloomDMSampleTable VALUES + | ('1', 'name1', '11.1', '2018-07-01'), + | ('2', 'name2', '21.1', '2018-07-02'), + | ('3', 'name3', '31.1', '2018-07-03'), + | ('4', 'name4', '41.1', '2018-07-04') + """.stripMargin) + sql( + s""" + | INSERT INTO $normalTable VALUES + | ('1', 'name1', '11.1', '2018-07-01'), + | ('2', 'name2', '21.1', '2018-07-02'), + | ('3', 'name3', '31.1', '2018-07-03'), + | ('4', 'name4', '41.1', '2018-07-04') + """.stripMargin) + } + sql( + s""" + | CREATE DATAMAP $dataMapName ON TABLE $bloomDMSampleTable + | USING 'bloomfilter' + | DMPROPERTIES('INDEX_COLUMNS'='id', 'BLOOM_SIZE'='320000', 'BLOOM_FPP'='0.01', 'BLOOM_COMPRESS'='TRUE') + """.stripMargin) + sql( + s""" + | INSERT INTO $bloomDMSampleTable VALUES + | ('1', 'name1', '11.1', '2018-07-01'), + | ('2', 'name2', '21.1', '2018-07-02'), + | ('3', 'name3', '31.1', '2018-07-03'), + | ('4', 'name4', '41.1', '2018-07-04') + """.stripMargin) + sql( + s""" + | INSERT INTO $normalTable VALUES + | ('1', 'name1', '11.1', '2018-07-01'), + | ('2', 'name2', '21.1', '2018-07-02'), + | ('3', 'name3', '31.1', '2018-07-03'), + | ('4', 'name4', '41.1', '2018-07-04') + """.stripMargin) + val preAggOnBase = "preagg_on_base" + sql( + s""" + | CREATE DATAMAP $preAggOnBase ON TABLE $bloomDMSampleTable + | USING 'preaggregate' AS + | select id, count(id) from $bloomDMSampleTable group by id + """.stripMargin) + 
checkAnswer(sql(s"SELECT id, count(id) from $bloomDMSampleTable where id = 3 group by id"), + sql(s"SELECT id, count(id) from $normalTable where id = 3 group by id")) + + val bloomOnPreAgg = "bloom_on_pre_agg" + sql( + s""" + | CREATE DATAMAP $bloomOnPreAgg ON TABLE ${bloomDMSampleTable}_${preAggOnBase} + | USING 'bloomfilter' + | DMPROPERTIES('INDEX_COLUMNS'='${bloomDMSampleTable}_id') + """.stripMargin) + checkAnswer(sql(s"SELECT id, count(id) from $bloomDMSampleTable where id = 3 group by id"), + sql(s"SELECT id, count(id) from $normalTable where id = 3 group by id")) + + sql(s"DROP DATAMAP $bloomOnPreAgg on table ${bloomDMSampleTable}_${preAggOnBase}") + checkAnswer(sql(s"SELECT id, count(id) from $bloomDMSampleTable where id = 3 group by id"), + sql(s"SELECT id, count(id) from $normalTable where id = 3 group by id")) + + sql( + s""" + | CREATE DATAMAP $bloomOnPreAgg ON TABLE ${bloomDMSampleTable}_${preAggOnBase} + | USING 'bloomfilter' + | DMPROPERTIES('INDEX_COLUMNS'='${bloomDMSampleTable}_id') + """.stripMargin) + checkAnswer(sql(s"SELECT id, count(id) from $bloomDMSampleTable where id = 3 group by id"), + sql(s"SELECT id, count(id) from $normalTable where id = 3 group by id")) + + sql(s"DROP DATAMAP $bloomOnPreAgg on table ${bloomDMSampleTable}_${preAggOnBase}") + checkAnswer(sql(s"SELECT id, count(id) from $bloomDMSampleTable where id = 3 group by id"), + sql(s"SELECT id, count(id) from $normalTable where id = 3 group by id")) + } + override def afterAll(): Unit = { deleteFile(bigFile) sql(s"DROP TABLE IF EXISTS $normalTable")