This is an automated email from the ASF dual-hosted git repository. xxyu pushed a commit to branch kylin-on-parquet-v2 in repository https://gitbox.apache.org/repos/asf/kylin.git
The following commit(s) were added to refs/heads/kylin-on-parquet-v2 by this push: new 49dcdd2 KYLIN-4845 Fix NFilePruningTest report dup key error 49dcdd2 is described below commit 49dcdd270a0a014123e4ba1b586c49e6dd508540 Author: yaqian.zhang <598593...@qq.com> AuthorDate: Fri Dec 18 15:31:49 2020 +0800 KYLIN-4845 Fix NFilePruningTest report dup key error --- .../file_prunning/cube_desc/file_pruning_cube.json | 61 +++++++++------------- .../cube_desc/file_pruning_cube2.json | 2 +- .../model_desc/file_pruning_model.json | 22 ++++---- .../model_desc/file_pruning_model2.json | 40 -------------- .../file_prunning/project/default.json | 2 +- .../engine/spark/builder/CubeSnapshotBuilder.scala | 2 +- .../spark2/file_pruning/NFilePruningTest.java | 49 +++++++++-------- 7 files changed, 62 insertions(+), 116 deletions(-) diff --git a/examples/test_case_data/file_prunning/cube_desc/file_pruning_cube.json b/examples/test_case_data/file_prunning/cube_desc/file_pruning_cube.json index 01581d9..0ec4904 100644 --- a/examples/test_case_data/file_prunning/cube_desc/file_pruning_cube.json +++ b/examples/test_case_data/file_prunning/cube_desc/file_pruning_cube.json @@ -1,6 +1,6 @@ { "uuid" : "330b1839-1baf-5e4f-7f4c-ad173a5217c8", - "last_modified" : 1589194835241, + "last_modified" : 1589268257682, "version" : "3.9.9.1", "name" : "file_pruning_cube", "is_draft" : false, @@ -9,29 +9,14 @@ "null_string" : null, "dimensions" : [ { "name" : "ORDER_ID", - "table" : "TEST_ORDER", + "table" : "TEST_KYLIN_FACT", "column" : "ORDER_ID", "derived" : null }, { - "name" : "BUYER_ID", - "table" : "TEST_ORDER", - "column" : "BUYER_ID", - "derived" : null - }, { - "name" : "TEST_DATE_ENC", - "table" : "TEST_ORDER", - "column" : "TEST_DATE_ENC", - "derived" : null - }, { - "name" : "TEST_TIME_ENC", - "table" : "TEST_ORDER", - "column" : "TEST_TIME_ENC", - "derived" : null - }, { - "name" : "ORDER_ID", + "name" : "CAL_DT", "table" : "TEST_KYLIN_FACT", - "column" : null, - "derived" : [ "ORDER_ID" ] + "column" : "CAL_DT", + "derived" : null }, { "name" : "LSTG_FORMAT_NAME", "table" : "TEST_KYLIN_FACT", @@ -52,6 +37,16 @@ "table" : "TEST_KYLIN_FACT", "column" : "PRICE", "derived" : null + }, { + "name" : "ORDER_ID", + "table" : "TEST_ORDER", + "column" : null, + "derived" : [ "ORDER_ID" ] + }, { + "name" : "TEST_DATE_ENC", + "table" : "TEST_ORDER", + "column" : null, + "derived" : [ "TEST_DATE_ENC" ] } ], "measures" : [ { "name" : "_COUNT_", @@ -67,20 +62,20 @@ "dictionaries" : [ ], "rowkey" : { "rowkey_columns" : [ { - "column" : "TEST_ORDER.ORDER_ID", + "column" : "TEST_KYLIN_FACT.ORDER_ID", "encoding" : "dict", "isShardBy" : false }, { - "column" : "TEST_ORDER.BUYER_ID", - "encoding" : "dict", + "column" : "TEST_KYLIN_FACT.CAL_DT", + "encoding" : "date", "isShardBy" : false }, { - "column" : "TEST_ORDER.TEST_DATE_ENC", - "encoding" : "date", + "column" : "TEST_KYLIN_FACT.LSTG_FORMAT_NAME", + "encoding" : "dict", "isShardBy" : false }, { - "column" : "TEST_ORDER.TEST_TIME_ENC", - "encoding" : "time", + "column" : "TEST_KYLIN_FACT.SLR_SEGMENT_CD", + "encoding" : "dict", "isShardBy" : false }, { "column" : "TEST_KYLIN_FACT.SELLER_ID", @@ -90,14 +85,6 @@ "column" : "TEST_KYLIN_FACT.PRICE", "encoding" : "dict", "isShardBy" : false - }, { - "column" : "TEST_KYLIN_FACT.SLR_SEGMENT_CD", - "encoding" : "dict", - "isShardBy" : false - } , { - "column" : "TEST_KYLIN_FACT.LSTG_FORMAT_NAME", - "encoding" : "dict", - "isShardBy" : false } ] }, "hbase_mapping" : { @@ -110,14 +97,14 @@ } ] }, "aggregation_groups" : [ { - "includes" : [ "TEST_ORDER.ORDER_ID", "TEST_ORDER.BUYER_ID", "TEST_ORDER.TEST_DATE_ENC", "TEST_ORDER.TEST_TIME_ENC" ], + "includes" : [ "TEST_KYLIN_FACT.ORDER_ID", "TEST_KYLIN_FACT.CAL_DT", "TEST_KYLIN_FACT.LSTG_FORMAT_NAME", "TEST_KYLIN_FACT.SLR_SEGMENT_CD", "TEST_KYLIN_FACT.SELLER_ID", "TEST_KYLIN_FACT.PRICE" ], "select_rule" : { "hierarchy_dims" : [ ], "mandatory_dims" : [ ], "joint_dims" : [ ] } } ], - "signature" : "2BsFyXxmB2F+FNlH4RxDwQ==", + "signature" : "ioyeFiil5XMOa8G02uVhgQ==", "notify_list" : [ ], "status_need_notify" : [ "ERROR", "DISCARDED", "SUCCEED" ], "partition_date_start" : 0, diff --git a/examples/test_case_data/file_prunning/cube_desc/file_pruning_cube2.json b/examples/test_case_data/file_prunning/cube_desc/file_pruning_cube2.json index f9fa85b..8e2bd3a 100644 --- a/examples/test_case_data/file_prunning/cube_desc/file_pruning_cube2.json +++ b/examples/test_case_data/file_prunning/cube_desc/file_pruning_cube2.json @@ -4,7 +4,7 @@ "version" : "3.9.9.1", "name" : "file_pruning_cube2", "is_draft" : false, - "model_name" : "file_pruning_model2", + "model_name" : "file_pruning_model", "description" : "", "null_string" : null, "dimensions" : [ { diff --git a/examples/test_case_data/file_prunning/model_desc/file_pruning_model.json b/examples/test_case_data/file_prunning/model_desc/file_pruning_model.json index fb35bd8..018a384 100644 --- a/examples/test_case_data/file_prunning/model_desc/file_pruning_model.json +++ b/examples/test_case_data/file_prunning/model_desc/file_pruning_model.json @@ -1,33 +1,33 @@ { "uuid" : "1433c51f-fa34-3c12-6d10-d4a59338a19d", - "last_modified" : 1589194803840, + "last_modified" : 0, "version" : "3.9.9.1", "name" : "file_pruning_model", "owner" : "ADMIN", "is_draft" : false, "description" : "", - "fact_table" : "DEFAULT.TEST_ORDER", + "fact_table" : "DEFAULT.TEST_KYLIN_FACT", "lookups" : [ { - "table" : "DEFAULT.TEST_KYLIN_FACT", + "table" : "DEFAULT.TEST_ORDER", "kind" : "LOOKUP", - "alias" : "TEST_KYLIN_FACT", + "alias" : "TEST_ORDER", "join" : { "type" : "left", - "primary_key" : [ "TEST_KYLIN_FACT.ORDER_ID" ], - "foreign_key" : [ "TEST_ORDER.ORDER_ID" ] + "primary_key" : [ "TEST_ORDER.ORDER_ID" ], + "foreign_key" : [ "TEST_KYLIN_FACT.ORDER_ID" ] } } ], "dimensions" : [ { - "table" : "TEST_ORDER", - "columns" : [ "ORDER_ID", "BUYER_ID", "TEST_DATE_ENC", "TEST_TIME_ENC" ] - }, { "table" : "TEST_KYLIN_FACT", - "columns" : [ "SELLER_ID", "PRICE", "SLR_SEGMENT_CD", "LSTG_FORMAT_NAME", "ORDER_ID" ] + "columns" : [ "SELLER_ID", "PRICE", "SLR_SEGMENT_CD", "LSTG_FORMAT_NAME", "CAL_DT", "ORDER_ID" ] + }, { + "table" : "TEST_ORDER", + "columns" : [ "ORDER_ID", "TEST_DATE_ENC" ] } ], "metrics" : [ ], "filter_condition" : "", "partition_desc" : { - "partition_date_column" : "TEST_ORDER.TEST_TIME_ENC", + "partition_date_column" : "TEST_KYLIN_FACT.CAL_DT", "partition_time_column" : null, "partition_date_start" : 0, "partition_date_format" : "yyyy-MM-dd", diff --git a/examples/test_case_data/file_prunning/model_desc/file_pruning_model2.json b/examples/test_case_data/file_prunning/model_desc/file_pruning_model2.json deleted file mode 100644 index 2f29fc6..0000000 --- a/examples/test_case_data/file_prunning/model_desc/file_pruning_model2.json +++ /dev/null @@ -1,40 +0,0 @@ -{ - "uuid" : "7622b8c5-a5fb-e10b-4038-4cbccfc2df24", - "last_modified" : 1589268019470, - "version" : "3.9.9.1", - "name" : "file_pruning_model2", - "owner" : "ADMIN", - "is_draft" : false, - "description" : "", - "fact_table" : "DEFAULT.TEST_KYLIN_FACT", - "lookups" : [ { - "table" : "DEFAULT.TEST_ORDER", - "kind" : "LOOKUP", - "alias" : "TEST_ORDER", - "join" : { - "type" : "left", - "primary_key" : [ "TEST_ORDER.ORDER_ID" ], - "foreign_key" : [ "TEST_KYLIN_FACT.ORDER_ID" ] - } - } ], - "dimensions" : [ { - "table" : "TEST_KYLIN_FACT", - "columns" : [ "SELLER_ID", "PRICE", "SLR_SEGMENT_CD", "LSTG_FORMAT_NAME", "CAL_DT", "ORDER_ID" ] - }, { - "table" : "TEST_ORDER", - "columns" : [ "ORDER_ID", "TEST_DATE_ENC" ] - } ], - "metrics" : [ ], - "filter_condition" : "", - "partition_desc" : { - "partition_date_column" : "TEST_KYLIN_FACT.CAL_DT", - "partition_time_column" : null, - "partition_date_start" : 0, - "partition_date_format" : "yyyy-MM-dd", - "partition_time_format" : "HH:mm:ss", - "partition_type" : "APPEND", - "partition_condition_builder" : "org.apache.kylin.metadata.model.PartitionDesc$DefaultPartitionConditionBuilder" - }, - "capacity" : "MEDIUM", - "projectName" : "default" -} \ No newline at end of file diff --git a/examples/test_case_data/file_prunning/project/default.json b/examples/test_case_data/file_prunning/project/default.json index 9ebc29f..e983b5d 100644 --- a/examples/test_case_data/file_prunning/project/default.json +++ b/examples/test_case_data/file_prunning/project/default.json @@ -19,7 +19,7 @@ "type" : "CUBE", "realization" : "file_pruning_cube_measure" } ], - "models" : [ "file_pruning_model","file_pruning_model2","file_pruning_model_measure" ], + "models" : [ "file_pruning_model","file_pruning_model_measure" ], "ext_filters" : [ ], "override_kylin_properties" : { } } \ No newline at end of file diff --git a/kylin-spark-project/kylin-spark-engine/src/main/scala/org/apache/kylin/engine/spark/builder/CubeSnapshotBuilder.scala b/kylin-spark-project/kylin-spark-engine/src/main/scala/org/apache/kylin/engine/spark/builder/CubeSnapshotBuilder.scala index d1b62f0..146dcab 100644 --- a/kylin-spark-project/kylin-spark-engine/src/main/scala/org/apache/kylin/engine/spark/builder/CubeSnapshotBuilder.scala +++ b/kylin-spark-project/kylin-spark-engine/src/main/scala/org/apache/kylin/engine/spark/builder/CubeSnapshotBuilder.scala @@ -195,7 +195,7 @@ class CubeSnapshotBuilder extends Logging { val lookupTablePKS = joinDesc.PKS.map(lookupTablePK => lookupTablePK.columnName) val countDistinctColumn = df.agg(countDistinct(lookupTablePKS.head, lookupTablePKS.tail: _*)).collect().map(_.getLong(0)).head if (countColumn != countDistinctColumn) { - throw new IllegalStateException(s"Failed to build lookup table ${lookupTableName} snapshot for Dup key found, key= ${lookupTablePKS}") + throw new IllegalStateException(s"Failed to build lookup table ${lookupTableName} snapshot for Dup key found, key= ${lookupTablePKS.mkString(",")}") } } } diff --git a/kylin-spark-project/kylin-spark-test/src/test/java/org/apache/kylin/engine/spark2/file_pruning/NFilePruningTest.java b/kylin-spark-project/kylin-spark-test/src/test/java/org/apache/kylin/engine/spark2/file_pruning/NFilePruningTest.java index a64638b..f004168 100644 --- a/kylin-spark-project/kylin-spark-test/src/test/java/org/apache/kylin/engine/spark2/file_pruning/NFilePruningTest.java +++ b/kylin-spark-project/kylin-spark-test/src/test/java/org/apache/kylin/engine/spark2/file_pruning/NFilePruningTest.java @@ -63,10 +63,9 @@ import java.util.UUID; public class NFilePruningTest extends LocalWithSparkSessionTest { - private String SQL_BASE2 = "SELECT COUNT(*) FROM TEST_KYLIN_FACT LEFT JOIN TEST_ORDER ON TEST_KYLIN_FACT.ORDER_ID = TEST_ORDER.ORDER_ID "; - private final static String CUBE_NAME = "file_pruning_cube2"; - private String SQL_BASE = "SELECT COUNT(*) FROM TEST_ORDER LEFT JOIN TEST_KYLIN_FACT ON TEST_KYLIN_FACT.ORDER_ID = TEST_ORDER.ORDER_ID "; - private final static String CUBE_NAME2 = "file_pruning_cube"; + private String SQL_BASE = "SELECT COUNT(*) FROM TEST_KYLIN_FACT LEFT JOIN TEST_ORDER ON TEST_KYLIN_FACT.ORDER_ID = TEST_ORDER.ORDER_ID "; + private final static String CUBE_SHARD_BY_SELLER_ID = "file_pruning_cube"; + private final static String CUBE_PRUNER_BY_PARTITION = "file_pruning_cube2"; protected KylinConfig config; protected CubeManager cubeMgr; protected ExecutableManager execMgr; @@ -128,11 +127,11 @@ public class NFilePruningTest extends LocalWithSparkSessionTest { public void testNonExistTimeRange() throws Exception { Long start = DateFormat.stringToMillis("2023-01-01 00:00:00"); Long end = DateFormat.stringToMillis("2025-01-01 00:00:00"); - cleanupSegments(CUBE_NAME); - buildCuboid(CUBE_NAME, new SegmentRange.TSRange(start, end)); + cleanupSegments(CUBE_PRUNER_BY_PARTITION); + buildCuboid(CUBE_PRUNER_BY_PARTITION, new SegmentRange.TSRange(start, end)); populateSSWithCSVData(config, getProject(), KylinSparkEnv.getSparkSession()); - assertResultsAndScanFiles(SQL_BASE2, 1); + assertResultsAndScanFiles(SQL_BASE, 1); } @Test @@ -141,7 +140,7 @@ public class NFilePruningTest extends LocalWithSparkSessionTest { // [2009-01-01 00:00:00, 2011-01-01 00:00:00) // [2011-01-01 00:00:00, 2013-01-01 00:00:00) // [2013-01-01 00:00:00, 2015-01-01 00:00:00) - buildMultiSegs(CUBE_NAME); + buildMultiSegs(CUBE_PRUNER_BY_PARTITION); populateSSWithCSVData(getTestConfig(), getProject(), SparderContext.getSparkSession()); testSegPruningWithStringDate(); testSegPruningWithStringTimeStamp(); @@ -161,28 +160,28 @@ public class NFilePruningTest extends LocalWithSparkSessionTest { } public void testSegPruningWithStringTimeStamp() throws Exception { - String and_pruning0 = SQL_BASE2 + String and_pruning0 = SQL_BASE + "where CAL_DT > '2011-01-01 00:00:00' and CAL_DT < '2013-01-01 00:00:00'"; - String and_pruning1 = SQL_BASE2 + String and_pruning1 = SQL_BASE + "where CAL_DT > '2011-01-01 00:00:00' and CAL_DT = '2016-01-01 00:00:00'"; - String or_pruning0 = SQL_BASE2 + String or_pruning0 = SQL_BASE + "where CAL_DT > '2011-01-01 00:00:00' or CAL_DT = '2016-01-01 00:00:00'"; - String or_pruning1 = SQL_BASE2 + String or_pruning1 = SQL_BASE + "where CAL_DT < '2009-01-01 00:00:00' or CAL_DT > '2015-01-01 00:00:00'"; - String pruning0 = SQL_BASE2 + "where CAL_DT < '2009-01-01 00:00:00'"; - String pruning1 = SQL_BASE2 + "where CAL_DT <= '2009-01-01 00:00:00'"; - String pruning2 = SQL_BASE2 + "where CAL_DT >= '2015-01-01 00:00:00'"; + String pruning0 = SQL_BASE + "where CAL_DT < '2009-01-01 00:00:00'"; + String pruning1 = SQL_BASE + "where CAL_DT <= '2009-01-01 00:00:00'"; + String pruning2 = SQL_BASE + "where CAL_DT >= '2015-01-01 00:00:00'"; - String not0 = SQL_BASE2 + "where CAL_DT <> '2012-01-01 00:00:00'"; + String not0 = SQL_BASE + "where CAL_DT <> '2012-01-01 00:00:00'"; - String in_pruning0 = SQL_BASE2 + String in_pruning0 = SQL_BASE + "where CAL_DT in ('2009-01-01 00:00:00', '2008-01-01 00:00:00', '2016-01-01 00:00:00')"; - String in_pruning1 = SQL_BASE2 + String in_pruning1 = SQL_BASE + "where CAL_DT in ('2008-01-01 00:00:00', '2016-01-01 00:00:00')"; - assertResultsAndScanFiles(SQL_BASE2, 3); + assertResultsAndScanFiles(SQL_BASE, 3); assertResultsAndScanFiles(and_pruning0, 1); assertResultsAndScanFiles(and_pruning1, 0); @@ -254,7 +253,7 @@ public class NFilePruningTest extends LocalWithSparkSessionTest { public void testSegShardPruning() throws Exception { System.setProperty("kylin.storage.columnar.shard-rowcount", "100"); try { - buildMultiSegs(CUBE_NAME2); + buildMultiSegs(CUBE_SHARD_BY_SELLER_ID); populateSSWithCSVData(getTestConfig(), getProject(), KylinSparkEnv.getSparkSession()); @@ -362,17 +361,17 @@ public class NFilePruningTest extends LocalWithSparkSessionTest { String in = SQL_BASE + "where SELLER_ID in (10000233,10000234,10000235)"; String isNull = SQL_BASE + "where SELLER_ID is NULL"; String and = SQL_BASE + "where SELLER_ID in (10000233,10000234,10000235) and SELLER_ID = 10000233 "; - String or = SQL_BASE + "where SELLER_ID = 10000233 or SELLER_ID = 1 "; + String or = SQL_BASE + "where SELLER_ID = 10000233 or SELLER_ID = 2 "; String notSupported0 = SQL_BASE + "where SELLER_ID <> 10000233"; String notSupported1 = SQL_BASE + "where SELLER_ID > 10000233"; assertResultsAndScanFiles(equality, 3); - assertResultsAndScanFiles(in, 9); + assertResultsAndScanFiles(in, 7); assertResultsAndScanFiles(isNull, 3); assertResultsAndScanFiles(and, 3); - assertResultsAndScanFiles(or, 5); //5 - assertResultsAndScanFiles(notSupported0, 57); //36 - assertResultsAndScanFiles(notSupported1, 57); //36 + assertResultsAndScanFiles(or, 5); + assertResultsAndScanFiles(notSupported0, 13); + assertResultsAndScanFiles(notSupported1, 13); List<Pair<String, String>> query = new ArrayList<>(); query.add(Pair.newPair("", equality));