Repository: kylin
Updated Branches:
  refs/heads/master 029880f23 -> 79374047d


APACHE-KYLIN-2867: split large fuzzy key set

Signed-off-by: lidongsjtu <[email protected]>


Project: http://git-wip-us.apache.org/repos/asf/kylin/repo
Commit: http://git-wip-us.apache.org/repos/asf/kylin/commit/cb7f567f
Tree: http://git-wip-us.apache.org/repos/asf/kylin/tree/cb7f567f
Diff: http://git-wip-us.apache.org/repos/asf/kylin/diff/cb7f567f

Branch: refs/heads/master
Commit: cb7f567f43ea0b1466772c61fa5855f406cce661
Parents: 029880f
Author: Zhong <[email protected]>
Authored: Mon Sep 18 14:53:49 2017 +0800
Committer: lidongsjtu <[email protected]>
Committed: Wed Dec 13 14:23:17 2017 +0800

----------------------------------------------------------------------
 .../apache/kylin/common/KylinConfigBase.java    |  4 ++
 .../storage/gtrecord/CubeScanRangePlanner.java  | 49 ++++++++++++++++----
 2 files changed, 43 insertions(+), 10 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kylin/blob/cb7f567f/core-common/src/main/java/org/apache/kylin/common/KylinConfigBase.java
----------------------------------------------------------------------
diff --git 
a/core-common/src/main/java/org/apache/kylin/common/KylinConfigBase.java 
b/core-common/src/main/java/org/apache/kylin/common/KylinConfigBase.java
index 66805df..914ba7e 100644
--- a/core-common/src/main/java/org/apache/kylin/common/KylinConfigBase.java
+++ b/core-common/src/main/java/org/apache/kylin/common/KylinConfigBase.java
@@ -895,6 +895,10 @@ abstract public class KylinConfigBase implements 
Serializable {
         return 
Integer.parseInt(this.getOptional("kylin.storage.hbase.max-fuzzykey-scan", 
"200"));
     }
 
+    public int getQueryScanFuzzyKeySplitMax() {
+        return 
Integer.parseInt(this.getOptional("kylin.storage.hbase.max-fuzzykey-scan-split",
 "1"));
+    }
+
     public int getQueryStorageVisitScanRangeMax() {
         return 
Integer.valueOf(this.getOptional("kylin.storage.hbase.max-visit-scanrange", 
"1000000"));
     }

http://git-wip-us.apache.org/repos/asf/kylin/blob/cb7f567f/core-storage/src/main/java/org/apache/kylin/storage/gtrecord/CubeScanRangePlanner.java
----------------------------------------------------------------------
diff --git 
a/core-storage/src/main/java/org/apache/kylin/storage/gtrecord/CubeScanRangePlanner.java
 
b/core-storage/src/main/java/org/apache/kylin/storage/gtrecord/CubeScanRangePlanner.java
index 7e6f7c4..ef1114a 100644
--- 
a/core-storage/src/main/java/org/apache/kylin/storage/gtrecord/CubeScanRangePlanner.java
+++ 
b/core-storage/src/main/java/org/apache/kylin/storage/gtrecord/CubeScanRangePlanner.java
@@ -63,6 +63,7 @@ public class CubeScanRangePlanner extends 
ScanRangePlannerBase {
     private static final Logger logger = 
LoggerFactory.getLogger(CubeScanRangePlanner.class);
 
     protected int maxScanRanges;
+    protected int maxFuzzyKeysPerSplit;
     protected int maxFuzzyKeys;
 
     //non-GT
@@ -77,7 +78,8 @@ public class CubeScanRangePlanner extends 
ScanRangePlannerBase {
         this.context = context;
 
         this.maxScanRanges = 
cubeSegment.getConfig().getQueryStorageVisitScanRangeMax();
-        this.maxFuzzyKeys = cubeSegment.getConfig().getQueryScanFuzzyKeyMax();
+        this.maxFuzzyKeysPerSplit = 
cubeSegment.getConfig().getQueryScanFuzzyKeyMax();
+        this.maxFuzzyKeys = maxFuzzyKeysPerSplit * 
cubeSegment.getConfig().getQueryScanFuzzyKeySplitMax();
 
         this.cubeSegment = cubeSegment;
         this.cubeDesc = cubeSegment.getCubeDesc();
@@ -124,7 +126,8 @@ public class CubeScanRangePlanner extends 
ScanRangePlannerBase {
     public CubeScanRangePlanner(GTInfo info, Pair<ByteArray, ByteArray> 
gtStartAndEnd, TblColRef gtPartitionCol, TupleFilter gtFilter) {
 
         this.maxScanRanges = 
KylinConfig.getInstanceFromEnv().getQueryStorageVisitScanRangeMax();
-        this.maxFuzzyKeys = 
KylinConfig.getInstanceFromEnv().getQueryScanFuzzyKeyMax();
+        this.maxFuzzyKeysPerSplit = 
KylinConfig.getInstanceFromEnv().getQueryScanFuzzyKeyMax();
+        this.maxFuzzyKeys = maxFuzzyKeysPerSplit * 
KylinConfig.getInstanceFromEnv().getQueryScanFuzzyKeySplitMax();
 
         this.gtInfo = info;
 
@@ -172,6 +175,7 @@ public class CubeScanRangePlanner extends 
ScanRangePlannerBase {
         }
 
         List<GTScanRange> mergedRanges = mergeOverlapRanges(scanRanges);
+        mergedRanges = splitFuzzyKeys(mergedRanges);
         mergedRanges = mergeTooManyRanges(mergedRanges, maxScanRanges);
 
         return mergedRanges;
@@ -196,8 +200,6 @@ public class CubeScanRangePlanner extends 
ScanRangePlannerBase {
         GTRecord pkEnd = new GTRecord(gtInfo);
         Map<Integer, Set<ByteArray>> fuzzyValues = Maps.newHashMap();
 
-        List<GTRecord> fuzzyKeys;
-
         for (ColumnRange range : andDimRanges) {
             if (gtPartitionCol != null && range.column.equals(gtPartitionCol)) 
{
                 int beginCompare = 
rangeStartEndComparator.comparator.compare(range.begin, 
gtStartAndEnd.getSecond());
@@ -224,9 +226,8 @@ public class CubeScanRangePlanner extends 
ScanRangePlannerBase {
             }
         }
 
-        fuzzyKeys =
+        List<GTRecord> fuzzyKeys = buildFuzzyKeys(fuzzyValues);
 
-                buildFuzzyKeys(fuzzyValues);
         return new GTScanRange(pkStart, pkEnd, fuzzyKeys);
     }
 
@@ -243,7 +244,6 @@ public class CubeScanRangePlanner extends 
ScanRangePlannerBase {
         }
 
         List<Map<Integer, ByteArray>> fuzzyValueCombinations = 
FuzzyValueCombination.calculate(fuzzyValueSet, maxFuzzyKeys);
-
         for (Map<Integer, ByteArray> fuzzyValue : fuzzyValueCombinations) {
 
             //            BitSet bitSet = new BitSet(gtInfo.getColumnCount());
@@ -309,7 +309,7 @@ public class CubeScanRangePlanner extends 
ScanRangePlannerBase {
 
         GTRecord start = first.pkStart;
         GTRecord end = first.pkEnd;
-        List<GTRecord> newFuzzyKeys = new ArrayList<GTRecord>();
+        Set<GTRecord> newFuzzyKeys = Sets.newLinkedHashSet();
 
         boolean hasNonFuzzyRange = false;
         for (GTScanRange range : ranges) {
@@ -319,12 +319,15 @@ public class CubeScanRangePlanner extends 
ScanRangePlannerBase {
         }
 
         // if any range is non-fuzzy, then all fuzzy keys must be cleared
-        // also too many fuzzy keys will slow down HBase scan
+        // too many fuzzy keys will slow down HBase scan
         if (hasNonFuzzyRange || newFuzzyKeys.size() > maxFuzzyKeys) {
+            if (newFuzzyKeys.size() > maxFuzzyKeys) {
+                logger.debug("too many FuzzyKeys,  clean it!");
+            }
             newFuzzyKeys.clear();
         }
 
-        return new GTScanRange(start, end, newFuzzyKeys);
+        return new GTScanRange(start, end, Lists.newArrayList(newFuzzyKeys));
     }
 
     protected List<GTScanRange> mergeTooManyRanges(List<GTScanRange> ranges, 
int maxRanges) {
@@ -336,6 +339,32 @@ public class CubeScanRangePlanner extends 
ScanRangePlannerBase {
         List<GTScanRange> result = new ArrayList<GTScanRange>(1);
         GTScanRange mergedRange = mergeKeyRange(ranges);
         result.add(mergedRange);
+
+        result = splitFuzzyKeys(result);
+        return result;
+    }
+
+    private List<GTScanRange> splitFuzzyKeys(List<GTScanRange> mergedRanges) {
+        List<GTScanRange> result = Lists.newArrayList();
+        for (GTScanRange range : mergedRanges) {
+            // if the fuzzy key set is large but still within the split range, then 
we split the fuzzy keys into multiple scan ranges.
+            if (range.fuzzyKeys.size() > maxFuzzyKeysPerSplit && 
range.fuzzyKeys.size() <= maxFuzzyKeys) {
+                List<GTRecord> fuzzyKeys = range.fuzzyKeys;
+                Collections.sort(fuzzyKeys);
+                int nSplit = (fuzzyKeys.size() - 1) / maxFuzzyKeysPerSplit + 1;
+                int nFuzzyKeysPerSplit = fuzzyKeys.size() / nSplit;
+                int startIndex = 0;
+                for (int i = 1; i <= nSplit; i++) {
+                    int endIndex = i == nSplit ? fuzzyKeys.size() : i * 
nFuzzyKeysPerSplit;
+                    List<GTRecord> subFuzzyKeys = 
fuzzyKeys.subList(startIndex, endIndex);
+                    result.add(new GTScanRange(range.pkStart, range.pkEnd, 
subFuzzyKeys));
+                    startIndex = endIndex;
+                }
+                logger.debug("large FuzzyKeys split size : " + result.size());
+            } else {
+                result.add(range);
+            }
+        }
         return result;
     }
 

Reply via email to