Repository: kylin Updated Branches: refs/heads/master 029880f23 -> 79374047d
APACHE-KYLIN-2867: split large fuzzy key set Signed-off-by: lidongsjtu <[email protected]> Project: http://git-wip-us.apache.org/repos/asf/kylin/repo Commit: http://git-wip-us.apache.org/repos/asf/kylin/commit/cb7f567f Tree: http://git-wip-us.apache.org/repos/asf/kylin/tree/cb7f567f Diff: http://git-wip-us.apache.org/repos/asf/kylin/diff/cb7f567f Branch: refs/heads/master Commit: cb7f567f43ea0b1466772c61fa5855f406cce661 Parents: 029880f Author: Zhong <[email protected]> Authored: Mon Sep 18 14:53:49 2017 +0800 Committer: lidongsjtu <[email protected]> Committed: Wed Dec 13 14:23:17 2017 +0800 ---------------------------------------------------------------------- .../apache/kylin/common/KylinConfigBase.java | 4 ++ .../storage/gtrecord/CubeScanRangePlanner.java | 49 ++++++++++++++++---- 2 files changed, 43 insertions(+), 10 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/kylin/blob/cb7f567f/core-common/src/main/java/org/apache/kylin/common/KylinConfigBase.java ---------------------------------------------------------------------- diff --git a/core-common/src/main/java/org/apache/kylin/common/KylinConfigBase.java b/core-common/src/main/java/org/apache/kylin/common/KylinConfigBase.java index 66805df..914ba7e 100644 --- a/core-common/src/main/java/org/apache/kylin/common/KylinConfigBase.java +++ b/core-common/src/main/java/org/apache/kylin/common/KylinConfigBase.java @@ -895,6 +895,10 @@ abstract public class KylinConfigBase implements Serializable { return Integer.parseInt(this.getOptional("kylin.storage.hbase.max-fuzzykey-scan", "200")); } + public int getQueryScanFuzzyKeySplitMax() { + return Integer.parseInt(this.getOptional("kylin.storage.hbase.max-fuzzykey-scan-split", "1")); + } + public int getQueryStorageVisitScanRangeMax() { return Integer.valueOf(this.getOptional("kylin.storage.hbase.max-visit-scanrange", "1000000")); } http://git-wip-us.apache.org/repos/asf/kylin/blob/cb7f567f/core-storage/src/main/java/org/apache/kylin/storage/gtrecord/CubeScanRangePlanner.java ---------------------------------------------------------------------- diff --git a/core-storage/src/main/java/org/apache/kylin/storage/gtrecord/CubeScanRangePlanner.java b/core-storage/src/main/java/org/apache/kylin/storage/gtrecord/CubeScanRangePlanner.java index 7e6f7c4..ef1114a 100644 --- a/core-storage/src/main/java/org/apache/kylin/storage/gtrecord/CubeScanRangePlanner.java +++ b/core-storage/src/main/java/org/apache/kylin/storage/gtrecord/CubeScanRangePlanner.java @@ -63,6 +63,7 @@ public class CubeScanRangePlanner extends ScanRangePlannerBase { private static final Logger logger = LoggerFactory.getLogger(CubeScanRangePlanner.class); protected int maxScanRanges; + protected int maxFuzzyKeysPerSplit; protected int maxFuzzyKeys; //non-GT @@ -77,7 +78,8 @@ public class CubeScanRangePlanner extends ScanRangePlannerBase { this.context = context; this.maxScanRanges = cubeSegment.getConfig().getQueryStorageVisitScanRangeMax(); - this.maxFuzzyKeys = cubeSegment.getConfig().getQueryScanFuzzyKeyMax(); + this.maxFuzzyKeysPerSplit = cubeSegment.getConfig().getQueryScanFuzzyKeyMax(); + this.maxFuzzyKeys = maxFuzzyKeysPerSplit * cubeSegment.getConfig().getQueryScanFuzzyKeySplitMax(); this.cubeSegment = cubeSegment; this.cubeDesc = cubeSegment.getCubeDesc(); @@ -124,7 +126,8 @@ public class CubeScanRangePlanner extends ScanRangePlannerBase { public CubeScanRangePlanner(GTInfo info, Pair<ByteArray, ByteArray> gtStartAndEnd, TblColRef gtPartitionCol, TupleFilter gtFilter) { this.maxScanRanges = KylinConfig.getInstanceFromEnv().getQueryStorageVisitScanRangeMax(); - this.maxFuzzyKeys = KylinConfig.getInstanceFromEnv().getQueryScanFuzzyKeyMax(); + this.maxFuzzyKeysPerSplit = KylinConfig.getInstanceFromEnv().getQueryScanFuzzyKeyMax(); + this.maxFuzzyKeys = maxFuzzyKeysPerSplit * KylinConfig.getInstanceFromEnv().getQueryScanFuzzyKeySplitMax(); this.gtInfo = info; @@ -172,6 +175,7 @@ public class CubeScanRangePlanner extends ScanRangePlannerBase { } List<GTScanRange> mergedRanges = mergeOverlapRanges(scanRanges); + mergedRanges = splitFuzzyKeys(mergedRanges); mergedRanges = mergeTooManyRanges(mergedRanges, maxScanRanges); return mergedRanges; @@ -196,8 +200,6 @@ public class CubeScanRangePlanner extends ScanRangePlannerBase { GTRecord pkEnd = new GTRecord(gtInfo); Map<Integer, Set<ByteArray>> fuzzyValues = Maps.newHashMap(); - List<GTRecord> fuzzyKeys; - for (ColumnRange range : andDimRanges) { if (gtPartitionCol != null && range.column.equals(gtPartitionCol)) { int beginCompare = rangeStartEndComparator.comparator.compare(range.begin, gtStartAndEnd.getSecond()); @@ -224,9 +226,8 @@ public class CubeScanRangePlanner extends ScanRangePlannerBase { } } - fuzzyKeys = + List<GTRecord> fuzzyKeys = buildFuzzyKeys(fuzzyValues); - buildFuzzyKeys(fuzzyValues); return new GTScanRange(pkStart, pkEnd, fuzzyKeys); } @@ -243,7 +244,6 @@ public class CubeScanRangePlanner extends ScanRangePlannerBase { } List<Map<Integer, ByteArray>> fuzzyValueCombinations = FuzzyValueCombination.calculate(fuzzyValueSet, maxFuzzyKeys); - for (Map<Integer, ByteArray> fuzzyValue : fuzzyValueCombinations) { // BitSet bitSet = new BitSet(gtInfo.getColumnCount()); @@ -309,7 +309,7 @@ public class CubeScanRangePlanner extends ScanRangePlannerBase { GTRecord start = first.pkStart; GTRecord end = first.pkEnd; - List<GTRecord> newFuzzyKeys = new ArrayList<GTRecord>(); + Set<GTRecord> newFuzzyKeys = Sets.newLinkedHashSet(); boolean hasNonFuzzyRange = false; for (GTScanRange range : ranges) { @@ -319,12 +319,15 @@ public class CubeScanRangePlanner extends ScanRangePlannerBase { } // if any range is non-fuzzy, then all fuzzy keys must be cleared - // also too many fuzzy keys will slow down HBase scan + // too many fuzzy keys will slow down HBase scan if (hasNonFuzzyRange || newFuzzyKeys.size() > maxFuzzyKeys) { + if (newFuzzyKeys.size() > maxFuzzyKeys) { + logger.debug("too many FuzzyKeys, clean it!"); + } newFuzzyKeys.clear(); } - return new GTScanRange(start, end, newFuzzyKeys); + return new GTScanRange(start, end, Lists.newArrayList(newFuzzyKeys)); } protected List<GTScanRange> mergeTooManyRanges(List<GTScanRange> ranges, int maxRanges) { @@ -336,6 +339,32 @@ public class CubeScanRangePlanner extends ScanRangePlannerBase { List<GTScanRange> result = new ArrayList<GTScanRange>(1); GTScanRange mergedRange = mergeKeyRange(ranges); result.add(mergedRange); + + result = splitFuzzyKeys(result); + return result; + } + + private List<GTScanRange> splitFuzzyKeys(List<GTScanRange> mergedRanges) { + List<GTScanRange> result = Lists.newArrayList(); + for (GTScanRange range : mergedRanges) { + // if the fuzzy key is huge but still within in split range, then we split fuzzy keys to multiple ones. + if (range.fuzzyKeys.size() > maxFuzzyKeysPerSplit && range.fuzzyKeys.size() <= maxFuzzyKeys) { + List<GTRecord> fuzzyKeys = range.fuzzyKeys; + Collections.sort(fuzzyKeys); + int nSplit = (fuzzyKeys.size() - 1) / maxFuzzyKeysPerSplit + 1; + int nFuzzyKeysPerSplit = fuzzyKeys.size() / nSplit; + int startIndex = 0; + for (int i = 1; i <= nSplit; i++) { + int endIndex = i == nSplit ? fuzzyKeys.size() : i * nFuzzyKeysPerSplit; + List<GTRecord> subFuzzyKeys = fuzzyKeys.subList(startIndex, endIndex); + result.add(new GTScanRange(range.pkStart, range.pkEnd, subFuzzyKeys)); + startIndex = endIndex; + } + logger.debug("large FuzzyKeys split size : " + result.size()); + } else { + result.add(range); + } + } return result; }
