KYLIN-2244 "kylin.job.cuboid.size.memhungry.ratio" shouldn't be applied to measures like TopN
Project: http://git-wip-us.apache.org/repos/asf/kylin/repo Commit: http://git-wip-us.apache.org/repos/asf/kylin/commit/8ffb0e71 Tree: http://git-wip-us.apache.org/repos/asf/kylin/tree/8ffb0e71 Diff: http://git-wip-us.apache.org/repos/asf/kylin/diff/8ffb0e71 Branch: refs/heads/master-cdh5.7 Commit: 8ffb0e7103d63d2c0f5d093f3afde1a0490eb8a0 Parents: 4408579 Author: shaofengshi <[email protected]> Authored: Mon Dec 12 14:19:55 2016 +0800 Committer: shaofengshi <[email protected]> Committed: Mon Dec 12 14:19:55 2016 +0800 ---------------------------------------------------------------------- .../apache/kylin/common/KylinConfigBase.java | 5 +++ .../kylin/engine/mr/common/CubeStatsReader.java | 36 ++++++++------------ 2 files changed, 20 insertions(+), 21 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/kylin/blob/8ffb0e71/core-common/src/main/java/org/apache/kylin/common/KylinConfigBase.java ---------------------------------------------------------------------- diff --git a/core-common/src/main/java/org/apache/kylin/common/KylinConfigBase.java b/core-common/src/main/java/org/apache/kylin/common/KylinConfigBase.java index 2b35c70..610c2af 100644 --- a/core-common/src/main/java/org/apache/kylin/common/KylinConfigBase.java +++ b/core-common/src/main/java/org/apache/kylin/common/KylinConfigBase.java @@ -261,10 +261,15 @@ abstract public class KylinConfigBase implements Serializable { return Double.parseDouble(getOptional("kylin.cube.size-estimate-ratio", "0.25")); } + @Deprecated public double getJobCuboidSizeMemHungryRatio() { return Double.parseDouble(getOptional("kylin.cube.size-estimate-memhungry-ratio", "0.05")); } + public double getJobCuboidSizeCountDistinctRatio() { + return Double.parseDouble(getOptional("kylin.cube.size-estimate-countdistinct-ratio", "0.05")); + } + public String getCubeAlgorithm() { return getOptional("kylin.cube.algorithm", "auto"); } 
http://git-wip-us.apache.org/repos/asf/kylin/blob/8ffb0e71/engine-mr/src/main/java/org/apache/kylin/engine/mr/common/CubeStatsReader.java ---------------------------------------------------------------------- diff --git a/engine-mr/src/main/java/org/apache/kylin/engine/mr/common/CubeStatsReader.java b/engine-mr/src/main/java/org/apache/kylin/engine/mr/common/CubeStatsReader.java index 1cf5da6..21af1e6 100644 --- a/engine-mr/src/main/java/org/apache/kylin/engine/mr/common/CubeStatsReader.java +++ b/engine-mr/src/main/java/org/apache/kylin/engine/mr/common/CubeStatsReader.java @@ -55,6 +55,7 @@ import org.apache.kylin.cube.model.CubeDesc; import org.apache.kylin.engine.mr.HadoopUtil; import org.apache.kylin.measure.hllc.HyperLogLogPlusCounter; import org.apache.kylin.metadata.datatype.DataType; +import org.apache.kylin.metadata.model.FunctionDesc; import org.apache.kylin.metadata.model.MeasureDesc; import org.apache.kylin.metadata.model.TblColRef; import org.slf4j.Logger; @@ -196,41 +197,34 @@ public class CubeStatsReader { */ private static double estimateCuboidStorageSize(CubeSegment cubeSegment, long cuboidId, long rowCount, long baseCuboidId, List<Integer> rowKeyColumnLength) { - int bytesLength = cubeSegment.getRowKeyPreambleSize(); + int rowkeyLength = cubeSegment.getRowKeyPreambleSize(); KylinConfig kylinConf = cubeSegment.getConfig(); long mask = Long.highestOneBit(baseCuboidId); long parentCuboidIdActualLength = Long.SIZE - Long.numberOfLeadingZeros(baseCuboidId); for (int i = 0; i < parentCuboidIdActualLength; i++) { if ((mask & cuboidId) > 0) { - bytesLength += rowKeyColumnLength.get(i); //colIO.getColumnLength(columnList.get(i)); + rowkeyLength += rowKeyColumnLength.get(i); //colIO.getColumnLength(columnList.get(i)); } mask = mask >> 1; } // add the measure length - int space = 0; - boolean isMemoryHungry = false; + int normalSpace = rowkeyLength; + int countDistinctSpace = 0; for (MeasureDesc measureDesc : cubeSegment.getCubeDesc().getMeasures()) { - if 
(measureDesc.getFunction().getMeasureType().isMemoryHungry()) { - isMemoryHungry = true; - } DataType returnType = measureDesc.getFunction().getReturnDataType(); - space += returnType.getStorageBytesEstimate(); - } - bytesLength += space; - - double ret = 1.0 * bytesLength * rowCount / (1024L * 1024L); - if (isMemoryHungry) { - double cuboidSizeMemHungryRatio = kylinConf.getJobCuboidSizeMemHungryRatio(); - logger.info("Cube is memory hungry, storage size estimation multiply " + cuboidSizeMemHungryRatio); - ret *= cuboidSizeMemHungryRatio; - } else { - double cuboidSizeRatio = kylinConf.getJobCuboidSizeRatio(); - logger.info("Cube is not memory hungry, storage size estimation multiply " + cuboidSizeRatio); - ret *= cuboidSizeRatio; + if (measureDesc.getFunction().getExpression().equals(FunctionDesc.FUNC_COUNT_DISTINCT)) { + countDistinctSpace += returnType.getStorageBytesEstimate(); + } else { + normalSpace += returnType.getStorageBytesEstimate(); + } } - logger.info("Cuboid " + cuboidId + " has " + rowCount + " rows, each row size is " + bytesLength + " bytes." + " Total size is " + ret + "M."); + + double cuboidSizeRatio = kylinConf.getJobCuboidSizeRatio(); + double cuboidSizeMemHungryRatio = kylinConf.getJobCuboidSizeCountDistinctRatio(); + double ret = (1.0 * normalSpace * rowCount * cuboidSizeRatio + 1.0 * countDistinctSpace * rowCount * cuboidSizeMemHungryRatio) / (1024L * 1024L); + logger.info("Cuboid " + cuboidId + " has " + rowCount + " rows, each row size is " + (normalSpace + countDistinctSpace) + " bytes." + " Total size is " + ret + "M."); return ret; }
