KYLIN-976 Support Custom Aggregation Types
Project: http://git-wip-us.apache.org/repos/asf/kylin/repo Commit: http://git-wip-us.apache.org/repos/asf/kylin/commit/c721d679 Tree: http://git-wip-us.apache.org/repos/asf/kylin/tree/c721d679 Diff: http://git-wip-us.apache.org/repos/asf/kylin/diff/c721d679 Branch: refs/heads/1.x-staging Commit: c721d679727b0345a7fcdd8335a088e7c336aacc Parents: cdea731 Author: honma <ho...@ebay.com> Authored: Fri Dec 11 16:08:59 2015 +0800 Committer: honma <ho...@ebay.com> Committed: Tue Dec 15 16:50:58 2015 +0800 ---------------------------------------------------------------------- .../org/apache/kylin/common/KylinConfig.java | 40 +- .../common/hll/HyperLogLogPlusCounter.java | 87 +++- .../org/apache/kylin/common/util/BytesUtil.java | 16 + .../apache/kylin/common/util/Dictionary.java | 199 +++++++++ .../kylin/cube/CubeCapabilityChecker.java | 147 ++++--- .../org/apache/kylin/cube/CubeInstance.java | 20 +- .../java/org/apache/kylin/cube/CubeManager.java | 6 +- .../java/org/apache/kylin/cube/CubeSegment.java | 14 +- .../cube/estimation/CubeSizeEstimationCLI.java | 4 +- .../kylin/cube/kv/AbstractRowKeyEncoder.java | 2 +- .../apache/kylin/cube/kv/RowKeyColumnIO.java | 2 +- .../apache/kylin/cube/kv/RowKeyColumnOrder.java | 2 +- .../apache/kylin/cube/kv/RowValueDecoder.java | 27 +- .../org/apache/kylin/cube/model/CubeDesc.java | 61 ++- .../apache/kylin/cube/model/v1/CubeDesc.java | 9 - .../model/validation/rule/FunctionRule.java | 33 +- .../kylin/cube/DictionaryManagerTest.java | 2 +- .../kylin/cube/kv/RowValueDecoderTest.java | 6 +- .../metadata/measure/MeasureCodecTest.java | 3 +- cube/src/test/resources/data/TEST1_desc.json | 7 +- cube/src/test/resources/data/TEST2_desc.json | 7 +- cube/src/test/resources/data/TEST3_desc.json | 418 +++++++++++-------- .../apache/kylin/dict/DateStrDictionary.java | 1 + .../java/org/apache/kylin/dict/Dictionary.java | 199 --------- .../apache/kylin/dict/DictionaryGenerator.java | 91 ++-- .../org/apache/kylin/dict/DictionaryInfo.java | 1 + .../kylin/dict/DictionaryInfoSerializer.java | 1 + .../apache/kylin/dict/DictionaryManager.java | 1 + .../java/org/apache/kylin/dict/ISegment.java | 3 +- .../dict/MultipleDictionaryValueEnumerator.java | 1 + .../org/apache/kylin/dict/TrieDictionary.java | 1 + .../apache/kylin/dict/lookup/SnapshotTable.java | 2 +- .../test_kylin_cube_without_slr_desc.json | 7 +- ...t_kylin_cube_without_slr_left_join_desc.json | 7 +- .../invertedindex/IICapabilityChecker.java | 182 ++++++++ .../apache/kylin/invertedindex/IIInstance.java | 16 +- .../apache/kylin/invertedindex/IIManager.java | 2 +- .../apache/kylin/invertedindex/IISegment.java | 4 +- .../invertedindex/index/BitMapContainer.java | 2 +- .../index/CompressedValueContainer.java | 2 +- .../invertedindex/index/RawTableRecord.java | 2 +- .../invertedindex/index/TableRecordInfo.java | 2 +- .../index/TableRecordInfoDigest.java | 2 +- .../kylin/invertedindex/model/IIDesc.java | 4 + .../invertedindex/IIInstanceTest.java | 2 +- .../invertedindex/InvertedIndexLocalTest.java | 2 +- .../kylin/job/hadoop/cube/BaseCuboidMapper.java | 76 ++-- .../kylin/job/hadoop/cube/CubeHFileMapper.java | 4 +- .../apache/kylin/job/hadoop/cube/CuboidJob.java | 5 +- .../kylin/job/hadoop/cube/CuboidReducer.java | 6 +- .../job/hadoop/cube/MergeCuboidMapper.java | 57 ++- .../job/hadoop/cube/NewBaseCuboidMapper.java | 346 --------------- .../kylin/job/dataGen/FactTableGenerator.java | 2 +- .../job/hadoop/cube/BaseCuboidMapperTest.java | 7 +- .../job/hadoop/cube/CubeHFileMapper2Test.java | 2 +- .../kylin/job/hadoop/cube/CubeReducerTest.java | 2 +- .../job/hadoop/cube/MergeCuboidMapperTest.java | 2 +- .../job/hadoop/cube/NDCuboidMapperTest.java | 17 +- .../apache/kylin/measure/MeasureAggregator.java | 78 ++++ .../kylin/measure/MeasureAggregators.java | 81 ++++ .../org/apache/kylin/measure/MeasureCodec.java | 79 ++++ .../apache/kylin/measure/MeasureIngester.java | 48 +++ .../org/apache/kylin/measure/MeasureType.java | 146 +++++++ .../kylin/measure/MeasureTypeFactory.java | 93 +++++ .../kylin/measure/basic/BasicMeasureType.java | 152 +++++++ .../kylin/measure/basic/BigDecimalIngester.java | 41 ++ .../measure/basic/BigDecimalMaxAggregator.java | 56 +++ .../measure/basic/BigDecimalMinAggregator.java | 57 +++ .../measure/basic/BigDecimalSumAggregator.java | 53 +++ .../kylin/measure/basic/DoubleIngester.java | 46 ++ .../measure/basic/DoubleMaxAggregator.java | 55 +++ .../measure/basic/DoubleMinAggregator.java | 55 +++ .../measure/basic/DoubleSumAggregator.java | 52 +++ .../kylin/measure/basic/LongIngester.java | 46 ++ .../kylin/measure/basic/LongMaxAggregator.java | 55 +++ .../kylin/measure/basic/LongMinAggregator.java | 55 +++ .../kylin/measure/basic/LongSumAggregator.java | 52 +++ .../kylin/measure/hllc/HLLCAggregator.java | 64 +++ .../kylin/measure/hllc/HLLCMeasureType.java | 127 ++++++ .../kylin/measure/hllc/HLLCSerializer.java | 88 ++++ .../measure/hllc/HLLDistinctCountAggFunc.java | 152 +++++++ .../metadata/datatype/BigDecimalSerializer.java | 105 +++++ .../kylin/metadata/datatype/DataType.java | 285 +++++++++++++ .../metadata/datatype/DataTypeSerializer.java | 90 ++++ .../metadata/datatype/DateTimeSerializer.java | 57 +++ .../metadata/datatype/DoubleSerializer.java | 75 ++++ .../kylin/metadata/datatype/LongSerializer.java | 82 ++++ .../metadata/datatype/StringSerializer.java | 52 +++ .../measure/BigDecimalMaxAggregator.java | 53 --- .../measure/BigDecimalMinAggregator.java | 54 --- .../metadata/measure/BigDecimalSerializer.java | 62 --- .../measure/BigDecimalSumAggregator.java | 50 --- .../metadata/measure/DoubleMaxAggregator.java | 54 --- .../metadata/measure/DoubleMinAggregator.java | 54 --- .../metadata/measure/DoubleSerializer.java | 55 --- .../metadata/measure/DoubleSumAggregator.java | 51 --- .../kylin/metadata/measure/HLLCAggregator.java | 57 --- .../kylin/metadata/measure/HLLCSerializer.java | 67 --- .../kylin/metadata/measure/LDCAggregator.java | 64 --- .../metadata/measure/LongMaxAggregator.java | 54 --- .../metadata/measure/LongMinAggregator.java | 54 --- .../kylin/metadata/measure/LongSerializer.java | 56 --- .../metadata/measure/LongSumAggregator.java | 51 --- .../metadata/measure/MeasureAggregator.java | 102 ----- .../metadata/measure/MeasureAggregators.java | 82 ---- .../kylin/metadata/measure/MeasureCodec.java | 83 ---- .../metadata/measure/MeasureSerializer.java | 68 --- .../measure/fixedlen/FixedHLLCodec.java | 2 +- .../measure/fixedlen/FixedLenMeasureCodec.java | 5 +- .../measure/fixedlen/FixedPointLongCodec.java | 2 +- .../apache/kylin/metadata/model/ColumnDesc.java | 1 + .../apache/kylin/metadata/model/DataType.java | 290 ------------- .../kylin/metadata/model/FunctionDesc.java | 53 ++- .../kylin/metadata/model/MeasureDesc.java | 4 - .../kylin/metadata/model/ParameterDesc.java | 59 ++- .../apache/kylin/metadata/model/TblColRef.java | 1 + .../metadata/realization/CapabilityResult.java | 63 +++ .../metadata/realization/IRealization.java | 15 +- .../metadata/realization/SQLDigestUtil.java | 2 +- .../org/apache/kylin/metadata/tuple/Tuple.java | 164 ++++++++ .../apache/kylin/metadata/tuple/TupleInfo.java | 113 +++++ .../datatype/BigDecimalSerializerTest.java | 69 +++ .../query/enumerator/LookupTableEnumerator.java | 2 +- .../kylin/query/relnode/OLAPAggregateRel.java | 14 +- .../apache/kylin/query/routing/Candidate.java | 100 +++++ .../apache/kylin/query/routing/QueryRouter.java | 63 ++- .../apache/kylin/query/routing/RoutingRule.java | 34 +- .../AdjustForWeeklyMatchedRealization.java | 101 ----- .../routing/RoutingRules/CubesSortRule.java | 67 --- .../RoutingRules/RealizationPriorityRule.java | 60 --- .../RoutingRules/RealizationSortRule.java | 65 --- .../RemoveUncapableRealizationsRule.java | 42 -- .../SimpleQueryMoreColumnsCubeFirstRule.java | 50 --- .../routing/rules/RealizationSortRule.java | 34 ++ .../rules/RemoveUncapableRealizationsRule.java | 44 ++ .../apache/kylin/query/schema/OLAPTable.java | 12 +- .../query/sqlfunc/HLLDistinctCountAggFunc.java | 152 ------- .../apache/kylin/query/test/IIQueryTest.java | 18 +- .../apache/kylin/query/test/KylinQueryTest.java | 2 +- .../apache/kylin/query/test/KylinTestBase.java | 4 +- .../apache/kylin/rest/service/JobService.java | 10 +- .../storage/filter/BitMapFilterEvaluator.java | 2 +- .../kylin/storage/hbase/ColumnValueRange.java | 2 +- .../storage/hbase/CubeSegmentTupleIterator.java | 211 +++++++--- .../kylin/storage/hbase/CubeStorageEngine.java | 24 +- .../kylin/storage/hbase/CubeTupleConverter.java | 139 ++++++ .../hbase/SerializedHBaseTupleIterator.java | 2 +- .../hbase/coprocessor/AggregationCache.java | 7 +- .../hbase/coprocessor/CoprocessorFilter.java | 2 +- .../CoprocessorTupleFilterTranslator.java | 2 +- .../endpoint/EndpointAggregationCache.java | 2 +- .../endpoint/EndpointAggregators.java | 11 +- .../endpoint/EndpointTupleIterator.java | 9 +- .../hbase/coprocessor/endpoint/IIEndpoint.java | 2 +- .../observer/AggregationScanner.java | 2 +- .../observer/ObserverAggregationCache.java | 2 +- .../observer/ObserverAggregators.java | 24 +- .../coprocessor/observer/ObserverEnabler.java | 2 +- .../coprocessor/observer/ObserverTuple.java | 2 +- .../kylin/storage/hybrid/HybridInstance.java | 31 +- .../storage/hybrid/HybridStorageEngine.java | 2 +- .../org/apache/kylin/storage/tuple/Tuple.java | 252 ----------- .../apache/kylin/storage/tuple/TupleInfo.java | 120 ------ .../filter/BitMapFilterEvaluatorTest.java | 2 +- .../kylin/storage/filter/FilterBaseTest.java | 4 +- .../storage/filter/FilterEvaluateTest.java | 2 +- .../kylin/storage/filter/FilterPerfTest.java | 4 +- .../storage/hbase/ColumnValueRangeTest.java | 2 +- .../endpoint/EndpointAggregationTest.java | 2 +- 169 files changed, 4764 insertions(+), 3644 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/kylin/blob/c721d679/common/src/main/java/org/apache/kylin/common/KylinConfig.java ---------------------------------------------------------------------- diff --git a/common/src/main/java/org/apache/kylin/common/KylinConfig.java b/common/src/main/java/org/apache/kylin/common/KylinConfig.java index 7816487..8a123dc 100644 --- a/common/src/main/java/org/apache/kylin/common/KylinConfig.java +++ b/common/src/main/java/org/apache/kylin/common/KylinConfig.java @@ -45,7 +45,7 @@ import com.google.common.collect.Sets; * @author yangli9 */ public class KylinConfig { - + /* * NO NEED TO DEFINE PUBLIC CONSTANTS FOR KEY NAMES! * @@ -272,7 +272,7 @@ public class KylinConfig { // ============================================================================ public boolean isHiveReroutingEnabled() { - return Boolean.parseBoolean(getOptional("kylin.route.hive.enabled", "false")); + return Boolean.parseBoolean(getOptional("kylin.route.hive.enabled", "false")); } public String getHiveRerouteUrl() { @@ -336,7 +336,7 @@ public class KylinConfig { } return getFileName(kylinHome + File.separator + "lib", JOB_JAR_NAME_PATTERN); } - + public String getKylinJobMRLibDir() { return getOptional("kylin.job.mr.lib.dir", ""); } @@ -499,7 +499,7 @@ public class KylinConfig { public int getHBaseKeyValueSize() { return Integer.parseInt(this.getOptional("kylin.hbase.client.keyvalue.maxsize", "10485760")); } - + public int getHBaseScanCacheRows() { return Integer.parseInt(this.getOptional("kylin.hbase.scan.cache_rows", "1024")); } @@ -507,7 +507,11 @@ public class KylinConfig { public int getHBaseScanMaxResultSize() { return Integer.parseInt(this.getOptional("kylin.hbase.scan.max_result_size", "" + (5 * 1024 * 1024))); // 5 MB } - + + public boolean isQueryIgnoreUnknownFunction() { + return Boolean.parseBoolean(this.getOptional("kylin.query.ignore_unknown_function", "false")); + } + public String getHbaseDefaultCompressionCodec() { return getOptional(HTABLE_DEFAULT_COMPRESSION_CODEC, ""); @@ -704,26 +708,26 @@ public class KylinConfig { public int getHBaseRegionCountMax() { return Integer.parseInt(getOptional(HBASE_REGION_COUNT_MAX, "500")); } - + public int getHBaseRegionCut(String capacity) { String cut; switch (capacity) { - case "SMALL": - cut = getProperty(HBASE_REGION_CUT_SMALL, "10"); - break; - case "MEDIUM": - cut = getProperty(HBASE_REGION_CUT_MEDIUM, "20"); - break; - case "LARGE": - cut = getProperty(HBASE_REGION_CUT_LARGE, "100"); - break; - default: - throw new IllegalArgumentException("Capacity not recognized: " + capacity); + case "SMALL": + cut = getProperty(HBASE_REGION_CUT_SMALL, "10"); + break; + case "MEDIUM": + cut = getProperty(HBASE_REGION_CUT_MEDIUM, "20"); + break; + case "LARGE": + cut = getProperty(HBASE_REGION_CUT_LARGE, "100"); + break; + default: + throw new IllegalArgumentException("Capacity not recognized: " + capacity); } return Integer.valueOf(cut); } - + public String getProperty(String key, String defaultValue) { return kylinConfig.getString(key, defaultValue); } http://git-wip-us.apache.org/repos/asf/kylin/blob/c721d679/common/src/main/java/org/apache/kylin/common/hll/HyperLogLogPlusCounter.java ---------------------------------------------------------------------- diff --git a/common/src/main/java/org/apache/kylin/common/hll/HyperLogLogPlusCounter.java b/common/src/main/java/org/apache/kylin/common/hll/HyperLogLogPlusCounter.java index 5ee323e..d37d6db 100644 --- a/common/src/main/java/org/apache/kylin/common/hll/HyperLogLogPlusCounter.java +++ b/common/src/main/java/org/apache/kylin/common/hll/HyperLogLogPlusCounter.java @@ -6,19 +6,20 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - */ +*/ package org.apache.kylin.common.hll; import java.io.IOException; +import java.io.Serializable; import java.nio.ByteBuffer; import java.nio.charset.Charset; import java.util.Arrays; @@ -30,13 +31,14 @@ import com.google.common.hash.Hashing; /** * About compression, test on HLLC data shows - * + * * - LZF compression ratio is around 65%-80%, fast * - GZIP compression ratio is around 41%-46%, very slow - * + * * @author yangli9 */ -public class HyperLogLogPlusCounter implements Comparable<HyperLogLogPlusCounter> { +@SuppressWarnings("serial") +public class HyperLogLogPlusCounter implements Serializable, Comparable<HyperLogLogPlusCounter> { private final int p; private final int m; @@ -108,10 +110,6 @@ public class HyperLogLogPlusCounter implements Comparable<HyperLogLogPlusCounter return new HLLCSnapshot(this).getCountEstimate(); } - public int getMemBytes() { - return 12 + m; - } - public double getErrorRate() { return 1.04 / Math.sqrt(m); } @@ -125,6 +123,11 @@ public class HyperLogLogPlusCounter implements Comparable<HyperLogLogPlusCounter return size; } + @Override + public String toString() { + return "" + getCountEstimate(); + } + // ============================================================================ // a memory efficient snapshot of HLL registers which can yield count @@ -176,7 +179,7 @@ public class HyperLogLogPlusCounter implements Comparable<HyperLogLogPlusCounter // decide output scheme -- map (3*size bytes) or array (2^p bytes) byte scheme; - if ((indexLen + 1) * size < m) + if (5 + (indexLen + 1) * size < m) // 5 is max len of vint scheme = 0; // map else scheme = 1; // array @@ -186,15 +189,14 @@ public class HyperLogLogPlusCounter implements Comparable<HyperLogLogPlusCounter BytesUtil.writeVInt(size, out); for (int i = 0; i < m; i++) { if (registers[i] > 0) { - BytesUtil.writeUnsigned(i, indexLen, out); + writeUnsigned(i, indexLen, out); out.put(registers[i]); } } - } else { // array scheme - for (int i = 0; i < m; i++) { - out.put(registers[i]); - } - } + } else if (scheme == 1) { // array scheme + out.put(registers); + } else + throw new IllegalStateException(); } public void readRegisters(ByteBuffer in) throws IOException { @@ -207,12 +209,34 @@ public class HyperLogLogPlusCounter implements Comparable<HyperLogLogPlusCounter throw new IllegalArgumentException("register size (" + size + ") cannot be larger than m (" + m + ")"); int indexLen = getRegisterIndexSize(); for (int i = 0; i < size; i++) { - int key = BytesUtil.readUnsigned(in, indexLen); + int key = readUnsigned(in, indexLen); registers[key] = in.get(); } - } else { // array scheme + } else if (scheme == 1) { // array scheme in.get(registers); + } else + throw new IllegalStateException(); + } + + public int peekLength(ByteBuffer in) { + int mark = in.position(); + int len; + + byte scheme = in.get(); + if (scheme == 0) { // map scheme + int size = BytesUtil.readVInt(in); + int indexLen = getRegisterIndexSize(); + len = in.position() - mark + (indexLen + 1) * size; + } else { + len = in.position() - mark + m; } + + in.position(mark); + return len; + } + + public int maxLength() { + return 1 + m; } public void writeRegistersArray(final ByteBuffer out) { @@ -288,4 +312,29 @@ public class HyperLogLogPlusCounter implements Comparable<HyperLogLogPlusCounter System.out.println("HLLC" + p + ",\t" + size + " bytes,\t68% err<" + er + "%" + ",\t95% err<" + er2 + "%" + ",\t99.7% err<" + er3 + "%"); } } + + /** + * + * @param num + * @param size + * @param out + */ + public static void writeUnsigned(int num, int size, ByteBuffer out) { + for (int i = 0; i < size; i++) { + out.put((byte) num); + num >>>= 8; + } + } + + public static int readUnsigned(ByteBuffer in, int size) { + int integer = 0; + int mask = 0xff; + int shift = 0; + for (int i = 0; i < size; i++) { + integer |= (in.get() << shift) & mask; + mask = mask << 8; + shift += 8; + } + return integer; + } } http://git-wip-us.apache.org/repos/asf/kylin/blob/c721d679/common/src/main/java/org/apache/kylin/common/util/BytesUtil.java ---------------------------------------------------------------------- diff --git a/common/src/main/java/org/apache/kylin/common/util/BytesUtil.java b/common/src/main/java/org/apache/kylin/common/util/BytesUtil.java index 54e5871..03da261 100644 --- a/common/src/main/java/org/apache/kylin/common/util/BytesUtil.java +++ b/common/src/main/java/org/apache/kylin/common/util/BytesUtil.java @@ -230,6 +230,8 @@ public class BytesUtil { return bytes == null ? null : Bytes.toString(bytes); } + + public static void writeAsciiString(String str, ByteBuffer out) { if (str == null) { BytesUtil.writeVInt(-1, out); @@ -320,6 +322,20 @@ public class BytesUtil { return array; } + + public static int peekByteArrayLength(ByteBuffer in) { + int start = in.position(); + int arrayLen = readVInt(in); + int sizeLen = in.position() - start; + in.position(start); + + if (arrayLen < 0) + return sizeLen; + else + return sizeLen + arrayLen; + } + + public static void writeBooleanArray(boolean[] array, ByteBuffer out) { if (array == null) { writeVInt(-1, out); http://git-wip-us.apache.org/repos/asf/kylin/blob/c721d679/common/src/main/java/org/apache/kylin/common/util/Dictionary.java ---------------------------------------------------------------------- diff --git a/common/src/main/java/org/apache/kylin/common/util/Dictionary.java b/common/src/main/java/org/apache/kylin/common/util/Dictionary.java new file mode 100644 index 0000000..0168609 --- /dev/null +++ b/common/src/main/java/org/apache/kylin/common/util/Dictionary.java @@ -0,0 +1,199 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.kylin.common.util; + +import java.io.PrintStream; +import java.io.UnsupportedEncodingException; + +import org.apache.hadoop.io.Writable; +import org.apache.kylin.common.util.BytesUtil; + +/** + * A bi-way dictionary that maps from dimension/column values to IDs and vice + * versa. By storing IDs instead of real values, the size of cube is + * significantly reduced. + * + * - IDs are smallest integers possible for the cardinality of a column, for the + * purpose of minimal storage space - IDs preserve ordering of values, such that + * range query can be applied to IDs directly + * + * A dictionary once built, is immutable. This allows optimal memory footprint + * by e.g. flatten the Trie structure into a byte array, replacing node pointers + * with array offsets. + * + * @author yangli9 + */ +abstract public class Dictionary<T> implements Writable { + + public static final byte NULL = (byte) 0xff; + + // ID with all bit-1 (0xff e.g.) reserved for NULL value + public static final int NULL_ID[] = new int[] { 0, 0xff, 0xffff, 0xffffff, 0xffffff }; + + abstract public int getMinId(); + + abstract public int getMaxId(); + + public int getSize() { + return getMaxId() - getMinId() + 1; + } + + /** + * @return the size of an ID in bytes, determined by the cardinality of + * column + */ + abstract public int getSizeOfId(); + + /** + * @return the (maximum) size of value in bytes, determined by the longest + * value of column + */ + abstract public int getSizeOfValue(); + + /** + * Convenient form of <code>getIdFromValue(value, 0)</code> + */ + final public int getIdFromValue(T value) throws IllegalArgumentException { + return getIdFromValue(value, 0); + } + + /** + * Returns the ID integer of given value. In case of not found + * <p> + * - if roundingFlag=0, throw IllegalArgumentException; <br> + * - if roundingFlag<0, the closest smaller ID integer if exist; <br> + * - if roundingFlag>0, the closest bigger ID integer if exist. <br> + * <p> + * The implementation often has cache, thus faster than the byte[] version getIdFromValueBytes() + * + * @throws IllegalArgumentException + * if value is not found in dictionary and rounding is off or + * failed + */ + final public int getIdFromValue(T value, int roundingFlag) throws IllegalArgumentException { + if (isNullObjectForm(value)) + return nullId(); + else + return getIdFromValueImpl(value, roundingFlag); + } + + protected boolean isNullObjectForm(T value) { + return value == null; + } + + abstract protected int getIdFromValueImpl(T value, int roundingFlag); + + /** + * @return the value corresponds to the given ID + * @throws IllegalArgumentException + * if ID is not found in dictionary + */ + final public T getValueFromId(int id) { + if (isNullId(id)) + return null; + else + return getValueFromIdImpl(id); + } + + abstract protected T getValueFromIdImpl(int id); + + /** + * Convenient form of + * <code>getIdFromValueBytes(value, offset, len, 0)</code> + */ + final public int getIdFromValueBytes(byte[] value, int offset, int len) { + return getIdFromValueBytes(value, offset, len, 0); + } + + /** + * A lower level API, return ID integer from raw value bytes. In case of not found + * <p> + * - if roundingFlag=0, throw IllegalArgumentException; <br> + * - if roundingFlag<0, the closest smaller ID integer if exist; <br> + * - if roundingFlag>0, the closest bigger ID integer if exist. <br> + * <p> + * Bypassing the cache layer, this could be significantly slower than getIdFromValue(T value). + * + * @throws IllegalArgumentException + * if value is not found in dictionary and rounding is off or failed + */ + final public int getIdFromValueBytes(byte[] value, int offset, int len, int roundingFlag) { + if (isNullByteForm(value, offset, len)) + return nullId(); + else + return getIdFromValueBytesImpl(value, offset, len, roundingFlag); + } + + protected boolean isNullByteForm(byte[] value, int offset, int len) { + return value == null; + } + + abstract protected int getIdFromValueBytesImpl(byte[] value, int offset, int len, int roundingFlag); + + /** + * A lower level API, get byte values from ID, return the number of bytes + * written. Bypassing the cache layer, this could be significantly slower + * than getIdFromValue(T value). + * + * @return size of value bytes, 0 if empty string, -1 if null + * + * @throws IllegalArgumentException + * if ID is not found in dictionary + */ + final public int getValueBytesFromId(int id, byte[] returnValue, int offset) { + if (isNullId(id)) + return -1; + else + return getValueBytesFromIdImpl(id, returnValue, offset); + } + + abstract protected int getValueBytesFromIdImpl(int id, byte[] returnValue, int offset); + + abstract public void dump(PrintStream out); + + public int nullId() { + return NULL_ID[getSizeOfId()]; + } + + public boolean isNullId(int id) { + int nullId = NULL_ID[getSizeOfId()]; + return (nullId & id) == nullId; + } + + /** utility that converts a dictionary ID to string, preserving order */ + public static String dictIdToString(byte[] idBytes, int offset, int length) { + try { + return new String(idBytes, offset, length, "ISO-8859-1"); + } catch (UnsupportedEncodingException e) { + // never happen + return null; + } + } + + /** the reverse of dictIdToString(), returns integer ID */ + public static int stringToDictId(String str) { + try { + byte[] bytes = str.getBytes("ISO-8859-1"); + return BytesUtil.readUnsigned(bytes, 0, bytes.length); + } catch (UnsupportedEncodingException e) { + // never happen + return 0; + } + } +} http://git-wip-us.apache.org/repos/asf/kylin/blob/c721d679/cube/src/main/java/org/apache/kylin/cube/CubeCapabilityChecker.java ---------------------------------------------------------------------- diff --git a/cube/src/main/java/org/apache/kylin/cube/CubeCapabilityChecker.java b/cube/src/main/java/org/apache/kylin/cube/CubeCapabilityChecker.java index e713774..5668030 100644 --- a/cube/src/main/java/org/apache/kylin/cube/CubeCapabilityChecker.java +++ b/cube/src/main/java/org/apache/kylin/cube/CubeCapabilityChecker.java @@ -6,84 +6,113 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - */ +*/ package org.apache.kylin.cube; import java.util.ArrayList; import java.util.Collection; +import java.util.HashSet; +import java.util.Iterator; import java.util.List; +import java.util.Set; import org.apache.kylin.cube.model.CubeDesc; import org.apache.kylin.cube.model.DimensionDesc; +import org.apache.kylin.measure.MeasureType; +import org.apache.kylin.measure.basic.BasicMeasureType; import org.apache.kylin.metadata.model.FunctionDesc; import org.apache.kylin.metadata.model.JoinDesc; +import org.apache.kylin.metadata.model.MeasureDesc; import org.apache.kylin.metadata.model.TblColRef; +import org.apache.kylin.metadata.realization.CapabilityResult; +import org.apache.kylin.metadata.realization.CapabilityResult.CapabilityInfluence; import org.apache.kylin.metadata.realization.SQLDigest; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.google.common.collect.Sets; + /** - * Created by Hongbin Ma(Binmahone) on 1/8/15. */ public class CubeCapabilityChecker { private static final Logger logger = LoggerFactory.getLogger(CubeCapabilityChecker.class); - public static boolean check(CubeInstance cube, SQLDigest digest, boolean allowWeekMatch) { + public static CapabilityResult check(CubeInstance cube, SQLDigest digest) { + CapabilityResult result = new CapabilityResult(); + result.capable = false; - // retrieve members from olapContext - Collection<TblColRef> dimensionColumns = CubeDimensionDeriver.getDimensionColumns(digest.groupbyColumns, digest.filterColumns); - Collection<FunctionDesc> functions = digest.aggregations; - Collection<TblColRef> metricsColumns = digest.metricColumns; - Collection<JoinDesc> joins = digest.joinDescs; + // match joins + boolean isJoinMatch = isJoinMatch(digest.joinDescs, cube); + if (!isJoinMatch) { + logger.info("Exclude cube " + cube.getName() + " because unmatched joins"); + return result; + } - // match dimensions & aggregations & joins + // dimensions & measures + Collection<TblColRef> dimensionColumns = getDimensionColumns(digest); + Collection<FunctionDesc> aggrFunctions = digest.aggregations; + Collection<TblColRef> unmatchedDimensions = unmatchedDimensions(dimensionColumns, cube); + Collection<FunctionDesc> unmatchedAggregations = unmatchedAggregations(aggrFunctions, cube); - boolean isOnline = cube.isReady(); + // try custom measure types + if (!unmatchedDimensions.isEmpty() || !unmatchedAggregations.isEmpty()) { + tryCustomMeasureTypes(unmatchedDimensions, unmatchedAggregations, digest, cube, result); + } - boolean matchDimensions = isMatchedWithDimensions(dimensionColumns, cube); - boolean matchAggregation = isMatchedWithAggregations(functions, cube); - boolean matchJoin = isMatchedWithJoins(joins, cube); + // try dimension-as-measure + if (!unmatchedAggregations.isEmpty()) { + tryDimensionAsMeasures(unmatchedAggregations, digest, cube, result); + } - // Some cubes are not "perfectly" match, but still save them in case of usage - if (allowWeekMatch && isOnline && matchDimensions && !matchAggregation && matchJoin) { - // sometimes metrics are indeed dimensions - // e.g. select min(cal_dt) from ..., where cal_dt is actually a dimension - if (isWeaklyMatchedWithAggregations(functions, metricsColumns, cube)) { - logger.info("Weakly matched cube found " + cube.getName()); - return true; - } + if (!unmatchedDimensions.isEmpty()) { + logger.info("Exclude cube " + cube.getName() + " because unmatched dimensions"); + return result; } - if (!isOnline || !matchDimensions || !matchAggregation || !matchJoin) { - logger.info("Exclude cube " + cube.getName() + " because " + " isOnlne=" + isOnline + ",matchDimensions=" + matchDimensions + ",matchAggregation=" + matchAggregation + ",matchJoin=" + matchJoin); - return false; + if (!unmatchedAggregations.isEmpty()) { + logger.info("Exclude cube " + cube.getName() + " because unmatched aggregations"); + return result; } - return true; + // cost will be minded by caller + result.capable = true; + return result; } - private static boolean isMatchedWithDimensions(Collection<TblColRef> dimensionColumns, CubeInstance cube) { + private static Collection<TblColRef> getDimensionColumns(SQLDigest sqlDigest) { + Collection<TblColRef> groupByColumns = sqlDigest.groupbyColumns; + Collection<TblColRef> filterColumns = sqlDigest.filterColumns; + + Collection<TblColRef> dimensionColumns = new HashSet<TblColRef>(); + dimensionColumns.addAll(groupByColumns); + dimensionColumns.addAll(filterColumns); + return dimensionColumns; + } + + private static Set<TblColRef> unmatchedDimensions(Collection<TblColRef> dimensionColumns, CubeInstance cube) { + HashSet<TblColRef> result = Sets.newHashSet(dimensionColumns); CubeDesc cubeDesc = cube.getDescriptor(); - boolean matchAgg = cubeDesc.listDimensionColumnsIncludingDerived().containsAll(dimensionColumns); - return matchAgg; + result.removeAll(cubeDesc.listDimensionColumnsIncludingDerived()); + return result; } - private static boolean isMatchedWithAggregations(Collection<FunctionDesc> aggregations, CubeInstance cube) { + private static Set<FunctionDesc> unmatchedAggregations(Collection<FunctionDesc> aggregations, CubeInstance cube) { + HashSet<FunctionDesc> result = Sets.newHashSet(aggregations); CubeDesc cubeDesc = cube.getDescriptor(); - boolean matchAgg = cubeDesc.listAllFunctions().containsAll(aggregations); - return matchAgg; + result.removeAll(cubeDesc.listAllFunctions()); + return result; } - private static boolean isMatchedWithJoins(Collection<JoinDesc> joins, CubeInstance cube) { + private static boolean isJoinMatch(Collection<JoinDesc> joins, CubeInstance cube) { CubeDesc cubeDesc = cube.getDescriptor(); List<JoinDesc> cubeJoins = new ArrayList<JoinDesc>(cubeDesc.getDimensions().size()); @@ -118,30 +147,50 @@ public class CubeCapabilityChecker { return true; } - private static boolean isWeaklyMatchedWithAggregations(Collection<FunctionDesc> aggregations, Collection<TblColRef> metricColumns, CubeInstance cube) { + private static void tryDimensionAsMeasures(Collection<FunctionDesc> unmatchedAggregations, SQLDigest digest, CubeInstance cube, CapabilityResult result) { CubeDesc cubeDesc = cube.getDescriptor(); Collection<FunctionDesc> cubeFuncs = cubeDesc.listAllFunctions(); - boolean matched = true; - for (FunctionDesc functionDesc : aggregations) { - if (cubeFuncs.contains(functionDesc)) + Iterator<FunctionDesc> it = unmatchedAggregations.iterator(); + while (it.hasNext()) { + FunctionDesc functionDesc = it.next(); + + if (cubeFuncs.contains(functionDesc)) { + it.remove(); continue; + } - // only inverted-index cube does not have count, and let calcite handle in this case - if (functionDesc.isCount()) + // let calcite handle count + if (functionDesc.isCount()) { + it.remove(); continue; + } - if (functionDesc.isCountDistinct()) // calcite can not handle distinct count - matched = false; + // calcite can do aggregation from columns on-the-fly + List<TblColRef> neededCols = functionDesc.getParameter().getColRefs(); + if (neededCols.size() > 0 && cubeDesc.listDimensionColumnsIncludingDerived().containsAll(neededCols)) { + result.influences.add(new CapabilityResult.DimensionAsMeasure(functionDesc)); + it.remove(); + continue; + } + } + } - TblColRef col = null; - if (functionDesc.getParameter().getColRefs().size() > 0) - col = functionDesc.getParameter().getColRefs().get(0); + // custom measure types can cover unmatched dimensions or measures + private static void tryCustomMeasureTypes(Collection<TblColRef> unmatchedDimensions, Collection<FunctionDesc> unmatchedAggregations, SQLDigest digest, CubeInstance cube, CapabilityResult result) { + CubeDesc cubeDesc = cube.getDescriptor(); + for (MeasureDesc measure : cubeDesc.getMeasures()) { + if (unmatchedDimensions.isEmpty() && unmatchedAggregations.isEmpty()) + break; - if (col == null || !cubeDesc.listDimensionColumnsIncludingDerived().contains(col)) { - matched = false; - } + MeasureType<?> measureType = measure.getFunction().getMeasureType(); + if (measureType instanceof BasicMeasureType) + continue; + + CapabilityInfluence inf = measureType.influenceCapabilityCheck(unmatchedDimensions, unmatchedAggregations, digest, measure); + if (inf != null) + result.influences.add(inf); } - return matched; } + } http://git-wip-us.apache.org/repos/asf/kylin/blob/c721d679/cube/src/main/java/org/apache/kylin/cube/CubeInstance.java ---------------------------------------------------------------------- diff --git a/cube/src/main/java/org/apache/kylin/cube/CubeInstance.java b/cube/src/main/java/org/apache/kylin/cube/CubeInstance.java index 81a64e8..7773351 100644 --- a/cube/src/main/java/org/apache/kylin/cube/CubeInstance.java +++ b/cube/src/main/java/org/apache/kylin/cube/CubeInstance.java @@ -28,6 +28,7 @@ import org.apache.kylin.cube.model.CubeDesc; import org.apache.kylin.metadata.model.MeasureDesc; import org.apache.kylin.metadata.model.SegmentStatusEnum; import org.apache.kylin.metadata.model.TblColRef; +import org.apache.kylin.metadata.realization.CapabilityResult; import org.apache.kylin.metadata.realization.IRealization; import org.apache.kylin.metadata.realization.RealizationStatusEnum; import org.apache.kylin.metadata.realization.RealizationType; @@ -321,12 +322,20 @@ public class CubeInstance extends RootPersistentEntity implements IRealization { } @Override - public boolean isCapable(SQLDigest digest) { - return CubeCapabilityChecker.check(this, digest, true); + public CapabilityResult isCapable(SQLDigest digest) { + CapabilityResult result = CubeCapabilityChecker.check(this, digest); + if (result.capable) { + result.cost = getCost(digest); + for (CapabilityResult.CapabilityInfluence i : result.influences) { + result.cost *= (i.suggestCostMultiplier() == 0) ? 1.0 : i.suggestCostMultiplier(); + } + } else { + result.cost = -1; + } + return result; } - @Override - public int getCost(SQLDigest digest) { + private int getCost(SQLDigest digest) { return cost; } @@ -353,9 +362,6 @@ public class CubeInstance extends RootPersistentEntity implements IRealization { if (!this.getDescriptor().getModel().getPartitionDesc().isPartitioned()) return false; - if (this.getDescriptor().hasHolisticCountDistinctMeasures()) - return false; - return this.getDescriptor().getAutoMergeTimeRanges() != null && this.getDescriptor().getAutoMergeTimeRanges().length > 0; } http://git-wip-us.apache.org/repos/asf/kylin/blob/c721d679/cube/src/main/java/org/apache/kylin/cube/CubeManager.java ---------------------------------------------------------------------- diff --git a/cube/src/main/java/org/apache/kylin/cube/CubeManager.java b/cube/src/main/java/org/apache/kylin/cube/CubeManager.java index b3eeeef..d3bbf59 100644 --- a/cube/src/main/java/org/apache/kylin/cube/CubeManager.java +++ b/cube/src/main/java/org/apache/kylin/cube/CubeManager.java @@ -36,11 +36,11 @@ import org.apache.kylin.common.persistence.ResourceStore; import org.apache.kylin.common.persistence.Serializer; import org.apache.kylin.common.restclient.Broadcaster; import org.apache.kylin.common.restclient.CaseInsensitiveStringCache; +import org.apache.kylin.common.util.Dictionary; import org.apache.kylin.common.util.Pair; import org.apache.kylin.cube.cuboid.Cuboid; import org.apache.kylin.cube.model.CubeDesc; import org.apache.kylin.cube.model.DimensionDesc; -import org.apache.kylin.dict.Dictionary; import org.apache.kylin.dict.DictionaryInfo; import org.apache.kylin.dict.DictionaryManager; import org.apache.kylin.dict.lookup.HiveTable; @@ -176,7 +176,7 @@ public class CubeManager implements IRealizationProvider { /** * return null if no dictionary for given column */ - public Dictionary<?> getDictionary(CubeSegment cubeSeg, TblColRef col) { + public Dictionary<String> getDictionary(CubeSegment cubeSeg, TblColRef col) { DictionaryInfo info = null; try { DictionaryManager dictMgr = getDictionaryManager(); @@ -193,7 +193,7 @@ public class CubeManager implements IRealizationProvider { throw new IllegalStateException("Failed to get dictionary for cube segment" + cubeSeg + ", col" + col, e); } - return info.getDictionaryObject(); + return (Dictionary<String>) info.getDictionaryObject(); } public SnapshotTable buildSnapshotTable(CubeSegment cubeSeg, String lookupTable) throws IOException { http://git-wip-us.apache.org/repos/asf/kylin/blob/c721d679/cube/src/main/java/org/apache/kylin/cube/CubeSegment.java ---------------------------------------------------------------------- diff --git a/cube/src/main/java/org/apache/kylin/cube/CubeSegment.java b/cube/src/main/java/org/apache/kylin/cube/CubeSegment.java index 5db3251..12f0bf7 100644 --- a/cube/src/main/java/org/apache/kylin/cube/CubeSegment.java +++ b/cube/src/main/java/org/apache/kylin/cube/CubeSegment.java @@ -24,8 +24,9 @@ import java.util.Map; import java.util.TimeZone; import java.util.concurrent.ConcurrentHashMap; +import com.google.common.collect.Maps; import org.apache.kylin.cube.model.CubeDesc; -import org.apache.kylin.dict.Dictionary; +import org.apache.kylin.common.util.Dictionary; import org.apache.kylin.dict.ISegment; import org.apache.kylin.metadata.model.SegmentStatusEnum; import org.apache.kylin.metadata.model.TblColRef; @@ -261,10 +262,19 @@ public class CubeSegment implements Comparable<CubeSegment>, ISegment { } @Override - public Dictionary<?> getDictionary(TblColRef col) { + public Dictionary<String> getDictionary(TblColRef col) { return CubeManager.getInstance(this.getCubeInstance().getConfig()).getDictionary(this, col); } + public Map<TblColRef, Dictionary<String>> buildDictionaryMap() { + Map<TblColRef, Dictionary<String>> result = Maps.newHashMap(); + for (TblColRef col : getCubeDesc().getAllColumnsNeedDictionary()) { + result.put(col, (Dictionary<String>) getDictionary(col)); + } + return result; + } + + public void validate() { if (cubeInstance.getDescriptor().getModel().getPartitionDesc().isPartitioned() && dateRangeStart >= dateRangeEnd) throw new IllegalStateException("dateRangeStart(" + dateRangeStart + ") must be greater than dateRangeEnd(" + dateRangeEnd + ") in segment " + this); http://git-wip-us.apache.org/repos/asf/kylin/blob/c721d679/cube/src/main/java/org/apache/kylin/cube/estimation/CubeSizeEstimationCLI.java ---------------------------------------------------------------------- diff --git a/cube/src/main/java/org/apache/kylin/cube/estimation/CubeSizeEstimationCLI.java b/cube/src/main/java/org/apache/kylin/cube/estimation/CubeSizeEstimationCLI.java index 1172d66..188b157 100644 --- a/cube/src/main/java/org/apache/kylin/cube/estimation/CubeSizeEstimationCLI.java +++ b/cube/src/main/java/org/apache/kylin/cube/estimation/CubeSizeEstimationCLI.java @@ -34,7 +34,7 @@ import org.apache.kylin.cube.model.DimensionDesc; import org.apache.kylin.cube.model.HierarchyDesc; import org.apache.kylin.cube.model.RowKeyColDesc; import org.apache.kylin.cube.model.RowKeyDesc; -import org.apache.kylin.metadata.model.DataType; +import org.apache.kylin.metadata.datatype.DataType; import org.apache.kylin.metadata.model.MeasureDesc; /** @@ -140,7 +140,7 @@ public class CubeSizeEstimationCLI { int space = 0; for (MeasureDesc measureDesc : cubeDesc.getMeasures()) { DataType returnType = measureDesc.getFunction().getReturnDataType(); - space += returnType.getSpaceEstimate(); + space += returnType.getStorageBytesEstimate(); } return space; } http://git-wip-us.apache.org/repos/asf/kylin/blob/c721d679/cube/src/main/java/org/apache/kylin/cube/kv/AbstractRowKeyEncoder.java ---------------------------------------------------------------------- diff --git a/cube/src/main/java/org/apache/kylin/cube/kv/AbstractRowKeyEncoder.java b/cube/src/main/java/org/apache/kylin/cube/kv/AbstractRowKeyEncoder.java index c923b33..532950b 100644 --- a/cube/src/main/java/org/apache/kylin/cube/kv/AbstractRowKeyEncoder.java +++ b/cube/src/main/java/org/apache/kylin/cube/kv/AbstractRowKeyEncoder.java @@ -22,7 +22,7 @@ import java.util.Map; import org.apache.kylin.cube.CubeSegment; import org.apache.kylin.cube.cuboid.Cuboid; -import org.apache.kylin.dict.Dictionary; +import org.apache.kylin.common.util.Dictionary; import org.apache.kylin.metadata.model.TblColRef; import org.slf4j.Logger; import org.slf4j.LoggerFactory; http://git-wip-us.apache.org/repos/asf/kylin/blob/c721d679/cube/src/main/java/org/apache/kylin/cube/kv/RowKeyColumnIO.java ---------------------------------------------------------------------- diff --git a/cube/src/main/java/org/apache/kylin/cube/kv/RowKeyColumnIO.java b/cube/src/main/java/org/apache/kylin/cube/kv/RowKeyColumnIO.java index 1a84abb..00ecd46 100644 --- a/cube/src/main/java/org/apache/kylin/cube/kv/RowKeyColumnIO.java +++ b/cube/src/main/java/org/apache/kylin/cube/kv/RowKeyColumnIO.java @@ -22,7 +22,7 @@ import java.util.Arrays; import org.apache.kylin.common.util.Bytes; import org.apache.kylin.common.util.BytesUtil; -import org.apache.kylin.dict.Dictionary; +import org.apache.kylin.common.util.Dictionary; import org.apache.kylin.dict.ISegment; import org.apache.kylin.metadata.model.TblColRef; import org.slf4j.Logger; http://git-wip-us.apache.org/repos/asf/kylin/blob/c721d679/cube/src/main/java/org/apache/kylin/cube/kv/RowKeyColumnOrder.java ---------------------------------------------------------------------- diff --git a/cube/src/main/java/org/apache/kylin/cube/kv/RowKeyColumnOrder.java b/cube/src/main/java/org/apache/kylin/cube/kv/RowKeyColumnOrder.java index 7a64132..64bc813 100644 --- a/cube/src/main/java/org/apache/kylin/cube/kv/RowKeyColumnOrder.java +++ b/cube/src/main/java/org/apache/kylin/cube/kv/RowKeyColumnOrder.java @@ -21,7 +21,7 @@ package org.apache.kylin.cube.kv; import java.util.Collection; import java.util.Comparator; -import org.apache.kylin.metadata.model.DataType; +import org.apache.kylin.metadata.datatype.DataType; /** * @author yangli9 http://git-wip-us.apache.org/repos/asf/kylin/blob/c721d679/cube/src/main/java/org/apache/kylin/cube/kv/RowValueDecoder.java ---------------------------------------------------------------------- diff --git a/cube/src/main/java/org/apache/kylin/cube/kv/RowValueDecoder.java b/cube/src/main/java/org/apache/kylin/cube/kv/RowValueDecoder.java index f90a88d..5fe4e2e 100644 --- a/cube/src/main/java/org/apache/kylin/cube/kv/RowValueDecoder.java +++ b/cube/src/main/java/org/apache/kylin/cube/kv/RowValueDecoder.java @@ -25,11 +25,9 @@ import java.util.Collection; import java.util.List; import org.apache.hadoop.io.DoubleWritable; -import org.apache.hadoop.io.FloatWritable; -import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.LongWritable; import org.apache.kylin.cube.model.HBaseColumnDesc; -import org.apache.kylin.metadata.measure.MeasureCodec; +import org.apache.kylin.measure.MeasureCodec; import org.apache.kylin.metadata.model.FunctionDesc; import org.apache.kylin.metadata.model.MeasureDesc; @@ -80,14 +78,19 @@ public class RowValueDecoder implements Cloneable { for (int i = 0; i < mapredObjs.length; i++) { Object o = mapredObjs[i]; + // if (o instanceof LongWritable) + // o = ((LongWritable) o).get(); + // else if (o instanceof IntWritable) + // o = ((IntWritable) o).get(); + // else if (o instanceof DoubleWritable) + // o = ((DoubleWritable) o).get(); + // else if (o instanceof FloatWritable) + // o = ((FloatWritable) o).get(); + if (o instanceof LongWritable) o = ((LongWritable) o).get(); - else if (o instanceof IntWritable) - o = ((IntWritable) o).get(); else if (o instanceof DoubleWritable) o = ((DoubleWritable) o).get(); - else if (o instanceof FloatWritable) - o = ((FloatWritable) o).get(); results[i] = o; } @@ -117,22 +120,20 @@ public class RowValueDecoder implements Cloneable { return measures; } - public boolean hasMemHungryCountDistinct() { + public boolean hasMemHungryMeasures() { for (int i = projectionIndex.nextSetBit(0); i >= 0; i = projectionIndex.nextSetBit(i + 1)) { FunctionDesc func = measures[i].getFunction(); - if (func.isCountDistinct() && !func.isHolisticCountDistinct()) { + if (func.getMeasureType().isMemoryHungry()) return true; - } } return false; } - public static boolean hasMemHungryCountDistinct(Collection<RowValueDecoder> rowValueDecoders) { + public static boolean hasMemHungryMeasures(Collection<RowValueDecoder> rowValueDecoders) { for (RowValueDecoder decoder : rowValueDecoders) { - if (decoder.hasMemHungryCountDistinct()) + if (decoder.hasMemHungryMeasures()) return true; } return false; } - } http://git-wip-us.apache.org/repos/asf/kylin/blob/c721d679/cube/src/main/java/org/apache/kylin/cube/model/CubeDesc.java ---------------------------------------------------------------------- diff --git a/cube/src/main/java/org/apache/kylin/cube/model/CubeDesc.java b/cube/src/main/java/org/apache/kylin/cube/model/CubeDesc.java index 8fd2c4e..3a9df54 100644 --- a/cube/src/main/java/org/apache/kylin/cube/model/CubeDesc.java +++ b/cube/src/main/java/org/apache/kylin/cube/model/CubeDesc.java @@ -34,7 +34,6 @@ import java.util.Map.Entry; import java.util.Set; import org.apache.commons.lang.ArrayUtils; -import org.apache.commons.lang.StringUtils; import org.apache.commons.net.util.Base64; import org.apache.kylin.common.KylinConfig; import org.apache.kylin.common.persistence.ResourceStore; @@ -42,6 +41,7 @@ import org.apache.kylin.common.persistence.RootPersistentEntity; import org.apache.kylin.common.util.Array; import org.apache.kylin.common.util.CaseInsensitiveStringMap; import org.apache.kylin.common.util.JsonUtil; +import org.apache.kylin.measure.MeasureType; import org.apache.kylin.metadata.MetadataConstants; import org.apache.kylin.metadata.MetadataManager; import org.apache.kylin.metadata.model.ColumnDesc; @@ -562,7 +562,7 @@ public class CubeDesc extends RootPersistentEntity { private void initDerivedMap(TblColRef[] hostCols, DeriveType type, DimensionDesc dimension, TblColRef[] derivedCols, String[] extra) { if (hostCols.length == 0 || derivedCols.length == 0) throw new IllegalStateException("host/derived columns must not be empty"); - + // Although FK derives PK automatically, user unaware of this can declare PK as derived dimension explicitly. // In that case, derivedCols[] will contain a FK which is transformed from the PK by initDimensionColRef(). // Must drop FK from derivedCols[] before continue. @@ -641,25 +641,22 @@ public class CubeDesc extends RootPersistentEntity { f.setExpression(f.getExpression().toUpperCase()); f.initReturnDataType(); - ParameterDesc p = f.getParameter(); - p.normalizeColumnValue(); + for (ParameterDesc p = f.getParameter(); p != null; p = p.getNextParameter()) { + p.setValue(p.getValue().toUpperCase()); + } - if (p.isColumnType()) { - ArrayList<TblColRef> colRefs = Lists.newArrayList(); - for (String cName : p.getValue().split("\\s*,\\s*")) { - ColumnDesc sourceColumn = factTable.findColumnByName(cName); + ArrayList<TblColRef> colRefs = Lists.newArrayList(); + for (ParameterDesc p = f.getParameter(); p != null; p = p.getNextParameter()) { + if (p.isColumnType()) { + ColumnDesc sourceColumn = factTable.findColumnByName(p.getValue()); TblColRef colRef = new TblColRef(sourceColumn); colRefs.add(colRef); allColumns.add(colRef); } - if (colRefs.isEmpty() == false) - p.setColRefs(colRefs); } - // verify holistic count distinct as a dependent measure - if (m.isHolisticCountDistinct() && StringUtils.isBlank(m.getDependentMeasureRef())) { - throw new IllegalStateException(m + " is a holistic count distinct but it has no DependentMeasureRef defined!"); - } + f.getParameter().setColRefs(colRefs); + } } @@ -730,15 +727,6 @@ public class CubeDesc extends RootPersistentEntity { } } - public boolean hasHolisticCountDistinctMeasures() { - for (MeasureDesc measure : measures) { - if (measure.getFunction().isHolisticCountDistinct()) { - return true; - } - } - return false; - } - /** * Add error info and thrown exception out * @@ -794,4 +782,31 @@ public class CubeDesc extends RootPersistentEntity { this.autoMergeTimeRanges = autoMergeTimeRanges; } + public List<TblColRef> getAllColumnsNeedDictionary() { + List<TblColRef> result = Lists.newArrayList(); + + for (RowKeyColDesc rowKeyColDesc : rowkey.getRowKeyColumns()) { + TblColRef colRef = rowKeyColDesc.getColRef(); + if (rowkey.isUseDictionary(colRef)) { + result.add(colRef); + } + } + + for (MeasureDesc measure : measures) { + MeasureType<?> aggrType = measure.getFunction().getMeasureType(); + result.addAll(aggrType.getColumnsNeedDictionary(measure.getFunction())); + } + return result; + } + + public boolean hasMemoryHungryMeasures() { + for (MeasureDesc measure : measures) { + if (measure.getFunction().getMeasureType().isMemoryHungry()) { + return true; + } + } + return false; + } + + } http://git-wip-us.apache.org/repos/asf/kylin/blob/c721d679/cube/src/main/java/org/apache/kylin/cube/model/v1/CubeDesc.java ---------------------------------------------------------------------- diff --git a/cube/src/main/java/org/apache/kylin/cube/model/v1/CubeDesc.java b/cube/src/main/java/org/apache/kylin/cube/model/v1/CubeDesc.java index b11056a..41d75c2 100644 --- a/cube/src/main/java/org/apache/kylin/cube/model/v1/CubeDesc.java +++ b/cube/src/main/java/org/apache/kylin/cube/model/v1/CubeDesc.java @@ -718,7 +718,6 @@ public class CubeDesc extends RootPersistentEntity { f.initReturnDataType(); ParameterDesc p = f.getParameter(); - p.normalizeColumnValue(); if (p.isColumnType()) { ArrayList<TblColRef> colRefs = Lists.newArrayList(); @@ -801,14 +800,6 @@ public class CubeDesc extends RootPersistentEntity { } } - public boolean hasHolisticCountDistinctMeasures() { - for (MeasureDesc measure : measures) { - if (measure.getFunction().isHolisticCountDistinct()) { - return true; - } - } - return false; - } /** * Add error info and thrown exception out http://git-wip-us.apache.org/repos/asf/kylin/blob/c721d679/cube/src/main/java/org/apache/kylin/cube/model/validation/rule/FunctionRule.java ---------------------------------------------------------------------- diff --git a/cube/src/main/java/org/apache/kylin/cube/model/validation/rule/FunctionRule.java b/cube/src/main/java/org/apache/kylin/cube/model/validation/rule/FunctionRule.java index a271ab5..d7d9f13 100644 --- a/cube/src/main/java/org/apache/kylin/cube/model/validation/rule/FunctionRule.java +++ b/cube/src/main/java/org/apache/kylin/cube/model/validation/rule/FunctionRule.java @@ -25,14 +25,12 @@ import java.util.List; import java.util.Set; import org.apache.commons.lang.StringUtils; -import org.apache.kylin.common.KylinConfig; import org.apache.kylin.cube.model.CubeDesc; import org.apache.kylin.cube.model.validation.IValidatorRule; import org.apache.kylin.cube.model.validation.ResultLevel; import org.apache.kylin.cube.model.validation.ValidateContext; import org.apache.kylin.metadata.MetadataManager; import org.apache.kylin.metadata.model.ColumnDesc; -import org.apache.kylin.metadata.model.DataType; import org.apache.kylin.metadata.model.FunctionDesc; import org.apache.kylin.metadata.model.MeasureDesc; import org.apache.kylin.metadata.model.ParameterDesc; @@ -94,7 +92,11 @@ public class FunctionRule implements IValidatorRule<CubeDesc> { } else if (StringUtils.equals(FunctionDesc.PARAMTER_TYPE_CONSTANT, type)) { validateCostantParameter(context, cube, value); } - validateReturnType(context, cube, func); + try { + func.getMeasureType().validate(func); + } catch (IllegalArgumentException ex) { + context.addResult(ResultLevel.ERROR, ex.getMessage()); + } if (func.isCount()) countFuncs.add(func); @@ -105,31 +107,6 @@ public class FunctionRule implements IValidatorRule<CubeDesc> { } } - private void validateReturnType(ValidateContext context, CubeDesc cube, FunctionDesc funcDesc) { - - String func = funcDesc.getExpression(); - DataType rtype = funcDesc.getReturnDataType(); - - if (funcDesc.isCount()) { - if (rtype.isIntegerFamily() == false) { - context.addResult(ResultLevel.ERROR, "Return type for function " + func + " must be one of " + DataType.INTEGER_FAMILY); - } - } else if (funcDesc.isCountDistinct()) { - if (rtype.isHLLC() == false && funcDesc.isHolisticCountDistinct() == false) { - context.addResult(ResultLevel.ERROR, "Return type for function " + func + " must be hllc(10), hllc(12) etc."); - } - } else if (funcDesc.isMax() || funcDesc.isMin() || funcDesc.isSum()) { - if (rtype.isNumberFamily() == false) { - context.addResult(ResultLevel.ERROR, "Return type for function " + func + " must be one of " + DataType.NUMBER_FAMILY); - } - } else { - if (StringUtils.equalsIgnoreCase(KylinConfig.getInstanceFromEnv().getProperty(KEY_IGNORE_UNKNOWN_FUNC, "false"), "false")) { - context.addResult(ResultLevel.ERROR, "Unrecognized function: [" + func + "]"); - } - } - - } - /** * @param context * @param cube http://git-wip-us.apache.org/repos/asf/kylin/blob/c721d679/cube/src/test/java/org/apache/kylin/cube/DictionaryManagerTest.java ---------------------------------------------------------------------- diff --git a/cube/src/test/java/org/apache/kylin/cube/DictionaryManagerTest.java b/cube/src/test/java/org/apache/kylin/cube/DictionaryManagerTest.java index a076938..f093032 100644 --- a/cube/src/test/java/org/apache/kylin/cube/DictionaryManagerTest.java +++ b/cube/src/test/java/org/apache/kylin/cube/DictionaryManagerTest.java @@ -26,7 +26,7 @@ import java.util.HashSet; import org.apache.kylin.common.util.JsonUtil; import org.apache.kylin.common.util.LocalFileMetadataTestCase; import org.apache.kylin.cube.model.CubeDesc; -import org.apache.kylin.dict.Dictionary; +import org.apache.kylin.common.util.Dictionary; import org.apache.kylin.dict.DictionaryInfo; import org.apache.kylin.dict.DictionaryManager; import org.apache.kylin.metadata.model.TblColRef; http://git-wip-us.apache.org/repos/asf/kylin/blob/c721d679/cube/src/test/java/org/apache/kylin/cube/kv/RowValueDecoderTest.java ---------------------------------------------------------------------- diff --git a/cube/src/test/java/org/apache/kylin/cube/kv/RowValueDecoderTest.java b/cube/src/test/java/org/apache/kylin/cube/kv/RowValueDecoderTest.java index b4aa238..a00afef 100644 --- a/cube/src/test/java/org/apache/kylin/cube/kv/RowValueDecoderTest.java +++ b/cube/src/test/java/org/apache/kylin/cube/kv/RowValueDecoderTest.java @@ -30,8 +30,8 @@ import org.apache.kylin.common.util.LocalFileMetadataTestCase; import org.apache.kylin.cube.CubeManager; import org.apache.kylin.cube.model.CubeDesc; import org.apache.kylin.cube.model.HBaseColumnDesc; +import org.apache.kylin.measure.MeasureCodec; import org.apache.kylin.metadata.MetadataManager; -import org.apache.kylin.metadata.measure.MeasureCodec; import org.apache.kylin.metadata.model.FunctionDesc; import org.apache.kylin.metadata.model.MeasureDesc; import org.junit.After; @@ -63,7 +63,7 @@ public class RowValueDecoderTest extends LocalFileMetadataTestCase { LongWritable count = new LongWritable(2); LongWritable item_count = new LongWritable(99999); ByteBuffer buf = ByteBuffer.allocate(RowConstants.ROWVALUE_BUFFER_SIZE); - codec.encode(new Object[] { sum, min, max, count,item_count }, buf); + codec.encode(new Object[] { sum, min, max, count, item_count }, buf); buf.flip(); byte[] valueBytes = new byte[buf.limit()]; @@ -81,7 +81,7 @@ public class RowValueDecoderTest extends LocalFileMetadataTestCase { Object[] measureValues = rowValueDecoder.getValues(); assertEquals("[PRICE, MIN_PRICE_, MAX_PRICE_, COUNT__, ITEM_COUNT]", measureNames.toString()); - assertEquals("[333.1234567, 333.1111111, 333.1999999, 2, 99999]", Arrays.toString(measureValues)); + assertEquals("[333.1235, 333.1111, 333.2000, 2, 99999]", Arrays.toString(measureValues)); } } http://git-wip-us.apache.org/repos/asf/kylin/blob/c721d679/cube/src/test/java/org/apache/kylin/metadata/measure/MeasureCodecTest.java ---------------------------------------------------------------------- diff --git a/cube/src/test/java/org/apache/kylin/metadata/measure/MeasureCodecTest.java b/cube/src/test/java/org/apache/kylin/metadata/measure/MeasureCodecTest.java index 3ed6094..0870914 100644 --- a/cube/src/test/java/org/apache/kylin/metadata/measure/MeasureCodecTest.java +++ b/cube/src/test/java/org/apache/kylin/metadata/measure/MeasureCodecTest.java @@ -28,6 +28,7 @@ import org.apache.hadoop.io.DoubleWritable; import org.apache.hadoop.io.LongWritable; import org.apache.kylin.common.hll.HyperLogLogPlusCounter; import org.apache.kylin.cube.kv.RowConstants; +import org.apache.kylin.measure.MeasureCodec; import org.apache.kylin.metadata.model.FunctionDesc; import org.apache.kylin.metadata.model.MeasureDesc; import org.junit.Test; @@ -45,7 +46,7 @@ public class MeasureCodecTest { DoubleWritable d = new DoubleWritable(1.0); LongWritable l = new LongWritable(2); - BigDecimal b = new BigDecimal("333.1234567"); + BigDecimal b = new BigDecimal("333.1234"); HyperLogLogPlusCounter hllc = new HyperLogLogPlusCounter(16); hllc.add("1234567"); hllc.add("abcdefg"); http://git-wip-us.apache.org/repos/asf/kylin/blob/c721d679/cube/src/test/resources/data/TEST1_desc.json ---------------------------------------------------------------------- diff --git a/cube/src/test/resources/data/TEST1_desc.json b/cube/src/test/resources/data/TEST1_desc.json index 082a3cf..0373496 100644 --- a/cube/src/test/resources/data/TEST1_desc.json +++ b/cube/src/test/resources/data/TEST1_desc.json @@ -127,7 +127,12 @@ "expression" : "COUNT_DISTINCT", "parameter" : { "type" : "column", - "value" : "LSTG_FORMAT_NAME,SELLER_ID" + "value" : "LSTG_FORMAT_NAME", + "next_parameter" : { + "type" : "column", + "value" : "SELLER_ID", + "next_parameter" : null + } }, "returntype" : "hllc10" } http://git-wip-us.apache.org/repos/asf/kylin/blob/c721d679/cube/src/test/resources/data/TEST2_desc.json ---------------------------------------------------------------------- diff --git a/cube/src/test/resources/data/TEST2_desc.json b/cube/src/test/resources/data/TEST2_desc.json index 90bd806..de6bfe0 100644 --- a/cube/src/test/resources/data/TEST2_desc.json +++ b/cube/src/test/resources/data/TEST2_desc.json @@ -112,7 +112,12 @@ "expression" : "COUNT_DISTINCT", "parameter" : { "type" : "column", - "value" : "LSTG_FORMAT_NAME,SELLER_ID" + "value" : "LSTG_FORMAT_NAME", + "next_parameter" : { + "type" : "column", + "value" : "SELLER_ID", + "next_parameter" : null + } }, "returntype" : "hllc10" } http://git-wip-us.apache.org/repos/asf/kylin/blob/c721d679/cube/src/test/resources/data/TEST3_desc.json ---------------------------------------------------------------------- diff --git a/cube/src/test/resources/data/TEST3_desc.json b/cube/src/test/resources/data/TEST3_desc.json index 4b0836b..54f0cf9 100644 --- a/cube/src/test/resources/data/TEST3_desc.json +++ b/cube/src/test/resources/data/TEST3_desc.json @@ -1,182 +1,268 @@ { - "uuid" : "9c53506d-d7b9-4ad4-b3b1-53ea42673c4b", - "last_modified" : 1401429176099, - "name" : "TEST1_desc", + "uuid": "9c53506d-d7b9-4ad4-b3b1-53ea42673c4b", + "last_modified": 1401429176099, + "name": "TEST1_desc", "model_name": "TEST1_model_desc", - "dimensions" : [ { - "id" : 1, - "name" : "CAL_DT", - "join" : { - "type" : "inner", - "primary_key" : [ "CAL_DT" ], - "foreign_key" : [ "CAL_DT" ] + "dimensions": [ + { + "id": 1, + "name": "CAL_DT", + "join": { + "type": "inner", + "primary_key": [ + "CAL_DT" + ], + "foreign_key": [ + "CAL_DT" + ] + }, + "hierarchy": null, + "table": "TEST_CAL_DT", + "column": [ + "TEST_CAL_DT.CAL_DT" + ], + "datatype": "date", + "derived": [ + "WEEK_BEG_DT" + ] }, - "hierarchy" : null, - "table" : "TEST_CAL_DT", - "column" : ["TEST_CAL_DT.CAL_DT"], - "datatype" : "date", - "derived" : [ "WEEK_BEG_DT" ] - }, { - "id" : 2, - "name" : "CATEGORY", - "join" : { - "type" : "inner", - "primary_key" : [ "LEAF_CATEG_ID", "SITE_ID" ], - "foreign_key" : [ "LEAF_CATEG_ID", "LSTG_SITE_ID" ] + { + "id": 2, + "name": "CATEGORY", + "join": { + "type": "inner", + "primary_key": [ + "LEAF_CATEG_ID", + "SITE_ID" + ], + "foreign_key": [ + "LEAF_CATEG_ID", + "LSTG_SITE_ID" + ] + }, + "hierarchy": true, + "table": "TEST_CATEGORY_GROUPINGS", + "column": [ + "TEST_CATEGORY_GROUPINGS.META_CATEG_NAME", + "TEST_CATEGORY_GROUPINGS.CATEG_LVL2_NAME", + "TEST_CATEGORY_GROUPINGS.CATEG_LVL3_NAME" + ], + "derived": null }, - "hierarchy" : true, - "table" : "TEST_CATEGORY_GROUPINGS", - "column" : ["TEST_CATEGORY_GROUPINGS.META_CATEG_NAME", "TEST_CATEGORY_GROUPINGS.CATEG_LVL2_NAME", "TEST_CATEGORY_GROUPINGS.CATEG_LVL3_NAME"], - "derived" : null - }, { - "id" : 3, - "name" : "LSTG_FORMAT_NAME", - "join" : null, - "hierarchy" : null, - "table" : "TEST_KYLIN_FACT", - "column" : ["TEST_KYLIN_FACT.LSTG_FORMAT_NAME"], - "datatype" : "string", - "derived" : null - }, { - "id" : 4, - "name" : "SITE_ID", - "join" : { - "type" : "inner", - "primary_key" : [ "SITE_ID" ], - "foreign_key" : [ "LSTG_SITE_ID" ] + { + "id": 3, + "name": "LSTG_FORMAT_NAME", + "join": null, + "hierarchy": null, + "table": "TEST_KYLIN_FACT", + "column": [ + "TEST_KYLIN_FACT.LSTG_FORMAT_NAME" + ], + "datatype": "string", + "derived": null }, - "hierarchy" : null, - "table" : "TEST_SITES", - "column" : ["TEST_SITES.SITE_ID"], - "datatype" : "string", - "derived" : [ "SITE_NAME", "CRE_USER" ] - }, { - "id" : 5, - "name" : "SELLER_TYPE_CD", - "join" : { - "type" : "inner", - "primary_key" : [ "SELLER_TYPE_CD" ], - "foreign_key" : [ "SLR_SEGMENT_CD" ] + { + "id": 4, + "name": "SITE_ID", + "join": { + "type": "inner", + "primary_key": [ + "SITE_ID" + ], + "foreign_key": [ + "LSTG_SITE_ID" + ] + }, + "hierarchy": null, + "table": "TEST_SITES", + "column": [ + "TEST_SITES.SITE_ID" + ], + "datatype": "string", + "derived": [ + "SITE_NAME", + "CRE_USER" + ] }, - "hierarchy" : null, - "table" : "TEST_SELLER_TYPE_DIM", - "column" : ["TEST_SELLER_TYPE_DIM.SELLER_TYPE_CD"], - "datatype" : "string", - "derived" : [ "SELLER_TYPE_DESC" ] - } ], - "measures" : [ { - "id" : 1, - "name" : "GMV_SUM", - "function" : { - "expression" : "SUM", - "parameter" : { - "type" : "column", - "value" : "PRICE" + { + "id": 5, + "name": "SELLER_TYPE_CD", + "join": { + "type": "inner", + "primary_key": [ + "SELLER_TYPE_CD" + ], + "foreign_key": [ + "SLR_SEGMENT_CD" + ] }, - "returntype" : "decimal" + "hierarchy": null, + "table": "TEST_SELLER_TYPE_DIM", + "column": [ + "TEST_SELLER_TYPE_DIM.SELLER_TYPE_CD" + ], + "datatype": "string", + "derived": [ + "SELLER_TYPE_DESC" + ] } - }, { - "id" : 2, - "name" : "GMV_MIN", - "function" : { - "expression" : "MIN", - "parameter" : { - "type" : "column", - "value" : "PRICE" - }, - "returntype" : "decimal" + ], + "measures": [ + { + "id": 1, + "name": "GMV_SUM", + "function": { + "expression": "SUM", + "parameter": { + "type": "column", + "value": "PRICE" + }, + "returntype": "decimal" + } + }, + { + "id": 2, + "name": "GMV_MIN", + "function": { + "expression": "MIN", + "parameter": { + "type": "column", + "value": "PRICE" + }, + "returntype": "decimal" + } + }, + { + "id": 3, + "name": "GMV_MAX", + "function": { + "expression": "MAX", + "parameter": { + "type": "column", + "value": "PRICE" + }, + "returntype": "decimal" + } + }, + { + "id": 4, + "name": "TRANS_CNT", + "function": { + "expression": "COUNT", + "parameter": { + "type": "constant", + "value": "1" + }, + "returntype": "long" + } + }, + { + "id": 5, + "name": "SELLER_CNT", + "function": { + "expression": "COUNT_DISTINCT", + "parameter": { + "type": "column", + "value": "SELLER_ID" + }, + "returntype": "hllc10" + } + }, + { + "id": 6, + "name": "SELLER_FORMAT_CNT", + "function": { + "expression": "COUNT_DISTINCT", + "parameter": { + "type": "column", + "value": "LSTG_FORMAT_NAME", + "next_parameter": { + "type": "column", + "value": "SELLER_ID", + "next_parameter": null + } + }, + "returntype": "hllc10" + } } - }, { - "id" : 3, - "name" : "GMV_MAX", - "function" : { - "expression" : "MAX", - "parameter" : { - "type" : "column", - "value" : "PRICE" + ], + "rowkey": { + "rowkey_columns": [ + { + "column": "CAL_DT", + "length": 10, + "dictionary": "date(yyyy-mm-dd)", + "mandatory": false }, - "returntype" : "decimal" - } - }, { - "id" : 4, - "name" : "TRANS_CNT", - "function" : { - "expression" : "COUNT", - "parameter" : { - "type" : "constant", - "value" : "1" + { + "column": "META_CATEG_NAME", + "length": 0, + "dictionary": "string", + "mandatory": false }, - "returntype" : "long" - } - }, { - "id" : 5, - "name" : "SELLER_CNT", - "function" : { - "expression" : "COUNT_DISTINCT", - "parameter" : { - "type" : "column", - "value" : "SELLER_ID" + { + "column": "CATEG_LVL2_NAME", + "length": 0, + "dictionary": "string", + "mandatory": false }, - "returntype" : "hllc10" - } - }, { - "id" : 6, - "name" : "SELLER_FORMAT_CNT", - "function" : { - "expression" : "COUNT_DISTINCT", - "parameter" : { - "type" : "column", - "value" : "LSTG_FORMAT_NAME,SELLER_ID" + { + "column": "CATEG_LVL3_NAME", + "length": 0, + "dictionary": "string", + "mandatory": false }, - "returntype" : "hllc10" - } - } ], - "rowkey" : { - "rowkey_columns" : [ { - "column" : "CAL_DT", - "length" : 10, - "dictionary" : "date(yyyy-mm-dd)", - "mandatory" : false - }, { - "column" : "META_CATEG_NAME", - "length" : 0, - "dictionary" : "string", - "mandatory" : false - }, { - "column" : "CATEG_LVL2_NAME", - "length" : 0, - "dictionary" : "string", - "mandatory" : false - }, { - "column" : "CATEG_LVL3_NAME", - "length" : 0, - "dictionary" : "string", - "mandatory" : false - }, { - "column" : "LSTG_FORMAT_NAME", - "length" : 12, - "dictionary" : null, - "mandatory" : false - }, { - "column" : "SITE_ID", - "length" : 0, - "dictionary" : "string", - "mandatory" : false - }, { - "column" : "SELLER_TYPE_CD", - "length" : 0, - "dictionary" : "string", - "mandatory" : false - } ], - "aggregation_groups" : [ [ "META_CATEG_NAME", "CATEG_LVL3_NAME", "CATEG_LVL2_NAME", "CAL_DT" ], [ "LSTG_FORMAT_NAME", "SITE_ID", "SELLER_TYPE_CD" ] ] + { + "column": "LSTG_FORMAT_NAME", + "length": 12, + "dictionary": null, + "mandatory": false + }, + { + "column": "SITE_ID", + "length": 0, + "dictionary": "string", + "mandatory": false + }, + { + "column": "SELLER_TYPE_CD", + "length": 0, + "dictionary": "string", + "mandatory": false + } + ], + "aggregation_groups": [ + [ + "META_CATEG_NAME", + "CATEG_LVL3_NAME", + "CATEG_LVL2_NAME", + "CAL_DT" + ], + [ + "LSTG_FORMAT_NAME", + "SITE_ID", + "SELLER_TYPE_CD" + ] + ] }, - "hbase_mapping" : { - "column_family" : [ { - "name" : "F1", - "columns" : [ { - "qualifier" : "M", - "measure_refs" : [ "GMV_SUM", "GMV_MIN", "GMV_MAX", "TRANS_CNT", "SELLER_CNT", "SELLER_FORMAT_CNT" ] - } ] - } ] + "hbase_mapping": { + "column_family": [ + { + "name": "F1", + "columns": [ + { + "qualifier": "M", + "measure_refs": [ + "GMV_SUM", + "GMV_MIN", + "GMV_MAX", + "TRANS_CNT", + "SELLER_CNT", + "SELLER_FORMAT_CNT" + ] + } + ] + } + ] } } \ No newline at end of file http://git-wip-us.apache.org/repos/asf/kylin/blob/c721d679/dictionary/src/main/java/org/apache/kylin/dict/DateStrDictionary.java ---------------------------------------------------------------------- diff --git a/dictionary/src/main/java/org/apache/kylin/dict/DateStrDictionary.java b/dictionary/src/main/java/org/apache/kylin/dict/DateStrDictionary.java index 5702be9..14086c6 100644 --- a/dictionary/src/main/java/org/apache/kylin/dict/DateStrDictionary.java +++ b/dictionary/src/main/java/org/apache/kylin/dict/DateStrDictionary.java @@ -27,6 +27,7 @@ import java.util.Date; import org.apache.commons.lang.StringUtils; import org.apache.kylin.common.util.DateFormat; +import org.apache.kylin.common.util.Dictionary; /** * A dictionary for date string (date only, no time). http://git-wip-us.apache.org/repos/asf/kylin/blob/c721d679/dictionary/src/main/java/org/apache/kylin/dict/Dictionary.java ---------------------------------------------------------------------- diff --git a/dictionary/src/main/java/org/apache/kylin/dict/Dictionary.java b/dictionary/src/main/java/org/apache/kylin/dict/Dictionary.java deleted file mode 100644 index e99a553..0000000 --- a/dictionary/src/main/java/org/apache/kylin/dict/Dictionary.java +++ /dev/null @@ -1,199 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.kylin.dict; - -import java.io.PrintStream; -import java.io.UnsupportedEncodingException; - -import org.apache.hadoop.io.Writable; -import org.apache.kylin.common.util.BytesUtil; - -/** - * A bi-way dictionary that maps from dimension/column values to IDs and vice - * versa. By storing IDs instead of real values, the size of cube is - * significantly reduced. - * - * - IDs are smallest integers possible for the cardinality of a column, for the - * purpose of minimal storage space - IDs preserve ordering of values, such that - * range query can be applied to IDs directly - * - * A dictionary once built, is immutable. This allows optimal memory footprint - * by e.g. flatten the Trie structure into a byte array, replacing node pointers - * with array offsets. - * - * @author yangli9 - */ -abstract public class Dictionary<T> implements Writable { - - public static final byte NULL = (byte) 0xff; - - // ID with all bit-1 (0xff e.g.) reserved for NULL value - public static final int NULL_ID[] = new int[] { 0, 0xff, 0xffff, 0xffffff, 0xffffff }; - - abstract public int getMinId(); - - abstract public int getMaxId(); - - public int getSize() { - return getMaxId() - getMinId() + 1; - } - - /** - * @return the size of an ID in bytes, determined by the cardinality of - * column - */ - abstract public int getSizeOfId(); - - /** - * @return the (maximum) size of value in bytes, determined by the longest - * value of column - */ - abstract public int getSizeOfValue(); - - /** - * Convenient form of <code>getIdFromValue(value, 0)</code> - */ - final public int getIdFromValue(T value) throws IllegalArgumentException { - return getIdFromValue(value, 0); - } - - /** - * Returns the ID integer of given value. In case of not found - * <p> - * - if roundingFlag=0, throw IllegalArgumentException; <br> - * - if roundingFlag<0, the closest smaller ID integer if exist; <br> - * - if roundingFlag>0, the closest bigger ID integer if exist. <br> - * <p> - * The implementation often has cache, thus faster than the byte[] version getIdFromValueBytes() - * - * @throws IllegalArgumentException - * if value is not found in dictionary and rounding is off or - * failed - */ - final public int getIdFromValue(T value, int roundingFlag) throws IllegalArgumentException { - if (isNullObjectForm(value)) - return nullId(); - else - return getIdFromValueImpl(value, roundingFlag); - } - - protected boolean isNullObjectForm(T value) { - return value == null; - } - - abstract protected int getIdFromValueImpl(T value, int roundingFlag); - - /** - * @return the value corresponds to the given ID - * @throws IllegalArgumentException - * if ID is not found in dictionary - */ - final public T getValueFromId(int id) { - if (isNullId(id)) - return null; - else - return getValueFromIdImpl(id); - } - - abstract protected T getValueFromIdImpl(int id); - - /** - * Convenient form of - * <code>getIdFromValueBytes(value, offset, len, 0)</code> - */ - final public int getIdFromValueBytes(byte[] value, int offset, int len) { - return getIdFromValueBytes(value, offset, len, 0); - } - - /** - * A lower level API, return ID integer from raw value bytes. In case of not found - * <p> - * - if roundingFlag=0, throw IllegalArgumentException; <br> - * - if roundingFlag<0, the closest smaller ID integer if exist; <br> - * - if roundingFlag>0, the closest bigger ID integer if exist. <br> - * <p> - * Bypassing the cache layer, this could be significantly slower than getIdFromValue(T value). - * - * @throws IllegalArgumentException - * if value is not found in dictionary and rounding is off or failed - */ - final public int getIdFromValueBytes(byte[] value, int offset, int len, int roundingFlag) { - if (isNullByteForm(value, offset, len)) - return nullId(); - else - return getIdFromValueBytesImpl(value, offset, len, roundingFlag); - } - - protected boolean isNullByteForm(byte[] value, int offset, int len) { - return value == null; - } - - abstract protected int getIdFromValueBytesImpl(byte[] value, int offset, int len, int roundingFlag); - - /** - * A lower level API, get byte values from ID, return the number of bytes - * written. Bypassing the cache layer, this could be significantly slower - * than getIdFromValue(T value). - * - * @return size of value bytes, 0 if empty string, -1 if null - * - * @throws IllegalArgumentException - * if ID is not found in dictionary - */ - final public int getValueBytesFromId(int id, byte[] returnValue, int offset) { - if (isNullId(id)) - return -1; - else - return getValueBytesFromIdImpl(id, returnValue, offset); - } - - abstract protected int getValueBytesFromIdImpl(int id, byte[] returnValue, int offset); - - abstract public void dump(PrintStream out); - - public int nullId() { - return NULL_ID[getSizeOfId()]; - } - - public boolean isNullId(int id) { - int nullId = NULL_ID[getSizeOfId()]; - return (nullId & id) == nullId; - } - - /** utility that converts a dictionary ID to string, preserving order */ - public static String dictIdToString(byte[] idBytes, int offset, int length) { - try { - return new String(idBytes, offset, length, "ISO-8859-1"); - } catch (UnsupportedEncodingException e) { - // never happen - return null; - } - } - - /** the reverse of dictIdToString(), returns integer ID */ - public static int stringToDictId(String str) { - try { - byte[] bytes = str.getBytes("ISO-8859-1"); - return BytesUtil.readUnsigned(bytes, 0, bytes.length); - } catch (UnsupportedEncodingException e) { - // never happen - return 0; - } - } -}