[CARBONDATA-2420][32K] Support string longer than 32000 characters Add a property in creating table 'long_string_columns' to support string columns that will contains more than 32000 characters. Inside carbondata, it use an integer instead of short to store the length of bytes content.
Internally in Carbondata, add a new datatype called varchar to represent the long string column add a new encoding called DIRECT_COMPRESS_VARCHAR to the varcher column page meta use an integer (previously short) to store the length of bytes content. add 2GB constraint for one column page This closes #2379 Project: http://git-wip-us.apache.org/repos/asf/carbondata/repo Commit: http://git-wip-us.apache.org/repos/asf/carbondata/commit/dc53dee2 Tree: http://git-wip-us.apache.org/repos/asf/carbondata/tree/dc53dee2 Diff: http://git-wip-us.apache.org/repos/asf/carbondata/diff/dc53dee2 Branch: refs/heads/carbonstore Commit: dc53dee2448f366319764021d77c4be75d43b9e3 Parents: c5a4ec0 Author: xuchuanyin <xuchuan...@hust.edu.cn> Authored: Sat Jun 2 15:17:04 2018 +0800 Committer: kumarvishal09 <kumarvishal1...@gmail.com> Committed: Wed Jun 20 15:24:22 2018 +0530 ---------------------------------------------------------------------- .../core/constants/CarbonCommonConstants.java | 3 + .../impl/FixedLengthDimensionColumnPage.java | 2 +- .../impl/VariableLengthDimensionColumnPage.java | 11 +- ...mpressedDimensionChunkFileBasedReaderV1.java | 3 +- ...mpressedDimensionChunkFileBasedReaderV2.java | 3 +- ...mpressedDimensionChunkFileBasedReaderV3.java | 7 +- .../chunk/store/DimensionChunkStoreFactory.java | 22 +- ...ariableIntLengthDimensionDataChunkStore.java | 43 +++ ...feVariableLengthDimensionDataChunkStore.java | 45 +-- ...iableShortLengthDimensionDataChunkStore.java | 41 +++ ...ariableIntLengthDimensionDataChunkStore.java | 44 +++ ...feVariableLengthDimensionDataChunkStore.java | 54 ++-- ...iableShortLengthDimensionDataChunkStore.java | 44 +++ .../core/datastore/page/ColumnPage.java | 16 +- .../datastore/page/VarLengthColumnPageBase.java | 6 + .../page/encoding/DefaultEncodingFactory.java | 1 + .../page/encoding/EncodingFactory.java | 3 +- .../encoding/compress/DirectCompressCodec.java | 6 +- .../legacy/HighCardDictDimensionIndexCodec.java | 13 +- .../statistics/LVLongStringStatsCollector.java | 51 ++++ .../statistics/LVShortStringStatsCollector.java | 50 ++++ .../page/statistics/LVStringStatsCollector.java | 27 +- .../core/indexstore/UnsafeMemoryDMStore.java | 11 +- .../blockletindex/BlockletDataMap.java | 8 +- .../core/indexstore/row/DataMapRow.java | 4 +- .../core/indexstore/row/UnsafeDataMapRow.java | 60 ++-- .../core/indexstore/schema/CarbonRowSchema.java | 10 +- .../core/metadata/blocklet/BlockletInfo.java | 2 +- .../ThriftWrapperSchemaConverterImpl.java | 8 + .../core/metadata/datatype/DataType.java | 3 + .../core/metadata/datatype/DataTypes.java | 5 + .../core/metadata/datatype/VarcharType.java | 34 +++ .../core/metadata/encoder/Encoding.java | 5 +- .../schema/table/TableSchemaBuilder.java | 1 + .../util/AbstractDataFileFooterConverter.java | 2 + .../apache/carbondata/core/util/CarbonUtil.java | 8 +- .../carbondata/core/util/DataTypeUtil.java | 4 + .../ThriftWrapperSchemaConverterImplTest.java | 2 +- format/src/main/thrift/schema.thrift | 3 + .../VarcharDataTypesBasicTestCase.scala | 279 +++++++++++++++++++ .../carbondata/spark/util/CarbonScalaUtil.scala | 1 + .../spark/util/DataTypeConverterUtil.scala | 1 + .../spark/sql/catalyst/CarbonDDLSqlParser.scala | 36 ++- .../command/carbonTableSchemaCommon.scala | 43 ++- .../apache/spark/sql/hive/CarbonRelation.scala | 1 + .../impl/NonDictionaryFieldConverterImpl.java | 12 +- .../loading/csvinput/CSVInputFormat.java | 4 +- .../loading/row/IntermediateSortTempRow.java | 19 +- .../loading/sort/SortStepRowHandler.java | 26 +- .../merger/CompactionResultSortProcessor.java | 11 +- .../sort/sortdata/SortParameters.java | 21 +- .../sort/sortdata/TableFieldStat.java | 37 ++- .../carbondata/processing/store/TablePage.java | 37 ++- .../util/CarbonDataProcessorUtil.java | 20 ++ 54 files changed, 1049 insertions(+), 164 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/carbondata/blob/dc53dee2/core/src/main/java/org/apache/carbondata/core/constants/CarbonCommonConstants.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/carbondata/core/constants/CarbonCommonConstants.java b/core/src/main/java/org/apache/carbondata/core/constants/CarbonCommonConstants.java index 5f06d08..118ff28 100644 --- a/core/src/main/java/org/apache/carbondata/core/constants/CarbonCommonConstants.java +++ b/core/src/main/java/org/apache/carbondata/core/constants/CarbonCommonConstants.java @@ -910,6 +910,7 @@ public final class CarbonCommonConstants { public static final String COLUMN_GROUPS = "column_groups"; public static final String DICTIONARY_EXCLUDE = "dictionary_exclude"; public static final String DICTIONARY_INCLUDE = "dictionary_include"; + public static final String LONG_STRING_COLUMNS = "long_string_columns"; /** * Table property to enable or disable local dictionary generation @@ -1632,6 +1633,8 @@ public final class CarbonCommonConstants { // As Short data type is used for storing the length of a column during data processing hence // the maximum characters that can be supported should be less than Short max value public static final int MAX_CHARS_PER_COLUMN_DEFAULT = 32000; + // todo: use infinity first, will switch later + public static final int MAX_CHARS_PER_COLUMN_INFINITY = -1; /** * Enabling page level reader for compaction reduces the memory usage while compacting more http://git-wip-us.apache.org/repos/asf/carbondata/blob/dc53dee2/core/src/main/java/org/apache/carbondata/core/datastore/chunk/impl/FixedLengthDimensionColumnPage.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/carbondata/core/datastore/chunk/impl/FixedLengthDimensionColumnPage.java b/core/src/main/java/org/apache/carbondata/core/datastore/chunk/impl/FixedLengthDimensionColumnPage.java index 76bcf30..570404a 100644 --- a/core/src/main/java/org/apache/carbondata/core/datastore/chunk/impl/FixedLengthDimensionColumnPage.java +++ b/core/src/main/java/org/apache/carbondata/core/datastore/chunk/impl/FixedLengthDimensionColumnPage.java @@ -47,7 +47,7 @@ public class FixedLengthDimensionColumnPage extends AbstractDimensionColumnPage dataChunk.length; dataChunkStore = DimensionChunkStoreFactory.INSTANCE .getDimensionChunkStore(columnValueSize, isExplicitSorted, numberOfRows, totalSize, - DimensionStoreType.FIXEDLENGTH); + DimensionStoreType.FIXED_LENGTH); dataChunkStore.putArray(invertedIndex, invertedIndexReverse, dataChunk); } http://git-wip-us.apache.org/repos/asf/carbondata/blob/dc53dee2/core/src/main/java/org/apache/carbondata/core/datastore/chunk/impl/VariableLengthDimensionColumnPage.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/carbondata/core/datastore/chunk/impl/VariableLengthDimensionColumnPage.java b/core/src/main/java/org/apache/carbondata/core/datastore/chunk/impl/VariableLengthDimensionColumnPage.java index 1c6b7f4..7394217 100644 --- a/core/src/main/java/org/apache/carbondata/core/datastore/chunk/impl/VariableLengthDimensionColumnPage.java +++ b/core/src/main/java/org/apache/carbondata/core/datastore/chunk/impl/VariableLengthDimensionColumnPage.java @@ -30,21 +30,16 @@ public class VariableLengthDimensionColumnPage extends AbstractDimensionColumnPa /** * Constructor for this class - * @param dataChunks - * @param invertedIndex - * @param invertedIndexReverse - * @param numberOfRows */ public VariableLengthDimensionColumnPage(byte[] dataChunks, int[] invertedIndex, - int[] invertedIndexReverse, int numberOfRows) { + int[] invertedIndexReverse, int numberOfRows, DimensionStoreType dimStoreType) { boolean isExplicitSorted = isExplicitSorted(invertedIndex); - long totalSize = isExplicitSorted ? + long totalSize = null != invertedIndex ? (dataChunks.length + (2 * numberOfRows * CarbonCommonConstants.INT_SIZE_IN_BYTE) + ( numberOfRows * CarbonCommonConstants.INT_SIZE_IN_BYTE)) : (dataChunks.length + (numberOfRows * CarbonCommonConstants.INT_SIZE_IN_BYTE)); dataChunkStore = DimensionChunkStoreFactory.INSTANCE - .getDimensionChunkStore(0, isExplicitSorted, numberOfRows, totalSize, - DimensionStoreType.VARIABLELENGTH); + .getDimensionChunkStore(0, isExplicitSorted, numberOfRows, totalSize, dimStoreType); dataChunkStore.putArray(invertedIndex, invertedIndexReverse, dataChunks); } http://git-wip-us.apache.org/repos/asf/carbondata/blob/dc53dee2/core/src/main/java/org/apache/carbondata/core/datastore/chunk/reader/dimension/v1/CompressedDimensionChunkFileBasedReaderV1.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/carbondata/core/datastore/chunk/reader/dimension/v1/CompressedDimensionChunkFileBasedReaderV1.java b/core/src/main/java/org/apache/carbondata/core/datastore/chunk/reader/dimension/v1/CompressedDimensionChunkFileBasedReaderV1.java index 6679402..92a9684 100644 --- a/core/src/main/java/org/apache/carbondata/core/datastore/chunk/reader/dimension/v1/CompressedDimensionChunkFileBasedReaderV1.java +++ b/core/src/main/java/org/apache/carbondata/core/datastore/chunk/reader/dimension/v1/CompressedDimensionChunkFileBasedReaderV1.java @@ -26,6 +26,7 @@ import org.apache.carbondata.core.datastore.chunk.impl.DimensionRawColumnChunk; import org.apache.carbondata.core.datastore.chunk.impl.FixedLengthDimensionColumnPage; import org.apache.carbondata.core.datastore.chunk.impl.VariableLengthDimensionColumnPage; import org.apache.carbondata.core.datastore.chunk.reader.dimension.AbstractChunkReader; +import org.apache.carbondata.core.datastore.chunk.store.DimensionChunkStoreFactory; import org.apache.carbondata.core.datastore.columnar.UnBlockIndexer; import org.apache.carbondata.core.metadata.blocklet.BlockletInfo; import org.apache.carbondata.core.metadata.blocklet.datachunk.DataChunk; @@ -151,7 +152,7 @@ public class CompressedDimensionChunkFileBasedReaderV1 extends AbstractChunkRead .hasEncoding(dataChunk.getEncodingList(), Encoding.DICTIONARY)) { columnDataChunk = new VariableLengthDimensionColumnPage(dataPage, invertedIndexes, invertedIndexesReverse, - numberOfRows); + numberOfRows, DimensionChunkStoreFactory.DimensionStoreType.VARIABLE_SHORT_LENGTH); } else { // to store fixed length column chunk values columnDataChunk = http://git-wip-us.apache.org/repos/asf/carbondata/blob/dc53dee2/core/src/main/java/org/apache/carbondata/core/datastore/chunk/reader/dimension/v2/CompressedDimensionChunkFileBasedReaderV2.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/carbondata/core/datastore/chunk/reader/dimension/v2/CompressedDimensionChunkFileBasedReaderV2.java b/core/src/main/java/org/apache/carbondata/core/datastore/chunk/reader/dimension/v2/CompressedDimensionChunkFileBasedReaderV2.java index 8938260..3cdbe1d 100644 --- a/core/src/main/java/org/apache/carbondata/core/datastore/chunk/reader/dimension/v2/CompressedDimensionChunkFileBasedReaderV2.java +++ b/core/src/main/java/org/apache/carbondata/core/datastore/chunk/reader/dimension/v2/CompressedDimensionChunkFileBasedReaderV2.java @@ -25,6 +25,7 @@ import org.apache.carbondata.core.datastore.chunk.impl.DimensionRawColumnChunk; import org.apache.carbondata.core.datastore.chunk.impl.FixedLengthDimensionColumnPage; import org.apache.carbondata.core.datastore.chunk.impl.VariableLengthDimensionColumnPage; import org.apache.carbondata.core.datastore.chunk.reader.dimension.AbstractChunkReaderV2V3Format; +import org.apache.carbondata.core.datastore.chunk.store.DimensionChunkStoreFactory; import org.apache.carbondata.core.datastore.columnar.UnBlockIndexer; import org.apache.carbondata.core.metadata.blocklet.BlockletInfo; import org.apache.carbondata.core.util.CarbonUtil; @@ -175,7 +176,7 @@ public class CompressedDimensionChunkFileBasedReaderV2 extends AbstractChunkRead if (!hasEncoding(dimensionColumnChunk.encoders, Encoding.DICTIONARY)) { columnDataChunk = new VariableLengthDimensionColumnPage(dataPage, invertedIndexes, invertedIndexesReverse, - numberOfRows); + numberOfRows, DimensionChunkStoreFactory.DimensionStoreType.VARIABLE_SHORT_LENGTH); } else { // to store fixed length column chunk values columnDataChunk = http://git-wip-us.apache.org/repos/asf/carbondata/blob/dc53dee2/core/src/main/java/org/apache/carbondata/core/datastore/chunk/reader/dimension/v3/CompressedDimensionChunkFileBasedReaderV3.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/carbondata/core/datastore/chunk/reader/dimension/v3/CompressedDimensionChunkFileBasedReaderV3.java b/core/src/main/java/org/apache/carbondata/core/datastore/chunk/reader/dimension/v3/CompressedDimensionChunkFileBasedReaderV3.java index 58a9b18..782a8df 100644 --- a/core/src/main/java/org/apache/carbondata/core/datastore/chunk/reader/dimension/v3/CompressedDimensionChunkFileBasedReaderV3.java +++ b/core/src/main/java/org/apache/carbondata/core/datastore/chunk/reader/dimension/v3/CompressedDimensionChunkFileBasedReaderV3.java @@ -27,6 +27,7 @@ import org.apache.carbondata.core.datastore.chunk.impl.FixedLengthDimensionColum import org.apache.carbondata.core.datastore.chunk.impl.VariableLengthDimensionColumnPage; import org.apache.carbondata.core.datastore.chunk.reader.dimension.AbstractChunkReaderV2V3Format; import org.apache.carbondata.core.datastore.chunk.store.ColumnPageWrapper; +import org.apache.carbondata.core.datastore.chunk.store.DimensionChunkStoreFactory; import org.apache.carbondata.core.datastore.columnar.UnBlockIndexer; import org.apache.carbondata.core.datastore.page.ColumnPage; import org.apache.carbondata.core.datastore.page.encoding.ColumnPageDecoder; @@ -271,9 +272,13 @@ public class CompressedDimensionChunkFileBasedReaderV3 extends AbstractChunkRead // if no dictionary column then first create a no dictionary column chunk // and set to data chunk instance if (!hasEncoding(pageMetadata.encoders, Encoding.DICTIONARY)) { + DimensionChunkStoreFactory.DimensionStoreType dimStoreType = + hasEncoding(pageMetadata.encoders, Encoding.DIRECT_COMPRESS_VARCHAR) ? + DimensionChunkStoreFactory.DimensionStoreType.VARIABLE_INT_LENGTH : + DimensionChunkStoreFactory.DimensionStoreType.VARIABLE_SHORT_LENGTH; columnDataChunk = new VariableLengthDimensionColumnPage(dataPage, invertedIndexes, invertedIndexesReverse, - pageMetadata.getNumberOfRowsInpage()); + pageMetadata.getNumberOfRowsInpage(), dimStoreType); } else { // to store fixed length column chunk values columnDataChunk = http://git-wip-us.apache.org/repos/asf/carbondata/blob/dc53dee2/core/src/main/java/org/apache/carbondata/core/datastore/chunk/store/DimensionChunkStoreFactory.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/carbondata/core/datastore/chunk/store/DimensionChunkStoreFactory.java b/core/src/main/java/org/apache/carbondata/core/datastore/chunk/store/DimensionChunkStoreFactory.java index f210641..eccfd9c 100644 --- a/core/src/main/java/org/apache/carbondata/core/datastore/chunk/store/DimensionChunkStoreFactory.java +++ b/core/src/main/java/org/apache/carbondata/core/datastore/chunk/store/DimensionChunkStoreFactory.java @@ -19,9 +19,11 @@ package org.apache.carbondata.core.datastore.chunk.store; import org.apache.carbondata.core.constants.CarbonCommonConstants; import org.apache.carbondata.core.datastore.chunk.store.impl.safe.SafeFixedLengthDimensionDataChunkStore; -import org.apache.carbondata.core.datastore.chunk.store.impl.safe.SafeVariableLengthDimensionDataChunkStore; +import org.apache.carbondata.core.datastore.chunk.store.impl.safe.SafeVariableIntLengthDimensionDataChunkStore; +import org.apache.carbondata.core.datastore.chunk.store.impl.safe.SafeVariableShortLengthDimensionDataChunkStore; import org.apache.carbondata.core.datastore.chunk.store.impl.unsafe.UnsafeFixedLengthDimensionDataChunkStore; -import org.apache.carbondata.core.datastore.chunk.store.impl.unsafe.UnsafeVariableLengthDimensionDataChunkStore; +import org.apache.carbondata.core.datastore.chunk.store.impl.unsafe.UnsafeVariableIntLengthDimensionDataChunkStore; +import org.apache.carbondata.core.datastore.chunk.store.impl.unsafe.UnsafeVariableShortLengthDimensionDataChunkStore; import org.apache.carbondata.core.util.CarbonProperties; /** @@ -63,19 +65,23 @@ public class DimensionChunkStoreFactory { boolean isInvertedIndex, int numberOfRows, long totalSize, DimensionStoreType storeType) { if (isUnsafe) { - if (storeType == DimensionStoreType.FIXEDLENGTH) { + if (storeType == DimensionStoreType.FIXED_LENGTH) { return new UnsafeFixedLengthDimensionDataChunkStore(totalSize, columnValueSize, isInvertedIndex, numberOfRows); + } else if (storeType == DimensionStoreType.VARIABLE_SHORT_LENGTH) { + return new UnsafeVariableShortLengthDimensionDataChunkStore(totalSize, isInvertedIndex, + numberOfRows); } else { - return new UnsafeVariableLengthDimensionDataChunkStore(totalSize, isInvertedIndex, + return new UnsafeVariableIntLengthDimensionDataChunkStore(totalSize, isInvertedIndex, numberOfRows); } - } else { - if (storeType == DimensionStoreType.FIXEDLENGTH) { + if (storeType == DimensionStoreType.FIXED_LENGTH) { return new SafeFixedLengthDimensionDataChunkStore(isInvertedIndex, columnValueSize); + } else if (storeType == DimensionStoreType.VARIABLE_SHORT_LENGTH) { + return new SafeVariableShortLengthDimensionDataChunkStore(isInvertedIndex, numberOfRows); } else { - return new SafeVariableLengthDimensionDataChunkStore(isInvertedIndex, numberOfRows); + return new SafeVariableIntLengthDimensionDataChunkStore(isInvertedIndex, numberOfRows); } } } @@ -84,6 +90,6 @@ public class DimensionChunkStoreFactory { * dimension store type enum */ public enum DimensionStoreType { - FIXEDLENGTH, VARIABLELENGTH; + FIXED_LENGTH, VARIABLE_SHORT_LENGTH, VARIABLE_INT_LENGTH; } } http://git-wip-us.apache.org/repos/asf/carbondata/blob/dc53dee2/core/src/main/java/org/apache/carbondata/core/datastore/chunk/store/impl/safe/SafeVariableIntLengthDimensionDataChunkStore.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/carbondata/core/datastore/chunk/store/impl/safe/SafeVariableIntLengthDimensionDataChunkStore.java b/core/src/main/java/org/apache/carbondata/core/datastore/chunk/store/impl/safe/SafeVariableIntLengthDimensionDataChunkStore.java new file mode 100644 index 0000000..773f078 --- /dev/null +++ b/core/src/main/java/org/apache/carbondata/core/datastore/chunk/store/impl/safe/SafeVariableIntLengthDimensionDataChunkStore.java @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.carbondata.core.datastore.chunk.store.impl.safe; + +import java.nio.ByteBuffer; + +import org.apache.carbondata.core.constants.CarbonCommonConstants; + +/** + * Below class is responsible to store variable long length(>32000) dimension data chunk in + * memory. Memory occupied can be on heap or offheap using unsafe interface + */ +public class SafeVariableIntLengthDimensionDataChunkStore + extends SafeVariableLengthDimensionDataChunkStore { + public SafeVariableIntLengthDimensionDataChunkStore(boolean isInvertedIndex, int numberOfRows) { + super(isInvertedIndex, numberOfRows); + } + + @Override + protected int getLengthSize() { + return CarbonCommonConstants.INT_SIZE_IN_BYTE; + } + + @Override + protected int getLengthFromBuffer(ByteBuffer buffer) { + return buffer.getInt(); + } +} http://git-wip-us.apache.org/repos/asf/carbondata/blob/dc53dee2/core/src/main/java/org/apache/carbondata/core/datastore/chunk/store/impl/safe/SafeVariableLengthDimensionDataChunkStore.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/carbondata/core/datastore/chunk/store/impl/safe/SafeVariableLengthDimensionDataChunkStore.java b/core/src/main/java/org/apache/carbondata/core/datastore/chunk/store/impl/safe/SafeVariableLengthDimensionDataChunkStore.java index bb9c888..52e7317 100644 --- a/core/src/main/java/org/apache/carbondata/core/datastore/chunk/store/impl/safe/SafeVariableLengthDimensionDataChunkStore.java +++ b/core/src/main/java/org/apache/carbondata/core/datastore/chunk/store/impl/safe/SafeVariableLengthDimensionDataChunkStore.java @@ -28,9 +28,10 @@ import org.apache.carbondata.core.util.DataTypeUtil; /** * Below class is responsible to store variable length dimension data chunk in - * memory Memory occupied can be on heap or offheap using unsafe interface + * memory. Memory occupied can be on heap or offheap using unsafe interface */ -public class SafeVariableLengthDimensionDataChunkStore extends SafeAbsractDimensionDataChunkStore { +public abstract class SafeVariableLengthDimensionDataChunkStore + extends SafeAbsractDimensionDataChunkStore { /** * total number of rows @@ -56,7 +57,8 @@ public class SafeVariableLengthDimensionDataChunkStore extends SafeAbsractDimens * @param invertedIndexReverse inverted index reverse to be stored * @param data data to be stored */ - @Override public void putArray(final int[] invertedIndex, final int[] invertedIndexReverse, + @Override + public void putArray(final int[] invertedIndex, final int[] invertedIndexReverse, byte[] data) { // first put the data, inverted index and reverse inverted index to memory super.putArray(invertedIndex, invertedIndexReverse, data); @@ -75,21 +77,25 @@ public class SafeVariableLengthDimensionDataChunkStore extends SafeAbsractDimens // as first position will be start from 2 byte as data is stored first in the memory block // we need to skip first two bytes this is because first two bytes will be length of the data // which we have to skip - dataOffsets[0] = CarbonCommonConstants.SHORT_SIZE_IN_BYTE; + dataOffsets[0] = getLengthSize(); // creating a byte buffer which will wrap the length of the row ByteBuffer buffer = ByteBuffer.wrap(data); for (int i = 1; i < numberOfRows; i++) { buffer.position(startOffset); // so current row position will be // previous row length + 2 bytes used for storing previous row data - startOffset += buffer.getShort() + CarbonCommonConstants.SHORT_SIZE_IN_BYTE; + startOffset += getLengthFromBuffer(buffer) + getLengthSize(); // as same byte buffer is used to avoid creating many byte buffer for each row // we need to clear the byte buffer - dataOffsets[i] = startOffset + CarbonCommonConstants.SHORT_SIZE_IN_BYTE; + dataOffsets[i] = startOffset + getLengthSize(); } } - @Override public byte[] getRow(int rowId) { + protected abstract int getLengthSize(); + protected abstract int getLengthFromBuffer(ByteBuffer buffer); + + @Override + public byte[] getRow(int rowId) { // if column was explicitly sorted we need to get the rowid based inverted index reverse if (isExplictSorted) { rowId = invertedIndexReverse[rowId]; @@ -101,21 +107,21 @@ public class SafeVariableLengthDimensionDataChunkStore extends SafeAbsractDimens // else subtract the current row offset with complete data // length get the offset of set of data int currentDataOffset = dataOffsets[rowId]; - short length = 0; + int length = 0; // calculating the length of data if (rowId < numberOfRows - 1) { - length = (short) (dataOffsets[rowId + 1] - (currentDataOffset - + CarbonCommonConstants.SHORT_SIZE_IN_BYTE)); + length = dataOffsets[rowId + 1] - (currentDataOffset + getLengthSize()); } else { // for last record - length = (short) (this.data.length - currentDataOffset); + length = this.data.length - currentDataOffset; } byte[] currentRowData = new byte[length]; System.arraycopy(data, currentDataOffset, currentRowData, 0, length); return currentRowData; } - @Override public void fillRow(int rowId, CarbonColumnVector vector, int vectorRow) { + @Override + public void fillRow(int rowId, CarbonColumnVector vector, int vectorRow) { // if column was explicitly sorted we need to get the rowid based inverted index reverse if (isExplictSorted) { rowId = invertedIndexReverse[rowId]; @@ -127,11 +133,10 @@ public class SafeVariableLengthDimensionDataChunkStore extends SafeAbsractDimens // else subtract the current row offset with complete data // length get the offset of set of data int currentDataOffset = dataOffsets[rowId]; - short length = 0; + int length = 0; // calculating the length of data if (rowId < numberOfRows - 1) { - length = (short) (dataOffsets[rowId + 1] - (currentDataOffset - + CarbonCommonConstants.SHORT_SIZE_IN_BYTE)); + length = dataOffsets[rowId + 1] - (currentDataOffset + getLengthSize()); } else { // for last record length = (short) (this.data.length - currentDataOffset); @@ -162,7 +167,8 @@ public class SafeVariableLengthDimensionDataChunkStore extends SafeAbsractDimens } } - @Override public int compareTo(int rowId, byte[] compareValue) { + @Override + public int compareTo(int rowId, byte[] compareValue) { // now to get the row from memory block we need to do following thing // 1. first get the current offset // 2. if it's not a last row- get the next row offset @@ -172,14 +178,13 @@ public class SafeVariableLengthDimensionDataChunkStore extends SafeAbsractDimens // get the offset of set of data int currentDataOffset = dataOffsets[rowId]; - short length = 0; + int length = 0; // calculating the length of data if (rowId < numberOfRows - 1) { - length = (short) (dataOffsets[rowId + 1] - (currentDataOffset - + CarbonCommonConstants.SHORT_SIZE_IN_BYTE)); + length = dataOffsets[rowId + 1] - (currentDataOffset + getLengthSize()); } else { // for last record - length = (short) (this.data.length - currentDataOffset); + length = this.data.length - currentDataOffset; } return ByteUtil.UnsafeComparer.INSTANCE .compareTo(data, currentDataOffset, length, compareValue, 0, compareValue.length); http://git-wip-us.apache.org/repos/asf/carbondata/blob/dc53dee2/core/src/main/java/org/apache/carbondata/core/datastore/chunk/store/impl/safe/SafeVariableShortLengthDimensionDataChunkStore.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/carbondata/core/datastore/chunk/store/impl/safe/SafeVariableShortLengthDimensionDataChunkStore.java b/core/src/main/java/org/apache/carbondata/core/datastore/chunk/store/impl/safe/SafeVariableShortLengthDimensionDataChunkStore.java new file mode 100644 index 0000000..beccf86 --- /dev/null +++ b/core/src/main/java/org/apache/carbondata/core/datastore/chunk/store/impl/safe/SafeVariableShortLengthDimensionDataChunkStore.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.carbondata.core.datastore.chunk.store.impl.safe; + +import java.nio.ByteBuffer; + +import org.apache.carbondata.core.constants.CarbonCommonConstants; + +/** + * Below class is responsible to store variable long length(>32000) dimension data chunk in + * memory. Memory occupied can be on heap or offheap using unsafe interface + */ +public class SafeVariableShortLengthDimensionDataChunkStore + extends SafeVariableLengthDimensionDataChunkStore { + public SafeVariableShortLengthDimensionDataChunkStore(boolean isInvertedIndex, int numberOfRows) { + super(isInvertedIndex, numberOfRows); + } + + @Override protected int getLengthSize() { + return CarbonCommonConstants.SHORT_SIZE_IN_BYTE; + } + + @Override protected int getLengthFromBuffer(ByteBuffer buffer) { + return buffer.getShort(); + } +} http://git-wip-us.apache.org/repos/asf/carbondata/blob/dc53dee2/core/src/main/java/org/apache/carbondata/core/datastore/chunk/store/impl/unsafe/UnsafeVariableIntLengthDimensionDataChunkStore.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/carbondata/core/datastore/chunk/store/impl/unsafe/UnsafeVariableIntLengthDimensionDataChunkStore.java b/core/src/main/java/org/apache/carbondata/core/datastore/chunk/store/impl/unsafe/UnsafeVariableIntLengthDimensionDataChunkStore.java new file mode 100644 index 0000000..851fff6 --- /dev/null +++ b/core/src/main/java/org/apache/carbondata/core/datastore/chunk/store/impl/unsafe/UnsafeVariableIntLengthDimensionDataChunkStore.java @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.carbondata.core.datastore.chunk.store.impl.unsafe; + +import java.nio.ByteBuffer; + +import org.apache.carbondata.core.constants.CarbonCommonConstants; + +/** + * Below class is responsible to store variable length dimension data chunk in + * memory Memory occupied can be on heap or offheap using unsafe interface + */ +public class UnsafeVariableIntLengthDimensionDataChunkStore + extends UnsafeVariableLengthDimensionDataChunkStore { + public UnsafeVariableIntLengthDimensionDataChunkStore(long totalSize, boolean isInvertedIdex, + int numberOfRows) { + super(totalSize, isInvertedIdex, numberOfRows); + } + + @Override + protected int getLengthSize() { + return CarbonCommonConstants.INT_SIZE_IN_BYTE; + } + + @Override + protected int getLengthFromBuffer(ByteBuffer byteBuffer) { + return byteBuffer.getInt(); + } +} http://git-wip-us.apache.org/repos/asf/carbondata/blob/dc53dee2/core/src/main/java/org/apache/carbondata/core/datastore/chunk/store/impl/unsafe/UnsafeVariableLengthDimensionDataChunkStore.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/carbondata/core/datastore/chunk/store/impl/unsafe/UnsafeVariableLengthDimensionDataChunkStore.java b/core/src/main/java/org/apache/carbondata/core/datastore/chunk/store/impl/unsafe/UnsafeVariableLengthDimensionDataChunkStore.java index 07dc806..801a282 100644 --- a/core/src/main/java/org/apache/carbondata/core/datastore/chunk/store/impl/unsafe/UnsafeVariableLengthDimensionDataChunkStore.java +++ b/core/src/main/java/org/apache/carbondata/core/datastore/chunk/store/impl/unsafe/UnsafeVariableLengthDimensionDataChunkStore.java @@ -31,7 +31,7 @@ import org.apache.carbondata.core.util.DataTypeUtil; * Below class is responsible to store variable length dimension data chunk in * memory Memory occupied can be on heap or offheap using unsafe interface */ -public class UnsafeVariableLengthDimensionDataChunkStore +public abstract class UnsafeVariableLengthDimensionDataChunkStore extends UnsafeAbstractDimensionDataChunkStore { /** @@ -67,42 +67,43 @@ public class UnsafeVariableLengthDimensionDataChunkStore * @param invertedIndexReverse inverted index reverse to be stored * @param data data to be stored */ - @Override public void putArray(final int[] invertedIndex, final int[] invertedIndexReverse, + @Override + public void putArray(final int[] invertedIndex, final int[] invertedIndexReverse, byte[] data) { // first put the data, inverted index and reverse inverted index to memory super.putArray(invertedIndex, invertedIndexReverse, data); // position from where offsets will start this.dataPointersOffsets = this.invertedIndexReverseOffset; if (isExplicitSorted) { - this.dataPointersOffsets += (long)numberOfRows * CarbonCommonConstants.INT_SIZE_IN_BYTE; + this.dataPointersOffsets += (long) numberOfRows * CarbonCommonConstants.INT_SIZE_IN_BYTE; } // As data is of variable length and data format is - // <length in short><data><length in short><data> + // <length in short><data><length in short/int><data> // we need to store offset of each data so data can be accessed directly // for example: //data = {0,5,1,2,3,4,5,0,6,0,1,2,3,4,5,0,2,8,9} //so value stored in offset will be position of actual data // [2,9,17] - // to store this value we need to get the actual data length + 2 bytes used for storing the + // to store this value we need to get the actual data length + 2/4 bytes used for storing the // length // start position will be used to store the current data position int startOffset = 0; - // as first position will be start from 2 byte as data is stored first in the memory block + // as first position will be start from 2/4 byte as data is stored first in the memory block // we need to skip first two bytes this is because first two bytes will be length of the data // which we have to skip int [] dataOffsets = new int[numberOfRows]; - dataOffsets[0] = CarbonCommonConstants.SHORT_SIZE_IN_BYTE; + dataOffsets[0] = getLengthSize(); // creating a byte buffer which will wrap the length of the row ByteBuffer buffer = ByteBuffer.wrap(data); for (int i = 1; i < numberOfRows; i++) { buffer.position(startOffset); // so current row position will be - // previous row length + 2 bytes used for storing previous row data - startOffset += buffer.getShort() + CarbonCommonConstants.SHORT_SIZE_IN_BYTE; + // previous row length + 2/4 bytes used for storing previous row data + startOffset += getLengthFromBuffer(buffer) + getLengthSize(); // as same byte buffer is used to avoid creating many byte buffer for each row // we need to clear the byte buffer - dataOffsets[i] = startOffset + CarbonCommonConstants.SHORT_SIZE_IN_BYTE; + dataOffsets[i] = startOffset + getLengthSize(); } CarbonUnsafe.getUnsafe().copyMemory(dataOffsets, CarbonUnsafe.INT_ARRAY_OFFSET, dataPageMemoryBlock.getBaseObject(), @@ -110,6 +111,9 @@ public class UnsafeVariableLengthDimensionDataChunkStore dataOffsets.length * CarbonCommonConstants.INT_SIZE_IN_BYTE); } + protected abstract int getLengthSize(); + protected abstract int getLengthFromBuffer(ByteBuffer byteBuffer); + /** * Below method will be used to get the row based on row id passed * Getting the row from unsafe works in below logic @@ -122,13 +126,14 @@ public class UnsafeVariableLengthDimensionDataChunkStore * @param rowId * @return row */ - @Override public byte[] getRow(int rowId) { + @Override + public byte[] getRow(int rowId) { // get the actual row id rowId = getRowId(rowId); // get offset of data in unsafe int currentDataOffset = getOffSet(rowId); // get the data length - short length = getLength(rowId, currentDataOffset); + int length = getLength(rowId, currentDataOffset); // create data array byte[] data = new byte[length]; // fill the row data @@ -167,25 +172,24 @@ public class UnsafeVariableLengthDimensionDataChunkStore /** * To get the length of data for row id * if it's not a last row- get the next row offset - * Subtract the current row offset + 2 bytes(to skip the data length) with next row offset + * Subtract the current row offset + 2/4 bytes(to skip the data length) with next row offset * if it's last row - * subtract the current row offset + 2 bytes(to skip the data length) with complete data length + * subtract the current row offset + 2/4 bytes(to skip the data length) with complete data length * @param rowId rowId * @param currentDataOffset current data offset * @return length of row */ - private short getLength(int rowId, int currentDataOffset) { - short length = 0; + private int getLength(int rowId, int currentDataOffset) { + int length = 0; // calculating the length of data if (rowId < numberOfRows - 1) { int OffsetOfNextdata = CarbonUnsafe.getUnsafe().getInt(dataPageMemoryBlock.getBaseObject(), dataPageMemoryBlock.getBaseOffset() + this.dataPointersOffsets + ((rowId + 1) * CarbonCommonConstants.INT_SIZE_IN_BYTE)); - length = (short) (OffsetOfNextdata - (currentDataOffset - + CarbonCommonConstants.SHORT_SIZE_IN_BYTE)); + length = OffsetOfNextdata - (currentDataOffset + getLengthSize()); } else { // for last record we need to subtract with data length - length = (short) (this.dataLength - currentDataOffset); + length = this.dataLength - currentDataOffset; } return length; } @@ -196,7 +200,7 @@ public class UnsafeVariableLengthDimensionDataChunkStore * @param data data array * @param currentDataOffset current data offset */ - private void fillRowInternal(short length, byte[] data, int currentDataOffset) { + private void fillRowInternal(int length, byte[] data, int currentDataOffset) { CarbonUnsafe.getUnsafe().copyMemory(dataPageMemoryBlock.getBaseObject(), dataPageMemoryBlock.getBaseOffset() + currentDataOffset, data, CarbonUnsafe.BYTE_ARRAY_OFFSET, length); @@ -217,13 +221,14 @@ public class UnsafeVariableLengthDimensionDataChunkStore * @param vectorRow vector row id * */ - @Override public void fillRow(int rowId, CarbonColumnVector vector, int vectorRow) { + @Override + public void fillRow(int rowId, CarbonColumnVector vector, int vectorRow) { // get the row id from reverse inverted index based on row id rowId = getRowId(rowId); // get the current row offset int currentDataOffset = getOffSet(rowId); // get the row data length - short length = getLength(rowId, currentDataOffset); + int length = getLength(rowId, currentDataOffset); // check if value length is less the current data length // then create a new array else use the same if (length > value.length) { @@ -262,9 +267,10 @@ public class UnsafeVariableLengthDimensionDataChunkStore * @param compareValue value of to be compared * @return compare result */ - @Override public int compareTo(int rowId, byte[] compareValue) { + @Override + public int compareTo(int rowId, byte[] compareValue) { int currentDataOffset = getOffSet(rowId);; - short length = getLength(rowId, currentDataOffset); + int length = getLength(rowId, currentDataOffset); // as this class handles this variable length data, so filter value can be // smaller or bigger than than actual data, so we need to take the smaller length int compareResult; http://git-wip-us.apache.org/repos/asf/carbondata/blob/dc53dee2/core/src/main/java/org/apache/carbondata/core/datastore/chunk/store/impl/unsafe/UnsafeVariableShortLengthDimensionDataChunkStore.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/carbondata/core/datastore/chunk/store/impl/unsafe/UnsafeVariableShortLengthDimensionDataChunkStore.java b/core/src/main/java/org/apache/carbondata/core/datastore/chunk/store/impl/unsafe/UnsafeVariableShortLengthDimensionDataChunkStore.java new file mode 100644 index 0000000..995f5ba --- /dev/null +++ b/core/src/main/java/org/apache/carbondata/core/datastore/chunk/store/impl/unsafe/UnsafeVariableShortLengthDimensionDataChunkStore.java @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.carbondata.core.datastore.chunk.store.impl.unsafe; + +import java.nio.ByteBuffer; + +import org.apache.carbondata.core.constants.CarbonCommonConstants; + +/** + * Below class is responsible to store variable length dimension data chunk in + * memory Memory occupied can be on heap or offheap using unsafe interface + */ +public class UnsafeVariableShortLengthDimensionDataChunkStore + extends UnsafeVariableLengthDimensionDataChunkStore { + public UnsafeVariableShortLengthDimensionDataChunkStore(long totalSize, boolean isInvertedIdex, + int numberOfRows) { + super(totalSize, isInvertedIdex, numberOfRows); + } + + @Override + protected int getLengthSize() { + return CarbonCommonConstants.SHORT_SIZE_IN_BYTE; + } + + @Override + protected int getLengthFromBuffer(ByteBuffer byteBuffer) { + return byteBuffer.getShort(); + } +} http://git-wip-us.apache.org/repos/asf/carbondata/blob/dc53dee2/core/src/main/java/org/apache/carbondata/core/datastore/page/ColumnPage.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/carbondata/core/datastore/page/ColumnPage.java b/core/src/main/java/org/apache/carbondata/core/datastore/page/ColumnPage.java index 69ed437..4dcf514 100644 --- a/core/src/main/java/org/apache/carbondata/core/datastore/page/ColumnPage.java +++ b/core/src/main/java/org/apache/carbondata/core/datastore/page/ColumnPage.java @@ -203,7 +203,9 @@ public abstract class ColumnPage { instance = new UnsafeFixLengthColumnPage(columnSpec, dataType, pageSize); } else if (DataTypes.isDecimal(dataType)) { instance = new UnsafeDecimalColumnPage(columnSpec, dataType, pageSize); - } else if (dataType == DataTypes.STRING || dataType == DataTypes.BYTE_ARRAY) { + } else if (dataType == DataTypes.STRING + || dataType == DataTypes.BYTE_ARRAY + || dataType == DataTypes.VARCHAR) { instance = new UnsafeVarLengthColumnPage(columnSpec, dataType, pageSize); } else { throw new RuntimeException("Unsupported data dataType: " + dataType); @@ -225,7 +227,9 @@ public abstract class ColumnPage { instance = newDoublePage(columnSpec, new double[pageSize]); } else if (DataTypes.isDecimal(dataType)) { instance = newDecimalPage(columnSpec, new byte[pageSize][]); - } else if (dataType == DataTypes.STRING || dataType == DataTypes.BYTE_ARRAY) { + } else if (dataType == DataTypes.STRING + || dataType == DataTypes.BYTE_ARRAY + || dataType == DataTypes.VARCHAR) { instance = new SafeVarLengthColumnPage(columnSpec, dataType, pageSize); } else { throw new RuntimeException("Unsupported data dataType: " + dataType); @@ -398,7 +402,9 @@ public abstract class ColumnPage { } else if (DataTypes.isDecimal(dataType)) { putDecimal(rowId, (BigDecimal) value); statsCollector.update((BigDecimal) value); - } else if (dataType == DataTypes.STRING || dataType == DataTypes.BYTE_ARRAY) { + } else if (dataType == DataTypes.STRING + || dataType == DataTypes.BYTE_ARRAY + || dataType == DataTypes.VARCHAR) { putBytes(rowId, (byte[]) value); statsCollector.update((byte[]) value); } else { @@ -431,7 +437,9 @@ public abstract class ColumnPage { return getDouble(rowId); } else if (DataTypes.isDecimal(dataType)) { return getDecimal(rowId); - } else if (dataType == DataTypes.STRING || dataType == DataTypes.BYTE_ARRAY) { + } else if (dataType == DataTypes.STRING + || dataType == DataTypes.BYTE_ARRAY + || dataType == DataTypes.VARCHAR) { return getBytes(rowId); } else { throw new RuntimeException("unsupported data type: " + dataType); http://git-wip-us.apache.org/repos/asf/carbondata/blob/dc53dee2/core/src/main/java/org/apache/carbondata/core/datastore/page/VarLengthColumnPageBase.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/carbondata/core/datastore/page/VarLengthColumnPageBase.java b/core/src/main/java/org/apache/carbondata/core/datastore/page/VarLengthColumnPageBase.java index 901758a..cb907a5 100644 --- a/core/src/main/java/org/apache/carbondata/core/datastore/page/VarLengthColumnPageBase.java +++ b/core/src/main/java/org/apache/carbondata/core/datastore/page/VarLengthColumnPageBase.java @@ -289,6 +289,12 @@ public abstract class VarLengthColumnPageBase extends ColumnPage { @Override public void putBytes(int rowId, byte[] bytes) { + // rowId * 4 represents the length of L in LV + if (bytes.length > (Integer.MAX_VALUE - totalLength - rowId * 4)) { + // since we later store a column page in a byte array, so its maximum size is 2GB + throw new RuntimeException("Carbondata only support maximum 2GB size for one column page," + + " exceed this limit at rowId " + rowId); + } if (rowId == 0) { rowOffset[0] = 0; } http://git-wip-us.apache.org/repos/asf/carbondata/blob/dc53dee2/core/src/main/java/org/apache/carbondata/core/datastore/page/encoding/DefaultEncodingFactory.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/carbondata/core/datastore/page/encoding/DefaultEncodingFactory.java b/core/src/main/java/org/apache/carbondata/core/datastore/page/encoding/DefaultEncodingFactory.java index 00f7a0f..816b01f 100644 --- a/core/src/main/java/org/apache/carbondata/core/datastore/page/encoding/DefaultEncodingFactory.java +++ b/core/src/main/java/org/apache/carbondata/core/datastore/page/encoding/DefaultEncodingFactory.java @@ -103,6 +103,7 @@ public class DefaultEncodingFactory extends EncodingFactory { return new HighCardDictDimensionIndexCodec( dimensionSpec.isInSortColumns(), dimensionSpec.isInSortColumns() && dimensionSpec.isDoInvertedIndex(), + dimensionSpec.getSchemaDataType() == DataTypes.VARCHAR, compressor).createEncoder(null); default: throw new RuntimeException("unsupported dimension type: " + http://git-wip-us.apache.org/repos/asf/carbondata/blob/dc53dee2/core/src/main/java/org/apache/carbondata/core/datastore/page/encoding/EncodingFactory.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/carbondata/core/datastore/page/encoding/EncodingFactory.java b/core/src/main/java/org/apache/carbondata/core/datastore/page/encoding/EncodingFactory.java index 318d55d..a661a49 100644 --- a/core/src/main/java/org/apache/carbondata/core/datastore/page/encoding/EncodingFactory.java +++ b/core/src/main/java/org/apache/carbondata/core/datastore/page/encoding/EncodingFactory.java @@ -47,6 +47,7 @@ import static org.apache.carbondata.format.Encoding.ADAPTIVE_FLOATING; import static org.apache.carbondata.format.Encoding.ADAPTIVE_INTEGRAL; import static org.apache.carbondata.format.Encoding.BOOL_BYTE; import static org.apache.carbondata.format.Encoding.DIRECT_COMPRESS; +import static org.apache.carbondata.format.Encoding.DIRECT_COMPRESS_VARCHAR; import static org.apache.carbondata.format.Encoding.RLE_INTEGRAL; /** @@ -71,7 +72,7 @@ public abstract class EncodingFactory { byte[] encoderMeta = encoderMetas.get(0).array(); ByteArrayInputStream stream = new ByteArrayInputStream(encoderMeta); DataInputStream in = new DataInputStream(stream); - if (encoding == DIRECT_COMPRESS) { + if (encoding == DIRECT_COMPRESS || encoding == DIRECT_COMPRESS_VARCHAR) { ColumnPageEncoderMeta metadata = new ColumnPageEncoderMeta(); metadata.readFields(in); return new DirectCompressCodec(metadata.getStoreDataType()).createDecoder(metadata); http://git-wip-us.apache.org/repos/asf/carbondata/blob/dc53dee2/core/src/main/java/org/apache/carbondata/core/datastore/page/encoding/compress/DirectCompressCodec.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/carbondata/core/datastore/page/encoding/compress/DirectCompressCodec.java b/core/src/main/java/org/apache/carbondata/core/datastore/page/encoding/compress/DirectCompressCodec.java index cfdf114..4c1bc49 100644 --- a/core/src/main/java/org/apache/carbondata/core/datastore/page/encoding/compress/DirectCompressCodec.java +++ b/core/src/main/java/org/apache/carbondata/core/datastore/page/encoding/compress/DirectCompressCodec.java @@ -64,7 +64,7 @@ public class DirectCompressCodec implements ColumnPageCodec { return new DirectDecompressor(meta); } - private static class DirectCompressor extends ColumnPageEncoder { + private class DirectCompressor extends ColumnPageEncoder { private Compressor compressor; @@ -80,7 +80,9 @@ public class DirectCompressCodec implements ColumnPageCodec { @Override protected List<Encoding> getEncodingList() { List<Encoding> encodings = new ArrayList<>(); - encodings.add(Encoding.DIRECT_COMPRESS); + encodings.add(dataType == DataTypes.VARCHAR ? + Encoding.DIRECT_COMPRESS_VARCHAR : + Encoding.DIRECT_COMPRESS); return encodings; } http://git-wip-us.apache.org/repos/asf/carbondata/blob/dc53dee2/core/src/main/java/org/apache/carbondata/core/datastore/page/encoding/dimension/legacy/HighCardDictDimensionIndexCodec.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/carbondata/core/datastore/page/encoding/dimension/legacy/HighCardDictDimensionIndexCodec.java b/core/src/main/java/org/apache/carbondata/core/datastore/page/encoding/dimension/legacy/HighCardDictDimensionIndexCodec.java index d722c38..741dbfe 100644 --- a/core/src/main/java/org/apache/carbondata/core/datastore/page/encoding/dimension/legacy/HighCardDictDimensionIndexCodec.java +++ b/core/src/main/java/org/apache/carbondata/core/datastore/page/encoding/dimension/legacy/HighCardDictDimensionIndexCodec.java @@ -30,11 +30,16 @@ import org.apache.carbondata.core.datastore.page.encoding.ColumnPageEncoder; import org.apache.carbondata.core.util.ByteUtil; import org.apache.carbondata.format.Encoding; -public class HighCardDictDimensionIndexCodec extends IndexStorageCodec { +public class HighCardDictDimensionIndexCodec extends IndexStorageCodec { + /** + * whether this column is varchar data type(long string) + */ + private boolean isVarcharType; public HighCardDictDimensionIndexCodec(boolean isSort, boolean isInvertedIndex, - Compressor compressor) { + boolean isVarcharType, Compressor compressor) { super(isSort, isInvertedIndex, compressor); + this.isVarcharType = isVarcharType; } @Override @@ -63,7 +68,9 @@ public class HighCardDictDimensionIndexCodec extends IndexStorageCodec { @Override protected List<Encoding> getEncodingList() { List<Encoding> encodings = new ArrayList<>(); - if (indexStorage.getRowIdPageLengthInBytes() > 0) { + if (isVarcharType) { + encodings.add(Encoding.DIRECT_COMPRESS_VARCHAR); + } else if (indexStorage.getRowIdPageLengthInBytes() > 0) { encodings.add(Encoding.INVERTED_INDEX); } return encodings; http://git-wip-us.apache.org/repos/asf/carbondata/blob/dc53dee2/core/src/main/java/org/apache/carbondata/core/datastore/page/statistics/LVLongStringStatsCollector.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/carbondata/core/datastore/page/statistics/LVLongStringStatsCollector.java b/core/src/main/java/org/apache/carbondata/core/datastore/page/statistics/LVLongStringStatsCollector.java new file mode 100644 index 0000000..a7bb47e --- /dev/null +++ b/core/src/main/java/org/apache/carbondata/core/datastore/page/statistics/LVLongStringStatsCollector.java @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.carbondata.core.datastore.page.statistics; + +import org.apache.carbondata.core.util.ByteUtil; + +/** + * This class is for the columns with varchar data type, + * a string type which can hold more than 32000 characters + */ +public class LVLongStringStatsCollector extends LVStringStatsCollector { + + public static LVLongStringStatsCollector newInstance() { + return new LVLongStringStatsCollector(); + } + + private LVLongStringStatsCollector() { + + } + + @Override + protected byte[] getActualValue(byte[] value) { + byte[] actualValue; + assert (value.length >= 4); + if (value.length == 4) { + assert (value[0] == 0 && value[1] == 0); + actualValue = new byte[0]; + } else { + int length = ByteUtil.toInt(value, 0); + assert (length > 0); + actualValue = new byte[value.length - 4]; + System.arraycopy(value, 4, actualValue, 0, actualValue.length); + } + return actualValue; + } +} http://git-wip-us.apache.org/repos/asf/carbondata/blob/dc53dee2/core/src/main/java/org/apache/carbondata/core/datastore/page/statistics/LVShortStringStatsCollector.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/carbondata/core/datastore/page/statistics/LVShortStringStatsCollector.java b/core/src/main/java/org/apache/carbondata/core/datastore/page/statistics/LVShortStringStatsCollector.java new file mode 100644 index 0000000..21b06d5 --- /dev/null +++ b/core/src/main/java/org/apache/carbondata/core/datastore/page/statistics/LVShortStringStatsCollector.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.carbondata.core.datastore.page.statistics; + +import org.apache.carbondata.core.util.ByteUtil; + +/** + * This class is for the columns with string data type which hold less than 32000 characters + */ +public class LVShortStringStatsCollector extends LVStringStatsCollector { + + public static LVShortStringStatsCollector newInstance() { + return new LVShortStringStatsCollector(); + } + + private LVShortStringStatsCollector() { + + } + + @Override + protected byte[] getActualValue(byte[] value) { + byte[] actualValue; + assert (value.length >= 2); + if (value.length == 2) { + assert (value[0] == 0 && value[1] == 0); + actualValue = new byte[0]; + } else { + int length = ByteUtil.toShort(value, 0); + assert (length > 0); + actualValue = new byte[value.length - 2]; + System.arraycopy(value, 2, actualValue, 0, actualValue.length); + } + return actualValue; + } +} http://git-wip-us.apache.org/repos/asf/carbondata/blob/dc53dee2/core/src/main/java/org/apache/carbondata/core/datastore/page/statistics/LVStringStatsCollector.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/carbondata/core/datastore/page/statistics/LVStringStatsCollector.java b/core/src/main/java/org/apache/carbondata/core/datastore/page/statistics/LVStringStatsCollector.java index 7958a8d..e1ac676 100644 --- a/core/src/main/java/org/apache/carbondata/core/datastore/page/statistics/LVStringStatsCollector.java +++ b/core/src/main/java/org/apache/carbondata/core/datastore/page/statistics/LVStringStatsCollector.java @@ -23,18 +23,10 @@ import org.apache.carbondata.core.metadata.datatype.DataType; import org.apache.carbondata.core.metadata.datatype.DataTypes; import org.apache.carbondata.core.util.ByteUtil; -public class LVStringStatsCollector implements ColumnPageStatsCollector { +public abstract class LVStringStatsCollector implements ColumnPageStatsCollector { private byte[] min, max; - public static LVStringStatsCollector newInstance() { - return new LVStringStatsCollector(); - } - - private LVStringStatsCollector() { - - } - @Override public void updateNull(int rowId) { @@ -70,22 +62,13 @@ public class LVStringStatsCollector implements ColumnPageStatsCollector { } + protected abstract byte[] getActualValue(byte[] value); + @Override public void update(byte[] value) { // input value is LV encoded - byte[] newValue = null; - assert (value.length >= 2); - if (value.length == 2) { - assert (value[0] == 0 && value[1] == 0); - newValue = new byte[0]; - } else { - int length = (value[0] << 8) + (value[1] & 0xff); - assert (length > 0); - newValue = new byte[value.length - 2]; - System.arraycopy(value, 2, newValue, 0, newValue.length); - } - - if (null == min) { + byte[] newValue = getActualValue(value); + if (min == null) { min = newValue; } http://git-wip-us.apache.org/repos/asf/carbondata/blob/dc53dee2/core/src/main/java/org/apache/carbondata/core/indexstore/UnsafeMemoryDMStore.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/carbondata/core/indexstore/UnsafeMemoryDMStore.java b/core/src/main/java/org/apache/carbondata/core/indexstore/UnsafeMemoryDMStore.java index ca5e2dd..599877c 100644 --- a/core/src/main/java/org/apache/carbondata/core/indexstore/UnsafeMemoryDMStore.java +++ b/core/src/main/java/org/apache/carbondata/core/indexstore/UnsafeMemoryDMStore.java @@ -144,7 +144,7 @@ public class UnsafeMemoryDMStore extends AbstractMemoryDMStore { "unsupported data type for unsafe storage: " + schema.getDataType()); } break; - case VARIABLE: + case VARIABLE_SHORT: byte[] data = row.getByteArray(index); getUnsafe().putShort(memoryBlock.getBaseObject(), memoryBlock.getBaseOffset() + runningLength, (short) data.length); @@ -153,6 +153,15 @@ public class UnsafeMemoryDMStore extends AbstractMemoryDMStore { memoryBlock.getBaseOffset() + runningLength, data.length); runningLength += data.length; break; + case VARIABLE_INT: + byte[] data2 = row.getByteArray(index); + getUnsafe().putInt(memoryBlock.getBaseObject(), + memoryBlock.getBaseOffset() + runningLength, data2.length); + runningLength += 4; + getUnsafe().copyMemory(data2, BYTE_ARRAY_OFFSET, memoryBlock.getBaseObject(), + memoryBlock.getBaseOffset() + runningLength, data2.length); + runningLength += data2.length; + break; case STRUCT: CarbonRowSchema[] childSchemas = ((CarbonRowSchema.StructCarbonRowSchema) schema).getChildSchemas(); http://git-wip-us.apache.org/repos/asf/carbondata/blob/dc53dee2/core/src/main/java/org/apache/carbondata/core/indexstore/blockletindex/BlockletDataMap.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/carbondata/core/indexstore/blockletindex/BlockletDataMap.java b/core/src/main/java/org/apache/carbondata/core/indexstore/blockletindex/BlockletDataMap.java index 6e43fbc..4b5b36b 100644 --- a/core/src/main/java/org/apache/carbondata/core/indexstore/blockletindex/BlockletDataMap.java +++ b/core/src/main/java/org/apache/carbondata/core/indexstore/blockletindex/BlockletDataMap.java @@ -607,7 +607,13 @@ public class BlockletDataMap extends CoarseGrainDataMap implements Serializable CarbonRowSchema[] mapSchemas = new CarbonRowSchema[minMaxLen.length]; for (int i = 0; i < minMaxLen.length; i++) { if (minMaxLen[i] <= 0) { - mapSchemas[i] = new CarbonRowSchema.VariableCarbonRowSchema(DataTypes.BYTE_ARRAY); + boolean isVarchar = false; + if (i < segmentProperties.getDimensions().size() + && segmentProperties.getDimensions().get(i).getDataType() == DataTypes.VARCHAR) { + isVarchar = true; + } + mapSchemas[i] = new CarbonRowSchema.VariableCarbonRowSchema(DataTypes.BYTE_ARRAY, + isVarchar); } else { mapSchemas[i] = new CarbonRowSchema.FixedCarbonRowSchema(DataTypes.BYTE_ARRAY, minMaxLen[i]); http://git-wip-us.apache.org/repos/asf/carbondata/blob/dc53dee2/core/src/main/java/org/apache/carbondata/core/indexstore/row/DataMapRow.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/carbondata/core/indexstore/row/DataMapRow.java b/core/src/main/java/org/apache/carbondata/core/indexstore/row/DataMapRow.java index 496a1d0..b8b46ef 100644 --- a/core/src/main/java/org/apache/carbondata/core/indexstore/row/DataMapRow.java +++ b/core/src/main/java/org/apache/carbondata/core/indexstore/row/DataMapRow.java @@ -78,8 +78,10 @@ public abstract class DataMapRow implements Serializable { switch (schemas[ordinal].getSchemaType()) { case FIXED: return schemas[ordinal].getLength(); - case VARIABLE: + case VARIABLE_SHORT: return getLengthInBytes(ordinal) + 2; + case VARIABLE_INT: + return getLengthInBytes(ordinal) + 4; case STRUCT: return getRow(ordinal).getTotalSizeInBytes(); default: http://git-wip-us.apache.org/repos/asf/carbondata/blob/dc53dee2/core/src/main/java/org/apache/carbondata/core/indexstore/row/UnsafeDataMapRow.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/carbondata/core/indexstore/row/UnsafeDataMapRow.java b/core/src/main/java/org/apache/carbondata/core/indexstore/row/UnsafeDataMapRow.java index 1c1ecad..127e2a9 100644 --- a/core/src/main/java/org/apache/carbondata/core/indexstore/row/UnsafeDataMapRow.java +++ b/core/src/main/java/org/apache/carbondata/core/indexstore/row/UnsafeDataMapRow.java @@ -49,11 +49,16 @@ public class UnsafeDataMapRow extends DataMapRow { int length; int position = getPosition(ordinal); switch (schemas[ordinal].getSchemaType()) { - case VARIABLE: - length = - getUnsafe().getShort(block.getBaseObject(), block.getBaseOffset() + pointer + position); + case VARIABLE_SHORT: + length = getUnsafe().getShort(block.getBaseObject(), + block.getBaseOffset() + pointer + position); position += 2; break; + case VARIABLE_INT: + length = getUnsafe().getInt(block.getBaseObject(), + block.getBaseOffset() + pointer + position); + position += 4; + break; default: length = schemas[ordinal].getLength(); } @@ -67,9 +72,13 @@ public class UnsafeDataMapRow extends DataMapRow { int length; int position = getPosition(ordinal); switch (schemas[ordinal].getSchemaType()) { - case VARIABLE: - length = - getUnsafe().getShort(block.getBaseObject(), block.getBaseOffset() + pointer + position); + case VARIABLE_SHORT: + length = getUnsafe().getShort(block.getBaseObject(), + block.getBaseOffset() + pointer + position); + break; + case VARIABLE_INT: + length = getUnsafe().getInt(block.getBaseObject(), + block.getBaseOffset() + pointer + position); break; default: length = schemas[ordinal].getLength(); @@ -80,9 +89,13 @@ public class UnsafeDataMapRow extends DataMapRow { private int getLengthInBytes(int ordinal, int position) { int length; switch (schemas[ordinal].getSchemaType()) { - case VARIABLE: - length = - getUnsafe().getShort(block.getBaseObject(), block.getBaseOffset() + pointer + position); + case VARIABLE_SHORT: + length = getUnsafe().getShort(block.getBaseObject(), + block.getBaseOffset() + pointer + position); + break; + case VARIABLE_INT: + length = getUnsafe().getInt(block.getBaseObject(), + block.getBaseOffset() + pointer + position); break; default: length = schemas[ordinal].getLength(); @@ -226,21 +239,28 @@ public class UnsafeDataMapRow extends DataMapRow { "unsupported data type for unsafe storage: " + schema.getDataType()); } break; - case VARIABLE: - short length = getUnsafe().getShort( - block.getBaseObject(), - block.getBaseOffset() + pointer + runningLength); + case VARIABLE_SHORT: + int length = getUnsafe() + .getShort(block.getBaseObject(), block.getBaseOffset() + pointer + runningLength); runningLength += 2; byte[] data = new byte[length]; - getUnsafe().copyMemory( - block.getBaseObject(), + getUnsafe().copyMemory(block.getBaseObject(), block.getBaseOffset() + pointer + runningLength, - data, - BYTE_ARRAY_OFFSET, - data.length); + data, BYTE_ARRAY_OFFSET, data.length); runningLength += data.length; row.setByteArray(data, i); break; + case VARIABLE_INT: + int length2 = getUnsafe() + .getInt(block.getBaseObject(), block.getBaseOffset() + pointer + runningLength); + runningLength += 4; + byte[] data2 = new byte[length2]; + getUnsafe().copyMemory(block.getBaseObject(), + block.getBaseOffset() + pointer + runningLength, + data2, BYTE_ARRAY_OFFSET, data2.length); + runningLength += data2.length; + row.setByteArray(data2, i); + break; case STRUCT: DataMapRow structRow = ((UnsafeDataMapRow) getRow(i)).convertToSafeRow(); row.setRow(structRow, i); @@ -260,8 +280,10 @@ public class UnsafeDataMapRow extends DataMapRow { switch (schemas[ordinal].getSchemaType()) { case FIXED: return schemas[ordinal].getLength(); - case VARIABLE: + case VARIABLE_SHORT: return getLengthInBytes(ordinal, position) + 2; + case VARIABLE_INT: + return getLengthInBytes(ordinal, position) + 4; case STRUCT: return getRow(ordinal).getTotalSizeInBytes(); default: http://git-wip-us.apache.org/repos/asf/carbondata/blob/dc53dee2/core/src/main/java/org/apache/carbondata/core/indexstore/schema/CarbonRowSchema.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/carbondata/core/indexstore/schema/CarbonRowSchema.java b/core/src/main/java/org/apache/carbondata/core/indexstore/schema/CarbonRowSchema.java index 1a77467..971f42a 100644 --- a/core/src/main/java/org/apache/carbondata/core/indexstore/schema/CarbonRowSchema.java +++ b/core/src/main/java/org/apache/carbondata/core/indexstore/schema/CarbonRowSchema.java @@ -90,17 +90,23 @@ public abstract class CarbonRowSchema implements Serializable { } public static class VariableCarbonRowSchema extends CarbonRowSchema { + private boolean isVarcharType = false; public VariableCarbonRowSchema(DataType dataType) { super(dataType); } + public VariableCarbonRowSchema(DataType dataType, boolean isVarcharType) { + super(dataType); + this.isVarcharType = isVarcharType; + } + @Override public int getLength() { return dataType.getSizeInBytes(); } @Override public DataMapSchemaType getSchemaType() { - return DataMapSchemaType.VARIABLE; + return isVarcharType ? DataMapSchemaType.VARIABLE_INT : DataMapSchemaType.VARIABLE_SHORT; } } @@ -127,6 +133,6 @@ public abstract class CarbonRowSchema implements Serializable { } public enum DataMapSchemaType { - FIXED, VARIABLE, STRUCT + FIXED, VARIABLE_INT, VARIABLE_SHORT, STRUCT } } http://git-wip-us.apache.org/repos/asf/carbondata/blob/dc53dee2/core/src/main/java/org/apache/carbondata/core/metadata/blocklet/BlockletInfo.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/carbondata/core/metadata/blocklet/BlockletInfo.java b/core/src/main/java/org/apache/carbondata/core/metadata/blocklet/BlockletInfo.java index f77358f..420cd4e 100644 --- a/core/src/main/java/org/apache/carbondata/core/metadata/blocklet/BlockletInfo.java +++ b/core/src/main/java/org/apache/carbondata/core/metadata/blocklet/BlockletInfo.java @@ -268,7 +268,7 @@ public class BlockletInfo implements Serializable, Writable { @Override public void readFields(DataInput input) throws IOException { dimensionOffset = input.readLong(); measureOffsets = input.readLong(); - short dimensionChunkOffsetsSize = input.readShort(); + int dimensionChunkOffsetsSize = input.readShort(); dimensionChunkOffsets = new ArrayList<>(dimensionChunkOffsetsSize); for (int i = 0; i < dimensionChunkOffsetsSize; i++) { dimensionChunkOffsets.add(input.readLong()); http://git-wip-us.apache.org/repos/asf/carbondata/blob/dc53dee2/core/src/main/java/org/apache/carbondata/core/metadata/converter/ThriftWrapperSchemaConverterImpl.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/carbondata/core/metadata/converter/ThriftWrapperSchemaConverterImpl.java b/core/src/main/java/org/apache/carbondata/core/metadata/converter/ThriftWrapperSchemaConverterImpl.java index 12f5fc3..87dda33 100644 --- a/core/src/main/java/org/apache/carbondata/core/metadata/converter/ThriftWrapperSchemaConverterImpl.java +++ b/core/src/main/java/org/apache/carbondata/core/metadata/converter/ThriftWrapperSchemaConverterImpl.java @@ -112,6 +112,8 @@ public class ThriftWrapperSchemaConverterImpl implements SchemaConverter { return org.apache.carbondata.format.Encoding.RLE; case INVERTED_INDEX: return org.apache.carbondata.format.Encoding.INVERTED_INDEX; + case DIRECT_COMPRESS_VARCHAR: + return org.apache.carbondata.format.Encoding.DIRECT_COMPRESS_VARCHAR; case BIT_PACKED: return org.apache.carbondata.format.Encoding.BIT_PACKED; case DIRECT_DICTIONARY: @@ -154,6 +156,8 @@ public class ThriftWrapperSchemaConverterImpl implements SchemaConverter { return org.apache.carbondata.format.DataType.ARRAY; } else if (DataTypes.isStructType(dataType)) { return org.apache.carbondata.format.DataType.STRUCT; + } else if (dataType.getId() == DataTypes.VARCHAR.getId()) { + return org.apache.carbondata.format.DataType.VARCHAR; } else { return org.apache.carbondata.format.DataType.STRING; } @@ -447,6 +451,8 @@ public class ThriftWrapperSchemaConverterImpl implements SchemaConverter { return Encoding.RLE; case INVERTED_INDEX: return Encoding.INVERTED_INDEX; + case DIRECT_COMPRESS_VARCHAR: + return Encoding.DIRECT_COMPRESS_VARCHAR; case BIT_PACKED: return Encoding.BIT_PACKED; case DIRECT_DICTIONARY: @@ -490,6 +496,8 @@ public class ThriftWrapperSchemaConverterImpl implements SchemaConverter { return DataTypes.createDefaultArrayType(); case STRUCT: return DataTypes.createDefaultStructType(); + case VARCHAR: + return DataTypes.VARCHAR; default: return DataTypes.STRING; } http://git-wip-us.apache.org/repos/asf/carbondata/blob/dc53dee2/core/src/main/java/org/apache/carbondata/core/metadata/datatype/DataType.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/carbondata/core/metadata/datatype/DataType.java b/core/src/main/java/org/apache/carbondata/core/metadata/datatype/DataType.java index d71f984..4dc1fbc 100644 --- a/core/src/main/java/org/apache/carbondata/core/metadata/datatype/DataType.java +++ b/core/src/main/java/org/apache/carbondata/core/metadata/datatype/DataType.java @@ -69,6 +69,7 @@ public class DataType implements Serializable { public static final char DOUBLE_MEASURE_CHAR = 'n'; public static final char STRING_CHAR = 's'; + public static final char VARCHAR_CHAR = 'v'; public static final char TIMESTAMP_CHAR = 't'; public static final char DATE_CHAR = 'x'; public static final char BYTE_ARRAY_CHAR = 'y'; @@ -89,6 +90,8 @@ public class DataType implements Serializable { return BIG_DECIMAL_MEASURE_CHAR; } else if (dataType == DataTypes.STRING) { return STRING_CHAR; + } else if (dataType == DataTypes.VARCHAR) { + return VARCHAR_CHAR; } else if (dataType == DataTypes.TIMESTAMP) { return TIMESTAMP_CHAR; } else if (dataType == DataTypes.DATE) { http://git-wip-us.apache.org/repos/asf/carbondata/blob/dc53dee2/core/src/main/java/org/apache/carbondata/core/metadata/datatype/DataTypes.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/carbondata/core/metadata/datatype/DataTypes.java b/core/src/main/java/org/apache/carbondata/core/metadata/datatype/DataTypes.java index dc89a41..d71eea4 100644 --- a/core/src/main/java/org/apache/carbondata/core/metadata/datatype/DataTypes.java +++ b/core/src/main/java/org/apache/carbondata/core/metadata/datatype/DataTypes.java @@ -47,6 +47,8 @@ public class DataTypes { // Only for internal use for backward compatability. It is only used for V1 version public static final DataType LEGACY_LONG = LegacyLongType.LEGACY_LONG; + public static final DataType VARCHAR = VarcharType.VARCHAR; + // these IDs are used within this package only static final int STRING_TYPE_ID = 0; static final int DATE_TYPE_ID = 1; @@ -66,6 +68,7 @@ public class DataTypes { public static final int ARRAY_TYPE_ID = 11; public static final int STRUCT_TYPE_ID = 12; public static final int MAP_TYPE_ID = 13; + public static final int VARCHAR_TYPE_ID = 18; /** * create a DataType instance from uniqueId of the DataType @@ -107,6 +110,8 @@ public class DataTypes { return createDefaultMapType(); } else if (id == BYTE_ARRAY.getId()) { return BYTE_ARRAY; + } else if (id == VARCHAR.getId()) { + return VARCHAR; } else { throw new RuntimeException("create DataType with invalid id: " + id); } http://git-wip-us.apache.org/repos/asf/carbondata/blob/dc53dee2/core/src/main/java/org/apache/carbondata/core/metadata/datatype/VarcharType.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/carbondata/core/metadata/datatype/VarcharType.java b/core/src/main/java/org/apache/carbondata/core/metadata/datatype/VarcharType.java new file mode 100644 index 0000000..bfde1a9 --- /dev/null +++ b/core/src/main/java/org/apache/carbondata/core/metadata/datatype/VarcharType.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.carbondata.core.metadata.datatype; + +/** + * This class is for internal use. It is used to support string that longer than 32000 characters + */ +public class VarcharType extends DataType { + static final DataType VARCHAR = new VarcharType(DataTypes.VARCHAR_TYPE_ID, 0, "VARCHAR", -1); + + private VarcharType(int id, int precedenceOrder, String name, int sizeInBytes) { + super(id, precedenceOrder, name, sizeInBytes); + } + + // this function is needed to ensure singleton pattern while supporting java serialization + private Object readResolve() { + return DataTypes.VARCHAR; + } +} http://git-wip-us.apache.org/repos/asf/carbondata/blob/dc53dee2/core/src/main/java/org/apache/carbondata/core/metadata/encoder/Encoding.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/carbondata/core/metadata/encoder/Encoding.java b/core/src/main/java/org/apache/carbondata/core/metadata/encoder/Encoding.java index 06d09f8..f3c21b1 100644 --- a/core/src/main/java/org/apache/carbondata/core/metadata/encoder/Encoding.java +++ b/core/src/main/java/org/apache/carbondata/core/metadata/encoder/Encoding.java @@ -31,7 +31,8 @@ public enum Encoding { DIRECT_COMPRESS, ADAPTIVE_INTEGRAL, ADAPTIVE_DELTA_INTEGRAL, - RLE_INTEGRAL; + RLE_INTEGRAL, + DIRECT_COMPRESS_VARCHAR; public static Encoding valueOf(int ordinal) { if (ordinal == DICTIONARY.ordinal()) { @@ -56,6 +57,8 @@ public enum Encoding { return ADAPTIVE_DELTA_INTEGRAL; } else if (ordinal == RLE_INTEGRAL.ordinal()) { return RLE_INTEGRAL; + } else if (ordinal == DIRECT_COMPRESS_VARCHAR.ordinal()) { + return DIRECT_COMPRESS_VARCHAR; } else { throw new RuntimeException("create Encoding with invalid ordinal: " + ordinal); } http://git-wip-us.apache.org/repos/asf/carbondata/blob/dc53dee2/core/src/main/java/org/apache/carbondata/core/metadata/schema/table/TableSchemaBuilder.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/carbondata/core/metadata/schema/table/TableSchemaBuilder.java b/core/src/main/java/org/apache/carbondata/core/metadata/schema/table/TableSchemaBuilder.java index bb7e901..40f8725 100644 --- a/core/src/main/java/org/apache/carbondata/core/metadata/schema/table/TableSchemaBuilder.java +++ b/core/src/main/java/org/apache/carbondata/core/metadata/schema/table/TableSchemaBuilder.java @@ -203,6 +203,7 @@ public class TableSchemaBuilder { } } } + // todo: need more information such as long_string_columns return newColumn; } http://git-wip-us.apache.org/repos/asf/carbondata/blob/dc53dee2/core/src/main/java/org/apache/carbondata/core/util/AbstractDataFileFooterConverter.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/carbondata/core/util/AbstractDataFileFooterConverter.java b/core/src/main/java/org/apache/carbondata/core/util/AbstractDataFileFooterConverter.java index f005d88..7cd0c18 100644 --- a/core/src/main/java/org/apache/carbondata/core/util/AbstractDataFileFooterConverter.java +++ b/core/src/main/java/org/apache/carbondata/core/util/AbstractDataFileFooterConverter.java @@ -436,6 +436,8 @@ public abstract class AbstractDataFileFooterConverter { return DataTypes.createDefaultArrayType(); case STRUCT: return DataTypes.createDefaultStructType(); + case VARCHAR: + return DataTypes.VARCHAR; default: return DataTypes.STRING; } http://git-wip-us.apache.org/repos/asf/carbondata/blob/dc53dee2/core/src/main/java/org/apache/carbondata/core/util/CarbonUtil.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/carbondata/core/util/CarbonUtil.java b/core/src/main/java/org/apache/carbondata/core/util/CarbonUtil.java index 2f34163..1f6c697 100644 --- a/core/src/main/java/org/apache/carbondata/core/util/CarbonUtil.java +++ b/core/src/main/java/org/apache/carbondata/core/util/CarbonUtil.java @@ -2309,6 +2309,8 @@ public final class CarbonUtil { return DataTypes.createDefaultArrayType(); case STRUCT: return DataTypes.createDefaultStructType(); + case VARCHAR: + return DataTypes.VARCHAR; default: return DataTypes.STRING; } @@ -2499,8 +2501,10 @@ public final class CarbonUtil { return DataTypeUtil.bigDecimalToByte((BigDecimal) value); } else if (dataType == DataTypes.BYTE_ARRAY) { return (byte[]) value; - } else if (dataType == DataTypes.STRING || dataType == DataTypes.TIMESTAMP || - dataType == DataTypes.DATE) { + } else if (dataType == DataTypes.STRING + || dataType == DataTypes.TIMESTAMP + || dataType == DataTypes.DATE + || dataType == DataTypes.VARCHAR) { return (byte[]) value; } else { throw new IllegalArgumentException("Invalid data type: " + dataType); http://git-wip-us.apache.org/repos/asf/carbondata/blob/dc53dee2/core/src/main/java/org/apache/carbondata/core/util/DataTypeUtil.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/carbondata/core/util/DataTypeUtil.java b/core/src/main/java/org/apache/carbondata/core/util/DataTypeUtil.java index e06c82e..c84b0da 100644 --- a/core/src/main/java/org/apache/carbondata/core/util/DataTypeUtil.java +++ b/core/src/main/java/org/apache/carbondata/core/util/DataTypeUtil.java @@ -856,6 +856,8 @@ public final class DataTypeUtil { return DataTypes.FLOAT; } else if (DataTypes.DOUBLE.getName().equalsIgnoreCase(name)) { return DataTypes.DOUBLE; + } else if (DataTypes.VARCHAR.getName().equalsIgnoreCase(name)) { + return DataTypes.VARCHAR; } else if (DataTypes.NULL.getName().equalsIgnoreCase(name)) { return DataTypes.NULL; } else if (DataTypes.BYTE_ARRAY.getName().equalsIgnoreCase(name)) { @@ -904,6 +906,8 @@ public final class DataTypeUtil { return DataTypes.FLOAT; } else if (DataTypes.DOUBLE.getName().equalsIgnoreCase(dataType.getName())) { return DataTypes.DOUBLE; + } else if (DataTypes.VARCHAR.getName().equalsIgnoreCase(dataType.getName())) { + return DataTypes.VARCHAR; } else if (DataTypes.NULL.getName().equalsIgnoreCase(dataType.getName())) { return DataTypes.NULL; } else if (DataTypes.BYTE_ARRAY.getName().equalsIgnoreCase(dataType.getName())) { http://git-wip-us.apache.org/repos/asf/carbondata/blob/dc53dee2/core/src/test/java/org/apache/carbondata/core/metadata/converter/ThriftWrapperSchemaConverterImplTest.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/org/apache/carbondata/core/metadata/converter/ThriftWrapperSchemaConverterImplTest.java b/core/src/test/java/org/apache/carbondata/core/metadata/converter/ThriftWrapperSchemaConverterImplTest.java index 67c7594..522bf41 100644 --- a/core/src/test/java/org/apache/carbondata/core/metadata/converter/ThriftWrapperSchemaConverterImplTest.java +++ b/core/src/test/java/org/apache/carbondata/core/metadata/converter/ThriftWrapperSchemaConverterImplTest.java @@ -1562,7 +1562,7 @@ public class ThriftWrapperSchemaConverterImplTest { } @Test public void testFromExternalToWrapperSchemaEvolutionEntry() { -long time =1112745600000L; + long time =1112745600000L; ColumnSchema wrapperColumnSchema = new ColumnSchema(); wrapperColumnSchema.setColumnUniqueId("1"); wrapperColumnSchema.setColumnName("columnName"); http://git-wip-us.apache.org/repos/asf/carbondata/blob/dc53dee2/format/src/main/thrift/schema.thrift ---------------------------------------------------------------------- diff --git a/format/src/main/thrift/schema.thrift b/format/src/main/thrift/schema.thrift index b44fe19..3af2b9a 100644 --- a/format/src/main/thrift/schema.thrift +++ b/format/src/main/thrift/schema.thrift @@ -35,6 +35,7 @@ enum DataType { BOOLEAN = 8, ARRAY = 20, STRUCT = 21, + VARCHAR = 22, } /** @@ -56,6 +57,7 @@ enum Encoding{ ADAPTIVE_FLOATING = 11; // Identifies that a column is encoded using AdaptiveFloatingCodec BOOL_BYTE = 12; // Identifies that a column is encoded using BooleanPageCodec ADAPTIVE_DELTA_FLOATING = 13; // Identifies that a column is encoded using AdaptiveDeltaFloatingCodec + DIRECT_COMPRESS_VARCHAR = 14; // Identifies that a columm is encoded using DirectCompressCodec, it is used for long string columns } enum PartitionType{ @@ -173,6 +175,7 @@ struct TableSchema{ 4: optional map<string,string> tableProperties; // Table properties configured by the user 5: optional BucketingInfo bucketingInfo; // Bucketing information 6: optional PartitionInfo partitionInfo; // Partition information + 7: optional list<string> long_string_columns // long string columns in the table } struct RelationIdentifier {