Repository: carbondata Updated Branches: refs/heads/master fa9c8323c -> d8003a31c
[CARBONDATA-2976] Support dumping column chunk metadata in CarbonCli. By using the -k option, CarbonCli will print all column chunk/page metadata for the specified column. For example, java CarbonCli -cmd summary -p /path/to/table -c name -k will output: ## Page Meta for column 'name' in file /Users/jacky/code/carbondata/tools/cli/CarbonCliTest/part-0-138391629343461_batchno0-0-null-138390048546321.carbondata Blocklet 0: Page 0 (offset 0, length 12049): DataChunk2(chunk_meta:ChunkCompressionMeta(compression_codec:DEPRECATED, total_uncompressed_size:0, total_compressed_size:0, compressor_name:snappy), rowMajor:false, data_page_length:12039, rowid_page_length:10, presence:PresenceMeta(represents_presence:false, present_bit_stream:00), sort_state:SORT_EXPLICIT, encoders:[INVERTED_INDEX], encoder_meta:[], min_max:BlockletMinMaxIndex(min_values:[72 6F 62 6F 74 30], max_values:[72 6F 62 6F 74 30], min_max_presence:[true]), numberOfRowsInpage:32000) Page 1 (offset 12049, length 12049): DataChunk2(chunk_meta:ChunkCompressionMeta(compression_codec:DEPRECATED, total_uncompressed_size:0, total_compressed_size:0, compressor_name:snappy), rowMajor:false, data_page_length:12039, rowid_page_length:10, presence:PresenceMeta(represents_presence:false, present_bit_stream:00), sort_state:SORT_EXPLICIT, encoders:[INVERTED_INDEX], encoder_meta:[], min_max:BlockletMinMaxIndex(min_values:[72 6F 62 6F 74 30], max_values:[72 6F 62 6F 74 30], min_max_presence:[true]), numberOfRowsInpage:32000) Page 2 (offset 24098, length 12049): DataChunk2(chunk_meta:ChunkCompressionMeta(compression_codec:DEPRECATED, total_uncompressed_size:0, total_compressed_size:0, compressor_name:snappy), rowMajor:false, data_page_length:12039, rowid_page_length:10, 
presence:PresenceMeta(represents_presence:false, present_bit_stream:00), sort_state:SORT_EXPLICIT, encoders:[INVERTED_INDEX], encoder_meta:[], min_max:BlockletMinMaxIndex(min_values:[72 6F 62 6F 74 30], max_values:[72 6F 62 6F 74 30], min_max_presence:[true]), numberOfRowsInpage:32000) This closes #2771 Project: http://git-wip-us.apache.org/repos/asf/carbondata/repo Commit: http://git-wip-us.apache.org/repos/asf/carbondata/commit/d8003a31 Tree: http://git-wip-us.apache.org/repos/asf/carbondata/tree/d8003a31 Diff: http://git-wip-us.apache.org/repos/asf/carbondata/diff/d8003a31 Branch: refs/heads/master Commit: d8003a31c602807f00d438d8be392992cb0955ac Parents: fa9c832 Author: Jacky Li <jacky.li...@qq.com> Authored: Wed Sep 26 23:51:34 2018 +0800 Committer: ravipesala <ravi.pes...@gmail.com> Committed: Wed Oct 3 20:17:04 2018 +0530 ---------------------------------------------------------------------- .../org/apache/carbondata/tool/CarbonCli.java | 2 + .../org/apache/carbondata/tool/DataFile.java | 8 +++- .../org/apache/carbondata/tool/DataSummary.java | 45 ++++++++++++++++++-- .../apache/carbondata/tool/CarbonCliTest.java | 13 ++++++ 4 files changed, 63 insertions(+), 5 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/carbondata/blob/d8003a31/tools/cli/src/main/java/org/apache/carbondata/tool/CarbonCli.java ---------------------------------------------------------------------- diff --git a/tools/cli/src/main/java/org/apache/carbondata/tool/CarbonCli.java b/tools/cli/src/main/java/org/apache/carbondata/tool/CarbonCli.java index 5725f8e..f1baa92 100644 --- a/tools/cli/src/main/java/org/apache/carbondata/tool/CarbonCli.java +++ b/tools/cli/src/main/java/org/apache/carbondata/tool/CarbonCli.java @@ -65,6 +65,7 @@ public class CarbonCli { Option segment = new Option("m", "showSegment", false, "print segment information"); Option tblProperties = new Option("t", "tblProperties", false, "print table 
properties"); Option detail = new Option("b", "blocklet", false, "print blocklet size detail"); + Option columnMeta = new Option("k", "columnChunkMeta", false, "print column chunk meta"); Option columnName = OptionBuilder .withArgName("column name") .hasArg() @@ -82,6 +83,7 @@ public class CarbonCli { options.addOption(segment); options.addOption(tblProperties); options.addOption(detail); + options.addOption(columnMeta); options.addOption(columnName); return options; } http://git-wip-us.apache.org/repos/asf/carbondata/blob/d8003a31/tools/cli/src/main/java/org/apache/carbondata/tool/DataFile.java ---------------------------------------------------------------------- diff --git a/tools/cli/src/main/java/org/apache/carbondata/tool/DataFile.java b/tools/cli/src/main/java/org/apache/carbondata/tool/DataFile.java index da81d84..039401e 100644 --- a/tools/cli/src/main/java/org/apache/carbondata/tool/DataFile.java +++ b/tools/cli/src/main/java/org/apache/carbondata/tool/DataFile.java @@ -321,6 +321,8 @@ class DataFile { // they are set after calculation in DataSummary double minPercentage, maxPercentage; + DataChunk3 dataChunk; + /** * Constructor * @param blockletInfo blocklet info which this column chunk belongs to @@ -338,7 +340,7 @@ class DataFile { ByteBuffer buffer = fileReader.readByteBuffer( filePath, blockletInfo.column_data_chunks_offsets.get(columnIndex), blockletInfo.column_data_chunks_length.get(columnIndex)); - DataChunk3 dataChunk = CarbonUtil.readDataChunk3(new ByteArrayInputStream(buffer.array())); + dataChunk = CarbonUtil.readDataChunk3(new ByteArrayInputStream(buffer.array())); this.localDict = dataChunk.isSetLocal_dictionary(); if (this.localDict) { String compressorName = CarbonMetadataUtil.getCompressorNameFromChunkMeta( @@ -376,6 +378,10 @@ class DataFile { return column.getDataType(); } + public DataChunk3 getDataChunk3() { + return dataChunk; + } + byte[] min(byte[] minValue) { if (minValue == null) { return min; 
http://git-wip-us.apache.org/repos/asf/carbondata/blob/d8003a31/tools/cli/src/main/java/org/apache/carbondata/tool/DataSummary.java ---------------------------------------------------------------------- diff --git a/tools/cli/src/main/java/org/apache/carbondata/tool/DataSummary.java b/tools/cli/src/main/java/org/apache/carbondata/tool/DataSummary.java index 6463977..5f1fb68 100644 --- a/tools/cli/src/main/java/org/apache/carbondata/tool/DataSummary.java +++ b/tools/cli/src/main/java/org/apache/carbondata/tool/DataSummary.java @@ -38,6 +38,8 @@ import org.apache.carbondata.core.statusmanager.LoadMetadataDetails; import org.apache.carbondata.core.statusmanager.SegmentStatusManager; import org.apache.carbondata.core.util.CarbonUtil; import org.apache.carbondata.format.BlockletInfo3; +import org.apache.carbondata.format.DataChunk2; +import org.apache.carbondata.format.DataChunk3; import org.apache.carbondata.format.FileFooter3; import org.apache.carbondata.format.FileHeader; import org.apache.carbondata.format.TableInfo; @@ -91,6 +93,9 @@ class DataSummary implements Command { if (line.hasOption("c")) { String columName = line.getOptionValue("c"); printColumnStats(columName); + if (line.hasOption("k")) { + printColumnChunkMeta(columName); + } } } @@ -217,13 +222,13 @@ class DataSummary implements Command { throw new RuntimeException("schema for column " + columnName + " not found"); } + // true if blockled stats are collected + private boolean collected = false; + private void printColumnStats(String columnName) throws IOException, MemoryException { out.println(); out.println("## Column Statistics for '" + columnName + "'"); - for (DataFile dataFile : dataFiles.values()) { - dataFile.initAllBlockletStats(columnName); - } - collectAllBlockletStats(dataFiles.values()); + collectStats(columnName); int columnIndex = getColumnIndex(columnName); String[] header = new String[]{"BLK", "BLKLT", "Meta Size", "Data Size", @@ -260,6 +265,38 @@ class DataSummary implements Command 
{ printer.printFormatted(out); } + private void collectStats(String columnName) throws IOException, MemoryException { + if (!collected) { + for (DataFile dataFile : dataFiles.values()) { + dataFile.initAllBlockletStats(columnName); + } + collectAllBlockletStats(dataFiles.values()); + collected = true; + } + } + + private void printColumnChunkMeta(String columnName) throws IOException, MemoryException { + out.println(); + DataFile file = dataFiles.entrySet().iterator().next().getValue(); + out.println("## Page Meta for column '" + columnName + "' in file " + file.getFilePath()); + collectStats(columnName); + for (int i = 0; i < file.getAllBlocklets().size(); i++) { + DataFile.Blocklet blocklet = file.getAllBlocklets().get(i); + DataChunk3 dataChunk3 = blocklet.getColumnChunk().getDataChunk3(); + List<DataChunk2> dataChunk2List = dataChunk3.getData_chunk_list(); + out.println(String.format("Blocklet %d:", i)); + + // There will be many pages, for debugging purpose, + // just print 3 page for each blocklet is enough + for (int j = 0; j < dataChunk2List.size() && j < 3; j++) { + out.println(String.format("Page %d (offset %d, length %d): %s", + j, dataChunk3.page_offset.get(j), dataChunk3.page_length.get(j), + dataChunk2List.get(j).toString())); + } + out.println("\n"); + } + } + private void collectAllBlockletStats(Collection<DataFile> dataFiles) { // shard name mapping to blocklets belong to same shard Map<String, List<DataFile.Blocklet>> shards = new HashMap<>(); http://git-wip-us.apache.org/repos/asf/carbondata/blob/d8003a31/tools/cli/src/test/java/org/apache/carbondata/tool/CarbonCliTest.java ---------------------------------------------------------------------- diff --git a/tools/cli/src/test/java/org/apache/carbondata/tool/CarbonCliTest.java b/tools/cli/src/test/java/org/apache/carbondata/tool/CarbonCliTest.java index fcd46c8..e526131 100644 --- a/tools/cli/src/test/java/org/apache/carbondata/tool/CarbonCliTest.java +++ 
b/tools/cli/src/test/java/org/apache/carbondata/tool/CarbonCliTest.java @@ -184,6 +184,19 @@ public class CarbonCliTest { } @Test + public void testSummaryPageMeta() { + String[] args = { "-cmd", "summary", "-p", path, "-c", "name", "-k"}; + ByteArrayOutputStream out = new ByteArrayOutputStream(); + PrintStream stream = new PrintStream(out); + CarbonCli.run(args, stream); + String output = new String(out.toByteArray()); + Assert.assertTrue( + output.contains( + "Blocklet 0:\n" + + "Page 0 (offset 0, length 12049): DataChunk2(chunk_meta:ChunkCompressionMeta(compression_codec:DEPRECATED, total_uncompressed_size:0, total_compressed_size:0, compressor_name:snappy), rowMajor:false, data_page_length:12039, rowid_page_length:10, presence:PresenceMeta(represents_presence:false, present_bit_stream:00), sort_state:SORT_EXPLICIT, encoders:[INVERTED_INDEX], encoder_meta:[], min_max:BlockletMinMaxIndex(min_values:[72 6F 62 6F 74 30], max_values:[72 6F 62 6F 74 30], min_max_presence:[true]), numberOfRowsInpage:32000)")); + } + + @Test public void testBenchmark() { String[] args = {"-cmd", "benchmark", "-p", path, "-a", "-c", "name"}; ByteArrayOutputStream out = new ByteArrayOutputStream();