Github user xuchuanyin commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2683#discussion_r216968250 --- Diff: tools/cli/src/main/java/org/apache/carbondata/tool/DataSummary.java --- @@ -0,0 +1,360 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.carbondata.tool; + +import java.io.IOException; +import java.io.PrintStream; +import java.nio.charset.Charset; +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedHashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.carbondata.common.Strings; +import org.apache.carbondata.core.datastore.filesystem.CarbonFile; +import org.apache.carbondata.core.datastore.impl.FileFactory; +import org.apache.carbondata.core.memory.MemoryException; +import org.apache.carbondata.core.metadata.datatype.DataTypes; +import org.apache.carbondata.core.metadata.schema.table.column.ColumnSchema; +import org.apache.carbondata.core.reader.CarbonHeaderReader; +import org.apache.carbondata.core.statusmanager.LoadMetadataDetails; +import org.apache.carbondata.core.statusmanager.SegmentStatusManager; +import org.apache.carbondata.core.util.CarbonUtil; +import org.apache.carbondata.core.util.path.CarbonTablePath; +import org.apache.carbondata.format.BlockletInfo3; +import org.apache.carbondata.format.FileFooter3; +import org.apache.carbondata.format.FileHeader; +import org.apache.carbondata.format.TableInfo; + +import static org.apache.carbondata.core.constants.CarbonCommonConstants.DEFAULT_CHARSET; + +/** + * Data Summary command implementation for {@link CarbonCli} + */ +class DataSummary { + private String dataFolder; + private PrintStream out; + + private long numBlock; + private long numShard; + private long numBlocklet; + private long numPage; + private long numRow; + private long totalDataSize; + + // file path mapping to file object + private LinkedHashMap<String, DataFile> dataFiles = new LinkedHashMap<>(); + private CarbonFile tableStatusFile; + private CarbonFile schemaFile; + + DataSummary(String dataFolder, PrintStream out) throws IOException { + this.dataFolder = dataFolder; + this.out = out; + 
collectDataFiles(); + } + + private boolean isColumnarFile(String fileName) { + // if the timestamp in file name is not "0", it is a columnar file + return fileName.endsWith(CarbonTablePath.CARBON_DATA_EXT) && + !CarbonTablePath.DataFileUtil.getTimeStampFromFileName(fileName).equals("0"); + } + + private boolean isStreamFile(String fileName) { + // if the timestamp in file name is "0", it is a streaming file + return fileName.endsWith(CarbonTablePath.CARBON_DATA_EXT) && + CarbonTablePath.DataFileUtil.getTimeStampFromFileName(fileName).equals("0"); + } + + private void collectDataFiles() throws IOException { + Set<String> shards = new HashSet<>(); + CarbonFile folder = FileFactory.getCarbonFile(dataFolder); + List<CarbonFile> files = folder.listFiles(true); + List<DataFile> unsortedFiles = new ArrayList<>(); + for (CarbonFile file : files) { + if (isColumnarFile(file.getName())) { + DataFile dataFile = new DataFile(file); + unsortedFiles.add(dataFile); + collectNum(dataFile.getFooter()); + shards.add(dataFile.getShardName()); + totalDataSize += file.getSize(); + } else if (file.getName().endsWith(CarbonTablePath.TABLE_STATUS_FILE)) { + tableStatusFile = file; + } else if (file.getName().startsWith(CarbonTablePath.SCHEMA_FILE)) { + schemaFile = file; + } else if (isStreamFile(file.getName())) { + out.println("WARN: input path contains streaming file, this tool does not support it yet, " + + "skipping it..."); + } + } + unsortedFiles.sort((o1, o2) -> { + if (o1.getShardName().equalsIgnoreCase(o2.getShardName())) { + return Integer.parseInt(o1.getPartNo()) - Integer.parseInt(o2.getPartNo()); + } else { + return o1.getShardName().hashCode() - o2.getShardName().hashCode(); --- End diff -- Why not sort by the alphabetical order of the shardName directly?
---