Github user xuchuanyin commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2683#discussion_r216968250 --- Diff: tools/cli/src/main/java/org/apache/carbondata/tool/DataSummary.java --- @@ -0,0 +1,360 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.carbondata.tool; + +import java.io.IOException; +import java.io.PrintStream; +import java.nio.charset.Charset; +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedHashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.carbondata.common.Strings; +import org.apache.carbondata.core.datastore.filesystem.CarbonFile; +import org.apache.carbondata.core.datastore.impl.FileFactory; +import org.apache.carbondata.core.memory.MemoryException; +import org.apache.carbondata.core.metadata.datatype.DataTypes; +import org.apache.carbondata.core.metadata.schema.table.column.ColumnSchema; +import org.apache.carbondata.core.reader.CarbonHeaderReader; +import org.apache.carbondata.core.statusmanager.LoadMetadataDetails; +import org.apache.carbondata.core.statusmanager.SegmentStatusManager; +import org.apache.carbondata.core.util.CarbonUtil; +import org.apache.carbondata.core.util.path.CarbonTablePath; +import org.apache.carbondata.format.BlockletInfo3; +import org.apache.carbondata.format.FileFooter3; +import org.apache.carbondata.format.FileHeader; +import org.apache.carbondata.format.TableInfo; + +import static org.apache.carbondata.core.constants.CarbonCommonConstants.DEFAULT_CHARSET; + +/** + * Data Summary command implementation for {@link CarbonCli} + */ +class DataSummary { + private String dataFolder; + private PrintStream out; + + private long numBlock; + private long numShard; + private long numBlocklet; + private long numPage; + private long numRow; + private long totalDataSize; + + // file path mapping to file object + private LinkedHashMap<String, DataFile> dataFiles = new LinkedHashMap<>(); + private CarbonFile tableStatusFile; + private CarbonFile schemaFile; + + DataSummary(String dataFolder, PrintStream out) throws IOException { + this.dataFolder = dataFolder; + this.out = out; + 
collectDataFiles(); + } + + private boolean isColumnarFile(String fileName) { + // if the timestamp in file name is not "0", it is a columnar file + return fileName.endsWith(CarbonTablePath.CARBON_DATA_EXT) && + !CarbonTablePath.DataFileUtil.getTimeStampFromFileName(fileName).equals("0"); + } + + private boolean isStreamFile(String fileName) { + // if the timestamp in file name is "0", it is a streaming file + return fileName.endsWith(CarbonTablePath.CARBON_DATA_EXT) && + CarbonTablePath.DataFileUtil.getTimeStampFromFileName(fileName).equals("0"); + } + + private void collectDataFiles() throws IOException { + Set<String> shards = new HashSet<>(); + CarbonFile folder = FileFactory.getCarbonFile(dataFolder); + List<CarbonFile> files = folder.listFiles(true); + List<DataFile> unsortedFiles = new ArrayList<>(); + for (CarbonFile file : files) { + if (isColumnarFile(file.getName())) { + DataFile dataFile = new DataFile(file); + unsortedFiles.add(dataFile); + collectNum(dataFile.getFooter()); + shards.add(dataFile.getShardName()); + totalDataSize += file.getSize(); + } else if (file.getName().endsWith(CarbonTablePath.TABLE_STATUS_FILE)) { + tableStatusFile = file; + } else if (file.getName().startsWith(CarbonTablePath.SCHEMA_FILE)) { + schemaFile = file; + } else if (isStreamFile(file.getName())) { + out.println("WARN: input path contains streaming file, this tool does not support it yet, " + + "skipping it..."); + } + } + unsortedFiles.sort((o1, o2) -> { + if (o1.getShardName().equalsIgnoreCase(o2.getShardName())) { + return Integer.parseInt(o1.getPartNo()) - Integer.parseInt(o2.getPartNo()); + } else { + return o1.getShardName().hashCode() - o2.getShardName().hashCode(); --- End diff -- Why not sort by the alphabetical order of the shardName directly?
---