[ 
https://issues.apache.org/jira/browse/DRILL-5601?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16086492#comment-16086492
 ] 

ASF GitHub Bot commented on DRILL-5601:
---------------------------------------

Github user Ben-Zvi commented on a diff in the pull request:

    https://github.com/apache/drill/pull/860#discussion_r123838906
  
    --- Diff: 
exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/spill/RecordBatchSizer.java
 ---
    @@ -19,117 +19,162 @@
     
     import java.util.ArrayList;
     import java.util.List;
    +import java.util.Set;
     
    +import org.apache.drill.common.types.TypeProtos.DataMode;
     import org.apache.drill.common.types.TypeProtos.MinorType;
     import org.apache.drill.exec.expr.TypeHelper;
    +import org.apache.drill.exec.memory.AllocationManager.BufferLedger;
     import org.apache.drill.exec.memory.BaseAllocator;
     import org.apache.drill.exec.record.BatchSchema;
     import org.apache.drill.exec.record.MaterializedField;
     import org.apache.drill.exec.record.RecordBatch;
    +import org.apache.drill.exec.record.SmartAllocationHelper;
     import org.apache.drill.exec.record.VectorAccessible;
     import org.apache.drill.exec.record.VectorWrapper;
     import org.apache.drill.exec.record.selection.SelectionVector2;
    +import org.apache.drill.exec.vector.UInt4Vector;
     import org.apache.drill.exec.vector.ValueVector;
     import org.apache.drill.exec.vector.complex.AbstractMapVector;
    +import org.apache.drill.exec.vector.complex.RepeatedValueVector;
     import org.apache.drill.exec.vector.VariableWidthVector;
     
    +import com.google.common.collect.Sets;
    +
     /**
      * Given a record batch or vector container, determines the actual memory
      * consumed by each column, the average row, and the entire record batch.
      */
     
     public class RecordBatchSizer {
    -//  private static final org.slf4j.Logger logger = 
org.slf4j.LoggerFactory.getLogger(RecordBatchSizer.class);
     
       /**
        * Column size information.
        */
       public static class ColumnSize {
    +    public final String prefix;
         public final MaterializedField metadata;
     
         /**
    -     * Assumed size from Drill metadata.
    +     * Assumed size from Drill metadata. Note that this information is
    +     * 100% bogus. Do not use it.
          */
     
    +    @Deprecated
         public int stdSize;
     
         /**
    -     * Actual memory consumed by all the vectors associated with this 
column.
    -     */
    -
    -    public int totalSize;
    -
    -    /**
          * Actual average column width as determined from actual memory use. 
This
          * size is larger than the actual data size since this size includes 
per-
          * column overhead such as any unused vector space, etc.
          */
     
    -    public int estSize;
    -    public int capacity;
    -    public int density;
    -    public int dataSize;
    -    public boolean variableWidth;
    -
    -    public ColumnSize(ValueVector vv) {
    -      metadata = vv.getField();
    -      stdSize = TypeHelper.getSize(metadata.getType());
    -
    -      // Can't get size estimates if this is an empty batch.
    -      int rowCount = vv.getAccessor().getValueCount();
    -      if (rowCount == 0) {
    -        estSize = stdSize;
    -        return;
    -      }
    +    public final int estSize;
    +    public final int valueCount;
    +    public final int entryCount;
    +    public final int dataSize;
    +    public final int estElementCount;
    +    public final boolean isVariableWidth;
     
    -      // Total size taken by all vectors (and underlying buffers)
    -      // associated with this vector.
    +    public ColumnSize(ValueVector v, String prefix, int valueCount) {
    +      this.prefix = prefix;
    +      this.valueCount = valueCount;
    +      metadata = v.getField();
    +      boolean isMap = v.getField().getType().getMinorType() == 
MinorType.MAP;
     
    -      totalSize = vv.getAllocatedByteCount();
    +      // The amount of memory consumed by the payload: the actual
    +      // data stored in the vectors.
     
    -      // Capacity is the number of values that the vector could
    -      // contain. This is useful only for fixed-length vectors.
    +      int mapSize = 0;
    +      if (v.getField().getDataMode() == DataMode.REPEATED) {
     
    -      capacity = vv.getValueCapacity();
    +        // Repeated vectors are special: they have an associated offset 
vector
    +        // that changes the value count of the contained vectors.
     
    -      // The amount of memory consumed by the payload: the actual
    -      // data stored in the vectors.
    +        @SuppressWarnings("resource")
    +        UInt4Vector offsetVector = ((RepeatedValueVector) 
v).getOffsetVector();
    +        entryCount = offsetVector.getAccessor().get(valueCount);
    +        estElementCount = roundUp(entryCount, valueCount);
    +        if (isMap) {
     
    -      dataSize = vv.getPayloadByteCount();
    +          // For map, the only data associated with the map vector
    +          // itself is the offset vector, if any.
     
    -      // Determine "density" the number of rows compared to potential
    -      // capacity. Low-density batches occur at block boundaries, ends
    -      // of files and so on. Low-density batches throw off our estimates
    -      // for Varchar columns because we don't know the actual number of
    -      // bytes consumed (that information is hidden behind the Varchar
    -      // implementation where we can't get at it.)
    +          mapSize = offsetVector.getPayloadByteCount(valueCount);
    +        }
    +      } else {
    +        entryCount = valueCount;
    +        estElementCount = 1;
    +      }
     
    -      density = roundUp(dataSize * 100, totalSize);
    -      estSize = roundUp(dataSize, rowCount);
    -      variableWidth = vv instanceof VariableWidthVector ;
    +      if (isMap) {
    +        dataSize = mapSize;
    +        stdSize = 0;
    +     } else {
    +        dataSize = v.getPayloadByteCount(valueCount);
    +        stdSize = TypeHelper.getSize(metadata.getType());
    +     }
    +      estSize = roundUp(dataSize, valueCount);
    +      isVariableWidth = v instanceof VariableWidthVector ;
         }
     
         @Override
         public String toString() {
           StringBuilder buf = new StringBuilder()
    +          .append(prefix)
               .append(metadata.getName())
               .append("(type: ")
    +          .append(metadata.getType().getMode().name())
    +          .append(" ")
               .append(metadata.getType().getMinorType().name())
    -          .append(", std col. size: ")
    +          .append(", count: ")
    +          .append(valueCount);
    +      if (metadata.getDataMode() == DataMode.REPEATED) {
    +        buf.append(", total entries: ")
    +           .append(entryCount)
    +           .append(", per-array: ")
    +           .append(estElementCount);
    +      }
    +      buf .append(", std size: ")
               .append(stdSize)
    -          .append(", actual col. size: ")
    +          .append(", actual size: ")
               .append(estSize)
    -          .append(", total size: ")
    -          .append(totalSize)
               .append(", data size: ")
               .append(dataSize)
    -          .append(", row capacity: ")
    -          .append(capacity)
    -          .append(", density: ")
    -          .append(density)
               .append(")");
           return buf.toString();
         }
    +
    +    public void buildAllocHelper(SmartAllocationHelper allocHelper) {
    +      int width = 0;
    +      switch(metadata.getType().getMinorType()) {
    +      case VAR16CHAR:
    +      case VARBINARY:
    +      case VARCHAR:
    +
    +        // Subtract out the offset vector width
    +        width = estSize - 4;
    +
    +        // Subtract out the bits (is-set) vector width
    +        if (metadata.getDataMode() == DataMode.OPTIONAL) {
    +          width -= 1;
    +        }
    +        break;
    +      default:
    +        break;
    +      }
    +      String name = prefix + metadata.getName();
    +      if (metadata.getDataMode() == DataMode.REPEATED) {
    +        if (width > 0) {
    +          allocHelper.variableWidthArray(name, width, estElementCount);
    --- End diff --
    
    (1) The methods variableWidthArray() and variableWidth() can be combined 
into one (as the latter is called when estElementCount is 1, which makes the 
two do the same).
    (2) Can move this call (combined with the one below - on line 175) into the 
new if statement above that handles variable subtypes.


> Rollup of External Sort memory management fixes
> -----------------------------------------------
>
>                 Key: DRILL-5601
>                 URL: https://issues.apache.org/jira/browse/DRILL-5601
>             Project: Apache Drill
>          Issue Type: Task
>    Affects Versions: 1.11.0
>            Reporter: Paul Rogers
>            Assignee: Paul Rogers
>             Fix For: 1.12.0
>
>
> Rollup of a set of specific JIRA entries that all relate to the very 
> difficult problem of managing memory within Drill in order for the external 
> sort to stay within a memory budget. In general, the fixes relate to better 
> estimating memory used by the three ways that Drill allocates vector memory 
> (see DRILL-5522) and to predicting the size of vectors that the sort will 
> create, to avoid repeated realloc-copy cycles (see DRILL-5594).



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)

Reply via email to