Github user paul-rogers commented on a diff in the pull request:
https://github.com/apache/drill/pull/1125#discussion_r172105104
--- Diff: exec/java-exec/src/main/java/org/apache/drill/exec/record/RecordBatchSizer.java ---
@@ -76,110 +82,327 @@
* greater than (but unlikely) same as the row count.
*/
- public final int valueCount;
+ private final int valueCount;
/**
- * Total number of elements for a repeated type, or 1 if this is
- * a non-repeated type. That is, a batch of 100 rows may have an
- * array with 10 elements per row. In this case, the element count
- * is 1000.
+ * Total number of elements for a repeated type, or same as
+ * valueCount if this is a non-repeated type. That is, a batch
+ * of 100 rows may have an array with 10 elements per row.
+ * In this case, the element count is 1000.
*/
- public final int elementCount;
+ private int elementCount;
/**
- * Size of the top level value vector. For map and repeated list,
- * this is just size of offset vector.
+ * The estimated, average number of elements per parent value.
+ * Always 1 for a non-repeated type. For a repeated type,
+ * this is the average entries per array (per repeated element).
*/
- public int dataSize;
+
+ private float estElementCountPerArray;
/**
- * Total size of the column includes the sum total of memory for all
- * value vectors representing the column.
+ * Indicates if this is a variable width column.
+ * For map columns, this is true if any of the children is a
+ * variable width column.
*/
- public int netSize;
+
+ private boolean isVariableWidth;
/**
- * The estimated, average number of elements per parent value.
- * Always 1 for a non-repeated type. For a repeated type,
- * this is the average entries per array (per repeated element).
+ * Indicates if cardinality is repeated (top level only).
+ */
+
+ private boolean isRepeated;
+
+ /**
+ * Indicates if cardinality is optional i.e. nullable(top level only).
+ */
+ private boolean isOptional;
+
+ /**
+ * Child columns if this is a map column.
+ */
+ private Map<String, ColumnSize> children = CaseInsensitiveMap.newHashMap();
+
+ /**
+ * std pure data size per entry from Drill metadata, based on type.
+ * Does not include metadata vector overhead we add for cardinality,
+ * variable length etc.
+ * For variable-width columns, we use 50 as std size for entry width.
+ * For repeated column, we assume repetition of 10.
+ */
+ public int getStdDataSizePerEntry() {
+ int stdDataSize;
+
+ try {
+ stdDataSize = TypeHelper.getSize(metadata.getType());
+
+ // For variable width, typeHelper includes offset vector width. Adjust for that.
+ if (isVariableWidth) {
+ stdDataSize -= OFFSET_VECTOR_WIDTH;
+ }
+
+ if (isRepeated) {
+ stdDataSize = stdDataSize * STD_REPETITION_FACTOR;
+ }
+ } catch (Exception e) {
+ // For unsupported types, just set stdSize to 0.
+ // Map, Union, List etc.
+ stdDataSize = 0;
+ }
+
+ // Add sizes of children.
+ for (ColumnSize columnSize : children.values()) {
+ stdDataSize += columnSize.getStdDataSizePerEntry();
+ }
+
+ if (isRepeatedList()) {
+ stdDataSize = stdDataSize * STD_REPETITION_FACTOR;
+ }
+
+ return stdDataSize;
+ }
+
+ /**
+ * std net size per entry taking into account additional metadata vectors
+ * we add on top for variable length, cardinality etc.
+ * For variable-width columns, we use 50 as std data size for entry width.
+ * For repeated column, we assume repetition of 10.
+ */
+ public int getStdNetSizePerEntry() {
+ int stdNetSize;
+ try {
+ stdNetSize = TypeHelper.getSize(metadata.getType());
+ } catch (Exception e) {
+ stdNetSize = 0;
+ }
+
+ if (isOptional) {
+ stdNetSize += BIT_VECTOR_WIDTH;
+ }
+
+ if (isRepeated) {
+ stdNetSize = (stdNetSize * STD_REPETITION_FACTOR) + OFFSET_VECTOR_WIDTH;
+ }
+
+ for (ColumnSize columnSize : children.values()) {
+ stdNetSize += columnSize.getStdNetSizePerEntry();
+ }
+
+ if (isRepeatedList()) {
+ stdNetSize = (stdNetSize * STD_REPETITION_FACTOR) + OFFSET_VECTOR_WIDTH;
+ }
+
+ return stdNetSize;
+ }
+
+ /**
+ * This is the average actual per entry data size in bytes. Does not
+ * include any overhead of metadata vectors.
+ * For repeated columns, it is average for the repeated array, not
+ * individual entry in the array.
+ */
+ public int getDataSizePerEntry() {
+ return safeDivide(getTotalDataSize(), getValueCount());
+ }
+
+ /**
+ * This is the average per entry size of just pure data plus
+ * overhead of additional vectors we add on top like bits vector,
+ * offset vector etc. This
+ * size is larger than the actual data size since this size includes per-
+ * column overhead for additional vectors we add for
+ * cardinality, variable length etc.
+ */
+ public int getNetSizePerEntry() {
+ return safeDivide(getTotalNetSize(), getValueCount());
+ }
+
+ /**
+ * This is the total data size for the column, including children for map
+ * columns. Does not include any overhead of metadata vectors.
+ */
+ public int getTotalDataSize() {
+ int dataSize = this.totalDataSize;
+ for (ColumnSize columnSize : children.values()) {
+ dataSize += columnSize.getTotalDataSize();
+ }
+ return dataSize;
+ }
+
+ /**
+ * This is the total net size for the column, including children for map
+ * columns. Includes overhead of metadata vectors.
*/
+ public int getTotalNetSize() {
+ return this.totalNetSize;
+ }
+
+ public int getValueCount() {
+ return valueCount;
+ }
- public final float estElementCountPerArray;
- public final boolean isVariableWidth;
+ public int getElementCount() {
+ return elementCount;
+ }
+
+ public float getEstElementCountPerArray() {
+ return estElementCountPerArray;
+ }
- public Map<String, ColumnSize> children = CaseInsensitiveMap.newHashMap();
+ public boolean isVariableWidth() {
+ return isVariableWidth;
+ }
public Map<String, ColumnSize> getChildren() {
return children;
}
+ public boolean isComplex() {
+ if (metadata.getType().getMinorType() == MinorType.MAP ||
+ metadata.getType().getMinorType() == MinorType.UNION ||
+ metadata.getType().getMinorType() == MinorType.LIST) {
+ return true;
+ }
+ return false;
+ }
+
+ public boolean isRepeatedList() {
+ if (metadata.getType().getMinorType() == MinorType.LIST &&
+ metadata.getDataMode() == DataMode.REPEATED) {
+ return true;
+ }
+ return false;
+ }
+
+ /**
+ * This is the average per entry width, used for vector allocation.
+ */
+ public int getEntryWidth() {
+ int width = 0;
+ if (isVariableWidth) {
+ width = getNetSizePerEntry() - OFFSET_VECTOR_WIDTH;
+
+ // Subtract out the bits (is-set) vector width
+ if (metadata.getDataMode() == DataMode.OPTIONAL) {
+ width -= BIT_VECTOR_WIDTH;
+ }
+ }
+
+ return (safeDivide(width, estElementCountPerArray));
+ }
+
public ColumnSize(ValueVector v, String prefix) {
this.prefix = prefix;
valueCount = v.getAccessor().getValueCount();
metadata = v.getField();
- isVariableWidth = v instanceof VariableWidthVector;
-
- // The amount of memory consumed by the payload: the actual
- // data stored in the vectors.
-
- if (v.getField().getDataMode() == DataMode.REPEATED) {
- elementCount = buildRepeated(v);
- estElementCountPerArray = valueCount == 0 ? 0 : elementCount * 1.0f / valueCount;
- } else {
- elementCount = 1;
- estElementCountPerArray = 1;
+ isVariableWidth = (v instanceof VariableWidthVector || v instanceof RepeatedVariableWidthVectorLike);
+ elementCount = valueCount;
+ estElementCountPerArray = 1;
+ totalNetSize = v.getPayloadByteCount(valueCount);
+
+ // Special case. For union and list vectors, it is very complex
+ // to figure out raw data size. Make it same as net size.
+ if (metadata.getType().getMinorType() == MinorType.UNION ||
+ (metadata.getType().getMinorType() == MinorType.LIST && v.getField().getDataMode() != DataMode.REPEATED)) {
+ totalDataSize = totalNetSize;
}
- switch (metadata.getType().getMinorType()) {
- case LIST:
- buildList(v);
- break;
- case MAP:
- case UNION:
- // No standard size for Union type
- dataSize = v.getPayloadByteCount(valueCount);
- break;
- default:
- dataSize = v.getPayloadByteCount(valueCount);
- try {
- stdSize = TypeHelper.getSize(metadata.getType()) * elementCount;
- } catch (Exception e) {
- // For unsupported types, just set stdSize to 0.
- stdSize = 0;
- }
+
+ switch(v.getField().getDataMode()) {
--- End diff ---
Once code gets this complex, it may be time to create subclasses for the three modes, plus subclasses for the categories of types (variable-width, map, etc.).
The column size would then be a composition of a type size class and a cardinality size class.
Maybe overkill for the current round of changes, but something to keep in mind if this code continues to grow in complexity.
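To make that concrete, here is a minimal, hypothetical sketch of such a composition. The names (TypeSizer, CardinalitySizer, ComposedColumnSize) are invented for illustration and are not part of this patch or of Drill; the constants assume 4-byte offset vectors, a 1-byte is-set vector, a repetition factor of 10, and 50 bytes as the std variable-width entry size, matching the constants referenced in the diff.

```java
// Hypothetical sketch only: one possible shape for composing a per-type
// sizer with a per-cardinality sizer instead of switching on DataMode.
public class ColumnSizeSketch {

  /** Per-type sizing: how many data bytes a single value occupies. */
  interface TypeSizer {
    int stdDataBytesPerValue();
  }

  static class FixedWidthSizer implements TypeSizer {
    private final int width;
    FixedWidthSizer(int width) { this.width = width; }
    @Override public int stdDataBytesPerValue() { return width; }
  }

  static class VariableWidthSizer implements TypeSizer {
    private static final int STD_VAR_WIDTH = 50;   // assumed std entry width
    @Override public int stdDataBytesPerValue() { return STD_VAR_WIDTH; }
  }

  /** Per-mode sizing: overhead vectors added by cardinality. */
  interface CardinalitySizer {
    int netBytesPerValue(int dataBytesPerValue);
  }

  static class RequiredSizer implements CardinalitySizer {
    @Override public int netBytesPerValue(int dataBytes) { return dataBytes; }
  }

  static class OptionalSizer implements CardinalitySizer {
    private static final int BIT_VECTOR_WIDTH = 1;  // assumed 1-byte is-set vector
    @Override public int netBytesPerValue(int dataBytes) {
      return dataBytes + BIT_VECTOR_WIDTH;
    }
  }

  static class RepeatedSizer implements CardinalitySizer {
    private static final int OFFSET_VECTOR_WIDTH = 4;   // assumed 4-byte offsets
    private static final int STD_REPETITION_FACTOR = 10;
    @Override public int netBytesPerValue(int dataBytes) {
      return dataBytes * STD_REPETITION_FACTOR + OFFSET_VECTOR_WIDTH;
    }
  }

  /** Column size becomes a composition of a type sizer and a cardinality sizer. */
  static class ComposedColumnSize {
    private final TypeSizer type;
    private final CardinalitySizer cardinality;

    ComposedColumnSize(TypeSizer type, CardinalitySizer cardinality) {
      this.type = type;
      this.cardinality = cardinality;
    }

    int stdNetSizePerEntry() {
      return cardinality.netBytesPerValue(type.stdDataBytesPerValue());
    }
  }

  public static void main(String[] args) {
    // e.g. a repeated variable-width column: 50 * 10 + 4 = 504 bytes per entry
    ComposedColumnSize repeatedVarchar =
        new ComposedColumnSize(new VariableWidthSizer(), new RepeatedSizer());
    System.out.println(repeatedVarchar.stdNetSizePerEntry());
  }
}
```

Something along these lines would let the switch on DataMode collapse into choosing a CardinalitySizer once, at construction time.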
---