[ https://issues.apache.org/jira/browse/DRILL-6032?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16349601#comment-16349601 ]
ASF GitHub Bot commented on DRILL-6032: --------------------------------------- Github user ilooner commented on a diff in the pull request: https://github.com/apache/drill/pull/1101#discussion_r165532035 --- Diff: exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/aggregate/HashAggTemplate.java --- @@ -516,43 +501,48 @@ private void initializeSetup(RecordBatch newIncoming) throws SchemaChangeExcepti * @param incoming */ private void updateEstMaxBatchSize(RecordBatch incoming) { - if ( estMaxBatchSize > 0 ) { return; } // no handling of a schema (or varchar) change // Use the sizer to get the input row width and the length of the longest varchar column - RecordBatchSizer sizer = new RecordBatchSizer(incoming); - logger.trace("Incoming sizer: {}",sizer); - // An empty batch only has the schema, can not tell actual length of varchars - // else use the actual varchars length, each capped at 50 (to match the space allocation) - long estInputRowWidth = sizer.rowCount() == 0 ? sizer.stdRowWidth() : sizer.netRowWidthCap50(); + final RecordBatchSizer incomingColumnSizes = new RecordBatchSizer(incoming); + final Map<String, RecordBatchSizer.ColumnSize> columnSizeMap = incomingColumnSizes.getColumnSizeMap(); + keySizes = CaseInsensitiveMap.newHashMap(); - // Get approx max (varchar) column width to get better memory allocation - maxColumnWidth = Math.max(sizer.maxAvgColumnSize(), VARIABLE_MIN_WIDTH_VALUE_SIZE); - maxColumnWidth = Math.min(maxColumnWidth, VARIABLE_MAX_WIDTH_VALUE_SIZE); + logger.trace("Incoming sizer: {}",incomingColumnSizes); - // // Calculate the estimated max (internal) batch (i.e. 
Keys batch + Values batch) size // (which is used to decide when to spill) // Also calculate the values batch size (used as a reserve to overcome an OOM) - // - Iterator<VectorWrapper<?>> outgoingIter = outContainer.iterator(); - int fieldId = 0; - while (outgoingIter.hasNext()) { - ValueVector vv = outgoingIter.next().getValueVector(); - MaterializedField mr = vv.getField(); - int fieldSize = vv instanceof VariableWidthVector ? maxColumnWidth : - TypeHelper.getSize(mr.getType()); - estRowWidth += fieldSize; - estOutputRowWidth += fieldSize; - if ( fieldId < numGroupByOutFields ) { fieldId++; } - else { estValuesRowWidth += fieldSize; } + + for (int columnIndex = 0; columnIndex < numGroupByOutFields; columnIndex++) { + final VectorWrapper vectorWrapper = outContainer.getValueVector(columnIndex); + final String columnName = vectorWrapper.getField().getName(); + final int columnSize = columnSizeMap.get(columnName).estSize; + keySizes.put(columnName, columnSize); + estOutputRowWidth += columnSize; } + + long estValuesRowWidth = 0; + + for (int columnIndex = numGroupByOutFields; columnIndex < outContainer.getNumberOfColumns(); columnIndex++) { + VectorWrapper vectorWrapper = outContainer.getValueVector(columnIndex); + RecordBatchSizer.ColumnSize columnSize = new RecordBatchSizer.ColumnSize(vectorWrapper.getValueVector()); + estOutputRowWidth += columnSize.estSize; --- End diff -- We have to account for the unknown columns in the value batch estimate, but the columns in the output batch are well defined and do not contain these "extra" columns. So why do we have to augment the estimate for the output batch? Why is looking at the columns in the outputContainer insufficient? 
> Use RecordBatchSizer to estimate size of columns in HashAgg > ----------------------------------------------------------- > > Key: DRILL-6032 > URL: https://issues.apache.org/jira/browse/DRILL-6032 > Project: Apache Drill > Issue Type: Improvement > Reporter: Timothy Farkas > Assignee: Timothy Farkas > Priority: Major > Fix For: 1.13.0 > > > We need to use the RecordBatchSizer to estimate the size of columns in the > Partition batches created by HashAgg. -- This message was sent by Atlassian JIRA (v7.6.3#76005)