ramitg254 commented on code in PR #6089:
URL: https://github.com/apache/hive/pull/6089#discussion_r2602628155
##########
standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/MetaStoreDirectSql.java:
##########
@@ -2174,86 +2270,132 @@ private List<ColumnStatisticsObj>
aggrStatsUseDB(String catName, String dbName,
|| IExtrapolatePartStatus.aggrTypes[colStatIndex] ==
IExtrapolatePartStatus.AggrType.Max) {
// if the aggregation type is min/max, we extrapolate from the
// left/right borders
- if (!decimal) {
- queryText = "select \"" + colStatName + "\",\"PART_NAME\" from
" + PART_COL_STATS
- + " inner join " + PARTITIONS + " on " + PART_COL_STATS +
".\"PART_ID\" = " + PARTITIONS + ".\"PART_ID\""
- + " inner join " + TBLS + " on " + PARTITIONS +
".\"TBL_ID\" = " + TBLS + ".\"TBL_ID\""
- + " inner join " + DBS + " on " + TBLS + ".\"DB_ID\" = " +
DBS + ".\"DB_ID\""
- + " where " + DBS + ".\"CTLG_NAME\" = ? and " + DBS +
".\"NAME\" = ? and " + TBLS + ".\"TBL_NAME\" = ? "
- + " and " + PART_COL_STATS + ".\"COLUMN_NAME\" = ? "
- + " and " + PARTITIONS + ".\"PART_NAME\" in (" +
makeParams(partNames.size()) + ")"
- + " and " + PART_COL_STATS + ".\"ENGINE\" = ? "
- + " order by \"" + colStatName + "\"";
- } else {
- queryText = "select \"" + colStatName + "\",\"PART_NAME\" from
" + PART_COL_STATS
- + " inner join " + PARTITIONS + " on " + PART_COL_STATS +
".\"PART_ID\" = " + PARTITIONS + ".\"PART_ID\""
- + " inner join " + TBLS + " on " + PARTITIONS +
".\"TBL_ID\" = " + TBLS + ".\"TBL_ID\""
- + " inner join " + DBS + " on " + TBLS + ".\"DB_ID\" = " +
DBS + ".\"DB_ID\""
- + " where " + DBS + ".\"CTLG_NAME\" = ? and " + DBS +
".\"NAME\" = ? and " + TBLS + ".\"TBL_NAME\" = ? "
- + " and " + PART_COL_STATS + ".\"COLUMN_NAME\" = ? "
- + " and " + PARTITIONS + ".\"PART_NAME\" in (" +
makeParams(partNames.size()) + ")"
- + " and " + PART_COL_STATS + ".\"ENGINE\" = ? "
- + " order by cast(\"" + colStatName + "\" as decimal)";
- }
- start = doTrace ? System.nanoTime() : 0;
- try (QueryWrapper query = new
QueryWrapper(pm.newQuery("javax.jdo.query.SQL", queryText))) {
- Object qResult = executeWithArray(query.getInnerQuery(),
- prepareParams(catName, dbName, tableName, partNames,
Arrays.asList(colName), engine), queryText);
- if (qResult == null) {
- return Collections.emptyList();
- }
- fqr = (ForwardQueryResult<?>) qResult;
- Object[] min = (Object[]) (fqr.get(0));
- Object[] max = (Object[]) (fqr.get(fqr.size() - 1));
- end = doTrace ? System.nanoTime() : 0;
- MetastoreDirectSqlUtils.timingTrace(doTrace, queryText, start,
end);
- if (min[0] == null || max[0] == null) {
- row[2 + colStatIndex] = null;
- } else {
- row[2 + colStatIndex] = extrapolateMethod
- .extrapolate(min, max, colStatIndex, indexMap);
- }
- }
- } else {
- // if the aggregation type is avg, we use the average on the
existing ones.
- queryText = "select "
- +
"avg((\"LONG_HIGH_VALUE\"-\"LONG_LOW_VALUE\")/cast(\"NUM_DISTINCTS\" as
decimal)),"
- +
"avg((\"DOUBLE_HIGH_VALUE\"-\"DOUBLE_LOW_VALUE\")/\"NUM_DISTINCTS\"),"
- + "avg((cast(\"BIG_DECIMAL_HIGH_VALUE\" as
decimal)-cast(\"BIG_DECIMAL_LOW_VALUE\" as decimal))/\"NUM_DISTINCTS\")"
- + " from " + PART_COL_STATS + ""
+ String orderByExpr = decimal ? "cast(\"" + colStatName + "\" as
decimal)" : "\"" + colStatName + "\"";
+
+ queryText = "select \"" + colStatName + "\",\"PART_NAME\" from "
+ PART_COL_STATS
+ " inner join " + PARTITIONS + " on " + PART_COL_STATS +
".\"PART_ID\" = " + PARTITIONS + ".\"PART_ID\""
+ " inner join " + TBLS + " on " + PARTITIONS + ".\"TBL_ID\"
= " + TBLS + ".\"TBL_ID\""
+ " inner join " + DBS + " on " + TBLS + ".\"DB_ID\" = " +
DBS + ".\"DB_ID\""
+ " where " + DBS + ".\"CTLG_NAME\" = ? and " + DBS +
".\"NAME\" = ? and " + TBLS + ".\"TBL_NAME\" = ? "
- + " and " + PART_COL_STATS + ".\"COLUMN_NAME\" = ? "
- + " and " + PARTITIONS + ".\"PART_NAME\" in (" +
makeParams(partNames.size()) + ")"
+ + " and " + PART_COL_STATS + ".\"COLUMN_NAME\" in (%1$s)"
+ + " and " + PARTITIONS + ".\"PART_NAME\" in (%2$s)"
+ " and " + PART_COL_STATS + ".\"ENGINE\" = ? "
- + " group by \"COLUMN_NAME\"";
- start = doTrace ? System.nanoTime() : 0;
- try(QueryWrapper query = new
QueryWrapper(pm.newQuery("javax.jdo.query.SQL", queryText))) {
- Object qResult = executeWithArray(query.getInnerQuery(),
- prepareParams(catName, dbName, tableName, partNames,
Arrays.asList(colName), engine), queryText);
- if (qResult == null) {
- return Collections.emptyList();
+ + " order by " + orderByExpr;
+
+ columnWisePartitionBatches =
+ columnWisePartitionBatcher(queryText, catName, dbName,
tableName, partNames, engine, doTrace);
+ try {
+ list = Batchable.runBatched(batchSize, Arrays.asList(colName),
columnWisePartitionBatches);
+ Object[] min = list.getFirst();
+ Object[] max = list.getLast();
+ for (int i = Math.min(batchSize - 1, list.size() - 1); i <
list.size(); i += batchSize) {
+ Object[] posMax = list.get(i);
+ if (new BigDecimal(max[0].toString()).compareTo(new
BigDecimal(posMax[0].toString())) < 0) {
+ max = posMax;
+ }
+ int j = i + 1;
+ if (j < list.size()) {
+ Object[] posMin = list.get(j);
+ if (new BigDecimal(min[0].toString()).compareTo(new
BigDecimal(posMin[0].toString())) > 0) {
+ min = posMin;
+ }
+ }
+ }
+ if (min[0] == null || max[0] == null) {
+ row[2 + colStatIndex] = null;
+ } else {
+ row[2 + colStatIndex] = extrapolateMethod.extrapolate(min,
max, colStatIndex, indexMap);
}
- fqr = (ForwardQueryResult<?>) qResult;
- Object[] avg = (Object[]) (fqr.get(0));
- // colStatIndex=12,13,14 respond to "AVG_LONG", "AVG_DOUBLE",
- // "AVG_DECIMAL"
- row[2 + colStatIndex] = avg[colStatIndex - 12];
- end = doTrace ? System.nanoTime() : 0;
- MetastoreDirectSqlUtils.timingTrace(doTrace, queryText, start,
end);
+ } finally {
+ columnWisePartitionBatches.closeAllQueries();
}
+ } else {
+ // colStatIndex=12,13,14 respond to "AVG_LONG", "AVG_DOUBLE",
+ // "AVG_DECIMAL"
+ row[2 + colStatIndex] = avg[colStatIndex - 12];
}
}
- colStats.add(prepareCSObjWithAdjustedNDV(row, 0,
useDensityFunctionForNDVEstimation, ndvTuner));
+ colStats.add(columnStatisticsObjWithAdjustedNDV
+ (Collections.singletonList(row),
useDensityFunctionForNDVEstimation, ndvTuner));
Deadline.checkTimeout();
}
}
return colStats;
}
}
+ private ColumnStatisticsObj columnStatisticsObjWithAdjustedNDV(
+ List<Object[]> columnBatchesOutput,
+ boolean useDensityFunctionForNDVEstimation, double ndvTuner)
+ throws MetaException {
+ if (columnBatchesOutput.isEmpty()) {
+ return null;
+ }
+ ColumnStatisticsData data = new ColumnStatisticsData();
+ Object[] row = columnBatchesOutput.getFirst();
+ String colName = (String) row[COLNAME];
+ String colType = (String) row[COLTYPE];
+ ColumnStatisticsObj cso = new ColumnStatisticsObj(colName, colType, data);
+ Object llow = row[LLOW];
+ Object lhigh = row[LHIGH];
+ Object dlow = row[DLOW];
+ Object dhigh = row[DHIGH];
+ Object declow = row[DECLOW];
+ Object dechigh = row[DECHIGH];
+ Object nulls = row[NULLS];
+ Object dist = row[DIST];
+ Object avglen = row[AVGLEN];
+ Object maxlen = row[MAXLEN];
+ Object trues = row[TRUES];
+ Object falses = row[FALSES];
+ Object avgLong = row[AVGLONG];
+ Object avgDouble = row[AVGDOUBLE];
+ Object avgDecimal = row[AVGDECIMAL];
+ Object sumDist = row[SUMDIST];
+ if (row.length == 18) {
+ StatObjectConverter.fillColumnStatisticsData(cso.getColType(), data,
llow, lhigh, dlow, dhigh, declow, dechigh,
+ nulls, dist, avglen, maxlen, trues, falses, avgLong, avgDouble,
avgDecimal, sumDist,
+ useDensityFunctionForNDVEstimation, ndvTuner);
+ return cso;
+ }
+ Object sumLong = row[AVGLONG];
+ Object countLong = row[AVGLONG + 1];
+ Object sumDouble = row[AVGDOUBLE + 1];
+ Object countDouble = row[AVGDOUBLE + 2];
+ Object sumDecimal = row[AVGDECIMAL + 2];
+ Object countDecimal = row[AVGDECIMAL + 3];
Review Comment:
I was keeping it according to
https://github.com/apache/hive/blob/master/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/IExtrapolatePartStatus.java#L28,
but everything is dealt with in terms of sum and count, so I have changed the indexes.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]