This is an automated email from the ASF dual-hosted git repository.
okumin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new 33e11ec6270 HIVE-29442: Avoid redundancy in
FilterStatsRule#evaluateComparator (#6300)
33e11ec6270 is described below
commit 33e11ec6270ad94370587a682b425cff7b66bdba
Author: Thomas Rebele <[email protected]>
AuthorDate: Mon Feb 9 12:10:45 2026 +0100
HIVE-29442: Avoid redundancy in FilterStatsRule#evaluateComparator (#6300)
---
.../stats/annotation/StatsRulesProcFactory.java | 327 ++++++++-------------
1 file changed, 129 insertions(+), 198 deletions(-)
diff --git
a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
index 19f83f39147..42d62e0a64e 100644
---
a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
+++
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
@@ -33,6 +33,7 @@
import java.util.Optional;
import java.util.Set;
import java.util.Stack;
+import java.util.function.Function;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
@@ -1070,210 +1071,140 @@ private long evaluateComparator(Statistics stats,
AnnotateStatsProcCtx aspCtx, E
if (cs != null && cs.getRange() != null &&
cs.getRange().maxValue != null && cs.getRange().minValue != null) {
- try {
- if (colTypeLowerCase.equals(serdeConstants.TINYINT_TYPE_NAME)) {
- byte value = Byte.parseByte(boundValue);
- byte maxValue = cs.getRange().maxValue.byteValue();
- byte minValue = cs.getRange().minValue.byteValue();
- if (upperBound) {
- if (maxValue < value || maxValue == value && closedBound) {
- return currNumRows;
- }
- if (minValue > value || minValue == value && !closedBound) {
- return 0;
- }
- if (aspCtx.isUniformWithinRange()) {
- // Assuming uniform distribution, we can use the range to
calculate
- // new estimate for the number of rows
- return Math.round(((double) (value - minValue) / (maxValue -
minValue)) * currNumRows);
- }
- } else {
- if (minValue > value || minValue == value && closedBound) {
- return currNumRows;
- }
- if (maxValue < value || maxValue == value && !closedBound) {
- return 0;
- }
- if (aspCtx.isUniformWithinRange()) {
- // Assuming uniform distribution, we can use the range to
calculate
- // new estimate for the number of rows
- return Math.round(((double) (maxValue - value) / (maxValue -
minValue)) * currNumRows);
- }
- }
- } else if
(colTypeLowerCase.equals(serdeConstants.SMALLINT_TYPE_NAME)) {
- short value = Short.parseShort(boundValue);
- short maxValue = cs.getRange().maxValue.shortValue();
- short minValue = cs.getRange().minValue.shortValue();
- if (upperBound) {
- if (maxValue < value || maxValue == value && closedBound) {
- return currNumRows;
- }
- if (minValue > value || minValue == value && !closedBound) {
- return 0;
- }
- if (aspCtx.isUniformWithinRange()) {
- // Assuming uniform distribution, we can use the range to
calculate
- // new estimate for the number of rows
- return Math.round(((double) (value - minValue) / (maxValue -
minValue)) * currNumRows);
- }
- } else {
- if (minValue > value || minValue == value && closedBound) {
- return currNumRows;
- }
- if (maxValue < value || maxValue == value && !closedBound) {
- return 0;
- }
- if (aspCtx.isUniformWithinRange()) {
- // Assuming uniform distribution, we can use the range to
calculate
- // new estimate for the number of rows
- return Math.round(((double) (maxValue - value) / (maxValue -
minValue)) * currNumRows);
- }
- }
- } else if (colTypeLowerCase.equals(serdeConstants.INT_TYPE_NAME) ||
- colTypeLowerCase.equals(serdeConstants.DATE_TYPE_NAME) ||
- colTypeLowerCase.equals(serdeConstants.TIMESTAMP_TYPE_NAME)) {
- long value;
- if (colTypeLowerCase.equals(serdeConstants.DATE_TYPE_NAME)) {
- DateWritable writableVal = new
DateWritable(java.sql.Date.valueOf(boundValue));
- value = writableVal.getDays();
- } else if
(colTypeLowerCase.equals(serdeConstants.TIMESTAMP_TYPE_NAME)) {
- TimestampWritableV2 timestampWritable = new
TimestampWritableV2(Timestamp.valueOf(boundValue));
- value = timestampWritable.getTimestamp().toEpochSecond();
- } else {
- value = Integer.parseInt(boundValue);
- }
- long maxValue = cs.getRange().maxValue.longValue();
- long minValue = cs.getRange().minValue.longValue();
- if (upperBound) {
- if (maxValue < value || maxValue == value && closedBound) {
- return currNumRows;
- }
- if (minValue > value || minValue == value && !closedBound) {
- return 0;
- }
- if (aspCtx.isUniformWithinRange()) {
- // Assuming uniform distribution, we can use the range to
calculate
- // new estimate for the number of rows
- return Math.round(((double) (value - minValue) / (maxValue -
minValue)) * currNumRows);
- }
- } else {
- if (minValue > value || minValue == value && closedBound) {
- return currNumRows;
- }
- if (maxValue < value || maxValue == value && !closedBound) {
- return 0;
- }
- if (aspCtx.isUniformWithinRange()) {
- // Assuming uniform distribution, we can use the range to
calculate
- // new estimate for the number of rows
- return Math.round(((double) (maxValue - value) / (maxValue -
minValue)) * currNumRows);
- }
- }
- } else if
(colTypeLowerCase.startsWith(serdeConstants.DECIMAL_TYPE_NAME) ||
- colTypeLowerCase.equals(serdeConstants.BIGINT_TYPE_NAME)) {
- BigDecimal value = new BigDecimal(boundValue);
- BigDecimal maxValue = new
BigDecimal(cs.getRange().maxValue.toString());
- BigDecimal minValue = new
BigDecimal(cs.getRange().minValue.toString());
- int minComparison = value.compareTo(minValue);
- int maxComparison = value.compareTo(maxValue);
- if (upperBound) {
- if (maxComparison > 0 || maxComparison == 0 && closedBound) {
- return currNumRows;
- }
- if (minComparison < 0 || minComparison == 0 && !closedBound) {
- return 0;
- }
- if (aspCtx.isUniformWithinRange()) {
- // Assuming uniform distribution, we can use the range to
calculate
- // new estimate for the number of rows
- return Math.round(
-
((value.subtract(minValue)).divide(maxValue.subtract(minValue), 10,
RoundingMode.UP))
- .multiply(BigDecimal.valueOf(currNumRows))
- .doubleValue());
- }
- } else {
- if (minComparison < 0 || minComparison == 0 && closedBound) {
- return currNumRows;
- }
- if (maxComparison > 0 || maxComparison == 0 && !closedBound) {
- return 0;
- }
- if (aspCtx.isUniformWithinRange()) {
- // Assuming uniform distribution, we can use the range to
calculate
- // new estimate for the number of rows
- return Math.round(
-
((maxValue.subtract(value)).divide(maxValue.subtract(minValue), 10,
RoundingMode.UP))
- .multiply(BigDecimal.valueOf(currNumRows))
- .doubleValue());
- }
- }
- } else if (colTypeLowerCase.equals(serdeConstants.FLOAT_TYPE_NAME)) {
- float value = Float.parseFloat(boundValue);
- float maxValue = cs.getRange().maxValue.floatValue();
- float minValue = cs.getRange().minValue.floatValue();
- if (upperBound) {
- if (maxValue < value || maxValue == value && closedBound) {
- return currNumRows;
- }
- if (minValue > value || minValue == value && !closedBound) {
- return 0;
- }
- if (aspCtx.isUniformWithinRange()) {
- // Assuming uniform distribution, we can use the range to
calculate
- // new estimate for the number of rows
- return Math.round(((double) (value - minValue) / (maxValue -
minValue)) * currNumRows);
- }
- } else {
- if (minValue > value || minValue == value && closedBound) {
- return currNumRows;
- }
- if (maxValue < value || maxValue == value && !closedBound) {
- return 0;
- }
- if (aspCtx.isUniformWithinRange()) {
- // Assuming uniform distribution, we can use the range to
calculate
- // new estimate for the number of rows
- return Math.round(((double) (maxValue - value) / (maxValue -
minValue)) * currNumRows);
- }
- }
- } else if (colTypeLowerCase.equals(serdeConstants.DOUBLE_TYPE_NAME))
{
- double value = Double.parseDouble(boundValue);
- double maxValue = cs.getRange().maxValue.doubleValue();
- double minValue = cs.getRange().minValue.doubleValue();
- if (upperBound) {
- if (maxValue < value || maxValue == value && closedBound) {
- return currNumRows;
- }
- if (minValue > value || minValue == value && !closedBound) {
- return 0;
- }
- if (aspCtx.isUniformWithinRange()) {
- // Assuming uniform distribution, we can use the range to
calculate
- // new estimate for the number of rows
- return Math.round(((value - minValue) / (maxValue - minValue))
* currNumRows);
- }
- } else {
- if (minValue > value || minValue == value && closedBound) {
- return currNumRows;
- }
- if (maxValue < value || maxValue == value && !closedBound) {
- return 0;
- }
- if (aspCtx.isUniformWithinRange()) {
- // Assuming uniform distribution, we can use the range to
calculate
- // new estimate for the number of rows
- return Math.round(((maxValue - value) / (maxValue - minValue))
* currNumRows);
- }
- }
- }
- } catch (NumberFormatException nfe) {
- return currNumRows / 3;
+ Long result =
+ evaluateComparatorWithRangeStats(cs, currNumRows,
colTypeLowerCase, boundValue, upperBound, closedBound,
+ aspCtx);
+ if (result != null) {
+ return result;
}
}
// default
return currNumRows / 3;
}
+ /**
+  * Estimates how many rows satisfy a one-sided range predicate from the min/max range
+  * statistics of a column.
+  * <p>
+  * With {@code upperBound == true} the predicate keeps values below the bound
+  * ({@code col < v}, or {@code col <= v} when {@code closedBound} is set); otherwise it keeps
+  * values above the bound ({@code col > v} / {@code col >= v}).
+  * </p>
+  *
+  * @param <T> numeric type in which the range limits and the predicate bound are compared
+  */
+ private static class EvaluateComparatorWithRange<T extends Number & Comparable<T>> {
+   /**
+    * Adjusts the number of rows assuming a uniform distribution.
+    * <p>
+    * If the values are uniformly distributed between min and max, and the predicate
+    * only accepts values between lower and upper, do a simple linear scaling.
+    * </p>
+    */
+   @FunctionalInterface
+   interface RescaleRows<T> {
+     double rescaleNumberOfRows(T lower, T upper, T min, T max, long numRows);
+   }
+
+   /** Converts a range limit (stored as a {@link Number}) to the comparison type. */
+   private final Function<Number, T> convert;
+   /** Parses the textual predicate bound into the comparison type. */
+   private final Function<String, T> parse;
+   /** Linear interpolation used when a uniform distribution is assumed. */
+   private final RescaleRows<T> rescaleRows;
+
+   EvaluateComparatorWithRange(Function<Number, T> convert, Function<String, T> parse,
+       RescaleRows<T> rescaleRows) {
+     this.convert = convert;
+     this.parse = parse;
+     this.rescaleRows = rescaleRows;
+   }
+
+   /**
+    * Computes the row estimate for the predicate described by the arguments.
+    * Any exception thrown by the {@code parse} function for {@code boundValue} is propagated.
+    *
+    * @param range       min/max statistics of the column; both limits are dereferenced
+    * @param boundValue  textual form of the predicate bound
+    * @param upperBound  {@code true} if the bound limits the values from above
+    * @param closedBound {@code true} if the bound itself satisfies the predicate
+    * @param currNumRows current row count estimate
+    * @param aspCtx      context deciding whether a uniform distribution may be assumed
+    * @return {@code currNumRows} if the whole range satisfies the predicate, {@code 0} if no part
+    *         of it does, a linearly scaled estimate if {@code aspCtx.isUniformWithinRange()},
+    *         and {@code null} if no estimate can be derived
+    */
+   Long evaluate(Range range, String boundValue, boolean upperBound, boolean closedBound,
+       long currNumRows, AnnotateStatsProcCtx aspCtx) {
+     T maxValue = convert.apply(range.maxValue);
+     T minValue = convert.apply(range.minValue);
+     T value = parse.apply(boundValue);
+
+     int maxComparison = maxValue.compareTo(value);
+     int minComparison = minValue.compareTo(value);
+     if (upperBound) {
+       // The bound lies above the whole range: every row qualifies.
+       if (maxComparison < 0 || maxComparison == 0 && closedBound) {
+         return currNumRows;
+       }
+       // The bound lies below the whole range: no row qualifies.
+       if (minComparison > 0 || minComparison == 0 && !closedBound) {
+         return 0L;
+       }
+       if (aspCtx.isUniformWithinRange()) {
+         // Assuming uniform distribution, we can use the range to calculate
+         // new estimate for the number of rows
+         return Math.round(
+             rescaleRows.rescaleNumberOfRows(minValue, value, minValue, maxValue, currNumRows));
+       }
+     } else {
+       // The bound lies below the whole range: every row qualifies.
+       if (minComparison > 0 || minComparison == 0 && closedBound) {
+         return currNumRows;
+       }
+       // The bound lies above the whole range: no row qualifies.
+       if (maxComparison < 0 || maxComparison == 0 && !closedBound) {
+         return 0L;
+       }
+       if (aspCtx.isUniformWithinRange()) {
+         // Assuming uniform distribution, we can use the range to calculate
+         // new estimate for the number of rows
+         return Math.round(
+             rescaleRows.rescaleNumberOfRows(value, maxValue, minValue, maxValue, currNumRows));
+       }
+     }
+     // The bound falls inside the range but no distribution assumption is allowed.
+     return null;
+   }
+ }
+
+ /**
+  * Estimates the number of rows selected by a one-sided comparison, based on the range
+  * statistics of the column, by dispatching on the column type to a typed
+  * {@link EvaluateComparatorWithRange} helper.
+  *
+  * @param cs          column statistics; {@code cs.getRange()} is passed to the helper
+  * @param currNumRows current row count estimate
+  * @param type        column type name; any type starting with the decimal type name is treated
+  *                    as decimal
+  * @param boundValue  textual form of the predicate bound
+  * @param upperBound  {@code true} if the bound limits the values from above
+  * @param closedBound {@code true} if the bound itself satisfies the predicate
+  * @param aspCtx      context deciding whether a uniform distribution may be assumed
+  * @return the estimate, or {@code null} if the type is not handled, the bound raises a
+  *         {@link NumberFormatException} while being parsed, or the helper yields no estimate
+  */
+ private Long evaluateComparatorWithRangeStats(ColStatistics cs, long currNumRows, String type,
+     String boundValue, boolean upperBound, boolean closedBound, AnnotateStatsProcCtx aspCtx) {
+   // All parameterized decimal types (e.g. with precision/scale suffix) share one code path.
+   final String typeKey =
+       type.startsWith(serdeConstants.DECIMAL_TYPE_NAME) ? serdeConstants.DECIMAL_TYPE_NAME : type;
+
+   try {
+     EvaluateComparatorWithRange<?> helper;
+     switch (typeKey) {
+     case serdeConstants.TINYINT_TYPE_NAME:
+       helper = new EvaluateComparatorWithRange<>(Number::byteValue, Byte::parseByte,
+           (lower, upper, min, max, numRows) -> ((double) (upper - lower) / (max - min)) * numRows);
+       break;
+     case serdeConstants.SMALLINT_TYPE_NAME:
+       helper = new EvaluateComparatorWithRange<>(Number::shortValue, Short::parseShort,
+           (lower, upper, min, max, numRows) -> ((double) (upper - lower) / (max - min)) * numRows);
+       break;
+     case serdeConstants.INT_TYPE_NAME, serdeConstants.DATE_TYPE_NAME,
+         serdeConstants.TIMESTAMP_TYPE_NAME:
+       // int, date (days) and timestamp (epoch seconds) are all compared as long values.
+       Function<String, Long> parse;
+       if (typeKey.equals(serdeConstants.DATE_TYPE_NAME)) {
+         // NOTE(review): java.sql.Date.valueOf reports malformed input with an
+         // IllegalArgumentException, which the catch below does not cover unless it happens to
+         // be a NumberFormatException - confirm whether that should also fall back to null.
+         parse = str -> {
+           DateWritable writableVal = new DateWritable(java.sql.Date.valueOf(str));
+           return Long.valueOf(writableVal.getDays());
+         };
+       } else if (typeKey.equals(serdeConstants.TIMESTAMP_TYPE_NAME)) {
+         parse = str -> {
+           // Parse the lambda argument rather than the captured boundValue.
+           TimestampWritableV2 timestampWritable = new TimestampWritableV2(Timestamp.valueOf(str));
+           return timestampWritable.getTimestamp().toEpochSecond();
+         };
+       } else {
+         parse = str -> (long) Integer.parseInt(str);
+       }
+       helper = new EvaluateComparatorWithRange<>(Number::longValue, parse,
+           (lower, upper, min, max, numRows) -> ((double) (upper - lower) / (max - min)) * numRows);
+       break;
+     case serdeConstants.DECIMAL_TYPE_NAME, serdeConstants.BIGINT_TYPE_NAME:
+       // Scale by the lambda's own numRows argument rather than the captured currNumRows.
+       helper = new EvaluateComparatorWithRange<>(num -> new BigDecimal(num.toString()),
+           BigDecimal::new,
+           (lower, upper, min, max, numRows) ->
+               ((upper.subtract(lower)).divide(max.subtract(min), 10, RoundingMode.UP))
+                   .multiply(BigDecimal.valueOf(numRows)).doubleValue());
+       break;
+     case serdeConstants.FLOAT_TYPE_NAME:
+       helper = new EvaluateComparatorWithRange<>(Number::floatValue, Float::parseFloat,
+           (lower, upper, min, max, numRows) -> ((double) (upper - lower) / (max - min)) * numRows);
+       break;
+     case serdeConstants.DOUBLE_TYPE_NAME:
+       helper = new EvaluateComparatorWithRange<>(Number::doubleValue, Double::parseDouble,
+           (lower, upper, min, max, numRows) -> ((upper - lower) / (max - min)) * numRows);
+       break;
+     default:
+       // No range-based estimate for this column type.
+       return null;
+     }
+
+     // May be null when the helper cannot derive an estimate.
+     return helper.evaluate(cs.getRange(), boundValue, upperBound, closedBound, currNumRows, aspCtx);
+   } catch (NumberFormatException nfe) {
+     // The bound could not be parsed as a number of the column type: no estimate.
+     return null;
+   }
+ }
+
private long evaluateComparatorWithHistogram(ColStatistics cs, long
currNumRows, String colTypeLowerCase,
String boundValue, boolean upperBound, boolean closedBound) {
final KllFloatsSketch kll =
KllFloatsSketch.heapify(Memory.wrap(cs.getHistogram()));