thomasrebele commented on code in PR #6477:
URL: https://github.com/apache/hive/pull/6477#discussion_r3249195340


##########
ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java:
##########
@@ -603,6 +605,151 @@ private Optional<Float> extractLiteral(SqlTypeName 
typeName, Object boundValueOb
     return Optional.of(value);
   }
 
+  private double computeSearchSelectivity(RexCall search) {
+    return new SearchSelectivityHelper<>(search).compute();
+  }
+
+  /**
+   * Similar to {@link SearchTransformer}, but computing the selectivity of 
the expression.
+   */
+  private final class SearchSelectivityHelper<C extends Comparable<C>> {
+    private final RexNode ref;
+    private final Sarg<C> sarg;
+    private final RelDataType operandType;
+
+    private SearchSelectivityHelper(RexCall search) {
+      ref = search.getOperands().get(0);
+      RexLiteral literal = (RexLiteral) search.operands.get(1);
+      sarg = Objects.requireNonNull(literal.getValueAs(Sarg.class), "Sarg");
+      operandType = literal.getType();
+    }
+
+    private RexNode makeLiteral(C value) {
+      return rexBuilder.makeLiteral(value, operandType, true, true);
+    }
+
+    private double compute() {
+      final List<Double> selectivityList = new ArrayList<>();
+      final List<RexNode> inLiterals = new ArrayList<>();
+
+      if (sarg.nullAs == RexUnknownAs.TRUE) {
+        selectivityList.add(
+            rexBuilder.makeCall(SqlStdOperatorTable.IS_NULL, 
ref).accept(FilterSelectivityEstimator.this));
+      }
+
+      RangeSets.forEach(sarg.rangeSet, new RangeSets.Consumer<C>() {
+        @Override
+        public void all() {
+          selectivityList.add(1.0);
+        }
+
+        @Override
+        public void singleton(C value) {
+          inLiterals.add(rexBuilder.makeLiteral(value, operandType, true, 
true));
+        }
+
+        @Override
+        public void atLeast(C lower) {
+          Optional<Float> lowerLiteral = extractLiteral(makeLiteral(lower));
+          if (lowerLiteral.isEmpty()) {
+            selectivityList.add(DEFAULT_COMPARISON_SELECTIVITY);
+          } else {
+            processRange(() -> DEFAULT_COMPARISON_SELECTIVITY,
+                Range.range(lowerLiteral.get(), BoundType.CLOSED, 
Float.POSITIVE_INFINITY, BoundType.CLOSED));
+          }
+        }
+
+        @Override
+        public void atMost(C upper) {
+          Optional<Float> upperLiteral = extractLiteral(makeLiteral(upper));
+          if (upperLiteral.isEmpty()) {
+            selectivityList.add(DEFAULT_COMPARISON_SELECTIVITY);
+          } else {
+            processRange(() -> DEFAULT_COMPARISON_SELECTIVITY,
+                Range.range(Float.NEGATIVE_INFINITY, BoundType.CLOSED, 
upperLiteral.get(), BoundType.CLOSED));
+          }
+        }
+
+        @Override
+        public void greaterThan(C lower) {
+          Optional<Float> lowerLiteral = extractLiteral(makeLiteral(lower));
+          if (lowerLiteral.isEmpty()) {
+            selectivityList.add(DEFAULT_COMPARISON_SELECTIVITY);
+          } else {
+            processRange(() -> DEFAULT_COMPARISON_SELECTIVITY,
+                Range.range(lowerLiteral.get(), BoundType.OPEN, 
Float.POSITIVE_INFINITY, BoundType.CLOSED));
+          }
+        }
+
+        @Override
+        public void lessThan(C upper) {
+          Optional<Float> upperLiteral = extractLiteral(makeLiteral(upper));
+          if (upperLiteral.isEmpty()) {
+            selectivityList.add(DEFAULT_COMPARISON_SELECTIVITY);
+          } else {
+            processRange(() -> DEFAULT_COMPARISON_SELECTIVITY,
+                Range.range(Float.NEGATIVE_INFINITY, BoundType.CLOSED, 
upperLiteral.get(), BoundType.OPEN));
+          }
+        }
+
+        @Override
+        public void closed(C lower, C upper) {
+          processRange(lower, BoundType.CLOSED, upper, BoundType.CLOSED);
+        }
+
+        @Override
+        public void closedOpen(C lower, C upper) {
+          processRange(lower, BoundType.CLOSED, upper, BoundType.OPEN);
+        }
+
+        @Override
+        public void openClosed(C lower, C upper) {
+          processRange(lower, BoundType.OPEN, upper, BoundType.CLOSED);
+        }
+
+        @Override
+        public void open(C lower, C upper) {
+          processRange(lower, BoundType.OPEN, upper, BoundType.OPEN);
+        }
+
+        private void processRange(C lower, BoundType lowerBoundType, C upper, 
BoundType upperBoundType) {
+          RexNode lowerRexLiteral = makeLiteral(lower);
+          RexNode upperRexLiteral = makeLiteral(upper);
+          Supplier<Double> defaultSelectivity =
+              () -> computeFunctionSelectivity(List.of(ref, lowerRexLiteral, 
upperRexLiteral));
+          Optional<Float> lowerLiteral = extractLiteral(lowerRexLiteral);
+          Optional<Float> upperLiteral = extractLiteral(upperRexLiteral);
+          if (lowerLiteral.isEmpty() || upperLiteral.isEmpty()) {
+            selectivityList.add(defaultSelectivity.get());
+          } else {
+            processRange(defaultSelectivity,
+                Range.range(lowerLiteral.get(), lowerBoundType, 
upperLiteral.get(), upperBoundType));
+          }
+        }
+
+        private void processRange(Supplier<Double> defaultSelectivity, 
Range<Float> boundaries) {
+          
selectivityList.add(computeRangePredicateSelectivity(defaultSelectivity, ref, 
boundaries));
+        }
+      });
+
+      switch (inLiterals.size()) {
+      case 0:
+        break;
+      case 1:
+        selectivityList.add(rexBuilder.makeCall(SqlStdOperatorTable.EQUALS, 
ref, inLiterals.get(0))
+            .accept(FilterSelectivityEstimator.this));
+        break;
+      default:
+        List<RexNode> operands = new ArrayList<>(inLiterals.size() + 1);
+        operands.add(ref);
+        operands.addAll(inLiterals);
+        selectivityList.add(rexBuilder.makeCall(HiveIn.INSTANCE, 
operands).accept(FilterSelectivityEstimator.this));
+      }
+
+      return selectivityList.size() == 1 ? selectivityList.get(0) : 
computeDisjunctionSelectivity(selectivityList);

Review Comment:
   As far as I understand `computeDisjunctionSelectivity`, it combines the 
selectivities by multiplying their complements. To use the information from the 
histogram, we would need to add the selectivities (but making sure we stay 
within the possible values [0,1]).
   I wonder how accurate estimating selectivity of individual values are 
compared to the ranges. This also depends on the type of histogram (I had 
created a ticket about the shortcoming of KLL) . I think we can just add the 
selectivities for individual values and ranges together, and revisit this in 
the future in case of need.



##########
ql/src/test/org/apache/hadoop/hive/ql/optimizer/calcite/stats/TestFilterSelectivityEstimator.java:
##########
@@ -457,6 +475,129 @@ public void 
testComputeRangePredicateSelectivityGreaterThanOrEqualWhenHigherThan
     Assert.assertEquals(0, estimator.estimateSelectivity(filter), DELTA);
   }
 
+  private RexNode simplify(RexNode e) {
+    return new RexSimplify(REX_BUILDER, RelOptPredicateList.EMPTY, 
RexUtil.EXECUTOR).simplify(e);
+  }
+
+  @Test
+  public void testComputeRangePredicateSelectivityOpenOpenWithSearch() {
+    
doReturn(Collections.singletonList(stats)).when(tableMock).getColStat(Collections.singletonList(0));
+    RexNode filter = REX_BUILDER.makeCall(SqlStdOperatorTable.AND,
+        REX_BUILDER.makeCall(SqlStdOperatorTable.GREATER_THAN, inputRef0, 
int1),
+        REX_BUILDER.makeCall(SqlStdOperatorTable.LESS_THAN, inputRef0, int4));
+    filter = simplify(filter);
+    Assert.assertEquals(SqlKind.SEARCH, filter.getKind());
+    FilterSelectivityEstimator estimator = new 
FilterSelectivityEstimator(scan, mq);
+    Assert.assertEquals(0.6153846153846154, 
estimator.estimateSelectivity(filter), DELTA);
+  }
+
+  @Test
+  public void testComputeRangePredicateSelectivityClosedOpenWithSearch() {
+    
doReturn(Collections.singletonList(stats)).when(tableMock).getColStat(Collections.singletonList(0));
+    RexNode filter = REX_BUILDER.makeCall(SqlStdOperatorTable.AND,
+        REX_BUILDER.makeCall(SqlStdOperatorTable.GREATER_THAN_OR_EQUAL, 
inputRef0, int2),
+        REX_BUILDER.makeCall(SqlStdOperatorTable.LESS_THAN, inputRef0, int4));
+    filter = simplify(filter);
+    Assert.assertEquals(SqlKind.SEARCH, filter.getKind());
+    FilterSelectivityEstimator estimator = new 
FilterSelectivityEstimator(scan, mq);
+    Assert.assertEquals(0.6153846153846154, 
estimator.estimateSelectivity(filter), DELTA);
+  }
+
+  @Test
+  public void testComputeRangePredicateSelectivityOpenClosedWithSearch() {
+    
doReturn(Collections.singletonList(stats)).when(tableMock).getColStat(Collections.singletonList(0));
+    RexNode filter = REX_BUILDER.makeCall(SqlStdOperatorTable.AND,
+        REX_BUILDER.makeCall(SqlStdOperatorTable.GREATER_THAN, inputRef0, 
int1),
+        REX_BUILDER.makeCall(SqlStdOperatorTable.LESS_THAN_OR_EQUAL, 
inputRef0, int3));
+    filter = simplify(filter);
+    Assert.assertEquals(SqlKind.SEARCH, filter.getKind());
+    FilterSelectivityEstimator estimator = new 
FilterSelectivityEstimator(scan, mq);
+    Assert.assertEquals(0.6153846153846154, 
estimator.estimateSelectivity(filter), DELTA);
+  }
+
+  @Test
+  public void testComputeRangePredicateSelectivityClosedClosedWithSearch() {
+    
doReturn(Collections.singletonList(stats)).when(tableMock).getColStat(Collections.singletonList(0));
+    RexNode filter = REX_BUILDER.makeCall(SqlStdOperatorTable.AND,
+        REX_BUILDER.makeCall(SqlStdOperatorTable.GREATER_THAN_OR_EQUAL, 
inputRef0, int2),
+        REX_BUILDER.makeCall(SqlStdOperatorTable.LESS_THAN_OR_EQUAL, 
inputRef0, int3));
+    filter = simplify(filter);
+    Assert.assertEquals(SqlKind.SEARCH, filter.getKind());
+    FilterSelectivityEstimator estimator = new 
FilterSelectivityEstimator(scan, mq);
+    Assert.assertEquals(0.6153846153846154, 
estimator.estimateSelectivity(filter), DELTA);
+  }
+
+  @Test
+  public void testComputeSeveralRangesPredicateSelectivityWithSearch() {
+    
doReturn(Collections.singletonList(stats)).when(tableMock).getColStat(Collections.singletonList(0));
+    RexNode filter = REX_BUILDER.makeCall(SqlStdOperatorTable.OR,
+        REX_BUILDER.makeCall(SqlStdOperatorTable.AND,
+            REX_BUILDER.makeCall(SqlStdOperatorTable.GREATER_THAN, inputRef0, 
int1),
+            REX_BUILDER.makeCall(SqlStdOperatorTable.LESS_THAN, inputRef0, 
int4)),
+        REX_BUILDER.makeCall(SqlStdOperatorTable.AND,
+            REX_BUILDER.makeCall(SqlStdOperatorTable.GREATER_THAN, inputRef0, 
int6),
+            REX_BUILDER.makeCall(SqlStdOperatorTable.LESS_THAN, inputRef0, 
int10)));
+    filter = simplify(filter);
+    Assert.assertEquals(SqlKind.SEARCH, filter.getKind());
+    FilterSelectivityEstimator estimator = new 
FilterSelectivityEstimator(scan, mq);
+    Assert.assertEquals(0.6153846153846154, 
estimator.estimateSelectivity(filter), DELTA);

Review Comment:
   The condition is `1<x<4 or 6<x<10`, which is fulfilled by the values `2, 2, 
2, 2, 2, 2, 2, 3, 7`. So 9 out of 13 values fulfill the condition, which is a 
selectivity of 0.6923076923076923.



##########
ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java:
##########
@@ -159,14 +163,18 @@ public Double visitCall(RexCall call) {
     case GREATER_THAN_OR_EQUAL:
     case LESS_THAN:
     case GREATER_THAN: {
-      selectivity = computeRangePredicateSelectivity(call, call.getKind());
+      selectivity = computeComparisonPredicateSelectivity(call, 
call.getKind());
       break;
     }
 
     case BETWEEN:
       selectivity = computeBetweenPredicateSelectivity(call);
       break;
 
+    case SEARCH:
+      selectivity = computeSearchSelectivity(call);
+      break;
+

Review Comment:
   I would avoid moving code (the case) in general, to avoid downstream merge 
conflicts.



##########
ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java:
##########
@@ -226,7 +234,7 @@ public Double visitCall(RexCall call) {
    * @return true if the expression is a removable cast, false otherwise
    */
   private boolean isRemovableCast(RexNode exp, HiveTableScan tableScan) {
-    if(SqlKind.CAST != exp.getKind()) {
+    if (SqlKind.CAST != exp.getKind()) {

Review Comment:
   I think I need to check my IDE setup, the if without the space was 
introduced by me, sorry. Normally formatting changes should not be mixed with 
other changes. I guess we can keep the fix, as it's related.



##########
ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java:
##########
@@ -603,6 +605,151 @@ private Optional<Float> extractLiteral(SqlTypeName 
typeName, Object boundValueOb
     return Optional.of(value);
   }
 
+  private double computeSearchSelectivity(RexCall search) {
+    return new SearchSelectivityHelper<>(search).compute();
+  }
+
+  /**
+   * Similar to {@link SearchTransformer}, but computing the selectivity of 
the expression.
+   */
+  private final class SearchSelectivityHelper<C extends Comparable<C>> {
+    private final RexNode ref;
+    private final Sarg<C> sarg;
+    private final RelDataType operandType;
+
+    private SearchSelectivityHelper(RexCall search) {
+      ref = search.getOperands().get(0);
+      RexLiteral literal = (RexLiteral) search.operands.get(1);
+      sarg = Objects.requireNonNull(literal.getValueAs(Sarg.class), "Sarg");
+      operandType = literal.getType();
+    }
+
+    private RexNode makeLiteral(C value) {
+      return rexBuilder.makeLiteral(value, operandType, true, true);
+    }
+
+    private double compute() {
+      final List<Double> selectivityList = new ArrayList<>();
+      final List<RexNode> inLiterals = new ArrayList<>();
+
+      if (sarg.nullAs == RexUnknownAs.TRUE) {
+        selectivityList.add(
+            rexBuilder.makeCall(SqlStdOperatorTable.IS_NULL, 
ref).accept(FilterSelectivityEstimator.this));
+      }
+
+      RangeSets.forEach(sarg.rangeSet, new RangeSets.Consumer<C>() {

Review Comment:
   Maybe the code could be simplified with `for (Range<C> range : 
sarg.rangeSet.asRanges()) { ... }`? It might be possible to treat a range 
as-is, without differentiating all the different kinds of ranges.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to