zabetak commented on code in PR #6293:
URL: https://github.com/apache/hive/pull/6293#discussion_r2878452187
##########
ql/src/test/org/apache/hadoop/hive/ql/optimizer/calcite/stats/TestFilterSelectivityEstimator.java:
##########
@@ -605,7 +611,36 @@ public void
testComputeRangePredicateSelectivityNotBetweenWithNULLS() {
}
@Test
- public void testRangePredicateCastInteger() {
+ public void testRangePredicateCastIntegerValuesInsideTypeRange() {
+ // use VALUES, whose values fit into the range of every tested integer type
+ // we're only interested in whether a lossless cast leaves the estimated selectivity unchanged
+ useFieldWithValues("f_tinyint", VALUES, KLL);
+ checkSelectivity(3 / 13.f, ge(cast("f_tinyint", TINYINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_tinyint", SMALLINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_tinyint", INTEGER), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_tinyint", BIGINT), int5));
+
+ useFieldWithValues("f_smallint", VALUES, KLL);
+ checkSelectivity(3 / 13.f, ge(cast("f_smallint", TINYINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_smallint", SMALLINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_smallint", INTEGER), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_smallint", BIGINT), int5));
+
+ useFieldWithValues("f_integer", VALUES, KLL);
+ checkSelectivity(3 / 13.f, ge(cast("f_integer", TINYINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_integer", SMALLINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_integer", INTEGER), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_integer", BIGINT), int5));
+
+ useFieldWithValues("f_bigint", VALUES, KLL);
+ checkSelectivity(3 / 13.f, ge(cast("f_bigint", TINYINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_bigint", SMALLINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_bigint", INTEGER), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_bigint", BIGINT), int5));
Review Comment:
```java
List<String> fields = Arrays.asList("f_tinyint", "f_smallint", "f_integer",
"f_bigint");
List<SqlTypeName> types = Arrays.asList(TINYINT, SMALLINT, INTEGER, BIGINT);
for (String f : fields) {
useFieldWithValues(f, VALUES, KLL);
for (SqlTypeName t : types) {
checkSelectivity(3 / 13.f, ge(cast(f, t), int5));
}
}
```
Refactoring suggestion for reducing repetition and making the test matrix
more explicit.
##########
ql/src/test/org/apache/hadoop/hive/ql/optimizer/calcite/stats/TestFilterSelectivityEstimator.java:
##########
@@ -511,6 +605,428 @@ public void
testComputeRangePredicateSelectivityNotBetweenWithNULLS() {
doReturn(Collections.singletonList(stats)).when(tableMock).getColStat(Collections.singletonList(0));
RexNode filter = REX_BUILDER.makeCall(HiveBetween.INSTANCE, boolTrue,
inputRef0, int1, int3);
FilterSelectivityEstimator estimator = new
FilterSelectivityEstimator(scan, mq);
- Assert.assertEquals(0.55, estimator.estimateSelectivity(filter), DELTA);
+ // only the values 4, 5, 6, 7 fulfill the condition NOT BETWEEN 1 AND 3
+ // (the NULL values do not fulfill the condition)
+ Assert.assertEquals(0.2, estimator.estimateSelectivity(filter), DELTA);
+ }
+
+ @Test
+ public void testRangePredicateCastIntegerValuesInsideTypeRange() {
+ // use VALUES, whose values fit into the range of every tested integer type
+ // we're only interested in whether a lossless cast leaves the estimated selectivity unchanged
+ useFieldWithValues("f_tinyint", VALUES, KLL);
+ checkSelectivity(3 / 13.f, ge(cast("f_tinyint", TINYINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_tinyint", SMALLINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_tinyint", INTEGER), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_tinyint", BIGINT), int5));
+
+ useFieldWithValues("f_smallint", VALUES, KLL);
+ checkSelectivity(3 / 13.f, ge(cast("f_smallint", TINYINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_smallint", SMALLINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_smallint", INTEGER), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_smallint", BIGINT), int5));
+
+ useFieldWithValues("f_integer", VALUES, KLL);
+ checkSelectivity(3 / 13.f, ge(cast("f_integer", TINYINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_integer", SMALLINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_integer", INTEGER), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_integer", BIGINT), int5));
+
+ useFieldWithValues("f_bigint", VALUES, KLL);
+ checkSelectivity(3 / 13.f, ge(cast("f_bigint", TINYINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_bigint", SMALLINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_bigint", INTEGER), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_bigint", BIGINT), int5));
+ }
+
+ @Test
+ public void testRangePredicateCastIntegerValuesOutsideTypeRange() {
+ // use VALUES2, even if the tested types cannot represent its values
+ // we're only interested in whether the cast to a smaller integer type
results in the default selectivity
+ useFieldWithValues("f_tinyint", VALUES2, KLL2);
+ checkSelectivity(16 / 28.f, ge(cast("f_tinyint", TINYINT), int5));
+ checkSelectivity(18 / 28.f, ge(cast("f_tinyint", SMALLINT), int5));
+ checkSelectivity(20 / 28.f, ge(cast("f_tinyint", INTEGER), int5));
+ checkSelectivity(20 / 28.f, ge(cast("f_tinyint", BIGINT), int5));
+
+ useFieldWithValues("f_smallint", VALUES2, KLL2);
+ checkSelectivity(1 / 3.f, ge(cast("f_smallint", TINYINT), int5));
+ checkSelectivity(18 / 28.f, ge(cast("f_smallint", SMALLINT), int5));
+ checkSelectivity(20 / 28.f, ge(cast("f_smallint", INTEGER), int5));
+ checkSelectivity(20 / 28.f, ge(cast("f_smallint", BIGINT), int5));
+
+ useFieldWithValues("f_integer", VALUES2, KLL2);
+ checkSelectivity(1 / 3.f, ge(cast("f_integer", TINYINT), int5));
+ checkSelectivity(1 / 3.f, ge(cast("f_integer", SMALLINT), int5));
+ checkSelectivity(20 / 28.f, ge(cast("f_integer", INTEGER), int5));
+ checkSelectivity(20 / 28.f, ge(cast("f_integer", BIGINT), int5));
+
+ useFieldWithValues("f_bigint", VALUES2, KLL2);
+ checkSelectivity(1 / 3.f, ge(cast("f_bigint", TINYINT), int5));
+ checkSelectivity(1 / 3.f, ge(cast("f_bigint", SMALLINT), int5));
+ checkSelectivity(1 / 3.f, ge(cast("f_bigint", INTEGER), int5));
+ checkSelectivity(20 / 28.f, ge(cast("f_bigint", BIGINT), int5));
+ }
+
+ @Test
+ public void testRangePredicateTypeMatrix() {
+ // checks many possible combinations of types
+ List<RelDataTypeField> fields = tableType.getFieldList();
+ for (var srcField : fields) {
+ if (isTemporal(srcField.getType())) {
+ continue;
+ }
+
+ useFieldWithValues(srcField.getName(), VALUES, KLL);
+
+ for (var tgt : fields) {
+ try {
+ if (isTemporal(tgt.getType())) {
+ continue;
+ }
+
+ RexNode expr = cast(srcField.getName(), tgt.getType());
+ checkBetweenSelectivity(3, VALUES.length, VALUES.length, expr, 5, 7);
+ } catch (AssertionError e) {
+ throw new AssertionError("Error when casting from " +
srcField.getType() + " to " + tgt.getType(), e);
+ }
Review Comment:
No need to wrap an assertion into another assertion. Removing the try-catch
block makes the code more readable and the stacktrace easier to follow.
##########
ql/src/test/org/apache/hadoop/hive/ql/optimizer/calcite/stats/TestFilterSelectivityEstimator.java:
##########
@@ -511,6 +605,428 @@ public void
testComputeRangePredicateSelectivityNotBetweenWithNULLS() {
doReturn(Collections.singletonList(stats)).when(tableMock).getColStat(Collections.singletonList(0));
RexNode filter = REX_BUILDER.makeCall(HiveBetween.INSTANCE, boolTrue,
inputRef0, int1, int3);
FilterSelectivityEstimator estimator = new
FilterSelectivityEstimator(scan, mq);
- Assert.assertEquals(0.55, estimator.estimateSelectivity(filter), DELTA);
+ // only the values 4, 5, 6, 7 fulfill the condition NOT BETWEEN 1 AND 3
+ // (the NULL values do not fulfill the condition)
+ Assert.assertEquals(0.2, estimator.estimateSelectivity(filter), DELTA);
+ }
+
+ @Test
+ public void testRangePredicateCastIntegerValuesInsideTypeRange() {
+ // use VALUES, whose values fit into the range of every tested integer type
+ // we're only interested in whether a lossless cast leaves the estimated selectivity unchanged
+ useFieldWithValues("f_tinyint", VALUES, KLL);
+ checkSelectivity(3 / 13.f, ge(cast("f_tinyint", TINYINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_tinyint", SMALLINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_tinyint", INTEGER), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_tinyint", BIGINT), int5));
+
+ useFieldWithValues("f_smallint", VALUES, KLL);
+ checkSelectivity(3 / 13.f, ge(cast("f_smallint", TINYINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_smallint", SMALLINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_smallint", INTEGER), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_smallint", BIGINT), int5));
+
+ useFieldWithValues("f_integer", VALUES, KLL);
+ checkSelectivity(3 / 13.f, ge(cast("f_integer", TINYINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_integer", SMALLINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_integer", INTEGER), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_integer", BIGINT), int5));
+
+ useFieldWithValues("f_bigint", VALUES, KLL);
+ checkSelectivity(3 / 13.f, ge(cast("f_bigint", TINYINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_bigint", SMALLINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_bigint", INTEGER), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_bigint", BIGINT), int5));
+ }
+
+ @Test
+ public void testRangePredicateCastIntegerValuesOutsideTypeRange() {
+ // use VALUES2, even if the tested types cannot represent its values
+ // we're only interested in whether the cast to a smaller integer type
results in the default selectivity
+ useFieldWithValues("f_tinyint", VALUES2, KLL2);
+ checkSelectivity(16 / 28.f, ge(cast("f_tinyint", TINYINT), int5));
+ checkSelectivity(18 / 28.f, ge(cast("f_tinyint", SMALLINT), int5));
+ checkSelectivity(20 / 28.f, ge(cast("f_tinyint", INTEGER), int5));
+ checkSelectivity(20 / 28.f, ge(cast("f_tinyint", BIGINT), int5));
+
+ useFieldWithValues("f_smallint", VALUES2, KLL2);
+ checkSelectivity(1 / 3.f, ge(cast("f_smallint", TINYINT), int5));
+ checkSelectivity(18 / 28.f, ge(cast("f_smallint", SMALLINT), int5));
+ checkSelectivity(20 / 28.f, ge(cast("f_smallint", INTEGER), int5));
+ checkSelectivity(20 / 28.f, ge(cast("f_smallint", BIGINT), int5));
+
+ useFieldWithValues("f_integer", VALUES2, KLL2);
+ checkSelectivity(1 / 3.f, ge(cast("f_integer", TINYINT), int5));
+ checkSelectivity(1 / 3.f, ge(cast("f_integer", SMALLINT), int5));
+ checkSelectivity(20 / 28.f, ge(cast("f_integer", INTEGER), int5));
+ checkSelectivity(20 / 28.f, ge(cast("f_integer", BIGINT), int5));
+
+ useFieldWithValues("f_bigint", VALUES2, KLL2);
+ checkSelectivity(1 / 3.f, ge(cast("f_bigint", TINYINT), int5));
+ checkSelectivity(1 / 3.f, ge(cast("f_bigint", SMALLINT), int5));
+ checkSelectivity(1 / 3.f, ge(cast("f_bigint", INTEGER), int5));
+ checkSelectivity(20 / 28.f, ge(cast("f_bigint", BIGINT), int5));
+ }
+
+ @Test
+ public void testRangePredicateTypeMatrix() {
+ // checks many possible combinations of types
+ List<RelDataTypeField> fields = tableType.getFieldList();
+ for (var srcField : fields) {
+ if (isTemporal(srcField.getType())) {
+ continue;
+ }
+
+ useFieldWithValues(srcField.getName(), VALUES, KLL);
+
+ for (var tgt : fields) {
+ try {
+ if (isTemporal(tgt.getType())) {
+ continue;
+ }
+
+ RexNode expr = cast(srcField.getName(), tgt.getType());
+ checkBetweenSelectivity(3, VALUES.length, VALUES.length, expr, 5, 7);
+ } catch (AssertionError e) {
+ throw new AssertionError("Error when casting from " +
srcField.getType() + " to " + tgt.getType(), e);
+ }
+ }
+ }
+ }
+
+ private boolean isTemporal(RelDataType type) {
+ return type.getSqlTypeName() == TIMESTAMP || type.getSqlTypeName() ==
SqlTypeName.DATE;
+ }
Review Comment:
You can drop the method and use `SqlTypeUtil#isDatetime` directly.
##########
ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java:
##########
@@ -184,91 +189,383 @@ public Double visitCall(RexCall call) {
return selectivity;
}
+ /**
+ * Return whether the expression is a removable cast based on stats and type
bounds.
+ *
+ * <p>
+ * In Hive, if a value cannot be represented by the cast, the result of the
cast is NULL,
+ * and therefore cannot fulfill the predicate. So the possible range of the
values
+ * is limited by the range of possible values of the type.
+ * </p>
+ *
+ * @param exp the expression to check
+ * @param tableScan the table that provides the statistics
+ * @return true if the expression is a removable cast, false otherwise
+ */
+ private boolean isRemovableCast(RexNode exp, HiveTableScan tableScan) {
+ if(SqlKind.CAST != exp.getKind()) {
+ return false;
+ }
+ RexCall cast = (RexCall) exp;
+ RexNode op0 = cast.getOperands().getFirst();
+ if (!(op0 instanceof RexInputRef)) {
+ return false;
+ }
+ int index = ((RexInputRef) op0).getIndex();
+ final List<ColStatistics> colStats =
tableScan.getColStat(Collections.singletonList(index));
+ if (colStats.isEmpty()) {
+ return false;
+ }
Review Comment:
The column stats are only used when we cast between INT types and not always
either. Should we move this bail out condition within `isRemovableIntegerCast`
method to make it more explicit where they are used?
##########
ql/src/test/org/apache/hadoop/hive/ql/optimizer/calcite/stats/TestFilterSelectivityEstimator.java:
##########
@@ -511,6 +605,428 @@ public void
testComputeRangePredicateSelectivityNotBetweenWithNULLS() {
doReturn(Collections.singletonList(stats)).when(tableMock).getColStat(Collections.singletonList(0));
RexNode filter = REX_BUILDER.makeCall(HiveBetween.INSTANCE, boolTrue,
inputRef0, int1, int3);
FilterSelectivityEstimator estimator = new
FilterSelectivityEstimator(scan, mq);
- Assert.assertEquals(0.55, estimator.estimateSelectivity(filter), DELTA);
+ // only the values 4, 5, 6, 7 fulfill the condition NOT BETWEEN 1 AND 3
+ // (the NULL values do not fulfill the condition)
+ Assert.assertEquals(0.2, estimator.estimateSelectivity(filter), DELTA);
+ }
+
+ @Test
+ public void testRangePredicateCastIntegerValuesInsideTypeRange() {
+ // use VALUES, whose values fit into the range of every tested integer type
+ // we're only interested in whether a lossless cast leaves the estimated selectivity unchanged
+ useFieldWithValues("f_tinyint", VALUES, KLL);
+ checkSelectivity(3 / 13.f, ge(cast("f_tinyint", TINYINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_tinyint", SMALLINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_tinyint", INTEGER), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_tinyint", BIGINT), int5));
+
+ useFieldWithValues("f_smallint", VALUES, KLL);
+ checkSelectivity(3 / 13.f, ge(cast("f_smallint", TINYINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_smallint", SMALLINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_smallint", INTEGER), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_smallint", BIGINT), int5));
+
+ useFieldWithValues("f_integer", VALUES, KLL);
+ checkSelectivity(3 / 13.f, ge(cast("f_integer", TINYINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_integer", SMALLINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_integer", INTEGER), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_integer", BIGINT), int5));
+
+ useFieldWithValues("f_bigint", VALUES, KLL);
+ checkSelectivity(3 / 13.f, ge(cast("f_bigint", TINYINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_bigint", SMALLINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_bigint", INTEGER), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_bigint", BIGINT), int5));
+ }
+
+ @Test
+ public void testRangePredicateCastIntegerValuesOutsideTypeRange() {
+ // use VALUES2, even if the tested types cannot represent its values
+ // we're only interested in whether the cast to a smaller integer type
results in the default selectivity
+ useFieldWithValues("f_tinyint", VALUES2, KLL2);
+ checkSelectivity(16 / 28.f, ge(cast("f_tinyint", TINYINT), int5));
+ checkSelectivity(18 / 28.f, ge(cast("f_tinyint", SMALLINT), int5));
+ checkSelectivity(20 / 28.f, ge(cast("f_tinyint", INTEGER), int5));
+ checkSelectivity(20 / 28.f, ge(cast("f_tinyint", BIGINT), int5));
+
+ useFieldWithValues("f_smallint", VALUES2, KLL2);
+ checkSelectivity(1 / 3.f, ge(cast("f_smallint", TINYINT), int5));
+ checkSelectivity(18 / 28.f, ge(cast("f_smallint", SMALLINT), int5));
+ checkSelectivity(20 / 28.f, ge(cast("f_smallint", INTEGER), int5));
+ checkSelectivity(20 / 28.f, ge(cast("f_smallint", BIGINT), int5));
+
+ useFieldWithValues("f_integer", VALUES2, KLL2);
+ checkSelectivity(1 / 3.f, ge(cast("f_integer", TINYINT), int5));
+ checkSelectivity(1 / 3.f, ge(cast("f_integer", SMALLINT), int5));
+ checkSelectivity(20 / 28.f, ge(cast("f_integer", INTEGER), int5));
+ checkSelectivity(20 / 28.f, ge(cast("f_integer", BIGINT), int5));
+
+ useFieldWithValues("f_bigint", VALUES2, KLL2);
+ checkSelectivity(1 / 3.f, ge(cast("f_bigint", TINYINT), int5));
+ checkSelectivity(1 / 3.f, ge(cast("f_bigint", SMALLINT), int5));
+ checkSelectivity(1 / 3.f, ge(cast("f_bigint", INTEGER), int5));
+ checkSelectivity(20 / 28.f, ge(cast("f_bigint", BIGINT), int5));
+ }
+
+ @Test
+ public void testRangePredicateTypeMatrix() {
+ // checks many possible combinations of types
+ List<RelDataTypeField> fields = tableType.getFieldList();
+ for (var srcField : fields) {
+ if (isTemporal(srcField.getType())) {
+ continue;
+ }
+
+ useFieldWithValues(srcField.getName(), VALUES, KLL);
+
+ for (var tgt : fields) {
+ try {
+ if (isTemporal(tgt.getType())) {
+ continue;
+ }
+
+ RexNode expr = cast(srcField.getName(), tgt.getType());
+ checkBetweenSelectivity(3, VALUES.length, VALUES.length, expr, 5, 7);
+ } catch (AssertionError e) {
+ throw new AssertionError("Error when casting from " +
srcField.getType() + " to " + tgt.getType(), e);
+ }
+ }
+ }
+ }
+
+ private boolean isTemporal(RelDataType type) {
+ return type.getSqlTypeName() == TIMESTAMP || type.getSqlTypeName() ==
SqlTypeName.DATE;
+ }
+
+ @Test
+ public void testRangePredicateWithCast() {
+ useFieldWithValues("f_numeric", VALUES, KLL);
+ checkSelectivity(3 / 13.f, ge(cast("f_numeric", TINYINT), int5));
+ checkSelectivity(10 / 13.f, lt(cast("f_numeric", TINYINT), int5));
+ checkSelectivity(2 / 13.f, gt(cast("f_numeric", TINYINT), int5));
+ checkSelectivity(11 / 13.f, le(cast("f_numeric", TINYINT), int5));
+
+ checkSelectivity(12 / 13f, ge(cast("f_numeric", TINYINT), int2));
+ checkSelectivity(1 / 13f, lt(cast("f_numeric", TINYINT), int2));
+ checkSelectivity(5 / 13f, gt(cast("f_numeric", TINYINT), int2));
+ checkSelectivity(8 / 13f, le(cast("f_numeric", TINYINT), int2));
+
+ // check some types
+ checkSelectivity(3 / 13.f, ge(cast("f_numeric", INTEGER), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_numeric", SMALLINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_numeric", BIGINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_numeric", SqlTypeName.FLOAT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_numeric", SqlTypeName.DOUBLE),
int5));
Review Comment:
Can these tests become part of
`testRangePredicateCastIntegerValuesInsideTypeRange`?
##########
ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java:
##########
@@ -184,91 +189,383 @@ public Double visitCall(RexCall call) {
return selectivity;
}
+ /**
+ * Return whether the expression is a removable cast based on stats and type
bounds.
+ *
+ * <p>
+ * In Hive, if a value cannot be represented by the cast, the result of the
cast is NULL,
+ * and therefore cannot fulfill the predicate. So the possible range of the
values
+ * is limited by the range of possible values of the type.
+ * </p>
+ *
+ * @param exp the expression to check
+ * @param tableScan the table that provides the statistics
+ * @return true if the expression is a removable cast, false otherwise
+ */
+ private boolean isRemovableCast(RexNode exp, HiveTableScan tableScan) {
+ if(SqlKind.CAST != exp.getKind()) {
+ return false;
+ }
+ RexCall cast = (RexCall) exp;
+ RexNode op0 = cast.getOperands().getFirst();
+ if (!(op0 instanceof RexInputRef)) {
+ return false;
+ }
+ int index = ((RexInputRef) op0).getIndex();
+ final List<ColStatistics> colStats =
tableScan.getColStat(Collections.singletonList(index));
+ if (colStats.isEmpty()) {
+ return false;
+ }
+
+ SqlTypeName sourceType = op0.getType().getSqlTypeName();
+ SqlTypeName targetType = cast.getType().getSqlTypeName();
+
+ switch (sourceType) {
+ case TINYINT, SMALLINT, INTEGER, BIGINT:
+ switch (targetType) {// additional checks are needed
+ case TINYINT, SMALLINT, INTEGER, BIGINT:
+ return isRemovableIntegerCast(cast, op0, colStats);
+ case FLOAT, DOUBLE, DECIMAL:
+ return true;
+ default:
+ return false;
+ }
+ case FLOAT, DOUBLE, DECIMAL:
+ switch (targetType) {
+ // these CASTs do not show a modulo behavior, so it's ok to remove such
a cast
+ case TINYINT, SMALLINT, INTEGER, BIGINT, FLOAT, DOUBLE, DECIMAL:
+ return true;
+ default:
+ return false;
+ }
+ case TIMESTAMP, DATE:
+ switch (targetType) {
+ case TIMESTAMP, DATE:
+ return true;
+ default:
+ return false;
+ }
+ // unknown type, do not remove the cast
+ default:
+ return false;
+ }
+ }
+
+ private static boolean isRemovableIntegerCast(RexCall cast, RexNode op0,
List<ColStatistics> colStats) {
+ // If the source type is completely within the target type, the cast is
lossless
+ Range<Float> targetRange = getRangeOfType(cast.getType(),
BoundType.CLOSED, BoundType.CLOSED);
+ Range<Float> sourceRange = getRangeOfType(op0.getType(), BoundType.CLOSED,
BoundType.CLOSED);
+ if (targetRange.encloses(sourceRange)) {
+ return true;
+ }
+
+ // Check that the possible values of the input column are all within the
type range of the cast
+ // otherwise the CAST introduces some modulo-like behavior
+ ColStatistics colStat = colStats.getFirst();
+ ColStatistics.Range colRange = colStat.getRange();
+ if (colRange == null || colRange.minValue == null || colRange.maxValue ==
null) {
+ return false;
+ }
+
+ // are all values of the input column accepted by the cast?
+ SqlTypeName targetType = cast.getType().getSqlTypeName();
+ double min = ((Number) targetType.getLimit(false,
SqlTypeName.Limit.OVERFLOW, false, -1, -1)).doubleValue();
+ double max = ((Number) targetType.getLimit(true,
SqlTypeName.Limit.OVERFLOW, false, -1, -1)).doubleValue();
+ return min < colRange.minValue.doubleValue() &&
colRange.maxValue.doubleValue() < max;
+ }
+
+ /**
+ * Get the range of values that are rounded to valid values of a type.
+ *
+ * @param type the type
+ * @param lowerBound the lower bound type of the result
+ * @param upperBound the upper bound type of the result
+ * @return the range of the type
+ */
+ private static Range<Float> getRangeOfType(RelDataType type, BoundType
lowerBound, BoundType upperBound) {
+ switch (type.getSqlTypeName()) {
+ // in case of integer types,
+ case TINYINT:
+ return Range.closed(-128.99998f, 127.99999f);
+ case SMALLINT:
+ return Range.closed(-32768.996f, 32767.998f);
+ case INTEGER:
+ return Range.closed(-2.1474836E9f, 2.1474836E9f);
+ case BIGINT, DATE, TIMESTAMP:
+ return Range.closed(-9.223372E18f, 9.223372E18f);
+ case DECIMAL:
+ return getRangeOfDecimalType(type, lowerBound, upperBound);
+ case FLOAT, DOUBLE:
+ return Range.closed(-Float.MAX_VALUE, Float.MAX_VALUE);
+ default:
+ throw new IllegalStateException("Unsupported type: " + type);
+ }
+ }
+
+ private static Range<Float> getRangeOfDecimalType(RelDataType type,
BoundType lowerBound, BoundType upperBound) {
+ // values outside the representable range are cast to NULL, so adapt the
boundaries
+ int digits = type.getPrecision() - type.getScale();
+ // the cast does some rounding, i.e., CAST(99.9499 AS DECIMAL(3,1)) = 99.9
+ // but CAST(99.95 AS DECIMAL(3,1)) = NULL
+ float adjust = (float) (5 * Math.pow(10, -(type.getScale() + 1)));
+ // the range of values supported by the type is interval
[-typeRangeExtent, typeRangeExtent] (both inclusive)
+ // e.g., the typeRangeExt is 99.94999 for DECIMAL(3,1)
+ float typeRangeExtent = Math.nextDown((float) (Math.pow(10, digits) -
adjust));
+
+ // the resulting value of +- adjust would be rounded up, so in some cases
we need to use Math.nextDown
+ boolean lowerInclusive = BoundType.CLOSED.equals(lowerBound);
+ boolean upperInclusive = BoundType.CLOSED.equals(upperBound);
+ float lowerUniverse = lowerInclusive ? -typeRangeExtent :
Math.nextDown(-typeRangeExtent);
+ float upperUniverse = upperInclusive ? typeRangeExtent :
Math.nextUp(typeRangeExtent);
Review Comment:
To elaborate a bit more on the question about the `BoundType` in
`getRangeOfType` method I was wondering if we can avoid this adjustment and
assume that the bounds for the `DECIMAL` data type are always closed. The
`adjustRangeToType` method already performs some adjustments based on the
`BoundType` of the `predicateRange` so not sure if the type bounds must also be
adjusted.
##########
ql/src/test/org/apache/hadoop/hive/ql/optimizer/calcite/stats/TestFilterSelectivityEstimator.java:
##########
@@ -511,6 +605,428 @@ public void
testComputeRangePredicateSelectivityNotBetweenWithNULLS() {
doReturn(Collections.singletonList(stats)).when(tableMock).getColStat(Collections.singletonList(0));
RexNode filter = REX_BUILDER.makeCall(HiveBetween.INSTANCE, boolTrue,
inputRef0, int1, int3);
FilterSelectivityEstimator estimator = new
FilterSelectivityEstimator(scan, mq);
- Assert.assertEquals(0.55, estimator.estimateSelectivity(filter), DELTA);
+ // only the values 4, 5, 6, 7 fulfill the condition NOT BETWEEN 1 AND 3
+ // (the NULL values do not fulfill the condition)
+ Assert.assertEquals(0.2, estimator.estimateSelectivity(filter), DELTA);
+ }
+
+ @Test
+ public void testRangePredicateCastIntegerValuesInsideTypeRange() {
+ // use VALUES, whose values fit into the range of every tested integer type
+ // we're only interested in whether a lossless cast leaves the estimated selectivity unchanged
+ useFieldWithValues("f_tinyint", VALUES, KLL);
+ checkSelectivity(3 / 13.f, ge(cast("f_tinyint", TINYINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_tinyint", SMALLINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_tinyint", INTEGER), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_tinyint", BIGINT), int5));
+
+ useFieldWithValues("f_smallint", VALUES, KLL);
+ checkSelectivity(3 / 13.f, ge(cast("f_smallint", TINYINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_smallint", SMALLINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_smallint", INTEGER), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_smallint", BIGINT), int5));
+
+ useFieldWithValues("f_integer", VALUES, KLL);
+ checkSelectivity(3 / 13.f, ge(cast("f_integer", TINYINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_integer", SMALLINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_integer", INTEGER), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_integer", BIGINT), int5));
+
+ useFieldWithValues("f_bigint", VALUES, KLL);
+ checkSelectivity(3 / 13.f, ge(cast("f_bigint", TINYINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_bigint", SMALLINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_bigint", INTEGER), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_bigint", BIGINT), int5));
+ }
+
+ @Test
+ public void testRangePredicateCastIntegerValuesOutsideTypeRange() {
+ // use VALUES2, even if the tested types cannot represent its values
+ // we're only interested in whether the cast to a smaller integer type
results in the default selectivity
+ useFieldWithValues("f_tinyint", VALUES2, KLL2);
+ checkSelectivity(16 / 28.f, ge(cast("f_tinyint", TINYINT), int5));
+ checkSelectivity(18 / 28.f, ge(cast("f_tinyint", SMALLINT), int5));
+ checkSelectivity(20 / 28.f, ge(cast("f_tinyint", INTEGER), int5));
+ checkSelectivity(20 / 28.f, ge(cast("f_tinyint", BIGINT), int5));
+
+ useFieldWithValues("f_smallint", VALUES2, KLL2);
+ checkSelectivity(1 / 3.f, ge(cast("f_smallint", TINYINT), int5));
+ checkSelectivity(18 / 28.f, ge(cast("f_smallint", SMALLINT), int5));
+ checkSelectivity(20 / 28.f, ge(cast("f_smallint", INTEGER), int5));
+ checkSelectivity(20 / 28.f, ge(cast("f_smallint", BIGINT), int5));
+
+ useFieldWithValues("f_integer", VALUES2, KLL2);
+ checkSelectivity(1 / 3.f, ge(cast("f_integer", TINYINT), int5));
+ checkSelectivity(1 / 3.f, ge(cast("f_integer", SMALLINT), int5));
+ checkSelectivity(20 / 28.f, ge(cast("f_integer", INTEGER), int5));
+ checkSelectivity(20 / 28.f, ge(cast("f_integer", BIGINT), int5));
+
+ useFieldWithValues("f_bigint", VALUES2, KLL2);
+ checkSelectivity(1 / 3.f, ge(cast("f_bigint", TINYINT), int5));
+ checkSelectivity(1 / 3.f, ge(cast("f_bigint", SMALLINT), int5));
+ checkSelectivity(1 / 3.f, ge(cast("f_bigint", INTEGER), int5));
+ checkSelectivity(20 / 28.f, ge(cast("f_bigint", BIGINT), int5));
+ }
+
+ @Test
+ public void testRangePredicateTypeMatrix() {
+ // checks many possible combinations of types
+ List<RelDataTypeField> fields = tableType.getFieldList();
+ for (var srcField : fields) {
+ if (isTemporal(srcField.getType())) {
+ continue;
+ }
+
+ useFieldWithValues(srcField.getName(), VALUES, KLL);
+
+ for (var tgt : fields) {
+ try {
+ if (isTemporal(tgt.getType())) {
+ continue;
+ }
+
+ RexNode expr = cast(srcField.getName(), tgt.getType());
+ checkBetweenSelectivity(3, VALUES.length, VALUES.length, expr, 5, 7);
+ } catch (AssertionError e) {
+ throw new AssertionError("Error when casting from " +
srcField.getType() + " to " + tgt.getType(), e);
+ }
+ }
+ }
+ }
+
+ private boolean isTemporal(RelDataType type) {
+ return type.getSqlTypeName() == TIMESTAMP || type.getSqlTypeName() ==
SqlTypeName.DATE;
+ }
+
+ @Test
+ public void testRangePredicateWithCast() {
+ useFieldWithValues("f_numeric", VALUES, KLL);
+ checkSelectivity(3 / 13.f, ge(cast("f_numeric", TINYINT), int5));
+ checkSelectivity(10 / 13.f, lt(cast("f_numeric", TINYINT), int5));
+ checkSelectivity(2 / 13.f, gt(cast("f_numeric", TINYINT), int5));
+ checkSelectivity(11 / 13.f, le(cast("f_numeric", TINYINT), int5));
+
+ checkSelectivity(12 / 13f, ge(cast("f_numeric", TINYINT), int2));
+ checkSelectivity(1 / 13f, lt(cast("f_numeric", TINYINT), int2));
+ checkSelectivity(5 / 13f, gt(cast("f_numeric", TINYINT), int2));
+ checkSelectivity(8 / 13f, le(cast("f_numeric", TINYINT), int2));
+
+ // check some types
+ checkSelectivity(3 / 13.f, ge(cast("f_numeric", INTEGER), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_numeric", SMALLINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_numeric", BIGINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_numeric", SqlTypeName.FLOAT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_numeric", SqlTypeName.DOUBLE),
int5));
+ }
+
+ @Test
+ public void testRangePredicateWithCast2() {
+ useFieldWithValues("f_numeric", VALUES2, KLL2);
+ RelDataType decimal3s1 = decimalType(3, 1);
+ checkSelectivity(4 / 28.f, ge(cast("f_numeric", decimal3s1),
literalFloat(1)));
+
+ // values from -99.94999 to 99.94999 (both inclusive)
+ checkSelectivity(7 / 28.f, lt(cast("f_numeric", decimal3s1),
literalFloat(100)));
+ checkSelectivity(7 / 28.f, le(cast("f_numeric", decimal3s1),
literalFloat(100)));
+ checkSelectivity(0 / 28.f, gt(cast("f_numeric", decimal3s1),
literalFloat(100)));
+ checkSelectivity(0 / 28.f, ge(cast("f_numeric", decimal3s1),
literalFloat(100)));
+
+ RelDataType decimal4s1 = decimalType(4, 1);
+ checkSelectivity(10 / 28.f, lt(cast("f_numeric", decimal4s1),
literalFloat(100)));
+ checkSelectivity(20 / 28.f, le(cast("f_numeric", decimal4s1),
literalFloat(100)));
+ checkSelectivity(3 / 28.f, gt(cast("f_numeric", decimal4s1),
literalFloat(100)));
+ checkSelectivity(13 / 28.f, ge(cast("f_numeric", decimal4s1),
literalFloat(100)));
+
+ RelDataType decimal2s1 = decimalType(2, 1);
+ checkSelectivity(2 / 28.f, lt(cast("f_numeric", decimal2s1),
literalFloat(100)));
+ checkSelectivity(2 / 28.f, le(cast("f_numeric", decimal2s1),
literalFloat(100)));
+ checkSelectivity(0 / 28.f, gt(cast("f_numeric", decimal2s1),
literalFloat(100)));
+ checkSelectivity(0 / 28.f, ge(cast("f_numeric", decimal2s1),
literalFloat(100)));
+
+ // expected: 100_000f
+ RelDataType decimal7s1 = decimalType(7, 1);
+ checkSelectivity(1 / 28.f, gt(cast("f_numeric", decimal7s1),
literalFloat(10000)));
+
+ // expected: 10_000f, 100_000f, because CAST(1_000_000 AS DECIMAL(7,1)) =
NULL, and similar for even larger values
+ checkSelectivity(2 / 28.f, ge(cast("f_numeric", decimal7s1),
literalFloat(9999)));
+ checkSelectivity(2 / 28.f, ge(cast("f_numeric", decimal7s1),
literalFloat(10000)));
+
+ // expected: 100_000f
+ checkSelectivity(1 / 28.f, gt(cast("f_numeric", decimal7s1),
literalFloat(10000)));
+ checkSelectivity(1 / 28.f, gt(cast("f_numeric", decimal7s1),
literalFloat(10001)));
+
+ // expected 1f, 10f, 99.94998f, 99.94999f
+ checkSelectivity(4 / 28.f, ge(cast("f_numeric", decimal3s1),
literalFloat(1)));
+ checkSelectivity(3 / 28.f, gt(cast("f_numeric", decimal3s1),
literalFloat(1)));
+ // expected -99.94999f, -99.94998f, 0f, 1f
+ checkSelectivity(4 / 28.f, le(cast("f_numeric", decimal3s1),
literalFloat(1)));
+ checkSelectivity(3 / 28.f, lt(cast("f_numeric", decimal3s1),
literalFloat(1)));
+ }
+
+ private void checkTimeFieldOnMidnightTimestamps(RexNode field) {
+ // note: use only values from VALUES_TIME that specify a date without
hh:mm:ss!
+ checkSelectivity(7 / 7.f, ge(field, literalTimestamp("2020-11-01")));
+ checkSelectivity(5 / 7.f, ge(field, literalTimestamp("2020-11-03")));
+ checkSelectivity(1 / 7.f, ge(field, literalTimestamp("2020-11-07")));
+
+ checkSelectivity(6 / 7.f, gt(field, literalTimestamp("2020-11-01")));
+ checkSelectivity(4 / 7.f, gt(field, literalTimestamp("2020-11-03")));
+ checkSelectivity(0 / 7.f, gt(field, literalTimestamp("2020-11-07")));
+
+ checkSelectivity(1 / 7.f, le(field, literalTimestamp("2020-11-01")));
+ checkSelectivity(3 / 7.f, le(field, literalTimestamp("2020-11-03")));
+ checkSelectivity(7 / 7.f, le(field, literalTimestamp("2020-11-07")));
+
+ checkSelectivity(0 / 7.f, lt(field, literalTimestamp("2020-11-01")));
+ checkSelectivity(2 / 7.f, lt(field, literalTimestamp("2020-11-03")));
+ checkSelectivity(6 / 7.f, lt(field, literalTimestamp("2020-11-07")));
+ }
+
+ private void checkTimeFieldOnIntraDayTimestamps(RexNode field) {
+ checkSelectivity(3 / 7.f, ge(field,
literalTimestamp("2020-11-05T11:23:45Z")));
+ checkSelectivity(2 / 7.f, gt(field,
literalTimestamp("2020-11-05T11:23:45Z")));
+ checkSelectivity(5 / 7.f, le(field,
literalTimestamp("2020-11-05T11:23:45Z")));
+ checkSelectivity(4 / 7.f, lt(field,
literalTimestamp("2020-11-05T11:23:45Z")));
+ }
+
+ @Test
+ public void testRangePredicateOnTimestamp() {
+ useFieldWithValues("f_timestamp", VALUES_TIME, KLL_TIME);
+ checkTimeFieldOnMidnightTimestamps(currentInputRef);
+ checkTimeFieldOnIntraDayTimestamps(currentInputRef);
+ }
+
+ @Test
+ public void testRangePredicateOnTimestampWithCast() {
+ useFieldWithValues("f_timestamp", VALUES_TIME, KLL_TIME);
+ RexNode expr1 = cast("f_timestamp", SqlTypeName.DATE);
+ checkTimeFieldOnMidnightTimestamps(expr1);
+ checkTimeFieldOnIntraDayTimestamps(expr1);
+
+ RexNode expr2 = cast("f_timestamp", SqlTypeName.TIMESTAMP);
+ checkTimeFieldOnMidnightTimestamps(expr2);
+ checkTimeFieldOnIntraDayTimestamps(expr2);
+ }
+
+ @Test
+ public void testRangePredicateOnDate() {
+ useFieldWithValues("f_date", VALUES_TIME, KLL_TIME);
+ checkTimeFieldOnMidnightTimestamps(currentInputRef);
+
+ // it does not make sense to compare with "2020-11-05T11:23:45Z",
+ // as that value would not be stored as-is in a date column, but as
"2020-11-05" instead
+ }
+
+ @Test
+ public void testRangePredicateOnDateWithCast() {
+ useFieldWithValues("f_date", VALUES_TIME, KLL_TIME);
+ checkTimeFieldOnMidnightTimestamps(cast("f_date", SqlTypeName.DATE));
+ checkTimeFieldOnMidnightTimestamps(cast("f_date", SqlTypeName.TIMESTAMP));
+
+ // it does not make sense to compare with "2020-11-05T11:23:45Z",
+ // as that value would not be stored as-is in a date column, but as
"2020-11-05" instead
+ }
+
+ @Test
+ public void testBetweenWithCastToTinyIntCheckRounding() {
+ useFieldWithValues("f_numeric", VALUES3, KLL3);
+ float total = VALUES3.length;
+ float universe = 10; // the number of values that "survive" the cast
+ RexNode cast = cast("f_numeric", TINYINT);
+ // check rounding of positive numbers
+ checkBetweenSelectivity(3, universe, total, cast, 0, 10);
+ checkBetweenSelectivity(4, universe, total, cast, 0, 11);
+ checkBetweenSelectivity(4, universe, total, cast, 10, 20);
+ checkBetweenSelectivity(1, universe, total, cast, 11, 20);
+
+ // check rounding of negative numbers
+ checkBetweenSelectivity(4, universe, total, cast, -20, -10);
+ checkBetweenSelectivity(1, universe, total, cast, -20, -11);
+ checkBetweenSelectivity(3, universe, total, cast, -10, 0);
+ checkBetweenSelectivity(4, universe, total, cast, -11, 0);
+ }
+
+ @Test
+ public void testBetweenWithCastToTinyInt() {
+ useFieldWithValues("f_numeric", VALUES3, KLL3);
+ float total = VALUES3.length;
+ float universe = 10; // the number of values that "survive" the cast
+ RexNode cast = cast("f_numeric", TINYINT);
+ checkBetweenSelectivity(5, universe, total, cast, 0, 1e20f);
+ checkBetweenSelectivity(5, universe, total, cast, -1e20f, 0);
+ checkBetweenSelectivity(0, universe, total, cast, 100f, 0f);
+ }
+
+ @Test
+ public void testBetweenWithCastToSmallInt() {
+ useFieldWithValues("f_numeric", VALUES3, KLL3);
+ float total = VALUES3.length;
+ float universe = 14; // the number of values that "survive" the cast
+ RexNode cast = cast("f_numeric", SMALLINT);
+ checkBetweenSelectivity(7, universe, total, cast, 0, 1e20f);
+ checkBetweenSelectivity(7, universe, total, cast, -1e20f, 0);
+ checkBetweenSelectivity(0, universe, total, cast, 100f, 0f);
+ }
+
+ @Test
+ public void testBetweenWithCastToInteger() {
+ useFieldWithValues("f_numeric", VALUES3, KLL3);
+ float total = VALUES3.length;
+ float universe = 18; // the number of values that "survive" the cast
+ RexNode cast = cast("f_numeric", INTEGER);
+ checkBetweenSelectivity(9, universe, total, cast, 0, 1e20f);
+ checkBetweenSelectivity(9, universe, total, cast, -1e20f, 0);
+ checkBetweenSelectivity(0, universe, total, cast, 100f, 0f);
+ }
+
+ @Test
+ public void testBetweenWithCastToBigInt() {
Review Comment:
There are some inconsistencies when it comes to testing `BETWEEN` and range
predicates that make the tests harder to follow and may lead to gaps in the
test matrix.
For BETWEEN, there is a separate test method for each CAST combination
between:
* source type: numeric/decimal
* target type: TINYINT, SMALLINT, INT, BIGINT, DEC(2,1), DEC(3,1), DEC(4,1),
DEC(7,1).
For range predicates, the respective combinations are covered by
`testRangePredicateWithCast` and `testRangePredicateWithCast2`, but they are
not fully aligned with the BETWEEN coverage.
Using a consistent grouping/naming for BETWEEN and range predicates with
CAST would make it easier to see what is tested and what is not.
##########
ql/src/test/org/apache/hadoop/hive/ql/optimizer/calcite/stats/TestFilterSelectivityEstimator.java:
##########
@@ -511,6 +605,428 @@ public void
testComputeRangePredicateSelectivityNotBetweenWithNULLS() {
doReturn(Collections.singletonList(stats)).when(tableMock).getColStat(Collections.singletonList(0));
RexNode filter = REX_BUILDER.makeCall(HiveBetween.INSTANCE, boolTrue,
inputRef0, int1, int3);
FilterSelectivityEstimator estimator = new
FilterSelectivityEstimator(scan, mq);
- Assert.assertEquals(0.55, estimator.estimateSelectivity(filter), DELTA);
+ // only the values 4, 5, 6, 7 fulfill the condition NOT BETWEEN 1 AND 3
+ // (the NULL values do not fulfill the condition)
+ Assert.assertEquals(0.2, estimator.estimateSelectivity(filter), DELTA);
+ }
+
+ @Test
+ public void testRangePredicateCastIntegerValuesInsideTypeRange() {
+ // use VALUES2, even if the tested types cannot represent its values
+ // we're only interested in whether the cast to a smaller integer type
results in the default selectivity
+ useFieldWithValues("f_tinyint", VALUES, KLL);
+ checkSelectivity(3 / 13.f, ge(cast("f_tinyint", TINYINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_tinyint", SMALLINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_tinyint", INTEGER), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_tinyint", BIGINT), int5));
+
+ useFieldWithValues("f_smallint", VALUES, KLL);
+ checkSelectivity(3 / 13.f, ge(cast("f_smallint", TINYINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_smallint", SMALLINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_smallint", INTEGER), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_smallint", BIGINT), int5));
+
+ useFieldWithValues("f_integer", VALUES, KLL);
+ checkSelectivity(3 / 13.f, ge(cast("f_integer", TINYINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_integer", SMALLINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_integer", INTEGER), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_integer", BIGINT), int5));
+
+ useFieldWithValues("f_bigint", VALUES, KLL);
+ checkSelectivity(3 / 13.f, ge(cast("f_bigint", TINYINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_bigint", SMALLINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_bigint", INTEGER), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_bigint", BIGINT), int5));
+ }
+
+ @Test
+ public void testRangePredicateCastIntegerValuesOutsideTypeRange() {
+ // use VALUES2, even if the tested types cannot represent its values
+ // we're only interested in whether the cast to a smaller integer type
results in the default selectivity
+ useFieldWithValues("f_tinyint", VALUES2, KLL2);
+ checkSelectivity(16 / 28.f, ge(cast("f_tinyint", TINYINT), int5));
+ checkSelectivity(18 / 28.f, ge(cast("f_tinyint", SMALLINT), int5));
+ checkSelectivity(20 / 28.f, ge(cast("f_tinyint", INTEGER), int5));
+ checkSelectivity(20 / 28.f, ge(cast("f_tinyint", BIGINT), int5));
+
+ useFieldWithValues("f_smallint", VALUES2, KLL2);
+ checkSelectivity(1 / 3.f, ge(cast("f_smallint", TINYINT), int5));
+ checkSelectivity(18 / 28.f, ge(cast("f_smallint", SMALLINT), int5));
+ checkSelectivity(20 / 28.f, ge(cast("f_smallint", INTEGER), int5));
+ checkSelectivity(20 / 28.f, ge(cast("f_smallint", BIGINT), int5));
+
+ useFieldWithValues("f_integer", VALUES2, KLL2);
+ checkSelectivity(1 / 3.f, ge(cast("f_integer", TINYINT), int5));
+ checkSelectivity(1 / 3.f, ge(cast("f_integer", SMALLINT), int5));
+ checkSelectivity(20 / 28.f, ge(cast("f_integer", INTEGER), int5));
+ checkSelectivity(20 / 28.f, ge(cast("f_integer", BIGINT), int5));
+
+ useFieldWithValues("f_bigint", VALUES2, KLL2);
+ checkSelectivity(1 / 3.f, ge(cast("f_bigint", TINYINT), int5));
+ checkSelectivity(1 / 3.f, ge(cast("f_bigint", SMALLINT), int5));
+ checkSelectivity(1 / 3.f, ge(cast("f_bigint", INTEGER), int5));
+ checkSelectivity(20 / 28.f, ge(cast("f_bigint", BIGINT), int5));
+ }
+
+ @Test
+ public void testRangePredicateTypeMatrix() {
+ // checks many possible combinations of types
+ List<RelDataTypeField> fields = tableType.getFieldList();
+ for (var srcField : fields) {
+ if (isTemporal(srcField.getType())) {
+ continue;
+ }
Review Comment:
Instead of checking for temporal types on every iteration in both loops you
can filter the unsupported fields from the initial `fields` list before
entering the loops.
##########
ql/src/test/org/apache/hadoop/hive/ql/optimizer/calcite/stats/TestFilterSelectivityEstimator.java:
##########
@@ -605,7 +611,36 @@ public void
testComputeRangePredicateSelectivityNotBetweenWithNULLS() {
}
@Test
- public void testRangePredicateCastInteger() {
+ public void testRangePredicateCastIntegerValuesInsideTypeRange() {
+ // use VALUES2, even if the tested types cannot represent its values
+ // we're only interested in whether the cast to a smaller integer type
results in the default selectivity
Review Comment:
nit: Comment not relevant
##########
ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java:
##########
@@ -184,91 +189,383 @@ public Double visitCall(RexCall call) {
return selectivity;
}
+ /**
+ * Return whether the expression is a removable cast based on stats and type
bounds.
+ *
+ * <p>
+ * In Hive, if a value cannot be represented by the cast, the result of the
cast is NULL,
+ * and therefore cannot fulfill the predicate. So the possible range of the
values
+ * is limited by the range of possible values of the type.
+ * </p>
Review Comment:
In the current version of the code, stats and type bounds are only used for
CAST between integer types. For other combinations the decision matrix returns
explicitly true or false. I don't fully understand when a CAST is considered
safe for removal.
For instance, when the source type is a `DECIMAL` and the target type is a
`TINYINT` the method returns `true` (i.e., cast is "removable"). However, there
are values in the `DECIMAL` range that cannot fit in a `TINYINT` and will come
back as NULL, so it's a bit unclear what the method is trying to achieve.
The Javadoc should be updated to clarify a bit better which cases we are
trying to capture. I am wondering if the details about NULL values after cast
are still relevant.
##########
ql/src/test/org/apache/hadoop/hive/ql/optimizer/calcite/stats/TestFilterSelectivityEstimator.java:
##########
@@ -511,6 +605,428 @@ public void
testComputeRangePredicateSelectivityNotBetweenWithNULLS() {
doReturn(Collections.singletonList(stats)).when(tableMock).getColStat(Collections.singletonList(0));
RexNode filter = REX_BUILDER.makeCall(HiveBetween.INSTANCE, boolTrue,
inputRef0, int1, int3);
FilterSelectivityEstimator estimator = new
FilterSelectivityEstimator(scan, mq);
- Assert.assertEquals(0.55, estimator.estimateSelectivity(filter), DELTA);
+ // only the values 4, 5, 6, 7 fulfill the condition NOT BETWEEN 1 AND 3
+ // (the NULL values do not fulfill the condition)
+ Assert.assertEquals(0.2, estimator.estimateSelectivity(filter), DELTA);
+ }
+
+ @Test
+ public void testRangePredicateCastIntegerValuesInsideTypeRange() {
+ // use VALUES2, even if the tested types cannot represent its values
+ // we're only interested in whether the cast to a smaller integer type
results in the default selectivity
+ useFieldWithValues("f_tinyint", VALUES, KLL);
+ checkSelectivity(3 / 13.f, ge(cast("f_tinyint", TINYINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_tinyint", SMALLINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_tinyint", INTEGER), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_tinyint", BIGINT), int5));
+
+ useFieldWithValues("f_smallint", VALUES, KLL);
+ checkSelectivity(3 / 13.f, ge(cast("f_smallint", TINYINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_smallint", SMALLINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_smallint", INTEGER), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_smallint", BIGINT), int5));
+
+ useFieldWithValues("f_integer", VALUES, KLL);
+ checkSelectivity(3 / 13.f, ge(cast("f_integer", TINYINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_integer", SMALLINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_integer", INTEGER), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_integer", BIGINT), int5));
+
+ useFieldWithValues("f_bigint", VALUES, KLL);
+ checkSelectivity(3 / 13.f, ge(cast("f_bigint", TINYINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_bigint", SMALLINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_bigint", INTEGER), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_bigint", BIGINT), int5));
+ }
+
+ @Test
+ public void testRangePredicateCastIntegerValuesOutsideTypeRange() {
+ // use VALUES2, even if the tested types cannot represent its values
+ // we're only interested in whether the cast to a smaller integer type
results in the default selectivity
+ useFieldWithValues("f_tinyint", VALUES2, KLL2);
+ checkSelectivity(16 / 28.f, ge(cast("f_tinyint", TINYINT), int5));
+ checkSelectivity(18 / 28.f, ge(cast("f_tinyint", SMALLINT), int5));
+ checkSelectivity(20 / 28.f, ge(cast("f_tinyint", INTEGER), int5));
+ checkSelectivity(20 / 28.f, ge(cast("f_tinyint", BIGINT), int5));
+
+ useFieldWithValues("f_smallint", VALUES2, KLL2);
+ checkSelectivity(1 / 3.f, ge(cast("f_smallint", TINYINT), int5));
+ checkSelectivity(18 / 28.f, ge(cast("f_smallint", SMALLINT), int5));
+ checkSelectivity(20 / 28.f, ge(cast("f_smallint", INTEGER), int5));
+ checkSelectivity(20 / 28.f, ge(cast("f_smallint", BIGINT), int5));
+
+ useFieldWithValues("f_integer", VALUES2, KLL2);
+ checkSelectivity(1 / 3.f, ge(cast("f_integer", TINYINT), int5));
+ checkSelectivity(1 / 3.f, ge(cast("f_integer", SMALLINT), int5));
+ checkSelectivity(20 / 28.f, ge(cast("f_integer", INTEGER), int5));
+ checkSelectivity(20 / 28.f, ge(cast("f_integer", BIGINT), int5));
+
+ useFieldWithValues("f_bigint", VALUES2, KLL2);
+ checkSelectivity(1 / 3.f, ge(cast("f_bigint", TINYINT), int5));
+ checkSelectivity(1 / 3.f, ge(cast("f_bigint", SMALLINT), int5));
+ checkSelectivity(1 / 3.f, ge(cast("f_bigint", INTEGER), int5));
+ checkSelectivity(20 / 28.f, ge(cast("f_bigint", BIGINT), int5));
+ }
+
+ @Test
+ public void testRangePredicateTypeMatrix() {
Review Comment:
1. Should the name be `testBetweenPredicateCastMatrix` ?
2. Do we need the respective matrix test for a range predicate?
3. Should we combine this with
`testRangePredicateCastIntegerValuesInsideTypeRange`?
##########
ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java:
##########
@@ -184,91 +189,383 @@ public Double visitCall(RexCall call) {
return selectivity;
}
+ /**
+ * Return whether the expression is a removable cast based on stats and type
bounds.
+ *
+ * <p>
+ * In Hive, if a value cannot be represented by the cast, the result of the
cast is NULL,
+ * and therefore cannot fulfill the predicate. So the possible range of the
values
+ * is limited by the range of possible values of the type.
+ * </p>
+ *
+ * @param exp the expression to check
+ * @param tableScan the table that provides the statistics
+ * @return true if the expression is a removable cast, false otherwise
+ */
+ private boolean isRemovableCast(RexNode exp, HiveTableScan tableScan) {
+ if(SqlKind.CAST != exp.getKind()) {
+ return false;
+ }
+ RexCall cast = (RexCall) exp;
+ RexNode op0 = cast.getOperands().getFirst();
+ if (!(op0 instanceof RexInputRef)) {
+ return false;
+ }
+ int index = ((RexInputRef) op0).getIndex();
+ final List<ColStatistics> colStats =
tableScan.getColStat(Collections.singletonList(index));
+ if (colStats.isEmpty()) {
+ return false;
+ }
+
+ SqlTypeName sourceType = op0.getType().getSqlTypeName();
+ SqlTypeName targetType = cast.getType().getSqlTypeName();
+
+ switch (sourceType) {
+ case TINYINT, SMALLINT, INTEGER, BIGINT:
+ switch (targetType) {// additional checks are needed
+ case TINYINT, SMALLINT, INTEGER, BIGINT:
+ return isRemovableIntegerCast(cast, op0, colStats);
+ case FLOAT, DOUBLE, DECIMAL:
+ return true;
+ default:
+ return false;
Review Comment:
nit: We could possibly remove all `default` blocks and just keep one `return
false` statement at the end of the method.
##########
ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java:
##########
@@ -226,43 +247,41 @@ private boolean isRemovableCast(RexNode exp,
HiveTableScan tableScan) {
return false;
}
- SqlTypeName type = cast.getType().getSqlTypeName();
- double min;
- double max;
- switch (type) {
- case TINYINT, SMALLINT, INTEGER, BIGINT:
- min = ((Number) type.getLimit(false, SqlTypeName.Limit.OVERFLOW, false,
-1, -1)).doubleValue();
- max = ((Number) type.getLimit(true, SqlTypeName.Limit.OVERFLOW, false,
-1, -1)).doubleValue();
- break;
- case TIMESTAMP, DATE:
- min = Long.MIN_VALUE;
- max = Long.MAX_VALUE;
- break;
- case FLOAT:
- min = -Float.MAX_VALUE;
- max = Float.MAX_VALUE;
- break;
- case DOUBLE, DECIMAL:
- min = -Double.MAX_VALUE;
- max = Double.MAX_VALUE;
- break;
- default:
- // unknown type, do not remove the cast
- return false;
- }
// are all values of the input column accepted by the cast?
+ double min = ((Number) targetType.getLimit(false,
SqlTypeName.Limit.OVERFLOW, false, -1, -1)).doubleValue();
+ double max = ((Number) targetType.getLimit(true,
SqlTypeName.Limit.OVERFLOW, false, -1, -1)).doubleValue();
return min < colRange.minValue.doubleValue() &&
colRange.maxValue.doubleValue() < max;
}
/**
- * Get the range of values that are rounded to valid values of a DECIMAL
type.
+ * Get the range of values that are rounded to valid values of a type.
*
- * @param type the DECIMAL type
+ * @param type the type
* @param lowerBound the lower bound type of the result
* @param upperBound the upper bound type of the result
* @return the range of the type
*/
+ private static Range<Float> getRangeOfType(RelDataType type, BoundType
lowerBound, BoundType upperBound) {
+ switch (type.getSqlTypeName()) {
+ // in case of integer types,
+ case TINYINT:
+ return Range.closed(-128.99998f, 127.99999f);
+ case SMALLINT:
+ return Range.closed(-32768.996f, 32767.998f);
+ case INTEGER:
+ return Range.closed(-2.1474836E9f, 2.1474836E9f);
+ case BIGINT, DATE, TIMESTAMP:
+ return Range.closed(-9.223372E18f, 9.223372E18f);
+ case DECIMAL:
+ return getRangeOfDecimalType(type, lowerBound, upperBound);
Review Comment:
Indeed the endpoints values depend on the precision and scale of the decimal
type. My question is more about the `BoundType` of each end-point. For
non-decimal types we are always using `closed` ranges in this method so I was
wondering if we can also do the same for the decimal case.
##########
ql/src/test/org/apache/hadoop/hive/ql/optimizer/calcite/stats/TestFilterSelectivityEstimator.java:
##########
@@ -591,6 +604,35 @@ public void
testComputeRangePredicateSelectivityNotBetweenWithNULLS() {
Assert.assertEquals(0.2, estimator.estimateSelectivity(filter), DELTA);
}
+ @Test
+ public void testRangePredicateCastInteger() {
+ // use VALUES2, even if the tested types cannot represent its values
+ // we're only interested in whether the cast to a smaller integer type
results in the default selectivity
Review Comment:
Thanks for the additional tests!
##########
ql/src/test/org/apache/hadoop/hive/ql/optimizer/calcite/stats/TestFilterSelectivityEstimator.java:
##########
@@ -605,7 +611,36 @@ public void
testComputeRangePredicateSelectivityNotBetweenWithNULLS() {
}
@Test
- public void testRangePredicateCastInteger() {
+ public void testRangePredicateCastIntegerValuesInsideTypeRange() {
+ // use VALUES2, even if the tested types cannot represent its values
+ // we're only interested in whether the cast to a smaller integer type
results in the default selectivity
+ useFieldWithValues("f_tinyint", VALUES, KLL);
+ checkSelectivity(3 / 13.f, ge(cast("f_tinyint", TINYINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_tinyint", SMALLINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_tinyint", INTEGER), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_tinyint", BIGINT), int5));
+
+ useFieldWithValues("f_smallint", VALUES, KLL);
+ checkSelectivity(3 / 13.f, ge(cast("f_smallint", TINYINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_smallint", SMALLINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_smallint", INTEGER), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_smallint", BIGINT), int5));
+
+ useFieldWithValues("f_integer", VALUES, KLL);
+ checkSelectivity(3 / 13.f, ge(cast("f_integer", TINYINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_integer", SMALLINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_integer", INTEGER), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_integer", BIGINT), int5));
+
+ useFieldWithValues("f_bigint", VALUES, KLL);
+ checkSelectivity(3 / 13.f, ge(cast("f_bigint", TINYINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_bigint", SMALLINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_bigint", INTEGER), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_bigint", BIGINT), int5));
Review Comment:
With the refactoring suggestion this test seems really close to
`testRangePredicateTypeMatrix` which is currently testing casts with `BETWEEN`.
Should we combine everything into one test?
##########
ql/src/test/org/apache/hadoop/hive/ql/optimizer/calcite/stats/TestFilterSelectivityEstimator.java:
##########
@@ -511,6 +605,428 @@ public void
testComputeRangePredicateSelectivityNotBetweenWithNULLS() {
doReturn(Collections.singletonList(stats)).when(tableMock).getColStat(Collections.singletonList(0));
RexNode filter = REX_BUILDER.makeCall(HiveBetween.INSTANCE, boolTrue,
inputRef0, int1, int3);
FilterSelectivityEstimator estimator = new
FilterSelectivityEstimator(scan, mq);
- Assert.assertEquals(0.55, estimator.estimateSelectivity(filter), DELTA);
+ // only the values 4, 5, 6, 7 fulfill the condition NOT BETWEEN 1 AND 3
+ // (the NULL values do not fulfill the condition)
+ Assert.assertEquals(0.2, estimator.estimateSelectivity(filter), DELTA);
+ }
+
+ @Test
+ public void testRangePredicateCastIntegerValuesInsideTypeRange() {
+ // use VALUES2, even if the tested types cannot represent its values
+ // we're only interested in whether the cast to a smaller integer type
results in the default selectivity
+ useFieldWithValues("f_tinyint", VALUES, KLL);
+ checkSelectivity(3 / 13.f, ge(cast("f_tinyint", TINYINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_tinyint", SMALLINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_tinyint", INTEGER), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_tinyint", BIGINT), int5));
+
+ useFieldWithValues("f_smallint", VALUES, KLL);
+ checkSelectivity(3 / 13.f, ge(cast("f_smallint", TINYINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_smallint", SMALLINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_smallint", INTEGER), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_smallint", BIGINT), int5));
+
+ useFieldWithValues("f_integer", VALUES, KLL);
+ checkSelectivity(3 / 13.f, ge(cast("f_integer", TINYINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_integer", SMALLINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_integer", INTEGER), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_integer", BIGINT), int5));
+
+ useFieldWithValues("f_bigint", VALUES, KLL);
+ checkSelectivity(3 / 13.f, ge(cast("f_bigint", TINYINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_bigint", SMALLINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_bigint", INTEGER), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_bigint", BIGINT), int5));
+ }
+
+ @Test
+ public void testRangePredicateCastIntegerValuesOutsideTypeRange() {
+ // use VALUES2, even if the tested types cannot represent its values
+ // we're only interested in whether the cast to a smaller integer type
results in the default selectivity
+ useFieldWithValues("f_tinyint", VALUES2, KLL2);
+ checkSelectivity(16 / 28.f, ge(cast("f_tinyint", TINYINT), int5));
+ checkSelectivity(18 / 28.f, ge(cast("f_tinyint", SMALLINT), int5));
+ checkSelectivity(20 / 28.f, ge(cast("f_tinyint", INTEGER), int5));
+ checkSelectivity(20 / 28.f, ge(cast("f_tinyint", BIGINT), int5));
+
+ useFieldWithValues("f_smallint", VALUES2, KLL2);
+ checkSelectivity(1 / 3.f, ge(cast("f_smallint", TINYINT), int5));
+ checkSelectivity(18 / 28.f, ge(cast("f_smallint", SMALLINT), int5));
+ checkSelectivity(20 / 28.f, ge(cast("f_smallint", INTEGER), int5));
+ checkSelectivity(20 / 28.f, ge(cast("f_smallint", BIGINT), int5));
+
+ useFieldWithValues("f_integer", VALUES2, KLL2);
+ checkSelectivity(1 / 3.f, ge(cast("f_integer", TINYINT), int5));
+ checkSelectivity(1 / 3.f, ge(cast("f_integer", SMALLINT), int5));
+ checkSelectivity(20 / 28.f, ge(cast("f_integer", INTEGER), int5));
+ checkSelectivity(20 / 28.f, ge(cast("f_integer", BIGINT), int5));
+
+ useFieldWithValues("f_bigint", VALUES2, KLL2);
+ checkSelectivity(1 / 3.f, ge(cast("f_bigint", TINYINT), int5));
+ checkSelectivity(1 / 3.f, ge(cast("f_bigint", SMALLINT), int5));
+ checkSelectivity(1 / 3.f, ge(cast("f_bigint", INTEGER), int5));
+ checkSelectivity(20 / 28.f, ge(cast("f_bigint", BIGINT), int5));
+ }
+
+ @Test
+ public void testRangePredicateTypeMatrix() {
+ // checks many possible combinations of types
+ List<RelDataTypeField> fields = tableType.getFieldList();
+ for (var srcField : fields) {
+ if (isTemporal(srcField.getType())) {
+ continue;
+ }
+
+ useFieldWithValues(srcField.getName(), VALUES, KLL);
+
+ for (var tgt : fields) {
+ try {
+ if (isTemporal(tgt.getType())) {
+ continue;
+ }
+
+ RexNode expr = cast(srcField.getName(), tgt.getType());
+ checkBetweenSelectivity(3, VALUES.length, VALUES.length, expr, 5, 7);
+ } catch (AssertionError e) {
+ throw new AssertionError("Error when casting from " +
srcField.getType() + " to " + tgt.getType(), e);
+ }
+ }
+ }
+ }
+
+ private boolean isTemporal(RelDataType type) {
+ return type.getSqlTypeName() == TIMESTAMP || type.getSqlTypeName() ==
SqlTypeName.DATE;
+ }
+
+ @Test
+ public void testRangePredicateWithCast() {
+ useFieldWithValues("f_numeric", VALUES, KLL);
+ checkSelectivity(3 / 13.f, ge(cast("f_numeric", TINYINT), int5));
+ checkSelectivity(10 / 13.f, lt(cast("f_numeric", TINYINT), int5));
+ checkSelectivity(2 / 13.f, gt(cast("f_numeric", TINYINT), int5));
+ checkSelectivity(11 / 13.f, le(cast("f_numeric", TINYINT), int5));
+
+ checkSelectivity(12 / 13f, ge(cast("f_numeric", TINYINT), int2));
+ checkSelectivity(1 / 13f, lt(cast("f_numeric", TINYINT), int2));
+ checkSelectivity(5 / 13f, gt(cast("f_numeric", TINYINT), int2));
+ checkSelectivity(8 / 13f, le(cast("f_numeric", TINYINT), int2));
+
+ // check some types
+ checkSelectivity(3 / 13.f, ge(cast("f_numeric", INTEGER), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_numeric", SMALLINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_numeric", BIGINT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_numeric", SqlTypeName.FLOAT), int5));
+ checkSelectivity(3 / 13.f, ge(cast("f_numeric", SqlTypeName.DOUBLE),
int5));
+ }
+
+ @Test
+ public void testRangePredicateWithCast2() {
+ useFieldWithValues("f_numeric", VALUES2, KLL2);
+ RelDataType decimal3s1 = decimalType(3, 1);
+ checkSelectivity(4 / 28.f, ge(cast("f_numeric", decimal3s1),
literalFloat(1)));
+
+ // values from -99.94999 to 99.94999 (both inclusive)
+ checkSelectivity(7 / 28.f, lt(cast("f_numeric", decimal3s1),
literalFloat(100)));
+ checkSelectivity(7 / 28.f, le(cast("f_numeric", decimal3s1),
literalFloat(100)));
+ checkSelectivity(0 / 28.f, gt(cast("f_numeric", decimal3s1),
literalFloat(100)));
+ checkSelectivity(0 / 28.f, ge(cast("f_numeric", decimal3s1),
literalFloat(100)));
+
+ RelDataType decimal4s1 = decimalType(4, 1);
+ checkSelectivity(10 / 28.f, lt(cast("f_numeric", decimal4s1),
literalFloat(100)));
+ checkSelectivity(20 / 28.f, le(cast("f_numeric", decimal4s1),
literalFloat(100)));
+ checkSelectivity(3 / 28.f, gt(cast("f_numeric", decimal4s1),
literalFloat(100)));
+ checkSelectivity(13 / 28.f, ge(cast("f_numeric", decimal4s1),
literalFloat(100)));
+
+ RelDataType decimal2s1 = decimalType(2, 1);
+ checkSelectivity(2 / 28.f, lt(cast("f_numeric", decimal2s1),
literalFloat(100)));
+ checkSelectivity(2 / 28.f, le(cast("f_numeric", decimal2s1),
literalFloat(100)));
+ checkSelectivity(0 / 28.f, gt(cast("f_numeric", decimal2s1),
literalFloat(100)));
+ checkSelectivity(0 / 28.f, ge(cast("f_numeric", decimal2s1),
literalFloat(100)));
+
+ // expected: 100_000f
+ RelDataType decimal7s1 = decimalType(7, 1);
+ checkSelectivity(1 / 28.f, gt(cast("f_numeric", decimal7s1),
literalFloat(10000)));
+
+ // expected: 10_000f, 100_000f, because CAST(1_000_000 AS DECIMAL(7,1)) =
NULL, and similar for even larger values
+ checkSelectivity(2 / 28.f, ge(cast("f_numeric", decimal7s1),
literalFloat(9999)));
+ checkSelectivity(2 / 28.f, ge(cast("f_numeric", decimal7s1),
literalFloat(10000)));
+
+ // expected: 100_000f
+ checkSelectivity(1 / 28.f, gt(cast("f_numeric", decimal7s1),
literalFloat(10000)));
+ checkSelectivity(1 / 28.f, gt(cast("f_numeric", decimal7s1),
literalFloat(10001)));
+
+ // expected 1f, 10f, 99.94998f, 99.94999f
+ checkSelectivity(4 / 28.f, ge(cast("f_numeric", decimal3s1),
literalFloat(1)));
+ checkSelectivity(3 / 28.f, gt(cast("f_numeric", decimal3s1),
literalFloat(1)));
+ // expected -99.94999f, -99.94998f, 0f, 1f
+ checkSelectivity(4 / 28.f, le(cast("f_numeric", decimal3s1),
literalFloat(1)));
+ checkSelectivity(3 / 28.f, lt(cast("f_numeric", decimal3s1),
literalFloat(1)));
+ }
+
+ private void checkTimeFieldOnMidnightTimestamps(RexNode field) {
+ // note: use only values from VALUES_TIME that specify a date without
hh:mm:ss!
+ checkSelectivity(7 / 7.f, ge(field, literalTimestamp("2020-11-01")));
+ checkSelectivity(5 / 7.f, ge(field, literalTimestamp("2020-11-03")));
+ checkSelectivity(1 / 7.f, ge(field, literalTimestamp("2020-11-07")));
+
+ checkSelectivity(6 / 7.f, gt(field, literalTimestamp("2020-11-01")));
+ checkSelectivity(4 / 7.f, gt(field, literalTimestamp("2020-11-03")));
+ checkSelectivity(0 / 7.f, gt(field, literalTimestamp("2020-11-07")));
+
+ checkSelectivity(1 / 7.f, le(field, literalTimestamp("2020-11-01")));
+ checkSelectivity(3 / 7.f, le(field, literalTimestamp("2020-11-03")));
+ checkSelectivity(7 / 7.f, le(field, literalTimestamp("2020-11-07")));
+
+ checkSelectivity(0 / 7.f, lt(field, literalTimestamp("2020-11-01")));
+ checkSelectivity(2 / 7.f, lt(field, literalTimestamp("2020-11-03")));
+ checkSelectivity(6 / 7.f, lt(field, literalTimestamp("2020-11-07")));
+ }
+
+ private void checkTimeFieldOnIntraDayTimestamps(RexNode field) {
+ checkSelectivity(3 / 7.f, ge(field,
literalTimestamp("2020-11-05T11:23:45Z")));
+ checkSelectivity(2 / 7.f, gt(field,
literalTimestamp("2020-11-05T11:23:45Z")));
+ checkSelectivity(5 / 7.f, le(field,
literalTimestamp("2020-11-05T11:23:45Z")));
+ checkSelectivity(4 / 7.f, lt(field,
literalTimestamp("2020-11-05T11:23:45Z")));
+ }
+
+ @Test
+ public void testRangePredicateOnTimestamp() {
+ useFieldWithValues("f_timestamp", VALUES_TIME, KLL_TIME);
+ checkTimeFieldOnMidnightTimestamps(currentInputRef);
+ checkTimeFieldOnIntraDayTimestamps(currentInputRef);
+ }
+
+ @Test
+ public void testRangePredicateOnTimestampWithCast() {
+ useFieldWithValues("f_timestamp", VALUES_TIME, KLL_TIME);
+ RexNode expr1 = cast("f_timestamp", SqlTypeName.DATE);
+ checkTimeFieldOnMidnightTimestamps(expr1);
+ checkTimeFieldOnIntraDayTimestamps(expr1);
+
+ RexNode expr2 = cast("f_timestamp", SqlTypeName.TIMESTAMP);
+ checkTimeFieldOnMidnightTimestamps(expr2);
+ checkTimeFieldOnIntraDayTimestamps(expr2);
+ }
+
+ @Test
+ public void testRangePredicateOnDate() {
+ useFieldWithValues("f_date", VALUES_TIME, KLL_TIME);
+ checkTimeFieldOnMidnightTimestamps(currentInputRef);
+
+ // it does not make sense to compare with "2020-11-05T11:23:45Z",
+ // as that value would not be stored as-is in a date column, but as
"2020-11-05" instead
+ }
+
+ @Test
+ public void testRangePredicateOnDateWithCast() {
Review Comment:
It seems that there are no tests with CAST for DATE/TIMESTAMP using the
`BETWEEN` predicate.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]