kgyrtkirk commented on a change in pull request #1126: URL: https://github.com/apache/hive/pull/1126#discussion_r447081115
########## File path: ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveRewriteToDataSketchesRules.java ########## @@ -483,46 +489,44 @@ protected final SqlOperator getSqlOperator(String fnName) { } /** - * Rewrites {@code cume_dist() over (order by id)}. + * Provides a generic way to rewrite function into using an estimation based on CDF. + * + * There are a few methods which could be supported this way: NTILE, CUME_DIST, RANK * + * For example: * <pre> * SELECT id, CUME_DIST() OVER (ORDER BY id) FROM sketch_input; - * ⇒ SELECT id, 1.0-ds_kll_cdf(ds, CAST(-id AS FLOAT) )[0] + * ⇒ SELECT id, ds_kll_cdf(ds, CAST(id AS FLOAT) )[0] * FROM sketch_input JOIN ( - * SELECT ds_kll_sketch(CAST(-id AS FLOAT)) AS ds FROM sketch_input + * SELECT ds_kll_sketch(CAST(id AS FLOAT)) AS ds FROM sketch_input * ) q; * </pre> */ - public static class CumeDistRewrite extends WindowingToProjectAggregateJoinProject { + public static abstract class AbstractRankBasedRewriteRule extends WindowingToProjectAggregateJoinProject { - public CumeDistRewrite(String sketchType) { + public AbstractRankBasedRewriteRule(String sketchType) { super(sketchType); } - @Override - protected VbuilderPAP buildProcessor(RelOptRuleCall call) { - return new VB(sketchType, call.builder()); - } + protected static abstract class AbstractRankBasedRewriteBuilder extends VbuilderPAP { - private static class VB extends VbuilderPAP { - - protected VB(String sketchClass, RelBuilder relBuilder) { + protected AbstractRankBasedRewriteBuilder(String sketchClass, RelBuilder relBuilder) { super(sketchClass, relBuilder); } @Override - boolean isApplicable(RexOver over) { - SqlAggFunction aggOp = over.getAggOperator(); + final boolean isApplicable(RexOver over) { RexWindow window = over.getWindow(); - if (aggOp.getName().equalsIgnoreCase("cume_dist") && window.orderKeys.size() == 1 - && window.getLowerBound().isUnbounded() && window.getUpperBound().isUnbounded()) { + if (window.orderKeys.size() == 1 + && window.getLowerBound().isUnbounded() && window.getUpperBound().isUnbounded() Review comment: interesting; mostly the second :D for the current functions (ntile,cume_dist) doesn't really make sense to set the window anything than unbounded (or at least I don't see a usecase for it) I've tried this out for the below query: ``` select id,ntile(id) over (order by id rows between 1 preceding and 1 following) from sketch_input order by id nulls last; ``` * mysql: rejects it with an error that `ntile` doesn't support it * psql: accepts and executes it without interpreting the preceding/following stuff correctly * hive: stops with a semanticexception ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: gitbox-unsubscr...@hive.apache.org For additional commands, e-mail: gitbox-h...@hive.apache.org