Jackie-Jiang commented on PR #12354:
URL: https://github.com/apache/pinot/pull/12354#issuecomment-1961924193

   I tried the following micro-benchmark and see very interesting result:
   
   Code:
   ```
   package org.apache.pinot.perf;
   
   import java.io.IOException;
   import java.math.BigDecimal;
   import java.util.List;
   import java.util.Map;
   import java.util.concurrent.TimeUnit;
   import java.util.function.LongSupplier;
   import org.apache.pinot.common.request.context.ExpressionContext;
   import org.apache.pinot.core.common.BlockValSet;
   import org.apache.pinot.core.plan.DocIdSetPlanNode;
   import org.apache.pinot.core.query.aggregation.AggregationResultHolder;
   import org.apache.pinot.core.query.aggregation.DoubleAggregationResultHolder;
   import org.apache.pinot.core.query.aggregation.function.AggregationFunction;
   import 
org.apache.pinot.core.query.aggregation.function.SumAggregationFunction;
   import 
org.apache.pinot.core.query.aggregation.function.SumAggregationFunctionFoldDouble;
   import 
org.apache.pinot.core.query.aggregation.function.SumAggregationFunctionFoldHolder;
   import 
org.apache.pinot.core.query.aggregation.function.SumAggregationFunctionFoldPrimitive;
   import org.apache.pinot.segment.spi.index.reader.Dictionary;
   import org.apache.pinot.spi.data.FieldSpec.DataType;
   import org.jetbrains.annotations.Nullable;
   import org.openjdk.jmh.annotations.Benchmark;
   import org.openjdk.jmh.annotations.BenchmarkMode;
   import org.openjdk.jmh.annotations.Fork;
   import org.openjdk.jmh.annotations.Level;
   import org.openjdk.jmh.annotations.Measurement;
   import org.openjdk.jmh.annotations.Mode;
   import org.openjdk.jmh.annotations.OutputTimeUnit;
   import org.openjdk.jmh.annotations.Param;
   import org.openjdk.jmh.annotations.Scope;
   import org.openjdk.jmh.annotations.Setup;
   import org.openjdk.jmh.annotations.State;
   import org.openjdk.jmh.annotations.Warmup;
   import org.openjdk.jmh.infra.Blackhole;
   import org.openjdk.jmh.runner.Runner;
   import org.openjdk.jmh.runner.RunnerException;
   import org.openjdk.jmh.runner.options.Options;
   import org.openjdk.jmh.runner.options.OptionsBuilder;
   import org.roaringbitmap.RoaringBitmap;
   
   
   @Fork(1)
   @BenchmarkMode(Mode.Throughput)
   @Warmup(iterations = 2, time = 1)
   @Measurement(iterations = 3, time = 1)
   @OutputTimeUnit(TimeUnit.MILLISECONDS)
   @State(Scope.Benchmark)
   public class BenchmarkSumAggregation2 {
     public static void main(String[] args)
         throws RunnerException {
       Options opt = new 
OptionsBuilder().include(BenchmarkSumAggregation2.class.getSimpleName())
   //        .addProfiler(GCProfiler.class)
           .build();
   
       new Runner(opt).run();
     }
   
     private static final ExpressionContext EXPR = 
ExpressionContext.forIdentifier("col");
   
     @Param({"true", "false"})
     public boolean _nullHandlingEnabled;
     @Param({"2", "4", "8", "16", "32", "64", "128"})
     public int _nullInterval;
     @Param({"normal", "foldDouble", "foldPrimitive"})
     public String _impl;
   
     private AggregationFunction _aggregationFunction;
     private AggregationResultHolder _resultHolder;
     private RoaringBitmap _nullBitmap;
     private long[] _values;
     private Map<ExpressionContext, BlockValSet> _blockValSetMap;
     private double _expectedSum;
     private double _nextExpectedSum;
   
     @Setup(Level.Trial)
     public void setup()
         throws IOException {
       switch (_impl) {
         case "normal":
           _aggregationFunction = new SumAggregationFunction(List.of(EXPR), 
_nullHandlingEnabled);
           break;
         case "foldDouble":
           _aggregationFunction = new 
SumAggregationFunctionFoldDouble(List.of(EXPR), _nullHandlingEnabled);
           break;
         case "foldPrimitive":
           _aggregationFunction = new 
SumAggregationFunctionFoldPrimitive(List.of(EXPR), _nullHandlingEnabled);
           break;
         case "foldHolder":
           _aggregationFunction = new 
SumAggregationFunctionFoldHolder(List.of(EXPR), _nullHandlingEnabled);
           break;
         default:
           throw new IllegalArgumentException("Unknown impl: " + _impl);
       }
       _resultHolder = _aggregationFunction.createAggregationResultHolder();
       _nullBitmap = createNullBitmap();
       _values = new long[DocIdSetPlanNode.MAX_DOC_PER_CALL];
       LongSupplier longSupplier = Distribution.createLongSupplier(42, 
"EXP(0.5)");
       for (int i = 0; i < DocIdSetPlanNode.MAX_DOC_PER_CALL; i++) {
         _values[i] = longSupplier.getAsLong();
       }
       _expectedSum = 0;
       for (int i = 0; i < DocIdSetPlanNode.MAX_DOC_PER_CALL; i++) {
         if (_nullBitmap == null || !_nullBitmap.contains(i)) {
           _expectedSum += _values[i];
         }
       }
       _nextExpectedSum = _expectedSum;
       _blockValSetMap = Map.of(EXPR, new BenchmarkBlockValSet(_nullBitmap, 
_values));
     }
   
     private RoaringBitmap createNullBitmap() {
       if (_nullHandlingEnabled) {
         RoaringBitmap nullBitmap = new RoaringBitmap();
         for (int i = 0; i < DocIdSetPlanNode.MAX_DOC_PER_CALL; i += 
_nullInterval) {
           nullBitmap.add(i);
         }
         return nullBitmap;
       } else {
         return null;
       }
     }
   
     private static class BenchmarkBlockValSet implements BlockValSet {
   
       final RoaringBitmap _nullBitmap;
       final long[] _values;
   
       private BenchmarkBlockValSet(@Nullable RoaringBitmap nullBitmap, long[] 
values) {
         _nullBitmap = nullBitmap;
         _values = values;
       }
   
       @Nullable
       @Override
       public RoaringBitmap getNullBitmap() {
         return _nullBitmap;
       }
   
       @Override
       public DataType getValueType() {
         return DataType.LONG;
       }
   
       @Override
       public boolean isSingleValue() {
         return true;
       }
   
       @Nullable
       @Override
       public Dictionary getDictionary() {
         throw new UnsupportedOperationException();
       }
   
       @Override
       public int[] getDictionaryIdsSV() {
         throw new UnsupportedOperationException();
       }
   
       @Override
       public int[] getIntValuesSV() {
         throw new UnsupportedOperationException();
       }
   
       @Override
       public long[] getLongValuesSV() {
         return _values;
       }
   
       @Override
       public float[] getFloatValuesSV() {
         throw new UnsupportedOperationException();
       }
   
       @Override
       public double[] getDoubleValuesSV() {
         throw new UnsupportedOperationException();
       }
   
       @Override
       public BigDecimal[] getBigDecimalValuesSV() {
         throw new UnsupportedOperationException();
       }
   
       @Override
       public String[] getStringValuesSV() {
         throw new UnsupportedOperationException();
       }
   
       @Override
       public byte[][] getBytesValuesSV() {
         throw new UnsupportedOperationException();
       }
   
       @Override
       public int[][] getDictionaryIdsMV() {
         throw new UnsupportedOperationException();
       }
   
       @Override
       public int[][] getIntValuesMV() {
         throw new UnsupportedOperationException();
       }
   
       @Override
       public long[][] getLongValuesMV() {
         throw new UnsupportedOperationException();
       }
   
       @Override
       public float[][] getFloatValuesMV() {
         throw new UnsupportedOperationException();
       }
   
       @Override
       public double[][] getDoubleValuesMV() {
         throw new UnsupportedOperationException();
       }
   
       @Override
       public String[][] getStringValuesMV() {
         throw new UnsupportedOperationException();
       }
   
       @Override
       public byte[][][] getBytesValuesMV() {
         throw new UnsupportedOperationException();
       }
   
       @Override
       public int[] getNumMVEntries() {
         throw new UnsupportedOperationException();
       }
     }
   
     @Benchmark
     public void test(Blackhole bh) {
       _aggregationFunction.aggregate(DocIdSetPlanNode.MAX_DOC_PER_CALL, 
_resultHolder, _blockValSetMap);
       if (_resultHolder instanceof DoubleAggregationResultHolder) {
         double result = _resultHolder.getDoubleResult();
         if (result != _nextExpectedSum) {
           throw new IllegalStateException("Expected: " + _nextExpectedSum + ", 
got: " + result);
         }
         bh.consume(result);
       } else {
         Double result = _resultHolder.getResult();
         if (result != _nextExpectedSum) {
           throw new IllegalStateException("Expected: " + _nextExpectedSum + ", 
got: " + _resultHolder.getResult());
         }
         bh.consume(result);
       }
       _nextExpectedSum += _expectedSum;
     }
   }
   ```
   
   Result (`foldHolder` doesn't give correct result, please take a look):
   ```
   Benchmark                            (_impl)  (_nullHandlingEnabled)  
(_nullInterval)   Mode  Cnt    Score     Error   Units
   BenchmarkSumAggregation2.test         normal                    true         
       2  thrpt    3   39.276 ±  20.887  ops/ms
   BenchmarkSumAggregation2.test         normal                    true         
       4  thrpt    3    2.328 ±   3.333  ops/ms
   BenchmarkSumAggregation2.test         normal                    true         
       8  thrpt    3    3.280 ±   1.050  ops/ms
   BenchmarkSumAggregation2.test         normal                    true         
      16  thrpt    3    4.128 ±   1.031  ops/ms
   BenchmarkSumAggregation2.test         normal                    true         
      32  thrpt    3    4.516 ±   0.757  ops/ms
   BenchmarkSumAggregation2.test         normal                    true         
      64  thrpt    3    5.817 ±   0.127  ops/ms
   BenchmarkSumAggregation2.test         normal                    true         
     128  thrpt    3    5.833 ±   0.155  ops/ms
   BenchmarkSumAggregation2.test         normal                   false         
       2  thrpt    3   91.698 ±  16.053  ops/ms
   BenchmarkSumAggregation2.test         normal                   false         
       4  thrpt    3   90.171 ±  12.357  ops/ms
   BenchmarkSumAggregation2.test         normal                   false         
       8  thrpt    3   93.684 ±   4.769  ops/ms
   BenchmarkSumAggregation2.test         normal                   false         
      16  thrpt    3   91.339 ±  49.780  ops/ms
   BenchmarkSumAggregation2.test         normal                   false         
      32  thrpt    3   92.072 ±  13.054  ops/ms
   BenchmarkSumAggregation2.test         normal                   false         
      64  thrpt    3   92.704 ±   4.640  ops/ms
   BenchmarkSumAggregation2.test         normal                   false         
     128  thrpt    3   91.891 ±  20.353  ops/ms
   BenchmarkSumAggregation2.test     foldDouble                    true         
       2  thrpt    3   28.216 ±   0.871  ops/ms
   BenchmarkSumAggregation2.test     foldDouble                    true         
       4  thrpt    3   49.684 ±  12.845  ops/ms
   BenchmarkSumAggregation2.test     foldDouble                    true         
       8  thrpt    3   88.009 ±  24.841  ops/ms
   BenchmarkSumAggregation2.test     foldDouble                    true         
      16  thrpt    3  123.312 ±  11.984  ops/ms
   BenchmarkSumAggregation2.test     foldDouble                    true         
      32  thrpt    3  116.148 ±  65.820  ops/ms
   BenchmarkSumAggregation2.test     foldDouble                    true         
      64  thrpt    3  123.562 ±  23.135  ops/ms
   BenchmarkSumAggregation2.test     foldDouble                    true         
     128  thrpt    3  115.864 ± 116.102  ops/ms
   BenchmarkSumAggregation2.test     foldDouble                   false         
       2  thrpt    3   89.577 ±  69.898  ops/ms
   BenchmarkSumAggregation2.test     foldDouble                   false         
       4  thrpt    3   91.662 ±  35.556  ops/ms
   BenchmarkSumAggregation2.test     foldDouble                   false         
       8  thrpt    3   91.258 ±  17.352  ops/ms
   BenchmarkSumAggregation2.test     foldDouble                   false         
      16  thrpt    3   90.990 ±  27.492  ops/ms
   BenchmarkSumAggregation2.test     foldDouble                   false         
      32  thrpt    3   91.151 ±  22.358  ops/ms
   BenchmarkSumAggregation2.test     foldDouble                   false         
      64  thrpt    3   92.060 ±   3.458  ops/ms
   BenchmarkSumAggregation2.test     foldDouble                   false         
     128  thrpt    3   91.715 ±   4.820  ops/ms
   BenchmarkSumAggregation2.test  foldPrimitive                    true         
       2  thrpt    3   26.680 ±   7.063  ops/ms
   BenchmarkSumAggregation2.test  foldPrimitive                    true         
       4  thrpt    3   46.278 ±   5.670  ops/ms
   BenchmarkSumAggregation2.test  foldPrimitive                    true         
       8  thrpt    3   80.658 ±  33.080  ops/ms
   BenchmarkSumAggregation2.test  foldPrimitive                    true         
      16  thrpt    3  149.492 ±  30.142  ops/ms
   BenchmarkSumAggregation2.test  foldPrimitive                    true         
      32  thrpt    3  225.837 ±  63.126  ops/ms
   BenchmarkSumAggregation2.test  foldPrimitive                    true         
      64  thrpt    3  186.245 ±  24.884  ops/ms
   BenchmarkSumAggregation2.test  foldPrimitive                    true         
     128  thrpt    3  198.177 ±  73.624  ops/ms
   BenchmarkSumAggregation2.test  foldPrimitive                   false         
       2  thrpt    3  364.593 ±  44.298  ops/ms
   BenchmarkSumAggregation2.test  foldPrimitive                   false         
       4  thrpt    3  359.073 ± 194.996  ops/ms
   BenchmarkSumAggregation2.test  foldPrimitive                   false         
       8  thrpt    3  362.277 ±  92.352  ops/ms
   BenchmarkSumAggregation2.test  foldPrimitive                   false         
      16  thrpt    3  362.172 ±  52.064  ops/ms
   BenchmarkSumAggregation2.test  foldPrimitive                   false         
      32  thrpt    3  350.423 ± 193.227  ops/ms
   BenchmarkSumAggregation2.test  foldPrimitive                   false         
      64  thrpt    3  350.067 ± 107.294  ops/ms
   BenchmarkSumAggregation2.test  foldPrimitive                   false         
     128  thrpt    3  361.977 ± 128.563  ops/ms
   ```
   
   As you can see, new implementation is giving much better performance when 
null is enabled, and similar performance when null is disabled. `foldPrimitive` 
gives even better performance, even though we cannot use it because of the 
potential overflow.
   
   I tried this on my laptop. Can you try this benchmark on your work station 
and see if you can produce the results? Maybe we can get more insights from it.
   
   Some improvements we should do for the new implementation:
   - When null is enabled, we should use `ObjectAggregationResultHolder` to 
handle the case of all values are `null`
   - Fix the `foldHolder` implementation to give correct result


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org
For additional commands, e-mail: commits-h...@pinot.apache.org

Reply via email to