Jackie-Jiang commented on PR #12354: URL: https://github.com/apache/pinot/pull/12354#issuecomment-1961924193
I tried the following micro-benchmark and see very interesting result: Code: ``` package org.apache.pinot.perf; import java.io.IOException; import java.math.BigDecimal; import java.util.List; import java.util.Map; import java.util.concurrent.TimeUnit; import java.util.function.LongSupplier; import org.apache.pinot.common.request.context.ExpressionContext; import org.apache.pinot.core.common.BlockValSet; import org.apache.pinot.core.plan.DocIdSetPlanNode; import org.apache.pinot.core.query.aggregation.AggregationResultHolder; import org.apache.pinot.core.query.aggregation.DoubleAggregationResultHolder; import org.apache.pinot.core.query.aggregation.function.AggregationFunction; import org.apache.pinot.core.query.aggregation.function.SumAggregationFunction; import org.apache.pinot.core.query.aggregation.function.SumAggregationFunctionFoldDouble; import org.apache.pinot.core.query.aggregation.function.SumAggregationFunctionFoldHolder; import org.apache.pinot.core.query.aggregation.function.SumAggregationFunctionFoldPrimitive; import org.apache.pinot.segment.spi.index.reader.Dictionary; import org.apache.pinot.spi.data.FieldSpec.DataType; import org.jetbrains.annotations.Nullable; import org.openjdk.jmh.annotations.Benchmark; import org.openjdk.jmh.annotations.BenchmarkMode; import org.openjdk.jmh.annotations.Fork; import org.openjdk.jmh.annotations.Level; import org.openjdk.jmh.annotations.Measurement; import org.openjdk.jmh.annotations.Mode; import org.openjdk.jmh.annotations.OutputTimeUnit; import org.openjdk.jmh.annotations.Param; import org.openjdk.jmh.annotations.Scope; import org.openjdk.jmh.annotations.Setup; import org.openjdk.jmh.annotations.State; import org.openjdk.jmh.annotations.Warmup; import org.openjdk.jmh.infra.Blackhole; import org.openjdk.jmh.runner.Runner; import org.openjdk.jmh.runner.RunnerException; import org.openjdk.jmh.runner.options.Options; import org.openjdk.jmh.runner.options.OptionsBuilder; import org.roaringbitmap.RoaringBitmap; @Fork(1) @BenchmarkMode(Mode.Throughput) @Warmup(iterations = 2, time = 1) @Measurement(iterations = 3, time = 1) @OutputTimeUnit(TimeUnit.MILLISECONDS) @State(Scope.Benchmark) public class BenchmarkSumAggregation2 { public static void main(String[] args) throws RunnerException { Options opt = new OptionsBuilder().include(BenchmarkSumAggregation2.class.getSimpleName()) // .addProfiler(GCProfiler.class) .build(); new Runner(opt).run(); } private static final ExpressionContext EXPR = ExpressionContext.forIdentifier("col"); @Param({"true", "false"}) public boolean _nullHandlingEnabled; @Param({"2", "4", "8", "16", "32", "64", "128"}) public int _nullInterval; @Param({"normal", "foldDouble", "foldPrimitive"}) public String _impl; private AggregationFunction _aggregationFunction; private AggregationResultHolder _resultHolder; private RoaringBitmap _nullBitmap; private long[] _values; private Map<ExpressionContext, BlockValSet> _blockValSetMap; private double _expectedSum; private double _nextExpectedSum; @Setup(Level.Trial) public void setup() throws IOException { switch (_impl) { case "normal": _aggregationFunction = new SumAggregationFunction(List.of(EXPR), _nullHandlingEnabled); break; case "foldDouble": _aggregationFunction = new SumAggregationFunctionFoldDouble(List.of(EXPR), _nullHandlingEnabled); break; case "foldPrimitive": _aggregationFunction = new SumAggregationFunctionFoldPrimitive(List.of(EXPR), _nullHandlingEnabled); break; case "foldHolder": _aggregationFunction = new SumAggregationFunctionFoldHolder(List.of(EXPR), _nullHandlingEnabled); break; default: throw new IllegalArgumentException("Unknown impl: " + _impl); } _resultHolder = _aggregationFunction.createAggregationResultHolder(); _nullBitmap = createNullBitmap(); _values = new long[DocIdSetPlanNode.MAX_DOC_PER_CALL]; LongSupplier longSupplier = Distribution.createLongSupplier(42, "EXP(0.5)"); for (int i = 0; i < DocIdSetPlanNode.MAX_DOC_PER_CALL; i++) { _values[i] = longSupplier.getAsLong(); } _expectedSum = 0; for (int i = 0; i < DocIdSetPlanNode.MAX_DOC_PER_CALL; i++) { if (_nullBitmap == null || !_nullBitmap.contains(i)) { _expectedSum += _values[i]; } } _nextExpectedSum = _expectedSum; _blockValSetMap = Map.of(EXPR, new BenchmarkBlockValSet(_nullBitmap, _values)); } private RoaringBitmap createNullBitmap() { if (_nullHandlingEnabled) { RoaringBitmap nullBitmap = new RoaringBitmap(); for (int i = 0; i < DocIdSetPlanNode.MAX_DOC_PER_CALL; i += _nullInterval) { nullBitmap.add(i); } return nullBitmap; } else { return null; } } private static class BenchmarkBlockValSet implements BlockValSet { final RoaringBitmap _nullBitmap; final long[] _values; private BenchmarkBlockValSet(@Nullable RoaringBitmap nullBitmap, long[] values) { _nullBitmap = nullBitmap; _values = values; } @Nullable @Override public RoaringBitmap getNullBitmap() { return _nullBitmap; } @Override public DataType getValueType() { return DataType.LONG; } @Override public boolean isSingleValue() { return true; } @Nullable @Override public Dictionary getDictionary() { throw new UnsupportedOperationException(); } @Override public int[] getDictionaryIdsSV() { throw new UnsupportedOperationException(); } @Override public int[] getIntValuesSV() { throw new UnsupportedOperationException(); } @Override public long[] getLongValuesSV() { return _values; } @Override public float[] getFloatValuesSV() { throw new UnsupportedOperationException(); } @Override public double[] getDoubleValuesSV() { throw new UnsupportedOperationException(); } @Override public BigDecimal[] getBigDecimalValuesSV() { throw new UnsupportedOperationException(); } @Override public String[] getStringValuesSV() { throw new UnsupportedOperationException(); } @Override public byte[][] getBytesValuesSV() { throw new UnsupportedOperationException(); } @Override public int[][] getDictionaryIdsMV() { throw new UnsupportedOperationException(); } @Override public int[][] getIntValuesMV() { throw new UnsupportedOperationException(); } @Override public long[][] getLongValuesMV() { throw new UnsupportedOperationException(); } @Override public float[][] getFloatValuesMV() { throw new UnsupportedOperationException(); } @Override public double[][] getDoubleValuesMV() { throw new UnsupportedOperationException(); } @Override public String[][] getStringValuesMV() { throw new UnsupportedOperationException(); } @Override public byte[][][] getBytesValuesMV() { throw new UnsupportedOperationException(); } @Override public int[] getNumMVEntries() { throw new UnsupportedOperationException(); } } @Benchmark public void test(Blackhole bh) { _aggregationFunction.aggregate(DocIdSetPlanNode.MAX_DOC_PER_CALL, _resultHolder, _blockValSetMap); if (_resultHolder instanceof DoubleAggregationResultHolder) { double result = _resultHolder.getDoubleResult(); if (result != _nextExpectedSum) { throw new IllegalStateException("Expected: " + _nextExpectedSum + ", got: " + result); } bh.consume(result); } else { Double result = _resultHolder.getResult(); if (result != _nextExpectedSum) { throw new IllegalStateException("Expected: " + _nextExpectedSum + ", got: " + _resultHolder.getResult()); } bh.consume(result); } _nextExpectedSum += _expectedSum; } } ``` Result (`foldHolder` doesn't give correct result, please take a look): ``` Benchmark (_impl) (_nullHandlingEnabled) (_nullInterval) Mode Cnt Score Error Units BenchmarkSumAggregation2.test normal true 2 thrpt 3 39.276 ± 20.887 ops/ms BenchmarkSumAggregation2.test normal true 4 thrpt 3 2.328 ± 3.333 ops/ms BenchmarkSumAggregation2.test normal true 8 thrpt 3 3.280 ± 1.050 ops/ms BenchmarkSumAggregation2.test normal true 16 thrpt 3 4.128 ± 1.031 ops/ms BenchmarkSumAggregation2.test normal true 32 thrpt 3 4.516 ± 0.757 ops/ms BenchmarkSumAggregation2.test normal true 64 thrpt 3 5.817 ± 0.127 ops/ms BenchmarkSumAggregation2.test normal true 128 thrpt 3 5.833 ± 0.155 ops/ms BenchmarkSumAggregation2.test normal false 2 thrpt 3 91.698 ± 16.053 ops/ms BenchmarkSumAggregation2.test normal false 4 thrpt 3 90.171 ± 12.357 ops/ms BenchmarkSumAggregation2.test normal false 8 thrpt 3 93.684 ± 4.769 ops/ms BenchmarkSumAggregation2.test normal false 16 thrpt 3 91.339 ± 49.780 ops/ms BenchmarkSumAggregation2.test normal false 32 thrpt 3 92.072 ± 13.054 ops/ms BenchmarkSumAggregation2.test normal false 64 thrpt 3 92.704 ± 4.640 ops/ms BenchmarkSumAggregation2.test normal false 128 thrpt 3 91.891 ± 20.353 ops/ms BenchmarkSumAggregation2.test foldDouble true 2 thrpt 3 28.216 ± 0.871 ops/ms BenchmarkSumAggregation2.test foldDouble true 4 thrpt 3 49.684 ± 12.845 ops/ms BenchmarkSumAggregation2.test foldDouble true 8 thrpt 3 88.009 ± 24.841 ops/ms BenchmarkSumAggregation2.test foldDouble true 16 thrpt 3 123.312 ± 11.984 ops/ms BenchmarkSumAggregation2.test foldDouble true 32 thrpt 3 116.148 ± 65.820 ops/ms BenchmarkSumAggregation2.test foldDouble true 64 thrpt 3 123.562 ± 23.135 ops/ms BenchmarkSumAggregation2.test foldDouble true 128 thrpt 3 115.864 ± 116.102 ops/ms BenchmarkSumAggregation2.test foldDouble false 2 thrpt 3 89.577 ± 69.898 ops/ms BenchmarkSumAggregation2.test foldDouble false 4 thrpt 3 91.662 ± 35.556 ops/ms BenchmarkSumAggregation2.test foldDouble false 8 thrpt 3 91.258 ± 17.352 ops/ms BenchmarkSumAggregation2.test foldDouble false 16 thrpt 3 90.990 ± 27.492 ops/ms BenchmarkSumAggregation2.test foldDouble false 32 thrpt 3 91.151 ± 22.358 ops/ms BenchmarkSumAggregation2.test foldDouble false 64 thrpt 3 92.060 ± 3.458 ops/ms BenchmarkSumAggregation2.test foldDouble false 128 thrpt 3 91.715 ± 4.820 ops/ms BenchmarkSumAggregation2.test foldPrimitive true 2 thrpt 3 26.680 ± 7.063 ops/ms BenchmarkSumAggregation2.test foldPrimitive true 4 thrpt 3 46.278 ± 5.670 ops/ms BenchmarkSumAggregation2.test foldPrimitive true 8 thrpt 3 80.658 ± 33.080 ops/ms BenchmarkSumAggregation2.test foldPrimitive true 16 thrpt 3 149.492 ± 30.142 ops/ms BenchmarkSumAggregation2.test foldPrimitive true 32 thrpt 3 225.837 ± 63.126 ops/ms BenchmarkSumAggregation2.test foldPrimitive true 64 thrpt 3 186.245 ± 24.884 ops/ms BenchmarkSumAggregation2.test foldPrimitive true 128 thrpt 3 198.177 ± 73.624 ops/ms BenchmarkSumAggregation2.test foldPrimitive false 2 thrpt 3 364.593 ± 44.298 ops/ms BenchmarkSumAggregation2.test foldPrimitive false 4 thrpt 3 359.073 ± 194.996 ops/ms BenchmarkSumAggregation2.test foldPrimitive false 8 thrpt 3 362.277 ± 92.352 ops/ms BenchmarkSumAggregation2.test foldPrimitive false 16 thrpt 3 362.172 ± 52.064 ops/ms BenchmarkSumAggregation2.test foldPrimitive false 32 thrpt 3 350.423 ± 193.227 ops/ms BenchmarkSumAggregation2.test foldPrimitive false 64 thrpt 3 350.067 ± 107.294 ops/ms BenchmarkSumAggregation2.test foldPrimitive false 128 thrpt 3 361.977 ± 128.563 ops/ms ``` As you can see, new implementation is giving much better performance when null is enabled, and similar performance when null is disabled. `foldPrimitive` gives even better performance, even though we cannot use it because of the potential overflow. I tried this on my laptop. Can you try this benchmark on your work station and see if you can produce the results? Maybe we can get more insights from it. Some improvements we should do for the new implementation: - When null is enabled, we should use `ObjectAggregationResultHolder` to handle the case of all values are `null` - Fix the `foldHolder` implementation to give correct result -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For additional commands, e-mail: commits-h...@pinot.apache.org