As requested, here are the results with modifications to the annotations on Reference.reachabilityFence. Much more promising ...
* Benchmark 1 * Test Code : package org.sample; import org.openjdk.jmh.annotations.Benchmark; import org.openjdk.jmh.annotations.Level; import org.openjdk.jmh.annotations.Scope; import org.openjdk.jmh.annotations.Setup; import org.openjdk.jmh.annotations.State; import java.nio.ByteBuffer; public class ByteBufferBenchmark { @State(Scope.Benchmark) public static class ByteBufferContainer { ByteBuffer bb; @Setup(Level.Invocation) public void initByteBuffer() { bb = ByteBuffer.allocateDirect(1); } ByteBuffer getByteBuffer() { return bb; } } @Benchmark public void benchmark_byte_buffer_put(ByteBufferContainer bbC) { bbC.getByteBuffer().put((byte)42); } } Results : - Unmodified Build - Benchmark Mode Cnt Score Error Units ByteBufferBenchmark.benchmark_byte_buffer_put thrpt 200 35604933.518 ± 654975.515 ops/s - Build With Reference.reachabilityFences Added - Benchmark Mode Cnt Score Error Units Impact ByteBufferBenchmark.benchmark_byte_buffer_put thrpt 200 33100911.857 ± 747461.951 ops/s -7.033% - Build With Reference.reachabilityFences Added And DontInline Replaced With ForceInline - Benchmark Mode Cnt Score Error Units Impact ByteBufferBenchmark.benchmark_byte_buffer_put thrpt 200 34836320.294 ± 640188.408 ops/s -2.159% - Build With Reference.reachabilityFences Added And DontInline Removed - Benchmark Mode Cnt Score Error Units Impact ByteBufferBenchmark.benchmark_byte_buffer_put thrpt 200 34740015.332 ± 556578.542 ops/s -2.429% * Benchmark 2 * Test Code : package org.sample; import org.openjdk.jmh.annotations.Benchmark; import org.openjdk.jmh.annotations.Level; import org.openjdk.jmh.annotations.Param; import org.openjdk.jmh.annotations.Scope; import org.openjdk.jmh.annotations.Setup; import org.openjdk.jmh.annotations.State; import java.nio.ByteBuffer; @State(Scope.Benchmark) public class ByteBufferBenchmark { @Param({"1", "10", "100", "1000", "10000"}) public int L; @State(Scope.Benchmark) public static class ByteBufferContainer { ByteBuffer bb; 
@Setup(Level.Invocation) public void initByteBuffer() { bb = ByteBuffer.allocateDirect(10000); } ByteBuffer getByteBuffer() { return bb; } } @Benchmark public ByteBuffer benchmark_byte_buffer_put(ByteBufferContainer bbC) { ByteBuffer bb = bbC.getByteBuffer(); for (int i = 0; i < L; i++) { bb.put((byte)i); } return bb; } } Results : - Unmodified Build - Benchmark (L) Mode Cnt Score Error Units ByteBufferBenchmark.benchmark_byte_buffer_put 1 thrpt 200 29303145.752 ± 635979.750 ops/s ByteBufferBenchmark.benchmark_byte_buffer_put 10 thrpt 200 24260859.017 ± 528891.303 ops/s ByteBufferBenchmark.benchmark_byte_buffer_put 100 thrpt 200 8512366.637 ± 136615.070 ops/s ByteBufferBenchmark.benchmark_byte_buffer_put 1000 thrpt 200 1323756.037 ± 21485.369 ops/s ByteBufferBenchmark.benchmark_byte_buffer_put 10000 thrpt 200 145965.305 ± 1301.469 ops/s - Build With Reference.reachabilityFences Added - Benchmark (L) Mode Cnt Score Error Units Impact ByteBufferBenchmark.benchmark_byte_buffer_put 1 thrpt 200 28893540.122 ± 754554.747 ops/s -1.398% ByteBufferBenchmark.benchmark_byte_buffer_put 10 thrpt 200 15317696.355 ± 231621.608 ops/s -36.863% ByteBufferBenchmark.benchmark_byte_buffer_put 100 thrpt 200 2546599.578 ± 32136.873 ops/s -70.084% ByteBufferBenchmark.benchmark_byte_buffer_put 1000 thrpt 200 288832.514 ± 3854.522 ops/s -78.181% ByteBufferBenchmark.benchmark_byte_buffer_put 10000 thrpt 200 29747.386 ± 214.831 ops/s -79.620% - Build With Reference.reachabilityFences Added And DontInline Replaced With ForceInline - Benchmark (L) Mode Cnt Score Error Units Impact ByteBufferBenchmark.benchmark_byte_buffer_put 1 thrpt 200 29372326.859 ± 525988.179 ops/s +0.236% ByteBufferBenchmark.benchmark_byte_buffer_put 10 thrpt 200 24326735.480 ± 484358.862 ops/s +0.272% ByteBufferBenchmark.benchmark_byte_buffer_put 100 thrpt 200 8492692.912 ± 120924.878 ops/s -0.231% ByteBufferBenchmark.benchmark_byte_buffer_put 1000 thrpt 200 1332131.417 ± 14981.587 ops/s +0.633% 
ByteBufferBenchmark.benchmark_byte_buffer_put 10000 thrpt 200 144990.569 ± 1518.877 ops/s -0.668% - Build With Reference.reachabilityFences Added And DontInline Removed - Benchmark (L) Mode Cnt Score Error Units Impact ByteBufferBenchmark.benchmark_byte_buffer_put 1 thrpt 200 29842696.017 ± 462902.634 ops/s +1.841% ByteBufferBenchmark.benchmark_byte_buffer_put 10 thrpt 200 24842729.069 ± 436174.452 ops/s +2.398% ByteBufferBenchmark.benchmark_byte_buffer_put 100 thrpt 200 8518393.953 ± 129254.536 ops/s +0.071% ByteBufferBenchmark.benchmark_byte_buffer_put 1000 thrpt 200 1344772.370 ± 15916.867 ops/s +1.588% ByteBufferBenchmark.benchmark_byte_buffer_put 10000 thrpt 200 145087.256 ± 1277.491 ops/s -0.602% * Benchmark 3 * Test Code : package org.sample; import org.openjdk.jmh.annotations.Benchmark; import org.openjdk.jmh.annotations.Level; import org.openjdk.jmh.annotations.Param; import org.openjdk.jmh.annotations.Scope; import org.openjdk.jmh.annotations.Setup; import org.openjdk.jmh.annotations.State; import java.nio.ByteBuffer; @State(Scope.Benchmark) public class ByteBufferBenchmark { @Param({"1", "10", "100", "1000", "10000"}) public int L; @State(Scope.Benchmark) public static class ByteBufferContainer { ByteBuffer bb; @Setup(Level.Invocation) public void initByteBuffer() { bb = ByteBuffer.allocateDirect(4 * 10000); for (int i = 0; i < 10000; i++) { bb.putInt(i); } } ByteBuffer getByteBuffer() { return bb; } } @Benchmark public int benchmark_byte_buffer_put(ByteBufferContainer bbC) { ByteBuffer bb = bbC.getByteBuffer(); bb.position(0); int sum = 0; for (int i = 0; i < L; i++) { sum += bb.getInt(); } return sum; } } Results : - Unmodified Build - Benchmark (L) Mode Cnt Score Error Units ByteBufferBenchmark.benchmark_byte_buffer_put 1 thrpt 200 29677205.748 ± 544721.142 ops/s ByteBufferBenchmark.benchmark_byte_buffer_put 10 thrpt 200 18219951.454 ± 320724.793 ops/s ByteBufferBenchmark.benchmark_byte_buffer_put 100 thrpt 200 7767650.826 ± 121798.910 ops/s 
ByteBufferBenchmark.benchmark_byte_buffer_put 1000 thrpt 200 1646075.010 ± 9804.499 ops/s ByteBufferBenchmark.benchmark_byte_buffer_put 10000 thrpt 200 183489.418 ± 1355.967 ops/s - Build With Reference.reachabilityFences Added - Benchmark (L) Mode Cnt Score Error Units Impact ByteBufferBenchmark.benchmark_byte_buffer_put 1 thrpt 200 15230086.695 ± 390174.190 ops/s -48.681% ByteBufferBenchmark.benchmark_byte_buffer_put 10 thrpt 200 8126310.728 ± 123661.342 ops/s -55.399% ByteBufferBenchmark.benchmark_byte_buffer_put 100 thrpt 200 1582699.233 ± 7278.744 ops/s -79.624% ByteBufferBenchmark.benchmark_byte_buffer_put 1000 thrpt 200 179726.465 ± 802.333 ops/s -89.082% ByteBufferBenchmark.benchmark_byte_buffer_put 10000 thrpt 200 18327.049 ± 9.506 ops/s -90.012% - Build With Reference.reachabilityFences Added And DontInline Replaced With ForceInline - Benchmark (L) Mode Cnt Score Error Units Impact ByteBufferBenchmark.benchmark_byte_buffer_put 1 thrpt 200 29839190.147 ± 576585.796 ops/s +0.546% ByteBufferBenchmark.benchmark_byte_buffer_put 10 thrpt 200 18397768.759 ± 338144.327 ops/s +0.976% ByteBufferBenchmark.benchmark_byte_buffer_put 100 thrpt 200 7746079.875 ± 101621.105 ops/s -0.278% ByteBufferBenchmark.benchmark_byte_buffer_put 1000 thrpt 200 1629413.444 ± 24163.399 ops/s -1.012% ByteBufferBenchmark.benchmark_byte_buffer_put 10000 thrpt 200 182250.811 ± 2028.461 ops/s -0.675% - Build With Reference.reachabilityFences Added And DontInline Removed - Benchmark (L) Mode Cnt Score Error Units Impact ByteBufferBenchmark.benchmark_byte_buffer_put 1 thrpt 200 29442980.464 ± 556324.877 ops/s -0.789% ByteBufferBenchmark.benchmark_byte_buffer_put 10 thrpt 200 18401757.539 ± 419383.901 ops/s +0.998% ByteBufferBenchmark.benchmark_byte_buffer_put 100 thrpt 200 7816766.062 ± 100144.611 ops/s +0.632% ByteBufferBenchmark.benchmark_byte_buffer_put 1000 thrpt 200 1636811.564 ± 13811.447 ops/s -0.563% ByteBufferBenchmark.benchmark_byte_buffer_put 10000 thrpt 200 183463.292 ± 2056.016 
ops/s -0.014% Regards, Ben From: Paul Sandoz <paul.san...@oracle.com> To: Ben Walsh <ben_wa...@uk.ibm.com> Cc: core-libs-dev <core-libs-dev@openjdk.java.net> Date: 08/02/2018 16:54 Subject: Re: [PATCH] Reduce Chance Of Mistakenly Early Backing Memory Cleanup Hi Ben, Thanks. I anticipated a performance hit but not necessarily a 10x. Without looking at the generated code of the benchmark method it is hard to be sure [*], but I believe the fence is interfering with loop unrolling and/or vectorization; the comparative differences between byte and int may be related to vectorization (for byte there may be less or limited support for vectorization). How about we now try another experiment: comment out the @DontInline on the fence method and re-run the benchmarks? From Peter’s observations and Vladimir’s analysis we should be able to remove that, or even, contrary to what we initially expected when adding this feature, change to @ForceInline! Thanks, Paul. [*] If you are running on Linux you can use the excellent JMH perfasm feature to dump the hot parts of HotSpot's generated code. > On Feb 8, 2018, at 8:22 AM, Ben Walsh <ben_wa...@uk.ibm.com> wrote: > > Hi Paul, > > Following up with the requested loop and vectorization benchmarks ... > > > (Do the vectorization benchmark results imply that the Hotspot compiler > has been unable to perform the vectorization optimisation due to the > presence of the reachabilityFence ?) 
> > > ----------------------------------------------------------------------------------------------------------------------- > > > Loop Benchmarking > ---- ------------ > > package org.sample; > > import org.openjdk.jmh.annotations.Benchmark; > import org.openjdk.jmh.annotations.Level; > import org.openjdk.jmh.annotations.Param; > import org.openjdk.jmh.annotations.Scope; > import org.openjdk.jmh.annotations.Setup; > import org.openjdk.jmh.annotations.State; > > import java.nio.ByteBuffer; > > @State(Scope.Benchmark) > public class ByteBufferBenchmark { > > @Param({"1", "10", "100", "1000", "10000"}) > public int L; > > @State(Scope.Benchmark) > public static class ByteBufferContainer { > > ByteBuffer bb; > > @Setup(Level.Invocation) > public void initByteBuffer() { > bb = ByteBuffer.allocateDirect(10000); > } > > ByteBuffer getByteBuffer() { > return bb; > } > } > > @Benchmark > public ByteBuffer benchmark_byte_buffer_put(ByteBufferContainer bbC) { > > ByteBuffer bb = bbC.getByteBuffer(); > > for (int i = 0; i < L; i++) { > bb.put((byte)i); > } > > return bb; > } > > } > > > Without Changes > > Benchmark (L) Mode Cnt Score > Error Units > ByteBufferBenchmark.benchmark_byte_buffer_put 1 thrpt 200 > 29303145.752 ± 635979.750 ops/s > ByteBufferBenchmark.benchmark_byte_buffer_put 10 thrpt 200 > 24260859.017 ± 528891.303 ops/s > ByteBufferBenchmark.benchmark_byte_buffer_put 100 thrpt 200 > 8512366.637 ± 136615.070 ops/s > ByteBufferBenchmark.benchmark_byte_buffer_put 1000 thrpt 200 > 1323756.037 ± 21485.369 ops/s > ByteBufferBenchmark.benchmark_byte_buffer_put 10000 thrpt 200 > 145965.305 ± 1301.469 ops/s > > > With Changes > > Benchmark (L) Mode Cnt Score > Error Units Impact > ByteBufferBenchmark.benchmark_byte_buffer_put 1 thrpt 200 > 28893540.122 ± 754554.747 ops/s -1.398% > ByteBufferBenchmark.benchmark_byte_buffer_put 10 thrpt 200 > 15317696.355 ± 231621.608 ops/s -36.863% > ByteBufferBenchmark.benchmark_byte_buffer_put 100 thrpt 200 > 2546599.578 ± 32136.873 
ops/s -70.084% > ByteBufferBenchmark.benchmark_byte_buffer_put 1000 thrpt 200 > 288832.514 ± 3854.522 ops/s -78.181% > ByteBufferBenchmark.benchmark_byte_buffer_put 10000 thrpt 200 29747.386 > ± 214.831 ops/s -79.620% > > > ----------------------------------------------------------------------------------------------------------------------- > > > Vectorization Benchmarking > ------------- ------------ > > package org.sample; > > import org.openjdk.jmh.annotations.Benchmark; > import org.openjdk.jmh.annotations.Level; > import org.openjdk.jmh.annotations.Param; > import org.openjdk.jmh.annotations.Scope; > import org.openjdk.jmh.annotations.Setup; > import org.openjdk.jmh.annotations.State; > > import java.nio.ByteBuffer; > > @State(Scope.Benchmark) > public class ByteBufferBenchmark { > > @Param({"1", "10", "100", "1000", "10000"}) > public int L; > > @State(Scope.Benchmark) > public static class ByteBufferContainer { > > ByteBuffer bb; > > @Setup(Level.Invocation) > public void initByteBuffer() { > bb = ByteBuffer.allocateDirect(4 * 10000); > > for (int i = 0; i < 10000; i++) { > bb.putInt(i); > } > } > > ByteBuffer getByteBuffer() { > return bb; > } > > } > > @Benchmark > public int benchmark_byte_buffer_put(ByteBufferContainer bbC) { > > ByteBuffer bb = bbC.getByteBuffer(); > > bb.position(0); > > int sum = 0; > > for (int i = 0; i < L; i++) { > sum += bb.getInt(); > } > > return sum; > > } > > } > > > Without Changes > > Benchmark (L) Mode Cnt Score > Error Units > ByteBufferBenchmark.benchmark_byte_buffer_put 1 thrpt 200 > 29677205.748 ± 544721.142 ops/s > ByteBufferBenchmark.benchmark_byte_buffer_put 10 thrpt 200 > 18219951.454 ± 320724.793 ops/s > ByteBufferBenchmark.benchmark_byte_buffer_put 100 thrpt 200 > 7767650.826 ± 121798.910 ops/s > ByteBufferBenchmark.benchmark_byte_buffer_put 1000 thrpt 200 > 1646075.010 ± 9804.499 ops/s > ByteBufferBenchmark.benchmark_byte_buffer_put 10000 thrpt 200 > 183489.418 ± 1355.967 ops/s > > > With Changes > > Benchmark 
(L) Mode Cnt Score > Error Units Impact > ByteBufferBenchmark.benchmark_byte_buffer_put 1 thrpt 200 > 15230086.695 ± 390174.190 ops/s -48.681% > ByteBufferBenchmark.benchmark_byte_buffer_put 10 thrpt 200 > 8126310.728 ± 123661.342 ops/s -55.399% > ByteBufferBenchmark.benchmark_byte_buffer_put 100 thrpt 200 > 1582699.233 ± 7278.744 ops/s -79.624% > ByteBufferBenchmark.benchmark_byte_buffer_put 1000 thrpt 200 > 179726.465 ± 802.333 ops/s -89.082% > ByteBufferBenchmark.benchmark_byte_buffer_put 10000 thrpt 200 18327.049 > ± 9.506 ops/s -90.012% > > > > NB : For reference - for this and previous benchmarking results ... > > "Without Changes" and "With Changes" - java -version ... > > openjdk version "10-internal" 2018-03-20 > OpenJDK Runtime Environment (build 10-internal+0-adhoc.walshbp.jdk) > OpenJDK 64-Bit Server VM (build 10-internal+0-adhoc.walshbp.jdk, mixed > mode) > > > ----------------------------------------------------------------------------------------------------------------------- > > > Regards, > Ben Walsh > Unless stated otherwise above: IBM United Kingdom Limited - Registered in England and Wales with number 741598. Registered office: PO Box 41, North Harbour, Portsmouth, Hampshire PO6 3AU