This is an automated email from the ASF dual-hosted git repository. aherbert pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/commons-rng.git
commit b947c91fb8e96d85c8dbf1d9a696173146ba27b0 Author: Alex Herbert <[email protected]> AuthorDate: Thu Feb 26 14:59:10 2026 +0000 RNG-188: Add Philox generation performance Benchmark use of a method handle to call java.lang.Math 128-bit multiplication functions for 64-bit arguments. Add variants of the 32-bit generator to test performance. Add variants of the 64-bit generator to call Math 128-bit multiplication functions. --- .../commons/rng/core/source64/LXMSupport.java | 3 +- commons-rng-examples/examples-jmh/pom.xml | 3 +- .../rng/examples/jmh/core/LXMBenchmark.java | 93 ++- .../jmh/core/PhiloxGenerationPerformance.java | 886 +++++++++++++++++++++ .../rng/examples/jmh/core/LXMBenchmarkTest.java | 13 +- 5 files changed, 989 insertions(+), 9 deletions(-) diff --git a/commons-rng-core/src/main/java/org/apache/commons/rng/core/source64/LXMSupport.java b/commons-rng-core/src/main/java/org/apache/commons/rng/core/source64/LXMSupport.java index fa23f247..c64e6458 100644 --- a/commons-rng-core/src/main/java/org/apache/commons/rng/core/source64/LXMSupport.java +++ b/commons-rng-core/src/main/java/org/apache/commons/rng/core/source64/LXMSupport.java @@ -110,8 +110,9 @@ final class LXMSupport { * }</pre> * * <p>Note: The method {@code Math.multiplyHigh} was added in JDK 9 - * and should be used as above when the source code targets Java 11 + * and could be used as above when the source code targets Java 11 * to exploit the intrinsic method. + * Benchmarks are available in the Commons RNG JMH module. * * <p>Note: The method {@code Math.unsignedMultiplyHigh} was added in JDK 18 * and should be used when the source code target allows. diff --git a/commons-rng-examples/examples-jmh/pom.xml b/commons-rng-examples/examples-jmh/pom.xml index bf377d43..871367a2 100644 --- a/commons-rng-examples/examples-jmh/pom.xml +++ b/commons-rng-examples/examples-jmh/pom.xml @@ -80,6 +80,8 @@ <pmd.skip>true</pmd.skip> <cpd.skip>true</cpd.skip> <spotbugs.skip>true</spotbugs.skip> + <!-- Disable to allow use of MethodHandle to higher JDK version methods. --> + <animal.sniffer.skip>true</animal.sniffer.skip> <!-- Skip tests. This avoids the lengthy test of the ziggurat samplers. Override using: mvn test -DskipTests=false @@ -94,7 +96,6 @@ <maven.compiler.source>11</maven.compiler.source> <maven.compiler.target>11</maven.compiler.target> <commons.compiler.release>11</commons.compiler.release> - <animal.sniffer.skip>true</animal.sniffer.skip> --> </properties> diff --git a/commons-rng-examples/examples-jmh/src/main/java/org/apache/commons/rng/examples/jmh/core/LXMBenchmark.java b/commons-rng-examples/examples-jmh/src/main/java/org/apache/commons/rng/examples/jmh/core/LXMBenchmark.java index 0935c7e3..14637e6e 100644 --- a/commons-rng-examples/examples-jmh/src/main/java/org/apache/commons/rng/examples/jmh/core/LXMBenchmark.java +++ b/commons-rng-examples/examples-jmh/src/main/java/org/apache/commons/rng/examples/jmh/core/LXMBenchmark.java @@ -17,8 +17,12 @@ package org.apache.commons.rng.examples.jmh.core; +import java.lang.invoke.MethodHandle; +import java.lang.invoke.MethodHandles; +import java.lang.invoke.MethodType; import java.util.concurrent.ThreadLocalRandom; import java.util.concurrent.TimeUnit; +import java.util.function.LongBinaryOperator; import java.util.function.LongSupplier; import java.util.stream.LongStream; import org.apache.commons.rng.JumpableUniformRandomProvider; @@ -77,12 +81,61 @@ public class LXMBenchmark { */ @State(Scope.Benchmark) public static class UnsignedMultiplyHighSource { + /** Method name for Math.multiplyHigh. */ + private static final String MULTIPLY_HIGH = "multiplyHigh"; + /** Method name for Math.unsignedMultiplyHigh. */ + private static final String UNSIGNED_MULTIPLY_HIGH = "unsignedMultiplyHigh"; /** A mask to convert an {@code int} to an unsigned integer stored as a {@code long}. */ private static final long INT_TO_UNSIGNED_BYTE_MASK = 0xffff_ffffL; /** Precomputed upper split (32-bits) of the low half of the 128-bit multiplier constant. */ private static final long X = ML >>> 32; /** Precomputed lower split (32-bits) of the low half of the 128-bit multiplier constant. */ private static final long Y = ML & INT_TO_UNSIGNED_BYTE_MASK; + /** Method handle to unsigned multiply using + * java.lang.Math.multiplyHigh if Java 9, otherwise + * a default implementation. */ + private static final LongBinaryOperator MATH_MULTIPLY_HIGH; + /** Method handle to unsigned multiply using + * java.lang.Math.unsignedMultiplyHigh if Java 18, otherwise + * a default implementation. */ + private static final LongBinaryOperator MATH_UNSIGNED_MULTIPLY_HIGH; + + static { + final MethodHandles.Lookup lookup = MethodHandles.lookup(); + LongBinaryOperator op1; + // Allow catch of Throwable + // CHECKSTYLE: stop IllegalCatch + try { + final MethodHandle h1 = lookup.findStatic(Math.class, MULTIPLY_HIGH, + MethodType.methodType(long.class, long.class, long.class)); + op1 = (a, b) -> { + try { + return (long) h1.invokeExact(a, b) + ((a >> 63) & b) + ((b >> 63) & a); + } catch (Throwable e) { + throw new RuntimeException(e); + } + }; + } catch (NoSuchMethodException | IllegalAccessException ignored) { + op1 = UnsignedMultiplyHighSource::unsignedMultiplyHigh; + } + MATH_MULTIPLY_HIGH = op1; + LongBinaryOperator op2; + try { + final MethodHandle h2 = lookup.findStatic(Math.class, UNSIGNED_MULTIPLY_HIGH, + MethodType.methodType(long.class, long.class, long.class)); + op2 = (a, b) -> { + try { + return (long) h2.invokeExact(a, b); + } catch (Throwable e) { + throw new RuntimeException(e); + } + }; + } catch (NoSuchMethodException | IllegalAccessException ignored) { + op2 = UnsignedMultiplyHighSource::unsignedMultiplyHigh; + } + // CHECKSTYLE: resume IllegalCatch + MATH_UNSIGNED_MULTIPLY_HIGH = op2; + } /** * The method to compute the value. @@ -92,9 +145,10 @@ public class LXMBenchmark { //"mathMultiplyHigh", "mathMultiplyHighWithML", // Require JDK 18+ //"mathUnsignedMultiplyHigh", "mathUnsignedMultiplyHighWithML", - "unsignedMultiplyHigh", "unsignedMultiplyHighWithML", + UNSIGNED_MULTIPLY_HIGH, "unsignedMultiplyHighWithML", "unsignedMultiplyHighML", "unsignedMultiplyHighPlusMultiplyLow", "unsignedMultiplyHighAndLow", + "mhMultiplyHigh", "mhUnsignedMultiplyHigh", }) private String method; @@ -155,7 +209,7 @@ public class LXMBenchmark { if (BASELINE1.equals(method)) { gen = () -> ga.getAsLong(); } else if (BASELINE2.equals(method)) { - gen = () -> ga.getAsLong() ^ gb.getAsLong(); + gen = () -> ga.getAsLong() * gb.getAsLong(); } else if ("mathMultiplyHigh".equals(method)) { gen = () -> mathMultiplyHigh(ga.getAsLong(), gb.getAsLong()); } else if ("mathMultiplyHighWithML".equals(method)) { @@ -164,7 +218,7 @@ public class LXMBenchmark { gen = () -> mathUnsignedMultiplyHigh(ga.getAsLong(), gb.getAsLong()); } else if ("mathUnsignedMultiplyHighWithML".equals(method)) { gen = () -> mathUnsignedMultiplyHigh(ga.getAsLong(), ML); - } else if ("unsignedMultiplyHigh".equals(method)) { + } else if (UNSIGNED_MULTIPLY_HIGH.equals(method)) { gen = () -> unsignedMultiplyHigh(ga.getAsLong(), gb.getAsLong()); } else if ("unsignedMultiplyHighWithML".equals(method)) { gen = () -> unsignedMultiplyHigh(ga.getAsLong(), ML); @@ -192,6 +246,11 @@ public class LXMBenchmark { final long hi = unsignedMultiplyHigh(a, b, lo); return hi ^ lo[0]; }; + // Method handles + } else if ("mhMultiplyHigh".equals(method)) { + gen = () -> mhMultiplyHigh(ga.getAsLong(), gb.getAsLong()); + } else if ("mhUnsignedMultiplyHigh".equals(method)) { + gen = () -> mhUnsignedMultiplyHigh(ga.getAsLong(), gb.getAsLong()); } else { throw new IllegalStateException("Unknown method: " + method); } @@ -365,6 +424,30 @@ public class LXMBenchmark { return (bx >>> 32) + (carry >>> 32) + ax; } + + /** + * Compute the unsigned multiply of two values using a method handle to + * Math.multiplyHigh. If not using JDK 9 this reverts to a default implementation. + * + * @param a First value + * @param b Second value + * @return the upper 64-bits of the 128-bit result + */ + static long mhMultiplyHigh(long a, long b) { + return MATH_MULTIPLY_HIGH.applyAsLong(a, b); + } + + /** + * Compute the unsigned multiply of two values using a method handle to + * Math.unsignedMultiplyHigh. If not using JDK 18 this reverts to a default implementation. + * + * @param a First value + * @param b Second value + * @return the upper 64-bits of the 128-bit result + */ + static long mhUnsignedMultiplyHigh(long a, long b) { + return MATH_UNSIGNED_MULTIPLY_HIGH.applyAsLong(a, b); + } } /** @@ -451,9 +534,9 @@ public class LXMBenchmark { } if (BASELINE2.equals(method)) { - gen = () -> ga.getAsLong() ^ gb.getAsLong(); + gen = () -> ga.getAsLong() * gb.getAsLong(); } else if (BASELINE4.equals(method)) { - gen = () -> ga.getAsLong() ^ gb.getAsLong() ^ gc.getAsLong() ^ gd.getAsLong(); + gen = () -> ga.getAsLong() * gb.getAsLong() + gc.getAsLong() * gd.getAsLong(); } else if ("unsignedMultiplyHighPlusProducts".equals(method)) { gen = () -> { final long a = ga.getAsLong(); diff --git a/commons-rng-examples/examples-jmh/src/main/java/org/apache/commons/rng/examples/jmh/core/PhiloxGenerationPerformance.java b/commons-rng-examples/examples-jmh/src/main/java/org/apache/commons/rng/examples/jmh/core/PhiloxGenerationPerformance.java new file mode 100644 index 00000000..cd21a39b --- /dev/null +++ b/commons-rng-examples/examples-jmh/src/main/java/org/apache/commons/rng/examples/jmh/core/PhiloxGenerationPerformance.java @@ -0,0 +1,886 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.commons.rng.examples.jmh.core; + +import java.util.Arrays; +import java.util.concurrent.ThreadLocalRandom; +import org.apache.commons.rng.UniformRandomProvider; +import org.apache.commons.rng.core.util.NumberFactory; +import org.apache.commons.rng.examples.jmh.core.LXMBenchmark.UnsignedMultiplyHighSource; +import org.apache.commons.rng.simple.RandomSource; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; + +/** + * Executes benchmark to compare the speed of generation of random numbers from the + * various Philox providers. + * + * <p>Performance Notes + * + * <p>Some improvements to performance are observed on JDK 8 for the inline and zero + * variations of the 32-bit generator. On higher JDKs the performance of the original + * implementation is consistently the best and the modified variants are either the + * same speed but can be significantly slower. This typically occurs for the inline + * variant which may not be efficiently targeted by the JVM optimisations. + * The variations are left for future development. + * + * <p>The 64-bit generator can be made significantly faster if it uses the 64-bit + * multiplication methods available in the {@link Math} class (see RNG-188). + */ +public class PhiloxGenerationPerformance extends AbstractBenchmark { + /** The int value. Must NOT be final to prevent JVM optimisation! */ + private int intValue; + /** The long value. Must NOT be final to prevent JVM optimisation! */ + private long longValue; + + /** + * The benchmark state (retrieve the various "RandomSource"s). + */ + @State(Scope.Benchmark) + public static class Sources { + /** + * RNG providers. + */ + @Param({"PHILOX_4X32_ORIGINAL", + "PHILOX_4X32_CONST", + "PHILOX_4X32_ZERO", + "PHILOX_4X32_INLINE", + "PHILOX_4X32", + "PHILOX_4X64_ORIGINAL", + "PHILOX_4X64_MH", + "PHILOX_4X64_UMH", + "PHILOX_4X64"}) + private String randomSourceName; + + /** RNG. */ + private UniformRandomProvider provider; + + /** + * Gets the generator. + * + * @return the RNG. + */ + public UniformRandomProvider getGenerator() { + return provider; + } + + /** Instantiates generator. This need only be done once per set of iterations. */ + @Setup(Level.Trial) + public void setup() { + if ("PHILOX_4X32_ORIGINAL".equals(randomSourceName)) { + provider = new Philox4x32Original(intSeed()); + } else if ("PHILOX_4X32_CONST".equals(randomSourceName)) { + provider = new Philox4x32Const(intSeed()); + } else if ("PHILOX_4X32_ZERO".equals(randomSourceName)) { + provider = new Philox4x32Zero(intSeed()); + } else if ("PHILOX_4X32_INLINE".equals(randomSourceName)) { + provider = new Philox4x32Inline(intSeed()); + } else if ("PHILOX_4X64_ORIGINAL".equals(randomSourceName)) { + provider = new Philox4x64Original(longSeed()); + } else if ("PHILOX_4X64_MH".equals(randomSourceName)) { + provider = new Philox4x64MH(longSeed()); + } else if ("PHILOX_4X64_UMH".equals(randomSourceName)) { + provider = new Philox4x64UMH(longSeed()); + } else { + final RandomSource randomSource = RandomSource.valueOf(randomSourceName); + provider = randomSource.create(); + } + } + + /** + * @return the int[] seed + */ + private static int[] intSeed() { + return ThreadLocalRandom.current().ints(6).toArray(); + } + + /** + * @return the long[] seed + */ + private static long[] longSeed() { + return ThreadLocalRandom.current().longs(6).toArray(); + } + } + + /** + * Class adapted from the original implementation of Philox4x32. + */ + static class Philox4x32Original implements UniformRandomProvider { + /** Philox 32-bit mixing constant for counter 0. */ + protected static final int K_PHILOX_10_A = 0x9E3779B9; + /** Philox 32-bit mixing constant for counter 1. */ + protected static final int K_PHILOX_10_B = 0xBB67AE85; + /** Philox 32-bit constant for key 0. */ + protected static final int K_PHILOX_SA = 0xD2511F53; + /** Philox 32-bit constant for key 1. */ + protected static final int K_PHILOX_SB = 0xCD9E8D57; + /** Internal buffer size. */ + protected static final int PHILOX_BUFFER_SIZE = 4; + + /** Counter 0. */ + protected int counter0; + /** Counter 1. */ + protected int counter1; + /** Counter 2. */ + protected int counter2; + /** Counter 3. */ + protected int counter3; + /** Output buffer. */ + protected final int[] buffer = new int[PHILOX_BUFFER_SIZE]; + /** Key low bits. */ + protected int key0; + /** Key high bits. */ + protected int key1; + /** Output buffer index. When at the end of the buffer the counter is + * incremented and the buffer regenerated. */ + protected int bufferPosition; + + /** + * Creates a new instance based on an array of int containing, key (first two ints) and + * the counter (next 4 ints, low bits = first int). The counter is not scrambled and may + * be used to create contiguous blocks with size a multiple of 4 ints. For example, + * setting seed[2] = 1 is equivalent to start with seed[2]=0 and calling {@link #next()} 4 times. + * + * @param seed Array of size 6 defining key0,key1,counter0,counter1,counter2,counter3. + * If the size is smaller, zero values are assumed. + */ + Philox4x32Original(int[] seed) { + final int[] input = seed.length < 6 ? Arrays.copyOf(seed, 6) : seed; + key0 = input[0]; + key1 = input[1]; + counter0 = input[2]; + counter1 = input[3]; + counter2 = input[4]; + counter3 = input[5]; + bufferPosition = PHILOX_BUFFER_SIZE; + } + + /** {@inheritDoc} */ + @Override + public int nextInt() { + final int p = bufferPosition; + if (p < PHILOX_BUFFER_SIZE) { + bufferPosition = p + 1; + return buffer[p]; + } + incrementCounter(); + rand10(); + bufferPosition = 1; + return buffer[0]; + } + + /** + * Increment the counter by one. + */ + protected void incrementCounter() { + if (++counter0 != 0) { + return; + } + if (++counter1 != 0) { + return; + } + if (++counter2 != 0) { + return; + } + ++counter3; + } + + /** + * Perform 10 rounds, using counter0, counter1, counter2, counter3 as starting point. + * It updates the buffer member variable, but no others. + */ + protected void rand10() { + buffer[0] = counter0; + buffer[1] = counter1; + buffer[2] = counter2; + buffer[3] = counter3; + + int k0 = key0; + int k1 = key1; + + //unrolled loop for performance + singleRound(buffer, k0, k1); + k0 += K_PHILOX_10_A; + k1 += K_PHILOX_10_B; + singleRound(buffer, k0, k1); + k0 += K_PHILOX_10_A; + k1 += K_PHILOX_10_B; + singleRound(buffer, k0, k1); + k0 += K_PHILOX_10_A; + k1 += K_PHILOX_10_B; + singleRound(buffer, k0, k1); + k0 += K_PHILOX_10_A; + k1 += K_PHILOX_10_B; + singleRound(buffer, k0, k1); + k0 += K_PHILOX_10_A; + k1 += K_PHILOX_10_B; + singleRound(buffer, k0, k1); + k0 += K_PHILOX_10_A; + k1 += K_PHILOX_10_B; + singleRound(buffer, k0, k1); + k0 += K_PHILOX_10_A; + k1 += K_PHILOX_10_B; + singleRound(buffer, k0, k1); + k0 += K_PHILOX_10_A; + k1 += K_PHILOX_10_B; + singleRound(buffer, k0, k1); + k0 += K_PHILOX_10_A; + k1 += K_PHILOX_10_B; + singleRound(buffer, k0, k1); + } + + /** + * Performs a single round of philox. + * + * @param counter Counter, which will be updated after each call. + * @param key0 Key low bits. + * @param key1 Key high bits. + */ + protected static void singleRound(int[] counter, int key0, int key1) { + final long product0 = (K_PHILOX_SA & 0xffff_ffffL) * (counter[0] & 0xffff_ffffL); + final int hi0 = (int) (product0 >>> 32); + final long product1 = (K_PHILOX_SB & 0xffff_ffffL) * (counter[2] & 0xffff_ffffL); + final int hi1 = (int) (product1 >>> 32); + + counter[0] = hi1 ^ counter[1] ^ key0; + counter[1] = (int) product1; + counter[2] = hi0 ^ counter[3] ^ key1; + counter[3] = (int) product0; + } + + @Override + public long nextLong() { + return NumberFactory.makeLong(nextInt(), nextInt()); + } + } + + /** + * Updated to use pre-computed long constants. + */ + static final class Philox4x32Const extends Philox4x32Original { + /** Philox 32-bit constant for key 0 as an unsigned integer. */ + private static final long K_PHILOX_SAL = Integer.toUnsignedLong(0xD2511F53); + /** Philox 32-bit constant for key 1 as an unsigned integer. */ + private static final long K_PHILOX_SBL = Integer.toUnsignedLong(0xCD9E8D57); + + /** + * Creates a new instance. + * + * @param seed Seed. + */ + Philox4x32Const(int[] seed) { + super(seed); + } + + @Override + protected void rand10() { + buffer[0] = counter0; + buffer[1] = counter1; + buffer[2] = counter2; + buffer[3] = counter3; + + int k0 = key0; + int k1 = key1; + + //unrolled loop for performance + singleRoundL(buffer, k0, k1); + k0 += K_PHILOX_10_A; + k1 += K_PHILOX_10_B; + singleRoundL(buffer, k0, k1); + k0 += K_PHILOX_10_A; + k1 += K_PHILOX_10_B; + singleRoundL(buffer, k0, k1); + k0 += K_PHILOX_10_A; + k1 += K_PHILOX_10_B; + singleRoundL(buffer, k0, k1); + k0 += K_PHILOX_10_A; + k1 += K_PHILOX_10_B; + singleRoundL(buffer, k0, k1); + k0 += K_PHILOX_10_A; + k1 += K_PHILOX_10_B; + singleRoundL(buffer, k0, k1); + k0 += K_PHILOX_10_A; + k1 += K_PHILOX_10_B; + singleRoundL(buffer, k0, k1); + k0 += K_PHILOX_10_A; + k1 += K_PHILOX_10_B; + singleRoundL(buffer, k0, k1); + k0 += K_PHILOX_10_A; + k1 += K_PHILOX_10_B; + singleRoundL(buffer, k0, k1); + k0 += K_PHILOX_10_A; + k1 += K_PHILOX_10_B; + singleRoundL(buffer, k0, k1); + } + + /** + * Performs a single round of philox. + * + * @param counter Counter, which will be updated after each call. + * @param key0 Key low bits. + * @param key1 Key high bits. + */ + private static void singleRoundL(int[] counter, int key0, int key1) { + final long product0 = K_PHILOX_SAL * (counter[0] & 0xffff_ffffL); + final int hi0 = (int) (product0 >>> 32); + final long product1 = K_PHILOX_SBL * (counter[2] & 0xffff_ffffL); + final int hi1 = (int) (product1 >>> 32); + + counter[0] = hi1 ^ counter[1] ^ key0; + counter[1] = (int) product1; + counter[2] = hi0 ^ counter[3] ^ key1; + counter[3] = (int) product0; + } + } + + /** + * Updated to use count the buffer position down to zero. + * + * <p>Note: This implementation will output the incorrect order for each + * block of 4. This is for simplicity during testing. + */ + static final class Philox4x32Zero extends Philox4x32Original { + /** + * Creates a new instance. + * + * @param seed Seed. + */ + Philox4x32Zero(int[] seed) { + super(seed); + bufferPosition = 0; + } + + /** {@inheritDoc} */ + @Override + public int nextInt() { + int p = bufferPosition - 1; + if (p < 0) { + incrementCounter(); + rand10(); + p = 3; + } + bufferPosition = p; + return buffer[p]; + } + } + + /** + * Updated to inline the entire buffer update with pre-computed long constants. + */ + static final class Philox4x32Inline extends Philox4x32Original { + /** + * Creates a new instance. + * + * @param seed Seed. + */ + Philox4x32Inline(int[] seed) { + super(seed); + } + + /** + * Perform 10 rounds, using counter0, counter1, counter2, counter3 as starting point. + * It updates the buffer member variable, but no others. + */ + protected void rand10() { + final long sa = K_PHILOX_SA & 0xffff_ffffL; + final long sb = K_PHILOX_SB & 0xffff_ffffL; + + long c0 = counter0; + int c1 = counter1; + long c2 = counter2; + int c3 = counter3; + + long product0; + long product1; + int hi0; + int hi1; + + // Unrolled loop for performance + int k0 = key0; + int k1 = key1; + product0 = sa * (c0 & 0xffff_ffffL); + hi0 = (int) (product0 >>> 32); + product1 = sb * (c2 & 0xffff_ffffL); + hi1 = (int) (product1 >>> 32); + + c0 = hi1 ^ c1 ^ k0; + c1 = (int) product1; + c2 = hi0 ^ c3 ^ k1; + c3 = (int) product0; + + k0 += K_PHILOX_10_A; + k1 += K_PHILOX_10_B; + product0 = sa * (c0 & 0xffff_ffffL); + hi0 = (int) (product0 >>> 32); + product1 = sb * (c2 & 0xffff_ffffL); + hi1 = (int) (product1 >>> 32); + + c0 = hi1 ^ c1 ^ k0; + c1 = (int) product1; + c2 = hi0 ^ c3 ^ k1; + c3 = (int) product0; + + k0 += K_PHILOX_10_A; + k1 += K_PHILOX_10_B; + product0 = sa * (c0 & 0xffff_ffffL); + hi0 = (int) (product0 >>> 32); + product1 = sb * (c2 & 0xffff_ffffL); + hi1 = (int) (product1 >>> 32); + + c0 = hi1 ^ c1 ^ k0; + c1 = (int) product1; + c2 = hi0 ^ c3 ^ k1; + c3 = (int) product0; + + k0 += K_PHILOX_10_A; + k1 += K_PHILOX_10_B; + product0 = sa * (c0 & 0xffff_ffffL); + hi0 = (int) (product0 >>> 32); + product1 = sb * (c2 & 0xffff_ffffL); + hi1 = (int) (product1 >>> 32); + + c0 = hi1 ^ c1 ^ k0; + c1 = (int) product1; + c2 = hi0 ^ c3 ^ k1; + c3 = (int) product0; + + k0 += K_PHILOX_10_A; + k1 += K_PHILOX_10_B; + product0 = sa * (c0 & 0xffff_ffffL); + hi0 = (int) (product0 >>> 32); + product1 = sb * (c2 & 0xffff_ffffL); + hi1 = (int) (product1 >>> 32); + + c0 = hi1 ^ c1 ^ k0; + c1 = (int) product1; + c2 = hi0 ^ c3 ^ k1; + c3 = (int) product0; + + k0 += K_PHILOX_10_A; + k1 += K_PHILOX_10_B; + product0 = sa * (c0 & 0xffff_ffffL); + hi0 = (int) (product0 >>> 32); + product1 = sb * (c2 & 0xffff_ffffL); + hi1 = (int) (product1 >>> 32); + + c0 = hi1 ^ c1 ^ k0; + c1 = (int) product1; + c2 = hi0 ^ c3 ^ k1; + c3 = (int) product0; + + k0 += K_PHILOX_10_A; + k1 += K_PHILOX_10_B; + product0 = sa * (c0 & 0xffff_ffffL); + hi0 = (int) (product0 >>> 32); + product1 = sb * (c2 & 0xffff_ffffL); + hi1 = (int) (product1 >>> 32); + + c0 = hi1 ^ c1 ^ k0; + c1 = (int) product1; + c2 = hi0 ^ c3 ^ k1; + c3 = (int) product0; + + k0 += K_PHILOX_10_A; + k1 += K_PHILOX_10_B; + product0 = sa * (c0 & 0xffff_ffffL); + hi0 = (int) (product0 >>> 32); + product1 = sb * (c2 & 0xffff_ffffL); + hi1 = (int) (product1 >>> 32); + + c0 = hi1 ^ c1 ^ k0; + c1 = (int) product1; + c2 = hi0 ^ c3 ^ k1; + c3 = (int) product0; + + k0 += K_PHILOX_10_A; + k1 += K_PHILOX_10_B; + product0 = sa * (c0 & 0xffff_ffffL); + hi0 = (int) (product0 >>> 32); + product1 = sb * (c2 & 0xffff_ffffL); + hi1 = (int) (product1 >>> 32); + + c0 = hi1 ^ c1 ^ k0; + c1 = (int) product1; + c2 = hi0 ^ c3 ^ k1; + c3 = (int) product0; + + k0 += K_PHILOX_10_A; + k1 += K_PHILOX_10_B; + product0 = sa * (c0 & 0xffff_ffffL); + hi0 = (int) (product0 >>> 32); + product1 = sb * (c2 & 0xffff_ffffL); + hi1 = (int) (product1 >>> 32); + + c0 = hi1 ^ c1 ^ k0; + c1 = (int) product1; + c2 = hi0 ^ c3 ^ k1; + c3 = (int) product0; + + buffer[0] = (int) c0; + buffer[1] = c1; + buffer[2] = (int) c2; + buffer[3] = c3; + } + } + + /** + * Class adapted from the original implementation of Philox4x64. + */ + static class Philox4x64Original implements UniformRandomProvider { + /** Philox 64-bit mixing constant for counter 0. */ + protected static final long PHILOX_M0 = 0xD2E7470EE14C6C93L; + /** Philox 64-bit mixing constant for counter 1. */ + protected static final long PHILOX_M1 = 0xCA5A826395121157L; + /** Philox 64-bit constant for key 0. */ + protected static final long PHILOX_W0 = 0x9E3779B97F4A7C15L; + /** Philox 64-bit constant for key 1. */ + protected static final long PHILOX_W1 = 0xBB67AE8584CAA73BL; + /** Internal buffer size. */ + protected static final int PHILOX_BUFFER_SIZE = 4; + + /** Counter 0. */ + protected long counter0; + /** Counter 1. */ + protected long counter1; + /** Counter 2. */ + protected long counter2; + /** Counter 3. */ + protected long counter3; + /** Output buffer. */ + protected final long[] buffer = new long[PHILOX_BUFFER_SIZE]; + /** Key low bits. */ + protected long key0; + /** Key high bits. */ + protected long key1; + /** Output buffer index. When at the end of the buffer the counter is + * incremented and the buffer regenerated. */ + protected int bufferPosition; + + /** + * Creates a new instance given 6 long numbers containing, key (first two longs) and + * the counter (next 4 longs, low bits = first long). The counter is not scrambled and may + * be used to create contiguous blocks with size a multiple of 4 longs. For example, + * setting seed[2] = 1 is equivalent to start with seed[2]=0 and calling {@link #next()} 4 times. + * + * @param seed Array of size 6 defining key0,key1,counter0,counter1,counter2,counter3. + * If the size is smaller, zero values are assumed. + */ + Philox4x64Original(long[] seed) { + final long[] input = seed.length < 6 ? Arrays.copyOf(seed, 6) : seed; + key0 = input[0]; + key1 = input[1]; + counter0 = input[2]; + counter1 = input[3]; + counter2 = input[4]; + counter3 = input[5]; + bufferPosition = PHILOX_BUFFER_SIZE; + } + + /** {@inheritDoc} */ + @Override + public long nextLong() { + final int p = bufferPosition; + if (p < PHILOX_BUFFER_SIZE) { + bufferPosition = p + 1; + return buffer[p]; + } + incrementCounter(); + rand10(); + bufferPosition = 1; + return buffer[0]; + } + + /** + * Increment the counter by one. + */ + protected void incrementCounter() { + if (++counter0 != 0) { + return; + } + if (++counter1 != 0) { + return; + } + if (++counter2 != 0) { + return; + } + ++counter3; + } + + /** + * Perform 10 rounds, using counter0, counter1, counter2, counter3 as starting point. + * It updates the buffer member variable, but no others. + */ + protected void rand10() { + buffer[0] = counter0; + buffer[1] = counter1; + buffer[2] = counter2; + buffer[3] = counter3; + + long k0 = key0; + long k1 = key1; + + //unrolled loop for performance + singleRound(buffer, k0, k1); + k0 += PHILOX_W0; + k1 += PHILOX_W1; + singleRound(buffer, k0, k1); + k0 += PHILOX_W0; + k1 += PHILOX_W1; + singleRound(buffer, k0, k1); + k0 += PHILOX_W0; + k1 += PHILOX_W1; + singleRound(buffer, k0, k1); + k0 += PHILOX_W0; + k1 += PHILOX_W1; + singleRound(buffer, k0, k1); + k0 += PHILOX_W0; + k1 += PHILOX_W1; + singleRound(buffer, k0, k1); + k0 += PHILOX_W0; + k1 += PHILOX_W1; + singleRound(buffer, k0, k1); + k0 += PHILOX_W0; + k1 += PHILOX_W1; + singleRound(buffer, k0, k1); + k0 += PHILOX_W0; + k1 += PHILOX_W1; + singleRound(buffer, k0, k1); + k0 += PHILOX_W0; + k1 += PHILOX_W1; + singleRound(buffer, k0, k1); + } + + + /** + * Performs a single round of philox. + * + * @param counter Counter, which will be updated after each call. + * @param key0 Key low bits. + * @param key1 Key high bits. + */ + private static void singleRound(long[] counter, long key0, long key1) { + final long lo0 = PHILOX_M0 * counter[0]; + final long hi0 = UnsignedMultiplyHighSource.unsignedMultiplyHigh(PHILOX_M0, counter[0]); + final long lo1 = PHILOX_M1 * counter[2]; + final long hi1 = UnsignedMultiplyHighSource.unsignedMultiplyHigh(PHILOX_M1, counter[2]); + + counter[0] = hi1 ^ counter[1] ^ key0; + counter[1] = lo1; + counter[2] = hi0 ^ counter[3] ^ key1; + counter[3] = lo0; + } + } + + /** + * Updated to use a MethodHandle to Math.multiplyHigh (UMH) (Java 9+). + * If not available then revert to a default implementation. + */ + static final class Philox4x64MH extends Philox4x64Original { + /** + * Creates a new instance. + * + * @param seed Seed. + */ + Philox4x64MH(long[] seed) { + super(seed); + } + + @Override + protected void rand10() { + buffer[0] = counter0; + buffer[1] = counter1; + buffer[2] = counter2; + buffer[3] = counter3; + + long k0 = key0; + long k1 = key1; + + //unrolled loop for performance + singleRound(buffer, k0, k1); + k0 += PHILOX_W0; + k1 += PHILOX_W1; + singleRound(buffer, k0, k1); + k0 += PHILOX_W0; + k1 += PHILOX_W1; + singleRound(buffer, k0, k1); + k0 += PHILOX_W0; + k1 += PHILOX_W1; + singleRound(buffer, k0, k1); + k0 += PHILOX_W0; + k1 += PHILOX_W1; + singleRound(buffer, k0, k1); + k0 += PHILOX_W0; + k1 += PHILOX_W1; + singleRound(buffer, k0, k1); + k0 += PHILOX_W0; + k1 += PHILOX_W1; + singleRound(buffer, k0, k1); + k0 += PHILOX_W0; + k1 += PHILOX_W1; + singleRound(buffer, k0, k1); + k0 += PHILOX_W0; + k1 += PHILOX_W1; + singleRound(buffer, k0, k1); + k0 += PHILOX_W0; + k1 += PHILOX_W1; + singleRound(buffer, k0, k1); + } + + + /** + * Performs a single round of philox. + * + * @param counter Counter, which will be updated after each call. + * @param key0 Key low bits. + * @param key1 Key high bits. + */ + private static void singleRound(long[] counter, long key0, long key1) { + final long lo0 = PHILOX_M0 * counter[0]; + final long hi0 = UnsignedMultiplyHighSource.mhMultiplyHigh(PHILOX_M0, counter[0]); + final long lo1 = PHILOX_M1 * counter[2]; + final long hi1 = UnsignedMultiplyHighSource.mhMultiplyHigh(PHILOX_M1, counter[2]); + + counter[0] = hi1 ^ counter[1] ^ key0; + counter[1] = lo1; + counter[2] = hi0 ^ counter[3] ^ key1; + counter[3] = lo0; + } + } + + /** + * Updated to use a MethodHandle to Math.unsignedMultiplyHigh (UMH) (Java 18+). + * If not available then revert to a default implementation. + */ + static final class Philox4x64UMH extends Philox4x64Original { + /** + * Creates a new instance. + * + * @param seed Seed. + */ + Philox4x64UMH(long[] seed) { + super(seed); + } + + @Override + protected void rand10() { + buffer[0] = counter0; + buffer[1] = counter1; + buffer[2] = counter2; + buffer[3] = counter3; + + long k0 = key0; + long k1 = key1; + + //unrolled loop for performance + singleRound(buffer, k0, k1); + k0 += PHILOX_W0; + k1 += PHILOX_W1; + singleRound(buffer, k0, k1); + k0 += PHILOX_W0; + k1 += PHILOX_W1; + singleRound(buffer, k0, k1); + k0 += PHILOX_W0; + k1 += PHILOX_W1; + singleRound(buffer, k0, k1); + k0 += PHILOX_W0; + k1 += PHILOX_W1; + singleRound(buffer, k0, k1); + k0 += PHILOX_W0; + k1 += PHILOX_W1; + singleRound(buffer, k0, k1); + k0 += PHILOX_W0; + k1 += PHILOX_W1; + singleRound(buffer, k0, k1); + k0 += PHILOX_W0; + k1 += PHILOX_W1; + singleRound(buffer, k0, k1); + k0 += PHILOX_W0; + k1 += PHILOX_W1; + singleRound(buffer, k0, k1); + k0 += PHILOX_W0; + k1 += PHILOX_W1; + singleRound(buffer, k0, k1); + } + + + /** + * Performs a single round of philox. + * + * @param counter Counter, which will be updated after each call. + * @param key0 Key low bits. + * @param key1 Key high bits. + */ + private static void singleRound(long[] counter, long key0, long key1) { + final long lo0 = PHILOX_M0 * counter[0]; + final long hi0 = UnsignedMultiplyHighSource.mhUnsignedMultiplyHigh(PHILOX_M0, counter[0]); + final long lo1 = PHILOX_M1 * counter[2]; + final long hi1 = UnsignedMultiplyHighSource.mhUnsignedMultiplyHigh(PHILOX_M1, counter[2]); + + counter[0] = hi1 ^ counter[1] ^ key0; + counter[1] = lo1; + counter[2] = hi0 ^ counter[3] ^ key1; + counter[3] = lo0; + } + } + + /** + * Baseline for a JMH method call returning an {@code int}. + * + * @return the value + */ + @Benchmark + public int baselineInt() { + return intValue; + } + + /** + * Baseline for a JMH method call returning a {@code long}. + * + * @return the value + */ + @Benchmark + public long baselineLong() { + return longValue; + } + + /** + * Exercise the {@link UniformRandomProvider#nextInt()} method. + * + * @param sources Source of randomness. + * @return the int + */ + @Benchmark + public int nextInt(Sources sources) { + return sources.getGenerator().nextInt(); + } + + /** + * Exercise the {@link UniformRandomProvider#nextLong()} method. + * + * @param sources Source of randomness. + * @return the long + */ + @Benchmark + public long nextLong(Sources sources) { + return sources.getGenerator().nextLong(); + } +} diff --git a/commons-rng-examples/examples-jmh/src/test/java/org/apache/commons/rng/examples/jmh/core/LXMBenchmarkTest.java b/commons-rng-examples/examples-jmh/src/test/java/org/apache/commons/rng/examples/jmh/core/LXMBenchmarkTest.java index 5bc314b4..e8dbd41b 100644 --- a/commons-rng-examples/examples-jmh/src/test/java/org/apache/commons/rng/examples/jmh/core/LXMBenchmarkTest.java +++ b/commons-rng-examples/examples-jmh/src/test/java/org/apache/commons/rng/examples/jmh/core/LXMBenchmarkTest.java @@ -80,8 +80,7 @@ class LXMBenchmarkTest { } @Test - // Note: JAVA_18 is not in the enum - @EnabledForJreRange(min = JRE.OTHER) + @EnabledForJreRange(min = JRE.JAVA_18) void testMathUnsignedMultiplyHigh() { try { assertUnsignedMultiply(UnsignedMultiplyHighSource::mathUnsignedMultiplyHigh, null); @@ -101,6 +100,16 @@ class LXMBenchmarkTest { assertUnsignedMultiply((a, b) -> UnsignedMultiplyHighSource.unsignedMultiplyHigh(a, b, lo), lo); } + @Test + void testMhMultiplyHigh() { + assertUnsignedMultiply(UnsignedMultiplyHighSource::mhMultiplyHigh, null); + } + + @Test + void testMhUnsignedMultiplyHigh() { + assertUnsignedMultiply(UnsignedMultiplyHighSource::mhUnsignedMultiplyHigh, null); + } + private static void assertUnsignedMultiply(LongLongFunction fun, long[] lo) { final UniformRandomProvider rng = RandomSource.JSF_64.create(); for (int i = 0; i < 100; i++) {
