random routines

2026-04-14 14:03:04 -04:00
parent 4530ed3603
commit caf347e8f5
45 changed files with 1086 additions and 1 deletions
--- a/build_linux64/libamsculib3.linux64.a
+++ b/build_linux64/libamsculib3.linux64.a
--- a/build_linux64/objstore/amscu_comp128.o
+++ b/build_linux64/objstore/amscu_comp128.o
--- a/build_linux64/objstore/amscu_comp64.o
+++ b/build_linux64/objstore/amscu_comp64.o
--- a/build_linux64/objstore/amscu_cudafunctions.o
+++ b/build_linux64/objstore/amscu_cudafunctions.o
--- a/build_linux64/objstore/amscu_random.o
+++ b/build_linux64/objstore/amscu_random.o
--- a/build_linux64/objstore/amscuarray.o
+++ b/build_linux64/objstore/amscuarray.o
--- a/build_linux64/objstore/amscuarray_dops.o
+++ b/build_linux64/objstore/amscuarray_dops.o
--- a/build_linux64/objstore/amscugeom.o
+++ b/build_linux64/objstore/amscugeom.o
--- a/build_linux64/objstore/amsculib3.o
+++ b/build_linux64/objstore/amsculib3.o
--- a/build_linux64/objstore/amscumath.o
+++ b/build_linux64/objstore/amscumath.o
--- a/build_linux64/objstore/amscupcg.o
+++ b/build_linux64/objstore/amscupcg.o
--- a/build_linux64/objstore/amscurandom1.o
+++ b/build_linux64/objstore/amscurandom1.o
--- a/build_linux64/objstore/amscurandom1_dbuff.o
+++ b/build_linux64/objstore/amscurandom1_dbuff.o
--- a/build_linux64/objstore/amscurandom1_hbuff.o
+++ b/build_linux64/objstore/amscurandom1_hbuff.o
--- a/build_linux64/objstore/amscurarray.o
+++ b/build_linux64/objstore/amscurarray.o
--- a/build_linux64/objstore/amscusplitmix.o
+++ b/build_linux64/objstore/amscusplitmix.o
--- a/build_linux64/objstore/amsxoroshiro.o
+++ b/build_linux64/objstore/amsxoroshiro.o
--- a/build_linux64/objstore/cuvec2.o
+++ b/build_linux64/objstore/cuvec2.o
--- a/build_linux64/objstore/cuvec2f.o
+++ b/build_linux64/objstore/cuvec2f.o
--- a/build_linux64/objstore/cuvec2i.o
+++ b/build_linux64/objstore/cuvec2i.o
--- a/build_linux64/objstore/cuvec3.o
+++ b/build_linux64/objstore/cuvec3.o
--- a/build_linux64/objstore/cuvec3f.o
+++ b/build_linux64/objstore/cuvec3f.o
--- a/build_linux64/objstore/cuvec3i.o
+++ b/build_linux64/objstore/cuvec3i.o
--- a/build_linux64/objstore/cuvec4.o
+++ b/build_linux64/objstore/cuvec4.o
--- a/build_linux64/objstore/cuvec4f.o
+++ b/build_linux64/objstore/cuvec4f.o
--- a/build_linux64/objstore/cuvec4i.o
+++ b/build_linux64/objstore/cuvec4i.o
--- a/build_linux64/test
+++ b/build_linux64/test
--- a/include/amsculib3/amsculib3.hpp
+++ b/include/amsculib3/amsculib3.hpp
@ -52,6 +52,8 @@ namespace amscuda
 #include <amsculib3/amscuda_binarrrw.hpp>
 #include <amsculib3/amscu_random.hpp>
 #include <amsculib3/random/amscurandom.cuh>
 #include <amsculib3/amscuarray_dops.hpp>
 #include <amsculib3/amscurarray.cuh>
--- a/include/amsculib3/random/amscufhash.cuh
+++ b/include/amsculib3/random/amscufhash.cuh
@ -0,0 +1,23 @@
 #ifndef __AMSCUFHASH_CUH__
 #define __AMSCUFHASH_CUH__
 //Placeholder header: opens the amscuda::random::fhash namespace.
 //No declarations exist yet; the namespace body is currently empty.
 namespace amscuda
 {
 namespace random
 {
 namespace fhash
 {
 //Floating point hash functions
 };
 };
 };
 #endif
--- a/include/amsculib3/random/amsculcg.cuh
+++ b/include/amsculib3/random/amsculcg.cuh
@ -0,0 +1,22 @@
 #ifndef __AMSCULCG_CUH__
 #define __AMSCULCG_CUH__
 //Placeholder header: opens the amscuda::random::lcg namespace.
 //No declarations exist yet; the namespace body is currently empty.
 namespace amscuda
 {
 namespace random
 {
 namespace lcg
 {
 //Legacy linear congruential generators
 };
 };
 };
 #endif
--- a/include/amsculib3/random/amscupcg.cuh
+++ b/include/amsculib3/random/amscupcg.cuh
@ -0,0 +1,19 @@
 #ifndef __AMSCUPCG_CUH__
 #define __AMSCUPCG_CUH__
 //Placeholder header (presumably reserved for PCG-family generators, judging
 //by the file name). The amscuda::random namespace body is currently empty.
 namespace amscuda
 {
 namespace random
 {
 };
 };
 #endif
--- a/include/amsculib3/random/amscurandom.cuh
+++ b/include/amsculib3/random/amscurandom.cuh
@ -0,0 +1,55 @@
 #ifndef __AMSCU_RANDOM1_HPP__
 #define __AMSCU_RANDOM1_HPP__
 #include <amsculib3/random/amscupcg.cuh>
 #include <amsculib3/random/amsxoroshiro.cuh>
 #include <amsculib3/random/amsculcg.cuh>
 #include <amsculib3/random/amscufhash.cuh>
 namespace amscuda
 {
 namespace random
 {
 //State type used by the default generator wrappers declared below.
 typedef xoroshiro::xs64ss_state randstate_t;
 //NOTE(review): amscurandom1.cu defines the host default state under the name
 //global_rand_cpustate; no definition of global_randstate is visible there.
 //Both names are declared here until the intended one is confirmed.
 extern randstate_t global_randstate;
 extern randstate_t global_rand_cpustate;
 //default random number generator functions that wrap one of the generators
 //defined in the prefix libraries. Choosing xoroshiro64** for now due to
 //only 32 bit operations being needed.
 //
 //Convention for every function taking a randstate_t pointer: in host code a
 //NULL pointer selects the host-side default state. Device code has no default
 //state and must pass a valid pointer.

 //Re-seed the host-side default state from a 32 bit seed.
 __host__ void rand_seed(const uint32_t seed);
 //Add inc to the low word of the state, then advance the generator one step.
 __host__ __device__ void rand_state_increment(const int32_t inc, randstate_t *state = NULL);
 //Advance the generator one step, discarding the output.
 __host__ __device__ void rand_next(randstate_t *state = NULL);
 //Return the next 32 bit output of the generator.
 __host__ __device__ uint32_t randui32(randstate_t *state = NULL);
 //Return low + (q % (high-low)), where q is a 16 bit value taken from one draw.
 __host__ __device__ int randint(int low, int high, randstate_t *state = NULL);
 //Uniform value built from a single 32 bit draw (float / double variants).
 __host__ __device__ float randf(randstate_t *state = NULL);
 __host__ __device__ double rand(randstate_t *state = NULL);
 //Normally distributed value via the Box-Muller transform; consumes two draws.
 __host__ __device__ float randnf(randstate_t *state = NULL);
 __host__ __device__ double randn(randstate_t *state = NULL);
 //Operations to fill a host buffer with random values
 //NOTE(review): no definitions are visible for the hbuff_*/dbuff_* functions;
 //presumably they fill `size` elements and return a status code - confirm.
 __host__ int hbuff_randf(float *hbuffer, int64_t size, randstate_t *state = NULL);
 __host__ int hbuff_rand(double *hbuffer, int64_t size, randstate_t *state = NULL);
 __host__ int hbuff_randnf(float *hbuffer, int64_t size, randstate_t *state = NULL);
 __host__ int hbuff_randn(double *hbuffer, int64_t size, randstate_t *state = NULL);
 __host__ int hbuff_randint(int *hbuffer, int64_t size, int low, int high, randstate_t *state = NULL);
 //Operations to fill a device buffer with random values
 __host__ int dbuff_randf(float *dbuffer, int64_t size, randstate_t *state = NULL);
 __host__ int dbuff_rand(double *dbuffer, int64_t size, randstate_t *state = NULL);
 __host__ int dbuff_randnf(float *dbuffer, int64_t size, randstate_t *state = NULL);
 __host__ int dbuff_randn(double *dbuffer, int64_t size, randstate_t *state = NULL);
 __host__ int dbuff_randint(int *dbuffer, int64_t size, int low, int high, randstate_t *state = NULL);
 };
 };
 #endif
--- a/include/amsculib3/random/amsxoroshiro.cuh
+++ b/include/amsculib3/random/amsxoroshiro.cuh
@ -0,0 +1,69 @@
 #ifndef __AMSXOROSHIRO_CUH__
 #define __AMSXOROSHIRO_CUH__
 //Declarations for the xoroshiro family of PRNGs and the splitmix helpers
 //used to expand a single seed into generator state.
 //Described by: https://prng.di.unimi.it/
 //ref: https://prng.di.unimi.it/xoroshiro64star.c
 //ref: David Blackman and Sebastiano Vigna. Scrambled linear pseudorandom number generators. ACM Trans. Math. Softw., 47:1-32, 2021.
 namespace amscuda
 {
 namespace random
 {
 namespace xoroshiro
 {
 //https://github.com/joelkp/ranoise/blob/main/splitmix32.c
 //Advance the 32 bit splitmix state and return the mixed output.
 __device__ __host__ uint32_t splitmix32_next(uint32_t *splitmix32_state);
 //Advance the 64 bit splitmix state without producing an output.
 __host__ __device__ void splitmix64_next(uint64_t *state);
 //Advance the 64 bit splitmix state and return the mixed output.
 __host__ __device__ uint64_t splitmix64_nextint(uint64_t *state);
 //State of the xoroshiro64** generator: two 32 bit words.
 struct xs64ss_state
 {
    public:
    uint32_t low;
    uint32_t high;
    __device__ __host__ xs64ss_state();
    //Expand a single 32 bit seed into both words via splitmix32.
    __device__ __host__ xs64ss_state(const uint32_t seed);
    //Set both words directly.
    __device__ __host__ xs64ss_state(const uint32_t _low, const uint32_t _high);
 };
 //Host-side global xoroshiro64** state, re-seeded by xs64ss_seed().
 extern xs64ss_state xs64ss_globalstate;
 __host__ void xs64ss_seed(const uint32_t q);
 // __host__ void xs64ss_seed(const uint32_t _a, const uint32_t _b);
 //implements the xoroshiro64** PRNG: returns the next 32 bit output and advances the state
 __host__ __device__ uint32_t xs64ss_next(xs64ss_state *state);
 //The problem with xoroshiro128+ is that 64 bit integer operations on GPUs are 20 (multiplication) to 80 times slower
 //than the native 32 bit integer operations. Everywhere a 64 bit integer shows up and is multiplied by anything, things slow down
 //NOTE(review): despite the "pp" in the names below, xs128pp_next() in
 //amsxoroshiro.cu returns low + high, i.e. the xoroshiro128+ output function.
 struct xs128pp_state
 {
    public:
    uint64_t low;
    uint64_t high;
    __device__ __host__ xs128pp_state();
    //Expand a single 64 bit seed into both words via splitmix64.
    __device__ __host__ xs128pp_state(const uint64_t seed);
    //Set both words directly.
    __device__ __host__ xs128pp_state(const uint64_t _low, const uint64_t _high);
 };
 //Host-side global 128 bit state, re-seeded by xs128pp_seed().
 extern xs128pp_state xs128pp_globalstate;
 //Rotate x left by k bits.
 __host__ __device__ uint64_t xs128pp_rotl(const uint64_t x, int k);
 //Return the next 64 bit output and advance the state.
 __host__ __device__ uint64_t xs128pp_next(xs128pp_state* state);
 //equivalent to 2^64 calls to xs128pp_next()
 __device__ __host__ void xs128pp_jump(xs128pp_state* state);
 __host__ void xs128pp_seed(uint64_t seed);
 };
 };
 };
 #endif
--- a/src/amsculib3/math/cuvec3.cu
+++ b/src/amsculib3/math/cuvec3.cu
@ -736,7 +736,7 @@ __host__ __device__ cumat3 rotmat_from_axisangle(const cuvec3 &axis, const doubl
 // Tests //
 ///////////
-__host__ void test_cudavectf_logic1()
+__host__ void test_cudavect_logic1()
 {
 }
--- a/src/amsculib3/random/amscupcg.cu
+++ b/src/amsculib3/random/amscupcg.cu
@ -0,0 +1,6 @@
 #include <amsculib3/amsculib3.hpp>
 //Placeholder translation unit: the namespace body is currently empty.
 namespace amscuda
 {
 };
--- a/src/amsculib3/random/amscurandom1.cu
+++ b/src/amsculib3/random/amscurandom1.cu
@ -0,0 +1,130 @@
 #include <amsculib3/amsculib3.hpp>
 using namespace amscuda::random::xoroshiro;
 namespace amscuda
 {
 namespace random
 {
    //Choosing xoroshiro64** as my default RNG due to 32 bit only operations
    //Host-side default state: used by the wrappers below whenever host code
    //passes state==NULL. Default-constructed until rand_seed() replaces it.
    randstate_t global_rand_cpustate = xs64ss_state();
    //Replace the host-side default state with one expanded from `seed`.
    __host__ void rand_seed(const uint32_t seed)
    {
        const randstate_t seeded(seed);
        global_rand_cpustate = seeded;
        return;
    }
    //Perturb a generator state: add `inc` to its low word, then advance it
    //one step (the output of that step is discarded).
    //state: generator state to modify. In host code NULL selects the
    //       host-side default state; device code has no such fallback.
    __host__ __device__ void rand_state_increment(const int32_t inc, randstate_t *state)
    {
    #ifndef __CUDA_ARCH__
        //Host path only: fall back to the global CPU state
        if(state==NULL)
        {
            state = &global_rand_cpustate;
        }
    #endif
        state->low += inc;
        xs64ss_next(state);
    }
    //Advance the generator one step and throw the output away.
    //state: generator state to advance. In host code NULL selects the
    //       host-side default state; device code has no such fallback.
    __host__ __device__ void rand_next(randstate_t *state)
    {
    #ifndef __CUDA_ARCH__
        //Host path only: fall back to the global CPU state
        if(state==NULL)
        {
            state = &global_rand_cpustate;
        }
    #endif
        (void)xs64ss_next(state);
    }
    //Return the next 32 bit output of the default generator (xoroshiro64**).
    //state: generator state to advance. In host code NULL selects the
    //       host-side default state; device code has no such fallback.
    __host__ __device__ uint32_t randui32(randstate_t *state)
    {
    #ifndef __CUDA_ARCH__
        //Host path only: fall back to the global CPU state
        if(state==NULL)
        {
            state = &global_rand_cpustate;
        }
    #endif
        return xoroshiro::xs64ss_next(state);
    }
    //Return an integer low + (q % (high-low)), where q is a 16 bit value
    //taken from a single generator draw.
    //low, high: bounds of the range; when high==low the function returns low
    //           (previously this case divided by zero).
    //state:     generator state to advance. In host code NULL selects the
    //           host-side default state; device code has no such fallback.
    //Note: because q has only 16 bits, ranges wider than 65536 are not fully
    //covered.
    __host__ __device__ int randint(int low, int high, randstate_t *state)
    {
        #ifdef __CUDA_ARCH__
            // GPU-specific code (device path)
        #else
            // CPU-specific code (host path)
            if(state==NULL) state = &global_rand_cpustate;
        #endif
        //Draw before the range check so every call advances the state exactly once
        const int32_t q = (int)((randui32(state)>>1U)%(1U<<16U));
        const int span = high-low;
        if(span==0)
        {
            return low; //empty range: avoid modulo by zero
        }
        return (q%span)+low;
    }
    //Return a uniformly distributed float in [0,1).
    //state: generator state to advance. In host code NULL selects the
    //       host-side default state; device code has no such fallback.
    //The previous version divided a full 32 bit draw by 2^31, which yields
    //values in [0,2).
    __host__ __device__ float randf(randstate_t *state)
    {
        #ifdef __CUDA_ARCH__
            // GPU-specific code (device path)
        #else
            // CPU-specific code (host path)
            if(state==NULL) state = &global_rand_cpustate;
        #endif
        //Keep the top 24 bits: every such value is exactly representable in a
        //float, so the scaled result can never round up to 1.0f.
        const uint32_t bits = randui32(state)>>8U;
        return ((float)bits)*(1.0f/16777216.0f);
    }
    //Return a uniformly distributed double in [0,1) with 32 bits of resolution.
    //state: generator state to advance. In host code NULL selects the
    //       host-side default state; device code has no such fallback.
    //The previous version divided a full 32 bit draw by 2^31, which yields
    //values in [0,2).
    __host__ __device__ double rand(randstate_t *state)
    {
        #ifdef __CUDA_ARCH__
            // GPU-specific code (device path)
        #else
            // CPU-specific code (host path)
            if(state==NULL) state = &global_rand_cpustate;
        #endif
        //A uint32 converts to double exactly; dividing by 2^32 keeps it below 1.0
        return ((double)randui32(state))*(1.0/4294967296.0);
    }
    //Return a normally distributed float via the Box-Muller transform.
    //Consumes two 32 bit draws per call (same as before).
    //state: generator state to advance. In host code NULL selects the
    //       host-side default state; device code has no such fallback.
    //The uniforms are built directly from the raw draws so that the radius
    //term uses q1 in (0,1]: logf(0) or logf of a value above 1 would give an
    //infinite or NaN result.
    __host__ __device__ float randnf(randstate_t *state)
    {
        #ifdef __CUDA_ARCH__
            // GPU-specific code (device path)
        #else
            // CPU-specific code (host path)
            if(state==NULL) state = &global_rand_cpustate;
        #endif
        const uint32_t u1 = randui32(state);
        const uint32_t u2 = randui32(state);
        const float q1 = ((float)((u1>>8U)+1U))*(1.0f/16777216.0f); //(0,1]
        const float q2 = ((float)(u2>>8U))*(1.0f/16777216.0f);      //[0,1)
        //single precision cosine keeps the device path free of double math
        return (::sqrtf(-2.0f*::logf(q1))*::cosf(2.0f*pif*q2));
    }
    //Return a normally distributed double via the Box-Muller transform.
    //Consumes two 32 bit draws per call (same as before).
    //state: generator state to advance. In host code NULL selects the
    //       host-side default state; device code has no such fallback.
    //The uniforms are built directly from the raw draws so that the radius
    //term uses q1 in (0,1]: log(0) or log of a value above 1 would give an
    //infinite or NaN result.
    __host__ __device__ double randn(randstate_t *state)
    {
        #ifdef __CUDA_ARCH__
            // GPU-specific code (device path)
        #else
            // CPU-specific code (host path)
            if(state==NULL) state = &global_rand_cpustate;
        #endif
        const uint32_t u1 = randui32(state);
        const uint32_t u2 = randui32(state);
        const double q1 = (((double)u1)+1.0)*(1.0/4294967296.0); //(0,1]
        const double q2 = ((double)u2)*(1.0/4294967296.0);       //[0,1)
        return (::sqrt(-2.0*::log(q1))*::cos(2.0*pi*q2));
    }
 }; //end namespaces
 };
--- a/src/amsculib3/random/amscurandom1_dbuff.cu
+++ b/src/amsculib3/random/amscurandom1_dbuff.cu
@ -0,0 +1,9 @@
 #include <amsculib3/amsculib3.hpp>
 //Placeholder translation unit: the namespace body is currently empty.
 //NOTE(review): amscurandom.cuh declares dbuff_* fill functions; presumably
 //they are meant to be defined here - confirm.
 namespace amscuda
 {
 namespace random
 {
 }; //end namespaces
 };
--- a/src/amsculib3/random/amscurandom1_hbuff.cu
+++ b/src/amsculib3/random/amscurandom1_hbuff.cu
@ -0,0 +1,9 @@
 #include <amsculib3/amsculib3.hpp>
 //Placeholder translation unit: the namespace body is currently empty.
 //NOTE(review): amscurandom.cuh declares hbuff_* fill functions; presumably
 //they are meant to be defined here - confirm.
 namespace amscuda
 {
 namespace random
 {
 }; //end namespaces
 };
--- a/src/amsculib3/random/amscusplitmix.cu
+++ b/src/amsculib3/random/amscusplitmix.cu
@ -0,0 +1,125 @@
 #include <amsculib3/amsculib3.hpp>
 namespace amscuda
 {
 namespace random
 {
 namespace xoroshiro
 {
 //ref: https://github.com/joelkp/ranoise/blob/main/splitmix32.c
 // __device__ __host__ uint32_t splitmix32_next(uint32_t *splitmix32_state)
 // {
 //     uint32_t ret;
 //     *splitmix32_state += 2654435769U;
 //     ret = *splitmix32_state;
 //     ret ^= ret >> 16;
 //     ret *= 0x85ebca6bU;
 //     ret ^= ret >> 13;
 //     ret *= 0xc2b2ae35U;
 //     ret ^= ret >> 16;
 //     return ret;
 // }
 //splitmix32 step using the improved multiplier constants quoted in the notes
 //at the bottom of this file.
 //splitmix32_state: in/out counter; advanced by 0x9e3779b9 on every call.
 //returns: the advanced counter after two xorshift-multiply rounds and a
 //         final xorshift.
 __device__ __host__ uint32_t splitmix32_next(uint32_t *splitmix32_state)
 {
    const uint32_t advanced = *splitmix32_state + 0x9e3779b9U;
    *splitmix32_state = advanced;
    uint32_t mixed = advanced;
    mixed = (mixed ^ (mixed >> 16)) * 0x21f0aaadU;
    mixed = (mixed ^ (mixed >> 15)) * 0x735a2d97U;
    return mixed ^ (mixed >> 15);
 }
 //Advance a splitmix64 counter by its fixed increment without producing
 //an output value.
 //state: in/out 64 bit counter (wraps modulo 2^64).
 __host__ __device__ void splitmix64_next(uint64_t *state)
 {
    const uint64_t increment = 0x9E3779B97F4A7C15ULL;
    *state = *state + increment;
 }
 //splitmix64 step: advance the counter and return its mixed value.
 //ref: https://rosettacode.org/wiki/Pseudo-random_numbers/Splitmix64
 //state: in/out 64 bit counter; advanced by 0x9E3779B97F4A7C15 on every call.
 //returns: the advanced counter after two xorshift-multiply rounds and a
 //         final xorshift.
 __host__ __device__ uint64_t splitmix64_nextint(uint64_t *state)
 {
    const uint64_t advanced = *state + 0x9E3779B97F4A7C15ULL;
    *state = advanced;
    uint64_t mixed = advanced;
    mixed ^= mixed >> 30;
    mixed *= 0xbf58476d1ce4e5b9ULL;
    mixed ^= mixed >> 27;
    mixed *= 0x94d049bb133111ebULL;
    mixed ^= mixed >> 31;
    return mixed;
 }
 ////////////////////
 // Unsorted Notes //
 ////////////////////
    //ref: https://github.com/joelkp/ranoise/blob/main/splitmix32.c
    //2654435769U
    //#define ROR32(x, r) \
 	//((uint32_t)(x) >> ((r) & 31) | (uint32_t)(x) << ((32-(r)) & 31))
    //#define MUVAROR32(x, r, ro) \
 	//(((uint32_t)(x) | ((1<<((ro) & 31))|1)) * ROR32((x), (r)+(ro)))
    // static inline uint32_t splitmix32(uint32_t *restrict pos) {
    //     uint32_t s = *pos += 2654435769U;
    //     s ^= s >> 16;
    //     s *= 0x85ebca6b;
    //     s ^= s >> 13;
    //     s *= 0xc2b2ae35;
    //     s ^= s >> 16;
    //     return s;
    // }
    // int main(int argc, char *argv[]) {
    //     uint32_t pos = 0;
    //     for (;;) {
    //         /* SplitMix32 test */
    //         add_output(splitmix32(&pos));
    //     }
    //     return 0;
    // }
    /*
    //ref: https://stackoverflow.com/questions/17035441/looking-for-decent-quality-prng-with-only-32-bits-of-state
    uint32_t splitmix32(void) {
        uint32_t z = state += 0x9e3779b9;
        z ^= z >> 15; // 16 for murmur3
        z *= 0x85ebca6b;
        z ^= z >> 13;
        z *= 0xc2b2ae35;
        return z ^= z >> 16;
    }
    Update: Better constants have been found that potentially make this a very good PRNG:
    uint32_t splitmix32(void) {
    uint32_t z = (state += 0x9e3779b9);
    z ^= z >> 16; z *= 0x21f0aaad;
    z ^= z >> 15; z *= 0x735a2d97;
    z ^= z >> 15;
    return z;
 }
 Mark Dickinson
 Over a year ago
 Any thoughts on the PCG family? (E.g., the generator called "PCG RXS M XS 32 (LCG)" in the PCG paper.)
 bryc
 Over a year ago
@MarkDickinson From what I've seen PCG relies on 64-bit math, and there are no fully 32-bit variants that I can find. I can't find any code specific to the generator you mentioned either. I'm sure PCG is a good generator, but I feel like its a poor choice for JavaScript or embedded systems. xoshiro128** for example is ideal in that regard, since it uses 32-bit integers only.
    */
 };//end namespace xoroshiro
 }; //end namespaces
 };
--- a/src/amsculib3/random/amsxoroshiro.cu
+++ b/src/amsculib3/random/amsxoroshiro.cu
@ -0,0 +1,146 @@
 #include <amsculib3/amsculib3.hpp>
 namespace amscuda
 {
 namespace random
 {
 namespace xoroshiro
 {
    //////////////////
    //Xoroshiro 64**//
    //////////////////
    //Default constructor: start from a fixed, non-zero state.
    //The all-zero state is a fixed point of xs64ss_next() (every rotate, xor
    //and shift of zero is zero), so a zero-initialised generator would return
    //0 forever until explicitly seeded.
    __device__ __host__ xs64ss_state::xs64ss_state()
    {
        low = 0x9E3779B9U; high = 0x7F4A7C15U;
    }
    //Seeding constructor: expand one 32 bit seed into both state words using
    //splitmix32. The first splitmix32 output is discarded; the second and
    //third become low and high respectively.
    __device__ __host__ xs64ss_state::xs64ss_state(const uint32_t seed)
    {
        uint32_t mixer = seed;
        (void)splitmix32_next(&mixer);
        const uint32_t first = splitmix32_next(&mixer);
        const uint32_t second = splitmix32_next(&mixer);
        low = first;
        high = second;
    }
    //Construct a state directly from its two 32 bit words.
    __device__ __host__ xs64ss_state::xs64ss_state(const uint32_t _low, const uint32_t _high)
        : low(_low), high(_high)
    {
    }
    //Host-side global xoroshiro64** state; default-constructed until seeded.
    xs64ss_state xs64ss_globalstate = xs64ss_state();
    //Replace the global xoroshiro64** state with one expanded from `seed`.
    __host__ void xs64ss_seed(const uint32_t seed)
    {
        const xs64ss_state seeded(seed);
        xs64ss_globalstate = seeded;
        return;
    }
    //Rotate the 32 bit value x left by k bits.
    //The shift counts are reduced modulo 32 so that k == 0 (or any multiple
    //of 32) no longer shifts by the full word width, which is undefined.
    __device__ __host__ static inline uint32_t xs64ss_rotl(const uint32_t x, int k) {
        const unsigned int s = ((unsigned int)k) & 31U;
        return (x << s) | (x >> ((32U - s) & 31U));
    }
    //One step of the xoroshiro64** PRNG.
    //state: generator state; both words are updated in place.
    //returns: the 32 bit output, computed from the low word as it was before
    //         the update (multiply, rotate by 5, multiply by 5).
    __host__ __device__ uint32_t xs64ss_next(xs64ss_state *state)
    {
        const uint32_t s0 = state->low;
        const uint32_t mixed = state->high ^ s0;
        const uint32_t scrambled = xs64ss_rotl(s0*0x9E3779BBU,5)*5;
        state->low = xs64ss_rotl(s0, 26) ^ mixed ^ (mixed << 9);
        state->high = xs64ss_rotl(mixed, 13);
        return scrambled;
    }
    //////////////////
    //Xoroshiro 128+//
    //////////////////
    //Default constructor: start from a fixed, non-zero state.
    //The all-zero state is a fixed point of xs128pp_next() (0 + 0 output,
    //and every rotate, xor and shift of zero is zero), so a zero-initialised
    //generator would return 0 forever until explicitly seeded.
    __device__ __host__ xs128pp_state::xs128pp_state()
    {
        low = 0x9E3779B97F4A7C15ULL;
        high = 0xBF58476D1CE4E5B9ULL;
        return;
    }
    //Seeding constructor: expand one 64 bit seed into both state words.
    //The splitmix64 counter is advanced once without output, then the next
    //two mixed outputs become low and high respectively.
    __device__ __host__ xs128pp_state::xs128pp_state(const uint64_t seed)
    {
        uint64_t mixer = seed;
        splitmix64_next(&mixer);
        const uint64_t first = splitmix64_nextint(&mixer);
        const uint64_t second = splitmix64_nextint(&mixer);
        low = first;
        high = second;
    }
    //Construct a state directly from its two 64 bit words.
    __device__ __host__ xs128pp_state::xs128pp_state(const uint64_t _low, const uint64_t _high)
        : low(_low), high(_high)
    {
    }
    //Host-side global 128 bit state; default-constructed until xs128pp_seed() replaces it.
    xs128pp_state xs128pp_globalstate = xs128pp_state();
    //Rotate the 64 bit value x left by k bits.
    //The shift counts are reduced modulo 64 so that k == 0 (or any multiple
    //of 64) no longer shifts by the full word width, which is undefined.
    __host__ __device__ uint64_t xs128pp_rotl(const uint64_t x, int k)
    {
        const unsigned int s = ((unsigned int)k) & 63U;
        return (x << s) | (x >> ((64U - s) & 63U));
    }
    //One step of the 128 bit generator.
    //state: generator state; both words are updated in place.
    //returns: the sum of the two state words as they were before the update.
    __host__ __device__ uint64_t xs128pp_next(xs128pp_state* state)
    {
        const uint64_t s0 = state->low;
        const uint64_t sum = s0 + state->high;
        const uint64_t mixed = state->high ^ s0;
        state->low = xs128pp_rotl(s0,24) ^ mixed ^ (mixed << 16);
        state->high = xs128pp_rotl(mixed,37);
        return sum;
    }
    //Jump the generator ahead (the header documents this as equivalent to
    //2^64 calls to xs128pp_next()).
    //For each of the 128 bits of the JUMP polynomial, the current state is
    //xor-accumulated when the bit is set, and the generator is stepped once
    //regardless. The accumulated words then replace the state.
    __host__ __device__  void xs128pp_jump(xs128pp_state* state)
    {
        static const uint64_t JUMP[] = { 0xdf900294d8f554a5, 0x170865df4b3201fc };
        uint64_t acc_low = 0;
        uint64_t acc_high = 0;
        for(int word=0; word<2; word++)
        {
            const uint64_t mask = JUMP[word];
            for(int bit=0; bit<64; bit++)
            {
                if((mask>>bit) & 1ULL)
                {
                    acc_low ^= state->low;
                    acc_high ^= state->high;
                }
                xs128pp_next(state);
            }
        }
        state->low = acc_low;
        state->high = acc_high;
    }
    //Replace the global 128 bit state with one expanded from `seed`.
    __host__ void xs128pp_seed(uint64_t seed)
    {
        const xs128pp_state seeded(seed);
        xs128pp_globalstate = seeded;
        return;
    }
 //To have a different implementation for host and device functions
 // __host__ __device__ void my_function() {
 //     #ifdef __CUDA_ARCH__
 //         // GPU-specific code (device path)
 //     #else
 //         // CPU-specific code (host path)
 //     #endif
 // }
 }; //end namespaces
 };
 };
--- a/test_scripts/pseudorandom_gens/notes.txt
+++ b/test_scripts/pseudorandom_gens/notes.txt
@ -0,0 +1,31 @@
 Mark Dickinson
 Over a year ago
 Any thoughts on the PCG family? (E.g., the generator called "PCG RXS M XS 32 (LCG)" in the PCG paper.)
 bryc
 Over a year ago
@MarkDickinson From what I've seen PCG relies on 64-bit math, and there are no fully 32-bit variants that I can find. I can't find any code specific to the generator you mentioned either. I'm sure PCG is a good generator, but I feel like its a poor choice for JavaScript or embedded systems. xoshiro128** for example is ideal in that regard, since it uses 32-bit integers only.
 https://www.pcg-random.org/posts/some-prng-implementations.html
 JSF: Bob Jenkins's Small/Fast Chaotic PRNG
 The gist jsf.hpp contains an implementation of Bob Jenkins's “Small/Fast PRNG”, which is based on a random invertible mapping—what some call a “chaotic PRNG”. I discussed this generator in a previous post, but the short version is that it passes stringent statistical tests, seems to be quite annoying to predict, and works well. It's also very fast.
 My C++ implementation provides the standard jsf64 and jsf32 variants, as well as a number of variations Jenkins suggests that use different constants that should also work well. It also includes some tiny versions, jsf16 and jsf8, which are mostly designed for experimental use (these smaller variants should not be expected to pass extensive statistical tests).
 GJrand: David Blackman's Chaotic PRNG
 The gist gjrand.hpp contains an implementation of David Blackman's gjrand PRNG, which is based on a random invertible mapping that includes a counter to guarantee no small cycles. I will discuss this generator in a future post, but the short version is that it passes stringent statistical tests, and works well. It's also fast, although not quite as fast as JSF, but on the other hand appears to have slightly stronger bit-mixing properties than JSF.
 My C++ implementation provides the standard gjrand64 variant, and at my request Blackman also made a gjrand32 variant. It also includes some tiny versions, gjrand16 and gjrand8, which are mostly designed for experimental use (these smaller variants should not be expected to pass extensive statistical tests). 
 SplitMix: 32-Bit and 64-Bit Output from 128-Bit State
 The gist splitmix.hpp provides a C++ implementation of SplitMix, as described by Guy L. Steele, Jr., Doug Lea and Christine H. Flood in the paper Fast Splittable Pseudorandom Number Generators and implemented in Java 8 as SplittableRandom (canonical source code).
 This C++ implementation avoids the bugs present in implementations directly derived from the (erroneous) code in the SplitMix paper. Without these bugs it has all properties, good and bad, that are inherent in SplitMix's design (discussed at length in previous posts). In particular, the 64-bit–output variant, splitmix64, may not be suitable for general-purpose use because it has the property that each number is only output once (similar to _once_insecure PCG variants), which can be detected as a deviation from random behavior by statistical tests. The 32-bit–output variant, splitmix32, does not have this issue.
 Unlike the other implementations described in this post, this implementation includes both jump-ahead and distance operations (most implementations of SplitMix do not offer these features, although they are quite easy to provide). Also, in contrast to other implementations, the two variants are represented as separate classes (although it is possible to cast between them; splitmix32 is a subclass of splitmix64).
--- a/test_scripts/pseudorandom_gens/splitmix.hpp
+++ b/test_scripts/pseudorandom_gens/splitmix.hpp
@ -0,0 +1,236 @@
 //ref: https://gist.github.com/imneme/6179748664e88ef3c34860f44309fc71
 //ref:  Guy L. Steele, Jr., Doug Lea and Christine H. Flood in the paper Fast Splittable Pseudorandom Number Generators and implemented in Java 8 as SplittableRandom (canonical source code).
 #ifndef SPLITMIX_HPP_INCLUDED
 #define SPLITMIX_HPP_INCLUDED 1
 /*
 * A C++ implementation of SplitMix
 *   Original design by Guy L. Steele, Jr., Doug Lea and Christine H. Flood
 *   Described in _Fast splittable pseudorandom number generators_
 *       http://dx.doi.org/10.1145/2714064.2660195 and implemented in
 *       Java 8 as SplittableRandom
 *   Based on code from the original paper, with revisions based on changes
 *   made to the Java 8 source, at
 *      http://hg.openjdk.java.net/jdk8/jdk8/jdk/file/tip/
 *             src/share/classes/java/util/SplittableRandom.java
 *   and other publicly available implementations.
 *
 * The MIT License (MIT)
 *
 * Copyright (c) 2018, 2025 Melissa E. O'Neill
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */
 #include <cstdint>
 namespace splitmix_detail {
 // Exponentiation by squaring: returns x raised to `power`, computed in
 // IntRep arithmetic (so for unsigned types the result is modulo 2^bits).
 template <typename IntRep>
 IntRep fast_exp(IntRep x, IntRep power)
 {
    IntRep acc = IntRep(1);
    for (IntRep base = x; power != IntRep(0); power >>= 1, base *= base) {
        // Multiply in the current square only when this exponent bit is set.
        if (power & IntRep(1)) {
            acc *= base;
        }
    }
    return acc;
 }
 // Raises x to the power IntRep(-1) (all bits set) using fast_exp.
 template <typename IntRep>
 inline IntRep modular_inverse(IntRep x)
 {
    const IntRep all_ones = IntRep(-1);
    return fast_exp(x, all_ones);
 }
 // pop_count(x): number of set bits in x. One of three implementations is
 // selected at preprocessing time.
 // NOTE(review): __cpp_lib_bitops is normally provided by <version> or <bit>,
 // neither of which is visibly included before this test, and the #include
 // below sits inside namespace splitmix_detail - confirm the first branch is
 // reachable and intended.
 #if __cpp_lib_bitops > 201907L
 // If possible, use trivial wrapper for C++20's std::popcount
 #include <bit>
 template <typename IntRep>
 constexpr inline IntRep pop_count(IntRep x) {
    return std::popcount(x);
 }
 #elif !defined(__GNUC__)
 // Fallback code based on code at https://en.wikipedia.org/wiki/Hamming_weight
 // use a popcount intrinsic if you can since most modern CPUs have one.
 template <typename IntRep>
 inline unsigned int pop_count(IntRep x) {
    unsigned int count = 0;
    // Each iteration clears the lowest set bit, so the loop runs once per set bit.
    while (x) {
        ++count;
        x &= x - IntRep(1);
    }
    return count;
 }
 #else
 // GCC-compatible compilers: dispatch on sizeof(IntRep) to the matching
 // __builtin_popcount variant.
 template <typename IntRep>
 inline IntRep pop_count(IntRep x) {
    static_assert(sizeof(IntRep) <= sizeof(unsigned long long),
                  "IntRep must be no larger than unsigned long long");
    if (sizeof(IntRep) <= sizeof(unsigned int))
        return __builtin_popcount(x);
    else if (sizeof(IntRep) <= sizeof(unsigned long))
        return __builtin_popcountl(x);
    else
        return __builtin_popcountll(x);
 }
 #endif
 // SplitMix generator with 64-bit output.
 // Template parameters:
 //   m1, m2, p, q, r - multipliers and shift amounts used by mix_gamma()
 //   m3, m4, s, t, u - multipliers and shift amounts used by mix64()
 // State is a counter (seed_) that advances by gamma_ on every draw; the
 // output is mix64() of the counter value before the advance.
 template <uint64_t m1, uint64_t m2, 
          unsigned int p, unsigned int q, unsigned int r,
          uint64_t m3, uint64_t m4,
          unsigned int s, unsigned int t, unsigned int u>
 class splitmix64_base {
 public:
    using result_type = uint64_t;
    static constexpr result_type min() { return 0; };
    static constexpr result_type max() { return ~result_type(0); };
 protected:
    uint64_t seed_;   // current counter value
    uint64_t gamma_;  // per-step increment; the constructor forces it odd
    // Derive an increment for a split generator from x: mix, force odd, and
    // flip alternate bits when x ^ (x >> 1) has fewer than 24 set bits.
    static uint64_t mix_gamma(uint64_t x) {
        x ^= x >> p;
        x *= m1; 
        x ^= x >> q;
        x *= m2;
        x ^= x >> r;
        x |= 1ul;
        int n = pop_count(x ^ (x >> 1));
        return (n < 24) ? x ^ 0xaaaaaaaaaaaaaaaa : x;
    }
    // Output mixing function: two xorshift-multiply rounds and a final xorshift.
    static uint64_t mix64(uint64_t x) {
        x ^= x >> s;
        x *= m3; 
        x ^= x >> t;
        x *= m4;
        x ^= x >> u;
        return x;
    }
    // Advance the counter by one step.
    void advance() {
        seed_ += gamma_;
    }
    // Return the current counter value, then advance by one step.
    uint64_t next_seed() {
        uint64_t result = seed_;
        advance();
        return result;
    }
 public:
    // Construct from a starting counter and an increment (made odd via | 1).
    splitmix64_base(uint64_t seed  = 0xbad0ff1ced15ea5e,
                    uint64_t gamma = 0x9e3779b97f4a7c15)
        : seed_(seed), gamma_(gamma | 1)
    {
        // Nothing (else) to do.
    }
    // Draw the next 64-bit value.
    uint64_t operator()() {
        return mix64(next_seed());
    }
    // Jump ahead by delta steps in one operation.
    void advance(uint64_t delta) {
        seed_ += delta * gamma_;
    }
    // Step backwards by delta steps (unsigned negation wraps modulo 2^64).
    void backstep(uint64_t delta) {
        advance(-delta);
    }
    // True when the counter is exactly zero.
    bool wrapped() {
        return seed_ == 0;
    }
    uint64_t seed() const {
        return seed_;
    }
    uint64_t gamma() const {
        return gamma_;
    }
    // Counter difference to `other`, multiplied by modular_inverse() of
    // other's increment.
    // NOTE(review): presumably meaningful only when both generators share the
    // same gamma - confirm.
    uint64_t operator-(const splitmix64_base& other) {
        return (seed_ - other.seed_) * modular_inverse(other.gamma_);
    }
    // Create a new generator whose counter is the next output of this one and
    // whose increment is mix_gamma() of the following counter value.
    splitmix64_base split() {
        uint64_t new_seed  = operator()();
        uint64_t new_gamma = mix_gamma(next_seed());
        return { new_seed, new_gamma };
    }
    // Equal when both the counter and the increment match.
    bool operator==(const splitmix64_base& rhs) {
        return (seed_ == rhs.seed_) && (gamma_ == rhs.gamma_);
    }
 };
 // SplitMix variant with 32-bit output, layered on a 64-bit SplitMix base.
 // Template parameters:
 //   m5, m6   - multipliers of the 32-bit output mix
 //   v, w     - shift amounts of the 32-bit output mix
 //   splitmix - base generator type; must provide next_seed(), split(),
 //              seed(), gamma() and a (seed, gamma) constructor
 template <uint64_t m5, uint64_t m6, unsigned int v, unsigned int w,
          typename splitmix>
 class splitmix32_base : public splitmix {
 public:
    using result_type = uint32_t;
    static constexpr result_type min() { return 0; };
    static constexpr result_type max() { return ~result_type(0); };
    using splitmix::splitmix;
    // Draw the next value: mix the counter and keep the upper 32 bits.
    result_type operator()() {
        uint64_t seed = splitmix::next_seed();
        seed ^= seed >> v;
        seed *= m5;
        seed ^= seed >> w;
        seed *= m6;
        return result_type(seed >> 32);
    }
    // Split off a new 32-bit generator.
    // The base split() returns the base type, and inherited constructors do
    // not provide a base-to-derived conversion, so the result is rebuilt
    // explicitly from the split generator's seed and gamma.
    splitmix32_base split() {
        splitmix parent = splitmix::split();
        return splitmix32_base(parent.seed(), parent.gamma());
    }
 };
 }
 // Concrete 64-bit generator: mix_gamma constants (m1, m2, shifts 33/33/33)
 // followed by mix64 constants (m3, m4, shifts 30/27/31).
 using splitmix64 = splitmix_detail::splitmix64_base<
                       0xff51afd7ed558ccdul, 0xc4ceb9fe1a85ec53ul,
                       33, 33, 33,
                       0xbf58476d1ce4e5b9ul, 0x94d049bb133111ebul,
                       30, 27, 31>;
 // Concrete 32-bit generator built on splitmix64: output-mix multipliers
 // (m5, m6) and shifts 33/28.
 using splitmix32 = splitmix_detail::splitmix32_base<
                       0x62a9d9ed799705f5ul, 0xcb24d0a5c88c35b3ul,
                       33, 28, splitmix64>;
 #endif // SPLITMIX_HPP_INCLUDED
--- a/test_scripts/pseudorandom_gens/splitmix32.c
+++ b/test_scripts/pseudorandom_gens/splitmix32.c
@ -0,0 +1,45 @@
 /* SplitMix32 test program
 * Copyright (c) 2022 Joel K. Pettersson
 * <joelkpettersson@gmail.com>.
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */
 #include "testwrite.h"
 #include "muvaror32.h"
 /*
  * Here's SplitMix32 for comparison and a quick test.
  */
 //FIBH32 = 2654435769U
 /**
  * Advance \p pos by FIBH32 and return the mixed value of the new position.
  */
 static inline uint32_t splitmix32(uint32_t *restrict pos) {
     uint32_t mixed;
     *pos += FIBH32;
     mixed = *pos;
     mixed = (mixed ^ (mixed >> 16)) * 0x85ebca6b;
     mixed = (mixed ^ (mixed >> 13)) * 0xc2b2ae35;
     return mixed ^ (mixed >> 16);
 }
 /*
  * Test driver: starting from position 0, passes successive SplitMix32
  * outputs to add_output() in an endless loop. The trailing return is never
  * reached.
  */
 int main(int argc, char *argv[]) {
     uint32_t pos = 0;
     while (1) {
         /* SplitMix32 test */
         add_output(splitmix32(&pos));
     }
     return 0;
 }
--- a/test_scripts/pseudorandom_gens/xoroshiro128plus.c
+++ b/test_scripts/pseudorandom_gens/xoroshiro128plus.c
@ -0,0 +1,108 @@
 /*  Written in 2016-2018 by David Blackman and Sebastiano Vigna (vigna@acm.org)
 To the extent possible under law, the author has dedicated all copyright
 and related and neighboring rights to this software to the public domain
 worldwide.
 Permission to use, copy, modify, and/or distribute this software for any
 purpose with or without fee is hereby granted.
 THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR
 IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
 #include <stdint.h>
 /* This is xoroshiro128+ 1.0, our best and fastest small-state generator
   for floating-point numbers, but its state space is large enough only
   for mild parallelism. We suggest to use its upper bits for
   floating-point generation, as it is slightly faster than
   xoroshiro128++/xoroshiro128**. It passes all tests we are aware of
   except for the four lower bits, which might fail linearity tests (and
   just those), so if low linear complexity is not considered an issue (as
   it is usually the case) it can be used to generate 64-bit outputs, too;
   moreover, this generator has a very mild Hamming-weight dependency
   making our test (http://prng.di.unimi.it/hwd.php) fail after 5 TB of
   output; we believe this slight bias cannot affect any application. If
   you are concerned, use xoroshiro128++, xoroshiro128** or xoshiro256+.
   We suggest to use a sign test to extract a random Boolean value, and
   right shifts to extract subsets of bits.
   The state must be seeded so that it is not everywhere zero. If you have
   a 64-bit seed, we suggest to seed a splitmix64 generator and use its
   output to fill s. 
   NOTE: the parameters (a=24, b=16, c=37) of this version give slightly
   better results in our test than the 2016 version (a=55, b=14, c=36).
 */
 /*
  * Rotate the 64-bit value x left by k bit positions.
  *
  * The rotation count is reduced modulo 64, so k == 0 (and any multiple of
  * 64) returns x unchanged. The plain form (x << k) | (x >> (64 - k)) would
  * shift by the full type width for k == 0, which is undefined behaviour
  * in C. For k in 1..63 the result is identical to that plain form.
  */
 static inline uint64_t rotl(const uint64_t x, int k) {
 	const unsigned n = (unsigned)k & 63u;
 	/* (64 - n) & 63 maps n == 0 to a right shift of 0 instead of 64. */
 	return (x << n) | (x >> ((64u - n) & 63u));
 }
 /* Generator state: two 64-bit words, read and rewritten by every next(). */
 static uint64_t s[2];
 /*
  * Return the sum of the two current state words, then replace the state
  * using the rotate/shift/xor step with constants a = 24, b = 16, c = 37.
  */
 uint64_t next(void) {
 	const uint64_t lo = s[0];
 	const uint64_t hi = s[1];
 	const uint64_t sum = lo + hi;   /* output is taken before the update */
 	const uint64_t folded = lo ^ hi;

 	s[0] = rotl(lo, 24) ^ folded ^ (folded << 16); /* a = 24, b = 16 */
 	s[1] = rotl(folded, 37);                       /* c = 37 */
 	return sum;
 }
 /* Jump function for the generator. It is equivalent to 2^64 calls to
    next() and can be used to produce 2^64 non-overlapping subsequences
    for parallel computations.

    Implementation: for every set bit of the JUMP constants, the current
    state is XORed into an accumulator; the generator is stepped once per
    bit either way. The accumulator then becomes the new state. */
 void jump(void) {
 	static const uint64_t JUMP[] = { 0xdf900294d8f554a5, 0x170865df4b3201fc };
 	const uint64_t *const end = JUMP + sizeof JUMP / sizeof *JUMP;
 	uint64_t acc0 = 0;
 	uint64_t acc1 = 0;

 	for (const uint64_t *word = JUMP; word != end; word++) {
 		for (int bit = 0; bit < 64; bit++) {
 			if ((*word >> bit) & UINT64_C(1)) {
 				acc0 ^= s[0];
 				acc1 ^= s[1];
 			}
 			next();
 		}
 	}

 	s[0] = acc0;
 	s[1] = acc1;
 }
 /* Long-jump function for the generator. It is equivalent to 2^96 calls
    to next() and can be used to produce 2^32 starting points, from each
    of which jump() will generate 2^32 non-overlapping subsequences for
    parallel distributed computations.

    Implementation: for every set bit of the LONG_JUMP constants, the
    current state is XORed into an accumulator; the generator is stepped
    once per bit either way. The accumulator then becomes the new state. */
 void long_jump(void) {
 	static const uint64_t LONG_JUMP[] = { 0xd2a98b26625eee7b, 0xdddf9b1090aa7ac1 };
 	const uint64_t *const end = LONG_JUMP + sizeof LONG_JUMP / sizeof *LONG_JUMP;
 	uint64_t acc0 = 0;
 	uint64_t acc1 = 0;

 	for (const uint64_t *word = LONG_JUMP; word != end; word++) {
 		for (int bit = 0; bit < 64; bit++) {
 			if ((*word >> bit) & UINT64_C(1)) {
 				acc0 ^= s[0];
 				acc1 ^= s[1];
 			}
 			next();
 		}
 	}

 	s[0] = acc0;
 	s[1] = acc1;
 }
--- a/test_scripts/pseudorandom_gens/xoroshiro64starstar.c
+++ b/test_scripts/pseudorandom_gens/xoroshiro64starstar.c
@ -0,0 +1,50 @@
 // https://prng.di.unimi.it/xoroshiro64starstar.c
 /*  Written in 2018 by David Blackman and Sebastiano Vigna (vigna@acm.org)
 To the extent possible under law, the author has dedicated all copyright
 and related and neighboring rights to this software to the public domain
 worldwide.
 Permission to use, copy, modify, and/or distribute this software for any
 purpose with or without fee is hereby granted.
 THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR
 IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
 #include <stdint.h>
 /* This is xoroshiro64** 1.0, our 32-bit all-purpose, rock-solid,
   small-state generator. It is extremely fast and it passes all tests we
   are aware of, but its state space is not large enough for any parallel
   application.
   For generating just single-precision (i.e., 32-bit) floating-point
   numbers, xoroshiro64* is even faster.
   The state must be seeded so that it is not everywhere zero. */
 /*
  * Rotate the 32-bit value x left by k bit positions.
  *
  * The rotation count is reduced modulo 32, so k == 0 (and any multiple of
  * 32) returns x unchanged. The plain form (x << k) | (x >> (32 - k)) would
  * shift by the full type width for k == 0, which is undefined behaviour
  * in C. For k in 1..31 the result is identical to that plain form.
  */
 static inline uint32_t rotl(const uint32_t x, int k) {
 	const unsigned n = (unsigned)k & 31u;
 	/* (32 - n) & 31 maps n == 0 to a right shift of 0 instead of 32. */
 	return (uint32_t)((x << n) | (x >> ((32u - n) & 31u)));
 }
 /* Generator state: two 32-bit words, read and rewritten by every next(). */
 static uint32_t s[2];
 /*
  * Return a scrambled form of the first state word (multiply by 0x9E3779BB,
  * rotate left by 5, multiply by 5), then replace the state using the
  * rotate/shift/xor step with constants a = 26, b = 9, c = 13.
  */
 uint32_t next(void) {
 	const uint32_t first = s[0];
 	const uint32_t folded = first ^ s[1];
 	/* Output depends only on the first word as it was before the update. */
 	const uint32_t scrambled = rotl(first * 0x9E3779BB, 5);

 	s[0] = rotl(first, 26) ^ folded ^ (folded << 9); /* a = 26, b = 9 */
 	s[1] = rotl(folded, 13);                         /* c = 13 */
 	return scrambled * 5;
 }