random routines

This commit is contained in:
2026-04-14 14:03:04 -04:00
parent 4530ed3603
commit caf347e8f5
45 changed files with 1086 additions and 1 deletions

View File

@ -736,7 +736,7 @@ __host__ __device__ cumat3 rotmat_from_axisangle(const cuvec3 &axis, const doubl
// Tests //
///////////
__host__ void test_cudavectf_logic1()
__host__ void test_cudavect_logic1()
{
// Placeholder test — body is intentionally empty; no checks performed yet.
}

View File

@ -0,0 +1,6 @@
#include <amsculib3/amsculib3.hpp>
namespace amscuda
{
// Intentionally empty translation unit: reserves the amscuda namespace for
// future definitions (keeps the file compiling as part of the build).
};

View File

@ -0,0 +1,130 @@
#include <amsculib3/amsculib3.hpp>
using namespace amscuda::random::xoroshiro;
namespace amscuda
{
namespace random
{
//Choosing xoroshiro64** as my default RNG due to 32 bit only operations
//Process-wide host-side RNG state; used whenever a caller passes state==NULL
//on the host path. Reseeded by rand_seed().
randstate_t global_rand_cpustate = xs64ss_state();
__host__ void rand_seed(const uint32_t seed)
{
    // (Re)initialize the host-side default generator stream from `seed`.
    xs64ss_state fresh(seed);
    global_rand_cpustate = fresh;
}
__host__ __device__ void rand_state_increment(const int32_t inc, randstate_t *state)
{
    // Perturb a generator state by `inc`, then advance once to mix the change in.
#ifdef __CUDA_ARCH__
    // GPU-specific code (device path). NOTE(review): no NULL fallback here —
    // device callers presumably must pass a valid state; confirm at call sites.
#else
    // CPU-specific code (host path): NULL selects the process-wide state.
    if(state==NULL) state = &global_rand_cpustate;
#endif
    xoroshiro::xs64ss_state* xs = (xoroshiro::xs64ss_state*)state;
    xs->low += inc;
    xs64ss_next(xs);
}
__host__ __device__ void rand_next(randstate_t *state)
{
    // Advance the generator one step, discarding the drawn value.
#ifdef __CUDA_ARCH__
    // GPU-specific code (device path)
#else
    // CPU-specific code (host path): NULL selects the process-wide state.
    if(state==NULL) state = &global_rand_cpustate;
#endif
    xoroshiro::xs64ss_state* xs = (xoroshiro::xs64ss_state*)state;
    xs64ss_next(xs);
}
__host__ __device__ uint32_t randui32(randstate_t *state)
{
    // Draw one full 32-bit value from the xoroshiro64** stream.
#ifdef __CUDA_ARCH__
    // GPU-specific code (device path)
#else
    // CPU-specific code (host path): NULL selects the process-wide state.
    if(state==NULL) state = &global_rand_cpustate;
#endif
    return xoroshiro::xs64ss_next((xoroshiro::xs64ss_state*)state);
}
__host__ __device__ int randint(int low, int high, randstate_t *state)
{
    // Uniform integer in [low, high). Returns `low` when the range is empty
    // or inverted (the original computed q%(high-low), UB for high<=low).
    int ret;
#ifdef __CUDA_ARCH__
    // GPU-specific code (device path)
#else
    // CPU-specific code (host path)
    if(state==NULL) state = &global_rand_cpustate;
#endif
    if(high<=low) return low;
    // Use the full 32-bit draw. The original reduced the draw to a 16-bit
    // slice ((x>>1)%(1<<16)) first, which silently broke any range wider
    // than 65536 values and compounded the modulo bias.
    uint32_t span = (uint32_t)(high - low);
    ret = (int)(randui32(state) % span) + low;
    return ret;
}
__host__ __device__ float randf(randstate_t *state)
{
    // Uniform float in [0, 1).
    float ret;
#ifdef __CUDA_ARCH__
    // GPU-specific code (device path)
#else
    // CPU-specific code (host path)
    if(state==NULL) state = &global_rand_cpustate;
#endif
    // Bug fix: randui32() spans the full [0, 2^32) range, so the original
    // division by 2^31 (2147483648.0f) produced values in [0, 2) — which in
    // turn fed logf() in randnf() with arguments > 1 and yielded NaNs.
    // Use the top 24 bits scaled by 2^-24: every result is exactly
    // representable in float and strictly below 1.0f.
    ret = (float)(randui32(state) >> 8) * (1.0f/16777216.0f);
    return ret;
}
__host__ __device__ double rand(randstate_t *state)
{
    // Uniform double in [0, 1).
    double ret;
#ifdef __CUDA_ARCH__
    // GPU-specific code (device path)
#else
    // CPU-specific code (host path)
    if(state==NULL) state = &global_rand_cpustate;
#endif
    // Bug fix: divide the full 32-bit draw by 2^32 (4294967296), not by the
    // original 2^31 float literal (2147483648.0f), which produced values in
    // [0, 2). Every uint32 is exact in double, so the result is < 1.0.
    ret = ((double)randui32(state)) / 4294967296.0;
    return ret;
}
__host__ __device__ float randnf(randstate_t *state)
{
    // Standard normal (mean 0, sd 1) float via the Box-Muller transform.
    float q1,q2;
#ifdef __CUDA_ARCH__
    // GPU-specific code (device path)
#else
    // CPU-specific code (host path)
    if(state==NULL) state = &global_rand_cpustate;
#endif
    q1 = randf(state);
    q2 = randf(state);
    // Guard: a draw of exactly 0 would send logf() to -inf.
    if(q1 <= 0.0f) q1 = 1.175494e-38f;
    // NOTE(review): assumes randf() yields [0,1); values above 1 would make
    // -2*logf(q1) negative and the sqrtf NaN — confirm randf's range.
    // Use cosf: the original called double-precision cos(), forcing a
    // float->double->float round trip on every draw.
    return (::sqrtf(-2.0f*::logf(q1))*::cosf(2.0f*pif*q2));
}
__host__ __device__ double randn(randstate_t *state)
{
    // Standard normal (mean 0, sd 1) double via the Box-Muller transform.
    double q1,q2;
#ifdef __CUDA_ARCH__
    // GPU-specific code (device path)
#else
    // CPU-specific code (host path)
    if(state==NULL) state = &global_rand_cpustate;
#endif
    q1 = rand(state);
    q2 = rand(state);
    // Guard: a draw of exactly 0 would send log() to -inf.
    if(q1 <= 0.0) q1 = 2.2250738585072014e-308;
    // NOTE(review): assumes rand() yields [0,1); values above 1 would make
    // -2*log(q1) negative and the sqrt NaN — confirm rand's range.
    return (::sqrt(-2.0*::log(q1))*::cos(2.0*pi*q2));
}
}; //end namespaces
};

View File

@ -0,0 +1,9 @@
#include <amsculib3/amsculib3.hpp>
namespace amscuda
{
namespace random
{
// Intentionally empty: placeholder translation unit for the random module.
}; //end namespaces
};

View File

@ -0,0 +1,9 @@
#include <amsculib3/amsculib3.hpp>
namespace amscuda
{
namespace random
{
// Intentionally empty: placeholder translation unit for the random module.
}; //end namespaces
};

View File

@ -0,0 +1,125 @@
#include <amsculib3/amsculib3.hpp>
namespace amscuda
{
namespace random
{
namespace xoroshiro
{
//ref: https://github.com/joelkp/ranoise/blob/main/splitmix32.c
// __device__ __host__ uint32_t splitmix32_next(uint32_t *splitmix32_state)
// {
// uint32_t ret;
// *splitmix32_state += 2654435769U;
// ret = *splitmix32_state;
// ret ^= ret >> 16;
// ret *= 0x85ebca6bU;
// ret ^= ret >> 13;
// ret *= 0xc2b2ae35U;
// ret ^= ret >> 16;
// return ret;
// }
//Better constants: see discussion below.
__device__ __host__ uint32_t splitmix32_next(uint32_t *splitmix32_state)
{
    // SplitMix32 step: advance the counter by the 32-bit golden-ratio
    // increment, then scramble with the improved xor/multiply finalizer
    // (0x21f0aaad / 0x735a2d97 — see the discussion quoted below).
    *splitmix32_state += 0x9e3779b9U;
    uint32_t z = *splitmix32_state;
    z = (z ^ (z >> 16)) * 0x21f0aaadU;
    z = (z ^ (z >> 15)) * 0x735a2d97U;
    z ^= z >> 15;
    return z;
}
__host__ __device__ void splitmix64_next(uint64_t *state)
{
    // Advance the SplitMix64 counter by the 64-bit golden-ratio increment.
    // (The output scrambler lives in splitmix64_nextint.)
    *state += 0x9E3779B97F4A7C15ULL;
}
__host__ __device__ uint64_t splitmix64_nextint(uint64_t *state)
{
    //ref: https://rosettacode.org/wiki/Pseudo-random_numbers/Splitmix64
    // Advance the counter, then apply the xor-shift/multiply finalizer to
    // decorrelate successive outputs.
    *state += 0x9E3779B97F4A7C15ULL;
    uint64_t z = *state;
    z = (z ^ (z >> 30)) * 0xbf58476d1ce4e5b9ULL;
    z = (z ^ (z >> 27)) * 0x94d049bb133111ebULL;
    return z ^ (z >> 31);
}
////////////////////
// Unsorted Notes //
////////////////////
//ref: https://github.com/joelkp/ranoise/blob/main/splitmix32.c
//2654435769U
//#define ROR32(x, r) \
//((uint32_t)(x) >> ((r) & 31) | (uint32_t)(x) << ((32-(r)) & 31))
//#define MUVAROR32(x, r, ro) \
//(((uint32_t)(x) | ((1<<((ro) & 31))|1)) * ROR32((x), (r)+(ro)))
// static inline uint32_t splitmix32(uint32_t *restrict pos) {
// uint32_t s = *pos += 2654435769U;
// s ^= s >> 16;
// s *= 0x85ebca6b;
// s ^= s >> 13;
// s *= 0xc2b2ae35;
// s ^= s >> 16;
// return s;
// }
// int main(int argc, char *argv[]) {
// uint32_t pos = 0;
// for (;;) {
// /* SplitMix32 test */
// add_output(splitmix32(&pos));
// }
// return 0;
// }
/*
//ref: https://stackoverflow.com/questions/17035441/looking-for-decent-quality-prng-with-only-32-bits-of-state
uint32_t splitmix32(void) {
uint32_t z = state += 0x9e3779b9;
z ^= z >> 15; // 16 for murmur3
z *= 0x85ebca6b;
z ^= z >> 13;
z *= 0xc2b2ae35;
return z ^= z >> 16;
}
Update: Better constants have been found that potentially make this a very good PRNG:
uint32_t splitmix32(void) {
uint32_t z = (state += 0x9e3779b9);
z ^= z >> 16; z *= 0x21f0aaad;
z ^= z >> 15; z *= 0x735a2d97;
z ^= z >> 15;
return z;
}
Mark Dickinson
Over a year ago
Any thoughts on the PCG family? (E.g., the generator called "PCG RXS M XS 32 (LCG)" in the PCG paper.)
bryc
Over a year ago
@MarkDickinson From what I've seen PCG relies on 64-bit math, and there are no fully 32-bit variants that I can find. I can't find any code specific to the generator you mentioned either. I'm sure PCG is a good generator, but I feel like its a poor choice for JavaScript or embedded systems. xoshiro128** for example is ideal in that regard, since it uses 32-bit integers only.
*/
};//end namespace splitmix
}; //end namespaces
};

View File

@ -0,0 +1,146 @@
#include <amsculib3/amsculib3.hpp>
namespace amscuda
{
namespace random
{
namespace xoroshiro
{
//////////////////
//Xoroshiro 64**//
//////////////////
__device__ __host__ xs64ss_state::xs64ss_state()
{
    // Default: all-zero state. Note the all-zero state is a fixed point of
    // xs64ss_next (every output is 0), so seed before drawing.
    low = 0;
    high = 0;
}
__device__ __host__ xs64ss_state::xs64ss_state(const uint32_t seed)
{
    // Expand the 32-bit seed into the 64-bit state via SplitMix32. The first
    // draw is discarded (presumably to avoid seeding the state with a value
    // trivially derived from the raw seed).
    uint32_t mixer = seed;
    (void)splitmix32_next(&mixer);
    low  = splitmix32_next(&mixer);
    high = splitmix32_next(&mixer);
}
__device__ __host__ xs64ss_state::xs64ss_state(const uint32_t _low, const uint32_t _high)
{
    // Adopt caller-provided state words verbatim.
    low = _low;
    high = _high;
}
//Shared host-side xoroshiro64** state; reset via xs64ss_seed().
xs64ss_state xs64ss_globalstate = xs64ss_state();
__host__ void xs64ss_seed(const uint32_t seed)
{
    // Reset the shared host-side xoroshiro64** stream from `seed`.
    xs64ss_state reseeded(seed);
    xs64ss_globalstate = reseeded;
}
__device__ __host__ static inline uint32_t xs64ss_rotl(const uint32_t x, int k) {
    // 32-bit rotate-left. Mask the count so k==0 (or any multiple of 32)
    // returns x instead of hitting the undefined 32-bit shift-by-32 in the
    // original `x >> (32 - k)`. Callers in this file pass k = 5, 13, 26, so
    // their results are unchanged.
    k &= 31;
    if (k == 0) return x;
    return (x << k) | (x >> (32 - k));
}
//implements the xoroshiro64** PRNG
__host__ __device__ uint32_t xs64ss_next(xs64ss_state *state)
{
    // One xoroshiro64** step: output scrambler rotl(s0*0x9E3779BB, 5)*5,
    // then the state update with the (26, 9, 13) shift triple.
    uint32_t s0 = state->low;
    uint32_t s1 = state->high;
    const uint32_t out = xs64ss_rotl(s0 * 0x9E3779BBU, 5) * 5;
    s1 ^= s0;
    state->low  = xs64ss_rotl(s0, 26) ^ s1 ^ (s1 << 9);
    state->high = xs64ss_rotl(s1, 13);
    return out;
}
//////////////////
//Xoroshiro 128+//
//////////////////
__device__ __host__ xs128pp_state::xs128pp_state()
{
    // Default: all-zero state. Zero is a fixed point of xs128pp_next
    // (outputs stay 0), so seed before drawing.
    low = 0;
    high = 0;
}
__device__ __host__ xs128pp_state::xs128pp_state(const uint64_t seed)
{
    // Expand the 64-bit seed through SplitMix64: one bare counter advance,
    // then two scrambled draws fill the two state words.
    uint64_t mixer = seed;
    splitmix64_next(&mixer);
    low  = splitmix64_nextint(&mixer);
    high = splitmix64_nextint(&mixer);
}
__device__ __host__ xs128pp_state::xs128pp_state(const uint64_t _low, const uint64_t _high)
{
    // Adopt the caller's state words directly.
    low = _low;
    high = _high;
}
//Shared host-side 128-bit generator state; reset via xs128pp_seed().
xs128pp_state xs128pp_globalstate = xs128pp_state();
__host__ __device__ uint64_t xs128pp_rotl(const uint64_t x, int k)
{
    // 64-bit rotate-left. Mask the count so k==0 (or any multiple of 64)
    // returns x instead of hitting the undefined 64-bit shift-by-64 in the
    // original `x >> (64 - k)`. Callers in this file pass k = 24, 37, so
    // their results are unchanged.
    k &= 63;
    if (k == 0) return x;
    return (x << k) | (x >> (64 - k));
}
__host__ __device__ uint64_t xs128pp_next(xs128pp_state* state)
{
    // NOTE(review): despite the "pp" in the name, the s0+s1 output and the
    // (24, 16, 37) shift triple are those of xoroshiro128+ — the section
    // header above says "Xoroshiro 128+", so the behavior looks intended.
    uint64_t s0 = state->low;
    uint64_t s1 = state->high;
    const uint64_t out = s0 + s1;
    s1 ^= s0;
    state->low  = xs128pp_rotl(s0, 24) ^ s1 ^ (s1 << 16);
    state->high = xs128pp_rotl(s1, 37);
    return out;
}
__host__ __device__ void xs128pp_jump(xs128pp_state* state)
{
// Jump function: advances `state` by a large fixed number of next() steps so
// independent subsequences can be handed to different threads/blocks.
// The JUMP words are the jump polynomial from the reference xoroshiro128+
// implementation (equivalent to 2^64 next() calls per the upstream docs —
// verify against prng.di.unimi.it if the shift constants are ever changed).
static const uint64_t JUMP[] = { 0xdf900294d8f554a5, 0x170865df4b3201fc };
uint64_t low = 0;
uint64_t high = 0;
int I;
int B;
// For each set bit of the polynomial, XOR the current state into the
// accumulator; the generator is stepped once per bit regardless.
for(I=0;I<2;I++)
{
for(B=0;B<64;B++)
{
if(JUMP[I] & (1ULL<<B))
{
low ^= state->low;
high ^= state->high;
}
xs128pp_next(state);
}
}
// The accumulated words become the jumped-ahead state.
state->low = low;
state->high = high;
return;
}
__host__ void xs128pp_seed(uint64_t seed)
{
    // Reset the shared host-side 128-bit stream from `seed`.
    xs128pp_state fresh(seed);
    xs128pp_globalstate = fresh;
}
//To have a different implementation for host and device functions
// __host__ __device__ void my_function() {
// #ifdef __CUDA_ARCH__
// // GPU-specific code (device path)
// #else
// // CPU-specific code (host path)
// #endif
// }
}; //end namespaces
};
};