random routines

This commit is contained in:
2026-04-14 14:03:04 -04:00
parent 4530ed3603
commit caf347e8f5
45 changed files with 1086 additions and 1 deletions

View File

@ -736,7 +736,7 @@ __host__ __device__ cumat3 rotmat_from_axisangle(const cuvec3 &axis, const doubl
// Tests //
///////////
__host__ void test_cudavectf_logic1()
__host__ void test_cudavect_logic1()
{
// Placeholder test — body is intentionally empty; no checks performed yet.
}

View File

@ -0,0 +1,6 @@
#include <amsculib3/amsculib3.hpp>
namespace amscuda
{
// Intentionally empty translation unit: reserves the amscuda namespace for
// future definitions (keeps the file compiling as part of the build).
};

View File

@ -0,0 +1,130 @@
#include <amsculib3/amsculib3.hpp>
using namespace amscuda::random::xoroshiro;
namespace amscuda
{
namespace random
{
//Choosing xoroshiro64** as my default RNG due to 32 bit only operations
//Process-wide host-side RNG state; used whenever a caller passes state==NULL
//on the host path. Reseeded by rand_seed().
randstate_t global_rand_cpustate = xs64ss_state();
__host__ void rand_seed(const uint32_t seed)
{
    // (Re)initialize the host-side default generator stream from `seed`.
    xs64ss_state fresh(seed);
    global_rand_cpustate = fresh;
}
__host__ __device__ void rand_state_increment(const int32_t inc, randstate_t *state)
{
    // Perturb a generator state by `inc`, then advance once to mix the change in.
#ifdef __CUDA_ARCH__
    // GPU-specific code (device path). NOTE(review): no NULL fallback here —
    // device callers presumably must pass a valid state; confirm at call sites.
#else
    // CPU-specific code (host path): NULL selects the process-wide state.
    if(state==NULL) state = &global_rand_cpustate;
#endif
    xoroshiro::xs64ss_state* xs = (xoroshiro::xs64ss_state*)state;
    xs->low += inc;
    xs64ss_next(xs);
}
__host__ __device__ void rand_next(randstate_t *state)
{
    // Advance the generator one step, discarding the drawn value.
#ifdef __CUDA_ARCH__
    // GPU-specific code (device path)
#else
    // CPU-specific code (host path): NULL selects the process-wide state.
    if(state==NULL) state = &global_rand_cpustate;
#endif
    xoroshiro::xs64ss_state* xs = (xoroshiro::xs64ss_state*)state;
    xs64ss_next(xs);
}
__host__ __device__ uint32_t randui32(randstate_t *state)
{
    // Draw one full 32-bit value from the xoroshiro64** stream.
#ifdef __CUDA_ARCH__
    // GPU-specific code (device path)
#else
    // CPU-specific code (host path): NULL selects the process-wide state.
    if(state==NULL) state = &global_rand_cpustate;
#endif
    return xoroshiro::xs64ss_next((xoroshiro::xs64ss_state*)state);
}
__host__ __device__ int randint(int low, int high, randstate_t *state)
{
    // Uniform integer in [low, high). Returns `low` when the range is empty
    // or inverted (the original computed q%(high-low), UB for high<=low).
    int ret;
#ifdef __CUDA_ARCH__
    // GPU-specific code (device path)
#else
    // CPU-specific code (host path)
    if(state==NULL) state = &global_rand_cpustate;
#endif
    if(high<=low) return low;
    // Use the full 32-bit draw. The original reduced the draw to a 16-bit
    // slice ((x>>1)%(1<<16)) first, which silently broke any range wider
    // than 65536 values and compounded the modulo bias.
    uint32_t span = (uint32_t)(high - low);
    ret = (int)(randui32(state) % span) + low;
    return ret;
}
__host__ __device__ float randf(randstate_t *state)
{
    // Uniform float in [0, 1).
    float ret;
#ifdef __CUDA_ARCH__
    // GPU-specific code (device path)
#else
    // CPU-specific code (host path)
    if(state==NULL) state = &global_rand_cpustate;
#endif
    // Bug fix: randui32() spans the full [0, 2^32) range, so the original
    // division by 2^31 (2147483648.0f) produced values in [0, 2) — which in
    // turn fed logf() in randnf() with arguments > 1 and yielded NaNs.
    // Use the top 24 bits scaled by 2^-24: every result is exactly
    // representable in float and strictly below 1.0f.
    ret = (float)(randui32(state) >> 8) * (1.0f/16777216.0f);
    return ret;
}
__host__ __device__ double rand(randstate_t *state)
{
    // Uniform double in [0, 1).
    double ret;
#ifdef __CUDA_ARCH__
    // GPU-specific code (device path)
#else
    // CPU-specific code (host path)
    if(state==NULL) state = &global_rand_cpustate;
#endif
    // Bug fix: divide the full 32-bit draw by 2^32 (4294967296), not by the
    // original 2^31 float literal (2147483648.0f), which produced values in
    // [0, 2). Every uint32 is exact in double, so the result is < 1.0.
    ret = ((double)randui32(state)) / 4294967296.0;
    return ret;
}
__host__ __device__ float randnf(randstate_t *state)
{
    // Standard normal (mean 0, sd 1) float via the Box-Muller transform.
    float q1,q2;
#ifdef __CUDA_ARCH__
    // GPU-specific code (device path)
#else
    // CPU-specific code (host path)
    if(state==NULL) state = &global_rand_cpustate;
#endif
    q1 = randf(state);
    q2 = randf(state);
    // Guard: a draw of exactly 0 would send logf() to -inf.
    if(q1 <= 0.0f) q1 = 1.175494e-38f;
    // NOTE(review): assumes randf() yields [0,1); values above 1 would make
    // -2*logf(q1) negative and the sqrtf NaN — confirm randf's range.
    // Use cosf: the original called double-precision cos(), forcing a
    // float->double->float round trip on every draw.
    return (::sqrtf(-2.0f*::logf(q1))*::cosf(2.0f*pif*q2));
}
__host__ __device__ double randn(randstate_t *state)
{
    // Standard normal (mean 0, sd 1) double via the Box-Muller transform.
    double q1,q2;
#ifdef __CUDA_ARCH__
    // GPU-specific code (device path)
#else
    // CPU-specific code (host path)
    if(state==NULL) state = &global_rand_cpustate;
#endif
    q1 = rand(state);
    q2 = rand(state);
    // Guard: a draw of exactly 0 would send log() to -inf.
    if(q1 <= 0.0) q1 = 2.2250738585072014e-308;
    // NOTE(review): assumes rand() yields [0,1); values above 1 would make
    // -2*log(q1) negative and the sqrt NaN — confirm rand's range.
    return (::sqrt(-2.0*::log(q1))*::cos(2.0*pi*q2));
}
}; //end namespaces
};

View File

@ -0,0 +1,9 @@
#include <amsculib3/amsculib3.hpp>
namespace amscuda
{
namespace random
{
// Intentionally empty: placeholder translation unit for the random module.
}; //end namespaces
};

View File

@ -0,0 +1,9 @@
#include <amsculib3/amsculib3.hpp>
namespace amscuda
{
namespace random
{
// Intentionally empty: placeholder translation unit for the random module.
}; //end namespaces
};

View File

@ -0,0 +1,125 @@
#include <amsculib3/amsculib3.hpp>
namespace amscuda
{
namespace random
{
namespace xoroshiro
{
//ref: https://github.com/joelkp/ranoise/blob/main/splitmix32.c
// __device__ __host__ uint32_t splitmix32_next(uint32_t *splitmix32_state)
// {
// uint32_t ret;
// *splitmix32_state += 2654435769U;
// ret = *splitmix32_state;
// ret ^= ret >> 16;
// ret *= 0x85ebca6bU;
// ret ^= ret >> 13;
// ret *= 0xc2b2ae35U;
// ret ^= ret >> 16;
// return ret;
// }
//Better constants: see discussion below.
__device__ __host__ uint32_t splitmix32_next(uint32_t *splitmix32_state)
{
    // SplitMix32 step: advance the counter by the 32-bit golden-ratio
    // increment, then scramble with the improved xor/multiply finalizer
    // (0x21f0aaad / 0x735a2d97 — see the discussion quoted below).
    *splitmix32_state += 0x9e3779b9U;
    uint32_t z = *splitmix32_state;
    z = (z ^ (z >> 16)) * 0x21f0aaadU;
    z = (z ^ (z >> 15)) * 0x735a2d97U;
    z ^= z >> 15;
    return z;
}
__host__ __device__ void splitmix64_next(uint64_t *state)
{
    // Advance the SplitMix64 counter by the 64-bit golden-ratio increment.
    // (The output scrambler lives in splitmix64_nextint.)
    *state += 0x9E3779B97F4A7C15ULL;
}
__host__ __device__ uint64_t splitmix64_nextint(uint64_t *state)
{
    //ref: https://rosettacode.org/wiki/Pseudo-random_numbers/Splitmix64
    // Advance the counter, then apply the xor-shift/multiply finalizer to
    // decorrelate successive outputs.
    *state += 0x9E3779B97F4A7C15ULL;
    uint64_t z = *state;
    z = (z ^ (z >> 30)) * 0xbf58476d1ce4e5b9ULL;
    z = (z ^ (z >> 27)) * 0x94d049bb133111ebULL;
    return z ^ (z >> 31);
}
////////////////////
// Unsorted Notes //
////////////////////
//ref: https://github.com/joelkp/ranoise/blob/main/splitmix32.c
//2654435769U
//#define ROR32(x, r) \
//((uint32_t)(x) >> ((r) & 31) | (uint32_t)(x) << ((32-(r)) & 31))
//#define MUVAROR32(x, r, ro) \
//(((uint32_t)(x) | ((1<<((ro) & 31))|1)) * ROR32((x), (r)+(ro)))
// static inline uint32_t splitmix32(uint32_t *restrict pos) {
// uint32_t s = *pos += 2654435769U;
// s ^= s >> 16;
// s *= 0x85ebca6b;
// s ^= s >> 13;
// s *= 0xc2b2ae35;
// s ^= s >> 16;
// return s;
// }
// int main(int argc, char *argv[]) {
// uint32_t pos = 0;
// for (;;) {
// /* SplitMix32 test */
// add_output(splitmix32(&pos));
// }
// return 0;
// }
/*
//ref: https://stackoverflow.com/questions/17035441/looking-for-decent-quality-prng-with-only-32-bits-of-state
uint32_t splitmix32(void) {
uint32_t z = state += 0x9e3779b9;
z ^= z >> 15; // 16 for murmur3
z *= 0x85ebca6b;
z ^= z >> 13;
z *= 0xc2b2ae35;
return z ^= z >> 16;
}
Update: Better constants have been found that potentially make this a very good PRNG:
uint32_t splitmix32(void) {
uint32_t z = (state += 0x9e3779b9);
z ^= z >> 16; z *= 0x21f0aaad;
z ^= z >> 15; z *= 0x735a2d97;
z ^= z >> 15;
return z;
}
Mark Dickinson
Over a year ago
Any thoughts on the PCG family? (E.g., the generator called "PCG RXS M XS 32 (LCG)" in the PCG paper.)
bryc
Over a year ago
@MarkDickinson From what I've seen PCG relies on 64-bit math, and there are no fully 32-bit variants that I can find. I can't find any code specific to the generator you mentioned either. I'm sure PCG is a good generator, but I feel like its a poor choice for JavaScript or embedded systems. xoshiro128** for example is ideal in that regard, since it uses 32-bit integers only.
*/
};//end namespace splitmix
}; //end namespaces
};

View File

@ -0,0 +1,146 @@
#include <amsculib3/amsculib3.hpp>
namespace amscuda
{
namespace random
{
namespace xoroshiro
{
//////////////////
//Xoroshiro 64**//
//////////////////
__device__ __host__ xs64ss_state::xs64ss_state()
{
    // Default: all-zero state. Note the all-zero state is a fixed point of
    // xs64ss_next (every output is 0), so seed before drawing.
    low = 0;
    high = 0;
}
__device__ __host__ xs64ss_state::xs64ss_state(const uint32_t seed)
{
    // Expand the 32-bit seed into the 64-bit state via SplitMix32. The first
    // draw is discarded (presumably to avoid seeding the state with a value
    // trivially derived from the raw seed).
    uint32_t mixer = seed;
    (void)splitmix32_next(&mixer);
    low  = splitmix32_next(&mixer);
    high = splitmix32_next(&mixer);
}
__device__ __host__ xs64ss_state::xs64ss_state(const uint32_t _low, const uint32_t _high)
{
    // Adopt caller-provided state words verbatim.
    low = _low;
    high = _high;
}
//Shared host-side xoroshiro64** state; reset via xs64ss_seed().
xs64ss_state xs64ss_globalstate = xs64ss_state();
__host__ void xs64ss_seed(const uint32_t seed)
{
    // Reset the shared host-side xoroshiro64** stream from `seed`.
    xs64ss_state reseeded(seed);
    xs64ss_globalstate = reseeded;
}
__device__ __host__ static inline uint32_t xs64ss_rotl(const uint32_t x, int k) {
    // 32-bit rotate-left. Mask the count so k==0 (or any multiple of 32)
    // returns x instead of hitting the undefined 32-bit shift-by-32 in the
    // original `x >> (32 - k)`. Callers in this file pass k = 5, 13, 26, so
    // their results are unchanged.
    k &= 31;
    if (k == 0) return x;
    return (x << k) | (x >> (32 - k));
}
//implements the xoroshiro64** PRNG
__host__ __device__ uint32_t xs64ss_next(xs64ss_state *state)
{
    // One xoroshiro64** step: output scrambler rotl(s0*0x9E3779BB, 5)*5,
    // then the state update with the (26, 9, 13) shift triple.
    uint32_t s0 = state->low;
    uint32_t s1 = state->high;
    const uint32_t out = xs64ss_rotl(s0 * 0x9E3779BBU, 5) * 5;
    s1 ^= s0;
    state->low  = xs64ss_rotl(s0, 26) ^ s1 ^ (s1 << 9);
    state->high = xs64ss_rotl(s1, 13);
    return out;
}
//////////////////
//Xoroshiro 128+//
//////////////////
__device__ __host__ xs128pp_state::xs128pp_state()
{
    // Default: all-zero state. Zero is a fixed point of xs128pp_next
    // (outputs stay 0), so seed before drawing.
    low = 0;
    high = 0;
}
__device__ __host__ xs128pp_state::xs128pp_state(const uint64_t seed)
{
    // Expand the 64-bit seed through SplitMix64: one bare counter advance,
    // then two scrambled draws fill the two state words.
    uint64_t mixer = seed;
    splitmix64_next(&mixer);
    low  = splitmix64_nextint(&mixer);
    high = splitmix64_nextint(&mixer);
}
__device__ __host__ xs128pp_state::xs128pp_state(const uint64_t _low, const uint64_t _high)
{
    // Adopt the caller's state words directly.
    low = _low;
    high = _high;
}
//Shared host-side 128-bit generator state; reset via xs128pp_seed().
xs128pp_state xs128pp_globalstate = xs128pp_state();
__host__ __device__ uint64_t xs128pp_rotl(const uint64_t x, int k)
{
    // 64-bit rotate-left. Mask the count so k==0 (or any multiple of 64)
    // returns x instead of hitting the undefined 64-bit shift-by-64 in the
    // original `x >> (64 - k)`. Callers in this file pass k = 24, 37, so
    // their results are unchanged.
    k &= 63;
    if (k == 0) return x;
    return (x << k) | (x >> (64 - k));
}
__host__ __device__ uint64_t xs128pp_next(xs128pp_state* state)
{
    // NOTE(review): despite the "pp" in the name, the s0+s1 output and the
    // (24, 16, 37) shift triple are those of xoroshiro128+ — the section
    // header above says "Xoroshiro 128+", so the behavior looks intended.
    uint64_t s0 = state->low;
    uint64_t s1 = state->high;
    const uint64_t out = s0 + s1;
    s1 ^= s0;
    state->low  = xs128pp_rotl(s0, 24) ^ s1 ^ (s1 << 16);
    state->high = xs128pp_rotl(s1, 37);
    return out;
}
__host__ __device__ void xs128pp_jump(xs128pp_state* state)
{
// Jump function: advances `state` by a large fixed number of next() steps so
// independent subsequences can be handed to different threads/blocks.
// The JUMP words are the jump polynomial from the reference xoroshiro128+
// implementation (equivalent to 2^64 next() calls per the upstream docs —
// verify against prng.di.unimi.it if the shift constants are ever changed).
static const uint64_t JUMP[] = { 0xdf900294d8f554a5, 0x170865df4b3201fc };
uint64_t low = 0;
uint64_t high = 0;
int I;
int B;
// For each set bit of the polynomial, XOR the current state into the
// accumulator; the generator is stepped once per bit regardless.
for(I=0;I<2;I++)
{
for(B=0;B<64;B++)
{
if(JUMP[I] & (1ULL<<B))
{
low ^= state->low;
high ^= state->high;
}
xs128pp_next(state);
}
}
// The accumulated words become the jumped-ahead state.
state->low = low;
state->high = high;
return;
}
__host__ void xs128pp_seed(uint64_t seed)
{
    // Reset the shared host-side 128-bit stream from `seed`.
    xs128pp_state fresh(seed);
    xs128pp_globalstate = fresh;
}
//To have a different implementation for host and device functions
// __host__ __device__ void my_function() {
// #ifdef __CUDA_ARCH__
// // GPU-specific code (device path)
// #else
// // CPU-specific code (host path)
// #endif
// }
}; //end namespaces
};
};