host random buffer fillers

This commit is contained in:
2026-04-14 21:20:49 -04:00
parent b0121a2f83
commit 361ee4c6da
42 changed files with 540 additions and 57 deletions

View File

@ -17,7 +17,7 @@ builddir = "./build_linux64"
doinstall = True #copies the build_output to the install dir when finished doinstall = True #copies the build_output to the install dir when finished
cc = "nvcc" #compiler cc = "nvcc" #compiler
cflags = "-dc --compiler-options '-fPIC -O3'" cflags = "-dc --compiler-options '-fPIC -O3'"
libraries = "-l{}".format(libname) libraries = "-l{} -lpthread".format(libname)
libdirs = "-L{} -L{}/lib -L{}/lib".format(builddir,commondir,depdir) libdirs = "-L{} -L{}/lib -L{}/lib".format(builddir,commondir,depdir)
linkerflags = " -Xlinker=-rpath,." linkerflags = " -Xlinker=-rpath,."
srcexts = [".c",".cpp",".cu"] srcexts = [".c",".cpp",".cu"]

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -7,7 +7,12 @@
#include <math.h> #include <math.h>
#include <stdint.h> #include <stdint.h>
#include <time.h> #include <time.h>
//C++ standard library headers
#include <new> #include <new>
#include <thread>
#include <functional>
#include <mutex>
#include <cuda_runtime_api.h> //where all the cuda functions live #include <cuda_runtime_api.h> //where all the cuda functions live
#include <cuda_runtime.h> #include <cuda_runtime.h>
@ -41,12 +46,18 @@ namespace amscuda
//default numthreads to execute on cpu //default numthreads to execute on cpu
AMSCU_CONST static const int amscu_defcputhreads = 8; AMSCU_CONST static const int amscu_defcputhreads = 8;
AMSCU_CONST static const int amscu_success = 1;
AMSCU_CONST static const int amscu_meh = 0;
AMSCU_CONST static const int amscu_failure = -1;
}; //end namespace amscuda }; //end namespace amscuda
//Components //Components
#include <amsculib3/amscu_cudafunctions.hpp> #include <amsculib3/amscu_cudafunctions.hpp>
#include <amsculib3/math/amscumath.hpp> #include <amsculib3/math/amscumath.hpp>
#include <amsculib3/geom/amscugeom.hpp> #include <amsculib3/geom/amscugeom.hpp>
#include <amsculib3/util/amscu_util.hpp>
#include <amsculib3/amscuarray.hpp> #include <amsculib3/amscuarray.hpp>
#include <amsculib3/amscuda_binarrrw.hpp> #include <amsculib3/amscuda_binarrrw.hpp>

View File

@ -34,18 +34,18 @@ __host__ __device__ float randnf(randstate_t *state = NULL);
__host__ __device__ double randn(randstate_t *state = NULL); __host__ __device__ double randn(randstate_t *state = NULL);
//Operations to fill a host buffer with random values //Operations to fill a host buffer with random values
__host__ int hbuff_randf(float *hbuffer, int64_t size, randstate_t *state = NULL); __host__ int hbuff_randf(float *hbuffer, int size, randstate_t *state = NULL);
__host__ int hbuff_rand(double *hbuffer, int64_t size, randstate_t *state = NULL); __host__ int hbuff_rand(double *hbuffer, int size, randstate_t *state = NULL);
__host__ int hbuff_randnf(float *hbuffer, int64_t size, randstate_t *state = NULL); __host__ int hbuff_randnf(float *hbuffer, int size, randstate_t *state = NULL);
__host__ int hbuff_randn(double *hbuffer, int64_t size, randstate_t *state = NULL); __host__ int hbuff_randn(double *hbuffer, int size, randstate_t *state = NULL);
__host__ int hbuff_randint(int *hbuffer, int64_t size, int low, int high, randstate_t *state = NULL); __host__ int hbuff_randint(int *hbuffer, int size, int low, int high, randstate_t *state = NULL);
//Operations to fill a device buffer with random values //Operations to fill a device buffer with random values
__host__ int dbuff_randf(float *hbuffer, int64_t size, randstate_t *state = NULL); __host__ int dbuff_randf(float *dbuffer, int size, randstate_t *state = NULL);
__host__ int dbuff_rand(double *hbuffer, int64_t size, randstate_t *state = NULL); __host__ int dbuff_rand(double *dbuffer, int size, randstate_t *state = NULL);
__host__ int dbuff_randnf(float *hbuffer, int64_t size, randstate_t *state = NULL); __host__ int dbuff_randnf(float *dbuffer, int size, randstate_t *state = NULL);
__host__ int dbuff_randn(double *hbuffer, int64_t size, randstate_t *state = NULL); __host__ int dbuff_randn(double *dbuffer, int size, randstate_t *state = NULL);
__host__ int dbuff_randint(int *hbuffer, int64_t size, int low, int high, randstate_t *state = NULL); __host__ int dbuff_randint(int *dbuffer, int size, int low, int high, randstate_t *state = NULL);
//Tests //Tests

View File

@ -0,0 +1,22 @@
#ifndef __AMSCU_CPUTHREADING_HPP__
#define __AMSCU_CPUTHREADING_HPP__
namespace amscuda
{
namespace util
{
//Plans how many CPU threads to use for a problem of probsize elements.
//Always returns at least 1.
int amscpu_cputhreading_threadplan(int probsize);
//Runs a callable on a dynamically chosen number of CPU threads and waits for all of them.
//Each invocation is fptr(threadnum, nthreads, otherargs...), where threadnum is the index of
// the invocation and nthreads is the total number of invocations.
//psize (the problem size) must be supplied; it is passed to amscpu_cputhreading_threadplan
// to determine the number of threads to use for execution.
//Returns amscu_success, or amscu_failure if a worker thread could not be created.
template<typename callable, typename ... argst> int threaded_execute(callable &&fptr, int64_t psize, argst&&... args);
};
};
#include <amsculib3/util/amscu_cputhreading_impl.hpp>
#endif

View File

@ -0,0 +1,73 @@
#ifndef __AMSCU_CPUTHREADING_IMPL_HPP__
#define __AMSCU_CPUTHREADING_IMPL_HPP__
namespace amscuda
{
namespace util
{
//Runs fptr(threadnum, nthreads, args...) on a planned number of CPU threads and joins them.
//
// fptr  : callable invoked once per thread with its index, the thread count, then args
// psize : problem size, handed to amscpu_cputhreading_threadplan to pick the thread count
// args  : extra arguments; every worker thread receives its own copy
//
//Returns amscu_success, or amscu_failure if any worker thread could not be created.
//When the plan yields a single thread, fptr is called directly on the calling thread.
template<typename callable, typename ... argst> int threaded_execute(callable &&fptr, int64_t psize, argst&&... args)
{
    int ret = amscu_success;
    int I;

    //The planner takes an int: clamp so an oversized psize cannot wrap into a
    // small or negative value when narrowed.
    const int64_t planmax = 2147483647;
    int nthreads = amscpu_cputhreading_threadplan((int)((psize>planmax) ? planmax : psize));

    if(nthreads<=1)
    {
        nthreads = 1;
        I = 0;
        // std::invoke is a C++17 feature (and mingw8 complains even so), so the
        // callable is simply called. The arguments are forwarded exactly once here.
        fptr(I,nthreads,std::forward<argst>(args)...);
        return ret;
    }

    std::vector<std::thread> threads;
    try
    {
        threads.reserve(nthreads);
        for(I=0;I<nthreads;I++)
        {
            //fptr and args are passed as lvalues on purpose: forwarding them inside
            // the loop would move from them on the first iteration and hand
            // moved-from values to every later thread.
            threads.emplace_back(fptr,I,nthreads,args...);
        }
    }
    catch(...)
    {
        //thread creation (or copying an argument) failed: report failure, but
        // still fall through so the threads that did start are joined
        ret = amscu_failure;
    }

    for(std::thread &worker : threads)
    {
        if(worker.joinable()) worker.join();
    }

    return ret;
}
};
};
#endif

View File

@ -0,0 +1,15 @@
#ifndef __AMSCU_UTIL_CUH__
#define __AMSCU_UTIL_CUH__
namespace amscuda
{
namespace util
{
};
};
#include <amsculib3/util/amscu_cputhreading.hpp>
#endif

View File

@ -8,12 +8,12 @@ namespace random
{ {
//Choosing xoroshiro64** as my default RNG due to 32 bit only operations //Choosing xoroshiro64** as my default RNG due to 32 bit only operations
randstate_t global_rand_cpustate = xs64ss_state(); randstate_t global_randstate = xs64ss_state();
__host__ void rand_seed(const uint32_t seed) __host__ void rand_seed(const uint32_t seed)
{ {
global_rand_cpustate = xs64ss_state(seed); global_randstate = xs64ss_state(seed);
} }
__host__ __device__ void rand_state_increment(const int32_t inc, randstate_t *state) __host__ __device__ void rand_state_increment(const int32_t inc, randstate_t *state)
@ -22,7 +22,7 @@ namespace random
// GPU-specific code (device path) // GPU-specific code (device path)
#else #else
// CPU-specific code (host path) // CPU-specific code (host path)
if(state==NULL) state = &global_rand_cpustate; if(state==NULL) state = &global_randstate;
#endif #endif
xoroshiro::xs64ss_state* s2 = (xoroshiro::xs64ss_state*)state; xoroshiro::xs64ss_state* s2 = (xoroshiro::xs64ss_state*)state;
s2->low += inc; s2->low += inc;
@ -36,7 +36,7 @@ namespace random
// GPU-specific code (device path) // GPU-specific code (device path)
#else #else
// CPU-specific code (host path) // CPU-specific code (host path)
if(state==NULL) state = &global_rand_cpustate; if(state==NULL) state = &global_randstate;
#endif #endif
xs64ss_next((xoroshiro::xs64ss_state*)state); xs64ss_next((xoroshiro::xs64ss_state*)state);
return; return;
@ -49,7 +49,7 @@ namespace random
// GPU-specific code (device path) // GPU-specific code (device path)
#else #else
// CPU-specific code (host path) // CPU-specific code (host path)
if(state==NULL) state = &global_rand_cpustate; if(state==NULL) state = &global_randstate;
#endif #endif
ret = xoroshiro::xs64ss_next((xoroshiro::xs64ss_state*)state); ret = xoroshiro::xs64ss_next((xoroshiro::xs64ss_state*)state);
@ -63,7 +63,7 @@ namespace random
// GPU-specific code (device path) // GPU-specific code (device path)
#else #else
// CPU-specific code (host path) // CPU-specific code (host path)
if(state==NULL) state = &global_rand_cpustate; if(state==NULL) state = &global_randstate;
#endif #endif
int32_t q = (int)((randui32(state)>>1U)%(1U<<16U)); int32_t q = (int)((randui32(state)>>1U)%(1U<<16U));
ret = (q%(high-low))+low; ret = (q%(high-low))+low;
@ -77,7 +77,7 @@ namespace random
// GPU-specific code (device path) // GPU-specific code (device path)
#else #else
// CPU-specific code (host path) // CPU-specific code (host path)
if(state==NULL) state = &global_rand_cpustate; if(state==NULL) state = &global_randstate;
#endif #endif
ret= ((float)randui32(state))/(4294967296.0f); ret= ((float)randui32(state))/(4294967296.0f);
return ret; return ret;
@ -90,7 +90,7 @@ namespace random
// GPU-specific code (device path) // GPU-specific code (device path)
#else #else
// CPU-specific code (host path) // CPU-specific code (host path)
if(state==NULL) state = &global_rand_cpustate; if(state==NULL) state = &global_randstate;
#endif #endif
ret= ((double)randui32(state))/(4294967296.0f); ret= ((double)randui32(state))/(4294967296.0f);
return ret; return ret;
@ -103,7 +103,7 @@ namespace random
// GPU-specific code (device path) // GPU-specific code (device path)
#else #else
// CPU-specific code (host path) // CPU-specific code (host path)
if(state==NULL) state = &global_rand_cpustate; if(state==NULL) state = &global_randstate;
#endif #endif
q1 = randf(state); q1 = randf(state);
@ -118,7 +118,7 @@ namespace random
// GPU-specific code (device path) // GPU-specific code (device path)
#else #else
// CPU-specific code (host path) // CPU-specific code (host path)
if(state==NULL) state = &global_rand_cpustate; if(state==NULL) state = &global_randstate;
#endif #endif
q1 = rand(state); q1 = rand(state);

View File

@ -4,6 +4,290 @@ namespace amscuda
{ {
namespace random namespace random
{ {
//Per-thread worker: fills this thread's slice of buffer with values drawn from rf(seed).
//
// threadnum : index of this worker, 0 <= threadnum < nthreads
// nthreads  : total number of workers sharing the buffer
// rf        : generator called as rf(seed) for every element
// buffer    : output array of N elements
// N         : total number of elements in buffer
// seed      : random state passed to rf
//
//Slices are N/nthreads elements long (at least 1). The last worker also takes whatever
// remains up to N, so the buffer is fully covered when N is not a multiple of nthreads.
template<typename dtype, typename randfunc> void rand_threadfunc1(
    int threadnum,
    int nthreads,
    randfunc &&rf,
    dtype *buffer,
    int64_t N,
    randstate_t *seed
)
{
    int64_t I,Is,I0,I1;

    //nothing to do, or an invalid worker layout (also avoids dividing by zero)
    if(N<=0 || nthreads<1 || threadnum<0 || threadnum>=nthreads) return;

    Is = N/nthreads;
    if(Is<1) Is = 1;

    I0 = Is*(threadnum);
    I1 = Is*(threadnum+1);
    if(threadnum==nthreads-1) I1 = N; //last worker picks up the remainder
    if(I1>N) I1 = N;

    for(I=I0;I<I1;I++)
    {
        buffer[I] = rf(seed);
    }

    return;
}
//Fills buffer with N random values by running fptr on a planned number of CPU threads.
//
// fptr     : worker called as fptr(threadnum, nthreads, rf, buffer, N, seedptr)
// rf       : generator handed through to the worker
// buffer   : output array of N elements
// N        : number of elements to fill
// mainseed : random state; each worker gets a private copy advanced by its thread index,
//            and the main state is then advanced by the number of threads
//
//Returns amscu_success, or amscu_failure if a worker thread could not be created or if
// mainseed is NULL when more than one thread is planned.
//With a single planned thread the worker runs on the calling thread using mainseed directly.
template<typename callable, typename randfunc, typename dtype>
int hbuff_rand_threadedexec(
    callable &&fptr,
    randfunc &&rf,
    dtype *buffer,
    int64_t N,
    randstate_t *mainseed
)
{
    int ret = amscu_success;
    int I;

    int nthreads = amscuda::util::amscpu_cputhreading_threadplan(N);

    if(nthreads<=1)
    {
        nthreads = 1;
        I = 0;
        fptr(I,nthreads,rf,buffer,N,mainseed);
        return ret;
    }

    //the per-thread states are derived from *mainseed, so it must be valid here
    if(mainseed==NULL) return amscu_failure;

    std::vector<randstate_t> seeds(nthreads);
    for(I=0;I<nthreads;I++)
    {
        seeds[I] = *mainseed;
        rand_state_increment(I,&seeds[I]);
    }
    rand_state_increment(I,mainseed); //I == nthreads at this point

    std::vector<std::thread> threads;
    try
    {
        threads.reserve(nthreads);
        for(I=0;I<nthreads;I++)
        {
            //fptr is passed as an lvalue: forwarding it on every iteration could
            // move from it and leave later threads with a moved-from callable
            threads.emplace_back(fptr,I,nthreads,rf,buffer,N,&seeds[I]);
        }
    }
    catch(...)
    {
        //thread creation failed: report it, but still join what was started so
        // no worker outlives seeds or this call
        ret = amscu_failure;
    }

    for(std::thread &worker : threads)
    {
        if(worker.joinable()) worker.join();
    }

    return ret;
}
//Per-thread worker: fills this thread's slice of buffer with values drawn from rf(min,max,seed).
//
// threadnum : index of this worker, 0 <= threadnum < nthreads
// nthreads  : total number of workers sharing the buffer
// rf        : generator called as rf(min, max, seed) for every element
// buffer    : output array of N elements
// N         : total number of elements in buffer
// min, max  : range bounds passed unchanged to rf
// seed      : random state passed to rf
//
//Slices are N/nthreads elements long (at least 1). The last worker also takes whatever
// remains up to N, so the buffer is fully covered when N is not a multiple of nthreads.
template<typename dtype, typename rngbnd, typename randfunc> void rand_threadfunc2(
    int threadnum,
    int nthreads,
    randfunc &&rf,
    dtype *buffer,
    int64_t N,
    rngbnd min,
    rngbnd max,
    randstate_t *seed
)
{
    int64_t I,Is,I0,I1;

    //nothing to do, or an invalid worker layout (also avoids dividing by zero)
    if(N<=0 || nthreads<1 || threadnum<0 || threadnum>=nthreads) return;

    Is = N/nthreads;
    if(Is<1) Is = 1;

    I0 = Is*(threadnum);
    I1 = Is*(threadnum+1);
    if(threadnum==nthreads-1) I1 = N; //last worker picks up the remainder
    if(I1>N) I1 = N;

    for(I=I0;I<I1;I++)
    {
        buffer[I] = rf(min,max,seed);
    }

    return;
}
//Fills buffer with N bounded random values by running fptr on a planned number of CPU threads.
//
// fptr     : worker called as fptr(threadnum, nthreads, rf, buffer, N, min, max, seedptr)
// rf       : generator handed through to the worker
// buffer   : output array of N elements
// N        : number of elements to fill
// min, max : range bounds handed through to the worker
// mainseed : random state; each worker gets a private copy advanced by its thread index,
//            and the main state is then advanced by the number of threads
//
//Returns amscu_success, or amscu_failure if a worker thread could not be created or if
// mainseed is NULL when more than one thread is planned.
//With a single planned thread the worker runs on the calling thread using mainseed directly.
template<typename callable, typename randfunc, typename rngbnd, typename dtype>
int hbuff_rand_threadedexec2(
    callable &&fptr,
    randfunc &&rf,
    dtype *buffer,
    int64_t N,
    rngbnd min,
    rngbnd max,
    randstate_t *mainseed
)
{
    int ret = amscu_success;
    int I;

    int nthreads = amscuda::util::amscpu_cputhreading_threadplan(N);

    if(nthreads<=1)
    {
        nthreads = 1;
        I = 0;
        fptr(I,nthreads,rf,buffer,N,min,max,mainseed);
        return ret;
    }

    //the per-thread states are derived from *mainseed, so it must be valid here
    if(mainseed==NULL) return amscu_failure;

    std::vector<randstate_t> seeds(nthreads);
    for(I=0;I<nthreads;I++)
    {
        seeds[I] = *mainseed;
        rand_state_increment(I,&seeds[I]);
    }
    rand_state_increment(I,mainseed); //I == nthreads at this point

    std::vector<std::thread> threads;
    try
    {
        threads.reserve(nthreads);
        for(I=0;I<nthreads;I++)
        {
            //fptr is passed as an lvalue: forwarding it on every iteration could
            // move from it and leave later threads with a moved-from callable
            threads.emplace_back(fptr,I,nthreads,rf,buffer,N,min,max,&seeds[I]);
        }
    }
    catch(...)
    {
        //thread creation failed: report it, but still join what was started so
        // no worker outlives seeds or this call
        ret = amscu_failure;
    }

    for(std::thread &worker : threads)
    {
        if(worker.joinable()) worker.join();
    }

    return ret;
}
//Fills a host buffer of size floats with values from randf().
// state: random state to draw from; NULL selects global_randstate.
//Returns amscu_success, or amscu_failure if the threaded fill did not report success.
__host__ int hbuff_randf(float *hbuffer, int size, randstate_t *state)
{
    randstate_t *rngstate = (state!=NULL) ? state : &global_randstate;

    const int status = hbuff_rand_threadedexec(
        rand_threadfunc1<float, float (*)(amscuda::random::randstate_t*)>,
        randf,
        hbuffer,
        (int64_t) size,
        rngstate
    );

    return (status==amscu_success) ? amscu_success : amscu_failure;
}
//Fills a host buffer of size doubles with values from rand().
// state: random state to draw from; NULL selects global_randstate.
//Returns amscu_success, or amscu_failure if the threaded fill did not report success.
__host__ int hbuff_rand(double *hbuffer, int size, randstate_t *state)
{
    randstate_t *rngstate = (state!=NULL) ? state : &global_randstate;

    const int status = hbuff_rand_threadedexec(
        rand_threadfunc1<double, double (*)(amscuda::random::randstate_t*)>,
        rand,
        hbuffer,
        (int64_t) size,
        rngstate
    );

    return (status==amscu_success) ? amscu_success : amscu_failure;
}
//Fills a host buffer of size floats with values from randnf().
// state: random state to draw from; NULL selects global_randstate.
//Returns amscu_success, or amscu_failure if the threaded fill did not report success.
__host__ int hbuff_randnf(float *hbuffer, int size, randstate_t *state)
{
    randstate_t *rngstate = (state!=NULL) ? state : &global_randstate;

    const int status = hbuff_rand_threadedexec(
        rand_threadfunc1<float, float (*)(amscuda::random::randstate_t*)>,
        randnf,
        hbuffer,
        (int64_t) size,
        rngstate
    );

    return (status==amscu_success) ? amscu_success : amscu_failure;
}
//Fills a host buffer of size doubles with values from randn().
// state: random state to draw from; NULL selects global_randstate.
//Returns amscu_success, or amscu_failure if the threaded fill did not report success.
__host__ int hbuff_randn(double *hbuffer, int size, randstate_t *state)
{
    randstate_t *rngstate = (state!=NULL) ? state : &global_randstate;

    const int status = hbuff_rand_threadedexec(
        rand_threadfunc1<double, double (*)(amscuda::random::randstate_t*)>,
        randn,
        hbuffer,
        (int64_t) size,
        rngstate
    );

    return (status==amscu_success) ? amscu_success : amscu_failure;
}
//Fills a host buffer of size ints with values from randint(low, high, ...).
// low, high: bounds passed unchanged to randint.
// state: random state to draw from; NULL selects global_randstate.
//Returns amscu_success, or amscu_failure if the threaded fill did not report success.
__host__ int hbuff_randint(int *hbuffer, int size, int low, int high, randstate_t *state)
{
    randstate_t *rngstate = (state!=NULL) ? state : &global_randstate;

    const int status = hbuff_rand_threadedexec2(
        rand_threadfunc2<int, int, int (*)(int, int, amscuda::random::randstate_t*)>,
        randint,
        hbuffer,
        (int64_t) size,
        low,high,
        rngstate
    );

    return (status==amscu_success) ? amscu_success : amscu_failure;
}
}; //end namespaces }; //end namespaces
}; };

View File

@ -5,55 +5,27 @@ namespace amscuda
namespace random namespace random
{ {
__host__ void amscurand_tests1() __host__ void amscurand_tests1_randstat1()
{ {
using namespace random;
printf("Random number generator basic function test:\n");
int I; int I;
// uint32_t q1;
// float q2;
// double q3;
// int q4;
rand_seed(0);
printf("from seed 0...\n");
for(I=0;I<10;I++)
{
printf("randui32[%d] = %u\n",I,randui32());
}
rand_seed(0);
printf("from seed 0...\n");
for(I=0;I<10;I++)
{
printf("randf[%d] = %1.3f\n",I,randf());
}
rand_seed(0);
printf("from seed 0...\n");
for(I=0;I<10;I++)
{
printf("randf[%d] = %1.3f\n",I,rand());
}
float q0,qlast,qmindelta,qmaxdelta,qmx,qmn,qmean,qstd,qsum,qsumsq; float q0,qlast,qmindelta,qmaxdelta,qmx,qmn,qmean,qstd,qsum,qsumsq;
int N = 100000; int N = 100000;
uint32_t seed= 34533623; uint32_t seed= 2;
rand_seed(seed); rand_seed(seed);
printf("from seed %u...\n",seed); printf("from seed %u...\n",seed);
q0 = randf(); q0 = randnf();
qmx = q0; qmx = q0;
qmn = q0; qmn = q0;
qsum = 0.0f; qsum = 0.0f;
qsumsq = 0.0f; qsumsq = 0.0f;
qlast = q0; qlast = q0;
q0 = randf(); q0 = randnf();
qmindelta = ::fabsf(q0-qlast); qmindelta = ::fabsf(q0-qlast);
qmaxdelta = ::fabsf(q0-qlast); qmaxdelta = ::fabsf(q0-qlast);
for(I=0;I<N;I++) for(I=0;I<N;I++)
{ {
qlast = q0; qlast = q0;
q0 = randf(); q0 = randnf();
qsum += q0; qsum += q0;
qsumsq += q0*q0; qsumsq += q0*q0;
@ -66,13 +38,93 @@ namespace random
qmean = qsum/((float)N); qmean = qsum/((float)N);
qstd = ::sqrtf((qsumsq-qsum*qsum/((float)N))/((float)N)); qstd = ::sqrtf((qsumsq-qsum*qsum/((float)N))/((float)N));
printf("Statistics for randf() N=%d\n",N); printf("Statistics for randnf() N=%d\n",N);
printf("\tmin: %1.3g\n",qmn); printf("\tmin: %1.3g\n",qmn);
printf("\tmax: %1.3g\n",qmx); printf("\tmax: %1.3g\n",qmx);
printf("\tmean: %1.3g\n",qmean); printf("\tmean: %1.3g\n",qmean);
printf("\tstdev: %1.3g\n",qstd); printf("\tstdev: %1.3g\n",qstd);
printf("\tmindelta: %1.3g\n",qmindelta); printf("\tmindelta: %1.3g\n",qmindelta);
printf("\tmaxdelta: %1.3g\n",qmaxdelta); printf("\tmaxdelta: %1.3g\n",qmaxdelta);
}
//Prints summary statistics for randn(): min, max, mean, standard deviation, and the
// smallest and largest absolute difference between consecutive draws.
//The generator is reseeded with 2. Two priming draws set the initial min/max and
// the initial delta; the sums are accumulated over the following N draws.
__host__ void amscurand_tests1_randstat2()
{
    const int N = 100000;
    const uint32_t seed = 2;

    rand_seed(seed);
    printf("from seed %u...\n",seed);

    //first priming draw: initial extremes
    double previous = randn();
    double largest = previous;
    double smallest = previous;
    double total = 0.0;
    double totalsq = 0.0;

    //second priming draw: initial consecutive-draw deltas
    double current = randn();
    double mindelta = ::fabs(current-previous);
    double maxdelta = ::fabs(current-previous);

    for(int k=0;k<N;k++)
    {
        previous = current;
        current = randn();

        total += current;
        totalsq += current*current;

        if(current>largest) largest = current;
        if(current<smallest) smallest = current;

        const double delta = ::fabs(current-previous);
        if(delta<mindelta) mindelta = delta;
        if(delta>maxdelta) maxdelta = delta;
    }

    const double count = (double)N;
    const double mean = total/count;
    const double stdev = ::sqrt((totalsq-total*total/count)/count);

    printf("Statistics for randn() N=%d\n",N);
    printf("\tmin: %1.6g\n",smallest);
    printf("\tmax: %1.6g\n",largest);
    printf("\tmean: %1.6g\n",mean);
    printf("\tstdev: %1.6g\n",stdev);
    printf("\tmindelta: %1.6g\n",mindelta);
    printf("\tmaxdelta: %1.6g\n",maxdelta);
}
__host__ void amscurand_tests1()
{
printf("Random number generator basic function test:\n");
int I;
// uint32_t q1;
// float q2;
// double q3;
// int q4;
rand_seed(1);
printf("from seed 1...\n");
for(I=0;I<10;I++)
{
printf("randui32[%d] = %u\n",I,randui32());
}
rand_seed(1);
printf("from seed 1...\n");
for(I=0;I<10;I++)
{
printf("randf[%d] = %1.3f\n",I,randf());
}
rand_seed(0);
printf("from seed 0...\n");
for(I=0;I<10;I++)
{
printf("randf[%d] = %1.3f\n",I,rand());
}
amscurand_tests1_randstat1();
amscurand_tests1_randstat2();
return; return;

View File

@ -100,7 +100,7 @@ namespace xoroshiro
__host__ __device__ void xs128pp_jump(xs128pp_state* state) __host__ __device__ void xs128pp_jump(xs128pp_state* state)
{ {
static const uint64_t JUMP[] = { 0xdf900294d8f554a5, 0x170865df4b3201fc }; AMSCU_CONST static const uint64_t JUMP[] = { 0xdf900294d8f554a5, 0x170865df4b3201fc };
uint64_t low = 0; uint64_t low = 0;
uint64_t high = 0; uint64_t high = 0;
int I; int I;

View File

@ -0,0 +1,26 @@
#include <amsculib3/amsculib3.hpp>
namespace amscuda
{
namespace util
{
//Chooses how many CPU threads to use for a problem of probsize elements.
//Problems of 1024 elements or fewer get one thread. Larger problems get one thread
// per 1024 elements, capped at std::thread::hardware_concurrency()-2 and at
// amscu_defcputhreads, and never fewer than one.
int amscpu_cputhreading_threadplan(int probsize)
{
    const int chunk = 1024;
    const int hwlimit = ((int)std::thread::hardware_concurrency()) - 2;

    if(probsize<=chunk) return 1;

    int planned = probsize/chunk;
    if(planned>hwlimit) planned = hwlimit;
    if(planned>amscuda::amscu_defcputhreads) planned = amscuda::amscu_defcputhreads;

    return (planned<1) ? 1 : planned;
}
};
};

View File

@ -23,7 +23,7 @@ int main(int argc, char* argv[])
//test_amscurarray1(); //test_amscurarray1();
random::amscurand_tests1(); //random::amscurand_tests1();
return 0; return 0;
} }