diff --git a/build_linux64/libamsculib3.linux64.a b/build_linux64/libamsculib3.linux64.a index 85fcdfa..25d94d0 100644 Binary files a/build_linux64/libamsculib3.linux64.a and b/build_linux64/libamsculib3.linux64.a differ diff --git a/build_linux64/objstore/amscurandom1_dbuff.o b/build_linux64/objstore/amscurandom1_dbuff.o index 1816f0f..92f2234 100644 Binary files a/build_linux64/objstore/amscurandom1_dbuff.o and b/build_linux64/objstore/amscurandom1_dbuff.o differ diff --git a/build_linux64/test b/build_linux64/test index 1569edf..d83a87f 100644 Binary files a/build_linux64/test and b/build_linux64/test differ diff --git a/src/amsculib3/random/amscurandom1_dbuff.cu b/src/amsculib3/random/amscurandom1_dbuff.cu index 51eae29..b6f1499 100644 --- a/src/amsculib3/random/amscurandom1_dbuff.cu +++ b/src/amsculib3/random/amscurandom1_dbuff.cu @@ -5,5 +5,266 @@ namespace amscuda namespace random { + __global__ void dbuff_randf_kf(float *dbuffer, int size, randstate_t *dstates) + { + int I; + int index = threadIdx.x + blockIdx.x*blockDim.x; + + randstate_t dstate = dstates[index]; + + for(I=index;I states; + randstate_t *dstates = NULL; + cudaError_t err = cudaSuccess; + + states.resize(nblocks*nthreads); + + cudaMalloc(&dstates,sizeof(randstate_t)*states.length); + cudaMemcpy(dstates,states.data,sizeof(randstate_t)*states.length,cudaMemcpyHostToDevice); + + for(I=0;I>>(dbuffer,size,dstates); + cudaDeviceSynchronize(); + + err = cudaGetLastError(); + if(err!=cudaSuccess) + { + printf("dbuff_randf: cuda error %s\n",cudaGetErrorString(err)); + } + + + cudaFree(dstates); dstates=NULL; + + return ret; + } + + __global__ void dbuff_rand_kf(double *dbuffer, int size, randstate_t *dstates) + { + int I; + int index = threadIdx.x + blockIdx.x*blockDim.x; + + randstate_t dstate = dstates[index]; + + for(I=index;I states; + randstate_t *dstates = NULL; + cudaError_t err = cudaSuccess; + + states.resize(nblocks*nthreads); + + cudaMalloc(&dstates,sizeof(randstate_t)*states.length); + cudaMemcpy(dstates,states.data,sizeof(randstate_t)*states.length,cudaMemcpyHostToDevice); + + for(I=0;I>>(dbuffer,size,dstates); + cudaDeviceSynchronize(); + + err = cudaGetLastError(); + if(err!=cudaSuccess) + { + printf("dbuff_randf: cuda error %s\n",cudaGetErrorString(err)); + } + + + cudaFree(dstates); dstates=NULL; + + return ret; + } + + __global__ void dbuff_randnf_kf(float *dbuffer, int size, randstate_t *dstates) + { + int I; + int index = threadIdx.x + blockIdx.x*blockDim.x; + + randstate_t dstate = dstates[index]; + + for(I=index;I states; + randstate_t *dstates = NULL; + cudaError_t err = cudaSuccess; + + states.resize(nblocks*nthreads); + + cudaMalloc(&dstates,sizeof(randstate_t)*states.length); + cudaMemcpy(dstates,states.data,sizeof(randstate_t)*states.length,cudaMemcpyHostToDevice); + + for(I=0;I>>(dbuffer,size,dstates); + cudaDeviceSynchronize(); + + err = cudaGetLastError(); + if(err!=cudaSuccess) + { + printf("dbuff_randf: cuda error %s\n",cudaGetErrorString(err)); + } + + + cudaFree(dstates); dstates=NULL; + + return ret; + } + + __host__ int dbuff_randn(double *dbuffer, int size, randstate_t *state) + { + int ret = amscu_success; + + int nthreads = 128; + int nblocks = (size+nthreads)/nthreads; + int I; + cuarray states; + randstate_t *dstates = NULL; + cudaError_t err = cudaSuccess; + + states.resize(nblocks*nthreads); + + cudaMalloc(&dstates,sizeof(randstate_t)*states.length); + cudaMemcpy(dstates,states.data,sizeof(randstate_t)*states.length,cudaMemcpyHostToDevice); + + for(I=0;I>>(dbuffer,size,dstates); + cudaDeviceSynchronize(); + + err = cudaGetLastError(); + if(err!=cudaSuccess) + { + printf("dbuff_randf: cuda error %s\n",cudaGetErrorString(err)); + } + + + cudaFree(dstates); dstates=NULL; + + return ret; + } + + __global__ void dbuff_randint_kf(int *dbuffer, int size, int low, int high, randstate_t *dstates) + { + int I; + int index = threadIdx.x + blockIdx.x*blockDim.x; + + randstate_t dstate = dstates[index]; + + for(I=index;I states; + randstate_t *dstates = NULL; + cudaError_t err = cudaSuccess; + + states.resize(nblocks*nthreads); + + cudaMalloc(&dstates,sizeof(randstate_t)*states.length); + cudaMemcpy(dstates,states.data,sizeof(randstate_t)*states.length,cudaMemcpyHostToDevice); + + for(I=0;I>>(dbuffer,size,low,high,dstates); + cudaDeviceSynchronize(); + + err = cudaGetLastError(); + if(err!=cudaSuccess) + { + printf("dbuff_randf: cuda error %s\n",cudaGetErrorString(err)); + } + + + cudaFree(dstates); dstates=NULL; + + return ret; + } + + }; //end namespaces }; \ No newline at end of file