testing rands

2026-04-14 23:03:26 -04:00
parent 1691b2c415
commit 0600b08a63
18 changed files with 348 additions and 13 deletions
--- a/build_linux64/libamsculib3.linux64.a
+++ b/build_linux64/libamsculib3.linux64.a
--- a/build_linux64/objstore/amscu_cputhreading.o
+++ b/build_linux64/objstore/amscu_cputhreading.o
--- a/build_linux64/objstore/amscu_util.o
+++ b/build_linux64/objstore/amscu_util.o
--- a/build_linux64/objstore/amscurandom1.o
+++ b/build_linux64/objstore/amscurandom1.o
--- a/build_linux64/objstore/amscurandom1_dbuff.o
+++ b/build_linux64/objstore/amscurandom1_dbuff.o
--- a/build_linux64/objstore/amscurandom1_hbuff.o
+++ b/build_linux64/objstore/amscurandom1_hbuff.o
--- a/build_linux64/objstore/amscurandom_tests1.o
+++ b/build_linux64/objstore/amscurandom_tests1.o
--- a/build_linux64/test
+++ b/build_linux64/test
--- a/include/amsculib3/random/amscurandom.cuh
+++ b/include/amsculib3/random/amscurandom.cuh
@ -50,6 +50,7 @@ __host__ int dbuff_randint(int *dbuffer, int size, int low, int high, randstate_

 //Tests
 __host__ void amscurand_tests1(); //test basic random functions
+__host__ void amscurand_tests2(); //time host/device buffer fills and write the arrays to ../test_scripts/*.bin


 };
--- a/include/amsculib3/util/amscu_util.hpp
+++ b/include/amsculib3/util/amscu_util.hpp
@ -5,7 +5,7 @@ namespace amscuda
 {
 namespace util
 {
-
+    double time_msec();

 };
 };
--- a/src/amsculib3/random/amscurandom1.cu
+++ b/src/amsculib3/random/amscurandom1.cu
@ -8,7 +8,9 @@ namespace random
 {

    //Choosing xoroshiro64** as my default RNG due to 32 bit only operations 
-    randstate_t global_randstate = xs64ss_state();
+    randstate_t global_randstate = xs64ss_state(0);
+    //Must initialize randstate to a value other than {0,0}. The single-parameter constructor calls splitmix32 to initialize the state.
+


    __host__ void rand_seed(const uint32_t seed)
--- a/src/amsculib3/random/amscurandom1_dbuff.cu
+++ b/src/amsculib3/random/amscurandom1_dbuff.cu
@ -24,24 +24,31 @@ namespace random
        int ret = amscu_success;

        int nthreads = 128;
-        int nblocks = (size+nthreads)/nthreads;
+        int nblocks = (size+nthreads*1024)/nthreads/1024;
        int I;
        cuarray<randstate_t> states;
        randstate_t *dstates = NULL;
        cudaError_t err = cudaSuccess;

+        if(state==NULL) state=&amscuda::random::global_randstate;
+
        states.resize(nblocks*nthreads);

+        //printf("debug: %d %d %d\n",nblocks,nthreads,size);
+        
        cudaMalloc(&dstates,sizeof(randstate_t)*states.length);
-        cudaMemcpy(dstates,states.data,sizeof(randstate_t)*states.length,cudaMemcpyHostToDevice);
+        

        for(I=0;I<nblocks*nthreads;I++)
        {
            states[I] = *state;
            rand_state_increment(I,&(states[I]));
+            //printf("debug: %d, %u %u\n",I,states[I].low,states[I].high);
        }
        rand_state_increment(I,state);

+        cudaMemcpy(dstates,states.data,sizeof(randstate_t)*states.length,cudaMemcpyHostToDevice);
+        
        dbuff_randf_kf<<<nblocks,nthreads>>>(dbuffer,size,dstates);
        cudaDeviceSynchronize();
        
@ -76,16 +83,17 @@ namespace random
        int ret = amscu_success;

        int nthreads = 128;
-        int nblocks = (size+nthreads)/nthreads;
+        int nblocks = (size+nthreads*1024)/nthreads/1024;
        int I;
        cuarray<randstate_t> states;
        randstate_t *dstates = NULL;
        cudaError_t err = cudaSuccess;

+        if(state==NULL) state=&amscuda::random::global_randstate;
+
        states.resize(nblocks*nthreads);
        
        cudaMalloc(&dstates,sizeof(randstate_t)*states.length);
-        cudaMemcpy(dstates,states.data,sizeof(randstate_t)*states.length,cudaMemcpyHostToDevice);

        for(I=0;I<nblocks*nthreads;I++)
        {
@ -94,6 +102,8 @@ namespace random
        }
        rand_state_increment(I,state);

+        cudaMemcpy(dstates,states.data,sizeof(randstate_t)*states.length,cudaMemcpyHostToDevice);
+        
        dbuff_rand_kf<<<nblocks,nthreads>>>(dbuffer,size,dstates);
        cudaDeviceSynchronize();
        
@ -142,16 +152,17 @@ namespace random
        int ret = amscu_success;

        int nthreads = 128;
-        int nblocks = (size+nthreads)/nthreads;
+        int nblocks = (size+nthreads*1024)/nthreads/1024;
        int I;
        cuarray<randstate_t> states;
        randstate_t *dstates = NULL;
        cudaError_t err = cudaSuccess;

+        if(state==NULL) state=&amscuda::random::global_randstate;
+
        states.resize(nblocks*nthreads);
        
        cudaMalloc(&dstates,sizeof(randstate_t)*states.length);
-        cudaMemcpy(dstates,states.data,sizeof(randstate_t)*states.length,cudaMemcpyHostToDevice);

        for(I=0;I<nblocks*nthreads;I++)
        {
@ -160,6 +171,8 @@ namespace random
        }
        rand_state_increment(I,state);

+        cudaMemcpy(dstates,states.data,sizeof(randstate_t)*states.length,cudaMemcpyHostToDevice);
+        
        dbuff_randnf_kf<<<nblocks,nthreads>>>(dbuffer,size,dstates);
        cudaDeviceSynchronize();
        
@ -180,16 +193,17 @@ namespace random
        int ret = amscu_success;

        int nthreads = 128;
-        int nblocks = (size+nthreads)/nthreads;
+        int nblocks = (size+nthreads*1024)/nthreads/1024;
        int I;
        cuarray<randstate_t> states;
        randstate_t *dstates = NULL;
        cudaError_t err = cudaSuccess;

+        if(state==NULL) state=&amscuda::random::global_randstate;
+
        states.resize(nblocks*nthreads);
        
        cudaMalloc(&dstates,sizeof(randstate_t)*states.length);
-        cudaMemcpy(dstates,states.data,sizeof(randstate_t)*states.length,cudaMemcpyHostToDevice);

        for(I=0;I<nblocks*nthreads;I++)
        {
@ -198,6 +212,8 @@ namespace random
        }
        rand_state_increment(I,state);

+        cudaMemcpy(dstates,states.data,sizeof(randstate_t)*states.length,cudaMemcpyHostToDevice);
+        
        dbuff_randn_kf<<<nblocks,nthreads>>>(dbuffer,size,dstates);
        cudaDeviceSynchronize();
        
@ -232,16 +248,17 @@ namespace random
        int ret = amscu_success;

        int nthreads = 128;
-        int nblocks = (size+nthreads)/nthreads;
+        int nblocks = (size+nthreads*1024)/nthreads/1024;
        int I;
        cuarray<randstate_t> states;
        randstate_t *dstates = NULL;
        cudaError_t err = cudaSuccess;

+        if(state==NULL) state=&amscuda::random::global_randstate;
+
        states.resize(nblocks*nthreads);
        
        cudaMalloc(&dstates,sizeof(randstate_t)*states.length);
-        cudaMemcpy(dstates,states.data,sizeof(randstate_t)*states.length,cudaMemcpyHostToDevice);
        
        for(I=0;I<nblocks*nthreads;I++)
        {
@ -250,6 +267,8 @@ namespace random
        }
        rand_state_increment(I,state);

+        cudaMemcpy(dstates,states.data,sizeof(randstate_t)*states.length,cudaMemcpyHostToDevice);
+        
        dbuff_randint_kf<<<nblocks,nthreads>>>(dbuffer,size,low,high,dstates);
        cudaDeviceSynchronize();
        
--- a/src/amsculib3/random/amscurandom1_hbuff.cu
+++ b/src/amsculib3/random/amscurandom1_hbuff.cu
@ -58,6 +58,7 @@ namespace random
            {
                seeds[I] = *mainseed;
                rand_state_increment(I,&seeds[I]);
+                //printf("debug: %d %u %u\n",(int)I,seeds[I].high,seeds[I].low);
            }
            rand_state_increment(I,mainseed);

--- a/src/amsculib3/random/amscurandom_tests1.cu
+++ b/src/amsculib3/random/amscurandom_tests1.cu
@ -127,6 +127,159 @@ namespace random



+        return;
+    }
+
+
+    //Times hbuff_randf filling a 4096x4096 host float buffer and writes the buffer,
+    //together with its two dimensions, to ../test_scripts/randf_array.bin via fwrite_ndarray.
+    //The output stream is only written to and closed when fopen succeeded.
+    void amscurand_tests2_1()
+    {
+        const char *fnout = "../test_scripts/randf_array.bin";
+        FILE *fp = NULL;
+        int Nx = 4096;
+        int Ny = 4096;
+        cuarray<float> data;
+        cuarray<int> dims;
+        double t0,t1;
+
+        data.resize(Nx*Ny);
+        dims.resize(2);
+        dims[0] = Nx;
+        dims[1] = Ny;
+
+        t0 = util::time_msec();
+        hbuff_randf(data.data,Nx*Ny);
+        t1 = util::time_msec();
+
+        printf("hbuff_randf execution time: %1.3f msec\n",t1-t0);
+
+        //Validate the stream before using it: fclose(NULL) is undefined behaviour
+        fp = fopen(fnout,"w+b");
+        if(fp==NULL)
+        {
+            printf("amscurand_tests2_1 error: could not write %s\n",fnout);
+            return;
+        }
+
+        fwrite_ndarray(fp,&dims,&data);
+        fclose(fp); fp=NULL;
+        return;
+    }
+
+    //Times dbuff_randf filling a 4096x4096 device float buffer, copies the result back to
+    //the host and writes it, together with its two dimensions, to
+    //../test_scripts/randf_array2.bin via fwrite_ndarray.
+    //CUDA allocation/copy failures and a failed fopen are reported and end the test early;
+    //the device buffer is released on every path after it has been allocated.
+    void amscurand_tests2_2()
+    {
+        const char *fnout = "../test_scripts/randf_array2.bin";
+        FILE *fp = NULL;
+        int Nx = 4096;
+        int Ny = 4096;
+        cuarray<float> data;
+        float *ddata = NULL;
+        cuarray<int> dims;
+        cudaError_t err = cudaSuccess;
+        double t0,t1;
+
+        data.resize(Nx*Ny);
+        dims.resize(2);
+        dims[0] = Nx;
+        dims[1] = Ny;
+
+        err = cudaMalloc(&ddata,sizeof(float)*Nx*Ny);
+        if(err!=cudaSuccess)
+        {
+            printf("amscurand_tests2_2 error: cudaMalloc failed: %s\n",cudaGetErrorString(err));
+            return;
+        }
+
+        t0 = amscuda::util::time_msec();
+        dbuff_randf(ddata,Nx*Ny);
+        t1 = amscuda::util::time_msec();
+        
+        printf("dbuff_randf execution time: %1.3f msec\n",t1-t0);
+
+        err = cudaMemcpy(data.data,ddata,sizeof(float)*Nx*Ny,cudaMemcpyDeviceToHost);
+        if(err!=cudaSuccess)
+        {
+            printf("amscurand_tests2_2 error: cudaMemcpy failed: %s\n",cudaGetErrorString(err));
+            cudaFree(ddata); ddata=NULL;
+            return;
+        }
+
+        //Open the output only once there is data to write, and never hand a NULL
+        //stream to fwrite_ndarray/fclose (fclose(NULL) is undefined behaviour)
+        fp = fopen(fnout,"w+b");
+        if(fp!=NULL)
+        {
+            fwrite_ndarray(fp,&dims,&data);
+            fclose(fp); fp=NULL;
+        }
+        else
+        {
+            printf("amscurand_tests2_2 error: could not write %s\n",fnout);
+        }
+
+        cudaFree(ddata); ddata=NULL;
+        return;
+    }
+
+    //Times hbuff_randint (called with bounds 0 and 10) filling a 4096x4096 host int buffer
+    //and writes the buffer, together with its two dimensions, to
+    //../test_scripts/randint_array.bin via fwrite_ndarray.
+    //The output stream is only written to and closed when fopen succeeded.
+    void amscurand_tests2_3()
+    {
+        const char *fnout = "../test_scripts/randint_array.bin";
+        FILE *fp = NULL;
+        int Nx = 4096;
+        int Ny = 4096;
+        cuarray<int> data;
+        cuarray<int> dims;
+        double t0,t1;
+
+        data.resize(Nx*Ny);
+        dims.resize(2);
+        dims[0] = Nx;
+        dims[1] = Ny;
+
+        t0 = util::time_msec();
+        hbuff_randint(data.data,Nx*Ny,0,10);
+        t1 = util::time_msec();
+
+        printf("hbuff_randint execution time: %1.3f msec\n",t1-t0);
+
+        //Validate the stream before using it: fclose(NULL) is undefined behaviour
+        fp = fopen(fnout,"w+b");
+        if(fp==NULL)
+        {
+            printf("amscurand_tests2_3 error: could not write %s\n",fnout);
+            return;
+        }
+
+        fwrite_ndarray(fp,&dims,&data);
+        fclose(fp); fp=NULL;
+        return;
+    }
+
+    //Times dbuff_randint (called with bounds 0 and 10) filling a 4096x4096 device int
+    //buffer, copies the result back to the host and writes it, together with its two
+    //dimensions, to ../test_scripts/randint_array2.bin via fwrite_ndarray.
+    //CUDA allocation/copy failures and a failed fopen are reported and end the test early;
+    //the device buffer is released on every path after it has been allocated.
+    void amscurand_tests2_4()
+    {
+        const char *fnout = "../test_scripts/randint_array2.bin";
+        FILE *fp = NULL;
+        int Nx = 4096;
+        int Ny = 4096;
+        cuarray<int> data;
+        int *ddata = NULL;
+        cuarray<int> dims;
+        cudaError_t err = cudaSuccess;
+        double t0,t1;
+
+        data.resize(Nx*Ny);
+        dims.resize(2);
+        dims[0] = Nx;
+        dims[1] = Ny;
+
+        err = cudaMalloc(&ddata,sizeof(int)*Nx*Ny);
+        if(err!=cudaSuccess)
+        {
+            printf("amscurand_tests2_4 error: cudaMalloc failed: %s\n",cudaGetErrorString(err));
+            return;
+        }
+
+        t0 = amscuda::util::time_msec();
+        dbuff_randint(ddata,Nx*Ny,0,10);
+        t1 = amscuda::util::time_msec();
+        
+        printf("dbuff_randint execution time: %1.3f msec\n",t1-t0);
+
+        err = cudaMemcpy(data.data,ddata,sizeof(int)*Nx*Ny,cudaMemcpyDeviceToHost);
+        if(err!=cudaSuccess)
+        {
+            printf("amscurand_tests2_4 error: cudaMemcpy failed: %s\n",cudaGetErrorString(err));
+            cudaFree(ddata); ddata=NULL;
+            return;
+        }
+
+        //Open the output only once there is data to write, and never hand a NULL
+        //stream to fwrite_ndarray/fclose (fclose(NULL) is undefined behaviour)
+        fp = fopen(fnout,"w+b");
+        if(fp!=NULL)
+        {
+            fwrite_ndarray(fp,&dims,&data);
+            fclose(fp); fp=NULL;
+        }
+        else
+        {
+            printf("amscurand_tests2_4 error: could not write %s\n",fnout);
+        }
+
+        cudaFree(ddata); ddata=NULL;
+        return;
+    }
+
+    //Entry point for the second random-number test group.
+    //Runs amscurand_tests2_1 through amscurand_tests2_4 in that order; it takes no
+    //arguments and returns nothing.
+    __host__ void amscurand_tests2()
+    {
+        amscurand_tests2_1();
+        amscurand_tests2_2();
+        amscurand_tests2_3();
+        amscurand_tests2_4();
+
        return;
    }

--- a/src/amsculib3/util/amscu_util.cu
+++ b/src/amsculib3/util/amscu_util.cu
@ -0,0 +1,15 @@
+#include <amsculib3/amsculib3.hpp>
+
+#include <chrono>
+
+namespace amscuda
+{
+namespace util
+{
+
+    //Returns elapsed wall-clock time in milliseconds, measured on a monotonic clock
+    //from the first call to this function (so the first call returns roughly 0).
+    //Only differences between two return values are meaningful.
+    //
+    //A monotonic wall clock is used instead of clock(): clock() reports processor time
+    //consumed by the process, which does not correspond to elapsed time when the timed
+    //work runs on several host threads or waits on the device.
+    double time_msec()
+    {
+        //Function-local static: initialized once, thread-safely, on first use
+        static const std::chrono::steady_clock::time_point t_origin = std::chrono::steady_clock::now();
+        const std::chrono::duration<double,std::milli> elapsed = std::chrono::steady_clock::now() - t_origin;
+        return elapsed.count();
+    }
+};
+};
--- a/src/main.cu
+++ b/src/main.cu
@ -24,6 +24,7 @@ int main(int argc, char* argv[])
    //test_amscurarray1();

    //random::amscurand_tests1();
+    random::amscurand_tests2();

    return 0;
 }
--- a/test_scripts/test_dbuff_dpr32.py
+++ b/test_scripts/test_dbuff_dpr32.py
@ -40,6 +40,9 @@ def binsave_float_ndarray(fp,arr):
 ## Main Script ##
 #################

+
+
+
 def test_1():

    fname = "./test_scripts/test_dbuff_rand_dpr32.bin"
--- a/test_scripts/test_randomplot1.py
+++ b/test_scripts/test_randomplot1.py
@ -0,0 +1,140 @@
+#!/usr/bin/python3
+
+import os,sys,math
+import numpy as np
+import matplotlib.pyplot as plt
+
+#################
+## Subroutines ##
+#################
+
+def binload_float_ndarray(fp):
+    arr = np.zeros((0),dtype=np.float32,order='F')
+
+    qb = fp.read(4)
+    Nd = np.frombuffer(qb,dtype=np.int32,count=1)[0]
+    shp = np.zeros((Nd),dtype=np.int32)
+
+    piprod = 1
+    for I in range(0,Nd):
+        qb = fp.read(4)
+        shp[I] = np.frombuffer(qb,dtype=np.int32,count=1)[0]
+        piprod = piprod*shp[I]
+    
+    qb = fp.read(4*piprod)
+    arr = np.frombuffer(qb,dtype=np.float32,count=piprod)
+
+    arr = arr.reshape(shp)
+    
+    return arr;
+
+def binload_int_ndarray(fp):
+    arr = np.zeros((0),dtype=np.float32,order='F')
+
+    qb = fp.read(4)
+    Nd = np.frombuffer(qb,dtype=np.int32,count=1)[0]
+    shp = np.zeros((Nd),dtype=np.int32)
+
+    piprod = 1
+    for I in range(0,Nd):
+        qb = fp.read(4)
+        shp[I] = np.frombuffer(qb,dtype=np.int32,count=1)[0]
+        piprod = piprod*shp[I]
+    
+    qb = fp.read(4*piprod)
+    arr = np.frombuffer(qb,dtype=np.int32,count=piprod)
+
+    arr = arr.reshape(shp)
+    
+    return arr;
+
+
+#################
+## Main Script ##
+#################
+
+def periodcheck(arr):
+
+    arr = np.asarray(arr).copy().flatten()
+    N = arr.shape[0]
+    q1 = arr[0]
+    q2 = arr[1]
+    q3 = arr[2]
+    ind = -1
+    for I in range(3,N-3):
+        if(arr[I]==q1):
+            if(arr[I+1]==q2):
+                if(arr[I+2]==q3):
+                    ind = I
+                    break
+    if(ind>=0):
+        print("array has detected a period of {} out of {}".format(ind,N))
+    
+    return ind
+
+def test_1():
+
+    fname = "./test_scripts/randf_array.bin"
+    try:
+        fp = open(fname,"rb")
+    except:
+        print("Could not open {} for reading".format(fname))
+        return
+    arr = binload_float_ndarray(fp)
+    periodcheck(arr)
+    fp.close()
+
+    fname = "./test_scripts/randf_array2.bin"
+    try:
+        fp = open(fname,"rb")
+    except:
+        print("Could not open {} for reading".format(fname))
+        return
+    arr2 = binload_float_ndarray(fp)
+    periodcheck(arr2)
+    fp.close()
+
+    fname = "./test_scripts/randint_array.bin"
+    try:
+        fp = open(fname,"rb")
+    except:
+        print("Could not open {} for reading".format(fname))
+        return
+    arr3 = binload_int_ndarray(fp)
+    periodcheck(arr3)
+    print("array 3 max {} min {}".format(np.max(arr3),np.min(arr3)))
+    fp.close()
+
+    fname = "./test_scripts/randint_array2.bin"
+    try:
+        fp = open(fname,"rb")
+    except:
+        print("Could not open {} for reading".format(fname))
+        return
+    arr4 = binload_int_ndarray(fp)
+    periodcheck(arr4)
+    print("array 4 max {} min {}".format(np.max(arr4),np.min(arr4)))
+    fp.close()
+
+    plt.subplot(2,2,1)
+    plt.imshow(arr)
+    plt.subplot(2,2,2)
+    plt.imshow(arr2)
+    plt.show()
+
+    plt.subplot(2,2,1)
+    plt.imshow(arr3)
+    plt.colorbar()
+    plt.subplot(2,2,2)
+    plt.imshow(arr4)
+    plt.colorbar()
+    plt.show()
+    
+
+    return
+
+if(__name__=="__main__"):
+    test_1()
+
+    exit(0)
+