diff --git a/build_linux64/libamsculib3.linux64.a b/build_linux64/libamsculib3.linux64.a
index 25d94d0..0472127 100644
Binary files a/build_linux64/libamsculib3.linux64.a and b/build_linux64/libamsculib3.linux64.a differ
diff --git a/build_linux64/objstore/amscu_cputhreading.o b/build_linux64/objstore/amscu_cputhreading.o
index 1127744..06c4827 100644
Binary files a/build_linux64/objstore/amscu_cputhreading.o and b/build_linux64/objstore/amscu_cputhreading.o differ
diff --git a/build_linux64/objstore/amscu_util.o b/build_linux64/objstore/amscu_util.o
new file mode 100644
index 0000000..b831198
Binary files /dev/null and b/build_linux64/objstore/amscu_util.o differ
diff --git a/build_linux64/objstore/amscurandom1.o b/build_linux64/objstore/amscurandom1.o
index 0af19a2..f4bdb59 100644
Binary files a/build_linux64/objstore/amscurandom1.o and b/build_linux64/objstore/amscurandom1.o differ
diff --git a/build_linux64/objstore/amscurandom1_dbuff.o b/build_linux64/objstore/amscurandom1_dbuff.o
index 92f2234..5f4a2ed 100644
Binary files a/build_linux64/objstore/amscurandom1_dbuff.o and b/build_linux64/objstore/amscurandom1_dbuff.o differ
diff --git a/build_linux64/objstore/amscurandom1_hbuff.o b/build_linux64/objstore/amscurandom1_hbuff.o
index cc2edf1..31e2c0a 100644
Binary files a/build_linux64/objstore/amscurandom1_hbuff.o and b/build_linux64/objstore/amscurandom1_hbuff.o differ
diff --git a/build_linux64/objstore/amscurandom_tests1.o b/build_linux64/objstore/amscurandom_tests1.o
index c099166..3fcdab9 100644
Binary files a/build_linux64/objstore/amscurandom_tests1.o and b/build_linux64/objstore/amscurandom_tests1.o differ
diff --git a/build_linux64/test b/build_linux64/test
index d83a87f..e3083a4 100644
Binary files a/build_linux64/test and b/build_linux64/test differ
diff --git a/include/amsculib3/random/amscurandom.cuh b/include/amsculib3/random/amscurandom.cuh
index 3365a39..461360c 100644
--- a/include/amsculib3/random/amscurandom.cuh
+++ b/include/amsculib3/random/amscurandom.cuh
@@ -50,6 +50,7 @@ __host__ int dbuff_randint(int *dbuffer, int size, int low, int high, randstate_
 
 //Tests
 __host__ void amscurand_tests1(); //test basic random functions
+__host__ void amscurand_tests2(); //test basic random functions
 
 
 };
diff --git a/include/amsculib3/util/amscu_util.hpp b/include/amsculib3/util/amscu_util.hpp
index 7fde3da..93ebc95 100644
--- a/include/amsculib3/util/amscu_util.hpp
+++ b/include/amsculib3/util/amscu_util.hpp
@@ -5,7 +5,7 @@ namespace amscuda
 {
 namespace util
 {
-
+    double time_msec();
 
 };
 };
diff --git a/src/amsculib3/random/amscurandom1.cu b/src/amsculib3/random/amscurandom1.cu
index 7a972e0..e0bcf8d 100644
--- a/src/amsculib3/random/amscurandom1.cu
+++ b/src/amsculib3/random/amscurandom1.cu
@@ -8,7 +8,9 @@ namespace random
 {
 
     //Choosing xoroshiro64** as my default RNG due to 32 bit only operations 
-    randstate_t global_randstate = xs64ss_state();
+    randstate_t global_randstate = xs64ss_state(0);
+    //must initialize randstate to a value other than {0,0}. The single-param constructor calls splitmix32 to initialize
+
 
 
     __host__ void rand_seed(const uint32_t seed)
diff --git a/src/amsculib3/random/amscurandom1_dbuff.cu b/src/amsculib3/random/amscurandom1_dbuff.cu
index b6f1499..eb59b55 100644
--- a/src/amsculib3/random/amscurandom1_dbuff.cu
+++ b/src/amsculib3/random/amscurandom1_dbuff.cu
@@ -24,23 +24,30 @@ namespace random
         int ret = amscu_success;
 
         int nthreads = 128;
-        int nblocks = (size+nthreads)/nthreads;
+        int nblocks = (size+nthreads*1024)/nthreads/1024;
         int I;
         cuarray<randstate_t> states;
         randstate_t *dstates = NULL;
         cudaError_t err = cudaSuccess;
 
+        if(state==NULL) state=&amscuda::random::global_randstate;
+
         states.resize(nblocks*nthreads);
+
+        //printf("debug: %d %d %d\n",nblocks,nthreads,size);
         
         cudaMalloc(&dstates,sizeof(randstate_t)*states.length);
-        cudaMemcpy(dstates,states.data,sizeof(randstate_t)*states.length,cudaMemcpyHostToDevice);
+        
 
         for(I=0;I<nblocks*nthreads;I++)
         {
             states[I] = *state;
             rand_state_increment(I,&(states[I]));
+            //printf("debug: %d, %u %u\n",I,states[I].low,states[I].high);
         }
         rand_state_increment(I,state);
+
+        cudaMemcpy(dstates,states.data,sizeof(randstate_t)*states.length,cudaMemcpyHostToDevice);
         
         dbuff_randf_kf<<<nblocks,nthreads>>>(dbuffer,size,dstates);
         cudaDeviceSynchronize();
@@ -76,16 +83,17 @@ namespace random
         int ret = amscu_success;
 
         int nthreads = 128;
-        int nblocks = (size+nthreads)/nthreads;
+        int nblocks = (size+nthreads*1024)/nthreads/1024;
         int I;
         cuarray<randstate_t> states;
         randstate_t *dstates = NULL;
         cudaError_t err = cudaSuccess;
 
+        if(state==NULL) state=&amscuda::random::global_randstate;
+
         states.resize(nblocks*nthreads);
         
         cudaMalloc(&dstates,sizeof(randstate_t)*states.length);
-        cudaMemcpy(dstates,states.data,sizeof(randstate_t)*states.length,cudaMemcpyHostToDevice);
 
         for(I=0;I<nblocks*nthreads;I++)
         {
@@ -93,6 +101,8 @@ namespace random
             rand_state_increment(I,&(states[I]));
         }
         rand_state_increment(I,state);
+
+        cudaMemcpy(dstates,states.data,sizeof(randstate_t)*states.length,cudaMemcpyHostToDevice);
         
         dbuff_rand_kf<<<nblocks,nthreads>>>(dbuffer,size,dstates);
         cudaDeviceSynchronize();
@@ -142,16 +152,17 @@ namespace random
         int ret = amscu_success;
 
         int nthreads = 128;
-        int nblocks = (size+nthreads)/nthreads;
+        int nblocks = (size+nthreads*1024)/nthreads/1024;
         int I;
         cuarray<randstate_t> states;
         randstate_t *dstates = NULL;
         cudaError_t err = cudaSuccess;
 
+        if(state==NULL) state=&amscuda::random::global_randstate;
+
         states.resize(nblocks*nthreads);
         
         cudaMalloc(&dstates,sizeof(randstate_t)*states.length);
-        cudaMemcpy(dstates,states.data,sizeof(randstate_t)*states.length,cudaMemcpyHostToDevice);
 
         for(I=0;I<nblocks*nthreads;I++)
         {
@@ -159,6 +170,8 @@ namespace random
             rand_state_increment(I,&(states[I]));
         }
         rand_state_increment(I,state);
+
+        cudaMemcpy(dstates,states.data,sizeof(randstate_t)*states.length,cudaMemcpyHostToDevice);
         
         dbuff_randnf_kf<<<nblocks,nthreads>>>(dbuffer,size,dstates);
         cudaDeviceSynchronize();
@@ -180,16 +193,17 @@ namespace random
         int ret = amscu_success;
 
         int nthreads = 128;
-        int nblocks = (size+nthreads)/nthreads;
+        int nblocks = (size+nthreads*1024)/nthreads/1024;
         int I;
         cuarray<randstate_t> states;
         randstate_t *dstates = NULL;
         cudaError_t err = cudaSuccess;
 
+        if(state==NULL) state=&amscuda::random::global_randstate;
+
         states.resize(nblocks*nthreads);
         
         cudaMalloc(&dstates,sizeof(randstate_t)*states.length);
-        cudaMemcpy(dstates,states.data,sizeof(randstate_t)*states.length,cudaMemcpyHostToDevice);
 
         for(I=0;I<nblocks*nthreads;I++)
         {
@@ -197,6 +211,8 @@ namespace random
             rand_state_increment(I,&(states[I]));
         }
         rand_state_increment(I,state);
+
+        cudaMemcpy(dstates,states.data,sizeof(randstate_t)*states.length,cudaMemcpyHostToDevice);
         
         dbuff_randn_kf<<<nblocks,nthreads>>>(dbuffer,size,dstates);
         cudaDeviceSynchronize();
@@ -232,23 +248,26 @@ namespace random
         int ret = amscu_success;
 
         int nthreads = 128;
-        int nblocks = (size+nthreads)/nthreads;
+        int nblocks = (size+nthreads*1024)/nthreads/1024;
         int I;
         cuarray<randstate_t> states;
         randstate_t *dstates = NULL;
         cudaError_t err = cudaSuccess;
 
+        if(state==NULL) state=&amscuda::random::global_randstate;
+
         states.resize(nblocks*nthreads);
         
         cudaMalloc(&dstates,sizeof(randstate_t)*states.length);
-        cudaMemcpy(dstates,states.data,sizeof(randstate_t)*states.length,cudaMemcpyHostToDevice);
-
+        
         for(I=0;I<nblocks*nthreads;I++)
         {
             states[I] = *state;
             rand_state_increment(I,&(states[I]));
         }
         rand_state_increment(I,state);
+
+        cudaMemcpy(dstates,states.data,sizeof(randstate_t)*states.length,cudaMemcpyHostToDevice);
         
         dbuff_randint_kf<<<nblocks,nthreads>>>(dbuffer,size,low,high,dstates);
         cudaDeviceSynchronize();
diff --git a/src/amsculib3/random/amscurandom1_hbuff.cu b/src/amsculib3/random/amscurandom1_hbuff.cu
index c4a710e..6015e22 100644
--- a/src/amsculib3/random/amscurandom1_hbuff.cu
+++ b/src/amsculib3/random/amscurandom1_hbuff.cu
@@ -58,6 +58,7 @@ namespace random
             {
                 seeds[I] = *mainseed;
                 rand_state_increment(I,&seeds[I]);
+                //printf("debug: %d %u %u\n",(int)I,seeds[I].high,seeds[I].low);
             }
             rand_state_increment(I,mainseed);
 
diff --git a/src/amsculib3/random/amscurandom_tests1.cu b/src/amsculib3/random/amscurandom_tests1.cu
index 1659be0..68782d1 100644
--- a/src/amsculib3/random/amscurandom_tests1.cu
+++ b/src/amsculib3/random/amscurandom_tests1.cu
@@ -127,6 +127,159 @@ namespace random
 
 
 
+        return;
+    }
+
+
+    //Timing test: times hbuff_randf on a 4096x4096 host float buffer, then dumps the buffer to fnout.
+    void amscurand_tests2_1()
+    {
+        const char *fnout = "../test_scripts/randf_array.bin";
+        const int Nx = 4096;
+        const int Ny = 4096;
+        cuarray<float> data;
+        cuarray<int> dims;
+        double t0,t1;
+        FILE *fp = fopen(fnout,"w+b");
+
+        data.resize(Nx*Ny);
+        dims.resize(2);
+        dims[0] = Nx;
+        dims[1] = Ny;
+
+        t0 = util::time_msec();
+        hbuff_randf(data.data,Nx*Ny);
+        t1 = util::time_msec();
+        printf("hbuff_randf execution time: %1.3f msec\n",t1-t0);
+
+        //test the handle BEFORE use: never hand a NULL fp to fwrite_ndarray or fclose (fclose(NULL) is undefined)
+        if(fp==NULL)
+        {
+            printf("amscurand_tests2_1 error: could not write %s\n",fnout);
+            return;
+        }
+        fwrite_ndarray(fp,&dims,&data);
+        fclose(fp);
+        return;
+    }
+
+    //Timing test: times dbuff_randf on a 4096x4096 device float buffer, copies the result
+    //back to the host and dumps it to fnout.
+    void amscurand_tests2_2()
+    {
+        const char *fnout = "../test_scripts/randf_array2.bin";
+        const int Nx = 4096;
+        const int Ny = 4096;
+        cuarray<float> data;
+        float *ddata = NULL;
+        cuarray<int> dims;
+        double t0,t1;
+        FILE *fp = fopen(fnout,"w+b");
+
+        data.resize(Nx*Ny);
+        cudaMalloc(&ddata,sizeof(float)*Nx*Ny);
+        dims.resize(2);
+        dims[0] = Nx;
+        dims[1] = Ny;
+
+        t0 = amscuda::util::time_msec();
+        dbuff_randf(ddata,Nx*Ny);
+        t1 = amscuda::util::time_msec();
+        printf("dbuff_randf execution time: %1.3f msec\n",t1-t0);
+
+        //copy back, then release the device buffer here so the early return below cannot leak it
+        cudaMemcpy(data.data,ddata,sizeof(float)*Nx*Ny,cudaMemcpyDeviceToHost);
+        cudaFree(ddata); ddata=NULL;
+
+        //test the handle BEFORE use: never hand a NULL fp to fwrite_ndarray or fclose (fclose(NULL) is undefined)
+        if(fp==NULL)
+        {
+            printf("amscurand_tests2_2 error: could not write %s\n",fnout);
+            return;
+        }
+        fwrite_ndarray(fp,&dims,&data);
+        fclose(fp);
+        return;
+    }
+
+    //Timing test: times hbuff_randint(...,0,10) on a 4096x4096 host int buffer, then dumps the buffer to fnout.
+    void amscurand_tests2_3()
+    {
+        const char *fnout = "../test_scripts/randint_array.bin";
+        const int Nx = 4096;
+        const int Ny = 4096;
+        cuarray<int> data;
+        cuarray<int> dims;
+        double t0,t1;
+        FILE *fp = fopen(fnout,"w+b");
+
+        data.resize(Nx*Ny);
+        dims.resize(2);
+        dims[0] = Nx;
+        dims[1] = Ny;
+
+        t0 = util::time_msec();
+        hbuff_randint(data.data,Nx*Ny,0,10);
+        t1 = util::time_msec();
+        printf("hbuff_randint execution time: %1.3f msec\n",t1-t0);
+
+        //test the handle BEFORE use: never hand a NULL fp to fwrite_ndarray or fclose (fclose(NULL) is undefined)
+        if(fp==NULL)
+        {
+            printf("amscurand_tests2_3 error: could not write %s\n",fnout);
+            return;
+        }
+        fwrite_ndarray(fp,&dims,&data);
+        fclose(fp);
+        return;
+    }
+
+    //Timing test: times dbuff_randint(...,0,10) on a 4096x4096 device int buffer, copies the
+    //result back to the host and dumps it to fnout.
+    void amscurand_tests2_4()
+    {
+        const char *fnout = "../test_scripts/randint_array2.bin";
+        const int Nx = 4096;
+        const int Ny = 4096;
+        cuarray<int> data;
+        int *ddata = NULL;
+        cuarray<int> dims;
+        double t0,t1;
+        FILE *fp = fopen(fnout,"w+b");
+
+        data.resize(Nx*Ny);
+        cudaMalloc(&ddata,sizeof(int)*Nx*Ny);
+        dims.resize(2);
+        dims[0] = Nx;
+        dims[1] = Ny;
+
+        t0 = amscuda::util::time_msec();
+        dbuff_randint(ddata,Nx*Ny,0,10);
+        t1 = amscuda::util::time_msec();
+        printf("dbuff_randint execution time: %1.3f msec\n",t1-t0);
+
+        //copy back, then release the device buffer here so the early return below cannot leak it
+        cudaMemcpy(data.data,ddata,sizeof(int)*Nx*Ny,cudaMemcpyDeviceToHost);
+        cudaFree(ddata); ddata=NULL;
+
+        //test the handle BEFORE use: never hand a NULL fp to fwrite_ndarray or fclose (fclose(NULL) is undefined)
+        if(fp==NULL)
+        {
+            printf("amscurand_tests2_4 error: could not write %s\n",fnout);
+            return;
+        }
+        fwrite_ndarray(fp,&dims,&data);
+        fclose(fp);
+        return;
+    }
+
+    __host__ void amscurand_tests2() //runs the four timing/file-dump tests of this file, in order
+    {
+        amscurand_tests2_1(); //hbuff_randf   -> ../test_scripts/randf_array.bin
+        amscurand_tests2_2(); //dbuff_randf   -> ../test_scripts/randf_array2.bin
+        amscurand_tests2_3(); //hbuff_randint -> ../test_scripts/randint_array.bin
+        amscurand_tests2_4(); //dbuff_randint -> ../test_scripts/randint_array2.bin
+
         return;
     }
 
diff --git a/src/amsculib3/util/amscu_util.cu b/src/amsculib3/util/amscu_util.cu
new file mode 100644
index 0000000..f53c410
--- /dev/null
+++ b/src/amsculib3/util/amscu_util.cu
@@ -0,0 +1,15 @@
+#include <amsculib3/amsculib3.hpp>
+#include <chrono>
+namespace amscuda
+{
+namespace util
+{
+
+    //Returns elapsed wall-clock time in milliseconds from a monotonic clock (arbitrary epoch: use differences only).
+    double time_msec()
+    {
+        //clock() counts CPU time summed over all threads of the process, not elapsed time, so it mis-times threaded/GPU work
+        return std::chrono::duration<double,std::milli>(std::chrono::steady_clock::now().time_since_epoch()).count();
+    }
+};
+};
\ No newline at end of file
diff --git a/src/main.cu b/src/main.cu
index c7be39f..12bc8ab 100644
--- a/src/main.cu
+++ b/src/main.cu
@@ -24,6 +24,7 @@ int main(int argc, char* argv[])
     //test_amscurarray1();
 
     //random::amscurand_tests1();
+    random::amscurand_tests2();
 
     return 0;
 }
\ No newline at end of file
diff --git a/test_scripts/test_dbuff_dpr32.py b/test_scripts/test_dbuff_dpr32.py
index 2fb817e..fae8425 100644
--- a/test_scripts/test_dbuff_dpr32.py
+++ b/test_scripts/test_dbuff_dpr32.py
@@ -40,6 +40,9 @@ def binsave_float_ndarray(fp,arr):
 ## Main Script ##
 #################
 
+
+
+
 def test_1():
 
     fname = "./test_scripts/test_dbuff_rand_dpr32.bin"
diff --git a/test_scripts/test_randomplot1.py b/test_scripts/test_randomplot1.py
new file mode 100644
index 0000000..3a6ca31
--- /dev/null
+++ b/test_scripts/test_randomplot1.py
@@ -0,0 +1,140 @@
+#!/usr/bin/python3
+
+import os,sys,math
+import numpy as np
+import matplotlib.pyplot as plt
+
+#################
+## Subroutines ##
+#################
+
+def binload_float_ndarray(fp):
+    """Read one float32 ndarray record from the open binary file fp.
+
+    Reads an int32 rank, that many int32 extents, then prod(extents) float32
+    values, and returns them reshaped to the extents.
+    Raises ValueError when the file is too short.
+    """
+    Nd = int(np.frombuffer(fp.read(4),dtype=np.int32,count=1)[0])
+    shp = []
+    piprod = 1 # Python int: cannot overflow the way an np.int32 running product can
+    for I in range(0,Nd):
+        shp.append(int(np.frombuffer(fp.read(4),dtype=np.int32,count=1)[0]))
+        piprod = piprod*shp[I]
+
+    qb = fp.read(4*piprod)
+    if(len(qb)<4*piprod):
+        raise ValueError("binload_float_ndarray: file ended before all array data was read")
+    arr = np.frombuffer(qb,dtype=np.float32,count=piprod)
+    return arr.reshape(shp)
+
+def binload_int_ndarray(fp):
+    """Read one int32 ndarray record from the open binary file fp.
+
+    Reads an int32 rank, that many int32 extents, then prod(extents) int32
+    values, and returns them reshaped to the extents.
+    Raises ValueError when the file is too short.
+    """
+    Nd = int(np.frombuffer(fp.read(4),dtype=np.int32,count=1)[0])
+    shp = []
+    piprod = 1 # Python int: cannot overflow the way an np.int32 running product can
+    for I in range(0,Nd):
+        shp.append(int(np.frombuffer(fp.read(4),dtype=np.int32,count=1)[0]))
+        piprod = piprod*shp[I]
+
+    qb = fp.read(4*piprod)
+    if(len(qb)<4*piprod):
+        raise ValueError("binload_int_ndarray: file ended before all array data was read")
+    arr = np.frombuffer(qb,dtype=np.int32,count=piprod)
+    return arr.reshape(shp)
+
+
+#################
+## Main Script ##
+#################
+
+def periodcheck(arr):
+    """Return the first index I>=3 where the array's first three values recur, else -1.
+
+    arr is flattened first. A hit means arr[I:I+3] equals arr[0:3]; a notice is
+    printed for it. Only I in range(3,N-3) is examined. Arrays with fewer than
+    three elements return -1.
+    """
+    arr = np.asarray(arr).flatten()
+    N = arr.shape[0]
+    ind = -1
+    if(N>6):
+        # vectorized form of the per-element scan; a Python-level loop is far too slow for 4096x4096 inputs
+        hits = np.flatnonzero((arr[3:N-3]==arr[0])&(arr[4:N-2]==arr[1])&(arr[5:N-1]==arr[2]))
+        if(hits.size>0):
+            ind = int(hits[0])+3
+    if(ind>=0):
+        print("array has detected a period of {} out of {}".format(ind,N))
+    return ind
+
+def test_1():
+    """Load the four array dumps, run periodcheck on each, and plot them.
+
+    Reads ./test_scripts/{randf_array,randf_array2,randint_array,randint_array2}.bin
+    (paths are relative to the current working directory). If any file cannot
+    be opened a message is printed and the function returns without plotting.
+    Two figures are shown: the float arrays, then the integer arrays with colorbars.
+    """
+
+    def load(fname,loader):
+        # Returns the loaded array, or None when the file cannot be opened.
+        # OSError only: a bare except would also swallow KeyboardInterrupt/SystemExit.
+        try:
+            fp = open(fname,"rb")
+        except OSError:
+            print("Could not open {} for reading".format(fname))
+            return None
+        # the with-block closes the file even if the loader raises
+        with fp:
+            arr = loader(fp)
+        periodcheck(arr)
+        return arr
+
+    # the file names match the dumps written by amscurand_tests2_1 .. amscurand_tests2_4
+    arr = load("./test_scripts/randf_array.bin",binload_float_ndarray)
+    if(arr is None):
+        return
+
+    arr2 = load("./test_scripts/randf_array2.bin",binload_float_ndarray)
+    if(arr2 is None):
+        return
+
+    arr3 = load("./test_scripts/randint_array.bin",binload_int_ndarray)
+    if(arr3 is None):
+        return
+    print("array 3 max {} min {}".format(np.max(arr3),np.min(arr3)))
+
+    arr4 = load("./test_scripts/randint_array2.bin",binload_int_ndarray)
+    if(arr4 is None):
+        return
+    print("array 4 max {} min {}".format(np.max(arr4),np.min(arr4)))
+
+    # figure 1: the two float arrays side by side
+    plt.subplot(2,2,1)
+    plt.imshow(arr)
+    plt.subplot(2,2,2)
+    plt.imshow(arr2)
+    plt.show()
+
+    # figure 2: the two integer arrays side by side, each with a colorbar
+    plt.subplot(2,2,1)
+    plt.imshow(arr3)
+    plt.colorbar()
+    plt.subplot(2,2,2)
+    plt.imshow(arr4)
+    plt.colorbar()
+    plt.show()
+    
+
+    return
+
+if(__name__=="__main__"):
+    test_1()
+    # sys.exit rather than exit(): exit is injected by the site module and is absent under "python -S"
+    sys.exit(0)
+