diff --git a/amsculib2.code-workspace b/amsculib2.code-workspace
index 362d7c2..d35f5f5 100644
--- a/amsculib2.code-workspace
+++ b/amsculib2.code-workspace
@@ -3,5 +3,19 @@
 		{
 			"path": "."
 		}
-	]
+	],
+	//"editor.quickSuggestions": false
+	//https://code.visualstudio.com/docs/editing/intellisense#_customizing-intellisense	
+	// Controls if quick suggestions should show up while typing
+	"settings": {
+		"editor.quickSuggestions": {
+			"other": false,
+			"comments": false,
+			"strings": false
+		},
+		"editor.suggestOnTriggerCharacters": false,
+		"editor.tabCompletion": "off",
+		"editor.parameterHints.enabled": false,
+		"editor.acceptSuggestionOnCommitCharacter": false
+	}
 }
\ No newline at end of file
diff --git a/build_linux64/libamsculib2.linux64.a b/build_linux64/libamsculib2.linux64.a
index 5a19883..06cd762 100644
Binary files a/build_linux64/libamsculib2.linux64.a and b/build_linux64/libamsculib2.linux64.a differ
diff --git a/build_linux64/objstore/amscu_comp128.o b/build_linux64/objstore/amscu_comp128.o
index 7ac3343..4af119f 100644
Binary files a/build_linux64/objstore/amscu_comp128.o and b/build_linux64/objstore/amscu_comp128.o differ
diff --git a/build_linux64/objstore/amscu_comp64.o b/build_linux64/objstore/amscu_comp64.o
index cb96761..e2dffd0 100644
Binary files a/build_linux64/objstore/amscu_comp64.o and b/build_linux64/objstore/amscu_comp64.o differ
diff --git a/build_linux64/objstore/amscu_cudafunctions.o b/build_linux64/objstore/amscu_cudafunctions.o
index 44d5377..827be9b 100644
Binary files a/build_linux64/objstore/amscu_cudafunctions.o and b/build_linux64/objstore/amscu_cudafunctions.o differ
diff --git a/build_linux64/objstore/amscu_random.o b/build_linux64/objstore/amscu_random.o
index 18ff6f8..d616470 100644
Binary files a/build_linux64/objstore/amscu_random.o and b/build_linux64/objstore/amscu_random.o differ
diff --git a/build_linux64/objstore/amscuarray.o b/build_linux64/objstore/amscuarray.o
index 56cb5bb..96d9685 100644
Binary files a/build_linux64/objstore/amscuarray.o and b/build_linux64/objstore/amscuarray.o differ
diff --git a/build_linux64/objstore/amscuarray_dops.o b/build_linux64/objstore/amscuarray_dops.o
index 0c0f549..1db6b56 100644
Binary files a/build_linux64/objstore/amscuarray_dops.o and b/build_linux64/objstore/amscuarray_dops.o differ
diff --git a/build_linux64/objstore/amscugeom.o b/build_linux64/objstore/amscugeom.o
index a016b5c..4956135 100644
Binary files a/build_linux64/objstore/amscugeom.o and b/build_linux64/objstore/amscugeom.o differ
diff --git a/build_linux64/objstore/amsculib2.o b/build_linux64/objstore/amsculib2.o
index 02c9cc7..46c1bd6 100644
Binary files a/build_linux64/objstore/amsculib2.o and b/build_linux64/objstore/amsculib2.o differ
diff --git a/build_linux64/objstore/amscumath.o b/build_linux64/objstore/amscumath.o
index b7f744b..3acd851 100644
Binary files a/build_linux64/objstore/amscumath.o and b/build_linux64/objstore/amscumath.o differ
diff --git a/build_linux64/objstore/amscurarray.o b/build_linux64/objstore/amscurarray.o
index eb3211e..f616177 100644
Binary files a/build_linux64/objstore/amscurarray.o and b/build_linux64/objstore/amscurarray.o differ
diff --git a/build_linux64/objstore/cuvect2.o b/build_linux64/objstore/cuvect2.o
index f88c2e0..2bfe452 100644
Binary files a/build_linux64/objstore/cuvect2.o and b/build_linux64/objstore/cuvect2.o differ
diff --git a/build_linux64/objstore/cuvect2f.o b/build_linux64/objstore/cuvect2f.o
index f70fa95..19dfc05 100644
Binary files a/build_linux64/objstore/cuvect2f.o and b/build_linux64/objstore/cuvect2f.o differ
diff --git a/build_linux64/objstore/cuvect3.o b/build_linux64/objstore/cuvect3.o
index 672a662..91da2d0 100644
Binary files a/build_linux64/objstore/cuvect3.o and b/build_linux64/objstore/cuvect3.o differ
diff --git a/build_linux64/objstore/cuvect3f.o b/build_linux64/objstore/cuvect3f.o
index ec36225..9725095 100644
Binary files a/build_linux64/objstore/cuvect3f.o and b/build_linux64/objstore/cuvect3f.o differ
diff --git a/build_linux64/objstore/cuvect4.o b/build_linux64/objstore/cuvect4.o
index 0b33597..18b550b 100644
Binary files a/build_linux64/objstore/cuvect4.o and b/build_linux64/objstore/cuvect4.o differ
diff --git a/build_linux64/objstore/cuvect4f.o b/build_linux64/objstore/cuvect4f.o
index 4e24365..1effae1 100644
Binary files a/build_linux64/objstore/cuvect4f.o and b/build_linux64/objstore/cuvect4f.o differ
diff --git a/build_linux64/test b/build_linux64/test
index 7b811ba..5f25607 100644
Binary files a/build_linux64/test and b/build_linux64/test differ
diff --git a/include/amsculib2/cuvect4f.hpp b/include/amsculib2/cuvect4f.hpp
index 1a5b999..f2ac7d9 100644
--- a/include/amsculib2/cuvect4f.hpp
+++ b/include/amsculib2/cuvect4f.hpp
@@ -35,30 +35,54 @@ namespace amscuda
     class cumat4f
     {
         public:
-        float dat[16];
+        //float dat[16];
 
-        // float m00,m10,m20,m30; //named references to force register use?
-        // float m01,m11,m21,m31; //switched to column-major-order to match GLSL/lapack
-        // float m02,m12,m22,m32;
-        // float m03,m13,m23,m33;
+        float m00,m10,m20,m30; //named references to force register use?
+        float m01,m11,m21,m31; //switched to column-major-order to match GLSL/lapack
+        float m02,m12,m22,m32;
+        float m03,m13,m23,m33;
 
         __host__ __device__ cumat4f();
         __host__ __device__ ~cumat4f();
-        __host__ __device__ float& operator[](const int I);
-        __host__ __device__ float& operator()(const int I, const int J);
-        __host__ __device__ float& at(const int I, const int J);
+        __host__ __device__ cumat4f(
+            const float & _m00, const float & _m01, const float & _m02, const float & _m03,
+            const float & _m10, const float & _m11, const float & _m12, const float & _m13,
+            const float & _m20, const float & _m21, const float & _m22, const float & _m23,
+            const float & _m30, const float & _m31, const float & _m32, const float & _m33
+        );
 
-        __host__ __device__ cumat4f operator+(cumat4f lhs);
-        __host__ __device__ cumat4f operator-(cumat4f lhs);
-        __host__ __device__ cumat4f operator*(float lhs);
-        __host__ __device__ cumat4f operator/(float lhs);
-        __host__ __device__ cuvect4f operator*(cuvect4f lhs);
-        __host__ __device__ cumat4f operator*(cumat4f lhs);
-        __host__ __device__ friend cumat4f operator-(cumat4f rhs);
+        __host__ __device__ explicit cumat4f(const float *data16);
+        
+        //__forceinline__
+        __host__ __device__ float& operator[](const int &I);
+        __host__ __device__ float& operator()(const int &I, const int &J);
+        __host__ __device__ float& at(const int &I, const int &J);
 
+        __host__ __device__ const float& operator[](const int &I) const;
+        __host__ __device__ const float& operator()(const int &I, const int &J) const;
+        __host__ __device__ const float& at(const int &I, const int &J) const;
+
+        __host__ __device__ cumat4f operator+(const cumat4f &rhs);
+        __host__ __device__ cumat4f operator-(const cumat4f &rhs);
+        __host__ __device__ cumat4f operator*(const float &rhs);
+        __host__ __device__ cumat4f operator/(const float &rhs);
+        __host__ __device__ cuvect4f operator*(const cuvect4f &rhs);
+        __host__ __device__ cumat4f operator*(const cumat4f &rhs);
+        __host__ __device__ friend cumat4f operator-(const cumat4f &rhs);
+        
         __host__ __device__ float det();
         __host__ __device__ cumat4f transpose();
         __host__ __device__ cumat4f inverse();
+
+        __host__ __device__ float* data(); //pointer to float[9] representation of matrix
+        __host__ __device__ const float* data() const; //pointer to float[9] representation of matrix
+
+        //In place operations (to save GPU register use)
+        __host__ __device__ cumat4f& operator+=(const cumat4f &rhs);
+        __host__ __device__ cumat4f& operator-=(const cumat4f &rhs);
+        __host__ __device__ cumat4f& operator/=(const float &rhs);
+        __host__ __device__ cumat4f& operator*=(const float &rhs);
+        __host__ __device__ cumat4f& operator*=(const cumat4f &rhs);
     };
 
     __host__ __device__ float cuvect4f_dot(cuvect4f a, cuvect4f b);
diff --git a/src/amsculib2/cuvect4f.cu b/src/amsculib2/cuvect4f.cu
index 6a4fc7f..46c60e7 100644
--- a/src/amsculib2/cuvect4f.cu
+++ b/src/amsculib2/cuvect4f.cu
@@ -115,6 +115,16 @@ __host__ __device__ cuvect4f& cuvect4f::operator/=(const float &rhs)
     return *this;
 }
 
+__host__ __device__ cuvect4f operator-(const cuvect4f &rhs)
+{
+    cuvect4f ret;
+    ret[0] = -rhs[0];
+    ret[1] = -rhs[1];
+    ret[2] = -rhs[2];
+    ret[3] = -rhs[3];
+    return ret;
+}
+
 
 __host__ __device__ cuvect4f::cuvect4f(const float &_x, const float &_y, const float &_z, const float &_w)
 {
@@ -122,302 +132,545 @@ __host__ __device__ cuvect4f::cuvect4f(const float &_x, const float &_y, const f
     return;
 }
 
+
 __host__ __device__ cumat4f::cumat4f()
 {
-    int I;
-    for(I=0;I<16;I++)
-    {
-        dat[I] = 0.0;
-    }
+    m00 = 0.0f;
+    m10 = 0.0f;
+    m20 = 0.0f;
+    m30 = 0.0f;
+
+    m01 = 0.0f;
+    m11 = 0.0f;
+    m21 = 0.0f;
+    m31 = 0.0f;
+
+    m02 = 0.0f;
+    m12 = 0.0f;
+    m22 = 0.0f;
+    m32 = 0.0f;
+
+    m03 = 0.0f;
+    m13 = 0.0f;
+    m23 = 0.0f;
+    m33 = 0.0f;
+    
     return;
 }
 
 __host__ __device__ cumat4f::~cumat4f()
 {
-    int I;
-    for(I=0;I<16;I++)
-    {
-        dat[I] = 0.0;
-    }
+    m00 = 0.0f;
+    m10 = 0.0f;
+    m20 = 0.0f;
+    m30 = 0.0f;
+
+    m01 = 0.0f;
+    m11 = 0.0f;
+    m21 = 0.0f;
+    m31 = 0.0f;
+
+    m02 = 0.0f;
+    m12 = 0.0f;
+    m22 = 0.0f;
+    m32 = 0.0f;
+
+    m03 = 0.0f;
+    m13 = 0.0f;
+    m23 = 0.0f;
+    m33 = 0.0f;
+
     return;
 }
 
-__host__ __device__ float& cumat4f::operator[](const int I)
+__host__ __device__ cumat4f::cumat4f(
+    const float & _m00, const float & _m01, const float & _m02, const float & _m03,
+    const float & _m10, const float & _m11, const float & _m12, const float & _m13,
+    const float & _m20, const float & _m21, const float & _m22, const float & _m23,
+    const float & _m30, const float & _m31, const float & _m32, const float & _m33
+)
 {
-    return dat[I];
+    m00 = _m00;
+    m10 = _m10;
+    m20 = _m20;
+    m30 = _m30;
+
+    m01 = _m01;
+    m11 = _m11;
+    m21 = _m21;
+    m31 = _m31;
+
+    m02 = _m02;
+    m12 = _m12;
+    m22 = _m22;
+    m32 = _m32;
+
+    m03 = _m03;
+    m13 = _m13;
+    m23 = _m23;
+    m33 = _m33;
+    
+    return;
 }
 
-__host__ __device__ float& cumat4f::operator()(const int I, const int J)
+__host__ __device__ cumat4f::cumat4f(const float *data16)
 {
-    return dat[I+4*J];
+    m00 = data16[0];
+    m10 = data16[1];
+    m20 = data16[2];
+    m30 = data16[3];
+
+    m01 = data16[4];
+    m11 = data16[5];
+    m21 = data16[6];
+    m31 = data16[7];
+
+    m02 = data16[8];
+    m12 = data16[9];
+    m22 = data16[10];
+    m32 = data16[11];
+
+    m03 = data16[12];
+    m13 = data16[13];
+    m23 = data16[14];
+    m33 = data16[15];
+
+    return;
 }
 
-__host__ __device__ float& cumat4f::at(const int I, const int J)
+__host__ __device__ float& cumat4f::operator[](const int &I)
 {
-    return dat[I+4*J];
+    switch(I)
+    {
+        case 0: return m00;
+        case 1: return m10;
+        case 2: return m20;
+        case 3: return m30;
+
+        case 4: return m01;
+        case 5: return m11;
+        case 6: return m21;
+        case 7: return m31;
+
+        case 8: return m02;
+        case 9: return m12;
+        case 10: return m22;
+        case 11: return m32;
+
+        case 12: return m03;
+        case 13: return m13;
+        case 14: return m23;
+        case 15: return m33;
+    }
+    
+    return m00;
 }
 
-__host__ __device__ cumat4f cumat4f::operator+(cumat4f lhs)
+__host__ __device__ float& cumat4f::operator()(const int &I, const int &J)
+{
+    return (*this)[I+4*J];
+}
+
+
+__host__ __device__ float& cumat4f::at(const int &I, const int &J)
+{
+    return (*this)[I+4*J];
+}
+
+__host__ __device__ const float& cumat4f::operator[](const int &I) const 
+{
+    switch(I)
+    {
+        case 0: return m00;
+        case 1: return m10;
+        case 2: return m20;
+        case 3: return m30;
+
+        case 4: return m01;
+        case 5: return m11;
+        case 6: return m21;
+        case 7: return m31;
+
+        case 8: return m02;
+        case 9: return m12;
+        case 10: return m22;
+        case 11: return m32;
+
+        case 12: return m03;
+        case 13: return m13;
+        case 14: return m23;
+        case 15: return m33;
+    }
+    
+    return m00;
+}
+
+__host__ __device__ const float& cumat4f::operator()(const int &I, const int &J) const 
+{
+    return (*this)[I+4*J];
+}
+
+
+__host__ __device__ const float& cumat4f::at(const int &I, const int &J) const 
+{
+    return (*this)[I+4*J];
+}
+
+__host__ __device__ cumat4f cumat4f::operator+(const cumat4f &rhs)
 {
     cumat4f ret;
-    int I;
-    for(I=0;I<16;I++)
-    {
-        ret.dat[I] = this->dat[I] + lhs.dat[I];
-    }
+    ret.m00 = m00 + rhs.m00;
+    ret.m10 = m10 + rhs.m10;
+    ret.m20 = m20 + rhs.m20;
+    ret.m30 = m30 + rhs.m30;
+    ret.m01 = m01 + rhs.m01;
+    ret.m11 = m11 + rhs.m11;
+    ret.m21 = m21 + rhs.m21;
+    ret.m31 = m31 + rhs.m31;
+    ret.m02 = m02 + rhs.m02;
+    ret.m12 = m12 + rhs.m12;
+    ret.m22 = m22 + rhs.m22;
+    ret.m32 = m32 + rhs.m32;
+    ret.m03 = m03 + rhs.m03;
+    ret.m13 = m13 + rhs.m13;
+    ret.m23 = m23 + rhs.m23;
+    ret.m33 = m33 + rhs.m33;
     return ret;
 }
 
-__host__ __device__ cumat4f cumat4f::operator-(cumat4f lhs)
+__host__ __device__ cumat4f cumat4f::operator-(const cumat4f &rhs)
 {
     cumat4f ret;
-    int I;
-    for(I=0;I<16;I++)
-    {
-        ret.dat[I] = this->dat[I] - lhs.dat[I];
-    }
+    ret.m00 = m00 - rhs.m00;
+    ret.m10 = m10 - rhs.m10;
+    ret.m20 = m20 - rhs.m20;
+    ret.m30 = m30 - rhs.m30;
+    ret.m01 = m01 - rhs.m01;
+    ret.m11 = m11 - rhs.m11;
+    ret.m21 = m21 - rhs.m21;
+    ret.m31 = m31 - rhs.m31;
+    ret.m02 = m02 - rhs.m02;
+    ret.m12 = m12 - rhs.m12;
+    ret.m22 = m22 - rhs.m22;
+    ret.m32 = m32 - rhs.m32;
+    ret.m03 = m03 - rhs.m03;
+    ret.m13 = m13 - rhs.m13;
+    ret.m23 = m23 - rhs.m23;
+    ret.m33 = m33 - rhs.m33;
     return ret;
 }
 
-__host__ __device__ cumat4f cumat4f::operator*(float lhs)
+__host__ __device__ cumat4f cumat4f::operator*(const float &rhs)
 {
     cumat4f ret;
-    int I;
-    for(I=0;I<16;I++)
-    {
-        ret.dat[I] = this->dat[I]*lhs;
-    }
+    ret.m00 = m00 * rhs;
+    ret.m10 = m10 * rhs;
+    ret.m20 = m20 * rhs;
+    ret.m30 = m30 * rhs;
+    ret.m01 = m01 * rhs;
+    ret.m11 = m11 * rhs;
+    ret.m21 = m21 * rhs;
+    ret.m31 = m31 * rhs;
+    ret.m02 = m02 * rhs;
+    ret.m12 = m12 * rhs;
+    ret.m22 = m22 * rhs;
+    ret.m32 = m32 * rhs;
+    ret.m03 = m03 * rhs;
+    ret.m13 = m13 * rhs;
+    ret.m23 = m23 * rhs;
+    ret.m33 = m33 * rhs;
     return ret;
 }
 
-__host__ __device__ cumat4f cumat4f::operator/(float lhs)
+__host__ __device__ cumat4f cumat4f::operator/(const float &rhs)
 {
     cumat4f ret;
-    int I;
-    for(I=0;I<16;I++)
-    {
-        ret.dat[I] = this->dat[I]/lhs;
-    }
+    ret.m00 = m00 / rhs;
+    ret.m10 = m10 / rhs;
+    ret.m20 = m20 / rhs;
+    ret.m30 = m30 / rhs;
+    ret.m01 = m01 / rhs;
+    ret.m11 = m11 / rhs;
+    ret.m21 = m21 / rhs;
+    ret.m31 = m31 / rhs;
+    ret.m02 = m02 / rhs;
+    ret.m12 = m12 / rhs;
+    ret.m22 = m22 / rhs;
+    ret.m32 = m32 / rhs;
+    ret.m03 = m03 / rhs;
+    ret.m13 = m13 / rhs;
+    ret.m23 = m23 / rhs;
+    ret.m33 = m33 / rhs;
     return ret;
 }
-
-__host__ __device__ cuvect4f cumat4f::operator*(cuvect4f lhs)
+        
+__host__ __device__ cuvect4f cumat4f::operator*(const cuvect4f &rhs)
 {
-    cuvect4f ret = cuvect4f(0.0,0.0,0.0,0.0);
-    int I,J;
-
-    for(I=0;I<4;I++)
-    {
-        for(J=0;J<4;J++)
-        {
-            ret[I] = ret[I] + this->at(I,J)*lhs[J];
-        }
-    }
-
+    cuvect4f ret;
+    ret.x = m00*rhs.x + m01*rhs.y + m02*rhs.z + m03*rhs.w;
+    ret.y = m10*rhs.x + m11*rhs.y + m12*rhs.z + m13*rhs.w;
+    ret.z = m20*rhs.x + m21*rhs.y + m22*rhs.z + m23*rhs.w;
+    ret.w = m30*rhs.x + m31*rhs.y + m32*rhs.z + m33*rhs.w;
     return ret;
 }
 
-__host__ __device__ cumat4f cumat4f::operator*(cumat4f lhs)
+
+__host__ __device__ cumat4f cumat4f::operator*(const cumat4f &rhs)
 {
     cumat4f ret;
-    int I,J,K;
-    for(I=0;I<4;I++)
-    {
-        for(J=0;J<4;J++)
-        {
-            ret(I,J) = 0;
-            for(K=0;K<4;K++)
-            {
-                ret(I,J) = ret(I,J) + this->at(I,K) * lhs(K,J);
-            }
-        }
-    }
+    ret.m00 = m00*rhs.m00 + m01*rhs.m10 + m02*rhs.m20 + m03*rhs.m30;
+    ret.m01 = m00*rhs.m01 + m01*rhs.m11 + m02*rhs.m21 + m03*rhs.m31;
+    ret.m02 = m00*rhs.m02 + m01*rhs.m12 + m02*rhs.m22 + m03*rhs.m32;
+    ret.m03 = m00*rhs.m03 + m01*rhs.m13 + m02*rhs.m23 + m03*rhs.m33;
+    ret.m10 = m10*rhs.m00 + m11*rhs.m10 + m12*rhs.m20 + m13*rhs.m30;
+    ret.m11 = m10*rhs.m01 + m11*rhs.m11 + m12*rhs.m21 + m13*rhs.m31;
+    ret.m12 = m10*rhs.m02 + m11*rhs.m12 + m12*rhs.m22 + m13*rhs.m32;
+    ret.m13 = m10*rhs.m03 + m11*rhs.m13 + m12*rhs.m23 + m13*rhs.m33;
+    ret.m20 = m20*rhs.m00 + m21*rhs.m10 + m22*rhs.m20 + m23*rhs.m30;
+    ret.m21 = m20*rhs.m01 + m21*rhs.m11 + m22*rhs.m21 + m23*rhs.m31;
+    ret.m22 = m20*rhs.m02 + m21*rhs.m12 + m22*rhs.m22 + m23*rhs.m32;
+    ret.m23 = m20*rhs.m03 + m21*rhs.m13 + m22*rhs.m23 + m23*rhs.m33;
+    ret.m30 = m30*rhs.m00 + m31*rhs.m10 + m32*rhs.m20 + m33*rhs.m30;
+    ret.m31 = m30*rhs.m01 + m31*rhs.m11 + m32*rhs.m21 + m33*rhs.m31;
+    ret.m32 = m30*rhs.m02 + m31*rhs.m12 + m32*rhs.m22 + m33*rhs.m32;
+    ret.m33 = m30*rhs.m03 + m31*rhs.m13 + m32*rhs.m23 + m33*rhs.m33;
     return ret;
 }
-
 __host__ __device__ cumat4f cumat4f::transpose()
 {
     cumat4f q;
-    int I,J;
-    for(I=0;I<4;I++)
-    {
-        for(J=0;J<4;J++)
-        {
-            q(I,J) = this->at(J,I);
-        }
-    }
+    q.m00 = m00;
+    q.m10 = m01;
+    q.m20 = m02;
+    q.m30 = m03;
+    
+    q.m01 = m10;
+    q.m11 = m11;
+    q.m21 = m12;
+    q.m31 = m13;
+    
+    q.m02 = m20;
+    q.m12 = m21;
+    q.m22 = m22;
+    q.m32 = m23;
+    
+    q.m03 = m30;
+    q.m13 = m31;
+    q.m23 = m32;
+    q.m33 = m33;
+    
     return q;
 }
 
 __host__ __device__ float cumat4f::det()
 {
-    float a00,a01,a02,a03;
-    float a10,a11,a12,a13;
-    float a20,a21,a22,a23;
-    float a30,a31,a32,a33;
-    float det;
+    float dt;
+    dt = m03*m12*m21*m30 -
+        m02*m13*m21*m30 -
+        m03*m11*m22*m30 +
+        m01*m13*m22*m30 +
+        m02*m11*m23*m30 -
+        m01*m12*m23*m30 -
+        m03*m12*m20*m31 +
+        m02*m13*m20*m31 +
+        m03*m10*m22*m31 -
+        m00*m13*m22*m31 -
+        m02*m10*m23*m31 +
+        m00*m12*m23*m31 +
+        m03*m11*m20*m32 -
+        m01*m13*m20*m32 -
+        m03*m10*m21*m32 +
+        m00*m13*m21*m32 +
+        m01*m10*m23*m32 -
+        m00*m11*m23*m32 -
+        m02*m11*m20*m33 +
+        m01*m12*m20*m33 +
+        m02*m10*m21*m33 -
+        m00*m12*m21*m33 -
+        m01*m10*m22*m33 +
+        m00*m11*m22*m33;
 
-    a00 = this->at(0,0);
-    a01 = this->at(0,1);
-    a02 = this->at(0,2);
-    a03 = this->at(0,3);
-    a10 = this->at(1,0);
-    a11 = this->at(1,1);
-    a12 = this->at(1,2);
-    a13 = this->at(1,3);
-    a20 = this->at(2,0);
-    a21 = this->at(2,1);
-    a22 = this->at(2,2);
-    a23 = this->at(2,3);
-    a30 = this->at(3,0);
-    a31 = this->at(3,1);
-    a32 = this->at(3,2);
-    a33 = this->at(3,3);
-
-    det = a03*a12*a21*a30 -
-        a02*a13*a21*a30 -
-        a03*a11*a22*a30 +
-        a01*a13*a22*a30 +
-        a02*a11*a23*a30 -
-        a01*a12*a23*a30 -
-        a03*a12*a20*a31 +
-        a02*a13*a20*a31 +
-        a03*a10*a22*a31 -
-        a00*a13*a22*a31 -
-        a02*a10*a23*a31 +
-        a00*a12*a23*a31 +
-        a03*a11*a20*a32 -
-        a01*a13*a20*a32 -
-        a03*a10*a21*a32 +
-        a00*a13*a21*a32 +
-        a01*a10*a23*a32 -
-        a00*a11*a23*a32 -
-        a02*a11*a20*a33 +
-        a01*a12*a20*a33 +
-        a02*a10*a21*a33 -
-        a00*a12*a21*a33 -
-        a01*a10*a22*a33 +
-        a00*a11*a22*a33;
-
-    return det;
-}
-
-__host__ __device__ cumat4f minverse(cumat4f ma)
-{
-  cumat4f mb;
-
-  float a00,a01,a02,a03;
-  float a10,a11,a12,a13;
-  float a20,a21,a22,a23;
-  float a30,a31,a32,a33;
-
-  float b00,b01,b02,b03;
-  float b10,b11,b12,b13;
-  float b20,b21,b22,b23;
-  float b30,b31,b32,b33;
-
-  float det = 0.0;
-
-  a00 = ma.at(0,0);
-  a01 = ma.at(0,1);
-  a02 = ma.at(0,2);
-  a03 = ma.at(0,3);
-  a10 = ma.at(1,0);
-  a11 = ma.at(1,1);
-  a12 = ma.at(1,2);
-  a13 = ma.at(1,3);
-  a20 = ma.at(2,0);
-  a21 = ma.at(2,1);
-  a22 = ma.at(2,2);
-  a23 = ma.at(2,3);
-  a30 = ma.at(3,0);
-  a31 = ma.at(3,1);
-  a32 = ma.at(3,2);
-  a33 = ma.at(3,3);
-
-  det = a03*a12*a21*a30 -
-        a02*a13*a21*a30 -
-        a03*a11*a22*a30 +
-        a01*a13*a22*a30 +
-        a02*a11*a23*a30 -
-        a01*a12*a23*a30 -
-        a03*a12*a20*a31 +
-        a02*a13*a20*a31 +
-        a03*a10*a22*a31 -
-        a00*a13*a22*a31 -
-        a02*a10*a23*a31 +
-        a00*a12*a23*a31 +
-        a03*a11*a20*a32 -
-        a01*a13*a20*a32 -
-        a03*a10*a21*a32 +
-        a00*a13*a21*a32 +
-        a01*a10*a23*a32 -
-        a00*a11*a23*a32 -
-        a02*a11*a20*a33 +
-        a01*a12*a20*a33 +
-        a02*a10*a21*a33 -
-        a00*a12*a21*a33 -
-        a01*a10*a22*a33 +
-        a00*a11*a22*a33;
-
-  if(det*det>1.0E-30)
-  {
-    b00 = -a13*a22*a31 + a12*a23*a31 + a13*a21*a32 - a11*a23*a32 - a12*a21*a33 + a11*a22*a33;
-    b01 =  a03*a22*a31 - a02*a23*a31 - a03*a21*a32 + a01*a23*a32 + a02*a21*a33 - a01*a22*a33;
-    b02 = -a03*a12*a31 + a02*a13*a31 + a03*a11*a32 - a01*a13*a32 - a02*a11*a33 + a01*a12*a33;
-    b03 =  a03*a12*a21 - a02*a13*a21 - a03*a11*a22 + a01*a13*a22 + a02*a11*a23 - a01*a12*a23;
-    b10 =  a13*a22*a30 - a12*a23*a30 - a13*a20*a32 + a10*a23*a32 + a12*a20*a33 - a10*a22*a33;
-    b11 = -a03*a22*a30 + a02*a23*a30 + a03*a20*a32 - a00*a23*a32 - a02*a20*a33 + a00*a22*a33;
-    b12 =  a03*a12*a30 - a02*a13*a30 - a03*a10*a32 + a00*a13*a32 + a02*a10*a33 - a00*a12*a33;
-    b13 = -a03*a12*a20 + a02*a13*a20 + a03*a10*a22 - a00*a13*a22 - a02*a10*a23 + a00*a12*a23;
-    b20 = -a13*a21*a30 + a11*a23*a30 + a13*a20*a31 - a10*a23*a31 - a11*a20*a33 + a10*a21*a33;
-    b21 =  a03*a21*a30 - a01*a23*a30 - a03*a20*a31 + a00*a23*a31 + a01*a20*a33 - a00*a21*a33;
-    b22 = -a03*a11*a30 + a01*a13*a30 + a03*a10*a31 - a00*a13*a31 - a01*a10*a33 + a00*a11*a33;
-    b23 =  a03*a11*a20 - a01*a13*a20 - a03*a10*a21 + a00*a13*a21 + a01*a10*a23 - a00*a11*a23;
-    b30 =  a12*a21*a30 - a11*a22*a30 - a12*a20*a31 + a10*a22*a31 + a11*a20*a32 - a10*a21*a32;
-    b31 = -a02*a21*a30 + a01*a22*a30 + a02*a20*a31 - a00*a22*a31 - a01*a20*a32 + a00*a21*a32;
-    b32 =  a02*a11*a30 - a01*a12*a30 - a02*a10*a31 + a00*a12*a31 + a01*a10*a32 - a00*a11*a32;
-    b33 = -a02*a11*a20 + a01*a12*a20 + a02*a10*a21 - a00*a12*a21 - a01*a10*a22 + a00*a11*a22;
-    b00 = b00/det;
-    b01 = b01/det;
-    b02 = b02/det;
-    b03 = b03/det;
-    b10 = b10/det;
-    b11 = b11/det;
-    b12 = b12/det;
-    b13 = b13/det;
-    b20 = b20/det;
-    b21 = b21/det;
-    b22 = b22/det;
-    b23 = b23/det;
-    b30 = b30/det;
-    b31 = b31/det;
-    b32 = b32/det;
-    b33 = b33/det;
-    mb.at(0,0) = b00;
-    mb.at(0,1) = b01;
-    mb.at(0,2) = b02;
-    mb.at(0,3) = b03;
-    mb.at(1,0) = b10;
-    mb.at(1,1) = b11;
-    mb.at(1,2) = b12;
-    mb.at(1,3) = b13;
-    mb.at(2,0) = b20;
-    mb.at(2,1) = b21;
-    mb.at(2,2) = b22;
-    mb.at(2,3) = b23;
-    mb.at(3,0) = b30;
-    mb.at(3,1) = b31;
-    mb.at(3,2) = b32;
-    mb.at(3,3) = b33;
-  }
-  //this was STUPID. Gaah. Computer algebra system saves the day? I'd be surprised if this didn't end up *slower* than gaussian elimination. Don't do this again!
-  return mb;
+    return dt;
 }
 
 __host__ __device__ cumat4f cumat4f::inverse()
 {
-    return minverse(*this);
+    cumat4f mret;
+    float dt = this->det();
+    float idet;
+
+    if(dt>1E-7 || dt<-1E-7)
+    {
+      idet = 1.0f/dt;
+      mret.m00 = -m13*m22*m31 + m12*m23*m31 + m13*m21*m32 - m11*m23*m32 - m12*m21*m33 + m11*m22*m33;
+      mret.m01 =  m03*m22*m31 - m02*m23*m31 - m03*m21*m32 + m01*m23*m32 + m02*m21*m33 - m01*m22*m33;
+      mret.m02 = -m03*m12*m31 + m02*m13*m31 + m03*m11*m32 - m01*m13*m32 - m02*m11*m33 + m01*m12*m33;
+      mret.m03 =  m03*m12*m21 - m02*m13*m21 - m03*m11*m22 + m01*m13*m22 + m02*m11*m23 - m01*m12*m23;
+      mret.m10 =  m13*m22*m30 - m12*m23*m30 - m13*m20*m32 + m10*m23*m32 + m12*m20*m33 - m10*m22*m33;
+      mret.m11 = -m03*m22*m30 + m02*m23*m30 + m03*m20*m32 - m00*m23*m32 - m02*m20*m33 + m00*m22*m33;
+      mret.m12 =  m03*m12*m30 - m02*m13*m30 - m03*m10*m32 + m00*m13*m32 + m02*m10*m33 - m00*m12*m33;
+      mret.m13 = -m03*m12*m20 + m02*m13*m20 + m03*m10*m22 - m00*m13*m22 - m02*m10*m23 + m00*m12*m23;
+      mret.m20 = -m13*m21*m30 + m11*m23*m30 + m13*m20*m31 - m10*m23*m31 - m11*m20*m33 + m10*m21*m33;
+      mret.m21 =  m03*m21*m30 - m01*m23*m30 - m03*m20*m31 + m00*m23*m31 + m01*m20*m33 - m00*m21*m33;
+      mret.m22 = -m03*m11*m30 + m01*m13*m30 + m03*m10*m31 - m00*m13*m31 - m01*m10*m33 + m00*m11*m33;
+      mret.m23 =  m03*m11*m20 - m01*m13*m20 - m03*m10*m21 + m00*m13*m21 + m01*m10*m23 - m00*m11*m23;
+      mret.m30 =  m12*m21*m30 - m11*m22*m30 - m12*m20*m31 + m10*m22*m31 + m11*m20*m32 - m10*m21*m32;
+      mret.m31 = -m02*m21*m30 + m01*m22*m30 + m02*m20*m31 - m00*m22*m31 - m01*m20*m32 + m00*m21*m32;
+      mret.m32 =  m02*m11*m30 - m01*m12*m30 - m02*m10*m31 + m00*m12*m31 + m01*m10*m32 - m00*m11*m32;
+      mret.m33 = -m02*m11*m20 + m01*m12*m20 + m02*m10*m21 - m00*m12*m21 - m01*m10*m22 + m00*m11*m22;
+      mret.m00 *= idet;
+      mret.m01 *= idet;
+      mret.m02 *= idet;
+      mret.m03 *= idet;
+      mret.m10 *= idet;
+      mret.m11 *= idet;
+      mret.m12 *= idet;
+      mret.m13 *= idet;
+      mret.m20 *= idet;
+      mret.m21 *= idet;
+      mret.m22 *= idet;
+      mret.m23 *= idet;
+      mret.m30 *= idet;
+      mret.m31 *= idet;
+      mret.m32 *= idet;
+      mret.m33 *= idet;
+    }
+    else
+    {
+      mret.m00 = inf;
+      mret.m10 = inf;
+      mret.m20 = inf;
+      mret.m30 = inf;
+      mret.m01 = inf;
+      mret.m11 = inf;
+      mret.m21 = inf;
+      mret.m31 = inf;
+      mret.m02 = inf;
+      mret.m12 = inf;
+      mret.m22 = inf;
+      mret.m32 = inf;
+      mret.m03 = inf;
+      mret.m13 = inf;
+      mret.m23 = inf;
+      mret.m33 = inf;
+    }
+    //this was STUPID. Gaah. Computer algebra system saves the day? I'd be surprised if this didn't end up *slower* than gaussian elimination. Don't do this again!
+
+    return mret;
+}
+
+__host__ __device__ cumat4f operator-(const cumat4f &rhs)
+{
+    cumat4f ret;
+    ret.m00 = -rhs.m00;
+    ret.m10 = -rhs.m10;
+    ret.m20 = -rhs.m20;
+    ret.m30 = -rhs.m30;
+    ret.m01 = -rhs.m01;
+    ret.m11 = -rhs.m11;
+    ret.m21 = -rhs.m21;
+    ret.m31 = -rhs.m31;
+    ret.m02 = -rhs.m02;
+    ret.m12 = -rhs.m12;
+    ret.m22 = -rhs.m22;
+    ret.m32 = -rhs.m32;
+    ret.m03 = -rhs.m03;
+    ret.m13 = -rhs.m13;
+    ret.m23 = -rhs.m23;
+    ret.m33 = -rhs.m33;
+    return ret;
+}
+
+__host__ __device__ cumat4f& cumat4f::operator+=(const cumat4f &rhs)
+{
+    m00 += rhs.m00;  
+    m01 += rhs.m01;  
+    m02 += rhs.m02;  
+    m03 += rhs.m03;
+    m10 += rhs.m10;  
+    m11 += rhs.m11;  
+    m12 += rhs.m12;  
+    m13 += rhs.m13;
+    m20 += rhs.m20;  
+    m21 += rhs.m21;  
+    m22 += rhs.m22;  
+    m23 += rhs.m23;
+    m30 += rhs.m30;  
+    m31 += rhs.m31;  
+    m32 += rhs.m32;  
+    m33 += rhs.m33;
+    return *this;
+}
+
+__host__ __device__ cumat4f& cumat4f::operator-=(const cumat4f &rhs)
+{
+    m00 -= rhs.m00;  
+    m01 -= rhs.m01;  
+    m02 -= rhs.m02;  
+    m03 -= rhs.m03;
+    m10 -= rhs.m10;  
+    m11 -= rhs.m11;  
+    m12 -= rhs.m12;  
+    m13 -= rhs.m13;
+    m20 -= rhs.m20;  
+    m21 -= rhs.m21; 
+    m22 -= rhs.m22;  
+    m23 -= rhs.m23;
+    m30 -= rhs.m30;  
+    m31 -= rhs.m31;  
+    m32 -= rhs.m32;  
+    m33 -= rhs.m33;
+    return *this;
+}
+
+__host__ __device__ cumat4f& cumat4f::operator*=(const float &rhs)
+{
+    m00 *= rhs;  
+    m01 *= rhs;  
+    m02 *= rhs;  
+    m03 *= rhs;
+    m10 *= rhs;  
+    m11 *= rhs;  
+    m12 *= rhs;  
+    m13 *= rhs;
+    m20 *= rhs;  
+    m21 *= rhs;  
+    m22 *= rhs;  
+    m23 *= rhs;
+    m30 *= rhs;  
+    m31 *= rhs;  
+    m32 *= rhs;  
+    m33 *= rhs;
+    return *this;
+}
+
+__host__ __device__ cumat4f& cumat4f::operator/=(const float &rhs)
+{
+    float irhs = 1.0f / rhs;
+    m00 *= irhs;  
+    m01 *= irhs;  
+    m02 *= irhs;  
+    m03 *= irhs;
+    m10 *= irhs;  
+    m11 *= irhs;  
+    m12 *= irhs;  
+    m13 *= irhs;
+    m20 *= irhs;  
+    m21 *= irhs;  
+    m22 *= irhs;  
+    m23 *= irhs;
+    m30 *= irhs;  
+    m31 *= irhs;  
+    m32 *= irhs;  
+    m33 *= irhs;
+    return *this;
+}
+
+__host__ __device__ cumat4f& cumat4f::operator*=(const cumat4f &rhs)
+{
+    cumat4f tmp = (*this);
+    (*this) = tmp*rhs;
+    return *this;;
 }
 
 
@@ -453,22 +706,7 @@ __host__ __device__ cuvect4f cuvect4f_proj(cuvect4f a, cuvect4f b)
     return ret;
 }
 
-__host__ __device__ cuvect4f operator-(cuvect4f rhs)
-{
-    cuvect4f ret;
-    ret[0] = -rhs[0];
-    ret[1] = -rhs[1];
-    ret[2] = -rhs[2];
-    ret[3] = -rhs[3];
-    return ret;
-}
 
-__host__ __device__ cumat4f operator-(cumat4f rhs)
-{
-    cumat4f ret;
-    int I;
-    for(I=0;I<16;I++) ret[I] = -rhs[I];
-    return ret;
-}
+
 
 }; //namespace amscuda