diff --git a/build_linux64/libamsculib2.linux64.a b/build_linux64/libamsculib2.linux64.a
index 03f8a6f..6b696bb 100644
Binary files a/build_linux64/libamsculib2.linux64.a and b/build_linux64/libamsculib2.linux64.a differ
diff --git a/build_linux64/objstore/amscu_comp128.o b/build_linux64/objstore/amscu_comp128.o
index 31b4869..ff579cb 100644
Binary files a/build_linux64/objstore/amscu_comp128.o and b/build_linux64/objstore/amscu_comp128.o differ
diff --git a/build_linux64/objstore/amscu_comp64.o b/build_linux64/objstore/amscu_comp64.o
index 600fd01..43bc637 100644
Binary files a/build_linux64/objstore/amscu_comp64.o and b/build_linux64/objstore/amscu_comp64.o differ
diff --git a/build_linux64/objstore/amscu_cudafunctions.o b/build_linux64/objstore/amscu_cudafunctions.o
index c8def50..0b9561a 100644
Binary files a/build_linux64/objstore/amscu_cudafunctions.o and b/build_linux64/objstore/amscu_cudafunctions.o differ
diff --git a/build_linux64/objstore/amscu_random.o b/build_linux64/objstore/amscu_random.o
index c9163e1..5b69888 100644
Binary files a/build_linux64/objstore/amscu_random.o and b/build_linux64/objstore/amscu_random.o differ
diff --git a/build_linux64/objstore/amscuarray.o b/build_linux64/objstore/amscuarray.o
index 7b9bf27..777dd9e 100644
Binary files a/build_linux64/objstore/amscuarray.o and b/build_linux64/objstore/amscuarray.o differ
diff --git a/build_linux64/objstore/amscuarray_dops.o b/build_linux64/objstore/amscuarray_dops.o
index 25e8f42..280372b 100644
Binary files a/build_linux64/objstore/amscuarray_dops.o and b/build_linux64/objstore/amscuarray_dops.o differ
diff --git a/build_linux64/objstore/amscugeom.o b/build_linux64/objstore/amscugeom.o
index 7260fa5..3794d5d 100644
Binary files a/build_linux64/objstore/amscugeom.o and b/build_linux64/objstore/amscugeom.o differ
diff --git a/build_linux64/objstore/amsculib2.o b/build_linux64/objstore/amsculib2.o
index 838b8fa..e93abb5 100644
Binary files a/build_linux64/objstore/amsculib2.o and b/build_linux64/objstore/amsculib2.o differ
diff --git a/build_linux64/objstore/amscumath.o b/build_linux64/objstore/amscumath.o
index b947051..375cf4c 100644
Binary files a/build_linux64/objstore/amscumath.o and b/build_linux64/objstore/amscumath.o differ
diff --git a/build_linux64/objstore/amscurarray.o b/build_linux64/objstore/amscurarray.o
index 6e2bd50..de7d765 100644
Binary files a/build_linux64/objstore/amscurarray.o and b/build_linux64/objstore/amscurarray.o differ
diff --git a/build_linux64/objstore/cuvect2.o b/build_linux64/objstore/cuvect2.o
index 1a20557..3d1a0cc 100644
Binary files a/build_linux64/objstore/cuvect2.o and b/build_linux64/objstore/cuvect2.o differ
diff --git a/build_linux64/objstore/cuvect2f.o b/build_linux64/objstore/cuvect2f.o
index 9d85181..bf2666f 100644
Binary files a/build_linux64/objstore/cuvect2f.o and b/build_linux64/objstore/cuvect2f.o differ
diff --git a/build_linux64/objstore/cuvect3.o b/build_linux64/objstore/cuvect3.o
index 629ecd0..4251c08 100644
Binary files a/build_linux64/objstore/cuvect3.o and b/build_linux64/objstore/cuvect3.o differ
diff --git a/build_linux64/objstore/cuvect3f.o b/build_linux64/objstore/cuvect3f.o
index dbf481c..0dfc3e2 100644
Binary files a/build_linux64/objstore/cuvect3f.o and b/build_linux64/objstore/cuvect3f.o differ
diff --git a/build_linux64/objstore/cuvect4.o b/build_linux64/objstore/cuvect4.o
index 038e6a4..d5e8af5 100644
Binary files a/build_linux64/objstore/cuvect4.o and b/build_linux64/objstore/cuvect4.o differ
diff --git a/build_linux64/objstore/cuvect4f.o b/build_linux64/objstore/cuvect4f.o
index 78c5350..3a789cd 100644
Binary files a/build_linux64/objstore/cuvect4f.o and b/build_linux64/objstore/cuvect4f.o differ
diff --git a/build_linux64/test b/build_linux64/test
index aa56a4f..7862fc4 100644
Binary files a/build_linux64/test and b/build_linux64/test differ
diff --git a/include/amsculib2/cuvect2f.hpp b/include/amsculib2/cuvect2f.hpp
index 237f736..a0ba6ce 100644
--- a/include/amsculib2/cuvect2f.hpp
+++ b/include/amsculib2/cuvect2f.hpp
@@ -13,16 +13,16 @@ namespace amscuda
 
         __host__ __device__ cuvect2f();
         __host__ __device__ ~cuvect2f();
-        __host__ __device__ cuvect2f(float _x, float _y);
+        __host__ __device__ cuvect2f(const float &_x, const float &_y);
 
-        __host__ __device__ float& operator[](const int I);
-        __host__ __device__ const float& operator[](const int I) const;
+        __host__ __device__ float& operator[](const int &I);
+        __host__ __device__ const float& operator[](const int &I) const;
 
-        __host__ __device__ cuvect2f operator+(cuvect2f lhs);
-        __host__ __device__ cuvect2f operator-(cuvect2f lhs);
-        __host__ __device__ cuvect2f operator*(float lhs);
-        __host__ __device__ cuvect2f operator/(float lhs);
-        __host__ __device__ friend cuvect2f operator-(cuvect2f rhs);
+        __host__ __device__ cuvect2f operator+(const cuvect2f &lhs);
+        __host__ __device__ cuvect2f operator-(const cuvect2f &lhs);
+        __host__ __device__ cuvect2f operator*(const float &lhs);
+        __host__ __device__ cuvect2f operator/(const float &lhs);
+        __host__ __device__ friend cuvect2f operator-(const cuvect2f &rhs);
     };
 
     class cumat2f
@@ -49,11 +49,11 @@ namespace amscuda
         __host__ __device__ cumat2f inverse();
     };
 
-    __host__ __device__ float cuvect2f_dot(cuvect2f a, cuvect2f b);
-    __host__ __device__ float cuvect2f_cross(cuvect2f a, cuvect2f b);
-    __host__ __device__ float cuvect2f_norm(cuvect2f a);
-    __host__ __device__ cuvect2f cuvect2f_normalize(cuvect2f a);
-    __host__ __device__ cuvect2f cuvect2f_proj(cuvect2f a, cuvect2f b);
+    __host__ __device__ float cuvect2f_dot(const cuvect2f &a, const cuvect2f &b);
+    __host__ __device__ float cuvect2f_cross(const cuvect2f &a, const cuvect2f &b);
+    __host__ __device__ float cuvect2f_norm(const cuvect2f &a);
+    __host__ __device__ cuvect2f cuvect2f_normalize(const cuvect2f &a);
+    __host__ __device__ cuvect2f cuvect2f_proj(const cuvect2f &a, const cuvect2f &b);
 
     //2x2 matrix operations
     //matrix order is assumed to be mat[I,J] = mat[I+3*J]
@@ -74,7 +74,7 @@ namespace amscuda
     __host__ __device__ void mat2f_mult(float *mat2a, float *mat2b, float *mat2c);
 
     // ret = a*b
-    __host__ __device__ cuvect2f mat2f_mult(float *mat2a, cuvect2f b);
+    __host__ __device__ cuvect2f mat2f_mult(float *mat2a, const cuvect2f &b);
 
 
     void test_cuvect2f_1();
diff --git a/include/amsculib2/cuvect3f.hpp b/include/amsculib2/cuvect3f.hpp
index bc66d68..0e06f45 100644
--- a/include/amsculib2/cuvect3f.hpp
+++ b/include/amsculib2/cuvect3f.hpp
@@ -13,49 +13,82 @@ namespace amscuda
 
         __host__ __device__ cuvect3f();
         __host__ __device__ ~cuvect3f();
-        __host__ __device__ cuvect3f(float _x, float _y, float _z);
+        __host__ __device__ cuvect3f(const float &_x, const float &_y, const float &_z);
         
         
-        __host__ __device__ float& operator[](const int I);
-        __host__ __device__ const float& operator[](const int I) const;
+        __host__ __device__ float& operator[](const int &I);
+        __host__ __device__ const float& operator[](const int &I) const;
+
+        __host__ __device__ cuvect3f operator+(const cuvect3f &rhs);
+        __host__ __device__ cuvect3f operator-(const cuvect3f &rhs);
+        __host__ __device__ cuvect3f operator*(const float &rhs);
+        __host__ __device__ cuvect3f operator/(const float &rhs);
+        __host__ __device__ friend cuvect3f operator-(const cuvect3f &rhs);
+
+        __host__ __device__ cuvect3f& operator+=(const cuvect3f &rhs);
+        __host__ __device__ cuvect3f& operator-=(const cuvect3f &rhs);
+        __host__ __device__ cuvect3f& operator/=(const float &rhs);
+        __host__ __device__ cuvect3f& operator*=(const float &rhs);
 
-        __host__ __device__ cuvect3f operator+(cuvect3f lhs);
-        __host__ __device__ cuvect3f operator-(cuvect3f lhs);
-        __host__ __device__ cuvect3f operator*(float lhs);
-        __host__ __device__ cuvect3f operator/(float lhs);
-        __host__ __device__ friend cuvect3f operator-(cuvect3f rhs);
     };
 
     class cumat3f
     {
         public:
-        float dat[9];
+        float m00,m10,m20; //named references to force register use?
+        float m01,m11,m21; //switched to column-major-order to match GLSL/lapack
+        float m02,m12,m22;
 
         __host__ __device__ cumat3f();
         __host__ __device__ ~cumat3f();
-        __host__ __device__ float& operator[](const int I);
-        __host__ __device__ float& operator()(const int I, const int J);
-        __host__ __device__ float& at(const int I, const int J);
+        __host__ __device__ cumat3f(
+            const float & _m00, const float & _m01, const float & _m02,
+            const float & _m10, const float & _m11, const float & _m12,
+            const float & _m20, const float & _m21, const float & _m22
+        );
 
-        __host__ __device__ cumat3f operator+(cumat3f lhs);
-        __host__ __device__ cumat3f operator-(cumat3f lhs);
-        __host__ __device__ cumat3f operator*(float lhs);
-        __host__ __device__ cumat3f operator/(float lhs);
-        __host__ __device__ cuvect3f operator*(cuvect3f lhs);
-        __host__ __device__ cumat3f operator*(cumat3f lhs);
-        __host__ __device__ friend cumat3f operator-(cumat3f rhs);
+        __host__ __device__ float& operator[](const int &I);
+        __host__ __device__ float& operator()(const int &I, const int &J);
+        __host__ __device__ float& at(const int &I, const int &J);
+
+        __host__ __device__ const float& operator[](const int &I) const;
+        __host__ __device__ const float& operator()(const int &I, const int &J) const;
+        __host__ __device__ const float& at(const int &I, const int &J) const;
+
+        __host__ __device__ cumat3f operator+(const cumat3f &rhs);
+        __host__ __device__ cumat3f operator-(const cumat3f &rhs);
+        __host__ __device__ cumat3f operator*(const float &rhs);
+        __host__ __device__ cumat3f operator/(const float &rhs);
+        __host__ __device__ cuvect3f operator*(const cuvect3f &rhs);
+        __host__ __device__ cumat3f operator*(const cumat3f &rhs);
+        __host__ __device__ friend cumat3f operator-(const cumat3f &rhs);
         
         __host__ __device__ float det();
         __host__ __device__ cumat3f transpose();
         __host__ __device__ cumat3f inverse();
+
+        __host__ __device__ float* data(); //pointer to float[9] representation of matrix
+        __host__ __device__ const float* data() const; //pointer to float[9] representation of matrix
+
+        //In place operations (to save GPU register use)
+        __host__ __device__ cumat3f& operator+=(const cumat3f &rhs);
+        __host__ __device__ cumat3f& operator-=(const cumat3f &rhs);
+        __host__ __device__ cumat3f& operator/=(const float &rhs);
+        __host__ __device__ cumat3f& operator*=(const float &rhs);
+        __host__ __device__ cumat3f& operator*=(const cumat3f &rhs);
         
     };
 
-    __host__ __device__ float cuvect3f_dot(cuvect3f a, cuvect3f b);
-    __host__ __device__ cuvect3f cuvect3f_cross(cuvect3f a, cuvect3f b);
-    __host__ __device__ float cuvect3f_norm(cuvect3f a);
-    __host__ __device__ cuvect3f cuvect3f_normalize(cuvect3f a);
-    __host__ __device__ cuvect3f cuvect3f_proj(cuvect3f a, cuvect3f b);
+    __host__ __device__ float cuvect3f_dot(const cuvect3f &a,const cuvect3f &b);
+    __host__ __device__ cuvect3f cuvect3f_cross(const cuvect3f &a, const cuvect3f &b);
+    __host__ __device__ float cuvect3f_norm(const cuvect3f &a);
+    __host__ __device__ cuvect3f cuvect3f_normalize(const cuvect3f &a);
+    __host__ __device__ cuvect3f cuvect3f_proj(const cuvect3f &a, const cuvect3f &b);
+
+    __host__ __device__ cumat3f hodge_dual(const cuvect3f &vin);
+    __host__ __device__ cuvect3f hodge_dual(const cumat3f &min);
+    __host__ __device__ cumat3f rotmat_from_axisangle(const cuvect3f &axis, const float &angle);
+
 
     //3x3 matrix operations
     //matrix order is assumed to be mat[I,J] = mat[I+3*J]
@@ -72,11 +105,11 @@ namespace amscuda
     //inverts a 3x3 (9 element) matrix
     __host__ __device__ void mat3f_inverse(float *mat3inout);
 
-    __host__ __device__ cuvect3f mat3f_mult(float *mat3in, cuvect3f cvin);
+    __host__ __device__ cuvect3f mat3f_mult(float *mat3in, const cuvect3f &cvin);
     __host__ __device__ void mat3f_mult(float *matina, float *matinb, float *matout);
 
-    __host__ __device__ void mat3f_hodgedual(cuvect3f vecin, float *matout);
-    __host__ __device__ void mat3f_hodgedual(float *matin, cuvect3f vecout);
+    __host__ __device__ void mat3f_hodgedual(const cuvect3f &vecin, float *matout);
+    __host__ __device__ void mat3f_hodgedual(float *matin, cuvect3f &vecout);
 
     //returns direction cosine rotation matrix from axis and angle
     __host__ __device__ void mat3f_rot_from_axisangle(cuvect3f axis, float angle, float *matout);
diff --git a/src/amsculib2/cuvect2f.cu b/src/amsculib2/cuvect2f.cu
index 42c63db..9b53d79 100644
--- a/src/amsculib2/cuvect2f.cu
+++ b/src/amsculib2/cuvect2f.cu
@@ -15,27 +15,27 @@ namespace amscuda
         return;
     }
 
-    __host__ __device__ cuvect2f::cuvect2f(float _x, float _y)
+    __host__ __device__ cuvect2f::cuvect2f(const float &_x, const float &_y)
     {
         x = _x; y = _y;
         return;
     }
 
-    __host__ __device__ float& cuvect2f::operator[](const int I)
+    __host__ __device__ float& cuvect2f::operator[](const int &I)
     {
         if(I==0) return x;
         if(I==1) return y;
         return x;
     }
 
-    __host__ __device__ const float& cuvect2f::operator[](const int I) const
+    __host__ __device__ const float& cuvect2f::operator[](const int &I) const
     {
         if(I==0) return x;
         if(I==1) return y;
         return x;
     }
 
-    __host__ __device__ cuvect2f cuvect2f::operator+(cuvect2f lhs)
+    __host__ __device__ cuvect2f cuvect2f::operator+(const cuvect2f &lhs)
     {
         cuvect2f ret;
         ret.x = x + lhs.x;
@@ -43,14 +43,14 @@ namespace amscuda
         return ret;
     }
 
-    __host__ __device__ cuvect2f cuvect2f::operator-(cuvect2f lhs)
+    __host__ __device__ cuvect2f cuvect2f::operator-(const cuvect2f &lhs)
     {
         cuvect2f ret;
         ret.x = x - lhs.x;
         ret.y = y - lhs.y;
         return ret;
     }
-    __host__ __device__ cuvect2f cuvect2f::operator*(float lhs)
+    __host__ __device__ cuvect2f cuvect2f::operator*(const float &lhs)
     {
         cuvect2f ret;
         ret.x = x*lhs;
@@ -58,7 +58,7 @@ namespace amscuda
         return ret;
     }
 
-    __host__ __device__ cuvect2f cuvect2f::operator/(float lhs)
+    __host__ __device__ cuvect2f cuvect2f::operator/(const float &lhs)
     {
         cuvect2f ret;
         ret.x = x/lhs;
@@ -66,25 +66,25 @@ namespace amscuda
         return ret;
     }
 
-    __host__ __device__ float cuvect2f_dot(cuvect2f a, cuvect2f b)
+    __host__ __device__ float cuvect2f_dot(const cuvect2f &a, const cuvect2f &b)
     {
         float ret = a.x*b.x+a.y*b.y;
         return ret;
     }
 
-    __host__ __device__ float cuvect2f_cross(cuvect2f a, cuvect2f b)
+    __host__ __device__ float cuvect2f_cross(const cuvect2f &a, const cuvect2f &b)
     {
         float ret = a.x*b.y-a.y*b.x;
         return ret;
     }
 
-    __host__ __device__ float cuvect2f_norm(cuvect2f a)
+    __host__ __device__ float cuvect2f_norm(const cuvect2f &a)
     {
         float ret = ::sqrtf(a.x*a.x+a.y*a.y);
         return ret;
     }
 
-    __host__ __device__ cuvect2f cuvect2f_normalize(cuvect2f a)
+    __host__ __device__ cuvect2f cuvect2f_normalize(const cuvect2f &a)
     {
         cuvect2f ret;
         float m = cuvect2f_norm(a);
@@ -99,7 +99,7 @@ namespace amscuda
         return ret;
     }
 
-    __host__ __device__ cuvect2f cuvect2f_proj(cuvect2f a, cuvect2f b)
+    __host__ __device__ cuvect2f cuvect2f_proj(const cuvect2f &a, const cuvect2f &b)
     {
         cuvect2f ret;
         cuvect2f bn = cuvect2f_normalize(b);
@@ -343,7 +343,7 @@ __host__ __device__ void mat2f_mult(float *mat2a, float *mat2b, float *mat2c)
 }
 
 // ret = a*b
-__host__ __device__ cuvect2f mat2f_mult(float *mat2a, cuvect2f b)
+__host__ __device__ cuvect2f mat2f_mult(float *mat2a, const cuvect2f &b)
 {
     cuvect2f ret;
     ret.x = b.x*mat2a[0+0*2] + b.y*mat2a[1+0*2];
diff --git a/src/amsculib2/cuvect3f.cu b/src/amsculib2/cuvect3f.cu
index 8be5f3d..309d6bb 100644
--- a/src/amsculib2/cuvect3f.cu
+++ b/src/amsculib2/cuvect3f.cu
@@ -15,7 +15,7 @@ namespace amscuda
         return;
     }
 
-    __host__ __device__ float& cuvect3f::operator[](const int I)
+    __host__ __device__ float& cuvect3f::operator[](const int &I)
     {
         if(I==0) return x;
         if(I==1) return y;
@@ -23,7 +23,7 @@ namespace amscuda
         return x;
     }
 
-    __host__ __device__ const float& cuvect3f::operator[](const int I) const
+    __host__ __device__ const float& cuvect3f::operator[](const int &I) const
     {
         if(I==0) return x;
         if(I==1) return y;
@@ -31,59 +31,92 @@ namespace amscuda
         return x;
     }
 
-    __host__ __device__ cuvect3f cuvect3f::operator+(cuvect3f lhs)
+    __host__ __device__ cuvect3f cuvect3f::operator+(const cuvect3f &rhs)
     {
         cuvect3f ret;
-        ret.x = x+lhs.x;
-        ret.y = y+lhs.y;
-        ret.z = z+lhs.z;
+        ret.x = x+rhs.x;
+        ret.y = y+rhs.y;
+        ret.z = z+rhs.z;
 
         return ret;
     }
 
-    __host__ __device__ cuvect3f cuvect3f::operator-(cuvect3f lhs)
+    __host__ __device__ cuvect3f cuvect3f::operator-(const cuvect3f &rhs)
     {
         cuvect3f ret;
-        ret.x = x-lhs.x;
-        ret.y = y-lhs.y;
-        ret.z = z-lhs.z;
+        ret.x = x-rhs.x;
+        ret.y = y-rhs.y;
+        ret.z = z-rhs.z;
 
         return ret;
     }
 
-    __host__ __device__ cuvect3f cuvect3f::operator*(float lhs)
+    __host__ __device__ cuvect3f cuvect3f::operator*(const float &rhs)
     {
         cuvect3f ret;
-        ret.x = x*lhs;
-        ret.y = y*lhs;
-        ret.z = z*lhs;
+        ret.x = x*rhs;
+        ret.y = y*rhs;
+        ret.z = z*rhs;
         return ret;
     }
 
-    __host__ __device__ cuvect3f cuvect3f::operator/(float lhs)
+    __host__ __device__ cuvect3f cuvect3f::operator/(const float &rhs)
     {
         cuvect3f ret;
-        ret.x = x/lhs;
-        ret.y = y/lhs;
-        ret.z = z/lhs;
+        ret.x = x/rhs;
+        ret.y = y/rhs;
+        ret.z = z/rhs;
         return ret;
     }
 
-    __host__ __device__ cuvect3f::cuvect3f(float _x, float _y, float _z)
+    __host__ __device__ cuvect3f& cuvect3f::operator+=(const cuvect3f &rhs)
+    {
+        x = x + rhs.x;
+        y = y + rhs.y;
+        z = z + rhs.z;
+        return *this;
+    }
+
+    __host__ __device__ cuvect3f& cuvect3f::operator-=(const cuvect3f &rhs)
+    {
+        x = x - rhs.x;
+        y = y - rhs.y;
+        z = z - rhs.z;
+        return *this;
+    }
+
+    __host__ __device__ cuvect3f& cuvect3f::operator*=(const float &rhs)
+    {
+        x = x * rhs;
+        y = y * rhs;
+        z = z * rhs;
+        return *this;
+    }
+
+    __host__ __device__ cuvect3f& cuvect3f::operator/=(const float &rhs)
+    {
+        x = x / rhs;
+        y = y / rhs;
+        z = z / rhs;
+        return *this;
+    }
+
+
+    __host__ __device__ cuvect3f::cuvect3f(const float &_x, const float &_y, const float &_z)
     {
         x = _x; y = _y; z = _z;
         return;
     }
 
 
-    __host__ __device__ float cuvect3f_dot(cuvect3f a, cuvect3f b)
+    __host__ __device__ float cuvect3f_dot(const cuvect3f &a, const cuvect3f &b)
     {
         float ret = a.x*b.x+a.y*b.y+a.z*b.z;
 
         return ret;
     }
 
-    __host__ __device__ cuvect3f cuvect3f_cross(cuvect3f a, cuvect3f b)
+    __host__ __device__ cuvect3f cuvect3f_cross(const cuvect3f &a, const cuvect3f &b)
     {
         cuvect3f ret;
         ret[0] = a[1]*b[2]-a[2]*b[1];
@@ -93,14 +126,14 @@ namespace amscuda
         return ret;
     }
 
-    __host__ __device__ float cuvect3f_norm(cuvect3f a)
+    __host__ __device__ float cuvect3f_norm(const cuvect3f &a)
     {
         float ret;
         ret = ::sqrtf(a.x*a.x+a.y*a.y+a.z*a.z);
         return ret;
     }
 
-    __host__ __device__ cuvect3f cuvect3f_normalize(cuvect3f a)
+    __host__ __device__ cuvect3f cuvect3f_normalize(const cuvect3f &a)
     {
         cuvect3f ret;
         float m;
@@ -117,7 +150,7 @@ namespace amscuda
         return ret;
     }
 
-    __host__ __device__ cuvect3f cuvect3f_proj(cuvect3f a, cuvect3f b)
+    __host__ __device__ cuvect3f cuvect3f_proj(const cuvect3f &a, const cuvect3f &b)
     {
         cuvect3f ret;
         cuvect3f bn = cuvect3f_normalize(b);
@@ -126,6 +159,420 @@ namespace amscuda
         return ret;
     }
 
+
+
+
+__host__ __device__ cumat3f::cumat3f()
+{
+    m00 = 0.0f;
+    m01 = 0.0f;
+    m02 = 0.0f;
+    m10 = 0.0f;
+    m11 = 0.0f;
+    m12 = 0.0f;
+    m20 = 0.0f;
+    m21 = 0.0f;
+    m22 = 0.0f;
+
+    return;
+}
+
+__host__ __device__ cumat3f::~cumat3f()
+{
+    m00 = 0.0f;
+    m01 = 0.0f;
+    m02 = 0.0f;
+    m10 = 0.0f;
+    m11 = 0.0f;
+    m12 = 0.0f;
+    m20 = 0.0f;
+    m21 = 0.0f;
+    m22 = 0.0f;
+    return;
+}
+
+__host__ __device__ float& cumat3f::operator[](const int &I)
+{
+    if(I==0) return m00;
+    if(I==1) return m10;
+    if(I==2) return m20;
+    if(I==3) return m01;
+    if(I==4) return m11;
+    if(I==5) return m21;
+    if(I==6) return m02;
+    if(I==7) return m12;
+    if(I==8) return m22;
+    
+    return m00;
+}
+
+__host__ __device__ float& cumat3f::operator()(const int &I, const int &J)
+{
+    if(I==0 && J==0) return m00;
+    if(I==1 && J==0) return m10;
+    if(I==2 && J==0) return m20;
+    if(I==0 && J==1) return m01;
+    if(I==1 && J==1) return m11;
+    if(I==2 && J==1) return m21;
+    if(I==0 && J==2) return m02;
+    if(I==1 && J==2) return m12;
+    if(I==2 && J==2) return m22;
+
+    return m00;
+}
+
+
+__host__ __device__ float& cumat3f::at(const int &I, const int &J)
+{
+    if(I==0 && J==0) return m00;
+    if(I==1 && J==0) return m10;
+    if(I==2 && J==0) return m20;
+    if(I==0 && J==1) return m01;
+    if(I==1 && J==1) return m11;
+    if(I==2 && J==1) return m21;
+    if(I==0 && J==2) return m02;
+    if(I==1 && J==2) return m12;
+    if(I==2 && J==2) return m22;
+
+    return m00;
+}
+
+__host__ __device__ const float& cumat3f::operator[](const int &I) const 
+{
+    if(I==0) return m00;
+    if(I==1) return m10;
+    if(I==2) return m20;
+    if(I==3) return m01;
+    if(I==4) return m11;
+    if(I==5) return m21;
+    if(I==6) return m02;
+    if(I==7) return m12;
+    if(I==8) return m22;
+    
+    return m00;
+}
+
+__host__ __device__ const float& cumat3f::operator()(const int &I, const int &J) const 
+{
+    if(I==0 && J==0) return m00;
+    if(I==1 && J==0) return m10;
+    if(I==2 && J==0) return m20;
+    if(I==0 && J==1) return m01;
+    if(I==1 && J==1) return m11;
+    if(I==2 && J==1) return m21;
+    if(I==0 && J==2) return m02;
+    if(I==1 && J==2) return m12;
+    if(I==2 && J==2) return m22;
+
+    return m00;
+}
+
+
+__host__ __device__ const float& cumat3f::at(const int &I, const int &J) const 
+{
+    if(I==0 && J==0) return m00;
+    if(I==1 && J==0) return m10;
+    if(I==2 && J==0) return m20;
+    if(I==0 && J==1) return m01;
+    if(I==1 && J==1) return m11;
+    if(I==2 && J==1) return m21;
+    if(I==0 && J==2) return m02;
+    if(I==1 && J==2) return m12;
+    if(I==2 && J==2) return m22;
+
+    return m00;
+}
+
+
+__host__ __device__ cumat3f cumat3f::operator+(const cumat3f &rhs)
+{
+    cumat3f ret;
+    ret.m00 = m00 + rhs.m00;
+    ret.m10 = m10 + rhs.m10;
+    ret.m20 = m20 + rhs.m20;
+    ret.m01 = m01 + rhs.m01;
+    ret.m11 = m11 + rhs.m11;
+    ret.m21 = m21 + rhs.m21;
+    ret.m02 = m02 + rhs.m02;
+    ret.m12 = m12 + rhs.m12;
+    ret.m22 = m22 + rhs.m22;
+    
+
+    return ret;
+}
+
+__host__ __device__ cumat3f cumat3f::operator-(const cumat3f &rhs)
+{
+    cumat3f ret;
+    ret.m00 = m00 - rhs.m00;
+    ret.m10 = m10 - rhs.m10;
+    ret.m20 = m20 - rhs.m20;
+    ret.m01 = m01 - rhs.m01;
+    ret.m11 = m11 - rhs.m11;
+    ret.m21 = m21 - rhs.m21;
+    ret.m02 = m02 - rhs.m02;
+    ret.m12 = m12 - rhs.m12;
+    ret.m22 = m22 - rhs.m22;
+    return ret;
+}
+
+__host__ __device__ cumat3f cumat3f::operator*(const float &rhs)
+{
+    cumat3f ret;
+    ret.m00 = m00 * rhs;
+    ret.m10 = m10 * rhs;
+    ret.m20 = m20 * rhs;
+    ret.m01 = m01 * rhs;
+    ret.m11 = m11 * rhs;
+    ret.m21 = m21 * rhs;
+    ret.m02 = m02 * rhs;
+    ret.m12 = m12 * rhs;
+    ret.m22 = m22 * rhs;
+    return ret;
+}
+
+__host__ __device__ cumat3f cumat3f::operator/(const float &rhs)
+{
+    cumat3f ret;
+    ret.m00 = m00 / rhs;
+    ret.m10 = m10 / rhs;
+    ret.m20 = m20 / rhs;
+    ret.m01 = m01 / rhs;
+    ret.m11 = m11 / rhs;
+    ret.m21 = m21 / rhs;
+    ret.m02 = m02 / rhs;
+    ret.m12 = m12 / rhs;
+    ret.m22 = m22 / rhs;
+    return ret;
+}
+        
+__host__ __device__ cuvect3f cumat3f::operator*(const cuvect3f &rhs)
+{
+    cuvect3f ret;
+    
+    ret.x = m00*rhs.x + m01*rhs.y + m02*rhs.z;
+    ret.y = m10*rhs.x + m11*rhs.y + m12*rhs.z;
+    ret.z = m20*rhs.x + m21*rhs.y + m22*rhs.z;
+
+    return ret;
+}
+
+__host__ __device__ cumat3f cumat3f::operator*(const cumat3f &rhs)
+{
+    cumat3f ret;
+
+    ret.m00 = m00*rhs.m00 + m01*rhs.m10 + m02*rhs.m20;
+    ret.m01 = m00*rhs.m01 + m01*rhs.m11 + m02*rhs.m21;
+    ret.m02 = m00*rhs.m02 + m01*rhs.m12 + m02*rhs.m22;
+    ret.m10 = m10*rhs.m00 + m11*rhs.m10 + m12*rhs.m20;
+    ret.m11 = m10*rhs.m01 + m11*rhs.m11 + m12*rhs.m21;
+    ret.m12 = m10*rhs.m02 + m11*rhs.m12 + m12*rhs.m22;
+    ret.m20 = m20*rhs.m00 + m21*rhs.m10 + m22*rhs.m20;
+    ret.m21 = m20*rhs.m01 + m21*rhs.m11 + m22*rhs.m21;
+    ret.m22 = m20*rhs.m02 + m21*rhs.m12 + m22*rhs.m22;
+
+    return ret;
+}
+
+__host__ __device__ float cumat3f::det()
+{
+    float ret = 0.0;
+
+    ret += m00*m11*m22;
+    ret += m01*m12*m20;
+    ret += m02*m10*m21;
+    ret -= m20*m11*m02;
+    ret -= m21*m12*m00;
+    ret -= m22*m10*m01;
+
+    return ret;
+}
+
+__host__ __device__ cumat3f cumat3f::transpose()
+{
+    cumat3f ret;
+
+    ret.m00 = m00;
+    ret.m01 = m10;
+    ret.m02 = m20;
+    ret.m10 = m01;
+    ret.m11 = m11;
+    ret.m12 = m21;
+    ret.m20 = m02;
+    ret.m21 = m12;
+    ret.m22 = m22;
+
+    return ret;
+}
+
+__host__ __device__ cumat3f cumat3f::inverse()
+{
+    cumat3f q;
+    float dt = det();
+    if(dt!=0.0)
+    {
+        q(0,0) =  (at(1,1)*at(2,2)-at(1,2)*at(2,1))/dt;
+        q(1,0) = -(at(1,0)*at(2,2)-at(1,2)*at(2,0))/dt;
+        q(2,0) =  (at(1,0)*at(2,1)-at(1,1)*at(2,0))/dt;
+        q(0,1) = -(at(0,1)*at(2,2)-at(0,2)*at(2,1))/dt;
+        q(1,1) =  (at(0,0)*at(2,2)-at(0,2)*at(2,0))/dt;
+        q(2,1) = -(at(0,0)*at(2,1)-at(0,1)*at(2,0))/dt;
+        q(0,2) =  (at(0,1)*at(1,2)-at(0,2)*at(1,1))/dt;
+        q(1,2) = -(at(0,0)*at(1,2)-at(0,2)*at(1,0))/dt;
+        q(2,2) =  (at(0,0)*at(1,1)-at(0,1)*at(1,0))/dt;
+        // q(0,0) =  (at(1,1)*at(2,2)-at(1,2)*at(2,1))/dt;
+        // q(0,1) = -(at(1,0)*at(2,2)-at(1,2)*at(2,0))/dt;
+        // q(0,2) =  (at(1,0)*at(2,1)-at(1,1)*at(2,0))/dt;
+        // q(1,0) = -(at(0,1)*at(2,2)-at(0,2)*at(2,1))/dt;
+        // q(1,1) =  (at(0,0)*at(2,2)-at(0,2)*at(2,0))/dt;
+        // q(1,2) = -(at(0,0)*at(2,1)-at(0,1)*at(2,0))/dt;
+        // q(2,0) =  (at(0,1)*at(1,2)-at(0,2)*at(1,1))/dt;
+        // q(2,1) = -(at(0,0)*at(1,2)-at(0,2)*at(1,0))/dt;
+        // q(2,2) =  (at(0,0)*at(1,1)-at(0,1)*at(1,0))/dt;
+        // q = q.transpose();
+    }
+    else
+    {
+        q(0,0) = inf;
+        q(0,1) = inf;
+        q(0,2) = inf;
+        q(1,0) = inf;
+        q(1,1) = inf;
+        q(1,2) = inf;
+        q(2,0) = inf;
+        q(2,1) = inf;
+        q(2,2) = inf;
+    }
+
+    return q;
+}
+
+__host__ __device__ cumat3f operator-(const cumat3f &rhs)
+{
+    cumat3f ret;
+    ret.m00 = -rhs.m00;
+    ret.m10 = -rhs.m10;
+    ret.m20 = -rhs.m20;
+    ret.m01 = -rhs.m01;
+    ret.m11 = -rhs.m11;
+    ret.m21 = -rhs.m21;
+    ret.m02 = -rhs.m02;
+    ret.m12 = -rhs.m12;
+    ret.m22 = -rhs.m22;
+
+    return ret;
+}
+
+__host__ __device__ cumat3f& cumat3f::operator+=(const cumat3f &rhs)
+{
+    m00 += rhs.m00;
+    m10 += rhs.m10;
+    m20 += rhs.m20;
+    m01 += rhs.m01;
+    m11 += rhs.m11;
+    m21 += rhs.m21;
+    m02 += rhs.m02;
+    m12 += rhs.m12;
+    m22 += rhs.m22;
+
+    return *this;
+}
+
+__host__ __device__ cumat3f& cumat3f::operator-=(const cumat3f &rhs)
+{
+    m00 -= rhs.m00;
+    m10 -= rhs.m10;
+    m20 -= rhs.m20;
+    m01 -= rhs.m01;
+    m11 -= rhs.m11;
+    m21 -= rhs.m21;
+    m02 -= rhs.m02;
+    m12 -= rhs.m12;
+    m22 -= rhs.m22;
+
+    return *this;
+}
+
+
+
+__host__ __device__ cumat3f& cumat3f::operator/=(const float &rhs)
+{
+    m00 /= rhs;
+    m10 /= rhs;
+    m20 /= rhs;
+    m01 /= rhs;
+    m11 /= rhs;
+    m21 /= rhs;
+    m02 /= rhs;
+    m12 /= rhs;
+    m22 /= rhs;
+
+    return *this;
+}
+
+__host__ __device__ cumat3f& cumat3f::operator*=(const float &rhs)
+{
+    m00 *= rhs;
+    m10 *= rhs;
+    m20 *= rhs;
+    m01 *= rhs;
+    m11 *= rhs;
+    m21 *= rhs;
+    m02 *= rhs;
+    m12 *= rhs;
+    m22 *= rhs;
+
+    return *this;
+}
+
+__host__ __device__ cumat3f& cumat3f::operator*=(const cumat3f &rhs)
+{
+    cumat3f tmp;
+
+    tmp.m00 = m00*rhs.m00 + m01*rhs.m10 + m02*rhs.m20;
+    tmp.m01 = m00*rhs.m01 + m01*rhs.m11 + m02*rhs.m21;
+    tmp.m02 = m00*rhs.m02 + m01*rhs.m12 + m02*rhs.m22;
+    tmp.m10 = m10*rhs.m00 + m11*rhs.m10 + m12*rhs.m20;
+    tmp.m11 = m10*rhs.m01 + m11*rhs.m11 + m12*rhs.m21;
+    tmp.m12 = m10*rhs.m02 + m11*rhs.m12 + m12*rhs.m22;
+    tmp.m20 = m20*rhs.m00 + m21*rhs.m10 + m22*rhs.m20;
+    tmp.m21 = m20*rhs.m01 + m21*rhs.m11 + m22*rhs.m21;
+    tmp.m22 = m20*rhs.m02 + m21*rhs.m12 + m22*rhs.m22;
+
+    (*this) = tmp;
+
+    return *this;
+}
+
+__host__ __device__ cumat3f::cumat3f(
+    const float & _m00, const float & _m01, const float & _m02,
+    const float & _m10, const float & _m11, const float & _m12,
+    const float & _m20, const float & _m21, const float & _m22
+)
+{
+    m00 = _m00;
+    m01 = _m01;
+    m02 = _m02;
+    m10 = _m10;
+    m11 = _m11;
+    m12 = _m12;
+    m20 = _m20;
+    m21 = _m21;
+    m22 = _m22;
+}
+
+__host__ __device__ float* cumat3f::data()
+{
+     //pointer to float[9] representation of matrix
+    return (float*) this;
+}
+
+__host__ __device__ const float* cumat3f::data() const
+{
+     //pointer to float[9] representation of matrix
+    return (const float*) this;
+}
+
 //transposes a 3x3 (9 element) matrix
 __host__ __device__ void mat3f_transpose(float *mat3inout)
 {
@@ -202,7 +649,7 @@ __host__ __device__ void mat3f_inverse(float *mat3inout)
     return;
 }
 
-__host__ __device__ cuvect3f mat3f_mult(float *mat3in, cuvect3f cvin)
+__host__ __device__ cuvect3f mat3f_mult(float *mat3in, const cuvect3f &cvin)
 {
     int I,J;
     cuvect3f ret;
@@ -253,7 +700,7 @@ __host__ __device__ void mat3f_mult(float *matina, float *matinb, float *matout)
     return;
 }
 
-__host__ __device__ void mat3f_hodgedual(cuvect3f vecin, float *matout)
+__host__ __device__ void mat3f_hodgedual(const cuvect3f &vecin, float *matout)
 {
     matout[0 + 0*3] = 0.0f;
     matout[1 + 0*3] = -vecin[2];
@@ -269,7 +716,7 @@ __host__ __device__ void mat3f_hodgedual(cuvect3f vecin, float *matout)
     return;
 }
 
-__host__ __device__ void mat3f_hodgedual(float *matin, cuvect3f vecout)
+__host__ __device__ void mat3f_hodgedual(float *matin, cuvect3f &vecout)
 {
     vecout[0] = 0.5*(matin[1 + 2*3] - matin[2 + 1*3]);
     vecout[1] = 0.5*(matin[2 + 0*3] - matin[0 + 2*3]);
@@ -400,199 +847,69 @@ __host__ void test_cudavectf_logic1()
     return;
 }
 
-
-
-__host__ __device__ cumat3f::cumat3f()
-{
-    int I;
-    for(I=0;I<9;I++)
-    {
-        dat[I] = 0.0;
-    }
-    return;
-}
-
-__host__ __device__ cumat3f::~cumat3f()
-{
-    int I;
-    for(I=0;I<9;I++)
-    {
-        dat[I] = 0.0;
-    }
-    return;
-}
-
-__host__ __device__ float& cumat3f::operator[](const int I)
-{
-    return dat[I];
-}
-
-__host__ __device__ float& cumat3f::operator()(const int I, const int J)
-{
-    return dat[I+3*J];
-}
-
-__host__ __device__ cumat3f cumat3f::operator+(cumat3f lhs)
-{
-    int I;
-    cumat3f ret;
-    for(I=0;I<9;I++)
-    {
-        ret.dat[I] = this->dat[I] + lhs.dat[I];
-    }
-    return ret;
-}
-
-__host__ __device__ cumat3f cumat3f::operator-(cumat3f lhs)
-{
-    int I;
-    cumat3f ret;
-    for(I=0;I<9;I++)
-    {
-        ret.dat[I] = this->dat[I] - lhs.dat[I];
-    }
-    return ret;
-}
-
-__host__ __device__ cumat3f cumat3f::operator*(float lhs)
+__host__ __device__ cumat3f hodge_dual(const cuvect3f &vin)
 {
     cumat3f ret;
-    int I;
-    for(I=0;I<9;I++)
-    {
-        ret.dat[I] = this->dat[I]*lhs;
-    }
-    return ret;
-}
 
-__host__ __device__ cumat3f cumat3f::operator/(float lhs)
-{
-    cumat3f ret;
-    int I;
-    for(I=0;I<9;I++)
-    {
-        ret.dat[I] = this->dat[I]/lhs;
-    }
-    return ret;
-}
+    ret.m00 = 0.0f;
+    ret.m11 = 0.0f;
+    ret.m22 = 0.0f;
 
-__host__ __device__ float& cumat3f::at(const int I, const int J)
-{
-    return dat[I+3*J];
-}
-        
+    ret.m01 = vin.z;
+    ret.m12 = vin.x;
+    ret.m20 = vin.y;
 
-__host__ __device__ cuvect3f cumat3f::operator*(cuvect3f lhs)
-{
-    cuvect3f ret = cuvect3f(0.0,0.0,0.0);
-    int I,J;
-    for(I=0;I<3;I++)
-    {
-        for(J=0;J<3;J++)
-        {
-            ret[I] = ret[I] + this->at(I,J)*lhs[J];
-        }
-    }
-    return ret;
-}
-
-__host__ __device__ cumat3f cumat3f::operator*(cumat3f lhs)
-{
-    cumat3f ret;
-    int I,J,K;
-
-    for(I=0;I<3;I++)
-    {
-        for(J=0;J<3;J++)
-        {
-            ret(I,J) = 0.0;
-            for(K=0;K<3;K++)
-            {
-                ret(I,J) = ret(I,J) + this->at(I,K)*lhs(K,J);
-            }
-        }
-    }
+    ret.m10 = -vin.z;
+    ret.m21 = -vin.x;
+    ret.m02 = -vin.y;
 
     return ret;
 }
 
-__host__ __device__ float cumat3f::det()
-{
-    float ret = 0.0;
-
-    ret = ret + this->at(0,0)*this->at(1,1)*this->at(2,2);
-    ret = ret + this->at(0,1)*this->at(1,2)*this->at(2,0);
-    ret = ret + this->at(0,2)*this->at(1,0)*this->at(2,1);
-    ret = ret - this->at(0,0)*this->at(1,2)*this->at(2,1);
-    ret = ret - this->at(0,1)*this->at(1,0)*this->at(2,2);
-    ret = ret - this->at(0,2)*this->at(1,1)*this->at(2,0);
-
-    return ret;
-}
-
-__host__ __device__ cumat3f cumat3f::transpose()
-{
-    cumat3f q;
-    int I,J;
-    for(I=0;I<3;I++)
-    {
-        for(J=0;J<3;J++)
-        {
-            q.at(I,J) = this->at(J,I);
-        }
-    }
-    return q;
-}
-
-__host__ __device__ cumat3f cumat3f::inverse()
-{
-    cumat3f q;
-    float dt = q.det();
-    if(dt!=0.0)
-    {
-        q(0,0) =  (this->at(1,1)*this->at(2,2)-this->at(1,2)*this->at(2,1))/dt;
-        q(0,1) = -(this->at(1,0)*this->at(2,2)-this->at(1,2)*this->at(2,0))/dt;
-        q(0,2) =  (this->at(1,0)*this->at(2,1)-this->at(1,1)*this->at(2,0))/dt;
-        q(1,0) = -(this->at(0,1)*this->at(2,2)-this->at(0,2)*this->at(2,1))/dt;
-        q(1,1) =  (this->at(0,0)*this->at(2,2)-this->at(0,2)*this->at(2,0))/dt;
-        q(1,2) = -(this->at(0,0)*this->at(2,1)-this->at(0,1)*this->at(2,0))/dt;
-        q(2,0) =  (this->at(0,1)*this->at(1,2)-this->at(0,2)*this->at(1,1))/dt;
-        q(2,1) = -(this->at(0,0)*this->at(1,2)-this->at(0,2)*this->at(1,0))/dt;
-        q(2,2) =  (this->at(0,0)*this->at(1,1)-this->at(0,1)*this->at(1,0))/dt;
-
-        q = q.transpose();
-    }
-    else
-    {
-        q(0,0) = inf;
-        q(0,1) = inf;
-        q(0,2) = inf;
-        q(1,0) = inf;
-        q(1,1) = inf;
-        q(1,2) = inf;
-        q(2,0) = inf;
-        q(2,1) = inf;
-        q(2,2) = inf;
-    }
-
-    return q;
-}
-
-__host__ __device__ cuvect3f operator-(cuvect3f rhs)
+__host__ __device__ cuvect3f hodge_dual(const cumat3f &min)
 {
     cuvect3f ret;
-    ret[0] = -rhs[0];
-    ret[1] = -rhs[1];
-    ret[2] = -rhs[2];
+
+    ret.x = 0.5f*(min.m12 - min.m21);
+    ret.y = 0.5f*(min.m20 - min.m02);
+    ret.z = 0.5f*(min.m01 - min.m10);
+
     return ret;
 }
 
-__host__ __device__ cumat3f operator-(cumat3f rhs)
+__host__ __device__ const cumat3f cumat3f_eye()
 {
     cumat3f ret;
-    int I;
-    for(I=0;I<9;I++) ret[I] = -rhs[I];
+    ret.m00 = 1.0f;
+    ret.m11 = 1.0f;
+    ret.m22 = 1.0f;
+    
     return ret;
 }
 
+__host__ __device__ const cumat3f cumat3f_zeros()
+{
+    cumat3f ret;
+    return ret;
+}
+
+__host__ __device__ cumat3f rotmat_from_axisangle(const cuvect3f &axis, const float &angle)
+{
+    cumat3f ret = cumat3f_zeros();
+    cumat3f H;
+    cuvect3f _naxis;
+
+    _naxis = cuvect3f_normalize(axis);
+    H = hodge_dual(_naxis);
+
+    ret += H*sinf(angle);
+    H = H*H;
+    ret -= H*cosf(angle);
+    ret += (H + cumat3f_eye());
+
+    return ret;
+}
+
+
+
 };
\ No newline at end of file