Making some of my library code public.

2025-02-04 21:59:29 -05:00
commit 64a7bfc851
85 changed files with 9729 additions and 0 deletions
--- a/bin_linux64/libamsculib2.linux64.a
+++ b/bin_linux64/libamsculib2.linux64.a
--- a/bin_linux64/test
+++ b/bin_linux64/test
--- a/bin_winx64/libamsculib2.msvc64.lib
+++ b/bin_winx64/libamsculib2.msvc64.lib
--- a/bin_winx64/test.exe
+++ b/bin_winx64/test.exe
--- a/bin_winx64/test.exp
+++ b/bin_winx64/test.exp
--- a/bin_winx64/test.lib
+++ b/bin_winx64/test.lib
--- a/compscripts/pycache/complib2.cpython-310.pyc
+++ b/compscripts/pycache/complib2.cpython-310.pyc
--- a/compscripts/pycache/complib2.cpython-36.pyc
+++ b/compscripts/pycache/complib2.cpython-36.pyc
--- a/compscripts/pycache/complib2.cpython-38.pyc
+++ b/compscripts/pycache/complib2.cpython-38.pyc
--- a/compscripts/pycache/complib2.cpython-39.pyc
+++ b/compscripts/pycache/complib2.cpython-39.pyc
--- a/compscripts/pycache/complib3.cpython-310.pyc
+++ b/compscripts/pycache/complib3.cpython-310.pyc
--- a/compscripts/pycache/complib3.cpython-39.pyc
+++ b/compscripts/pycache/complib3.cpython-39.pyc
--- a/compscripts/complib2.py
+++ b/compscripts/complib2.py
@ -0,0 +1,639 @@
+#!/usr/bin/python3
+
+#Python3 compilation library
+#Aaron M. Schinder
+#29 Dec 2020
+#
+#Cleanup and refactor from 2017 python2 version compilation libraries
+
+import os,sys,math,subprocess
+
+#####################
+#Directory Functions#
+#####################
+
+##flist - list all files in a given directory pth
+##optional arguments:
+#   recurse - (T/F): Whether to recursively search for files in directory tree
+#   exts - (list): A list of file extensions to filter on
+#   normpath (T/F): whether to normalize path variables after
+#filelist = flist(pth,**kwargs): 
+def flist(pth,**kwargs): 
+    flst = []
+    if(not('recurse' in kwargs)):
+        recurse_ = False
+    else:
+        recurse_ = kwargs['recurse']
+    if(not('exts' in kwargs)):
+        filterexts_ = False
+    else:
+        filterexts_ = True
+        exts = kwargs['exts']
+    if(not('normpath' in kwargs)):
+        normpath_ = True
+    else:
+        normpath_ = kwargs['normpath']
+    if(not('linuxpath' in kwargs)):
+        linuxpath_ = False
+    else:
+        linuxpath_ = kwargs['linuxpath']
+    if(not('followlinks' in kwargs)):
+        followlinks_ = False
+    else:
+        followlinks_ = kwargs['followlinks']
+
+    dirlist = []
+    rawlist = os.listdir(pth)
+
+    for F in rawlist:
+        F2 = os.path.join(pth,F)
+        if(os.path.isdir(F2)):
+            b = (followlinks_) or ((not followlinks_) and not(os.path.islink(F2)))
+            if(b):
+                if((F2!=".")&(F2!="..")):
+                    dirlist.append(F2)
+        elif(os.path.isfile(F2)):
+            flst.append(F2)
+
+    #Recurse through directories
+    if(recurse_):
+        for D in dirlist:
+            lst = flist(D,**kwargs)
+            for L in lst:
+                flst.append(L)
+
+    #Postprocess:
+    #Filter out all extensions except the selected ext list
+    if(filterexts_):
+        flst = filterexts(flst,exts)
+
+    #Normalize filename path according to os
+    if(normpath_):
+        flst2 = list(flst)
+        for I in range(0,len(flst2)):
+            flst[I] = os.path.normpath(flst2[I])
+
+    #If linuxpath, convert all \\ to /
+    #if(linuxpath_):
+    #    flst2 = list(flst)
+    #    for I in range(0,len(flst2)):
+    #        flst[I] = linuxpath(flst2[I])
+    
+    return flst
+
+#Filters by extensions in a list of files
+#flst = def filterexts(flst,exts):
+def filterexts(flst,exts):
+    flst2 = []
+    if(isinstance(exts,str)):
+        exts = list([exts])
+    for F in flst:
+        b = False
+        for ext in exts:
+            if(ext[0]!='.'):
+                ext = '.'+ext
+            F2 = os.path.splitext(F)
+            if(len(F2)>=2):
+                ex = F2[1]
+                if(len(ex)>0):            
+                    if(ex[0]!='.'):
+                        ex = '.'+ex
+                    if(ex==ext):
+                        b = True
+        if(b):
+            flst2.append(F)
+              
+    return flst2
+
+#Find a file fname, starting in pth and recursing
+#Used for finding library files to link
+def findfile(fname,pth,**kwargs):
+    fullfname = ""
+    flst = flist(pth,recurse=True)
+    for F in flst:
+        F2 = os.path.split(F)[1]
+        if(F2 == fname):
+            fullfname = F
+    
+    return fullfname
+
+#List to space-seperated-string
+def list_to_sss(lst):
+    lout = ""
+    for I in range(0,len(lst)-1):
+        lout = lout + lst[I] + " "
+    if(len(lst)>0):
+        lout = lout + lst[len(lst)-1]
+    return lout
+
+def strip_whitespace(strin):
+    strout = ""
+    I1 = -1
+    I2 = -1
+    for I in range(0,len(strin)):
+        if(strin[I]!=' ' and strin[I]!='\t' and strin[I]!='\r'and strin[I]!='\n'):
+            I1 = I
+            break
+    q = list(range(0,len(strin)))
+    q.reverse()
+    for I in q:
+        if(strin[I]!=' ' and strin[I]!='\t' and strin[I]!='\r'and strin[I]!='\n'):
+            I2 = I+1
+            break
+    if(I1>=0 and I2>=0):
+        strout = strin[I1:I2]
+    return strout
+
+def sss_to_list(sss):
+    lout = []
+    l1 = sss.split(' ')
+    for l in l1:
+        l2 = strip_whitespace(l)
+        lout.append(l2)
+    return lout
+
+
+def replaceext(fname,ext):
+    fname2 = ""
+    if(len(ext)>0):
+        if(ext[0]!='.'):
+            ext = '.'+ext
+        fname2 = os.path.splitext(fname)[0]+ext
+    else:
+        fname2 = os.path.splitext(fname)[0]
+    return fname2
+
+def replaceexts(fnamelist,ext):
+    fname2list = []
+    for F in fnamelist:
+        F2 = replaceext(F,ext)
+        fname2list.append(F2)
+    return fname2list
+
+# def except_contains_oldv(lst1,exc):
+#     lst2 = []
+#     for item in lst1:
+#         b = 1
+#         for item2 in exc:
+#             if(item.find(item2)>=0):
+#                 b = 0
+#                 break
+#         if(b==1):
+#             lst2.append(item)
+#     return lst2
+
+#filenames must match
+def except_contains(lst1,exc):
+    lst2 = []
+    for item in lst1:
+        b = 1
+        for item2 in exc:
+            fsplit = os.path.split(item)
+            fn = fsplit[len(fsplit)-1]
+            if(fn==item2):
+                b = 0
+                break
+        if(b==1):
+            lst2.append(item)
+    return lst2
+
+##########################
+##System Call Procedures##
+##########################
+
+def callproc(cmd, **kwargs):
+    if(not('logfile' in kwargs)):
+        use_lf = False
+    else:
+        logfile = kwargs['logfile']
+        if(logfile!=""):
+            fp = open(kwargs['logfile'],'a+')
+            use_lf = True
+        else:
+            use_lf = False
+        
+    if(not('echo' in kwargs)):
+        echo = True
+    else:
+        echo = kwargs['echo']
+
+    if(echo):
+        print(cmd)
+
+    #encoding/deconding to/from bytes is necessary to use the subprocess command
+    #in python3.7
+    #However, only do this in linux
+    if(sys.platform!='win32'):
+        cmd2 = cmd.encode(encoding='utf-8')
+    else:
+        cmd2 = cmd
+    proc = subprocess.Popen(cmd2,stderr = subprocess.STDOUT, stdout=subprocess.PIPE, shell=True)
+    (out, err) = proc.communicate()
+
+    out = out.decode(encoding='utf-8')
+    
+    if(echo):
+        print(out)
+        #print(err);
+    if(use_lf):
+        fp.writelines(cmd+'\n')
+        fp.writelines(out+'\n')
+
+    if(use_lf):
+        fp.close()
+
+#######################################
+##Compiler, Archive, and Linker Calls##
+#######################################
+
+def smartcompile(srcfile,objext='.o'):
+    mtsrc = os.path.getmtime(srcfile)
+    objfile = replaceext(srcfile,objext)
+    objexists = os.path.exists(objfile)
+    ret = True
+    if(objexists):
+        mtobj = os.path.getmtime(objfile)
+        if(mtobj>=mtsrc):
+            ret = False
+    
+    return ret
+
+#MSVC compiler wrapper
+def msvc_compile(compilername, srcfile, **kwargs):
+
+    if(not('include' in kwargs)):
+        include = ''
+    else:
+        include = kwargs['include']
+        if(isinstance(include,list)):
+            include  = list_to_sss(include)
+    
+    if(not('flags' in kwargs)):
+        flags = ''
+    else:
+        flags = kwargs['flags']
+        if(isinstance(flags,list)):
+            flags = list_to_sss(flags)
+
+    if(not('objext' in kwargs)):
+        objext = '.obj'
+    else:
+        objext = kwargs['objext']
+    
+    if(not('srcfileflag' in kwargs)):
+        srcfileflag = '/c'
+    else:
+        srcfileflag = kwargs['srcfileflag']
+
+    if(not('outfileflag' in kwargs)):
+        outfileflag = '/Fo:'
+    else:
+        outfileflag = kwargs['outfileflag']
+    if(not('logfile' in kwargs)):
+        logfile = ""
+    else:
+        logfile = kwargs['logfile']
+
+    outfile = replaceext(srcfile,objext)
+    ln = compilername+" "+flags+" "+" "+srcfileflag+" "+srcfile+" "+outfileflag+outfile
+    ln = ln + " " + include
+
+    callproc(ln,echo=True,logfile=logfile)
+
+    return
+
+#MSVC compiler wrapper
+def msvc_compile_list(compiler,srclist,**kwargs):
+    for S in srclist:
+        msvc_compile(compiler,S,**kwargs)
+    return
+
+#gnu-style compiler compile: Should work with gcc, g++, gfortran
+def gs_compile(compiler,srcfile,**kwargs):
+    if(not('include' in kwargs)):
+        include = ''
+    else:
+        include = kwargs['include']
+        if(isinstance(include,list)):
+            include  = list_to_sss(include)
+    
+    if(not('flags' in kwargs)):
+        flags = ''
+    else:
+        flags = kwargs['flags']
+        if(isinstance(flags,list)):
+            flags = list_to_sss(flags)
+
+    if(not('objext' in kwargs)):
+        objext = '.o'
+    else:
+        objext = kwargs['objext']
+
+    if(not('srcfileflag' in kwargs)):
+        srcfileflag = '-c'
+    else:
+        srcfileflag = kwargs['srcfileflag']
+
+    if(not('outfileflag' in kwargs)):
+        outfileflag = '-o'
+    else:
+        outfileflag = kwargs['outfileflag']
+
+    if(not('logfile' in kwargs)):
+        logfile = ""
+    else:
+        logfile = kwargs['logfile']
+
+    if(not('smartcompile' in kwargs)):
+        _smartcompile = True
+    else:
+        _smartcompile = kwargs['smartcompile']
+    
+    #Do I want to make this thing this general?
+
+    if(not(_smartcompile) or smartcompile(srcfile,objext)):
+        outfile = replaceext(srcfile,objext)
+        ln = compiler+" "+flags+" " + outfileflag+" "+outfile+" "+srcfileflag+" "+srcfile
+        ln = ln + " " + include
+
+        callproc(ln,echo=True,logfile=logfile)
+            
+    return
+
+def gs_compile_list(compiler,srclist,**kwargs):
+    for S in srclist:
+        gs_compile(compiler,S,**kwargs)
+    return
+
+def gs_compile_all(compiler,srcdir,srcexts,**kwargs):
+    if(not('recurse' in kwargs)):
+        recurse = True
+    else:
+        recurse = kwargs['recurse']
+
+    srcfils = flist(srcdir,exts=srcexts,recurse=recurse)
+    
+    for S in srcfils:
+        gs_compile(compiler,S,**kwargs)
+
+    return
+
+def gs_link_all(linker,srcpath,target,**kwargs):
+
+    if(not('objext' in kwargs)):
+        objext = '.o'
+    else:
+        objext = kwargs['objext']
+
+    if(not('recurse' in kwargs)):
+        recurse = True
+    else:
+        recurse = kwargs['recurse']
+
+
+    objfils = flist(srcpath,exts=objext,recurse=recurse)
+    oflst = list_to_sss(objfils)
+    
+    gs_link_list(linker,oflst,target,**kwargs)
+    
+    return
+
+def gs_link_list(linker,objlist,target,**kwargs):
+
+    if(not('objext' in kwargs)):
+        objext = '.o'
+    else:
+        objext = kwargs['objext']
+
+    if(not('libdir' in kwargs)):
+        libdir = ''
+    else:
+        libdir = kwargs['libdir']
+
+    if(not('staticlibs' in kwargs)):
+        staticlibs = ''
+    else:
+        staticlibs = kwargs['staticlibs']
+
+    if(not('libflags' in kwargs)):
+        libflags = ''
+    else:
+        libflags = kwargs['libflags']
+
+    if(not('linkerflags' in kwargs)):
+        linkerflags = ''
+    else:
+        linkerflags = kwargs['linkerflags']
+
+    if(not('recurse' in kwargs)):
+        recurse = True
+    else:
+        recurse = kwargs['recurse']
+
+    if(not('logfile' in kwargs)):
+        logfile = ''
+    else:
+        logfile = kwargs['logfile']
+
+    ln = linker+" -o "+target+" "+libdir
+    ln = ln+" "+objlist+" "+staticlibs+" "+libflags+" "+linkerflags
+    
+    callproc(ln,logfile=logfile)
+    return
+
+def msvc_link_list(objlist,target,**kwargs):
+
+    linker = 'link'
+
+    if(not('objext' in kwargs)):
+        objext = '.obj'
+    else:
+        objext = kwargs['objext']
+
+    if(not('libdir' in kwargs)):
+        libdir = ''
+    else:
+        libdir = kwargs['libdir']
+
+    if(not('staticlibs' in kwargs)):
+        staticlibs = ''
+    else:
+        staticlibs = kwargs['staticlibs']
+
+    if(not('libflags' in kwargs)):
+        libflags = ''
+    else:
+        libflags = kwargs['libflags']
+
+    if(not('linkerflags' in kwargs)):
+        linkerflags = ''
+    else:
+        linkerflags = kwargs['linkerflags']
+
+    if(not('recurse' in kwargs)):
+        recurse = True
+    else:
+        recurse = kwargs['recurse']
+
+    if(not('logfile' in kwargs)):
+        logfile = ''
+    else:
+        logfile = kwargs['logfile']
+
+    ln = linker+" "+libdir
+    ln = ln+" "+objlist+" "+staticlibs+" "+linkerflags
+    ln = ln+" /out:"+target+" "+libflags
+    
+    callproc(ln,logfile=logfile)
+
+    return
+
+def ar_all(srcpath,arname,**kwargs):
+    if(not('recurse' in kwargs)):
+        recurse = True
+    else:
+        recurse = kwargs['recurse']
+    if(not('objext' in kwargs)):
+        objext = '.o'
+    else:
+        objext = kwargs['objext']
+    
+    objlist = flist(srcpath,exts=objext,recurse=recurse)
+    ar_list(objlist,arname,**kwargs)
+    
+    return
+
+def msvc_lib_list(objlist,arname,**kwargs):
+    objlist2 = list_to_sss(objlist)
+    
+    ln = "lib "+objlist2+" /out:"+arname
+    callproc(ln)
+
+    return
+
+def ar_list(objlist,arname,**kwargs):
+    objlist2 = list_to_sss(objlist)
+
+    ln = "ar cr "+ arname+" "+objlist2
+    callproc(ln)
+    
+    return
+
+def ar_add_list(objlist,arname,**kwargs):
+    objlist2 = list_to_sss(objlist)
+
+    ln = "ar t "+arname+" "+objlist2
+    callproc(ln)
+    return
+
+##############################
+##Derived Compiler Functions##
+##############################
+
+def gcc_compile(srcfile,**kwargs):
+    compiler = 'gcc'
+    kwargs['objext'] = '.o'
+    #srcexts = ['.c']
+    
+    gs_compile(compiler,srcfile,**kwargs)
+    
+    return
+
+def gcc_compile_all(srcdir,**kwargs):
+    compiler = 'gcc'
+    kwargs['objext'] = '.o'
+    srcexts = ['.c']
+
+    gs_compile_all(compiler,srcdir,srcexts,**kwargs)
+    
+    return
+
+def gcc_compile_list(srclist,**kwargs):
+    compiler = 'gcc'
+    kwargs['objext'] = '.o'
+    #srcexts = ['.c']
+
+    gs_compile_list(compiler,srclist,**kwargs)
+
+    return
+
+def gpp_compile(srcfile,**kwargs):
+    compiler = 'g++'
+    kwargs['objext'] = '.o'
+    #srcexts = ['.c','.cpp']
+    
+    gs_compile(compiler,srcfile,**kwargs)
+    
+    return
+
+def gpp_compile_all(srcdir,**kwargs):
+    compiler = 'g++'
+    kwargs['objext'] = '.o'
+    srcexts = ['.c','.cpp']
+
+    gs_compile_all(compiler,srcdir,srcexts,**kwargs)
+
+    return
+
+def gpp_compile_list(srclist,**kwargs):
+    compiler = 'g++'
+    kwargs['objext'] = '.o'
+    #srcexts = ['.c','.cpp']
+
+    gs_compile_list(compiler,srclist,**kwargs)
+
+    return
+
+def gfortran_compile(srcfile,**kwargs):
+    compiler = 'gfortran'
+    kwargs['objext'] = '.o'
+    #srcexts = ['.f','.f90','.f77']
+
+    gs_compile(compiler,srcfile,**kwargs)
+    
+    return
+
+def gfortran_compile_all(srcdir,**kwargs):
+    compiler = 'gfortran'
+    kwargs['objext'] = '.o'
+    srcexts = ['.f','.f90','.f77']
+
+    gs_compile_all(compiler,srcdir,srcexts,**kwargs)
+
+    return
+
+def gfortran_compile_list(srclist,**kwargs):
+    compiler = 'gfortran'
+    kwargs['objext'] = '.o'
+    #srcexts = ['.f','.f90','.f77']
+
+    gs_compile_list(compiler,srclist,**kwargs)
+
+    return
+
+def clang_compile(srcfile,**kwargs):
+    compiler = 'clang++'
+    kwargs['objext'] = '.o'
+    #srcexts = ['.c','.cpp']
+    
+    gs_compile(compiler,srcfile,**kwargs)
+    
+    return
+
+def clang_compile_all(srcdir,**kwargs):
+    compiler = 'clang++'
+    kwargs['objext'] = '.o'
+    srcexts = ['.c','.cpp']
+
+    gs_compile_all(compiler,srcdir,srcexts,**kwargs)
+    
+    return
+
+def clang_compile_list(srclist,**kwargs):
+    compiler = 'clang++'
+    kwargs['objext'] = '.o'
+    #srcexts = ['.c','.cpp']
+
+    gs_compile_list(compiler,srclist,**kwargs)
+
+    return
--- a/compscripts/complib3.py
+++ b/compscripts/complib3.py
@ -0,0 +1,524 @@
+#!/usr/bin/python3
+
+import os,sys,math
+import subprocess
+
+##flist - list all files in a given directory pth
+##optional arguments:
+#   recurse - (T/F): Whether to recursively search for files in directory tree
+#   exts - (list): A list of file extensions to filter on
+#   normpath (T/F): whether to normalize path variables after
+#filelist = flist(pth,**kwargs): 
+def flist(pth,**kwargs): 
+    flst = []
+    if(not('recurse' in kwargs)):
+        recurse_ = False
+    else:
+        recurse_ = kwargs['recurse']
+    if(not('exts' in kwargs)):
+        filterexts_ = False
+    else:
+        filterexts_ = True
+        exts = kwargs['exts']
+    if(not('normpath' in kwargs)):
+        normpath_ = True
+    else:
+        normpath_ = kwargs['normpath']
+    if(not('linuxpath' in kwargs)):
+        linuxpath_ = False
+    else:
+        linuxpath_ = kwargs['linuxpath']
+    if(not('followlinks' in kwargs)):
+        followlinks_ = False
+    else:
+        followlinks_ = kwargs['followlinks']
+
+    dirlist = []
+    rawlist = os.listdir(pth)
+
+    for F in rawlist:
+        F2 = os.path.join(pth,F)
+        if(os.path.isdir(F2)):
+            b = (followlinks_) or ((not followlinks_) and not(os.path.islink(F2)))
+            if(b):
+                if((F2!=".")&(F2!="..")):
+                    dirlist.append(F2)
+        elif(os.path.isfile(F2)):
+            flst.append(F2)
+
+    #Recurse through directories
+    if(recurse_):
+        for D in dirlist:
+            lst = flist(D,**kwargs)
+            for L in lst:
+                flst.append(L)
+
+    #Postprocess:
+    #Filter out all extensions except the selected ext list
+    if(filterexts_):
+        flst = filterexts(flst,exts)
+
+    #Normalize filename path according to os
+    if(normpath_):
+        flst2 = list(flst)
+        for I in range(0,len(flst2)):
+            flst[I] = os.path.normpath(flst2[I])
+
+    #If linuxpath, convert all \\ to /
+    #if(linuxpath_):
+    #    flst2 = list(flst)
+    #    for I in range(0,len(flst2)):
+    #        flst[I] = linuxpath(flst2[I])
+    
+    return flst
+
+#Filters by extensions in a list of files
+#flst = def filterexts(flst,exts):
+def filterexts(flst,exts):
+    flst2 = []
+    if(isinstance(exts,str)):
+        exts = list([exts])
+    for F in flst:
+        b = False
+        for ext in exts:
+            if(ext[0]!='.'):
+                ext = '.'+ext
+            F2 = os.path.splitext(F)
+            if(len(F2)>=2):
+                ex = F2[1]
+                if(len(ex)>0):            
+                    if(ex[0]!='.'):
+                        ex = '.'+ex
+                    if(ex==ext):
+                        b = True
+        if(b):
+            flst2.append(F)
+              
+    return flst2
+
+#Find a file fname, starting in pth and recursing
+#Used for finding library files to link
+def findfile(fname,pth,**kwargs):
+    fullfname = ""
+    flst = flist(pth,recurse=True)
+    for F in flst:
+        F2 = os.path.split(F)[1]
+        if(F2 == fname):
+            fullfname = F
+    
+    return fullfname
+
+def replaceext(fname,ext):
+    fname2 = ""
+    if(len(ext)>0):
+        if(ext[0]!='.'):
+            ext = '.'+ext
+        fname2 = os.path.splitext(fname)[0]+ext
+    else:
+        fname2 = os.path.splitext(fname)[0]
+    return fname2
+
+def replaceexts(fnamelist,ext):
+    fname2list = []
+    for F in fnamelist:
+        F2 = replaceext(F,ext)
+        fname2list.append(F2)
+    return fname2list
+
+#filenames must match
+def except_contains(lst1,exc):
+    lst2 = []
+    for item in lst1:
+        b = 1
+        for item2 in exc:
+            fsplit = os.path.split(item)
+            fn = fsplit[len(fsplit)-1]
+            if(fn==item2):
+                b = 0
+                break
+        if(b==1):
+            lst2.append(item)
+    return lst2
+
+##########################
+##System Call Procedures##
+##########################
+
+def callproc(cmd, **kwargs):
+    if(not('logfile' in kwargs)):
+        use_lf = False
+    else:
+        logfile = kwargs['logfile']
+        if(logfile!=""):
+            fp = open(kwargs['logfile'],'a+')
+            use_lf = True
+        else:
+            use_lf = False
+        
+    if(not('echo' in kwargs)):
+        echo = True
+    else:
+        echo = kwargs['echo']
+
+    if(echo):
+        print(cmd)
+
+    #encoding/deconding to/from bytes is necessary to use the subprocess command
+    #in python3.7
+    #However, only do this in linux
+    if(sys.platform!='win32'):
+        cmd2 = cmd.encode(encoding='utf-8')
+    else:
+        cmd2 = cmd
+    proc = subprocess.Popen(cmd2,stderr = subprocess.STDOUT, stdout=subprocess.PIPE, shell=True)
+    (out, err) = proc.communicate()
+
+    out = out.decode(encoding='utf-8')
+    
+    if(echo):
+        print(out)
+        #print(err);
+    if(use_lf):
+        fp.writelines(cmd+'\n')
+        fp.writelines(out+'\n')
+
+    if(use_lf):
+        fp.close()
+
+#List to space-seperated-string
+def list_to_sss(lst):
+    lout = ""
+    for I in range(0,len(lst)-1):
+        lout = lout + lst[I] + " "
+    if(len(lst)>0):
+        lout = lout + lst[len(lst)-1]
+    return lout
+
+#####################################
+## Incremental Compilation Library ##
+#####################################
+
+#silently read lines from a text file if exists
+def readtextlines(fname):
+    txtlns = []
+
+    if(not os.path.isfile(fname)):
+        return txtlns
+
+    try:
+        fp = open(fname,"r")
+    except:
+        return txtlns
+
+    ln = " "
+    while(ln!=""):
+        ln = fp.readline()
+        txtlns.append(ln)
+
+    fp.close()
+
+    return txtlns
+
+def getincludefnfrage(includeline):
+
+    fnfrag = ""
+    I1 = -1
+    I2 = -1
+
+    for I in range(0,len(includeline)):
+        if(I1<0 and (includeline[I]=='<' or includeline[I]=='"')):
+            I1 = I
+        if(I1>=0 and (includeline[I]=='>' or includeline[I]=='"')):
+            I2 = I
+            break
+    if(I1>=0 and I2>=0):
+        fnfrag = includeline[I1+1:I2]
+
+    return fnfrag
+
+#Returns the name of the source file fname (if it exists)
+#and all included filenames
+def getsrcandincludes(fname, incdirs):
+
+    flist = []
+    if(os.path.isfile(fname)):
+        flist.append(fname)
+
+        Ilist = 0
+        while(Ilist<len(flist)):
+            #recurse through files
+            f1 = flist[Ilist]
+            lns = readtextlines(f1)
+            for J in range(0,len(lns)):
+                if(lns[J].find("#include")>=0):
+                    fnfrag = getincludefnfrage(lns[J])
+                    for K in range(0,len(incdirs)):
+                        tfn = os.path.join(incdirs[K],fnfrag)
+                        if(os.path.isfile(tfn)):
+                            flist.append(tfn)
+                            break
+
+            Ilist = Ilist + 1
+
+    return flist
+
+#Returns the name of the object file associated with the source file
+#within the object store folder (if it exists)
+def getobjfile(fname,objstore,objext = ".o"):
+
+    fret = ""
+    f1 = os.path.split(fname)[1]
+    f2 = f1
+    while(os.path.splitext(f2)[1]!=""):
+        f2 = os.path.splitext(f2)[0]
+    objext = objext.strip('.')
+    f3 = os.path.join(objstore,"{}.{}".format(f2,objext))
+    if(os.path.exists(f3)):
+        fret = f3
+
+    return fret
+
+def getsrctimes(fname, incdirs):
+
+    ftimes = []
+    flst = getsrcandincludes(fname, incdirs)
+    for I in range(0,len(flst)):
+        f = flst[I]
+        mt = os.path.getmtime(f)
+        ftimes.append(mt)
+    
+    return ftimes
+
+def getobjtime(fname,objstore,objext=".o"):
+    ret = -1
+    fret = getobjfile(fname,objstore,objext)
+    if(fret!=""):
+        ret = os.path.getmtime(fret)
+
+    return ret
+
+#Decide whether or not to compile source file
+def decidecompile(fname,**kwargs):
+    ret = True
+
+    if(not os.path.isfile(fname)):
+        ret = False
+        return ret
+    
+    ##unpack kwargs
+    if("searchincdirs" in kwargs):
+        incdirs = kwargs["searchincdirs"]
+    else:
+        incdirs = ["./include"]
+    
+    if("objext" in kwargs):
+        objext = kwargs["objext"]
+    else:
+        objext = ".o"
+    if("objstore" in kwargs):
+        objstore = kwargs["objstore"]
+    else:
+        objstore = "./objstore"
+    
+    
+    srclist = getsrcandincludes(fname,incdirs)
+    srctlist = getsrctimes(fname,incdirs)
+    obj = getobjfile(fname,objstore,objext)
+    objt = getobjtime(fname,objstore,objext)
+    
+    if(obj!=""):
+        ret = False
+        for I in range(0,len(srctlist)):
+            if(srctlist[I]>objt):
+                ret = True
+                break
+
+    return ret
+
+def gs_incremental_compile(compiler,srcfile,**kwargs):
+
+    if(not('include' in kwargs)):
+        include = ''
+    else:
+        include = kwargs['include']
+        if(isinstance(include,list)):
+            include  = list_to_sss(include)
+    if(not('flags' in kwargs)):
+        flags = ''
+    else:
+        flags = kwargs['flags']
+        if(isinstance(flags,list)):
+            flags = list_to_sss(flags)
+    if(not('objext' in kwargs)):
+        objext = '.o'
+    else:
+        objext = kwargs['objext']
+    if(not('srcfileflag' in kwargs)):
+        srcfileflag = '-c'
+    else:
+        srcfileflag = kwargs['srcfileflag']
+    if(not('outfileflag' in kwargs)):
+        outfileflag = '-o'
+    else:
+        outfileflag = kwargs['outfileflag']
+
+    if(not('logfile' in kwargs)):
+        logfile = ""
+    else:
+        logfile = kwargs['logfile']
+    if(not('smartcompile' in kwargs)):
+        _smartcompile = True
+    else:
+        _smartcompile = kwargs['smartcompile']
+
+    #incrementalcompile
+    if("searchincdirs" in kwargs):
+        incdirs = kwargs["searchincdirs"]
+    else:
+        incdirs = ["./include"]
+    
+    if("objext" in kwargs):
+        objext = kwargs["objext"]
+    else:
+        objext = ".o"
+    if("objstore" in kwargs):
+        objstore = kwargs["objstore"]
+    else:
+        objstore = "./objstore"
+    
+    #Do I want to make this thing this general?
+
+    docompile = decidecompile(srcfile,**kwargs)
+
+    if(docompile):
+        f1 = os.path.split(srcfile)[1]
+        f2 = f1
+        while(os.path.splitext(f2)[1]!=""):
+            f2 = os.path.splitext(f2)[0]
+        outfile = os.path.join(objstore,"{}{}".format(f2,objext))
+
+        ln = compiler+" "+flags+" " + outfileflag+" "+outfile+" "+srcfileflag+" "+srcfile
+        ln = ln + " " + include
+
+        callproc(ln,echo=True,logfile=logfile)
+
+    return
+
+def gs_incremental_compile_list(compiler,srclist,**kwargs):
+
+    for s in srclist:
+        gs_incremental_compile(compiler,s,**kwargs)
+
+    return
+
+#MSVC compiler wrapper
+
+def msvc_incremental_compile(compilername, srcfile, **kwargs):
+
+    if(not('include' in kwargs)):
+        include = ''
+    else:
+        include = kwargs['include']
+        if(isinstance(include,list)):
+            include  = list_to_sss(include)
+    
+    if(not('flags' in kwargs)):
+        flags = ''
+    else:
+        flags = kwargs['flags']
+        if(isinstance(flags,list)):
+            flags = list_to_sss(flags)
+
+    if(not('objext' in kwargs)):
+        objext = '.obj'
+    else:
+        objext = kwargs['objext']
+    
+    if(not('srcfileflag' in kwargs)):
+        srcfileflag = '/c'
+    else:
+        srcfileflag = kwargs['srcfileflag']
+
+    if(not('outfileflag' in kwargs)):
+        outfileflag = '/Fo:'
+    else:
+        outfileflag = kwargs['outfileflag']
+    if(not('logfile' in kwargs)):
+        logfile = ""
+    else:
+        logfile = kwargs['logfile']
+
+    #incrementalcompile
+    if("searchincdirs" in kwargs):
+        incdirs = kwargs["searchincdirs"]
+    else:
+        incdirs = ["./include"]
+    # if("objext" in kwargs):
+    #     objext = kwargs["objext"]
+    # else:
+    #     objext = ".o"
+    if("objstore" in kwargs):
+        objstore = kwargs["objstore"]
+    else:
+        objstore = "./objstore"
+
+    docompile = decidecompile(srcfile,**kwargs)
+
+    if(docompile):
+        f1 = os.path.split(srcfile)[1]
+        f2 = f1
+        while(os.path.splitext(f2)[1]!=""):
+            f2 = os.path.splitext(f2)[0]
+        outfile = os.path.join(objstore,"{}{}".format(f2,objext))
+
+        ln = compilername+" "+flags+" "+srcfileflag+" "+srcfile+" "+outfileflag+" "+outfile
+        ln = ln + " " + include
+
+        callproc(ln,echo=True,logfile=logfile)
+
+    # outfile = replaceext(srcfile,objext)
+    # ln = compilername+" "+flags+" "+" "+srcfileflag+" "+srcfile+" "+outfileflag+outfile
+    # ln = ln + " " + include
+
+    callproc(ln,echo=True,logfile=logfile)
+
+    return
+
+def msvc_incremental_compile_list(compiler,srclist,**kwargs):
+    for S in srclist:
+        msvc_incremental_compile(compiler,S,**kwargs)
+    return
+
+#######################
+## Main Script Tests ##
+#######################
+
+def testtimes(args):
+    if(len(args)>=2):
+        flist = getsrcandincludes(args[1],["./include"])
+        ftlist = getsrctimes(args[1],["./include"])
+        for I in range(0,len(flist)):
+            print("{}\t\t{}".format(flist[I],ftlist[I]))
+        
+        print("associated obj file:")
+        fobj = getobjfile(args[1],"./objstore")
+        ftobj = getobjtime(args[1],"./objstore")
+        if(fobj!=""):
+            print("{}\t\t{}".format(fobj,ftobj))
+        else:
+            print("none found")
+
+        cflag = decidecompile(args[1])
+        print("compile? : {}".format(cflag))
+
+    
+    return
+
+# if(__name__ == "__main__"):
+
+#     args = sys.argv
+#     testtimes(args)
+    
+        
+
+        
--- a/compscripts/linux64.makelib.py
+++ b/compscripts/linux64.makelib.py
@ -0,0 +1,52 @@
+#!/usr/bin/python3
+
+import os,sys,subprocess,math
+from complib2 import *
+from complib3 import gs_incremental_compile, gs_incremental_compile_list
+
+import shutil
+#from distutils.dir_util import copy_tree as copy_tree #this version does overwrites
+from shutil import copytree
+
+libname = 'amsculib2.linux64' #prefix static library name to generate
+targetname = 'test' #create this executable when compiling tests
+commonincdir = "../../linux64/include"
+commonlibdir = "../../linux64/lib"
+localbindir = "./bin_linux64"
+cc = 'nvcc' #compiler
+srcexts = ['.c','.cpp','.cu']
+mainsrc = ['main.cu'] #ignore these files when compiling the static library
+
+kwargs = dict()
+include = "-I./include -I{}".format(commonincdir)
+kwargs['include'] = include
+#-dc flag: relocatable device code - needed for device functions to link in different "execution units"
+#--ptxas-options=-v
+kwargs['flags'] = "-dc --compiler-options '-fPIC -O3'"
+kwargs['libdir'] = "-L{} -L{}".format(localbindir,commonlibdir)
+kwargs['libflags'] = "-l{}".format(libname)
+kwargs['linkerflags'] = ""
+kwargs['recurse'] = True
+kwargs['objstore'] = "./objstore"
+kwargs['searchincdirs'] = ['./include']
+
+#find all source files, except the main project files
+files = flist('./src',exts = srcexts, recurse=True)
+files = except_contains(files,mainsrc)
+objfiles = replaceexts(files,'.o')
+objfiles_sss = list_to_sss(objfiles)
+
+#compile all the source files in the list
+#gs_compile_list(cc,files,**kwargs)
+gs_incremental_compile_list(cc,files,**kwargs)
+
+#archive all the source files into a static library
+#ar_list(objfiles,'{}/lib{}.a'.format(localbindir,libname))
+objlist = flist(kwargs['objstore'],exts='.o',recurse=True)
+ar_list(objlist,'{}/lib{}.a'.format(localbindir,libname))
+
+# #Push any libraries to the common lib folder
+shutil.copy('{}/lib{}.a'.format(localbindir,libname),commonlibdir)
+
+# #Copy include files to the common include folder
+copytree('./include/',commonincdir+'/',dirs_exist_ok=True)
--- a/compscripts/linux64.maketest.py
+++ b/compscripts/linux64.maketest.py
@ -0,0 +1,43 @@
+#!/usr/bin/python3
+
+import os,sys,subprocess,math
+from complib2 import *
+from complib3 import gs_incremental_compile, gs_incremental_compile_list
+
+import shutil
+
+libname = 'amsculib2.linux64' #prefix static library name to generate
+targetname = 'test' #create this executable when compiling tests
+commonincdir = "../../linux64/include"
+commonlibdir = "../../linux64/lib"
+localbindir = "./bin_linux64"
+cc = 'nvcc' #compiler
+srcexts = ['.c','.cpp','.cu']
+mainsrc = ['main.cu'] #ignore these files when compiling the static library
+
+kwargs = dict()
+include = "-I./include -I{}".format(commonincdir)
+kwargs['include'] = include
+#-dc flag: relocatable device code - needed for device functions to link in different "execution units"
+kwargs['flags'] = "-dc --compiler-options '-fPIC'"
+kwargs['libdir'] = "-L{} -L{}".format(localbindir,commonlibdir)
+kwargs['libflags'] = "-l{} -lamsculib2.linux64".format(libname)
+kwargs['linkerflags'] = ""
+kwargs['recurse'] = True
+kwargs['objstore'] = "./objstore"
+kwargs['searchincdirs'] = ['./include']
+
+#-lamsmathlib3.linux64 -lamsstring3.linux64 -lamsmatrix_cpp.linux64 -llapack -lblas -lgfortran -lamsmathutilthread.linux64 -lamsmathutil2.linux64
+
+#Pull required binary dynamic libraries to the bin folder
+#shutil.copy('{}/libamsimg.dll.a'.format(commonlibdir),localbindir);
+#shutil.copy('{}/libamsimg.dll'.format(commonlibdir),localbindir);
+#shutil.copy('../../lib_winx64/glew32.dll','./bin_winx64');
+
+#Designate source files for main test program
+fsrc = ['./src/main.cu']
+fobj = replaceexts(fsrc,'.o')
+
+#Compile test programs
+gs_compile_list(cc,fsrc,**kwargs)
+gs_link_list(cc,list_to_sss(fobj),'{}/{}'.format(localbindir,targetname),**kwargs)
--- a/compscripts/old/complib2.py
+++ b/compscripts/old/complib2.py
@ -0,0 +1,639 @@
+#!/usr/bin/python3
+
+#Python3 compilation library
+#Aaron M. Schinder
+#29 Dec 2020
+#
+#Cleanup and refactor from 2017 python2 version compilation libraries
+
+import os,sys,math,subprocess
+
+#####################
+#Directory Functions#
+#####################
+
+##flist - list all files in a given directory pth
+##optional arguments:
+#   recurse - (T/F): Whether to recursively search for files in directory tree
+#   exts - (list): A list of file extensions to filter on
+#   normpath (T/F): whether to normalize path variables after
+#filelist = flist(pth,**kwargs): 
+def flist(pth,**kwargs): 
+    flst = []
+    if(not('recurse' in kwargs)):
+        recurse_ = False
+    else:
+        recurse_ = kwargs['recurse']
+    if(not('exts' in kwargs)):
+        filterexts_ = False
+    else:
+        filterexts_ = True
+        exts = kwargs['exts']
+    if(not('normpath' in kwargs)):
+        normpath_ = True
+    else:
+        normpath_ = kwargs['normpath']
+    if(not('linuxpath' in kwargs)):
+        linuxpath_ = False
+    else:
+        linuxpath_ = kwargs['linuxpath']
+    if(not('followlinks' in kwargs)):
+        followlinks_ = False
+    else:
+        followlinks_ = kwargs['followlinks']
+
+    dirlist = []
+    rawlist = os.listdir(pth)
+
+    for F in rawlist:
+        F2 = os.path.join(pth,F)
+        if(os.path.isdir(F2)):
+            b = (followlinks_) or ((not followlinks_) and not(os.path.islink(F2)))
+            if(b):
+                if((F2!=".")&(F2!="..")):
+                    dirlist.append(F2)
+        elif(os.path.isfile(F2)):
+            flst.append(F2)
+
+    #Recurse through directories
+    if(recurse_):
+        for D in dirlist:
+            lst = flist(D,**kwargs)
+            for L in lst:
+                flst.append(L)
+
+    #Postprocess:
+    #Filter out all extensions except the selected ext list
+    if(filterexts_):
+        flst = filterexts(flst,exts)
+
+    #Normalize filename path according to os
+    if(normpath_):
+        flst2 = list(flst)
+        for I in range(0,len(flst2)):
+            flst[I] = os.path.normpath(flst2[I])
+
+    #If linuxpath, convert all \\ to /
+    #if(linuxpath_):
+    #    flst2 = list(flst)
+    #    for I in range(0,len(flst2)):
+    #        flst[I] = linuxpath(flst2[I])
+    
+    return flst
+
+#Filters by extensions in a list of files
+#flst = def filterexts(flst,exts):
+def filterexts(flst,exts):
+    flst2 = []
+    if(isinstance(exts,str)):
+        exts = list([exts])
+    for F in flst:
+        b = False
+        for ext in exts:
+            if(ext[0]!='.'):
+                ext = '.'+ext
+            F2 = os.path.splitext(F)
+            if(len(F2)>=2):
+                ex = F2[1]
+                if(len(ex)>0):            
+                    if(ex[0]!='.'):
+                        ex = '.'+ex
+                    if(ex==ext):
+                        b = True
+        if(b):
+            flst2.append(F)
+              
+    return flst2
+
+#Find a file fname, starting in pth and recursing
+#Used for finding library files to link
+def findfile(fname,pth,**kwargs):
+    fullfname = ""
+    flst = flist(pth,recurse=True)
+    for F in flst:
+        F2 = os.path.split(F)[1]
+        if(F2 == fname):
+            fullfname = F
+    
+    return fullfname
+
+#List to space-seperated-string
+def list_to_sss(lst):
+    lout = ""
+    for I in range(0,len(lst)-1):
+        lout = lout + lst[I] + " "
+    if(len(lst)>0):
+        lout = lout + lst[len(lst)-1]
+    return lout
+
+def strip_whitespace(strin):
+    strout = ""
+    I1 = -1
+    I2 = -1
+    for I in range(0,len(strin)):
+        if(strin[I]!=' ' and strin[I]!='\t' and strin[I]!='\r'and strin[I]!='\n'):
+            I1 = I
+            break
+    q = list(range(0,len(strin)))
+    q.reverse()
+    for I in q:
+        if(strin[I]!=' ' and strin[I]!='\t' and strin[I]!='\r'and strin[I]!='\n'):
+            I2 = I+1
+            break
+    if(I1>=0 and I2>=0):
+        strout = strin[I1:I2]
+    return strout
+
+def sss_to_list(sss):
+    lout = []
+    l1 = sss.split(' ')
+    for l in l1:
+        l2 = strip_whitespace(l)
+        lout.append(l2)
+    return lout
+
+
+def replaceext(fname,ext):
+    fname2 = ""
+    if(len(ext)>0):
+        if(ext[0]!='.'):
+            ext = '.'+ext
+        fname2 = os.path.splitext(fname)[0]+ext
+    else:
+        fname2 = os.path.splitext(fname)[0]
+    return fname2
+
+def replaceexts(fnamelist,ext):
+    fname2list = []
+    for F in fnamelist:
+        F2 = replaceext(F,ext)
+        fname2list.append(F2)
+    return fname2list
+
+# def except_contains_oldv(lst1,exc):
+#     lst2 = []
+#     for item in lst1:
+#         b = 1
+#         for item2 in exc:
+#             if(item.find(item2)>=0):
+#                 b = 0
+#                 break
+#         if(b==1):
+#             lst2.append(item)
+#     return lst2
+
+#filenames must match
+def except_contains(lst1,exc):
+    lst2 = []
+    for item in lst1:
+        b = 1
+        for item2 in exc:
+            fsplit = os.path.split(item)
+            fn = fsplit[len(fsplit)-1]
+            if(fn==item2):
+                b = 0
+                break
+        if(b==1):
+            lst2.append(item)
+    return lst2
+
+##########################
+##System Call Procedures##
+##########################
+
+def callproc(cmd, **kwargs):
+    if(not('logfile' in kwargs)):
+        use_lf = False
+    else:
+        logfile = kwargs['logfile']
+        if(logfile!=""):
+            fp = open(kwargs['logfile'],'a+')
+            use_lf = True
+        else:
+            use_lf = False
+        
+    if(not('echo' in kwargs)):
+        echo = True
+    else:
+        echo = kwargs['echo']
+
+    if(echo):
+        print(cmd)
+
+    #encoding/deconding to/from bytes is necessary to use the subprocess command
+    #in python3.7
+    #However, only do this in linux
+    if(sys.platform!='win32'):
+        cmd2 = cmd.encode(encoding='utf-8')
+    else:
+        cmd2 = cmd
+    proc = subprocess.Popen(cmd2,stderr = subprocess.STDOUT, stdout=subprocess.PIPE, shell=True)
+    (out, err) = proc.communicate()
+
+    out = out.decode(encoding='utf-8')
+    
+    if(echo):
+        print(out)
+        #print(err);
+    if(use_lf):
+        fp.writelines(cmd+'\n')
+        fp.writelines(out+'\n')
+
+    if(use_lf):
+        fp.close()
+
+#######################################
+##Compiler, Archive, and Linker Calls##
+#######################################
+
+def smartcompile(srcfile,objext='.o'):
+    mtsrc = os.path.getmtime(srcfile)
+    objfile = replaceext(srcfile,objext)
+    objexists = os.path.exists(objfile)
+    ret = True
+    if(objexists):
+        mtobj = os.path.getmtime(objfile)
+        if(mtobj>=mtsrc):
+            ret = False
+    
+    return ret
+
+#MSVC compiler wrapper
+def msvc_compile(compilername, srcfile, **kwargs):
+
+    if(not('include' in kwargs)):
+        include = ''
+    else:
+        include = kwargs['include']
+        if(isinstance(include,list)):
+            include  = list_to_sss(include)
+    
+    if(not('flags' in kwargs)):
+        flags = ''
+    else:
+        flags = kwargs['flags']
+        if(isinstance(flags,list)):
+            flags = list_to_sss(flags)
+
+    if(not('objext' in kwargs)):
+        objext = '.obj'
+    else:
+        objext = kwargs['objext']
+    
+    if(not('srcfileflag' in kwargs)):
+        srcfileflag = '/c'
+    else:
+        srcfileflag = kwargs['srcfileflag']
+
+    if(not('outfileflag' in kwargs)):
+        outfileflag = '/Fo:'
+    else:
+        outfileflag = kwargs['outfileflag']
+    if(not('logfile' in kwargs)):
+        logfile = ""
+    else:
+        logfile = kwargs['logfile']
+
+    outfile = replaceext(srcfile,objext)
+    ln = compilername+" "+flags+" "+" "+srcfileflag+" "+srcfile+" "+outfileflag+outfile
+    ln = ln + " " + include
+
+    callproc(ln,echo=True,logfile=logfile)
+
+    return
+
+#MSVC compiler wrapper
+def msvc_compile_list(compiler,srclist,**kwargs):
+    for S in srclist:
+        msvc_compile(compiler,S,**kwargs)
+    return
+
+#gnu-style compiler compile: Should work with gcc, g++, gfortran
+def gs_compile(compiler,srcfile,**kwargs):
+    if(not('include' in kwargs)):
+        include = ''
+    else:
+        include = kwargs['include']
+        if(isinstance(include,list)):
+            include  = list_to_sss(include)
+    
+    if(not('flags' in kwargs)):
+        flags = ''
+    else:
+        flags = kwargs['flags']
+        if(isinstance(flags,list)):
+            flags = list_to_sss(flags)
+
+    if(not('objext' in kwargs)):
+        objext = '.o'
+    else:
+        objext = kwargs['objext']
+
+    if(not('srcfileflag' in kwargs)):
+        srcfileflag = '-c'
+    else:
+        srcfileflag = kwargs['srcfileflag']
+
+    if(not('outfileflag' in kwargs)):
+        outfileflag = '-o'
+    else:
+        outfileflag = kwargs['outfileflag']
+
+    if(not('logfile' in kwargs)):
+        logfile = ""
+    else:
+        logfile = kwargs['logfile']
+
+    if(not('smartcompile' in kwargs)):
+        _smartcompile = True
+    else:
+        _smartcompile = kwargs['smartcompile']
+    
+    #Do I want to make this thing this general?
+
+    if(not(_smartcompile) or smartcompile(srcfile,objext)):
+        outfile = replaceext(srcfile,objext)
+        ln = compiler+" "+flags+" " + outfileflag+" "+outfile+" "+srcfileflag+" "+srcfile
+        ln = ln + " " + include
+
+        callproc(ln,echo=True,logfile=logfile)
+            
+    return
+
+def gs_compile_list(compiler,srclist,**kwargs):
+    for S in srclist:
+        gs_compile(compiler,S,**kwargs)
+    return
+
+def gs_compile_all(compiler,srcdir,srcexts,**kwargs):
+    if(not('recurse' in kwargs)):
+        recurse = True
+    else:
+        recurse = kwargs['recurse']
+
+    srcfils = flist(srcdir,exts=srcexts,recurse=recurse)
+    
+    for S in srcfils:
+        gs_compile(compiler,S,**kwargs)
+
+    return
+
+def gs_link_all(linker,srcpath,target,**kwargs):
+
+    if(not('objext' in kwargs)):
+        objext = '.o'
+    else:
+        objext = kwargs['objext']
+
+    if(not('recurse' in kwargs)):
+        recurse = True
+    else:
+        recurse = kwargs['recurse']
+
+
+    objfils = flist(srcpath,exts=objext,recurse=recurse)
+    oflst = list_to_sss(objfils)
+    
+    gs_link_list(linker,oflst,target,**kwargs)
+    
+    return
+
+def gs_link_list(linker,objlist,target,**kwargs):
+
+    if(not('objext' in kwargs)):
+        objext = '.o'
+    else:
+        objext = kwargs['objext']
+
+    if(not('libdir' in kwargs)):
+        libdir = ''
+    else:
+        libdir = kwargs['libdir']
+
+    if(not('staticlibs' in kwargs)):
+        staticlibs = ''
+    else:
+        staticlibs = kwargs['staticlibs']
+
+    if(not('libflags' in kwargs)):
+        libflags = ''
+    else:
+        libflags = kwargs['libflags']
+
+    if(not('linkerflags' in kwargs)):
+        linkerflags = ''
+    else:
+        linkerflags = kwargs['linkerflags']
+
+    if(not('recurse' in kwargs)):
+        recurse = True
+    else:
+        recurse = kwargs['recurse']
+
+    if(not('logfile' in kwargs)):
+        logfile = ''
+    else:
+        logfile = kwargs['logfile']
+
+    ln = linker+" -o "+target+" "+libdir
+    ln = ln+" "+objlist+" "+staticlibs+" "+libflags+" "+linkerflags
+    
+    callproc(ln,logfile=logfile)
+    return
+
+def msvc_link_list(objlist,target,**kwargs):
+
+    linker = 'link'
+
+    if(not('objext' in kwargs)):
+        objext = '.obj'
+    else:
+        objext = kwargs['objext']
+
+    if(not('libdir' in kwargs)):
+        libdir = ''
+    else:
+        libdir = kwargs['libdir']
+
+    if(not('staticlibs' in kwargs)):
+        staticlibs = ''
+    else:
+        staticlibs = kwargs['staticlibs']
+
+    if(not('libflags' in kwargs)):
+        libflags = ''
+    else:
+        libflags = kwargs['libflags']
+
+    if(not('linkerflags' in kwargs)):
+        linkerflags = ''
+    else:
+        linkerflags = kwargs['linkerflags']
+
+    if(not('recurse' in kwargs)):
+        recurse = True
+    else:
+        recurse = kwargs['recurse']
+
+    if(not('logfile' in kwargs)):
+        logfile = ''
+    else:
+        logfile = kwargs['logfile']
+
+    ln = linker+" "+libdir
+    ln = ln+" "+objlist+" "+staticlibs+" "+linkerflags
+    ln = ln+" /out:"+target+" "+libflags
+    
+    callproc(ln,logfile=logfile)
+
+    return
+
+def ar_all(srcpath,arname,**kwargs):
+    if(not('recurse' in kwargs)):
+        recurse = True
+    else:
+        recurse = kwargs['recurse']
+    if(not('objext' in kwargs)):
+        objext = '.o'
+    else:
+        objext = kwargs['objext']
+    
+    objlist = flist(srcpath,exts=objext,recurse=recurse)
+    ar_list(objlist,arname,**kwargs)
+    
+    return
+
+def msvc_lib_list(objlist,arname,**kwargs):
+    objlist2 = list_to_sss(objlist)
+    
+    ln = "lib "+objlist2+" /out:"+arname
+    callproc(ln)
+
+    return
+
+def ar_list(objlist,arname,**kwargs):
+    objlist2 = list_to_sss(objlist)
+
+    ln = "ar cr "+ arname+" "+objlist2
+    callproc(ln)
+    
+    return
+
+def ar_add_list(objlist,arname,**kwargs):
+    objlist2 = list_to_sss(objlist)
+
+    ln = "ar t "+arname+" "+objlist2
+    callproc(ln)
+    return
+
+##############################
+##Derived Compiler Functions##
+##############################
+
+def gcc_compile(srcfile,**kwargs):
+    compiler = 'gcc'
+    kwargs['objext'] = '.o'
+    #srcexts = ['.c']
+    
+    gs_compile(compiler,srcfile,**kwargs)
+    
+    return
+
+def gcc_compile_all(srcdir,**kwargs):
+    compiler = 'gcc'
+    kwargs['objext'] = '.o'
+    srcexts = ['.c']
+
+    gs_compile_all(compiler,srcdir,srcexts,**kwargs)
+    
+    return
+
+def gcc_compile_list(srclist,**kwargs):
+    compiler = 'gcc'
+    kwargs['objext'] = '.o'
+    #srcexts = ['.c']
+
+    gs_compile_list(compiler,srclist,**kwargs)
+
+    return
+
+def gpp_compile(srcfile,**kwargs):
+    compiler = 'g++'
+    kwargs['objext'] = '.o'
+    #srcexts = ['.c','.cpp']
+    
+    gs_compile(compiler,srcfile,**kwargs)
+    
+    return
+
+def gpp_compile_all(srcdir,**kwargs):
+    compiler = 'g++'
+    kwargs['objext'] = '.o'
+    srcexts = ['.c','.cpp']
+
+    gs_compile_all(compiler,srcdir,srcexts,**kwargs)
+
+    return
+
+def gpp_compile_list(srclist,**kwargs):
+    compiler = 'g++'
+    kwargs['objext'] = '.o'
+    #srcexts = ['.c','.cpp']
+
+    gs_compile_list(compiler,srclist,**kwargs)
+
+    return
+
+def gfortran_compile(srcfile,**kwargs):
+    compiler = 'gfortran'
+    kwargs['objext'] = '.o'
+    #srcexts = ['.f','.f90','.f77']
+
+    gs_compile(compiler,srcfile,**kwargs)
+    
+    return
+
+def gfortran_compile_all(srcdir,**kwargs):
+    compiler = 'gfortran'
+    kwargs['objext'] = '.o'
+    srcexts = ['.f','.f90','.f77']
+
+    gs_compile_all(compiler,srcdir,srcexts,**kwargs)
+
+    return
+
+def gfortran_compile_list(srclist,**kwargs):
+    compiler = 'gfortran'
+    kwargs['objext'] = '.o'
+    #srcexts = ['.f','.f90','.f77']
+
+    gs_compile_list(compiler,srclist,**kwargs)
+
+    return
+
+def clang_compile(srcfile,**kwargs):
+    compiler = 'clang++'
+    kwargs['objext'] = '.o'
+    #srcexts = ['.c','.cpp']
+    
+    gs_compile(compiler,srcfile,**kwargs)
+    
+    return
+
+def clang_compile_all(srcdir,**kwargs):
+    compiler = 'clang++'
+    kwargs['objext'] = '.o'
+    srcexts = ['.c','.cpp']
+
+    gs_compile_all(compiler,srcdir,srcexts,**kwargs)
+    
+    return
+
+def clang_compile_list(srclist,**kwargs):
+    compiler = 'clang++'
+    kwargs['objext'] = '.o'
+    #srcexts = ['.c','.cpp']
+
+    gs_compile_list(compiler,srclist,**kwargs)
+
+    return
--- a/compscripts/old/linux64.makelib.py
+++ b/compscripts/old/linux64.makelib.py
@ -0,0 +1,45 @@
+#!/usr/bin/python3
+
+import os,sys,subprocess,math
+from complib2 import *
+
+import shutil
+#from distutils.dir_util import copy_tree as copy_tree #this version does overwrites
+from shutil import copytree as copytree
+
+libname = 'amsculib2.linux64' #prefix static library name to generate
+targetname = 'test' #create this executable when compiling tests
+commonincdir = "../../linux64/include"
+commonlibdir = "../../linux64/lib"
+localbindir = "./bin_linux64"
+cc = 'nvcc' #compiler
+srcexts = ['.c','.cpp','.cu']
+mainsrc = ['main.c','main.cpp','main.cu'] #ignore these files when compiling the static library
+
+kwargs = dict()
+include = "-I./include -I{}".format(commonincdir)
+kwargs['include'] = include
+#-dc flag: relocatable device code - needed for device functions to link in different "execution units"
+kwargs['flags'] = "-dc"
+kwargs['libdir'] = "-L{} -L{}".format(localbindir,commonlibdir)
+kwargs['libflags'] = "-l{}".format(libname)
+kwargs['linkerflags'] = ""
+kwargs['recurse'] = True
+
+#find all source files, except the main project files
+files = flist('./src',exts = srcexts, recurse=True)
+files = except_contains(files,mainsrc)
+objfiles = replaceexts(files,'.o')
+objfiles_sss = list_to_sss(objfiles)
+
+#compile all the source files in the list
+gs_compile_list(cc,files,**kwargs)
+
+#archive all the source files into a static library
+ar_list(objfiles,'{}/lib{}.a'.format(localbindir,libname))
+
+#Push any libraries to the common lib folder
+shutil.copy('{}/lib{}.a'.format(localbindir,libname),commonlibdir)
+
+#Copy include files to the common include folder
+copytree('./include/',commonincdir+'/',dirs_exist_ok=True)
--- a/compscripts/old/linux64.maketest.py
+++ b/compscripts/old/linux64.maketest.py
@ -0,0 +1,38 @@
+#!/usr/bin/python3
+
+import os,sys,subprocess,math
+from complib2 import *
+
+import shutil
+
+libname = 'amsculib2.linux64' #prefix static library name to generate
+targetname = 'test' #create this executable when compiling tests
+commonincdir = "../../linux64/include"
+commonlibdir = "../../linux64/lib"
+localbindir = "./bin_linux64"
+cc = 'nvcc' #compiler
+srcexts = ['.c','.cpp','.cu']
+mainsrc = ['main.c','main.cpp','main.cu'] #ignore these files when compiling the static library
+
+kwargs = dict()
+include = "-I./include -I{}".format(commonincdir)
+kwargs['include'] = include
+#-dc flag: relocatable device code - needed for device functions to link in different "execution units"
+kwargs['flags'] = "-dc"
+kwargs['libdir'] = "-L{} -L{}".format(localbindir,commonlibdir)
+kwargs['libflags'] = "-l{}".format(libname)
+kwargs['linkerflags'] = ""
+kwargs['recurse'] = True
+
+#Pull required binary dynamic libraries to the bin folder
+#shutil.copy('{}/libamsimg.dll.a'.format(commonlibdir),localbindir);
+#shutil.copy('{}/libamsimg.dll'.format(commonlibdir),localbindir);
+#shutil.copy('../../lib_winx64/glew32.dll','./bin_winx64');
+
+#Designate source files for main test program
+fsrc = ['./src/main.cu']
+fobj = replaceexts(fsrc,'.o')
+
+#Compile test programs
+gs_compile_list(cc,fsrc,**kwargs)
+gs_link_list(cc,list_to_sss(fobj),'{}/{}'.format(localbindir,targetname),**kwargs)
--- a/compscripts/old/msvc.makelib.py
+++ b/compscripts/old/msvc.makelib.py
@ -0,0 +1,45 @@
+#!/usr/bin/python3
+
+import os,sys,subprocess,math
+from complib2 import *
+
+import shutil
+from shutil import copytree as copytree
+
+libname = 'assetcuda.msvc64' #prefix static library name to generate
+targetname = 'main' #create this executable when compiling tests
+commonincdir = "../../winx64/include"
+commonlibdir = "../../winx64/lib"
+localbindir = "./bin_winx64"
+cc = 'nvcc' #compiler
+srcexts = ['.c','.cpp']
+mainsrc = ['main.c','main.cpp','main.cu'] #ignore these files when compiling the static library
+
+kwargs = dict()
+include = "-I./include -I{}".format(commonincdir)
+kwargs['include'] = include
+kwargs['flags'] = "/O2"
+kwargs['libdir'] = "/LIBPATH:{} /LIBPATH:{}".format(localbindir,commonlibdir)
+kwargs['libflags'] = "-l{}".format(libname)
+kwargs['linkerflags'] = ""
+kwargs['recurse'] = True
+
+#find all source files, except the main project files
+files = flist('./src',exts = srcexts, recurse=True)
+files = except_contains(files,mainsrc)
+objfiles = replaceexts(files,'.obj')
+objfiles_sss = list_to_sss(objfiles)
+
+#compile all the source files in the list
+msvc_compile_list(cc,files,**kwargs)
+#gs_compile_list(cc,files,**kwargs)
+
+#archive all the source files into a static library
+#ar_list(objfiles,'{}/lib{}.a'.format(localbindir,libname))
+msvc_lib_list(objfiles,'{}/lib{}.lib'.format(localbindir,libname))
+
+#Push any libraries to the common lib folder
+shutil.copy('{}/lib{}.lib'.format(localbindir,libname),commonlibdir)
+
+#Copy include files to the common include folder
+copytree('./include/',commonincdir+'/',dirs_exist_ok=True)
--- a/compscripts/old/msvc.maketest.py
+++ b/compscripts/old/msvc.maketest.py
@ -0,0 +1,39 @@
+#!/usr/bin/python3
+
+import os,sys,subprocess,math
+from complib2 import *
+
+import shutil
+from distutils.dir_util import copy_tree as copy_tree #this version does overwrites
+
+libname = 'assetcuda.msvc64' #prefix static library name to generate
+targetname = 'tests.exe' #create this executable when compiling tests
+commonincdir = "../../winx64/include"
+commonlibdir = "../../winx64/lib"
+localbindir = "./bin_winx64"
+cc = 'nvcc' #compiler
+srcexts = ['.c','.cpp']
+mainsrc = ['main.c','main.cpp','main.cu'] #ignore these files when compiling the static library
+
+kwargs = dict()
+include = "-I./include -I{}".format(commonincdir)
+kwargs['include'] = include
+kwargs['flags'] = "/O2"
+kwargs['libdir'] = "/LIBPATH:{} /LIBPATH:{}".format(localbindir,commonlibdir)
+#kwargs['libflags'] = "lib{}.lib libamsearthtools.msvc64.lib libamsmeshtools.msvc64.lib libamsmathlib3.msvc64.lib libamsmatrix_cpp.msvc64.lib liblapack.a libblas.a libamsstring3.msvc64.lib libamsmathutil2.msvc64.lib".format(libname)
+kwargs['libflags'] = "lib{}.lib".format(libname)
+kwargs['linkerflags'] = ""
+kwargs['recurse'] = True
+
+#Pull required binary dynamic libraries to the bin folder
+#shutil.copy('{}/libamsimg.dll.a'.format(commonlibdir),localbindir);
+#shutil.copy('{}/libamsimg.dll'.format(commonlibdir),localbindir);
+#shutil.copy('../../lib_winx64/glew32.dll','./bin_winx64');
+
+#Designate source files for main test program
+fsrc = ['./src/main.cu']
+fobj = replaceexts(fsrc,'.obj')
+
+#Compile test programs
+msvc_compile_list(cc,fsrc,**kwargs)
+msvc_link_list(list_to_sss(fobj),'{}/{}'.format(localbindir,targetname),**kwargs)
--- a/compscripts/old/winnvcc.makelib.py
+++ b/compscripts/old/winnvcc.makelib.py
@ -0,0 +1,44 @@
+#!/usr/bin/python3
+
+import os,sys,subprocess,math
+from complib2 import *
+
+import shutil
+from distutils.dir_util import copy_tree as copy_tree #this version does overwrites
+
+libname = 'amsculib2.msvc64' #prefix static library name to generate
+targetname = 'test' #create this executable when compiling tests
+commonincdir = "../../winx64/include"
+commonlibdir = "../../winx64/lib"
+localbindir = "./bin_winx64"
+cc = 'nvcc' #compiler
+srcexts = ['.c','.cpp','.cu']
+mainsrc = ['main.c','main.cpp'] #ignore these files when compiling the static library
+
+kwargs = dict()
+include = "-I./include -I{}".format(commonincdir)
+kwargs['include'] = include
+kwargs['flags'] = "-dc"
+kwargs['libdir'] = "-L{} -L{}".format(localbindir,commonlibdir)
+kwargs['libflags'] = "-l{}".format(libname)
+kwargs['linkerflags'] = ""
+kwargs['recurse'] = True
+
+#find all source files, except the main project files
+files = flist('./src',exts = srcexts, recurse=True)
+files = except_contains(files,mainsrc)
+objfiles = replaceexts(files,'.o')
+objfiles_sss = list_to_sss(objfiles)
+
+#compile all the source files in the list
+gs_compile_list(cc,files,**kwargs)
+
+#archive all the source files into a static library
+#ar_list(objfiles,'{}/lib{}.a'.format(localbindir,libname))
+msvc_lib_list(objfiles,'{}/lib{}.lib'.format(localbindir,libname))
+
+#Push any libraries to the common lib folder
+shutil.copy('{}/lib{}.lib'.format(localbindir,libname),commonlibdir)
+
+#Copy include files to the common include folder
+copy_tree('./include/',commonincdir+'/')
--- a/compscripts/old/winnvcc.maketest.py
+++ b/compscripts/old/winnvcc.maketest.py
@ -0,0 +1,38 @@
+#!/usr/bin/python3
+
+import os,sys,subprocess,math
+from complib2 import *
+
+import shutil
+from distutils.dir_util import copy_tree as copy_tree #this version does overwrites
+
+libname = 'amsculib2.msvc64' #prefix static library name to generate
+targetname = 'test' #create this executable when compiling tests
+commonincdir = "../../winx64/include"
+commonlibdir = "../../winx64/lib"
+localbindir = "./bin_winx64"
+cc = 'nvcc' #compiler
+srcexts = ['.c','.cpp','.cu']
+mainsrc = ['main.c','main.cpp'] #ignore these files when compiling the static library
+
+kwargs = dict()
+include = "-I./include -I{}".format(commonincdir)
+kwargs['include'] = include
+kwargs['flags'] = "-dc"
+kwargs['libdir'] = "-L{} -L{}".format(localbindir,commonlibdir)
+kwargs['libflags'] = "-llib{}".format(libname)
+kwargs['linkerflags'] = ""
+kwargs['recurse'] = True
+
+#Pull required binary dynamic libraries to the bin folder
+#shutil.copy('{}/libamsimg.dll.a'.format(commonlibdir),localbindir);
+#shutil.copy('{}/libamsimg.dll'.format(commonlibdir),localbindir);
+#shutil.copy('../../lib_winx64/glew32.dll','./bin_winx64');
+
+#Designate source files for main test program
+fsrc = ['./src/main.cpp']
+fobj = replaceexts(fsrc,'.o')
+
+#Compile test programs
+gs_compile_list(cc,fsrc,**kwargs)
+gs_link_list(cc,list_to_sss(fobj),'{}/{}'.format(localbindir,targetname),**kwargs)
--- a/compscripts/winnvcc.makelib.py
+++ b/compscripts/winnvcc.makelib.py
@ -0,0 +1,49 @@
+#!/usr/bin/python3
+
+import os,sys,subprocess,math
+from complib2 import *
+from complib3 import gs_incremental_compile, gs_incremental_compile_list
+
+import shutil
+from shutil import copytree
+
+libname = 'amsculib2.msvc64' #prefix static library name to generate
+targetname = 'test' #create this executable when compiling tests
+commonincdir = "../../winx64/include"
+commonlibdir = "../../winx64/lib"
+localbindir = "./bin_winx64"
+cc = 'nvcc' #compiler
+srcexts = ['.c','.cpp','.cu']
+mainsrc = ['main.cu'] #ignore these files when compiling the static library
+
+kwargs = dict()
+include = "-I./include -I{}".format(commonincdir)
+kwargs['include'] = include
+kwargs['flags'] = "-dc"
+kwargs['libdir'] = "-L{} -L{}".format(localbindir,commonlibdir)
+kwargs['libflags'] = "-l{}".format(libname)
+kwargs['linkerflags'] = ""
+kwargs['recurse'] = True
+kwargs['objstore'] = "./objstore"
+kwargs['searchincdirs'] = ['./include']
+
+#find all source files, except the main project files
+files = flist('./src',exts = srcexts, recurse=True)
+files = except_contains(files,mainsrc)
+objfiles = replaceexts(files,'.o')
+objfiles_sss = list_to_sss(objfiles)
+
+#compile all the source files in the list
+#gs_compile_list(cc,files,**kwargs)
+gs_incremental_compile_list(cc,files,**kwargs)
+
+#archive all the source files into a static library
+#ar_list(objfiles,'{}/lib{}.a'.format(localbindir,libname))
+objlist = flist(kwargs['objstore'],exts='.o',recurse=True)
+msvc_lib_list(objlist,'{}/lib{}.lib'.format(localbindir,libname))
+
+# #Push any libraries to the common lib folder
+shutil.copy('{}/lib{}.lib'.format(localbindir,libname),commonlibdir)
+
+# #Copy include files to the common include folder
+copytree('./include/',commonincdir+'/',dirs_exist_ok=True)
--- a/compscripts/winnvcc.maketest.py
+++ b/compscripts/winnvcc.maketest.py
@ -0,0 +1,43 @@
+#!/usr/bin/python3
+
+import os,sys,subprocess,math
+from complib2 import *
+from complib3 import gs_incremental_compile, gs_incremental_compile_list
+
+import shutil
+from shutil import copytree
+
+libname = 'amsculib2.msvc64' #prefix static library name to generate
+targetname = 'test' #create this executable when compiling tests
+commonincdir = "../../winx64/include"
+commonlibdir = "../../winx64/lib"
+localbindir = "./bin_winx64"
+cc = 'nvcc' #compiler
+srcexts = ['.c','.cpp','.cu']
+mainsrc = ['main.cu'] #ignore these files when compiling the static library
+
+kwargs = dict()
+include = "-I./include -I{}".format(commonincdir)
+kwargs['include'] = include
+kwargs['flags'] = "-dc"
+kwargs['libdir'] = "-L{} -L{}".format(localbindir,commonlibdir)
+kwargs['libflags'] = "-llib{} -llibamsculib2.msvc64".format(libname)
+kwargs['linkerflags'] = ""
+kwargs['recurse'] = True
+kwargs['objstore'] = "./objstore"
+kwargs['searchincdirs'] = ['./include']
+
+#-lamsmathlib3.linux64 -lamsstring3.linux64 -lamsmatrix_cpp.linux64 -llapack -lblas -lgfortran -lamsmathutilthread.linux64 -lamsmathutil2.linux64
+
+#Pull required binary dynamic libraries to the bin folder
+#shutil.copy('{}/libamsimg.dll.a'.format(commonlibdir),localbindir);
+#shutil.copy('{}/libamsimg.dll'.format(commonlibdir),localbindir);
+#shutil.copy('../../lib_winx64/glew32.dll','./bin_winx64');
+
+#Designate source files for main test program
+fsrc = ['./src/main.cu']
+fobj = replaceexts(fsrc,'.o')
+
+#Compile test programs
+gs_compile_list(cc,fsrc,**kwargs)
+gs_link_list(cc,list_to_sss(fobj),'{}/{}'.format(localbindir,targetname),**kwargs)
--- a/include/amsculib2/amscu_comp128.hpp
+++ b/include/amsculib2/amscu_comp128.hpp
@ -0,0 +1,89 @@
+#ifndef __AMSCU_COMP128_HPP__
+#define __AMSCU_COMP128_HPP__
+
+namespace amscuda
+{
+namespace cmp
+{
+
+    class cucomp128
+    {
+        public:
+        double real;
+        double imag;
+
+        __host__ __device__ cucomp128();
+        __host__ __device__ ~cucomp128();
+        __host__ __device__ cucomp128(const cucomp128 &other);
+        __host__ __device__ cucomp128(const double &other);
+
+        __host__ __device__ cucomp128& operator=(cucomp128& other);
+        __host__ __device__ const cucomp128& operator=(const cucomp128& other);
+        __host__ __device__ cucomp128& operator=(double& other);
+        __host__ __device__ const cucomp128& operator=(const double& other);
+        
+        __host__ __device__ double& operator[](int& ind);
+        __host__ __device__ const double& operator[](const int& ind) const;
+
+        __host__ __device__ cucomp128 operator+(const cucomp128& z);
+        __host__ __device__ cucomp128 operator-(const cucomp128& z);
+        __host__ __device__ cucomp128 operator*(const cucomp128& z);
+        __host__ __device__ cucomp128 operator/(const cucomp128& z);
+
+        __host__ __device__ cucomp128 operator+(const double& z);
+        __host__ __device__ cucomp128 operator-(const double& z);
+        __host__ __device__ cucomp128 operator*(const double& z);
+        __host__ __device__ cucomp128 operator/(const double& z);
+
+        __host__ __device__ friend cucomp128 operator-(const cucomp128& z); //negation sign
+
+        //comparison operators
+        __host__ __device__ bool operator==(const cucomp128& z) const;
+        __host__ __device__ bool operator!=(const cucomp128& z) const;
+        __host__ __device__ bool operator>(const cucomp128& z) const;
+        __host__ __device__ bool operator<(const cucomp128& z) const;
+        __host__ __device__ bool operator>=(const cucomp128& z) const;
+        __host__ __device__ bool operator<=(const cucomp128& z) const;
+
+        __host__ __device__ bool isnan() const;
+        __host__ __device__ bool isinf() const;
+
+        __host__ __device__ bool isreal() const;
+        __host__ __device__ bool isimag() const;
+        __host__ __device__ bool iszero() const;
+        __host__ __device__ double arg() const;
+        __host__ __device__ double mag() const;
+        __host__ __device__ cucomp128 conj() const;
+    };
+
+    __host__ __device__ double arg(cucomp128 z);
+
+    __host__ __device__ cucomp128 dtocomp(double _r, double _i);
+    __host__ __device__ double real(cucomp128 z);
+    __host__ __device__ double imag(cucomp128 z);
+    __host__ __device__ cucomp128 sin(cucomp128 z);
+    __host__ __device__ cucomp128 cos(cucomp128 z);
+    __host__ __device__ cucomp128 tan(cucomp128 z);
+    __host__ __device__ cucomp128 exp(cucomp128 z);
+    __host__ __device__ cucomp128 log(cucomp128 z);
+    __host__ __device__ double abs(cucomp128 z);
+    __host__ __device__ cucomp128 conj(cucomp128 z);
+
+    // //need hyperbolic trig Functions
+    __host__ __device__ cucomp128 cosh(cucomp128 z);
+    __host__ __device__ cucomp128 sinh(cucomp128 z);
+    __host__ __device__ cucomp128 tanh(cucomp128 z);
+
+    __host__ __device__ cucomp128 pow(cucomp128 z1, cucomp128 z2);
+
+    // //returns "complex sign" of complex number - 0, or a unit number with same argument
+    __host__ __device__ cucomp128 csgn(cucomp128 z);
+
+void test_cucomp128_1();
+
+
+}; //end namespace cmp
+}; //end namespace amscuda
+
+#endif
+
--- a/include/amsculib2/amscu_comp64.hpp
+++ b/include/amsculib2/amscu_comp64.hpp
@ -0,0 +1,88 @@
+#ifndef __AMSCU_COMP64_HPP__
+#define __AMSCU_COMP64_HPP__
+
+namespace amscuda
+{
+namespace cmp
+{
+
+    class cucomp64
+    {
+        public:
+        float real;
+        float imag;
+
+        __host__ __device__ cucomp64();
+        __host__ __device__ ~cucomp64();
+        __host__ __device__ cucomp64(const cucomp64 &other);
+        __host__ __device__ cucomp64(const float &other);
+
+        __host__ __device__ cucomp64& operator=(cucomp64& other);
+        __host__ __device__ const cucomp64& operator=(const cucomp64& other);
+        __host__ __device__ cucomp64& operator=(float& other);
+        __host__ __device__ const cucomp64& operator=(const float& other);
+        
+        __host__ __device__ float& operator[](int& ind);
+        __host__ __device__ const float& operator[](const int& ind) const;
+
+        __host__ __device__ cucomp64 operator+(const cucomp64& z);
+        __host__ __device__ cucomp64 operator-(const cucomp64& z);
+        __host__ __device__ cucomp64 operator*(const cucomp64& z);
+        __host__ __device__ cucomp64 operator/(const cucomp64& z);
+
+        __host__ __device__ cucomp64 operator+(const float& z);
+        __host__ __device__ cucomp64 operator-(const float& z);
+        __host__ __device__ cucomp64 operator*(const float& z);
+        __host__ __device__ cucomp64 operator/(const float& z);
+
+        __host__ __device__ friend cucomp64 operator-(const cucomp64& z); //negation sign
+
+        //comparison operators
+        __host__ __device__ bool operator==(const cucomp64& z) const;
+        __host__ __device__ bool operator!=(const cucomp64& z) const;
+        __host__ __device__ bool operator>(const cucomp64& z) const;
+        __host__ __device__ bool operator<(const cucomp64& z) const;
+        __host__ __device__ bool operator>=(const cucomp64& z) const;
+        __host__ __device__ bool operator<=(const cucomp64& z) const;
+
+        __host__ __device__ bool isnan() const;
+        __host__ __device__ bool isinf() const;
+
+        __host__ __device__ bool isreal() const;
+        __host__ __device__ bool isimag() const;
+        __host__ __device__ bool iszero() const;
+        __host__ __device__ float arg() const;
+        __host__ __device__ float mag() const;
+        __host__ __device__ cucomp64 conj() const;
+    };
+
+    __host__ __device__ float arg(cucomp64 z);
+
+    __host__ __device__ cucomp64 dtocomp64(float _r, float _i);
+    __host__ __device__ float real(cucomp64 z);
+    __host__ __device__ float imag(cucomp64 z);
+    __host__ __device__ cucomp64 sin(cucomp64 z);
+    __host__ __device__ cucomp64 cos(cucomp64 z);
+    __host__ __device__ cucomp64 tan(cucomp64 z);
+    __host__ __device__ cucomp64 exp(cucomp64 z);
+    __host__ __device__ cucomp64 log(cucomp64 z);
+    __host__ __device__ float abs(cucomp64 z);
+    __host__ __device__ cucomp64 conj(cucomp64 z);
+
+    // //need hyperbolic trig Functions
+    __host__ __device__ cucomp64 cosh(cucomp64 z);
+    __host__ __device__ cucomp64 sinh(cucomp64 z);
+    __host__ __device__ cucomp64 tanh(cucomp64 z);
+
+    __host__ __device__ cucomp64 pow(cucomp64 z1, cucomp64 z2);
+
+    // //returns "complex sign" of complex number - 0, or a unit number with same argument
+    __host__ __device__ cucomp64 csgn(cucomp64 z);
+
+void test_cucomp64_1();
+
+}; //end namespace cmp
+}; //end namespace amscuda
+
+#endif
+
--- a/include/amsculib2/amscu_cudafunctions.hpp
+++ b/include/amsculib2/amscu_cudafunctions.hpp
@ -0,0 +1,40 @@
+#ifndef __AMSCU_CUDAFUNCTIONS_HPP__
+#define __AMSCU_CUDAFUNCTIONS_HPP__
+
+
+namespace amscuda
+{
+    // device memory operations
+    // I'm trying to avoid some of the boilerplate mental overhead involved
+    // in calling cuda functions and handling errors
+
+    //frees devbuffer if it is not already NULL, and sets devbuffer to NULL
+    //wrapper to cudaFree
+    template<typename T> int cuda_free(T **devptr);
+
+    //copies hostbuffer to devbuffer
+    //initializes devbuffer from NULL if devbuffer is NULL
+    //if overwrite is true, deletes and reallocates devbuffer on device (for resizing)
+    template<typename T> int buffer_copytodevice(T *hostbuffer, T **devbuffer, long N, bool overwrite);
+
+    //copies info from devbuffer to hostbuffer
+    //initialzies hostbuffer from NULL if NULL
+    //if overwrite is true, deletes and reallocates hostbuffer on host with new[] (for resizing)
+    template<typename T> int buffer_copyfromdevice(T *devbuffer, T **hostbuffer, long N, bool overwrite);
+
+    //wrapper for cudaMemcpy - copies an item or struct (count 1) to the device
+    //initializes devptr from NULL if not already initialized
+    template<typename T> int cuda_copytodevice(T *hostptr, T **devptr);
+
+    //wrapper for cudaMemcpy - copies an item or struct (count 1) from device
+    //initializes hostptr from NULL with new if not already initialized
+    template<typename T> int cuda_copyfromdevice(T *devptr, T **hostptr);
+    
+    int cuda_errortrap(const char *msgheader);
+
+};
+
+#include <amsculib2/amscu_cudafunctions_impl.hpp>
+
+#endif
+
--- a/include/amsculib2/amscu_cudafunctions_impl.hpp
+++ b/include/amsculib2/amscu_cudafunctions_impl.hpp
@ -0,0 +1,228 @@
+#ifndef __AMSCU_CUDAFUNCTIONS_IMPL_HPP__
+#define __AMSCU_CUDAFUNCTIONS_IMPL_HPP__
+
+namespace amscuda
+{
+
+//frees devbuffer if it is not already NULL, and sets devbuffer to NULL
+//wrapper to cudaFree
+template<typename T> int cuda_free(T **devptr)
+{
+    int ret = 0;
+    cudaError_t err = cudaSuccess;
+
+    if(*devptr==NULL)
+    {
+        return ret; //devbuffer is already NULL/freed
+    }
+
+    err = cudaFree(*devptr);
+    if(err!=cudaSuccess)
+    {
+        ret = -1; //failed to free device pointer
+        *devptr = NULL; // - ? should only happen if I'm trying to double-free something
+    }
+    else
+    {
+        ret = 1;
+        *devptr = NULL;
+    }
+
+    return ret;
+}
+
+//copies hostbuffer to devbuffer
+//initializes devbuffer from NULL if devbuffer is NULL
+//if overwrite is true, deletes and reallocates devbuffer on device (for resizing)
+template<typename T> int buffer_copytodevice(T *hostbuffer, T **devbuffer, long N, bool overwrite)
+{
+    int ret = 0;
+    cudaError_t err = cudaSuccess;
+
+    if(N<=0)
+    {
+        ret = 0;
+        return ret;
+    }
+
+    if(hostbuffer==NULL)
+    {
+        ret = -2; //host buffer is NULL
+        return ret;
+    }
+
+    if(overwrite==1)
+    {
+        if(*devbuffer !=NULL)
+        {
+            cuda_free(devbuffer);
+        }
+    }
+
+    if(*devbuffer==NULL)
+    {
+        err = cudaMalloc(devbuffer,sizeof(T)*N);
+        if(err!=cudaSuccess)
+        {
+            ret = -3; //failed to allocate
+            *devbuffer = NULL;
+            return ret;
+        }
+    }
+
+    err = cudaMemcpy(*devbuffer,hostbuffer,sizeof(T)*N,cudaMemcpyHostToDevice);
+    if(err!=cudaSuccess)
+    {
+        ret = -4; //failed to copy
+    }
+    else
+    {
+        ret = 1;
+    }
+
+
+    return ret;
+}
+
+//copies info from devbuffer to hostbuffer
+//initialzies hostbuffer from NULL if NULL
+//if overwrite is true, deletes and reallocates hostbuffer on host (for resizing)
+template<typename T> int buffer_copyfromdevice(T *devbuffer, T **hostbuffer, long N, bool overwrite)
+{
+    int ret = 0;
+    cudaError_t err = cudaSuccess;
+
+    if(N<=0)
+    {
+        ret = 0;
+        return ret;
+    }
+
+    if(devbuffer==NULL)
+    {
+        ret = -5; //null dev buffer
+        return ret;
+    }
+
+    if(overwrite==1 && *hostbuffer!=NULL)
+    {
+        delete[] (*hostbuffer); hostbuffer = NULL;
+    }
+
+    if(*hostbuffer==NULL)
+    {
+        *hostbuffer = new(std::nothrow) T[N];
+        if(*hostbuffer==NULL)
+        {
+            ret = -6; //failed to allocate host buffer
+            return ret;
+        }
+    }
+
+    err = cudaMemcpy(*hostbuffer, devbuffer, sizeof(T)*N, cudaMemcpyDeviceToHost);
+    if(err!=cudaSuccess)
+    {
+        ret = -7; //failed to copy
+    }
+    else
+    {
+        ret = 1;
+    }
+    
+    return ret;
+}
+
+//wrapper for cudaMemcpy - copies an item or struct (count 1) to the device
+//initializes devptr from NULL if not already initialized
+template<typename T> int cuda_copytodevice(T *hostptr, T **devptr)
+{
+    int ret = 0;
+    cudaError_t err = cudaSuccess;
+    bool overwrite = 1;
+
+    if(hostptr==NULL)
+    {
+        ret = -2; //host buffer is NULL
+        return ret;
+    }
+
+    if(overwrite==1)
+    {
+        if(*devptr !=NULL)
+        {
+            cuda_free(devptr);
+        }
+    }
+
+    if(*devptr==NULL)
+    {
+        err = cudaMalloc(devptr,sizeof(T));
+        if(err!=cudaSuccess)
+        {
+            ret = -3; //failed to allocate
+            *devptr = NULL;
+            return ret;
+        }
+    }
+
+    err = cudaMemcpy(*devptr,hostptr,sizeof(T),cudaMemcpyHostToDevice);
+    if(err!=cudaSuccess)
+    {
+        ret = -4; //failed to copy
+    }
+    else
+    {
+        ret = 1;
+    }
+
+
+    return ret;
+}
+
+//wrapper for cudaMemcpy - copies an item or struct (count 1) from device
+//initializes hostptr from NULL with new if not already initialized
+template<typename T> int cuda_copyfromdevice(T *devptr, T **hostptr)
+{
+    int ret = 0;
+    cudaError_t err = cudaSuccess;
+    bool overwrite = 1;
+
+    if(devptr==NULL)
+    {
+        ret = -5; //null dev buffer
+        return ret;
+    }
+
+    if(overwrite==1 && *hostptr!=NULL)
+    {
+        delete (*hostptr); hostptr = NULL;
+    }
+
+    if(*hostptr==NULL)
+    {
+        *hostptr = new(std::nothrow) T;
+        if(*hostptr==NULL)
+        {
+            ret = -6; //failed to allocate host buffer
+            return ret;
+        }
+    }
+
+    err = cudaMemcpy(*hostptr, devptr, sizeof(T), cudaMemcpyDeviceToHost);
+    if(err!=cudaSuccess)
+    {
+        ret = -7; //failed to copy
+    }
+    else
+    {
+        ret = 1;
+    }
+    
+    return ret;
+}
+
+
+};
+
+#endif
+
--- a/include/amsculib2/amscu_random.hpp
+++ b/include/amsculib2/amscu_random.hpp
@ -0,0 +1,55 @@
+#ifndef __AMSCU_RANDOM_HPP__
+#define __AMSCU_RANDOM_HPP__
+
+namespace amscuda
+{
+
+// Random Number Gerneators
+
+
+// faster floating point hash function used in fractal generators
+__device__ __host__ float fhash1d_su(float x);
+
+__device__ __host__ float fhash3d_su(float x, float y, float z);
+
+__device__ __host__ float fhash4d_su(float x, float y, float z, float w);
+
+
+//////////////////////////////////////////////////
+// Deterministic Pseudorandom int32_t Generator //
+//////////////////////////////////////////////////
+
+//Next seed in simple 32 bit integer deterministic psuedo-rand generator
+__host__ __device__ void dpr32_nextseed(int32_t *rseed_inout);
+
+//Simple 32 bit integer deterministic pseudo-random generator
+// *not* for cryptography
+// Frequency of generated floats should be uniform [0,1)
+__host__ __device__ float dpr32_randf(int32_t *rseed_inout);
+
+//box muller standard normal pseudorandom variable
+__host__ __device__ float dpr32_randnf(int32_t *rseed_inout);
+
+//////////////////////////////////////////////////
+// Deterministic Pseudorandom int64_t Generator //
+//////////////////////////////////////////////////
+
+//operates without side-effects on explicit seed for threaded use
+//deterministic pseudorandom number generator - takes seed and returns next seed
+__host__ __device__ void dpr64_nextseed(int64_t *seedinout);
+
+//deterministic pseudorandom number generator - takes seed and returns next seed
+//returns uniformly distributed double
+__host__ __device__ double dpr64_randd(int64_t *seedinout);
+
+__host__ __device__ float dpr64_randf(int64_t *seedinout);
+
+
+void test_dprg64();
+void test_dprg32();
+
+
+}; //end namespace amscuda
+
+#endif
+
--- a/include/amsculib2/amscuarray.hpp
+++ b/include/amsculib2/amscuarray.hpp
@ -0,0 +1,47 @@
+#ifndef __CUARRAY_HPP__
+#define __CUARRAY_HPP__
+
+namespace amscuda
+{
+
+template<typename T> class cuarray
+{
+public:
+    int length;
+    T* data;
+
+    __device__ __host__ cuarray();
+    __device__ __host__ ~cuarray();
+
+    //Only call this on the device for thread/block local 
+    // dynamic arrays
+    __device__ __host__ int resize(const int _length);
+
+    __device__ __host__ int size() const;
+    __device__ __host__ T& at(const int I);
+    __device__ __host__ const T& at(const int I) const;
+
+    __device__ __host__ T& operator[](const int I);
+    __device__ __host__ const T& operator[](const int I) const;
+    
+    
+
+    __host__ int device_send(cuarray<T> **dptr);
+    __host__ int _device_send_overwrite(cuarray<T> **dptr);
+    __host__ int _device_send_copy(cuarray<T> *dptr);
+    
+    __host__ int device_pull(cuarray<T> *dptr);
+    __host__ int device_free(cuarray<T> **dptr);
+
+    __host__ int device_length(cuarray<T> *dptr);
+    __host__ T* device_data_ptr(cuarray<T> *dptr);
+    
+};
+
+void test_cuarray();
+
+};
+
+#include <amsculib2/amscuarray_impl.hpp>
+
+#endif
--- a/include/amsculib2/amscuarray_dops.hpp
+++ b/include/amsculib2/amscuarray_dops.hpp
@ -0,0 +1,76 @@
+#ifndef __AMSCUARRAY_DOPS_HPP__
+#define __AMSCUARRAY_DOPS_HPP__
+
+//Device Operations on Arrays
+//
+
+//Device Operations on Device Buffers
+// dodb
+
+namespace amscuda
+{
+
+
+    //sum
+    template<typename T> T devcuarray_sum(cuarray<T> *devptr);
+
+    template<typename T> T dbuff_sum(T *devbuffer, int N);
+
+
+    struct dbuff_statstruct
+    {
+        public:
+        float min;
+        float max;
+        float mean;
+        float stdev;
+        float sum;
+    };
+    
+    //stats (min,max,mean,stdev)
+
+    template<typename T> void dbuff_minmax(T *devbuffer, int N, T *min, T *max);
+
+    template<typename T> dbuff_statstruct dbuff_stats(T *devbuffer, int N); //
+
+    //sets all elements to setto
+    template<typename T> void dbuff_setall(T *devbuffer, int N, T setto, int nblocks, int nthreads);
+
+    //random device buffer functions
+    void dbuff_rand_dpr32(float *devbuffer, int N, int32_t *rseedinout,  int nblocks, int nthreads); //
+    void dbuff_rand_dpr32n(float *devbuffer, int N, int32_t *rseedinout,  int nblocks, int nthreads); //
+    
+
+    void dbuff_rand_dpr64(float *devbuffer, int N, int64_t *rseedinout,  int nblocks, int nthreads); //
+
+    //Elementwise device-buffer vector binary operation
+    //takes two input arrays ( , ) --> one output array
+    template<typename T1, typename T2, typename T3> void dbuff_vectorbinop(T1 *dbuf_a, T2 *dbuf_b, T3 *dbuf_out, int N, T3 (*fpnt)(T1,T2), int nblocks, int nthreads);
+
+    //Elementwise device-buffer vector two-parameter operation
+    //takes one input array, and a constant paramter ( ) ---> one output array
+    template<typename T1, typename T2, typename T3> void dbuff_vectorbinop(T1 *dbuf_a, T2 par_b, T3 *dbuf_out, int N, T3 (*fpnt)(T1,T2), int nblocks, int nthreads);
+
+
+    //vector_add
+    template<typename T> void dbuff_add(T *dbuff_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads); 
+    template<typename T> void dbuff_add(T *dbuff_a, T par_b, T *dbuff_out, int N, int nblocks, int nthreads); 
+    template<typename T> void dbuff_sub(T *dbuff_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads); 
+    template<typename T> void dbuff_sub(T *dbuff_a, T par_b, T *dbuff_out, int N, int nblocks, int nthreads); 
+    template<typename T> void dbuff_mult(T *dbuff_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads); 
+    template<typename T> void dbuff_mult(T *dbuff_a, T par_b, T *dbuff_out, int N, int nblocks, int nthreads); 
+    template<typename T> void dbuff_div(T *dbuff_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads); 
+    template<typename T> void dbuff_div(T *dbuff_a, T par_b, T *dbuff_out, int N, int nblocks, int nthreads); 
+    template<typename T> void dbuff_div(T par_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads); 
+
+
+    // Tests //
+
+    void test_dbuff_rand_dpr32();
+
+};
+
+#include <amsculib2/amscuarray_dops_impl.hpp>
+
+#endif
+
--- a/include/amsculib2/amscuarray_dops_impl.hpp
+++ b/include/amsculib2/amscuarray_dops_impl.hpp
@ -0,0 +1,404 @@
+#ifndef __AMSCUARRAY_DOPS_IMPL_HPP__
+#define __AMSCUARRAY_DOPS_IMPL_HPP__
+
+namespace amscuda
+{
+
+template<typename T> __global__ void dbuff_sum_kf(T *devbuffer, int N, T *rets)
+{
+    int I0 = threadIdx.x + blockIdx.x*blockDim.x;
+    int Is = blockDim.x*gridDim.x;
+    int I;
+
+    T ret = (T) 0;
+    for(I=I0;I<N;I=I+Is)
+    {
+        ret = ret + devbuffer[I];
+    }
+    rets[I0] = ret;
+}
+
+template<typename T> T devcuarray_sum(cuarray<T> *devptr)
+{
+    T ret = T();
+    cudaError_t err = cudaSuccess;
+
+    cuarray<T> ldptr;
+    
+    cudaMemcpy(&ldptr,devptr,sizeof(cuarray<T>),cudaMemcpyDeviceToHost);
+    
+    ret = devbuffer_sum(ldptr.data,ldptr.length);
+
+    ldptr.data = NULL;
+    ldptr.length=0;
+
+    return ret;
+}
+
+template<typename T> T dbuff_sum(T *dbuff, int N)
+{
+    int I;
+    T ret = T();
+    cudaError_t err = cudaSuccess;
+
+    int nblocks;
+    int nthreads;
+
+    if(dbuff==NULL || N<=0)
+    {
+        return ret;
+    }
+
+    if(N>100)
+    {
+        nblocks = 10;
+        nthreads = (int)sqrt((float) (N/nblocks));
+        if(nthreads<=0) nthreads=1;
+        if(nthreads>512) nthreads=512;
+    }
+    else
+    {
+        nblocks = 1;
+        nthreads = 1;
+    }
+
+    T *rets = NULL;
+    T *devrets = NULL;
+
+    rets = new T[nblocks*nthreads];
+    cudaMalloc(&devrets,sizeof(T)*nblocks*nthreads);
+
+    dbuff_sum_kf<<<nblocks,nthreads>>>(dbuff,N,devrets);
+    cudaDeviceSynchronize();
+    err = cudaGetLastError();
+    if(err!=cudaSuccess)
+    {
+        printf("amscu::dbuff_sum error: %s\n",cudaGetErrorString(err));
+    }
+
+    cudaMemcpy(rets,devrets,sizeof(T)*nblocks*nthreads,cudaMemcpyDeviceToHost);
+
+    ret = (T)0;
+    for(I=0;I<nblocks*nthreads;I++)
+    {
+        ret = ret + rets[I];
+    }
+
+    cudaFree(devrets); devrets = NULL;
+    delete[] rets;
+
+    return ret;
+}
+
+
+template<typename T> __global__ void dbuff_minmax_kf(T *devbuffer, int N, T *maxs, T *mins)
+{
+    int I0 = threadIdx.x + blockIdx.x*blockDim.x;
+    int Is = blockDim.x*gridDim.x;
+    int I;
+
+    for(I=I0;I<N;I=I+Is)
+    {
+        if(I==I0)
+        {
+            maxs[I0] = devbuffer[I];
+            mins[I0] = devbuffer[I];
+        }
+        else
+        {
+            if(devbuffer[I]>maxs[I0])
+            {
+                maxs[I0] = devbuffer[I];
+            }
+            if(devbuffer[I]<mins[I0])
+            {
+                mins[I0] = devbuffer[I];
+            }
+        }
+    }
+
+    return;
+}
+
+template<typename T> void dbuff_minmax(T *devbuffer, int N, T *min, T *max)
+{
+    cudaError_t err = cudaSuccess;
+    int nblocks;
+    int nthreads;
+    int I;
+
+    T *maxs = NULL;
+    T *dev_maxs = NULL;
+    T *mins = NULL;
+    T *dev_mins = NULL;
+
+    T localmax = T(0);
+    T localmin = T(0);
+
+    if(devbuffer==NULL || N<=0)
+    {
+        if(min!=NULL) *min = T(0);
+        if(max!=NULL) *max = T(0);
+        return;
+    }
+
+    if(N>25)
+    {
+        nblocks = 25;
+        nthreads = (int) sqrt((float)(N/nblocks));
+        if(nthreads<1) nthreads = 1;
+        if(nthreads>512) nthreads = 512;
+    }
+    else
+    {
+        nblocks = 1;
+        nthreads = 1;
+    }
+
+    maxs = new T[nblocks*nthreads];
+    mins = new T[nblocks*nthreads];
+    cudaMalloc(&dev_maxs,nblocks*nthreads);
+    cudaMalloc(&dev_mins,nblocks*nthreads);
+    
+    dbuff_minmax_kf<<<nblocks,nthreads>>>(devbuffer,N,dev_maxs,dev_mins);
+    cudaDeviceSynchronize();
+    err = cudaGetLastError();
+    if(err!=cudaSuccess)
+    {
+        printf("amscu::dbuff_minmax error: %s\n",cudaGetErrorString(err));
+    }
+
+    cudaMemcpy(maxs,dev_maxs,sizeof(T)*nblocks*nthreads,cudaMemcpyDeviceToHost);
+    cudaMemcpy(mins,dev_mins,sizeof(T)*nblocks*nthreads,cudaMemcpyDeviceToHost);
+    
+
+    for(I=0;I<nblocks*nthreads;I++)
+    {
+        if(I==0)
+        {
+            localmax = maxs[0];
+            localmin = mins[0];
+        }
+        else
+        {
+            if(maxs[I]>localmax) localmax = maxs[I];
+            if(mins[I]<localmin) localmin = mins[I];
+        }
+    }
+
+    if(max!=NULL) *max = localmax;
+    if(min!=NULL) *min = localmin;
+
+    cudaFree(dev_maxs); dev_maxs = NULL;
+    cudaFree(dev_mins); dev_mins = NULL;
+    delete[] maxs; maxs = NULL;
+    delete[] mins; mins = NULL;
+
+    return;
+}
+
+template<typename T> __global__ void dbuff_setall_kf(T *devbuffer, int N, T setto)
+{
+    int I0 = threadIdx.x + blockIdx.x*blockDim.x;
+    int Is = blockDim.x*gridDim.x;
+    int I;
+
+    for(I=I0;I<N;I=I+Is)
+    {
+        devbuffer[I] = setto;
+    }
+    return;
+}
+
+template<typename T> void dbuff_setall(T *devbuffer, int N, T setto, int nblocks, int nthreads)
+{
+    cudaError_t err = cudaSuccess;
+
+    if(devbuffer==NULL || N<=0)
+    {
+        return;
+    }
+
+    dbuff_setall_kf<<<nblocks,nthreads>>>(devbuffer,N,setto);
+    cudaDeviceSynchronize();
+    err = cudaGetLastError();
+    if(err!=cudaSuccess)
+    {
+        printf("amscu::dbuff_setall error: %s\n",cudaGetErrorString(err));
+    }
+
+    return;
+}
+
+template<typename T1, typename T2, typename T3> __global__ void dbuff_vectorbinop_kf1(T1 *dbuf_a, T2 *dbuf_b, T3 *dbuf_out, int N, T3 (*fpnt)(T1,T2))
+{
+    int I0 = threadIdx.x + blockIdx.x*blockDim.x;
+    int Is = blockDim.x*gridDim.x;
+    int I;
+
+    T1 a;
+    T2 b;
+    T3 c;
+
+    for(I=I0;I<N;I=I+Is)
+    {
+        a = dbuf_a[I];
+        b = dbuf_b[I];
+        c = fpnt(a,b);
+        dbuf_out[I] = c;
+    }
+
+    return;
+}
+
+template<typename T1, typename T2, typename T3> __global__ void dbuff_vectorbinop_kf2(T1 *dbuf_a, T2 par_b, T3 *dbuf_out, int N, T3 (*fpnt)(T1,T2))
+{
+    int I0 = threadIdx.x + blockIdx.x*blockDim.x;
+    int Is = blockDim.x*gridDim.x;
+    int I;
+
+    T1 a;
+    T2 b;
+    T3 c;
+
+    for(I=I0;I<N;I=I+Is)
+    {
+        a = dbuf_a[I];
+        b = par_b;
+        c = fpnt(a,b);
+        dbuf_out[I] = c;
+    }
+
+    return;
+}
+
+
+//Elementwise device-buffer vector binary operation
+//takes two input arrays ( , ) --> one output array
+template<typename T1, typename T2, typename T3> void dbuff_vectorbinop(T1 *dbuf_a, T2 *dbuf_b, T3 *dbuf_out, int N, T3 (*fpnt)(T1,T2), int nblocks, int nthreads)
+{
+    cudaError_t err = cudaSuccess;
+
+    if(dbuf_a == NULL || dbuf_b == NULL || dbuf_out == NULL || N<=0)
+    {
+        return;
+    }
+
+    dbuff_vectorbinop_kf1<<<nblocks,nthreads>>>(dbuf_a,dbuf_b,dbuf_out,N);
+    cudaDeviceSynchronize();
+    err = cudaGetLastError();
+    if(err!=cudaSuccess)
+    {
+        printf("amscu::devbuffer_vectorbinop error: %s\n",cudaGetErrorString(err));
+    }
+
+    return;
+}
+
+//Elementwise device-buffer vector two-parameter operation
+//takes one input array, and a constant paramter ( ) ---> one output array
+template<typename T1, typename T2, typename T3> void dbuff_vectorbinop(T1 *dbuf_a, T2 par_b, T3 *dbuf_out, int N, T3 (*fpnt)(T1,T2), int nblocks, int nthreads)
+{
+    cudaError_t err = cudaSuccess;
+
+    if(dbuf_a == NULL || dbuf_out == NULL || N<=0)
+    {
+        return;
+    }
+
+    dbuff_vectorbinop_kf2<<<nblocks,nthreads>>>(dbuf_a,par_b,dbuf_out,N);
+    cudaDeviceSynchronize();
+    err = cudaGetLastError();
+    if(err!=cudaSuccess)
+    {
+        printf("amscu::devbuffer_vectorbinop error: %s\n",cudaGetErrorString(err));
+    }
+
+    return;
+}
+
+template<typename T> T dbuff_add_fn(T a, T b)
+{
+    return a+b;
+}
+
+template<typename T> void dbuff_add(T *dbuff_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads)
+{
+    dbuff_vectorbinop(dbuff_a,dbuff_b,dbuff_out,N,&dbuff_add_fn,nblocks,nthreads);
+    return;
+}
+
+template<typename T> void dbuff_add(T *dbuff_a, T par_b, T *dbuff_out, int N, int nblocks, int nthreads)
+{
+    dbuff_vectorbinop(dbuff_a,par_b,dbuff_out,N,&dbuff_add_fn,nblocks,nthreads);
+    return;
+}
+
+template<typename T> T dbuff_sub_fn(T a, T b)
+{
+    return a-b;
+}
+
+template<typename T> void dbuff_sub(T *dbuff_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads)
+{
+    dbuff_vectorbinop(dbuff_a,dbuff_b,dbuff_out,N,&dbuff_sub_fn,nblocks,nthreads);
+    return;
+}
+
+template<typename T> void dbuff_sub(T *dbuff_a, T par_b, T *dbuff_out, int N, int nblocks, int nthreads)
+{
+    dbuff_vectorbinop(dbuff_a,par_b,dbuff_out,N,&dbuff_sub_fn,nblocks,nthreads);
+    return;
+}
+
+template<typename T> T dbuff_mult_fn(T a, T b)
+{
+    return a*b;
+}
+
+template<typename T> void dbuff_mult(T *dbuff_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads)
+{
+    dbuff_vectorbinop(dbuff_a,dbuff_b,dbuff_out,N,&dbuff_mult_fn,nblocks,nthreads);
+    return;
+}
+
+template<typename T> void dbuff_mult(T *dbuff_a, T par_b, T *dbuff_out, int N, int nblocks, int nthreads)
+{
+    dbuff_vectorbinop(dbuff_a,par_b,dbuff_out,N,&dbuff_mult_fn,nblocks,nthreads);
+    return;
+}
+
+template<typename T> T dbuff_div_fn(T a, T b)
+{
+    return a/b;
+}
+
+template<typename T> void dbuff_div(T *dbuff_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads)
+{
+    dbuff_vectorbinop(dbuff_a,dbuff_b,dbuff_out,N,&dbuff_div_fn,nblocks,nthreads);
+    return;
+}
+
+template<typename T> void dbuff_div(T *dbuff_a, T par_b, T *dbuff_out, int N, int nblocks, int nthreads)
+{
+    dbuff_vectorbinop(dbuff_a,par_b,dbuff_out,N,&dbuff_div_fn,nblocks,nthreads);
+    return;
+}
+
+template<typename T> T dbuff_ldiv_fn(T a, T b)
+{
+    return b/a;
+}
+
+
+template<typename T> void dbuff_div(T par_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads)
+{
+    dbuff_vectorbinop(dbuff_b,par_a,dbuff_out,N,&dbuff_ldiv_fn,nblocks,nthreads);
+    return;
+}
+
+
+};
+
+#endif
+
--- a/include/amsculib2/amscuarray_impl.hpp
+++ b/include/amsculib2/amscuarray_impl.hpp
@ -0,0 +1,323 @@
+#ifndef __CUARRAY_IMPL_HPP__
+#define __CUARRAY_IMPL_HPP__
+
+namespace amscuda
+{
+
+// New Version cuarray<T>
+// simpler, less crap going on
+
+template<typename T> __device__ __host__ cuarray<T>::cuarray()
+{
+    length = 0;
+    data = NULL;
+}
+
+template<typename T> __device__ __host__ cuarray<T>::~cuarray()
+{
+    if(data!=NULL)
+    {
+        delete[] data; data = NULL;
+    }
+    length = 0;
+}
+
+template<typename T> __device__ __host__ int cuarray<T>::resize(const int _length)
+{
+    int ret = 0;
+
+    T *newbuffer = NULL;
+
+    if(length==_length)
+    {
+        //do nothing
+        ret = 1;
+        return ret;
+    }
+    if(_length<=0)
+    {
+        if(data!=NULL)
+        {
+            delete[] data;
+            data = NULL;
+        }
+        length = 0;
+        ret = 1;
+    }
+
+    newbuffer = new T[_length];
+    if(newbuffer==NULL)
+    {
+        ret = -1; //failed to allocate memory
+        return ret;
+    }
+
+    int I;
+    T def;
+
+    if(data!=NULL)
+    {
+        for(I=0;I<length&&I<_length;I++)
+        {
+            newbuffer[I] = data[I];
+        }
+        for(I=length;I<_length;I++)
+        {
+            newbuffer[I] = def;
+        }
+        delete[] data; data=NULL;
+    }
+    else
+    {
+        for(I=0;I<_length;I++)
+        {
+            newbuffer[I] = def;
+        }
+    }
+
+    data = newbuffer;
+    length = _length;
+    ret = 1;
+
+    return ret;
+}
+
+
+template<typename T> __host__ int cuarray<T>::device_send(cuarray<T> **dptr)
+{
+    int ret = 0;
+    int dlength;
+
+    if(*dptr==NULL)
+    {
+        ret = _device_send_overwrite(dptr);
+    }
+    else
+    {
+        dlength = device_length(*dptr);
+        if(dlength=length)
+        {
+            ret = _device_send_copy(*dptr);
+        }
+        else
+        {
+            ret = _device_send_overwrite(dptr);
+        }
+    }
+
+    return ret;
+}
+
+template<typename T> __host__ int cuarray<T>::_device_send_overwrite(cuarray<T> **dptr)
+{
+    int ret = 0;
+    cuarray<T> dlocal;
+    cudaError_t err = cudaSuccess;
+    device_free(dptr);
+    
+    if(length>=0 && data!=NULL)
+    {
+        err = cudaMalloc(dptr,sizeof(cuarray<T>));
+        if(err==cudaSuccess)
+        {
+            err = cudaMalloc(&(dlocal.data),sizeof(T)*length);
+            dlocal.length = length;
+
+            if(err==cudaSuccess)
+            {
+                cudaMemcpy(*dptr,&dlocal,sizeof(cuarray<T>),cudaMemcpyHostToDevice);
+                if(data!=NULL)
+                    err = cudaMemcpy(dlocal.data,data,sizeof(T)*length,cudaMemcpyHostToDevice);
+                else
+                    err = cudaSuccess;
+                if(err==cudaSuccess)
+                {
+                    ret = 1;
+                }
+                else
+                {
+                    ret = -3;
+                }
+            }
+            else
+            {
+                ret = -2;
+            }
+        }
+        else
+        {
+            ret = -1;
+        }
+    }
+    else
+    {
+        dlocal.data = NULL;
+        dlocal.length = 0;
+        err = cudaMalloc(dptr,sizeof(cuarray<T>));
+        if(err==cudaSuccess)
+        {
+            cudaMemcpy(*dptr,&dlocal,sizeof(cuarray<T>),cudaMemcpyHostToDevice);
+            ret = 1;
+        }
+        else
+        {
+            ret = -4;
+        }
+    }
+
+
+    dlocal.data = NULL;
+    dlocal.length = -1;
+
+    return ret;
+}
+
+template<typename T> __host__ int cuarray<T>::_device_send_copy(cuarray<T> *dptr)
+{
+    int ret = 0;
+    cudaError_t err = cudaSuccess;
+    T* ddata = NULL;
+    ddata = device_data_ptr(dptr);
+
+    err = cudaMemcpy(ddata,data,sizeof(T)*length,cudaMemcpyHostToDevice);
+    if(err==cudaSuccess)
+    {
+        ret = 1;
+    }
+    else
+    {
+        ret = -1;
+    }
+
+    return ret;
+}
+
+template<typename T> __host__ int cuarray<T>::device_pull(cuarray<T> *dptr)
+{
+    int ret = 0;
+    int dlength;
+    T* ddata;
+    cudaError_t err;
+
+    if(dptr==NULL) 
+    {
+        ret = -1; // null d pointer
+        return ret;
+    }
+
+    dlength = device_length(dptr);
+    if(dlength!=length)
+    {
+        this->resize(dlength);
+    }
+
+    ddata = device_data_ptr(dptr);
+
+    if(length>0 && data!=NULL && ddata!=NULL)
+    {
+        err = cudaMemcpy(data,dptr,length*sizeof(T),cudaMemcpyDeviceToHost);
+        if(err==cudaSuccess)
+        {
+            ret = 1;
+        }
+        else
+        {
+            ret = -2;
+        }
+    }
+
+    return ret;
+}
+
+template<typename T> __host__ int cuarray<T>::device_free(cuarray<T> **dptr)
+{
+    int ret = 0;
+    cuarray<T> dlocal;
+
+    if(*dptr!=NULL)
+    {
+        cudaMemcpy(&dlocal,dptr,sizeof(cuarray<T>),cudaMemcpyDeviceToHost);
+        if(dlocal.data!=NULL)
+        {
+            cudaFree(dlocal.data);
+            dlocal.data = NULL;
+        }
+
+        cudaFree(*dptr);
+        *dptr = NULL;
+        ret = 1;
+    }
+
+    dlocal.data = NULL;
+    dlocal.length = -1;
+
+    return ret;
+}
+
+template<typename T> __host__ int cuarray<T>::device_length(cuarray<T> *dptr)
+{
+    int ret = -1;
+    cuarray<T> dlocal;
+
+    if(dptr==NULL)
+    {
+        return ret;
+    }
+
+    cudaMemcpy(&dlocal,dptr,sizeof(cuarray<T>),cudaMemcpyDeviceToHost);
+    ret = dlocal.length;
+
+    dlocal.data = NULL;
+    dlocal.length = -1;
+
+    return ret;
+}
+
+template<typename T> __host__ T* cuarray<T>::device_data_ptr(cuarray<T> *dptr)
+{
+    T* ret = NULL;
+    cuarray<T> dlocal;
+
+    if(dptr==NULL)
+    {
+        return ret;
+    }
+
+    cudaMemcpy(&dlocal,dptr,sizeof(cuarray<T>),cudaMemcpyDeviceToHost);
+    ret = dlocal.data;
+
+    dlocal.data = NULL;
+    dlocal.length = -1;
+
+    return ret;
+}
+
+template<typename T>  __device__ __host__ int cuarray<T>::size() const
+{
+    return this->length;
+}
+
+template<typename T>  __device__ __host__ T& cuarray<T>::at(const int I)
+{
+    return this->data[I];
+}
+
+template<typename T>  __device__ __host__ const T& cuarray<T>::at(const int I) const
+{
+    return this->data[I];
+}
+
+template<typename T> __device__ __host__ T& cuarray<T>::operator[](const int I)
+{
+    return this->data[I];
+}
+
+template<typename T> __device__ __host__ const T& cuarray<T>::operator[](const int I) const
+{
+    return this->data[I];
+}
+
+};
+
+
+#endif
--- a/include/amsculib2/amscuda_binarrrw.hpp
+++ b/include/amsculib2/amscuda_binarrrw.hpp
@ -0,0 +1,19 @@
+#ifndef __AMSCUDA_BINARRRW_HPP__
+#define __AMSCUDA_BINARRRW_HPP__
+
+namespace amscuda
+{
+
+template<typename T> int fread_ndarray(FILE *fp, cuarray<int32_t> *shape, cuarray<T> *buffer);
+template<typename T> int fwrite_ndarray(FILE *fp, const cuarray<int32_t> *shape, const cuarray<T> *buffer);
+
+template<typename T> int fwrite_buffer(FILE *fp, const int N, const T *buffer);
+template<typename T> int fread_buffer(FILE *fp, const int Nmax, const T *buffer);
+
+
+}; //end namespace amscuda
+
+#include <amsculib2/amscuda_binarrrw_impl.hpp>
+
+#endif
+
--- a/include/amsculib2/amscuda_binarrrw_impl.hpp
+++ b/include/amsculib2/amscuda_binarrrw_impl.hpp
@ -0,0 +1,194 @@
+#ifndef __AMSCUDA_BINARRRW_IMPL_HPP__
+#define __AMSCUDA_BINARRRW_IMPL_HPP__
+
+namespace amscuda
+{
+
+template<typename T> int fread_ndarray(FILE *fp, cuarray<int32_t> *shape, cuarray<T> *buffer)
+{
+    int ret = 1;
+    int I;
+    long piprod;
+    int32_t q;
+    int cnt;
+
+    int32_t Nd;
+
+    if(fp!=NULL)
+    {
+        if(!feof(fp))
+        {
+            cnt = fread(&Nd,sizeof(int32_t),1,fp);
+            if(Nd>0 && cnt>0)
+            {
+                shape->resize(Nd);
+                piprod = 1;
+                for(I=0;I<Nd;I++)
+                {
+                    cnt = fread(&q,sizeof(int32_t),1,fp);
+                    shape->at(I) = q;
+                    if(q>0)
+                    {
+                        piprod = piprod*q;
+                    }
+                    else
+                    {
+                        piprod = 0;
+                    }
+                }
+
+                buffer->resize(piprod);
+                if(piprod>0)
+                {
+                    cnt = fread((buffer->data),sizeof(T),piprod,fp);
+                    if(piprod==cnt)
+                    {
+                        ret = 1;
+                    }
+                    else
+                    {
+                        printf("fread_ndarray, read %d values, expecting %ld\n",cnt,piprod);
+                        ret = 0;
+                    }
+                }
+            }
+            else
+            {
+                printf("fread_ndarray: Read a number of dimensions<=0.\n");
+                Nd = 0;
+                shape->resize(0);
+                buffer->resize(0);
+            }
+        }
+        else
+        {
+            printf("fread_ndarray: fp=NULL.\n");
+            ret = 0;
+        }
+    }
+    else
+    {
+        ret = 0;
+    }
+
+    return ret;
+}
+
+template<typename T> int fwrite_ndarray(FILE *fp, const cuarray<int32_t> *shape, const cuarray<T> *buffer)
+{
+    int ret = 1;
+    long piprod;
+    int I;
+    int32_t Nd;
+
+    if(fp==NULL)
+    {
+        ret = 0;
+        printf("fwrite_ndarray: fp=NULL\n");
+        return ret;
+    }
+
+    piprod = 1;
+    for(I=0;I<shape->size();I++)
+    {
+        if(shape->at(I)>0)
+        {
+            piprod = piprod*shape->at(I);
+        }
+        else
+        {
+            piprod = 0;
+        }
+    }
+
+    Nd = (int32_t) shape->size();
+
+    if(piprod!=buffer->size())
+    {
+        ret = 0;
+        printf("fwrite_ndarray: buffer is size %ld, while shape is size %ld\n",(long)buffer->size(),(long)piprod);
+        return ret;
+    }
+
+    fwrite(&Nd,sizeof(int32_t),1,fp);
+    if(Nd>0)
+    {
+        fwrite(shape->data,sizeof(int32_t),Nd,fp);
+        if(piprod>0)
+        {
+            fwrite(buffer->data,sizeof(T),buffer->size(),fp);
+        }
+    }
+
+    return ret;
+}
+
+template<typename T> int fwrite_buffer(FILE *fp, const int N, const T *buffer)
+{
+    int ret = 0;
+    int Nd = 1;
+    
+    if(fp==NULL)
+    {
+        ret = 0;
+        printf("fwrite_buffer: fp=NULL\n");
+        return ret;
+    }
+
+    fwrite(&Nd,sizeof(int32_t),1,fp);
+    fwrite(&N,sizeof(int32_t),1,fp);
+    fwrite(buffer,sizeof(T),N,fp);
+    
+    return ret;
+}
+
+template<typename T> int fread_buffer(FILE *fp, const int Nmax, const T *buffer)
+{
+    int ret = 0;
+    
+    int cnt;
+    int32_t Nd;
+    int32_t *dims = NULL;
+    int piprod;
+    int32_t q;
+    int I;
+
+    int Nr;
+
+
+    if(fp==NULL) {ret = -1; return ret;}
+    if(feof(fp)) {ret = -2; return ret;}
+
+    cnt = fread(&Nd,sizeof(int32_t),1,fp);
+    if(Nd>0 && cnt>0)
+    {
+        piprod = 1;
+        dims = new(std::nothrow) int32_t[Nd];
+        for(I=0;I<Nd;I++)
+        {
+            cnt = fread(&q,sizeof(int32_t),1,fp);
+            dims[I] = q;
+            piprod = piprod*dims[I];
+            if(piprod==cnt)
+            {
+                ret = 1;
+            }
+            else
+            {
+                printf("fwrite_buffer, read %d values, expecting %d\n",cnt,piprod);
+            }
+        }
+
+        Nr = amscuda::min<int32_t>(Nmax,piprod);
+        cnt = fread(buffer,sizeof(T),Nr,fp);
+    }
+
+    if(dims!=NULL) {delete[] dims; dims=NULL;}
+
+    return ret;
+}
+
+}; //end namespace amscuda
+
+#endif
+
--- a/include/amsculib2/amscugeom.hpp
+++ b/include/amsculib2/amscugeom.hpp
@ -0,0 +1,11 @@
+#ifndef __AMSCUGEOM_HPP__
+#define __AMSCUGEOM_HPP__
+
+namespace amscuda
+{
+
+
+}; //end namespace amscuda
+
+#endif
+
--- a/include/amsculib2/amsculib2.hpp
+++ b/include/amsculib2/amsculib2.hpp
@ -0,0 +1,70 @@
+#ifndef __AMSCULIB2_HPP__
+#define __AMSCULIB2_HPP__
+
+//Std Lib Includes
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <stdint.h>
+#include <time.h>
+#include <new>
+
+#include <cuda_runtime_api.h> //where all the cuda functions live
+#include <cuda_runtime.h>
+#include <cuda.h>
+
+//Dependencies
+
+//Predeclarations
+class cuvect2;
+class cuvect3;
+class cuvect4;
+class cuvect2f;
+class cuvect3f;
+class cuvect4f;
+
+//Need a way to define the same symbols using both host and device code
+//A solution was found here: https://stackoverflow.com/questions/9457572/cuda-host-and-device-using-same-constant-memory
+#ifdef __CUDA_ARCH__
+#define AMSCU_CONST __constant__
+#else
+#define AMSCU_CONST
+#endif
+
+namespace amscuda
+{
+
+    //default thread and block execution
+    AMSCU_CONST static const int amscu_defnblocks = 256;
+    AMSCU_CONST static const int amscu_defnthreads = 512;
+
+    //default numthreads to execute on cpu
+    AMSCU_CONST static const int amscu_defcputhreads = 8;
+
+}; //end namespace amscuda
+
+//Components
+#include <amsculib2/amscu_cudafunctions.hpp>
+#include <amsculib2/amscumath.hpp>
+#include <amsculib2/amscu_comp64.hpp>
+#include <amsculib2/amscu_comp128.hpp>
+#include <amsculib2/cuvect2.hpp>
+#include <amsculib2/cuvect3.hpp>
+#include <amsculib2/cuvect4.hpp>
+#include <amsculib2/cuvect2f.hpp>
+#include <amsculib2/cuvect3f.hpp>
+#include <amsculib2/cuvect4f.hpp>
+#include <amsculib2/amscugeom.hpp>
+#include <amsculib2/amscuarray.hpp>
+#include <amsculib2/amscuda_binarrrw.hpp>
+#include <amsculib2/amscu_random.hpp>
+
+#include <amsculib2/amscuarray_dops.hpp>
+
+#include <amsculib2/amscurarray.cuh>
+
+
+
+
+#endif
+
--- a/include/amsculib2/amscumath.hpp
+++ b/include/amsculib2/amscumath.hpp
@ -0,0 +1,56 @@
+#ifndef __AMSCUMATH_HPP__
+#define __AMSCUMATH_HPP__
+
+namespace amscuda
+{
+
+
+
+    //Problem: These are not in the namespace
+    //#define nan NAN
+    //#define fnan (float) NAN
+    //#define inf INFINITY
+    //#define finf (float) INFINITY
+    //#define pi 3.1415926535897936
+
+    //These need to be the same symbol for both host and device code
+    AMSCU_CONST static const double nan = NAN;
+    AMSCU_CONST static const float fnan = (float) NAN;
+    AMSCU_CONST static const double inf = INFINITY;
+    AMSCU_CONST static const float finf = (float) INFINITY;
+    AMSCU_CONST static const double pi = 3.1415926535897936;
+    AMSCU_CONST static const float pif = 3.1415926535897936;
+
+    __host__ __device__ double dabs(double x);
+    __host__ __device__ float fabs(float x);
+    
+    template<typename T> __host__ __device__ T abs(const T in)
+    {
+        T ret = in;
+        if(in<0) ret = -in;
+        return ret;
+    }
+
+    __host__ __device__ double mod(double a, double md);
+    __host__ __device__ float mod(float a, float md);
+    __host__ __device__ int mod(int x, int n);
+    __host__ __device__ long mod(long x, long n);
+
+    __host__ __device__ int truediv(int x, int y);
+    __host__ __device__ long truediv(long x, long y);
+
+    template<typename T> __host__ __device__ T min(T a, T b);
+    template<typename T> __host__ __device__ T max(T a, T b);
+
+    __device__ __host__ double arg(double x, double y);
+    __device__ __host__ void get_azel(double x, double y, double z, double *az, double *el);
+
+    void test_amscumath1();
+
+
+}; //end namespace amscuda
+
+#include <amsculib2/amscumath_impl.hpp>
+
+#endif
+
--- a/include/amsculib2/amscumath_impl.hpp
+++ b/include/amsculib2/amscumath_impl.hpp
@ -0,0 +1,42 @@
+#ifndef __AMSCUMATH_IMPL_HPP__
+#define __AMSCUMATH_IMPL_HPP__
+
+namespace amscuda
+{
+
+template<typename T> __host__ __device__ T min(T a, T b)
+{
+    if(a>b)
+    {
+        return b;
+    }
+    else
+    {
+        return a;
+    }
+    return a;
+}
+
+template<typename T> __host__ __device__ T max(T a, T b)
+{
+    if(a>b)
+    {
+        return a;
+    }
+    else
+    {
+        return b;
+    }
+    return a;
+}
+
+template<> __host__ __device__ double min(double a, double b);
+template<> __host__ __device__ float min(float a, float b);
+template<> __host__ __device__ double max(double a, double b);
+template<> __host__ __device__ float max(float a, float b);
+
+
+}; //end namespace amscuda
+
+#endif
+
--- a/include/amsculib2/amscurarray.cuh
+++ b/include/amsculib2/amscurarray.cuh
@ -0,0 +1,66 @@
+#ifndef __AMSCURARRAY_HPP__
+#define __AMSCURARRAY_HPP__
+
+namespace amscuda
+{
+
+//Cuda ragged array class
+template<typename T> class curarray
+{
+public:
+    int device;
+    curarray* devptr; //pointer to mirror class on the device
+
+    int Narrays; //number of arrays
+    
+    int *N; //dimension of each array
+    T** hostarrayptrs; //pointers to each array on the host - null on the device
+    T** devarrayptrs; //pointers to each array on the device
+        //the double pointer is a host pointer to device pointers on the host class
+        //for the device class, only the second set of arrays is in use
+    
+    //the constructor and destructor set all pointers to NULL, they
+    // do *not* manage memory. This is done with curarray_new and curarray_delete
+    __device__ __host__ curarray();
+    __device__ __host__ ~curarray();
+    
+    __host__ int push();
+    __host__ int pull();
+    //__device__ int dev_resizearray(int arraynum, int arraysize);
+    __host__ int resizearray(int arraynum, int arraysize);
+    // I may want a way to resize arrays on the device without pushing/pulling all the array contents
+    
+
+};
+
+template<typename T> int curarray_new(curarray<T>** ptr, int Narrays);
+
+template<typename T> int curarray_delete(curarray<T>** ptr);
+
+template<typename T> int curarray_device_new(curarray<T> *hostptr);
+
+template<typename T> int curarray_device_delete(curarray<T> *hostptr);
+
+template<typename T> int curarray_push(curarray<T> *hostptr);
+
+template<typename T> int curarray_pull(curarray<T> *hostptr);
+
+
+//template<typename T> int curarray_host_fillall(curarray<T> *hostptr, const T &val);
+//template<typename T> int curarray_device_fillall(curarray<T> *hostptr, const T &val);
+
+
+//template<typename T> __host__ int curarray_deletearray(curarray<T> *hostptr, int arrayindex);
+//template<typename T> __device__ int curarray_dev_deletearray(curarray<T> *devptr, int arrayindex);
+
+//template<typename T> __host__ int curarray_allocarray(curarray<T> *hostptr, int arrayindex, int size);
+//template<typename T> __device__ int curarray_dev_allocarray(curarray<T> *devptr, int arrayindex, int size);
+
+
+void test_amscurarray1();
+
+};
+
+#include <amsculib2/amscurarray_impl.cuh>
+
+#endif
--- a/include/amsculib2/amscurarray_impl.cuh
+++ b/include/amsculib2/amscurarray_impl.cuh
@ -0,0 +1,529 @@
+#ifndef __AMSCURARRAY_IMPL_HPP__
+#define __AMSCURARRAY_IMPL_HPP__
+
+namespace amscuda
+{
+
+template<typename T> curarray<T>::curarray()
+{
+    device = -1;
+    devptr = NULL;
+    Narrays = 0;
+    N = NULL;
+    hostarrayptrs = NULL;
+    devarrayptrs = NULL;
+
+}
+
+template<typename T> curarray<T>::~curarray()
+{
+    device = -1;
+    devptr = NULL;
+    Narrays = 0;
+    N = NULL;
+    hostarrayptrs = NULL;
+    devarrayptrs = NULL;
+    
+}
+
+template<typename T> int curarray_new(curarray<T>** ptr, int Narrays)
+{
+    int ret = 0;
+    int device;
+    curarray<T> *lhptr = *ptr;
+
+    cudaGetDevice(&device);
+
+    if(lhptr!=NULL)
+    {
+        curarray_delete(ptr);
+    }
+
+    *ptr = new(std::nothrow) curarray<T>();
+    lhptr = *ptr;
+
+    int I;
+
+    if(Narrays<0) Narrays=0;
+
+    lhptr->Narrays = Narrays;
+    lhptr->device = device;
+    lhptr->N = new(std::nothrow) int[Narrays];
+    lhptr->hostarrayptrs = new(std::nothrow) T*[Narrays];
+    lhptr->devarrayptrs = new(std::nothrow) T*[Narrays];
+    
+    for(I=0;I<Narrays;I++)
+    {
+        lhptr->N[I] = 0;
+        lhptr->hostarrayptrs[I] = NULL;
+        lhptr->devarrayptrs[I] = NULL;
+    }
+
+    curarray_device_new(lhptr);
+
+    return ret;
+}
+
+template<typename T> int curarray_delete(curarray<T>** ptr)
+{
+    int ret = 0;
+    curarray<T> *lptr = NULL;
+    int olddev;
+
+    int I;
+
+    if(*ptr==NULL)
+    {
+        return 0;
+    }
+    lptr = *ptr;
+
+    cudaGetDevice(&olddev);
+    cudaSetDevice(lptr->device);
+
+    
+    if(lptr->devptr!=NULL)
+    {
+        curarray_device_delete(lptr);
+    }
+
+    lptr->device = -1;
+
+    for(I=0;I<lptr->Narrays;I++)
+    {
+        if(lptr->hostarrayptrs!=NULL)
+        {
+            if(lptr->hostarrayptrs[I]!=NULL)
+            {
+                delete[] lptr->hostarrayptrs[I];
+                lptr->hostarrayptrs[I] = NULL;
+            }
+        }
+
+        if(lptr->devarrayptrs!=NULL)
+        {
+            if(lptr->devarrayptrs[I]!=NULL)
+            {
+                //erasing device memory should have been handled in curarray_device_delete
+                lptr->devarrayptrs[I] = NULL;
+            }
+        }
+
+        lptr->N[I] = 0;
+    }
+
+    if(lptr->N != NULL) {delete[] lptr->N; lptr->N = NULL;}
+    if(lptr->hostarrayptrs!=NULL) {delete[] lptr->hostarrayptrs; lptr->hostarrayptrs=NULL;}
+    if(lptr->devarrayptrs!=NULL) {delete[] lptr->devarrayptrs; lptr->devarrayptrs=NULL;}
+
+    if(*ptr!=NULL) {delete *ptr; *ptr = NULL;}
+
+    cudaSetDevice(olddev);
+
+    return ret;
+}
+
+template<typename T> int curarray_device_new(curarray<T> *hostptr)
+{
+    int ret = 0;
+    curarray<T> ldevdata;
+
+    if(hostptr==NULL) return -1;
+    if(hostptr->devptr!=NULL)
+    {
+        curarray_device_delete(hostptr);
+    }
+
+    int I;
+
+    cudaGetDevice(&(hostptr->device));
+
+    ldevdata.device = hostptr->device;
+    ldevdata.Narrays = hostptr->Narrays;
+    int Narrays = hostptr->Narrays;
+
+    cudaMalloc(&(ldevdata.N),sizeof(int)*Narrays);
+    cudaMemcpy(ldevdata.N,hostptr->N,sizeof(int)*Narrays,cudaMemcpyHostToDevice);
+
+    ldevdata.hostarrayptrs = NULL;
+
+    for(I=0;I<Narrays;I++)
+    {
+        if(hostptr->N[I]>0)
+        {
+            if(hostptr->devarrayptrs[I]!=NULL)
+            {
+                cudaFree(hostptr->devarrayptrs[I]);
+                hostptr->devarrayptrs[I] = NULL;
+            }
+
+            cudaMalloc(&(hostptr->devarrayptrs[I]),sizeof(T)*hostptr->N[I]);
+            cudaMemcpy(hostptr->devarrayptrs[I],hostptr->hostarrayptrs[I],sizeof(T)*hostptr->N[I],cudaMemcpyHostToDevice);
+        }
+        else
+        {
+            if(hostptr->devarrayptrs[I]!=NULL)
+            {
+                cudaFree(hostptr->devarrayptrs[I]);
+                hostptr->devarrayptrs[I] = NULL;
+            }
+        }
+    }
+
+    cudaMalloc(&(ldevdata.devarrayptrs),sizeof(T*)*Narrays);
+    cudaMemcpy(ldevdata.devarrayptrs,hostptr->devarrayptrs,sizeof(T*)*Narrays,cudaMemcpyHostToDevice);
+
+    cudaMalloc(&(hostptr->devptr),sizeof(curarray<T>));
+    cudaMemcpy(hostptr->devptr,&ldevdata,sizeof(curarray<T>),cudaMemcpyHostToDevice);    
+    
+    ret = 1;
+
+    return ret;
+}
+
+template<typename T> int curarray_device_delete(curarray<T> *hostptr)
+{
+    int ret = 0;
+
+    curarray<T> ldevdata;
+    int olddev;
+
+    if(hostptr->devptr==NULL)
+    {
+        return 0;
+    }
+
+    cudaGetDevice(&olddev);
+    cudaSetDevice(hostptr->device);
+
+    cudaMemcpy(&ldevdata,hostptr->devptr,sizeof(curarray<T>),cudaMemcpyDeviceToHost);
+
+    int I;
+    for(I=0;I<hostptr->Narrays;I++)
+    {
+        if(hostptr->devarrayptrs[I]!=NULL)
+        {
+            cudaFree(hostptr->devarrayptrs[I]);
+            hostptr->devarrayptrs[I] = NULL;
+        }
+    }
+
+    cudaFree(ldevdata.devarrayptrs);
+    cudaFree(ldevdata.N);
+
+    cudaFree(hostptr->devptr);
+    hostptr->devptr = NULL;
+    hostptr->device = -1;
+
+    cudaSetDevice(olddev);
+
+    ret = 1;
+    return ret;
+}
+
+template<typename T> int curarray_push(curarray<T> *hostptr)
+{
+    int ret = 0;
+
+    int olddev;
+
+    curarray<T> ldevdata;
+    T** ldevarrayptrs = NULL;
+    int *devN = NULL;
+
+    if(hostptr==NULL) return -1;
+
+    cudaGetDevice(&olddev);
+    cudaSetDevice(hostptr->device);
+
+    int Narrays = hostptr->Narrays;
+
+    cudaMemcpy(&ldevdata,hostptr->devptr,sizeof(curarray<T>),cudaMemcpyDeviceToHost);
+    ldevarrayptrs = new(std::nothrow) T*[Narrays];
+    devN = new(std::nothrow) int[Narrays];
+
+    cudaMemcpy(ldevarrayptrs,ldevdata.devarrayptrs,sizeof(T*)*Narrays,cudaMemcpyDeviceToHost);
+    cudaMemcpy(devN,ldevdata.N,sizeof(int)*Narrays,cudaMemcpyDeviceToHost);
+
+
+    int I;
+
+    for(I=0;I<Narrays;I++)
+    {
+        //check to see that host size is the same as device size, and that 
+        //the host device pointer is the same as the device device pointer
+        if( (hostptr->N[I]!=devN[I]) || 
+            (hostptr->devarrayptrs[I] != ldevarrayptrs[I])
+        )
+        {
+            cudaFree(ldevarrayptrs[I]);
+            ldevarrayptrs[I] = NULL;
+            hostptr->devarrayptrs[I] = NULL;
+
+            if(hostptr->N[I]>0)
+            {
+                cudaMalloc(&(hostptr->devarrayptrs[I]),sizeof(T)*hostptr->N[I]);
+                ldevarrayptrs[I] = hostptr->devarrayptrs[I];
+                devN[I] = hostptr->N[I];
+            }
+            else
+            {
+                devN[I] = 0;
+            }
+        }
+        if(hostptr->N[I]>0)
+        {
+            //copy host data to device
+            cudaMemcpy(hostptr->devarrayptrs[I],hostptr->hostarrayptrs[I],sizeof(T)*hostptr->N[I],cudaMemcpyHostToDevice);
+        }
+    } //for each array
+
+    //rectify and copy device data structure to device
+    ldevdata.device = hostptr->device;
+    ldevdata.devptr = NULL;
+    ldevdata.Narrays = hostptr->Narrays; //later - logic for dealing with when this is not true
+    ldevdata.hostarrayptrs = NULL;
+
+
+    cudaMemcpy(ldevdata.N,hostptr->N,sizeof(int)*Narrays,cudaMemcpyHostToDevice);
+    cudaMemcpy(ldevdata.devarrayptrs,hostptr->devarrayptrs,sizeof(T*)*Narrays,cudaMemcpyHostToDevice);
+
+    cudaMemcpy(hostptr->devptr,&ldevdata,sizeof(curarray<T>),cudaMemcpyHostToDevice);
+
+    cuda_errortrap("curarray_push cuda error:");
+
+    cudaSetDevice(olddev);
+
+    delete[] ldevarrayptrs;
+    delete[] devN;
+
+    return ret;
+}
+
+template<typename T> int curarray_pull(curarray<T> *hostptr)
+{
+    int ret = 0;
+
+    int olddev;
+
+    curarray<T> ldevdata;
+    T** ldevarrayptrs = NULL;
+    int *devN = NULL;
+
+    if(hostptr==NULL) return -1;
+
+    cudaGetDevice(&olddev);
+    cudaSetDevice(hostptr->device);
+
+    cuda_errortrap("dbg1");
+
+    int Narrays = hostptr->Narrays;
+
+    cudaMemcpy(&ldevdata,hostptr->devptr,sizeof(curarray<T>),cudaMemcpyDeviceToHost);
+    ldevarrayptrs = new(std::nothrow) T*[Narrays];
+    devN = new(std::nothrow) int[Narrays];
+
+    cuda_errortrap("dbg2");
+
+    cudaMemcpy(ldevarrayptrs,ldevdata.devarrayptrs,sizeof(T*)*Narrays,cudaMemcpyDeviceToHost);
+    cudaMemcpy(devN,ldevdata.N,sizeof(int)*Narrays,cudaMemcpyDeviceToHost);
+
+    cuda_errortrap("dbg3");
+    char dbgjnk[50];
+
+    int I;
+    for(I=0;I<Narrays;I++)
+    {
+        //check to see that host size is the same as device size, and that 
+        //the host device pointer is the same as the device device pointer
+        if(hostptr->devarrayptrs[I] != ldevarrayptrs[I])
+        {
+            hostptr->devarrayptrs[I] = ldevarrayptrs[I];
+        }
+
+        if(hostptr->N[I]!=devN[I])
+        {
+            if(hostptr->hostarrayptrs[I]!=NULL) 
+            {
+                delete[] hostptr->hostarrayptrs[I];
+                hostptr->hostarrayptrs[I] = NULL;
+            }
+
+            if(devN[I]>0)
+            {
+                hostptr->hostarrayptrs[I] = new(std::nothrow) T[devN[I]];
+                hostptr->N[I] = devN[I];
+            }
+            else
+            {
+                hostptr->N[I] = 0;
+            }
+        }
+
+        if(hostptr->hostarrayptrs[I]!=NULL && hostptr->devarrayptrs[I]!=NULL)
+        {
+            cudaMemcpy(hostptr->hostarrayptrs[I],hostptr->devarrayptrs[I],sizeof(T)*hostptr->N[I],cudaMemcpyDeviceToHost);
+            sprintf(dbgjnk,"%d dbg %d",I,hostptr->N[I]);
+            cuda_errortrap(dbgjnk);
+        }
+    } //for each array
+
+    //for the pull operation, I don't think any update of the device data structure is necessary
+
+    cudaSetDevice(olddev);
+
+    delete[] ldevarrayptrs;
+    delete[] devN;
+
+    return ret;
+}
+
+template<typename T> __host__ int curarray<T>::push()
+{
+    return curarray_push(this);
+}
+template<typename T> __host__ int curarray<T>::pull()
+{
+    return curarray_pull(this);
+}
+
+
+
+/*
+https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#memory-allocation-and-lifetime%5B/url%5D
+
+cudaMalloc() and cudaFree() have distinct semantics between the host and 
+device environments. When invoked from the host, cudaMalloc() allocates a 
+new region from unused device memory. When invoked from the device runtime 
+these functions map to device-side malloc() and free(). This implies that 
+within the device environment the total allocatable memory is limited to the 
+device malloc() heap size, which may be smaller than the available unused 
+device memory. Also, it is an error to invoke cudaFree() from the host 
+program on a pointer which was allocated by cudaMalloc() on the device 
+or vice-versa.
+
+So, basically this entire function is not going to work. I'll be unable to resize within 
+a kernel.
+*/
+
+/*
+
+template<typename T> __device__ int curarray<T>::dev_resizearray(int arraynum, int arraysize)
+{
+    int ret = 0;
+    T* newptr = NULL;
+    int I;
+    T def;
+
+    if(arraynum>=0 && arraynum<Narrays)
+    {
+        if(N[arraynum]!=arraysize)
+        {
+            if(arraysize<=0)
+            {
+                if(devarrayptrs[arraynum]!=NULL) cudaFree(devarrayptrs[arraynum]);
+                devarrayptrs[arraynum] = NULL;
+                N[arraynum] = 0;
+                ret = 1;
+                return ret;
+            }
+
+            cudaMalloc(&newptr,arraysize*sizeof(T));
+            if(newptr!=NULL)
+            {
+                //do I want to assume there is a copy operator? (operator=)
+                //for now, yes - write a more restrictive class later if I don't want it
+                if(devarrayptrs[arraynum]!=NULL)
+                {
+                    for(I=0;I<N[arraynum]&&I<arraysize;I++)
+                    {
+                        newptr[I] = devarrayptrs[arraynum][I];
+                    }
+                }
+                for(I=N[arraynum];I<arraysize;I++)
+                {
+                    newptr[I] = def;
+                }
+
+                if(devarrayptrs[arraynum]!=NULL) cudaFree(devarrayptrs[arraynum]);
+                devarrayptrs[arraynum] = newptr;
+                N[arraynum] = arraysize;
+                ret = 1;
+            }
+            else
+            {
+                ret = -1;
+            }
+        }
+        else
+        {
+            ret = 1;
+        }
+    }
+
+    return ret;
+}
+
+*/
+
+template<typename T> __host__ int curarray<T>::resizearray(int arraynum, int arraysize)
+{
+    int ret = 0;
+    T* newptr = NULL;
+    int I;
+    T def;
+
+    if(arraynum>=0 && arraynum<Narrays)
+    {
+        if(N[arraynum]!=arraysize)
+        {
+            if(arraysize<=0)
+            {
+                delete[] hostarrayptrs[arraynum];
+                hostarrayptrs[arraynum] = NULL;
+                N[arraynum] = 0;
+                ret = 1;
+                return ret;
+            }
+
+            newptr = new(std::nothrow) T[arraysize];
+            if(newptr!=NULL)
+            {
+                //do I want to assume there is a copy operator? (operator=)
+                //for now, yes - write a more restrictive class later if I don't want it
+                
+                if(hostarrayptrs[arraynum]!=NULL)
+                {
+                    for(I=0;I<N[arraynum]&&I<arraysize;I++)
+                    {
+                        newptr[I] = hostarrayptrs[arraynum][I];
+                    }
+                }
+                for(I=N[arraynum];I<arraysize;I++)
+                {
+                    newptr[I] = def;
+                }
+
+                //cudaFree(hostarrayptrs[arraynum]);
+                delete[] hostarrayptrs[arraynum];
+                hostarrayptrs[arraynum] = newptr;
+                N[arraynum] = arraysize;
+                ret = 1;
+            }
+            else
+            {
+                ret = -1;
+            }
+        }
+        else
+        {
+            ret = 1;
+        }
+    }
+
+    return ret;
+}
+
+};
+
+#endif
--- a/include/amsculib2/cuvect2.hpp
+++ b/include/amsculib2/cuvect2.hpp
@ -0,0 +1,84 @@
+#ifndef __CUVECT2_HPP__
+#define __CUVECT2_HPP__
+
+namespace amscuda
+{
+
+    class cuvect2
+    {
+        public:
+        double x;
+        double y;
+
+
+        __host__ __device__ cuvect2();
+        __host__ __device__ ~cuvect2();
+        __host__ __device__ cuvect2(double _x, double _y);
+
+        __host__ __device__ double& operator[](const int I);
+        __host__ __device__ const double& operator[](const int I) const;
+
+        __host__ __device__ cuvect2 operator+(cuvect2 lhs);
+        __host__ __device__ cuvect2 operator-(cuvect2 lhs);
+        __host__ __device__ cuvect2 operator*(double lhs);
+        __host__ __device__ cuvect2 operator/(double lhs);
+    };
+
+    class cumat2
+    {
+        public:
+        double dat[4];
+
+        __host__ __device__ cumat2();
+        __host__ __device__ ~cumat2();
+        __host__ __device__ double& operator[](const int I);
+        __host__ __device__ double& operator()(const int I, const int J);
+        __host__ __device__ double& at(const int I, const int J);
+
+        __host__ __device__ cumat2 operator+(cumat2 lhs);
+        __host__ __device__ cumat2 operator-(cumat2 lhs);
+        __host__ __device__ cumat2 operator*(double lhs);
+        __host__ __device__ cumat2 operator/(double lhs);
+        __host__ __device__ cuvect2 operator*(cuvect2 lhs);
+        __host__ __device__ cumat2 operator*(cumat2 lhs);
+
+        __host__ __device__ double det();
+        __host__ __device__ cumat2 transpose();
+        __host__ __device__ cumat2 inverse();
+    };
+
+    __host__ __device__ double cuvect2_dot(cuvect2 a, cuvect2 b);
+    __host__ __device__ double cuvect2_cross(cuvect2 a, cuvect2 b);
+    __host__ __device__ double cuvect2_norm(cuvect2 a);
+    __host__ __device__ cuvect2 cuvect2_normalize(cuvect2 a);
+    __host__ __device__ cuvect2 cuvect2_proj(cuvect2 a, cuvect2 b);
+
+    //2x2 matrix operations
+    //matrix order is assumed to be mat[I,J] = mat[I+3*J]
+
+    //transpose a 2x2 matrix in place
+    __host__ __device__ void mat2_transpose(double *mat2inout);
+    
+    //copies src to dest
+    __host__ __device__ void mat2_copy(double *mat2_dest, const double *mat2_src);
+    
+    //inverts mat?inout[4]
+    __host__ __device__ void mat2_inverse(double *mat2inout);
+    
+    //rotatin matrix from angle
+    __host__ __device__ void mat2_rot_from_angle(double angle, double *mat2);
+
+    //multiplies c = a*b
+    __host__ __device__ void mat2_mult(double *mat2a, double *mat2b, double *mat2c);
+
+    // ret = a*b
+    __host__ __device__ cuvect2 mat2_mult(double *mat2a, cuvect2 b);
+
+
+    void test_cuvect2_1();
+    
+
+}; //end namespace amscuda
+
+#endif
+
--- a/include/amsculib2/cuvect2f.hpp
+++ b/include/amsculib2/cuvect2f.hpp
@ -0,0 +1,84 @@
+#ifndef __CUVECT2F_HPP__
+#define __CUVECT2F_HPP__
+
+namespace amscuda
+{
+
+    class cuvect2f
+    {
+        public:
+        float x;
+        float y;
+
+
+        __host__ __device__ cuvect2f();
+        __host__ __device__ ~cuvect2f();
+        __host__ __device__ cuvect2f(float _x, float _y);
+
+        __host__ __device__ float& operator[](const int I);
+        __host__ __device__ const float& operator[](const int I) const;
+
+        __host__ __device__ cuvect2f operator+(cuvect2f lhs);
+        __host__ __device__ cuvect2f operator-(cuvect2f lhs);
+        __host__ __device__ cuvect2f operator*(float lhs);
+        __host__ __device__ cuvect2f operator/(float lhs);
+    };
+
+    class cumat2f
+    {
+        public:
+        float dat[4];
+
+        __host__ __device__ cumat2f();
+        __host__ __device__ ~cumat2f();
+        __host__ __device__ float& operator[](const int I);
+        __host__ __device__ float& operator()(const int I, const int J);
+        __host__ __device__ float& at(const int I, const int J);
+
+        __host__ __device__ cumat2f operator+(cumat2f lhs);
+        __host__ __device__ cumat2f operator-(cumat2f lhs);
+        __host__ __device__ cumat2f operator*(float lhs);
+        __host__ __device__ cumat2f operator/(float lhs);
+        __host__ __device__ cuvect2f operator*(cuvect2f lhs);
+        __host__ __device__ cumat2f operator*(cumat2f lhs);
+
+        __host__ __device__ float det();
+        __host__ __device__ cumat2f transpose();
+        __host__ __device__ cumat2f inverse();
+    };
+
+    __host__ __device__ float cuvect2f_dot(cuvect2f a, cuvect2f b);
+    __host__ __device__ float cuvect2f_cross(cuvect2f a, cuvect2f b);
+    __host__ __device__ float cuvect2f_norm(cuvect2f a);
+    __host__ __device__ cuvect2f cuvect2f_normalize(cuvect2f a);
+    __host__ __device__ cuvect2f cuvect2f_proj(cuvect2f a, cuvect2f b);
+
+    //2x2 matrix operations
+    //matrix order is assumed to be mat[I,J] = mat[I+3*J]
+
+    //transpose a 2x2 matrix in place
+    __host__ __device__ void mat2f_transpose(float *mat2inout);
+    
+    //copies src to dest
+    __host__ __device__ void mat2f_copy(float *mat2f_dest, const float *mat2f_src);
+    
+    //inverts mat?inout[4]
+    __host__ __device__ void mat2f_inverse(float *mat2inout);
+    
+    //rotatin matrix from angle
+    __host__ __device__ void mat2f_rot_from_angle(float angle, float *mat2);
+
+    //multiplies c = a*b
+    __host__ __device__ void mat2f_mult(float *mat2a, float *mat2b, float *mat2c);
+
+    // ret = a*b
+    __host__ __device__ cuvect2f mat2f_mult(float *mat2a, cuvect2f b);
+
+
+    void test_cuvect2f_1();
+
+
+};
+
+#endif
+
--- a/include/amsculib2/cuvect3.hpp
+++ b/include/amsculib2/cuvect3.hpp
@ -0,0 +1,86 @@
+#ifndef __CUVECT3_HPP__
+#define __CUVECT3_HPP__
+
+namespace amscuda
+{
+
+    class cuvect3
+    {
+        public:
+        double x;
+        double y;
+        double z;
+
+        __host__ __device__ cuvect3();
+        __host__ __device__ ~cuvect3();
+        __host__ __device__ cuvect3(double _x, double _y, double _z);
+        
+        
+        __host__ __device__ double& operator[](const int I);
+        __host__ __device__ const double& operator[](const int I) const;
+
+        __host__ __device__ cuvect3 operator+(cuvect3 lhs);
+        __host__ __device__ cuvect3 operator-(cuvect3 lhs);
+        __host__ __device__ cuvect3 operator*(double lhs);
+        __host__ __device__ cuvect3 operator/(double lhs);
+    };
+
+    class cumat3
+    {
+        public:
+        double dat[9];
+
+        __host__ __device__ cumat3();
+        __host__ __device__ ~cumat3();
+        __host__ __device__ double& operator[](const int I);
+        __host__ __device__ double& operator()(const int I, const int J);
+        __host__ __device__ double& at(const int I, const int J);
+
+        __host__ __device__ cumat3 operator+(cumat3 lhs);
+        __host__ __device__ cumat3 operator-(cumat3 lhs);
+        __host__ __device__ cumat3 operator*(double lhs);
+        __host__ __device__ cumat3 operator/(double lhs);
+        __host__ __device__ cuvect3 operator*(cuvect3 lhs);
+        __host__ __device__ cumat3 operator*(cumat3 lhs);
+
+        __host__ __device__ double det();
+        __host__ __device__ cumat3 transpose();
+        __host__ __device__ cumat3 inverse();
+    };
+
+    __host__ __device__ double cuvect3_dot(cuvect3 a, cuvect3 b);
+    __host__ __device__ cuvect3 cuvect3_cross(cuvect3 a, cuvect3 b);
+    __host__ __device__ double cuvect3_norm(cuvect3 a);
+    __host__ __device__ cuvect3 cuvect3_normalize(cuvect3 a);
+    __host__ __device__ cuvect3 cuvect3_proj(cuvect3 a, cuvect3 b);
+
+    //3x3 matrix operations
+    //matrix order is assumed to be mat[I,J] = mat[I+3*J]
+    
+    //transposes a 3x3 (9 element) matrix
+    __host__ __device__ void mat3_transpose(double *mat3inout);
+
+    //copies src to dest
+    __host__ __device__ void mat3_copy(double *mat3_dest, const double *mat3_src);
+
+    //returns determinant of 3x3 matrix
+    __host__ __device__ double mat3_det(double *mat3in);
+
+    //inverts a 3x3 (9 element) matrix
+    __host__ __device__ void mat3_inverse(double *mat3inout);
+
+    __host__ __device__ cuvect3 mat3_mult(double *mat3in, cuvect3 cvin);
+    __host__ __device__ void mat3_mult(double *matina, double *matinb, double *matout);
+
+    __host__ __device__ void mat3_hodgedual(cuvect3 vecin, double *matout);
+    __host__ __device__ void mat3_hodgedual(double *matin, cuvect3 vecout);
+
+    //returns direction cosine rotation matrix from axis and angle
+    __host__ __device__ void mat3_rot_from_axisangle(cuvect3 axis, double angle, double *matout);
+
+    __host__ void test_cudavect_logic1();
+
+}; //end namespace amscuda
+
+#endif
+
--- a/include/amsculib2/cuvect3f.hpp
+++ b/include/amsculib2/cuvect3f.hpp
@ -0,0 +1,86 @@
+#ifndef __CUVECT3F_HPP__
+#define __CUVECT3F_HPP__
+
+namespace amscuda
+{
+
+    class cuvect3f
+    {
+        public:
+        float x;
+        float y;
+        float z;
+
+        __host__ __device__ cuvect3f();
+        __host__ __device__ ~cuvect3f();
+        __host__ __device__ cuvect3f(float _x, float _y, float _z);
+        
+        
+        __host__ __device__ float& operator[](const int I);
+        __host__ __device__ const float& operator[](const int I) const;
+
+        __host__ __device__ cuvect3f operator+(cuvect3f lhs);
+        __host__ __device__ cuvect3f operator-(cuvect3f lhs);
+        __host__ __device__ cuvect3f operator*(float lhs);
+        __host__ __device__ cuvect3f operator/(float lhs);
+    };
+
+    class cumat3f
+    {
+        public:
+        float dat[9];
+
+        __host__ __device__ cumat3f();
+        __host__ __device__ ~cumat3f();
+        __host__ __device__ float& operator[](const int I);
+        __host__ __device__ float& operator()(const int I, const int J);
+        __host__ __device__ float& at(const int I, const int J);
+
+        __host__ __device__ cumat3f operator+(cumat3f lhs);
+        __host__ __device__ cumat3f operator-(cumat3f lhs);
+        __host__ __device__ cumat3f operator*(float lhs);
+        __host__ __device__ cumat3f operator/(float lhs);
+        __host__ __device__ cuvect3f operator*(cuvect3f lhs);
+        __host__ __device__ cumat3f operator*(cumat3f lhs);
+
+        __host__ __device__ float det();
+        __host__ __device__ cumat3f transpose();
+        __host__ __device__ cumat3f inverse();
+    };
+
+    __host__ __device__ float cuvect3f_dot(cuvect3f a, cuvect3f b);
+    __host__ __device__ cuvect3f cuvect3f_cross(cuvect3f a, cuvect3f b);
+    __host__ __device__ float cuvect3f_norm(cuvect3f a);
+    __host__ __device__ cuvect3f cuvect3f_normalize(cuvect3f a);
+    __host__ __device__ cuvect3f cuvect3f_proj(cuvect3f a, cuvect3f b);
+
+    //3x3 matrix operations
+    //matrix order is assumed to be mat[I,J] = mat[I+3*J]
+    
+    //transposes a 3x3 (9 element) matrix
+    __host__ __device__ void mat3f_transpose(float *mat3inout);
+
+    //copies src to dest
+    __host__ __device__ void mat3f_copy(float *mat3f_dest, const float *mat3f_src);
+
+    //returns determinant of 3x3 matrix
+    __host__ __device__ float mat3f_det(float *mat3in);
+
+    //inverts a 3x3 (9 element) matrix
+    __host__ __device__ void mat3f_inverse(float *mat3inout);
+
+    __host__ __device__ cuvect3f mat3f_mult(float *mat3in, cuvect3f cvin);
+    __host__ __device__ void mat3f_mult(float *matina, float *matinb, float *matout);
+
+    __host__ __device__ void mat3f_hodgedual(cuvect3f vecin, float *matout);
+    __host__ __device__ void mat3f_hodgedual(float *matin, cuvect3f vecout);
+
+    //returns direction cosine rotation matrix from axis and angle
+    __host__ __device__ void mat3f_rot_from_axisangle(cuvect3f axis, float angle, float *matout);
+
+    __host__ void test_cudavectf_logic1();
+
+};
+
+#endif
+
--- a/include/amsculib2/cuvect4.hpp
+++ b/include/amsculib2/cuvect4.hpp
@ -0,0 +1,59 @@
+#ifndef __CUVECT4_HPP__
+#define __CUVECT4_HPP__
+
+namespace amscuda
+{
+
+    class cuvect4
+    {
+        public:
+        double x;
+        double y;
+        double z;
+        double w;
+
+        __host__ __device__ cuvect4();
+        __host__ __device__ ~cuvect4();
+        __host__ __device__ cuvect4(double _x, double _y, double _z, double _w);
+                
+        __host__ __device__ double& operator[](const int I);
+        __host__ __device__ const double& operator[](const int I) const;
+
+        __host__ __device__ cuvect4 operator+(cuvect4 lhs);
+        __host__ __device__ cuvect4 operator-(cuvect4 lhs);
+        __host__ __device__ cuvect4 operator*(double lhs);
+        __host__ __device__ cuvect4 operator/(double lhs);
+    };
+
+    class cumat4
+    {
+        public:
+        double dat[16];
+
+        __host__ __device__ cumat4();
+        __host__ __device__ ~cumat4();
+        __host__ __device__ double& operator[](const int I);
+        __host__ __device__ double& operator()(const int I, const int J);
+        __host__ __device__ double& at(const int I, const int J);
+
+        __host__ __device__ cumat4 operator+(cumat4 lhs);
+        __host__ __device__ cumat4 operator-(cumat4 lhs);
+        __host__ __device__ cumat4 operator*(double lhs);
+        __host__ __device__ cumat4 operator/(double lhs);
+        __host__ __device__ cuvect4 operator*(cuvect4 lhs);
+        __host__ __device__ cumat4 operator*(cumat4 lhs);
+
+        __host__ __device__ double det();
+        __host__ __device__ cumat4 transpose();
+        __host__ __device__ cumat4 inverse();
+    };
+
+    __host__ __device__ double cuvect4_dot(cuvect4 a, cuvect4 b);
+    __host__ __device__ double cuvect4_norm(cuvect4 a);
+    __host__ __device__ cuvect4 cuvect4_normalize(cuvect4 a);
+    __host__ __device__ cuvect4 cuvect4_proj(cuvect4 a, cuvect4 b);
+
+}; //end namespace amscuda
+
+#endif
+
--- a/include/amsculib2/cuvect4f.hpp
+++ b/include/amsculib2/cuvect4f.hpp
@ -0,0 +1,60 @@
+#ifndef __CUVECT4F_HPP__
+#define __CUVECT4F_HPP__
+
+namespace amscuda
+{
+
+    class cuvect4f
+    {
+        public:
+        float x;
+        float y;
+        float z;
+        float w;
+
+        __host__ __device__ cuvect4f();
+        __host__ __device__ ~cuvect4f();
+        __host__ __device__ cuvect4f(float _x, float _y, float _z, float _w);
+                
+        __host__ __device__ float& operator[](const int I);
+        __host__ __device__ const float& operator[](const int I) const;
+
+        __host__ __device__ cuvect4f operator+(cuvect4f lhs);
+        __host__ __device__ cuvect4f operator-(cuvect4f lhs);
+        __host__ __device__ cuvect4f operator*(float lhs);
+        __host__ __device__ cuvect4f operator/(float lhs);
+    };
+
+    class cumat4f
+    {
+        public:
+        float dat[16];
+
+        __host__ __device__ cumat4f();
+        __host__ __device__ ~cumat4f();
+        __host__ __device__ float& operator[](const int I);
+        __host__ __device__ float& operator()(const int I, const int J);
+        __host__ __device__ float& at(const int I, const int J);
+
+        __host__ __device__ cumat4f operator+(cumat4f lhs);
+        __host__ __device__ cumat4f operator-(cumat4f lhs);
+        __host__ __device__ cumat4f operator*(float lhs);
+        __host__ __device__ cumat4f operator/(float lhs);
+        __host__ __device__ cuvect4f operator*(cuvect4f lhs);
+        __host__ __device__ cumat4f operator*(cumat4f lhs);
+
+        __host__ __device__ float det();
+        __host__ __device__ cumat4f transpose();
+        __host__ __device__ cumat4f inverse();
+    };
+
+    __host__ __device__ float cuvect4f_dot(cuvect4f a, cuvect4f b);
+    __host__ __device__ float cuvect4f_norm(cuvect4f a);
+    __host__ __device__ cuvect4f cuvect4f_normalize(cuvect4f a);
+    __host__ __device__ cuvect4f cuvect4f_proj(cuvect4f a, cuvect4f b);
+
+
+};
+
+#endif
+
--- a/objstore/amscu_comp128.o
+++ b/objstore/amscu_comp128.o
--- a/objstore/amscu_comp64.o
+++ b/objstore/amscu_comp64.o
--- a/objstore/amscu_cudafunctions.o
+++ b/objstore/amscu_cudafunctions.o
--- a/objstore/amscu_random.o
+++ b/objstore/amscu_random.o
--- a/objstore/amscuarray.o
+++ b/objstore/amscuarray.o
--- a/objstore/amscuarray_dops.o
+++ b/objstore/amscuarray_dops.o
--- a/objstore/amscugeom.o
+++ b/objstore/amscugeom.o
--- a/objstore/amsculib2.o
+++ b/objstore/amsculib2.o
--- a/objstore/amscumath.o
+++ b/objstore/amscumath.o
--- a/objstore/amscurarray.o
+++ b/objstore/amscurarray.o
--- a/objstore/cuvect2.o
+++ b/objstore/cuvect2.o
--- a/objstore/cuvect2f.o
+++ b/objstore/cuvect2f.o
--- a/objstore/cuvect3.o
+++ b/objstore/cuvect3.o
--- a/objstore/cuvect3f.o
+++ b/objstore/cuvect3f.o
--- a/objstore/cuvect4.o
+++ b/objstore/cuvect4.o
--- a/objstore/cuvect4f.o
+++ b/objstore/cuvect4f.o
--- a/run.py
+++ b/run.py
@ -0,0 +1,23 @@
+#!/usr/bin/python3
+
+import os,sys,math;
+from compscripts.complib2 import *;
+
+args = sys.argv
+if(len(args)>=2):
+    if(args[1]=="clean"):
+        obj_list = flist('./objstore',recurse=True,exts=['.o'])
+        for o in obj_list:
+            os.remove('{}'.format(o))
+        exit(0)
+
+os.system('python3 ./compscripts/linux64.makelib.py')
+os.system('python3 ./compscripts/linux64.maketest.py')
+
+# obj_list = flist('./src',recurse=True,exts=['.o'])
+# for o in obj_list:
+#     os.remove('{}'.format(o))
+
+#os.chdir('./bin_linux64')
+callproc('./bin_linux64/test')
+#os.chdir('..')
--- a/run3.py
+++ b/run3.py
@ -0,0 +1,17 @@
+#!/usr/bin/python3
+
+import os,sys,math;
+from compscripts.complib2 import *;
+
+
+os.system('python ./compscripts/winnvcc.makelib.py')
+os.system('python ./compscripts/winnvcc.maketest.py')
+
+obj_list = flist('./',recurse=True,exts=['.o'])
+for o in obj_list:
+    os.remove('{}'.format(o))
+
+#os.chdir('./bin_winx64')
+callproc('.\\bin_winx64\\test.exe')
+#os.chdir('..')
+
--- a/src/amsculib2/amscu_comp128.cu
+++ b/src/amsculib2/amscu_comp128.cu
@ -0,0 +1,476 @@
+#include <amsculib2/amsculib2.hpp>
+
+namespace amscuda
+{
+namespace cmp
+{
+
+__host__ __device__ cucomp128::cucomp128()
+{
+    real = 0.0;
+    imag = 0.0;
+    return;
+}
+
+__host__ __device__ cucomp128::~cucomp128()
+{
+    real = 0.0;
+    imag = 0.0;
+    return;
+}
+
+__host__ __device__ cucomp128::cucomp128(const cucomp128 &other)
+{
+    real = other.real;
+    imag = other.imag;
+
+    return;
+}
+
+__host__ __device__ cucomp128::cucomp128(const double &other)
+{
+    real = other;
+    imag = 0.0;
+    return;
+}
+
+__host__ __device__ cucomp128& cucomp128::operator=(cucomp128& other)
+{
+    real = other.real;
+    imag = other.imag;
+    return *this;
+}
+
+__host__ __device__ const cucomp128& cucomp128::operator=(const cucomp128& other) 
+{
+    this->real = other.real;
+    this->imag = other.imag;
+    return *this;
+}
+
+__host__ __device__ cucomp128& cucomp128::operator=(double& other)
+{
+    real = other;
+    imag = 0.0;
+    return *this;
+}
+
+__host__ __device__ const cucomp128& cucomp128::operator=(const double& other) 
+{
+    this->real = other;
+    this->imag = 0.0;
+    return *this;
+}
+
+__host__ __device__ double& cucomp128::operator[](int& ind)
+{
+    if(ind==0)
+    {
+        return this->real;
+    }
+    else
+    {
+        return this->imag;
+    }
+}
+
+__host__ __device__ const double& cucomp128::operator[](const int& ind) const
+{
+    if(ind==0)
+    {
+        return this->real;
+    }
+    else
+    {
+        return this->imag;
+    }
+}
+
+__host__ __device__ cucomp128 cucomp128::operator+(const cucomp128& z)
+{
+  cucomp128 ret;
+  ret.real = real + z.real;
+  ret.imag = imag + z.imag;
+  return ret;  
+}
+
+__host__ __device__ cucomp128 cucomp128::operator-(const cucomp128& z)
+{
+  cucomp128 ret;
+  ret.real = real - z.real;
+  ret.imag = imag - z.imag;
+  return ret;  
+}
+
+__host__ __device__ cucomp128 cucomp128::operator*(const cucomp128& z)
+{
+  cucomp128 ret;
+  ret.real = (real*z.real - imag*z.imag);
+  ret.imag = (imag*z.real + real*z.imag);
+  return ret;  
+}
+
+__host__ __device__ cucomp128 cucomp128::operator/(const cucomp128& z)
+{
+  cucomp128 ret;
+  double zm2 = z.real*z.real+z.imag*z.imag;
+  
+  if(zm2>0.0)
+  {
+      ret.real = (this->real*z.real+this->imag*z.imag)/zm2;
+      ret.imag = (this->imag*z.real-this->real*z.imag)/zm2;
+  }
+  else
+  {
+      ret.real = (double) finf;
+      ret.imag = (double) finf;
+  }
+
+  return ret;  
+}
+
+__host__ __device__ cucomp128 cucomp128::operator+(const double& z)
+{
+    cucomp128 ret;
+    ret.real = this->real + z;
+    ret.imag = this->imag;
+    return ret;
+}
+__host__ __device__ cucomp128 cucomp128::operator-(const double& z)
+{
+    cucomp128 ret;
+    ret.real = real-z;
+    ret.imag = imag;
+    return ret;
+}
+__host__ __device__ cucomp128 cucomp128::operator*(const double& z)
+{
+    cucomp128 ret;
+    ret.real = real*z;
+    ret.imag = imag*z;
+    return ret;
+}
+__host__ __device__ cucomp128 cucomp128::operator/(const double& z)
+{
+    cucomp128 ret;
+    if(z!=0.0f)
+    {
+        ret.real = real/z;
+        ret.imag = imag/z;
+    }
+    else
+    {
+        ret.real = finf;
+        ret.imag = finf;
+    }
+
+    return ret;
+}
+
+__host__ __device__ bool cucomp128::operator==(const cucomp128& z) const
+{
+    bool ret = 0;
+    if(z.real == real && z.imag == imag)
+    {
+        ret = 1;
+    }
+    return ret;
+}
+
+__host__ __device__ bool cucomp128::operator!=(const cucomp128& z) const
+{
+    return !(*this==z);
+}
+
+//sort first by real value, then by imaginary value
+//this is done so that an ordering exists, as long as two values aren't equal
+__host__ __device__ bool cucomp128::operator>(const cucomp128& z) const
+{
+    bool ret = 0;
+    if(this->real>z.real)
+    {
+        ret = 1;
+    }
+    else if(this->real==z.real)
+    {
+        if(this->imag>z.imag)
+        {
+            ret = 1;
+        }
+        else
+        {
+            ret = 0;
+        }
+    }
+    else
+    {
+        ret = 0;
+    }
+    return ret;
+}
+
+__host__ __device__ bool cucomp128::operator<(const cucomp128& z) const
+{
+    bool ret = 0;
+    if(this->real<z.real)
+    {
+        ret = 1;
+    }
+    else if(this->real==z.real)
+    {
+        if(this->imag<z.imag)
+        {
+            ret = 1;
+        }
+        else
+        {
+            ret = 0;
+        }
+    }
+    else
+    {
+        ret = 0;
+    }
+    return ret;
+}
+
+__host__ __device__ cucomp128 operator-(const cucomp128 &z)
+{
+	cucomp128 ret;
+	ret.real = -z.real;
+	ret.imag = -z.imag;
+	return ret;
+}
+
+__host__ __device__ bool cucomp128::operator>=(const cucomp128& z) const
+{
+    bool ret = (*this==z || *this>z);
+    return ret;
+}
+
+__host__ __device__ bool cucomp128::operator<=(const cucomp128& z) const
+{
+    bool ret = (*this==z || *this<z);
+    return ret;
+}
+
+__host__ __device__ bool cucomp128::isnan() const
+{
+  bool ret = 0;
+  if(::isnan(this->real) || ::isnan(this->imag))
+  {
+    ret = 1;
+  }
+  return ret;
+}
+
+__host__ __device__ bool cucomp128::isinf() const
+{
+  bool ret = 0;
+  //calls math.h isinf()
+  if(::isinf(this->real) || ::isinf(this->imag))
+  {
+    ret = 1;
+  }
+  return ret;
+}
+
+__host__ __device__ bool cucomp128::isreal() const
+{
+    bool ret = 1;
+    if(imag!=0.0f)
+    {
+        ret = 0;
+    }
+    return ret;
+}
+
+__host__ __device__ bool cucomp128::isimag() const
+{
+    bool ret = 1;
+    if(real!=0.0f)
+    {
+        ret = 0;
+    }
+    return ret;
+}
+
+__host__ __device__ bool cucomp128::iszero() const
+{
+    bool ret = 1;
+    if(real!=0.0f || imag!=0.0f)
+    {
+        ret = 0;
+    }
+    return ret;
+}
+
+__host__ __device__ double cucomp128::arg() const
+{
+    double ret = 0.0;
+    ret = (double) amscuda::arg((double)real,(double)imag);
+    return ret;
+}
+
+__host__ __device__ double cucomp128::mag() const
+{
+    double ret = 0.0;
+    ret = ::sqrt(real*real+imag*imag);
+    return ret;
+}
+
+__host__ __device__ cucomp128 cucomp128::conj() const
+{
+    cucomp128 ret;
+    ret.real = real;
+    ret.imag = -imag;
+    return ret;
+}
+
+__host__ __device__ double arg(cucomp128 z)
+{
+    return z.arg();
+}
+
+__host__ __device__ double abs(cucomp128 z)
+{
+    return z.mag();
+}
+
+__host__ __device__ cucomp128 dtocomp(double _r, double _i)
+{
+    cucomp128 ret;
+    ret.real = _r;
+    ret.imag = _i;
+    return ret;
+}
+
+__host__ __device__ double real(cucomp128 z)
+{
+    return z.real;
+}
+
+__host__ __device__ double imag(cucomp128 z)
+{
+    return z.imag;
+}
+
+__host__ __device__ cucomp128 sin(cucomp128 z)
+{
+    cucomp128 ret;
+    cucomp128 im1 = dtocomp(0.0f,1.0f);
+    cucomp128 div = dtocomp(0.0f,2.0f);
+
+    ret = (exp(im1*z)-exp(-im1*z))/div;
+
+    return ret;
+}
+
+__host__ __device__ cucomp128 cos(cucomp128 z)
+{
+    cucomp128 ret;
+    cucomp128 im1 = dtocomp(0.0f,1.0f);
+    cucomp128 div = dtocomp(2.0f,0.0f);
+
+    ret = (exp(im1*z)+exp(-im1*z))/div;
+
+    return ret;
+}
+
+__host__ __device__ cucomp128 tan(cucomp128 z)
+{
+    return sin(z)/cos(z);
+}
+
+__host__ __device__ cucomp128 exp(cucomp128 z)
+{
+    cucomp128 ret;
+    ret.real = ::exp(z.real)*::cos(z.imag);
+    ret.imag = ::exp(z.real)*::sin(z.imag);
+    return ret;
+}
+
+__host__ __device__ cucomp128 log(cucomp128 z)
+{
+    cucomp128 ret;
+    ret.real = ::log(::sqrt(z.real*z.real+z.imag*z.imag));
+    ret.imag = amscuda::arg(z.real,z.imag);
+    return ret;
+}
+
+__host__ __device__ cucomp128 conj(cucomp128 z)
+{
+    return z.conj();
+}
+
+__host__ __device__ cucomp128 cosh(cucomp128 z)
+{
+    cucomp128 ret;
+    cucomp128 div = dtocomp(2.0f,0.0f);
+
+    ret = (exp(z)+exp(-z))/div;
+
+    return ret;
+}
+
+__host__ __device__ cucomp128 sinh(cucomp128 z)
+{
+    cucomp128 ret;
+    cucomp128 div = dtocomp(2.0f,0.0f);
+
+    ret = (exp(z)-exp(-z))/div;
+
+    return ret;
+}
+
+__host__ __device__ cucomp128 tanh(cucomp128 z)
+{
+    return sinh(z)/cosh(z);
+}
+
+__host__ __device__ cucomp128 pow(cucomp128 z1, cucomp128 z2)
+{
+	cucomp128 ret;
+	if(z1.mag()>0.0)
+		ret = exp(z2*log(z1));
+	else
+		ret = dtocomp(0.0f,0.0f);
+	return ret;
+}
+
+
+void test_cucomp128_1()
+{
+    cucomp128 z1;
+    cucomp128 a,b,c;
+    double d1;
+    double f1;
+
+    printf("sizeof double=%ld\n",(long)(8*sizeof(f1)));
+    printf("sizeof double=%ld\n",(long)(8*sizeof(d1)));
+    printf("sizeof complex=%ld\n",(long)(8*sizeof(z1)));
+    printf("sizeof cucomp128=%ld\n",(long)(8*sizeof(a)));
+
+    a = dtocomp(1.0,1.0);
+    b = dtocomp(1.0,-1.0);
+
+    printf("a=%1.4f + %1.4fi\n",a[0],a[1]);
+    printf("b=%1.4f + %1.4fi\n",b[0],b[1]);
+    c = a+b;
+    printf("c=a+b: c=%1.4f + %1.4fi\n",c[0],c[1]);
+    c = a-b;
+    printf("c=a-b: c=%1.4f + %1.4fi\n",c[0],c[1]);
+    c = a*b;
+    printf("c=a*b: c=%1.4f + %1.4fi\n",c[0],c[1]);
+    c = a/b;
+    printf("c=a/b: c=%1.4f + %1.4fi\n",c[0],c[1]);
+    f1 = abs(a);
+    printf("abs(a)=%1.4f\n",f1);
+    f1 = arg(a);
+    printf("abs(a)=%1.4f pi\n",f1/pi);
+
+
+}
+
+}; //end namespace cmp
+}; //end namespace amscuda
--- a/src/amsculib2/amscu_comp64.cu
+++ b/src/amsculib2/amscu_comp64.cu
@ -0,0 +1,476 @@
+#include <amsculib2/amsculib2.hpp>
+
+namespace amscuda
+{
+namespace cmp
+{
+
+__host__ __device__ cucomp64::cucomp64()
+{
+    real = 0.0;
+    imag = 0.0;
+    return;
+}
+
+__host__ __device__ cucomp64::~cucomp64()
+{
+    real = 0.0;
+    imag = 0.0;
+    return;
+}
+
+__host__ __device__ cucomp64::cucomp64(const cucomp64 &other)
+{
+    real = other.real;
+    imag = other.imag;
+
+    return;
+}
+
+__host__ __device__ cucomp64::cucomp64(const float &other)
+{
+    real = other;
+    imag = 0.0;
+    return;
+}
+
+__host__ __device__ cucomp64& cucomp64::operator=(cucomp64& other)
+{
+    real = other.real;
+    imag = other.imag;
+    return *this;
+}
+
+__host__ __device__ const cucomp64& cucomp64::operator=(const cucomp64& other) 
+{
+    this->real = other.real;
+    this->imag = other.imag;
+    return *this;
+}
+
+__host__ __device__ cucomp64& cucomp64::operator=(float& other)
+{
+    real = other;
+    imag = 0.0;
+    return *this;
+}
+
+__host__ __device__ const cucomp64& cucomp64::operator=(const float& other) 
+{
+    this->real = other;
+    this->imag = 0.0;
+    return *this;
+}
+
+__host__ __device__ float& cucomp64::operator[](int& ind)
+{
+    if(ind==0)
+    {
+        return this->real;
+    }
+    else
+    {
+        return this->imag;
+    }
+}
+
+__host__ __device__ const float& cucomp64::operator[](const int& ind) const
+{
+    if(ind==0)
+    {
+        return this->real;
+    }
+    else
+    {
+        return this->imag;
+    }
+}
+
+__host__ __device__ cucomp64 cucomp64::operator+(const cucomp64& z)
+{
+  cucomp64 ret;
+  ret.real = real + z.real;
+  ret.imag = imag + z.imag;
+  return ret;  
+}
+
+__host__ __device__ cucomp64 cucomp64::operator-(const cucomp64& z)
+{
+  cucomp64 ret;
+  ret.real = real - z.real;
+  ret.imag = imag - z.imag;
+  return ret;  
+}
+
+__host__ __device__ cucomp64 cucomp64::operator*(const cucomp64& z)
+{
+  cucomp64 ret;
+  ret.real = (real*z.real - imag*z.imag);
+  ret.imag = (imag*z.real + real*z.imag);
+  return ret;  
+}
+
+__host__ __device__ cucomp64 cucomp64::operator/(const cucomp64& z)
+{
+  cucomp64 ret;
+  float zm2 = z.real*z.real+z.imag*z.imag;
+  
+  if(zm2>0.0)
+  {
+      ret.real = (this->real*z.real+this->imag*z.imag)/zm2;
+      ret.imag = (this->imag*z.real-this->real*z.imag)/zm2;
+  }
+  else
+  {
+      ret.real = (float) finf;
+      ret.imag = (float) finf;
+  }
+
+  return ret;  
+}
+
+__host__ __device__ cucomp64 cucomp64::operator+(const float& z)
+{
+    cucomp64 ret;
+    ret.real = this->real + z;
+    ret.imag = this->imag;
+    return ret;
+}
+__host__ __device__ cucomp64 cucomp64::operator-(const float& z)
+{
+    cucomp64 ret;
+    ret.real = real-z;
+    ret.imag = imag;
+    return ret;
+}
+__host__ __device__ cucomp64 cucomp64::operator*(const float& z)
+{
+    cucomp64 ret;
+    ret.real = real*z;
+    ret.imag = imag*z;
+    return ret;
+}
+__host__ __device__ cucomp64 cucomp64::operator/(const float& z)
+{
+    cucomp64 ret;
+    if(z!=0.0f)
+    {
+        ret.real = real/z;
+        ret.imag = imag/z;
+    }
+    else
+    {
+        ret.real = finf;
+        ret.imag = finf;
+    }
+
+    return ret;
+}
+
+__host__ __device__ bool cucomp64::operator==(const cucomp64& z) const
+{
+    bool ret = 0;
+    if(z.real == real && z.imag == imag)
+    {
+        ret = 1;
+    }
+    return ret;
+}
+
+__host__ __device__ bool cucomp64::operator!=(const cucomp64& z) const
+{
+    return !(*this==z);
+}
+
+//sort first by real value, then by imaginary value
+//this is done so that an ordering exists, as long as two values aren't equal
+__host__ __device__ bool cucomp64::operator>(const cucomp64& z) const
+{
+    bool ret = 0;
+    if(this->real>z.real)
+    {
+        ret = 1;
+    }
+    else if(this->real==z.real)
+    {
+        if(this->imag>z.imag)
+        {
+            ret = 1;
+        }
+        else
+        {
+            ret = 0;
+        }
+    }
+    else
+    {
+        ret = 0;
+    }
+    return ret;
+}
+
+__host__ __device__ bool cucomp64::operator<(const cucomp64& z) const
+{
+    bool ret = 0;
+    if(this->real<z.real)
+    {
+        ret = 1;
+    }
+    else if(this->real==z.real)
+    {
+        if(this->imag<z.imag)
+        {
+            ret = 1;
+        }
+        else
+        {
+            ret = 0;
+        }
+    }
+    else
+    {
+        ret = 0;
+    }
+    return ret;
+}
+
+__host__ __device__ cucomp64 operator-(const cucomp64 &z)
+{
+	cucomp64 ret;
+	ret.real = -z.real;
+	ret.imag = -z.imag;
+	return ret;
+}
+
+__host__ __device__ bool cucomp64::operator>=(const cucomp64& z) const
+{
+    bool ret = (*this==z || *this>z);
+    return ret;
+}
+
+__host__ __device__ bool cucomp64::operator<=(const cucomp64& z) const
+{
+    bool ret = (*this==z || *this<z);
+    return ret;
+}
+
+__host__ __device__ bool cucomp64::isnan() const
+{
+  bool ret = 0;
+  if(::isnan(this->real) || ::isnan(this->imag))
+  {
+    ret = 1;
+  }
+  return ret;
+}
+
+__host__ __device__ bool cucomp64::isinf() const
+{
+  bool ret = 0;
+  //calls math.h isinf()
+  if(::isinf(this->real) || ::isinf(this->imag))
+  {
+    ret = 1;
+  }
+  return ret;
+}
+
+__host__ __device__ bool cucomp64::isreal() const
+{
+    bool ret = 1;
+    if(imag!=0.0f)
+    {
+        ret = 0;
+    }
+    return ret;
+}
+
+__host__ __device__ bool cucomp64::isimag() const
+{
+    bool ret = 1;
+    if(real!=0.0f)
+    {
+        ret = 0;
+    }
+    return ret;
+}
+
+__host__ __device__ bool cucomp64::iszero() const
+{
+    bool ret = 1;
+    if(real!=0.0f || imag!=0.0f)
+    {
+        ret = 0;
+    }
+    return ret;
+}
+
+__host__ __device__ float cucomp64::arg() const
+{
+    float ret = 0.0;
+    ret = (float) amscuda::arg((double)real,(double)imag);
+    return ret;
+}
+
+__host__ __device__ float cucomp64::mag() const
+{
+    float ret = 0.0;
+    ret = ::sqrt(real*real+imag*imag);
+    return ret;
+}
+
+__host__ __device__ cucomp64 cucomp64::conj() const
+{
+    cucomp64 ret;
+    ret.real = real;
+    ret.imag = -imag;
+    return ret;
+}
+
+__host__ __device__ float arg(cucomp64 z)
+{
+    return z.arg();
+}
+
+__host__ __device__ float abs(cucomp64 z)
+{
+    return z.mag();
+}
+
+__host__ __device__ cucomp64 dtocomp64(float _r, float _i)
+{
+    cucomp64 ret;
+    ret.real = _r;
+    ret.imag = _i;
+    return ret;
+}
+
+__host__ __device__ float real(cucomp64 z)
+{
+    return z.real;
+}
+
+__host__ __device__ float imag(cucomp64 z)
+{
+    return z.imag;
+}
+
+__host__ __device__ cucomp64 sin(cucomp64 z)
+{
+    cucomp64 ret;
+    cucomp64 im1 = dtocomp64(0.0f,1.0f);
+    cucomp64 div = dtocomp64(0.0f,2.0f);
+
+    ret = (exp(im1*z)-exp(-im1*z))/div;
+
+    return ret;
+}
+
+__host__ __device__ cucomp64 cos(cucomp64 z)
+{
+    cucomp64 ret;
+    cucomp64 im1 = dtocomp64(0.0f,1.0f);
+    cucomp64 div = dtocomp64(2.0f,0.0f);
+
+    ret = (exp(im1*z)+exp(-im1*z))/div;
+
+    return ret;
+}
+
+__host__ __device__ cucomp64 tan(cucomp64 z)
+{
+    return sin(z)/cos(z);
+}
+
+__host__ __device__ cucomp64 exp(cucomp64 z)
+{
+    cucomp64 ret;
+    ret.real = ::exp(z.real)*::cos(z.imag);
+    ret.imag = ::exp(z.real)*::sin(z.imag);
+    return ret;
+}
+
+__host__ __device__ cucomp64 log(cucomp64 z)
+{
+    cucomp64 ret;
+    ret.real = ::log(::sqrt(z.real*z.real+z.imag*z.imag));
+    ret.imag = amscuda::arg(z.real,z.imag);
+    return ret;
+}
+
+__host__ __device__ cucomp64 conj(cucomp64 z)
+{
+    return z.conj();
+}
+
+__host__ __device__ cucomp64 cosh(cucomp64 z)
+{
+    cucomp64 ret;
+    cucomp64 div = dtocomp64(2.0f,0.0f);
+
+    ret = (exp(z)+exp(-z))/div;
+
+    return ret;
+}
+
+__host__ __device__ cucomp64 sinh(cucomp64 z)
+{
+    cucomp64 ret;
+    cucomp64 div = dtocomp64(2.0f,0.0f);
+
+    ret = (exp(z)-exp(-z))/div;
+
+    return ret;
+}
+
+__host__ __device__ cucomp64 tanh(cucomp64 z)
+{
+    return sinh(z)/cosh(z);
+}
+
+__host__ __device__ cucomp64 pow(cucomp64 z1, cucomp64 z2)
+{
+	cucomp64 ret;
+	if(z1.mag()>0.0)
+		ret = exp(z2*log(z1));
+	else
+		ret = dtocomp64(0.0f,0.0f);
+	return ret;
+}
+
+
+void test_cucomp64_1()
+{
+    cucomp64 z1;
+    cucomp64 a,b,c;
+    double d1;
+    float f1;
+
+    printf("sizeof double=%ld\n",(long)(8*sizeof(f1)));
+    printf("sizeof double=%ld\n",(long)(8*sizeof(d1)));
+    printf("sizeof complex=%ld\n",(long)(8*sizeof(z1)));
+    printf("sizeof cucomp128=%ld\n",(long)(8*sizeof(a)));
+
+    a = dtocomp64(1.0,1.0);
+    b = dtocomp64(1.0,-1.0);
+
+    printf("a=%1.4f + %1.4fi\n",a[0],a[1]);
+    printf("b=%1.4f + %1.4fi\n",b[0],b[1]);
+    c = a+b;
+    printf("c=a+b: c=%1.4f + %1.4fi\n",c[0],c[1]);
+    c = a-b;
+    printf("c=a-b: c=%1.4f + %1.4fi\n",c[0],c[1]);
+    c = a*b;
+    printf("c=a*b: c=%1.4f + %1.4fi\n",c[0],c[1]);
+    c = a/b;
+    printf("c=a/b: c=%1.4f + %1.4fi\n",c[0],c[1]);
+    f1 = abs(a);
+    printf("abs(a)=%1.4f\n",f1);
+    f1 = arg(a);
+    printf("abs(a)=%1.4f pi\n",f1/pi);
+
+
+}
+
+}; //end namespace cmp
+}; //end namespace amscuda
--- a/src/amsculib2/amscu_cudafunctions.cu
+++ b/src/amsculib2/amscu_cudafunctions.cu
@ -0,0 +1,21 @@
+#include <amsculib2/amsculib2.hpp>
+
+namespace amscuda
+{
+
+int cuda_errortrap(const char *msgheader)
+{
+    int ret = 0;
+    cudaError_t err = cudaSuccess;
+
+    err = cudaGetLastError();
+    if(err!=cudaSuccess)
+    {
+        printf("%s :%s\n",msgheader,cudaGetErrorString(err));
+        ret = 1;
+    }
+
+    return ret;
+}
+
+}; //end namespace amscuda
--- a/src/amsculib2/amscu_random.cu
+++ b/src/amsculib2/amscu_random.cu
@ -0,0 +1,222 @@
+#include <amsculib2/amsculib2.hpp>
+
+namespace amscuda
+{
+
+__device__ __host__ float fhash1d_su(float x)
+{
+    float ret;
+    ret = x*(x>0.0f) + -x*(x<0.0f); //sign without conditionals?
+    ret = fmodf(ret,10000.0f); //restrain domain
+    ret = fmodf(ret*(ret+3678.453f)+7890.453f,10000.0f);
+    ret = fmodf(ret*(ret+8927.2134f),10000.0f);
+    ret = fmodf(ret*(ret+3656.234f),10000.0f);
+    //ret = fmodf(ret*(ret+892.2134f),1000.0f);
+    //ret = fmodf(ret,1000.0f);
+    ret = ret/10000.0f;
+    return ret;
+}
+
+
+__device__ __host__ float fhash3d_su(float x, float y=0.0f, float z=0.0f)
+{
+    float ret = 0.0f;
+
+    ret = fhash1d_su(z);
+    ret = fhash1d_su(1000.0f*ret*ret + y);
+    ret = fhash1d_su(1000.0f*ret*ret + x);
+
+    return ret;
+}
+
+__device__ __host__ float fhash4d_su(float x, float y=0.0f, float z=0.0f, float w=0.0f)
+{
+    float ret = 0.0f;
+
+    ret = fhash1d_su(w);
+    ret = fhash1d_su(1000.0f*ret*ret + z);
+    ret = fhash1d_su(1000.0f*ret*ret + y);
+    ret = fhash1d_su(1000.0f*ret*ret + x);
+
+    return ret;
+}
+
+//////////////////////////////////////////////////
+// Deterministic Pseudorandom int32_t Generator //
+//////////////////////////////////////////////////
+
+//Simple 32 bit integer deterministic pseudo-random generator
+// *not* for cryptography
+// Frequency of generated floats should be uniform [0,1)
+
+AMSCU_CONST static const int32_t dpr32_mod = 1<<30-1;
+AMSCU_CONST static const int32_t dpr32_mult = 25137;
+
+//Next seed in simple 32 bit integer deterministic psuedo-rand generator
+__host__ __device__ void dpr32_nextseed(int32_t *rseed_inout)
+{
+    int32_t lseed;
+    if(rseed_inout!=NULL) lseed = *rseed_inout;
+
+    lseed = (lseed*dpr32_mult + 1)%dpr32_mod;
+    lseed = (lseed>=0)*(lseed)+(lseed<0)*(lseed+dpr32_mod); //ensure mod is positive
+    
+    if(rseed_inout!=NULL) *rseed_inout = lseed;
+
+    return;
+}
+
+//Simple 32 bit integer deterministic pseudo-random generator
+// *not* for cryptography
+// Frequency of generated floats should be uniform [0,1)
+__host__ __device__ float dpr32_randf(int32_t *rseed_inout)
+{
+    int32_t lseed = 1;
+    float ret = 0.0f;
+
+    if(rseed_inout!=NULL) lseed = *rseed_inout;
+
+    dpr32_nextseed(&lseed);
+    ret = ((float)(lseed))/((float)dpr32_mod);
+
+    if(rseed_inout!=NULL) *rseed_inout = lseed;
+
+    return ret;
+}
+
+//box muller standard normal variable
+__host__ __device__ float dpr32_randnf(int32_t *rseed_inout)
+{
+    int32_t lseed = 1;
+    float ret = 0.0f;
+    float u1,u2;
+
+    if(rseed_inout!=NULL) lseed = *rseed_inout;
+
+    u1 = dpr32_randf(&lseed);
+    u2 = dpr32_randf(&lseed);
+
+    ret = ::sqrtf(-2.0f*::logf(u1))*::cosf(2.0f*pif*u2);
+
+    if(rseed_inout!=NULL) *rseed_inout = lseed;
+    
+    return ret;
+}
+
+//////////////////////////////////////////////////
+// Deterministic Pseudorandom int64_t Generator //
+//////////////////////////////////////////////////
+
+//"goodenough" deterministic pseudo-random number generator
+//random enough for procedural applications, deterministic, 
+//operates without side-effects for thread safety
+
+AMSCU_CONST const int64_t random_dpr64_mod = (2LL<<31LL)-1LL;
+AMSCU_CONST const int64_t random_dpr64_mult = 1201633LL;
+
+__host__ __device__ void dpr64_nextseed(int64_t *seedinout)
+{
+    int64_t lseed = 0LL;
+    if(seedinout!=NULL) lseed = *seedinout;
+
+    lseed = (random_dpr64_mult*lseed+1LL)%random_dpr64_mod;
+    lseed = (lseed>=0)*(lseed)+(lseed<0)*(lseed+random_dpr64_mod);
+
+    if(seedinout!=NULL) *seedinout = lseed;
+
+    return;
+}
+
+__host__ __device__ double dpr64_randd(int64_t *seedinout)
+{
+    double ret = 0.0;
+    int64_t lseed = 0LL;
+
+    if(seedinout!=NULL) lseed = *seedinout;
+
+    dpr64_nextseed(&lseed);
+    ret = ((double)lseed)/((double)(random_dpr64_mod-1LL));
+
+    if(seedinout!=NULL) *seedinout = lseed;
+
+    return ret;
+}
+
+__host__ __device__ float dpr64_randf(int64_t *seedinout)
+{
+    float ret = 0.0f;
+    int64_t lseed = 0LL;
+
+    if(seedinout!=NULL) lseed = *seedinout;
+
+    dpr64_nextseed(&lseed);
+    ret = ((float)lseed)/((float)(random_dpr64_mod-1LL));
+
+    if(seedinout!=NULL) *seedinout = lseed;
+
+    return ret;
+}
+
+///////////
+// Tests //
+///////////
+
+void test_dprg64()
+{
+    printf("Tests for dprg:\n");
+    long I;
+    int64_t seed = 133LL;
+    double d;
+    float f;
+    cuvect3 qv;
+
+    printf("dpr64_randd test\n");
+    seed = 133LL;
+    for(I=0;I<10;I++)
+    {
+        d = dpr64_randd(&seed);
+        printf("seed: %lld rand: %1.4f\n",(long long)seed,d);
+    }
+
+    printf("\n\n");
+    printf("dpr64_randf test\n");
+    seed = 133LL;
+    for(I=0;I<10;I++)
+    {
+        f = dpr64_randf(&seed);
+        printf("seed: %lld rand: %1.4f\n",(long long)seed,f);
+    }
+
+    return;
+}
+
+void test_dprg32()
+{
+    printf("Tests for dprg:\n");
+    long I;
+    int32_t seed = 133;
+    double d;
+    float f;
+    cuvect3 qv;
+
+    printf("dpr32_randf test\n");
+    seed = 133;
+    for(I=0;I<10;I++)
+    {
+        f = dpr32_randf(&seed);
+        printf("seed: %lld rand: %1.4f\n",(long long)seed,f);
+    }
+
+    printf("\n\ndpr32_randnf test\n");
+    seed = 133;
+    for(I=0;I<10;I++)
+    {
+        f = dpr32_randnf(&seed);
+        printf("seed: %lld rand: %1.4f\n",(long long)seed,f);
+    }
+
+    return;
+}
+
+
+}; //namespace amscuda
--- a/src/amsculib2/amscuarray.cu
+++ b/src/amsculib2/amscuarray.cu
@ -0,0 +1,63 @@
+#include <amsculib2/amsculib2.hpp>
+
+namespace amscuda
+{
+
+__global__ void test_cuarray_sum_kf(cuarray<float> *dq1, float *sum)
+{
+    int I;
+    *sum = 0.0f;
+    for(I=0;I<dq1->length;I++)
+    {
+        *sum = *sum + dq1->data[I];
+    }
+    //*sum = (float)dq1->length;
+    return;
+}
+
+float test_cuarray_sum(cuarray<float> *q1)
+{
+    float ret = 0.0f;
+    int res;
+    cuarray<float> *dq1 = NULL;
+    float *dsum;
+    cudaError_t err = cudaSuccess;
+
+    cudaMalloc(&dsum,sizeof(float));
+    res = q1->device_send(&dq1);
+    printf("error: res=%d\n",res);
+    test_cuarray_sum_kf<<<1,1>>>(dq1,dsum);
+    cudaDeviceSynchronize();
+
+    err = cudaGetLastError();
+    if(err!=cudaSuccess)
+    {
+        printf("test_cuarray_sum Error: %s\n",cudaGetErrorString(err));
+    }
+
+    cudaMemcpy(&ret,dsum,sizeof(float),cudaMemcpyDeviceToHost);
+
+    q1->device_free(&dq1);
+    cudaFree(dsum); dsum = NULL;
+
+    return ret;
+}
+
+void test_cuarray()
+{
+    cuarray<float> q1;
+    
+    int I;
+
+    q1.resize(100);
+    for(I=0;I<q1.length;I++)
+    {
+        q1.data[I] = I;
+    }
+    printf("q1.length=%d\n",q1.length);
+
+    printf("sum of array: %1.6g\n",test_cuarray_sum(&q1));
+
+}
+
+};
--- a/src/amsculib2/amscuarray_dops.cu
+++ b/src/amsculib2/amscuarray_dops.cu
@ -0,0 +1,213 @@
+#include <amsculib2/amsculib2.hpp>
+
+namespace amscuda
+{
+
+//template instantiations
+template float dbuff_sum(float *devbuffer, int N);
+template void dbuff_minmax(float *devbuffer, int N, float *min, float *max);
+
+
+template void dbuff_setall(float *devbuffer, int N, float setto, int nblocks, int nthreads);
+
+
+
+
+//fill devbuffer with random uniform numbers between 0 and 1 using int32_t based generator
+__global__ void dbuff_rand_dpr32_kf(float *devbuffer, int N, int32_t *seeds)
+{
+    int I0 = threadIdx.x + blockIdx.x*blockDim.x;
+    int Is = blockDim.x*gridDim.x;
+    int I;
+
+    int32_t lseed;
+    float f;
+
+    lseed = seeds[I0];
+    for(I=I0;I<N;I=I+Is)
+    {
+        f = dpr32_randf(&lseed);
+        devbuffer[I] = f;
+    }
+
+    return;
+}
+
+void dbuff_rand_dpr32(float *devbuffer, int N, int32_t *rseedinout,  int nblocks, int nthreads)
+{
+    cudaError_t err = cudaSuccess;
+    int I;
+    int32_t *seeds = NULL;
+    int32_t *devseeds = NULL;
+    int32_t lseed;
+
+    if(devbuffer==NULL || N<=0)
+    {
+        return;
+    }
+
+    seeds = new(std::nothrow) int32_t[nblocks*nthreads];
+    cudaMalloc(&devseeds,sizeof(int32_t)*nblocks*nthreads);
+
+    if(rseedinout!=NULL) lseed = *rseedinout; else lseed = 1;
+    for(I=0;I<nblocks*nthreads;I++)
+    {
+        lseed = lseed + I + 1;
+        dpr32_nextseed(&lseed);
+        seeds[I] = lseed;
+    }
+
+    cudaMemcpy(devseeds,seeds,sizeof(int32_t)*nblocks*nthreads,cudaMemcpyHostToDevice);
+    if(rseedinout!=NULL) *rseedinout = lseed;
+
+    dbuff_rand_dpr32_kf<<<nblocks,nthreads>>>(devbuffer,N,devseeds);
+    cudaDeviceSynchronize();
+    err = cudaGetLastError();
+    if(err!=cudaSuccess)
+    {
+        printf("amscu::dbuff_rand_dpr32 error: %s\n",cudaGetErrorString(err));
+    }
+
+    cudaFree(devseeds); devseeds = NULL;
+    delete[] seeds; seeds = NULL;
+
+    return;
+}
+
+__global__ void dbuff_rand_dpr32n_kf(float *devbuffer, int N, int32_t *seeds)
+{
+    int I0 = threadIdx.x + blockIdx.x*blockDim.x;
+    int Is = blockDim.x*gridDim.x;
+    int I;
+
+    int32_t lseed;
+    float f;
+
+    lseed = seeds[I0];
+    for(I=I0;I<N;I=I+Is)
+    {
+        f = dpr32_randnf(&lseed);
+        devbuffer[I] = f;
+    }
+
+    return;
+}
+
+void dbuff_rand_dpr32n(float *devbuffer, int N, int32_t *rseedinout,  int nblocks, int nthreads)
+{
+    cudaError_t err = cudaSuccess;
+    int I;
+    int32_t *seeds = NULL;
+    int32_t *devseeds = NULL;
+    int32_t lseed;
+
+    if(devbuffer==NULL || N<=0)
+    {
+        return;
+    }
+
+    seeds = new(std::nothrow) int32_t[nblocks*nthreads];
+    cudaMalloc(&devseeds,sizeof(int32_t)*nblocks*nthreads);
+
+    if(rseedinout!=NULL) lseed = *rseedinout; else lseed = 1;
+    for(I=0;I<nblocks*nthreads;I++)
+    {
+        lseed = lseed + I + 1;
+        dpr32_nextseed(&lseed);
+        seeds[I] = lseed;
+    }
+
+    cudaMemcpy(devseeds,seeds,sizeof(int32_t)*nblocks*nthreads,cudaMemcpyHostToDevice);
+    if(rseedinout!=NULL) *rseedinout = lseed;
+
+    dbuff_rand_dpr32n_kf<<<nblocks,nthreads>>>(devbuffer,N,devseeds);
+    cudaDeviceSynchronize();
+    err = cudaGetLastError();
+    if(err!=cudaSuccess)
+    {
+        printf("amscu::dbuff_rand_dpr32 error: %s\n",cudaGetErrorString(err));
+    }
+
+    cudaFree(devseeds); devseeds = NULL;
+    delete[] seeds; seeds = NULL;
+
+    return;
+}
+
+
+
+void dbuff_rand_dpr64(float *devbuffer, int N, int64_t *rseedinout,  int nblocks, int nthreads)
+{
+    int I;
+
+    return;
+}
+
+
+
+///////////
+// Tests //
+///////////
+
+void test_dbuff_rand_dpr32()
+{
+    cuarray<float> data;
+    float *dev_data = NULL;
+    int Nx = 5000;
+    int Ny = 5000;
+    cuarray<int> dims;
+    int32_t rseed = 15;
+    FILE *fp = NULL;
+    const char *fname = "./test_scripts/test_dbuff_rand_dpr32.bin";
+
+    clock_t t0,t1,t2;
+    double dt0,dt1;
+
+    printf("Tests of dbuff_rand_dpr32...\n");
+
+    fp = fopen(fname,"w+");
+    if(fp==NULL)
+    {
+        printf("Error: Could not open %s for writing.\n",fname);
+        return;
+    }
+
+    data.resize(Nx*Ny);
+    dims.resize(2);
+    dims[0] = Nx; dims[1] = Ny;
+    cudaMalloc(&dev_data,Nx*Ny*sizeof(float));
+
+    t0 = clock();
+    dbuff_rand_dpr32(dev_data,Nx*Ny,&rseed,256,512);
+    t1 = clock();
+    cudaMemcpy(data.data,dev_data,Nx*Ny*sizeof(float),cudaMemcpyDeviceToHost);
+    t2 = clock();
+
+    dt0 = (double)(t1-t0)/(double)CLOCKS_PER_SEC*1000.0;
+    dt1 = (double)(t2-t1)/(double)CLOCKS_PER_SEC*1000.0;
+    printf("dbuff_rand_dpr32 exec time: %1.3f msec\n",dt0);
+    printf("copy (%d,%d) to host time: %1.3f msec\n",Nx,Ny,dt1);
+    
+    fwrite_ndarray(fp,&dims,&data);
+
+    t0 = clock();
+    dbuff_rand_dpr32n(dev_data,Nx*Ny,&rseed,256,512);
+    t1 = clock();
+    cudaMemcpy(data.data,dev_data,Nx*Ny*sizeof(float),cudaMemcpyDeviceToHost);
+    t2 = clock();
+
+    dt0 = (double)(t1-t0)/(double)CLOCKS_PER_SEC*1000.0;
+    dt1 = (double)(t2-t1)/(double)CLOCKS_PER_SEC*1000.0;
+    printf("dbuff_rand_dpr32n exec time: %1.3f msec\n",dt0);
+    printf("copy (%d,%d) to host time: %1.3f msec\n",Nx,Ny,dt1);
+
+    fwrite_ndarray(fp,&dims,&data);
+
+    fclose(fp);
+
+    cudaFree(dev_data); dev_data = NULL;
+
+}
+
+};
+
--- a/src/amsculib2/amscugeom.cu
+++ b/src/amsculib2/amscugeom.cu
@ -0,0 +1,6 @@
+#include <amsculib2/amsculib2.hpp>
+
+namespace amscuda
+{
+
+};
--- a/src/amsculib2/amsculib2.cu
+++ b/src/amsculib2/amsculib2.cu
@ -0,0 +1,6 @@
+#include <amsculib2/amsculib2.hpp>
+
+namespace amscuda
+{
+
+};
--- a/src/amsculib2/amscumath.cu
+++ b/src/amsculib2/amscumath.cu
@ -0,0 +1,269 @@
+#include <amsculib2/amsculib2.hpp>
+
+namespace amscuda
+{
+
+    __host__ __device__ double dabs(double x)
+    {
+        if(x<0.0)
+        {
+            x = -x;
+        }
+        return x;
+    }
+
+    __host__ __device__ float fabs(float x)
+    {
+        if(x<0.0f)
+        {
+            x = -x;
+        }
+        return x;
+    }
+
+    __host__ __device__ double mod(double x, double md)
+    {
+        x = fmod(x,md);
+        if(x<0.0)
+        {
+            x = x + md;
+        }
+        return x;
+    }
+
+    __host__ __device__ float mod(float x, float md)
+    {
+        x = fmodf(x,md);
+        if(x<0.0f)
+        {
+            x = x + md;
+        }
+        return x;
+    }
+
+    __host__ __device__ int mod(int x, int n)
+    {
+        x = x % n;
+        if(x<0)
+        {
+            x = x + n;
+        }
+        return x;
+    }
+
+    __host__ __device__ long mod(long x, long n)
+    {
+        x = x % n;
+        if(x<0)
+        {
+            x = x + n;
+        }
+        return x;
+    }
+
+    __host__ __device__ int truediv(int x, int y)
+    {
+        int z = 0;
+        if(x>=0 && y>0)
+        {
+            z = x/y;
+        }
+        else if(x<0 && y>0)
+        {
+            z = -((-x)/y) - 1;
+        }
+        else if(x>=0 && y<0)
+        {
+            z = -(x/(-y)) - 1;
+        }
+        else if(x<0 && y<0)
+        {
+            z = ((-x)/(-y));
+        }
+
+        return z;
+    }
+
+    __host__ __device__ long truediv(long x, long y)
+    {
+        int z = 0;
+        if(x>=0 && y>0)
+        {
+            z = x/y;
+        }
+        else if(x<0 && y>0)
+        {
+            z = -((-x)/y) - 1;
+        }
+        else if(x>=0 && y<0)
+        {
+            z = -(x/(-y)) - 1;
+        }
+        else if(x<0 && y<0)
+        {
+            z = ((-x)/(-y));
+        }
+
+        return z;
+    }
+
+
+    template<> __host__ __device__ double min(double a, double b)
+    {
+        if(isnan(a))
+        {
+            return b;
+        }
+        else if(isnan(b))
+        {
+            return a;
+        }
+        else if(a>b)
+        {
+            return b;
+        }
+        else
+        {
+            return a;
+        }
+    }
+
+    template<> __host__ __device__ float min(float a, float b)
+    {
+        if(isnan(a))
+        {
+            return b;
+        }
+        else if(isnan(b))
+        {
+            return a;
+        }
+        else if(a>b)
+        {
+            return b;
+        }
+        else
+        {
+            return a;
+        }
+    }
+
+    template<> __host__ __device__ double max(double a, double b)
+    {
+        if(isnan(a))
+        {
+            return b;
+        }
+        else if(isnan(b))
+        {
+            return a;
+        }
+        else if(a>b)
+        {
+            return a;
+        }
+        else
+        {
+            return b;
+        }
+    }
+
+    template<> __host__ __device__ float max(float a, float b)
+    {
+        if(isnan(a))
+        {
+            return b;
+        }
+        else if(isnan(b))
+        {
+            return a;
+        }
+        else if(a>b)
+        {
+            return a;
+        }
+        else
+        {
+            return b;
+        }
+    }
+
+    __device__ __host__ double arg(double x, double y)
+    {
+        double ret = 0.0;
+        double z = ::sqrt(x*x+y*y);
+
+        if(z>0.0)
+        {
+            if(y<=x && y>=-x)
+            {
+                ret = asin(y/z);
+            }
+            else if(y>=x && y>=-x)
+            {
+                ret = acos(x/z);
+            }
+            else if(y>=x && y<=-x)
+            {
+                ret = pi-asin(y/z);
+            }
+            else
+            {
+                ret = 2.0*pi-acos(x/z);
+            }
+        }
+
+        if(ret<0.0)
+        {
+            ret = 2.0*pi+ret;
+        }
+
+        return ret;
+    }
+
+    __device__ __host__ void get_azel(double x, double y, double z, double *az, double *el)
+    {
+        //int ret = -2; //should never see this return
+        double n, rp;
+        n = ::sqrt(x*x+y*y+z*z);
+        if(n>0.0)
+        {
+            rp = ::sqrt(x*x+y*y);
+            if(rp>0.0)
+            {
+                //ret = 1; //nonzero vector - should work
+                *az = arg(x,y);
+                *el = ::atan(z/rp);
+            }
+            else
+            {
+                //ret = 0; //straight up or straight down
+                if(z>0.0)
+                {
+                    *az = 0.0;
+                    *el = pi/2.0;
+                }
+                else
+                {
+                    *az = 0.0;
+                    *el = -pi/2.0;
+                }
+            }
+        }
+        else
+        {
+            *az = 0.0;
+            *el = 0.0;
+            //ret = -1; //zero vector - no real azimuth/elevation
+        }
+
+        return;
+    }
+
+
+    void test_amscumath1()
+    {
+        printf("pi = %1.16f\n",amscuda::pi);
+    }
+
+};
--- a/src/amsculib2/amscurarray.cu
+++ b/src/amsculib2/amscurarray.cu
@ -0,0 +1,85 @@
+#include <amsculib2/amsculib2.hpp>
+
+namespace amscuda
+{
+
+__global__ void test_amscurarray1_kf1(curarray<int> *q)
+{
+    int I,J,Na;
+    int I0 = threadIdx.x + blockIdx.x*blockDim.x;
+    int Is = blockDim.x*gridDim.x;
+    int N = q->Narrays;
+
+    for(I=I0; I<N;I = I + Is)
+    {
+        Na = q->N[I];
+
+        //printf("I:%d Na: %d\n",I,Na);
+        //q->dev_resizearray(I, Na);
+        for(J=0;J<Na;J++)
+        {
+            q->devarrayptrs[I][J] = J;
+        }
+    }
+}
+
+void test_amscurarray1()
+{
+    int I;
+    cudaError_t err = cudaSuccess;
+
+    printf("test_amscurarray1:\n");
+    curarray<int> *qarray = NULL;
+
+    curarray_new(&qarray,100);
+    err = cudaGetLastError();
+    if(err!=cudaSuccess)
+    {
+        printf("debug error trap 1: %s\n",cudaGetErrorString(err));
+    }
+
+    for(I=0;I<100;I++)
+    {
+        qarray->resizearray(I,5);
+    }
+
+    qarray->push();
+    qarray->pull();
+
+    cuda_errortrap("debug: error trap 2");
+
+    for(I=0;I<5;I++)
+    {
+        printf("array[%d], size %d\n",I,qarray->N[I]);
+    }
+
+    //
+    for(I=0;I<100;I++)
+    {
+        qarray->resizearray(I,I%5);
+        cuda_errortrap("debug: error trap resize2");
+    }
+    
+    qarray->push();
+    qarray->pull();
+    test_amscurarray1_kf1<<<128,1>>>(qarray->devptr);
+
+    cuda_errortrap("debug: error trap kf1");
+
+    qarray->pull();
+    cuda_errortrap("debug: error trap pull2");
+
+
+    for(I=0;I<5;I++)
+    {
+        printf("array[%d], size %d\n",I,qarray->N[I]);
+    }
+
+    curarray_delete(&qarray);
+    
+    cuda_errortrap("debug: error trap 3");
+
+    return;
+}
+
+};
--- a/src/amsculib2/cuvect2.cu
+++ b/src/amsculib2/cuvect2.cu
@ -0,0 +1,361 @@
+#include <amsculib2/amsculib2.hpp>
+
+namespace amscuda
+{
+
+    __host__ __device__ cuvect2::cuvect2()
+    {
+        x = 0.0; y = 0.0;
+        return;
+    }
+
+    __host__ __device__ cuvect2::~cuvect2()
+    {
+        x = 0.0; y = 0.0;
+        return;
+    }
+
+    __host__ __device__ cuvect2::cuvect2(double _x, double _y)
+    {
+        x = _x; y = _y;
+        return;
+    }
+
+    __host__ __device__ double& cuvect2::operator[](const int I)
+    {
+        if(I==0) return x;
+        if(I==1) return y;
+        return x;
+    }
+
+    __host__ __device__ const double& cuvect2::operator[](const int I) const
+    {
+        if(I==0) return x;
+        if(I==1) return y;
+        return x;
+    }
+
+    __host__ __device__ cuvect2 cuvect2::operator+(cuvect2 lhs)
+    {
+        cuvect2 ret;
+        ret.x = x + lhs.x;
+        ret.y = y + lhs.y;
+        return ret;
+    }
+
+    __host__ __device__ cuvect2 cuvect2::operator-(cuvect2 lhs)
+    {
+        cuvect2 ret;
+        ret.x = x - lhs.x;
+        ret.y = y - lhs.y;
+        return ret;
+    }
+    __host__ __device__ cuvect2 cuvect2::operator*(double lhs)
+    {
+        cuvect2 ret;
+        ret.x = x*lhs;
+        ret.y = y*lhs;
+        return ret;
+    }
+
+    __host__ __device__ cuvect2 cuvect2::operator/(double lhs)
+    {
+        cuvect2 ret;
+        ret.x = x/lhs;
+        ret.y = y/lhs;
+        return ret;
+    }
+
+    __host__ __device__ double cuvect2_dot(cuvect2 a, cuvect2 b)
+    {
+        double ret = a.x*b.x+a.y*b.y;
+        return ret;
+    }
+
+    __host__ __device__ double cuvect2_cross(cuvect2 a, cuvect2 b)
+    {
+        double ret = a.x*b.y-a.y*b.x;
+        return ret;
+    }
+
+    __host__ __device__ double cuvect2_norm(cuvect2 a)
+    {
+        double ret = ::sqrt(a.x*a.x+a.y*a.y);
+        return ret;
+    }
+
+    __host__ __device__ cuvect2 cuvect2_normalize(cuvect2 a)
+    {
+        cuvect2 ret;
+        double m = cuvect2_norm(a);
+        if(m>0.0)
+        {
+            ret.x = a.x/m; ret.y = a.y/m;
+        }
+        else
+        {
+            ret.x = 0.0; ret.y = 0.0;
+        }
+        return ret;
+    }
+
+    __host__ __device__ cuvect2 cuvect2_proj(cuvect2 a, cuvect2 b)
+    {
+        cuvect2 ret;
+        cuvect2 bn = cuvect2_normalize(b);
+        double m = cuvect2_dot(a,bn);
+        ret = bn*m;
+        return ret;
+    }
+
+
+__host__ __device__ cumat2::cumat2()
+{
+    int I;
+    for(I=0;I<4;I++)
+    {
+        dat[I] = 0.0;
+    }
+    return;
+}
+
+__host__ __device__ cumat2::~cumat2()
+{
+    int I;
+    for(I=0;I<4;I++)
+    {
+        dat[I] = 0.0;
+    }
+    return;
+}
+
+__host__ __device__ double& cumat2::operator[](const int I)
+{
+    return dat[I];
+}
+
+__host__ __device__ double& cumat2::operator()(const int I, const int J)
+{
+    return dat[I+2*J];
+}
+
+__host__ __device__ cumat2 cumat2::operator+(cumat2 lhs)
+{
+    int I;
+    cumat2 ret;
+    for(I=0;I<4;I++)
+    {
+        ret.dat[I] = this->dat[I] + lhs.dat[I];
+    }
+    return ret;
+}
+
+__host__ __device__ cumat2 cumat2::operator-(cumat2 lhs)
+{
+    int I;
+    cumat2 ret;
+    for(I=0;I<4;I++)
+    {
+        ret.dat[I] = this->dat[I] - lhs.dat[I];
+    }
+    return ret;
+}
+
+__host__ __device__ cumat2 cumat2::operator*(double lhs)
+{
+    cumat2 ret;
+    int I;
+    for(I=0;I<4;I++)
+    {
+        ret.dat[I] = this->dat[I]*lhs;
+    }
+    return ret;
+}
+
+__host__ __device__ cumat2 cumat2::operator/(double lhs)
+{
+    cumat2 ret;
+    int I;
+    for(I=0;I<4;I++)
+    {
+        ret.dat[I] = this->dat[I]/lhs;
+    }
+    return ret;
+}
+
+__host__ __device__ double& cumat2::at(const int I, const int J)
+{
+    return dat[I+2*J];
+}
+        
+
+__host__ __device__ cuvect2 cumat2::operator*(cuvect2 lhs)
+{
+    cuvect2 ret = cuvect2(0.0,0.0);
+    int I,J;
+    for(I=0;I<2;I++)
+    {
+        for(J=0;J<2;J++)
+        {
+            ret[I] = ret[I] + this->at(I,J)*lhs[J];
+        }
+    }
+    return ret;
+}
+
+__host__ __device__ cumat2 cumat2::operator*(cumat2 lhs)
+{
+    cumat2 ret;
+    int I,J,K;
+
+    for(I=0;I<2;I++)
+    {
+        for(J=0;J<2;J++)
+        {
+            ret(I,J) = 0.0;
+            for(K=0;K<2;K++)
+            {
+                ret(I,J) = ret(I,J) + this->at(I,K)*lhs(K,J);
+            }
+        }
+    }
+
+    return ret;
+}
+
+__host__ __device__ double cumat2::det()
+{
+    double ret = 0.0;
+    ret = ret + this->at(0,0)*this->at(1,1);
+    ret = ret - this->at(1,0)*this->at(0,1);
+    return ret;
+}
+
+__host__ __device__ cumat2 cumat2::transpose()
+{
+    cumat2 q;
+    int I,J;
+    for(I=0;I<2;I++)
+    {
+        for(J=0;J<2;J++)
+        {
+            q.at(I,J) = this->at(J,I);
+        }
+    }
+    return q;
+}
+
+__host__ __device__ cumat2 cumat2::inverse()
+{
+    cumat2 q;
+    double dt = q.det();
+    if(dt!=0.0)
+    {
+        q(0,0) = this->at(1,1)/dt;
+        q(0,1) = -this->at(0,1)/dt;
+        q(1,0) = -this->at(1,0)/dt;
+        q(1,1) = this->at(0,0)/dt;
+    }
+    else
+    {
+        q(0,0) = inf;
+        q(0,1) = inf;
+        q(1,0) = inf;
+        q(1,1) = inf;
+    }
+
+    return q;
+}
+
+//2x2 matrix operations
+//matrix order is assumed to be mat[I,J] = mat[I+3*J]
+
+//transpose a 2x2 matrix in place
+__host__ __device__ void mat2_transpose(double *mat2inout)
+{
+    double mat2_in[4];
+
+    mat2_copy(mat2_in,mat2inout);
+    mat2inout[0+0*2] = mat2_in[0+0*2];
+    mat2inout[1+0*2] = mat2_in[0+1*2];
+    mat2inout[0+1*2] = mat2_in[1+0*2];
+    mat2inout[1+1*2] = mat2_in[1+1*2];
+    
+    return;
+}
+
+//copies src to dest
+__host__ __device__ void mat2_copy(double *mat2_dest, const double *mat2_src)
+{
+    mat2_dest[0+0*2] = mat2_src[0+0*2];
+    mat2_dest[1+0*2] = mat2_src[1+0*2];
+    mat2_dest[0+1*2] = mat2_src[0+1*2];
+    mat2_dest[1+1*2] = mat2_src[1+1*2];
+    
+    return;
+}
+
+//inverts mat?inout[4]
+__host__ __device__ void mat2_inverse(double *mat2inout)
+{
+    double det = mat2inout[0+0*2]*mat2inout[1+1*2]-mat2inout[0+1*2]*mat2inout[1+0*2];
+    double mat2in[4];
+
+    mat2_copy(mat2in,mat2inout);
+    mat2inout[0+0*2] = mat2inout[1+1*2]/det;
+    mat2inout[1+0*2] = -mat2inout[1+0*2]/det;
+    mat2inout[0+1*2] = -mat2inout[0+1*2]/det;
+    mat2inout[1+1*2] = mat2inout[0+0*2]/det;
+    
+    return;
+}
+
+//rotatin matrix from angle
+__host__ __device__ void mat2_rot_from_angle(double angle, double *mat2)
+{
+    mat2[0+0*2] = ::cos(angle);
+    mat2[1+0*2] = ::sin(angle);
+    mat2[0+1*2] = -::sin(angle);
+    mat2[1+1*2] = ::cos(angle);
+    return;
+}
+
+//multiplies c = a*b
+__host__ __device__ void mat2_mult(double *mat2a, double *mat2b, double *mat2c)
+{
+    double mat2a_in[4];
+    double mat2b_in[4];
+
+    if(mat2a==NULL || mat2b==NULL || mat2c==NULL)
+    {
+        return;
+    }
+
+    mat2_copy(mat2a_in,mat2a);
+    mat2_copy(mat2b_in,mat2b);
+    
+    mat2c[0+0*2] = mat2a_in[0+0*2]*mat2b_in[0+0*2] + mat2a_in[1+0*2]*mat2b_in[0+1*2];
+    mat2c[1+0*2] = mat2a_in[0+0*2]*mat2b_in[1+0*2] + mat2a_in[1+0*2]*mat2b_in[1+1*2];
+    mat2c[0+1*2] = mat2a_in[0+1*2]*mat2b_in[0+0*2] + mat2a_in[1+1*2]*mat2b_in[0+1*2];
+    mat2c[1+1*2] = mat2a_in[0+1*2]*mat2b_in[1+0*2] + mat2a_in[1+1*2]*mat2b_in[1+1*2];
+
+    return;
+}
+
+// ret = a*b
+__host__ __device__ cuvect2 mat2_mult(double *mat2a, cuvect2 b)
+{
+    cuvect2 ret;
+    ret.x = b.x*mat2a[0+0*2] + b.y*mat2a[1+0*2];
+    ret.y = b.x*mat2a[0+1*2] + b.y*mat2a[1+1*2];
+    return ret;
+}
+
+void test_cuvect2_1()
+{
+
+
+    return;
+}
+
+};
--- a/src/amsculib2/cuvect2f.cu
+++ b/src/amsculib2/cuvect2f.cu
@ -0,0 +1,361 @@
+#include <amsculib2/amsculib2.hpp>
+
+namespace amscuda
+{
+
+    __host__ __device__ cuvect2f::cuvect2f()
+    {
+        x = 0.0; y = 0.0;
+        return;
+    }
+
+    __host__ __device__ cuvect2f::~cuvect2f()
+    {
+        x = 0.0; y = 0.0;
+        return;
+    }
+
+    __host__ __device__ cuvect2f::cuvect2f(float _x, float _y)
+    {
+        x = _x; y = _y;
+        return;
+    }
+
+    __host__ __device__ float& cuvect2f::operator[](const int I)
+    {
+        if(I==0) return x;
+        if(I==1) return y;
+        return x;
+    }
+
+    __host__ __device__ const float& cuvect2f::operator[](const int I) const
+    {
+        if(I==0) return x;
+        if(I==1) return y;
+        return x;
+    }
+
+    __host__ __device__ cuvect2f cuvect2f::operator+(cuvect2f lhs)
+    {
+        cuvect2f ret;
+        ret.x = x + lhs.x;
+        ret.y = y + lhs.y;
+        return ret;
+    }
+
+    __host__ __device__ cuvect2f cuvect2f::operator-(cuvect2f lhs)
+    {
+        cuvect2f ret;
+        ret.x = x - lhs.x;
+        ret.y = y - lhs.y;
+        return ret;
+    }
+    __host__ __device__ cuvect2f cuvect2f::operator*(float lhs)
+    {
+        cuvect2f ret;
+        ret.x = x*lhs;
+        ret.y = y*lhs;
+        return ret;
+    }
+
+    __host__ __device__ cuvect2f cuvect2f::operator/(float lhs)
+    {
+        cuvect2f ret;
+        ret.x = x/lhs;
+        ret.y = y/lhs;
+        return ret;
+    }
+
+    __host__ __device__ float cuvect2f_dot(cuvect2f a, cuvect2f b)
+    {
+        float ret = a.x*b.x+a.y*b.y;
+        return ret;
+    }
+
+    __host__ __device__ float cuvect2f_cross(cuvect2f a, cuvect2f b)
+    {
+        float ret = a.x*b.y-a.y*b.x;
+        return ret;
+    }
+
+    __host__ __device__ float cuvect2f_norm(cuvect2f a)
+    {
+        float ret = ::sqrtf(a.x*a.x+a.y*a.y);
+        return ret;
+    }
+
+    __host__ __device__ cuvect2f cuvect2f_normalize(cuvect2f a)
+    {
+        cuvect2f ret;
+        float m = cuvect2f_norm(a);
+        if(m>0.0)
+        {
+            ret.x = a.x/m; ret.y = a.y/m;
+        }
+        else
+        {
+            ret.x = 0.0; ret.y = 0.0;
+        }
+        return ret;
+    }
+
+    __host__ __device__ cuvect2f cuvect2f_proj(cuvect2f a, cuvect2f b)
+    {
+        cuvect2f ret;
+        cuvect2f bn = cuvect2f_normalize(b);
+        float m = cuvect2f_dot(a,bn);
+        ret = bn*m;
+        return ret;
+    }
+
+
+__host__ __device__ cumat2f::cumat2f()
+{
+    int I;
+    for(I=0;I<4;I++)
+    {
+        dat[I] = 0.0;
+    }
+    return;
+}
+
+__host__ __device__ cumat2f::~cumat2f()
+{
+    int I;
+    for(I=0;I<4;I++)
+    {
+        dat[I] = 0.0;
+    }
+    return;
+}
+
+__host__ __device__ float& cumat2f::operator[](const int I)
+{
+    return dat[I];
+}
+
+__host__ __device__ float& cumat2f::operator()(const int I, const int J)
+{
+    return dat[I+2*J];
+}
+
+__host__ __device__ cumat2f cumat2f::operator+(cumat2f lhs)
+{
+    int I;
+    cumat2f ret;
+    for(I=0;I<4;I++)
+    {
+        ret.dat[I] = this->dat[I] + lhs.dat[I];
+    }
+    return ret;
+}
+
+__host__ __device__ cumat2f cumat2f::operator-(cumat2f lhs)
+{
+    int I;
+    cumat2f ret;
+    for(I=0;I<4;I++)
+    {
+        ret.dat[I] = this->dat[I] - lhs.dat[I];
+    }
+    return ret;
+}
+
+__host__ __device__ cumat2f cumat2f::operator*(float lhs)
+{
+    cumat2f ret;
+    int I;
+    for(I=0;I<4;I++)
+    {
+        ret.dat[I] = this->dat[I]*lhs;
+    }
+    return ret;
+}
+
+__host__ __device__ cumat2f cumat2f::operator/(float lhs)
+{
+    cumat2f ret;
+    int I;
+    for(I=0;I<4;I++)
+    {
+        ret.dat[I] = this->dat[I]/lhs;
+    }
+    return ret;
+}
+
+__host__ __device__ float& cumat2f::at(const int I, const int J)
+{
+    return dat[I+2*J];
+}
+        
+
+__host__ __device__ cuvect2f cumat2f::operator*(cuvect2f lhs)
+{
+    cuvect2f ret = cuvect2f(0.0,0.0);
+    int I,J;
+    for(I=0;I<2;I++)
+    {
+        for(J=0;J<2;J++)
+        {
+            ret[I] = ret[I] + this->at(I,J)*lhs[J];
+        }
+    }
+    return ret;
+}
+
+__host__ __device__ cumat2f cumat2f::operator*(cumat2f lhs)
+{
+    cumat2f ret;
+    int I,J,K;
+
+    for(I=0;I<2;I++)
+    {
+        for(J=0;J<2;J++)
+        {
+            ret(I,J) = 0.0;
+            for(K=0;K<2;K++)
+            {
+                ret(I,J) = ret(I,J) + this->at(I,K)*lhs(K,J);
+            }
+        }
+    }
+
+    return ret;
+}
+
+__host__ __device__ float cumat2f::det()
+{
+    float ret = 0.0;
+    ret = ret + this->at(0,0)*this->at(1,1);
+    ret = ret - this->at(1,0)*this->at(0,1);
+    return ret;
+}
+
+__host__ __device__ cumat2f cumat2f::transpose()
+{
+    cumat2f q;
+    int I,J;
+    for(I=0;I<2;I++)
+    {
+        for(J=0;J<2;J++)
+        {
+            q.at(I,J) = this->at(J,I);
+        }
+    }
+    return q;
+}
+
+__host__ __device__ cumat2f cumat2f::inverse()
+{
+    cumat2f q;
+    float dt = q.det();
+    if(dt!=0.0)
+    {
+        q(0,0) = this->at(1,1)/dt;
+        q(0,1) = -this->at(0,1)/dt;
+        q(1,0) = -this->at(1,0)/dt;
+        q(1,1) = this->at(0,0)/dt;
+    }
+    else
+    {
+        q(0,0) = inf;
+        q(0,1) = inf;
+        q(1,0) = inf;
+        q(1,1) = inf;
+    }
+
+    return q;
+}
+
+//2x2 matrix operations
+//matrix order is assumed to be mat[I,J] = mat[I+3*J]
+
+//transpose a 2x2 matrix in place
+__host__ __device__ void mat2f_transpose(float *mat2inout)
+{
+    float mat2f_in[4];
+
+    mat2f_copy(mat2f_in,mat2inout);
+    mat2inout[0+0*2] = mat2f_in[0+0*2];
+    mat2inout[1+0*2] = mat2f_in[0+1*2];
+    mat2inout[0+1*2] = mat2f_in[1+0*2];
+    mat2inout[1+1*2] = mat2f_in[1+1*2];
+    
+    return;
+}
+
+//copies src to dest
+__host__ __device__ void mat2f_copy(float *mat2f_dest, const float *mat2f_src)
+{
+    mat2f_dest[0+0*2] = mat2f_src[0+0*2];
+    mat2f_dest[1+0*2] = mat2f_src[1+0*2];
+    mat2f_dest[0+1*2] = mat2f_src[0+1*2];
+    mat2f_dest[1+1*2] = mat2f_src[1+1*2];
+    
+    return;
+}
+
+//inverts mat?inout[4]
+__host__ __device__ void mat2f_inverse(float *mat2inout)
+{
+    float det = mat2inout[0+0*2]*mat2inout[1+1*2]-mat2inout[0+1*2]*mat2inout[1+0*2];
+    float mat2in[4];
+
+    mat2f_copy(mat2in,mat2inout);
+    mat2inout[0+0*2] = mat2inout[1+1*2]/det;
+    mat2inout[1+0*2] = -mat2inout[1+0*2]/det;
+    mat2inout[0+1*2] = -mat2inout[0+1*2]/det;
+    mat2inout[1+1*2] = mat2inout[0+0*2]/det;
+    
+    return;
+}
+
+//rotatin matrix from angle
+__host__ __device__ void mat2f_rot_from_angle(float angle, float *mat2)
+{
+    mat2[0+0*2] = ::cosf(angle);
+    mat2[1+0*2] = ::sinf(angle);
+    mat2[0+1*2] = -::sinf(angle);
+    mat2[1+1*2] = ::cosf(angle);
+    return;
+}
+
+//multiplies c = a*b
+__host__ __device__ void mat2f_mult(float *mat2a, float *mat2b, float *mat2c)
+{
+    float mat2a_in[4];
+    float mat2b_in[4];
+
+    if(mat2a==NULL || mat2b==NULL || mat2c==NULL)
+    {
+        return;
+    }
+
+    mat2f_copy(mat2a_in,mat2a);
+    mat2f_copy(mat2b_in,mat2b);
+    
+    mat2c[0+0*2] = mat2a_in[0+0*2]*mat2b_in[0+0*2] + mat2a_in[1+0*2]*mat2b_in[0+1*2];
+    mat2c[1+0*2] = mat2a_in[0+0*2]*mat2b_in[1+0*2] + mat2a_in[1+0*2]*mat2b_in[1+1*2];
+    mat2c[0+1*2] = mat2a_in[0+1*2]*mat2b_in[0+0*2] + mat2a_in[1+1*2]*mat2b_in[0+1*2];
+    mat2c[1+1*2] = mat2a_in[0+1*2]*mat2b_in[1+0*2] + mat2a_in[1+1*2]*mat2b_in[1+1*2];
+
+    return;
+}
+
+// ret = a*b
+__host__ __device__ cuvect2f mat2f_mult(float *mat2a, cuvect2f b)
+{
+    cuvect2f ret;
+    ret.x = b.x*mat2a[0+0*2] + b.y*mat2a[1+0*2];
+    ret.y = b.x*mat2a[0+1*2] + b.y*mat2a[1+1*2];
+    return ret;
+}
+
+void test_cuvect2f_1()
+{
+
+
+    return;
+}
+
+};
--- a/src/amsculib2/cuvect3.cu
+++ b/src/amsculib2/cuvect3.cu
@ -0,0 +1,581 @@
+#include <amsculib2/amsculib2.hpp>
+
+namespace amscuda
+{
+
+    __host__ __device__ cuvect3::cuvect3()
+    {
+        x = 0.0; y = 0.0; z = 0.0;
+        return;
+    }
+
+    __host__ __device__ cuvect3::~cuvect3()
+    {
+        x = 0.0; y = 0.0; z = 0.0;
+        return;
+    }
+
+    __host__ __device__ double& cuvect3::operator[](const int I)
+    {
+        if(I==0) return x;
+        if(I==1) return y;
+        if(I==2) return z;
+        return x;
+    }
+
+    __host__ __device__ const double& cuvect3::operator[](const int I) const
+    {
+        if(I==0) return x;
+        if(I==1) return y;
+        if(I==2) return z;
+        return x;
+    }
+
+    __host__ __device__ cuvect3 cuvect3::operator+(cuvect3 lhs)
+    {
+        cuvect3 ret;
+        ret.x = x+lhs.x;
+        ret.y = y+lhs.y;
+        ret.z = z+lhs.z;
+
+        return ret;
+    }
+
+    __host__ __device__ cuvect3 cuvect3::operator-(cuvect3 lhs)
+    {
+        cuvect3 ret;
+        ret.x = x-lhs.x;
+        ret.y = y-lhs.y;
+        ret.z = z-lhs.z;
+
+        return ret;
+    }
+
+    __host__ __device__ cuvect3 cuvect3::operator*(double lhs)
+    {
+        cuvect3 ret;
+        ret.x = x*lhs;
+        ret.y = y*lhs;
+        ret.z = z*lhs;
+        return ret;
+    }
+
+    __host__ __device__ cuvect3 cuvect3::operator/(double lhs)
+    {
+        cuvect3 ret;
+        ret.x = x/lhs;
+        ret.y = y/lhs;
+        ret.z = z/lhs;
+        return ret;
+    }
+
+    __host__ __device__ cuvect3::cuvect3(double _x, double _y, double _z)
+    {
+        x = _x; y = _y; z = _z;
+        return;
+    }
+
+
+    __host__ __device__ double cuvect3_dot(cuvect3 a, cuvect3 b)
+    {
+        double ret = a.x*b.x+a.y*b.y+a.z*b.z;
+
+        return ret;
+    }
+
+    __host__ __device__ cuvect3 cuvect3_cross(cuvect3 a, cuvect3 b)
+    {
+        cuvect3 ret;
+        ret[0] = a[1]*b[2]-a[2]*b[1];
+        ret[1] = a[2]*b[0]-a[0]*b[2];
+        ret[2] = a[0]*b[1]-a[1]*b[0];
+
+        return ret;
+    }
+
+    __host__ __device__ double cuvect3_norm(cuvect3 a)
+    {
+        double ret;
+        ret = ::sqrt(a.x*a.x+a.y*a.y+a.z*a.z);
+        return ret;
+    }
+
+    __host__ __device__ cuvect3 cuvect3_normalize(cuvect3 a)
+    {
+        cuvect3 ret;
+        double m;
+        m = ::sqrt(a.x*a.x+a.y*a.y+a.z*a.z);
+        if(m>0.0)
+        {
+            ret.x = a.x/m; ret.y = a.y/m; ret.z = a.z/m;
+        }
+        else
+        {
+            ret.x = 0.0; ret.y = 0.0; ret.z = 0.0;
+        }
+
+        return ret;
+    }
+
+    __host__ __device__ cuvect3 cuvect3_proj(cuvect3 a, cuvect3 b)
+    {
+        cuvect3 ret;
+        cuvect3 bn = cuvect3_normalize(b);
+        double m = cuvect3_dot(a,bn);
+        ret = bn*m;
+        return ret;
+    }
+
+//transposes a 3x3 (9 element) matrix
+__host__ __device__ void mat3_transpose(double *mat3inout)
+{
+    int I,J;
+    double matint[9];
+    for(I=0;I<9;I++)
+    {
+        matint[I] = mat3inout[I];
+    }
+
+    for(I=0;I<3;I++)
+    {
+        for(J=0;J<3;J++)
+        {
+            mat3inout[I+J*3] = matint[J+I*3];
+        }
+    }
+
+    return;
+}
+
+//copies src to dest
+__host__ __device__ void mat3_copy(double *mat3_dest, const double *mat3_src)
+{
+    int I;
+    if(mat3_dest==NULL || mat3_src==NULL)
+        return;
+
+    for(I=0;I<9;I++)
+        mat3_dest[I] = mat3_src[I];
+
+    return;
+}
+
+
+__host__ __device__ double mat3_det(double *mat3in)
+{
+    double ret = 0.0;
+
+    ret = ret + mat3in[0+0*3]*mat3in[1+1*3]*mat3in[2+2*3];
+    ret = ret + mat3in[0+1*3]*mat3in[1+2*3]*mat3in[2+0*3];
+    ret = ret + mat3in[0+2*3]*mat3in[1+0*3]*mat3in[2+1*3];
+    ret = ret - mat3in[0+0*3]*mat3in[1+2*3]*mat3in[2+1*3];
+    ret = ret - mat3in[0+1*3]*mat3in[1+0*3]*mat3in[2+2*3];
+    ret = ret - mat3in[0+2*3]*mat3in[1+1*3]*mat3in[2+0*3];
+
+    return ret;
+}
+
+//inverts a 3x3 (9 element) matrix
+__host__ __device__ void mat3_inverse(double *mat3inout)
+{
+    int I;
+    double matint[9];
+    double det = mat3_det(mat3inout);
+
+    for(I=0;I<9;I++)
+    {
+        matint[I] = mat3inout[I];
+    }
+
+    mat3inout[0+0*3] =  (matint[1+1*3]*matint[2+2*3]-matint[1+2*3]*matint[2+1*3])/det;
+    mat3inout[0+1*3] = -(matint[1+0*3]*matint[2+2*3]-matint[1+2*3]*matint[2+0*3])/det;
+    mat3inout[0+2*3] =  (matint[1+0*3]*matint[2+1*3]-matint[1+1*3]*matint[2+0*3])/det;
+    mat3inout[1+0*3] = -(matint[0+1*3]*matint[2+2*3]-matint[0+2*3]*matint[2+1*3])/det;
+    mat3inout[1+1*3] =  (matint[0+0*3]*matint[2+2*3]-matint[0+2*3]*matint[2+0*3])/det;
+    mat3inout[1+2*3] = -(matint[0+0*3]*matint[2+1*3]-matint[0+1*3]*matint[2+0*3])/det;
+    mat3inout[2+0*3] =  (matint[0+1*3]*matint[1+2*3]-matint[0+2*3]*matint[1+1*3])/det;
+    mat3inout[2+1*3] = -(matint[0+0*3]*matint[1+2*3]-matint[0+2*3]*matint[1+0*3])/det;
+    mat3inout[2+2*3] =  (matint[0+0*3]*matint[1+1*3]-matint[0+1*3]*matint[1+0*3])/det;
+
+    mat3_transpose(mat3inout);
+
+    return;
+}
+
+__host__ __device__ cuvect3 mat3_mult(double *mat3in, cuvect3 cvin)
+{
+    int I,J;
+    cuvect3 ret;
+    for(I=0;I<3;I++)
+    {
+        ret[I] = 0.0;
+        for(J=0;J<3;J++)
+        {
+            ret[I] = ret[I] + mat3in[I+3*J]*cvin[J];
+        }
+    }
+
+    return ret;
+}
+
+__host__ __device__ void mat3_mult(double *matina, double *matinb, double *matout)
+{
+    double wrk[9];
+    int I,J,K;
+
+    for(I=0;I<3;I++)
+    {
+        for(J=0;J<3;J++)
+        {
+            wrk[I+3*J] = 0.0;
+        }
+    }
+
+    for(I=0;I<3;I++)
+    {
+        for(J=0;J<3;J++)
+        {
+            for(K=0;K<3;K++)
+            {
+                wrk[I+3*K] = wrk[I+3*K] + matina[I+3*J]*matinb[J+3*K];
+            }
+        }
+    }
+
+    for(I=0;I<3;I++)
+    {
+        for(J=0;J<3;J++)
+        {
+            matout[I+3*J] = wrk[I+3*J];
+        }
+    }
+
+    return;
+}
+
+__host__ __device__ void mat3_hodgedual(cuvect3 vecin, double *matout)
+{
+    matout[0 + 0*3] = 0.0f;
+    matout[1 + 0*3] = -vecin[2];
+    matout[2 + 0*3] = vecin[1];
+
+    matout[0 + 1*3] = vecin[2];
+    matout[1 + 1*3] = 0.0f;
+    matout[2 + 1*3] = -vecin[0];
+
+    matout[0 + 2*3] = -vecin[1];
+    matout[1 + 2*3] = vecin[0];
+    matout[2 + 2*3] = 0.0f;
+    return;
+}
+
+__host__ __device__ void mat3_hodgedual(double *matin, cuvect3 vecout)
+{
+    vecout[0] = 0.5*(matin[1 + 2*3] - matin[2 + 1*3]);
+    vecout[1] = 0.5*(matin[2 + 0*3] - matin[0 + 2*3]);
+    vecout[2] = 0.5*(matin[0 + 1*3] - matin[1 + 0*3]);
+    
+    return;
+}
+
+//returns direction cosine rotation matrix from axis and angle
+__host__ __device__ void mat3_rot_from_axisangle(cuvect3 axis, double angle, double *matout)
+{
+    int I;
+    double H[9];
+    double Hsq[9];
+    double II[9];
+
+    for(I=0;I<9;I++) II[I] = 0.0;
+    II[0+0*3] = 1.0;
+    II[1+1*3] = 1.0;
+    II[2+2*3] = 1.0;
+
+    axis = cuvect3_normalize(axis);
+    
+    mat3_hodgedual(axis,H);
+    mat3_mult(H,H,Hsq);
+
+    for(I=0;I<9;I++)
+    {
+        matout[I] = (II[I] + Hsq[I]) + H[I]*sin(angle) - Hsq[I]*cos(angle);
+    }
+
+    return;
+}
+
+
+
+__host__ void test_cudavect_logic1()
+{
+    //3 dim vector and matrix functional tests on host side
+
+    printf("3 dim vector and matrix functional tests on host side\n");
+    
+    cuvect3 a,b,c;
+    double ma[9],mb[9],mc[9];
+
+    int I,J;
+
+    for(I=0;I<3;I++)
+    {
+        for(J=0;J<3;J++)
+        {
+            ma[I+3*J] = ((double) rand())/((double) RAND_MAX);
+            mb[I+3*J] = ma[I+3*J];
+        }
+    }
+
+    mat3_inverse(mb);
+    mat3_mult(ma,mb,mc);
+
+    for(I=0;I<3;I++)
+    {
+        for(J=0;J<3;J++)
+        {
+            printf("ma[%d,%d] = %1.3f\n",I,J,ma[I+3*J]);
+        }
+    }
+    for(I=0;I<3;I++)
+    {
+        for(J=0;J<3;J++)
+        {
+            printf("mb[%d,%d] = %1.3f\n",I,J,mb[I+3*J]);
+        }
+    }
+    for(I=0;I<3;I++)
+    {
+        for(J=0;J<3;J++)
+        {
+            printf("mc[%d,%d] = %1.3f\n",I,J,mc[I+3*J]);
+        }
+    }
+
+    a = cuvect3(1,1,1);
+    b = mat3_mult(ma,a);
+    b = mat3_mult(mb,b);
+    
+    for(I=0;I<3;I++)
+    {
+        printf("a[%d] = %1.3f, b[%d] = %1.3f\n",I,a[I],I,b[I]);
+    }
+
+    a = cuvect3(1,0,1);
+    b = cuvect3(0,1,-1);
+    c = a+b;
+
+    for(I=0;I<3;I++)
+    {
+        printf("a[%d] = %1.3f, b[%d] = %1.3f, c[%d] = %1.3f\n",I,a[I],I,b[I],I,c[I]);
+    }
+
+    c = c/2.0;
+
+    for(I=0;I<3;I++)
+    {
+        printf("a[%d] = %1.3f, b[%d] = %1.3f, c[%d] = %1.3f\n",I,a[I],I,b[I],I,c[I]);
+    }
+
+    c = cuvect3_cross(a,b);
+
+    for(I=0;I<3;I++)
+    {
+        printf("a[%d] = %1.3f, b[%d] = %1.3f, c[%d] = %1.3f\n",I,a[I],I,b[I],I,c[I]);
+    }
+
+    printf("c dot a = %1.3f, c dot b = %1.3f\n",cuvect3_dot(c,a),cuvect3_dot(c,b));
+
+    printf("norm(a)=%1.3f, norm(b)=%1.3f, norm(c)=%1.3f\n",cuvect3_norm(a),cuvect3_norm(b),cuvect3_norm(c));
+    a = cuvect3_normalize(a);
+    b = cuvect3_normalize(b);
+    c = cuvect3_normalize(c);
+
+    for(I=0;I<3;I++)
+    {
+        printf("a[%d] = %1.3f, b[%d] = %1.3f, c[%d] = %1.3f\n",I,a[I],I,b[I],I,c[I]);
+    }
+    printf("c dot a = %1.3f, c dot b = %1.3f\n",cuvect3_dot(c,a),cuvect3_dot(c,b));
+    printf("norm(a)=%1.3f, norm(b)=%1.3f, norm(c)=%1.3f\n",cuvect3_norm(a),cuvect3_norm(b),cuvect3_norm(c));
+
+    return;
+}
+
+
+
+__host__ __device__ cumat3::cumat3()
+{
+    int I;
+    for(I=0;I<9;I++)
+    {
+        dat[I] = 0.0;
+    }
+    return;
+}
+
+__host__ __device__ cumat3::~cumat3()
+{
+    int I;
+    for(I=0;I<9;I++)
+    {
+        dat[I] = 0.0;
+    }
+    return;
+}
+
+__host__ __device__ double& cumat3::operator[](const int I)
+{
+    return dat[I];
+}
+
+__host__ __device__ double& cumat3::operator()(const int I, const int J)
+{
+    return dat[I+3*J];
+}
+
+__host__ __device__ cumat3 cumat3::operator+(cumat3 lhs)
+{
+    int I;
+    cumat3 ret;
+    for(I=0;I<9;I++)
+    {
+        ret.dat[I] = this->dat[I] + lhs.dat[I];
+    }
+    return ret;
+}
+
+__host__ __device__ cumat3 cumat3::operator-(cumat3 lhs)
+{
+    int I;
+    cumat3 ret;
+    for(I=0;I<9;I++)
+    {
+        ret.dat[I] = this->dat[I] - lhs.dat[I];
+    }
+    return ret;
+}
+
+__host__ __device__ cumat3 cumat3::operator*(double lhs)
+{
+    cumat3 ret;
+    int I;
+    for(I=0;I<9;I++)
+    {
+        ret.dat[I] = this->dat[I]*lhs;
+    }
+    return ret;
+}
+
+__host__ __device__ cumat3 cumat3::operator/(double lhs)
+{
+    cumat3 ret;
+    int I;
+    for(I=0;I<9;I++)
+    {
+        ret.dat[I] = this->dat[I]/lhs;
+    }
+    return ret;
+}
+
+__host__ __device__ double& cumat3::at(const int I, const int J)
+{
+    return dat[I+3*J];
+}
+        
+
+__host__ __device__ cuvect3 cumat3::operator*(cuvect3 lhs)
+{
+    cuvect3 ret = cuvect3(0.0,0.0,0.0);
+    int I,J;
+    for(I=0;I<3;I++)
+    {
+        for(J=0;J<3;J++)
+        {
+            ret[I] = ret[I] + this->at(I,J)*lhs[J];
+        }
+    }
+    return ret;
+}
+
+__host__ __device__ cumat3 cumat3::operator*(cumat3 lhs)
+{
+    cumat3 ret;
+    int I,J,K;
+
+    for(I=0;I<3;I++)
+    {
+        for(J=0;J<3;J++)
+        {
+            ret(I,J) = 0.0;
+            for(K=0;K<3;K++)
+            {
+                ret(I,J) = ret(I,J) + this->at(I,K)*lhs(K,J);
+            }
+        }
+    }
+
+    return ret;
+}
+
+__host__ __device__ double cumat3::det()
+{
+    double ret = 0.0;
+
+    ret = ret + this->at(0,0)*this->at(1,1)*this->at(2,2);
+    ret = ret + this->at(0,1)*this->at(1,2)*this->at(2,0);
+    ret = ret + this->at(0,2)*this->at(1,0)*this->at(2,1);
+    ret = ret - this->at(0,0)*this->at(1,2)*this->at(2,1);
+    ret = ret - this->at(0,1)*this->at(1,0)*this->at(2,2);
+    ret = ret - this->at(0,2)*this->at(1,1)*this->at(2,0);
+
+    return ret;
+}
+
+__host__ __device__ cumat3 cumat3::transpose()
+{
+    cumat3 q;
+    int I,J;
+    for(I=0;I<3;I++)
+    {
+        for(J=0;J<3;J++)
+        {
+            q.at(I,J) = this->at(J,I);
+        }
+    }
+    return q;
+}
+
+__host__ __device__ cumat3 cumat3::inverse()
+{
+    cumat3 q;
+    double dt = q.det();
+    if(dt!=0.0)
+    {
+        q(0,0) =  (this->at(1,1)*this->at(2,2)-this->at(1,2)*this->at(2,1))/dt;
+        q(0,1) = -(this->at(1,0)*this->at(2,2)-this->at(1,2)*this->at(2,0))/dt;
+        q(0,2) =  (this->at(1,0)*this->at(2,1)-this->at(1,1)*this->at(2,0))/dt;
+        q(1,0) = -(this->at(0,1)*this->at(2,2)-this->at(0,2)*this->at(2,1))/dt;
+        q(1,1) =  (this->at(0,0)*this->at(2,2)-this->at(0,2)*this->at(2,0))/dt;
+        q(1,2) = -(this->at(0,0)*this->at(2,1)-this->at(0,1)*this->at(2,0))/dt;
+        q(2,0) =  (this->at(0,1)*this->at(1,2)-this->at(0,2)*this->at(1,1))/dt;
+        q(2,1) = -(this->at(0,0)*this->at(1,2)-this->at(0,2)*this->at(1,0))/dt;
+        q(2,2) =  (this->at(0,0)*this->at(1,1)-this->at(0,1)*this->at(1,0))/dt;
+
+        q = q.transpose();
+    }
+    else
+    {
+        q(0,0) = inf;
+        q(0,1) = inf;
+        q(0,2) = inf;
+        q(1,0) = inf;
+        q(1,1) = inf;
+        q(1,2) = inf;
+        q(2,0) = inf;
+        q(2,1) = inf;
+        q(2,2) = inf;
+    }
+
+    return q;
+}
+
+};
--- a/src/amsculib2/cuvect3f.cu
+++ b/src/amsculib2/cuvect3f.cu
@ -0,0 +1,581 @@
+#include <amsculib2/amsculib2.hpp>
+
+namespace amscuda
+{
+
+    __host__ __device__ cuvect3f::cuvect3f()
+    {
+        x = 0.0; y = 0.0; z = 0.0;
+        return;
+    }
+
+    __host__ __device__ cuvect3f::~cuvect3f()
+    {
+        x = 0.0; y = 0.0; z = 0.0;
+        return;
+    }
+
+    __host__ __device__ float& cuvect3f::operator[](const int I)
+    {
+        if(I==0) return x;
+        if(I==1) return y;
+        if(I==2) return z;
+        return x;
+    }
+
+    __host__ __device__ const float& cuvect3f::operator[](const int I) const
+    {
+        if(I==0) return x;
+        if(I==1) return y;
+        if(I==2) return z;
+        return x;
+    }
+
+    __host__ __device__ cuvect3f cuvect3f::operator+(cuvect3f lhs)
+    {
+        cuvect3f ret;
+        ret.x = x+lhs.x;
+        ret.y = y+lhs.y;
+        ret.z = z+lhs.z;
+
+        return ret;
+    }
+
+    __host__ __device__ cuvect3f cuvect3f::operator-(cuvect3f lhs)
+    {
+        cuvect3f ret;
+        ret.x = x-lhs.x;
+        ret.y = y-lhs.y;
+        ret.z = z-lhs.z;
+
+        return ret;
+    }
+
+    __host__ __device__ cuvect3f cuvect3f::operator*(float lhs)
+    {
+        cuvect3f ret;
+        ret.x = x*lhs;
+        ret.y = y*lhs;
+        ret.z = z*lhs;
+        return ret;
+    }
+
+    __host__ __device__ cuvect3f cuvect3f::operator/(float lhs)
+    {
+        cuvect3f ret;
+        ret.x = x/lhs;
+        ret.y = y/lhs;
+        ret.z = z/lhs;
+        return ret;
+    }
+
+    __host__ __device__ cuvect3f::cuvect3f(float _x, float _y, float _z)
+    {
+        x = _x; y = _y; z = _z;
+        return;
+    }
+
+
+    __host__ __device__ float cuvect3f_dot(cuvect3f a, cuvect3f b)
+    {
+        float ret = a.x*b.x+a.y*b.y+a.z*b.z;
+
+        return ret;
+    }
+
+    __host__ __device__ cuvect3f cuvect3f_cross(cuvect3f a, cuvect3f b)
+    {
+        cuvect3f ret;
+        ret[0] = a[1]*b[2]-a[2]*b[1];
+        ret[1] = a[2]*b[0]-a[0]*b[2];
+        ret[2] = a[0]*b[1]-a[1]*b[0];
+
+        return ret;
+    }
+
+    __host__ __device__ float cuvect3f_norm(cuvect3f a)
+    {
+        float ret;
+        ret = ::sqrtf(a.x*a.x+a.y*a.y+a.z*a.z);
+        return ret;
+    }
+
+    __host__ __device__ cuvect3f cuvect3f_normalize(cuvect3f a)
+    {
+        cuvect3f ret;
+        float m;
+        m = ::sqrtf(a.x*a.x+a.y*a.y+a.z*a.z);
+        if(m>0.0)
+        {
+            ret.x = a.x/m; ret.y = a.y/m; ret.z = a.z/m;
+        }
+        else
+        {
+            ret.x = 0.0; ret.y = 0.0; ret.z = 0.0;
+        }
+
+        return ret;
+    }
+
+    __host__ __device__ cuvect3f cuvect3f_proj(cuvect3f a, cuvect3f b)
+    {
+        cuvect3f ret;
+        cuvect3f bn = cuvect3f_normalize(b);
+        float m = cuvect3f_dot(a,bn);
+        ret = bn*m;
+        return ret;
+    }
+
+//transposes a 3x3 (9 element) matrix
+__host__ __device__ void mat3f_transpose(float *mat3inout)
+{
+    int I,J;
+    float matint[9];
+    for(I=0;I<9;I++)
+    {
+        matint[I] = mat3inout[I];
+    }
+
+    for(I=0;I<3;I++)
+    {
+        for(J=0;J<3;J++)
+        {
+            mat3inout[I+J*3] = matint[J+I*3];
+        }
+    }
+
+    return;
+}
+
+//copies src to dest
+__host__ __device__ void mat3f_copy(float *mat3f_dest, const float *mat3f_src)
+{
+    int I;
+    if(mat3f_dest==NULL || mat3f_src==NULL)
+        return;
+
+    for(I=0;I<9;I++)
+        mat3f_dest[I] = mat3f_src[I];
+
+    return;
+}
+
+
+__host__ __device__ float mat3f_det(float *mat3in)
+{
+    float ret = 0.0;
+
+    ret = ret + mat3in[0+0*3]*mat3in[1+1*3]*mat3in[2+2*3];
+    ret = ret + mat3in[0+1*3]*mat3in[1+2*3]*mat3in[2+0*3];
+    ret = ret + mat3in[0+2*3]*mat3in[1+0*3]*mat3in[2+1*3];
+    ret = ret - mat3in[0+0*3]*mat3in[1+2*3]*mat3in[2+1*3];
+    ret = ret - mat3in[0+1*3]*mat3in[1+0*3]*mat3in[2+2*3];
+    ret = ret - mat3in[0+2*3]*mat3in[1+1*3]*mat3in[2+0*3];
+
+    return ret;
+}
+
+//inverts a 3x3 (9 element) matrix
+__host__ __device__ void mat3f_inverse(float *mat3inout)
+{
+    int I;
+    float matint[9];
+    float det = mat3f_det(mat3inout);
+
+    for(I=0;I<9;I++)
+    {
+        matint[I] = mat3inout[I];
+    }
+
+    mat3inout[0+0*3] =  (matint[1+1*3]*matint[2+2*3]-matint[1+2*3]*matint[2+1*3])/det;
+    mat3inout[0+1*3] = -(matint[1+0*3]*matint[2+2*3]-matint[1+2*3]*matint[2+0*3])/det;
+    mat3inout[0+2*3] =  (matint[1+0*3]*matint[2+1*3]-matint[1+1*3]*matint[2+0*3])/det;
+    mat3inout[1+0*3] = -(matint[0+1*3]*matint[2+2*3]-matint[0+2*3]*matint[2+1*3])/det;
+    mat3inout[1+1*3] =  (matint[0+0*3]*matint[2+2*3]-matint[0+2*3]*matint[2+0*3])/det;
+    mat3inout[1+2*3] = -(matint[0+0*3]*matint[2+1*3]-matint[0+1*3]*matint[2+0*3])/det;
+    mat3inout[2+0*3] =  (matint[0+1*3]*matint[1+2*3]-matint[0+2*3]*matint[1+1*3])/det;
+    mat3inout[2+1*3] = -(matint[0+0*3]*matint[1+2*3]-matint[0+2*3]*matint[1+0*3])/det;
+    mat3inout[2+2*3] =  (matint[0+0*3]*matint[1+1*3]-matint[0+1*3]*matint[1+0*3])/det;
+
+    mat3f_transpose(mat3inout);
+
+    return;
+}
+
+__host__ __device__ cuvect3f mat3f_mult(float *mat3in, cuvect3f cvin)
+{
+    int I,J;
+    cuvect3f ret;
+    for(I=0;I<3;I++)
+    {
+        ret[I] = 0.0;
+        for(J=0;J<3;J++)
+        {
+            ret[I] = ret[I] + mat3in[I+3*J]*cvin[J];
+        }
+    }
+
+    return ret;
+}
+
+__host__ __device__ void mat3f_mult(float *matina, float *matinb, float *matout)
+{
+    float wrk[9];
+    int I,J,K;
+
+    for(I=0;I<3;I++)
+    {
+        for(J=0;J<3;J++)
+        {
+            wrk[I+3*J] = 0.0;
+        }
+    }
+
+    for(I=0;I<3;I++)
+    {
+        for(J=0;J<3;J++)
+        {
+            for(K=0;K<3;K++)
+            {
+                wrk[I+3*K] = wrk[I+3*K] + matina[I+3*J]*matinb[J+3*K];
+            }
+        }
+    }
+
+    for(I=0;I<3;I++)
+    {
+        for(J=0;J<3;J++)
+        {
+            matout[I+3*J] = wrk[I+3*J];
+        }
+    }
+
+    return;
+}
+
+__host__ __device__ void mat3f_hodgedual(cuvect3f vecin, float *matout)
+{
+    matout[0 + 0*3] = 0.0f;
+    matout[1 + 0*3] = -vecin[2];
+    matout[2 + 0*3] = vecin[1];
+
+    matout[0 + 1*3] = vecin[2];
+    matout[1 + 1*3] = 0.0f;
+    matout[2 + 1*3] = -vecin[0];
+
+    matout[0 + 2*3] = -vecin[1];
+    matout[1 + 2*3] = vecin[0];
+    matout[2 + 2*3] = 0.0f;
+    return;
+}
+
+__host__ __device__ void mat3f_hodgedual(float *matin, cuvect3f vecout)
+{
+    vecout[0] = 0.5*(matin[1 + 2*3] - matin[2 + 1*3]);
+    vecout[1] = 0.5*(matin[2 + 0*3] - matin[0 + 2*3]);
+    vecout[2] = 0.5*(matin[0 + 1*3] - matin[1 + 0*3]);
+    
+    return;
+}
+
+//returns direction cosine rotation matrix from axis and angle
+__host__ __device__ void mat3f_rot_from_axisangle(cuvect3f axis, float angle, float *matout)
+{
+    int I;
+    float H[9];
+    float Hsq[9];
+    float II[9];
+
+    for(I=0;I<9;I++) II[I] = 0.0;
+    II[0+0*3] = 1.0;
+    II[1+1*3] = 1.0;
+    II[2+2*3] = 1.0;
+
+    axis = cuvect3f_normalize(axis);
+    
+    mat3f_hodgedual(axis,H);
+    mat3f_mult(H,H,Hsq);
+
+    for(I=0;I<9;I++)
+    {
+        matout[I] = (II[I] + Hsq[I]) + H[I]*sinf(angle) - Hsq[I]*cosf(angle);
+    }
+
+    return;
+}
+
+
+
+__host__ void test_cudavectf_logic1()
+{
+    //3 dim vector and matrix functional tests on host side
+
+    printf("3 dim vector and matrix functional tests on host side\n");
+    
+    cuvect3f a,b,c;
+    float ma[9],mb[9],mc[9];
+
+    int I,J;
+
+    for(I=0;I<3;I++)
+    {
+        for(J=0;J<3;J++)
+        {
+            ma[I+3*J] = ((float) rand())/((float) RAND_MAX);
+            mb[I+3*J] = ma[I+3*J];
+        }
+    }
+
+    mat3f_inverse(mb);
+    mat3f_mult(ma,mb,mc);
+
+    for(I=0;I<3;I++)
+    {
+        for(J=0;J<3;J++)
+        {
+            printf("ma[%d,%d] = %1.3f\n",I,J,ma[I+3*J]);
+        }
+    }
+    for(I=0;I<3;I++)
+    {
+        for(J=0;J<3;J++)
+        {
+            printf("mb[%d,%d] = %1.3f\n",I,J,mb[I+3*J]);
+        }
+    }
+    for(I=0;I<3;I++)
+    {
+        for(J=0;J<3;J++)
+        {
+            printf("mc[%d,%d] = %1.3f\n",I,J,mc[I+3*J]);
+        }
+    }
+
+    a = cuvect3f(1,1,1);
+    b = mat3f_mult(ma,a);
+    b = mat3f_mult(mb,b);
+    
+    for(I=0;I<3;I++)
+    {
+        printf("a[%d] = %1.3f, b[%d] = %1.3f\n",I,a[I],I,b[I]);
+    }
+
+    a = cuvect3f(1,0,1);
+    b = cuvect3f(0,1,-1);
+    c = a+b;
+
+    for(I=0;I<3;I++)
+    {
+        printf("a[%d] = %1.3f, b[%d] = %1.3f, c[%d] = %1.3f\n",I,a[I],I,b[I],I,c[I]);
+    }
+
+    c = c/2.0;
+
+    for(I=0;I<3;I++)
+    {
+        printf("a[%d] = %1.3f, b[%d] = %1.3f, c[%d] = %1.3f\n",I,a[I],I,b[I],I,c[I]);
+    }
+
+    c = cuvect3f_cross(a,b);
+
+    for(I=0;I<3;I++)
+    {
+        printf("a[%d] = %1.3f, b[%d] = %1.3f, c[%d] = %1.3f\n",I,a[I],I,b[I],I,c[I]);
+    }
+
+    printf("c dot a = %1.3f, c dot b = %1.3f\n",cuvect3f_dot(c,a),cuvect3f_dot(c,b));
+
+    printf("norm(a)=%1.3f, norm(b)=%1.3f, norm(c)=%1.3f\n",cuvect3f_norm(a),cuvect3f_norm(b),cuvect3f_norm(c));
+    a = cuvect3f_normalize(a);
+    b = cuvect3f_normalize(b);
+    c = cuvect3f_normalize(c);
+
+    for(I=0;I<3;I++)
+    {
+        printf("a[%d] = %1.3f, b[%d] = %1.3f, c[%d] = %1.3f\n",I,a[I],I,b[I],I,c[I]);
+    }
+    printf("c dot a = %1.3f, c dot b = %1.3f\n",cuvect3f_dot(c,a),cuvect3f_dot(c,b));
+    printf("norm(a)=%1.3f, norm(b)=%1.3f, norm(c)=%1.3f\n",cuvect3f_norm(a),cuvect3f_norm(b),cuvect3f_norm(c));
+
+    return;
+}
+
+
+
+__host__ __device__ cumat3f::cumat3f()
+{
+    int I;
+    for(I=0;I<9;I++)
+    {
+        dat[I] = 0.0;
+    }
+    return;
+}
+
+__host__ __device__ cumat3f::~cumat3f()
+{
+    int I;
+    for(I=0;I<9;I++)
+    {
+        dat[I] = 0.0;
+    }
+    return;
+}
+
+__host__ __device__ float& cumat3f::operator[](const int I)
+{
+    return dat[I];
+}
+
+__host__ __device__ float& cumat3f::operator()(const int I, const int J)
+{
+    return dat[I+3*J];
+}
+
+__host__ __device__ cumat3f cumat3f::operator+(cumat3f lhs)
+{
+    int I;
+    cumat3f ret;
+    for(I=0;I<9;I++)
+    {
+        ret.dat[I] = this->dat[I] + lhs.dat[I];
+    }
+    return ret;
+}
+
+__host__ __device__ cumat3f cumat3f::operator-(cumat3f lhs)
+{
+    int I;
+    cumat3f ret;
+    for(I=0;I<9;I++)
+    {
+        ret.dat[I] = this->dat[I] - lhs.dat[I];
+    }
+    return ret;
+}
+
+__host__ __device__ cumat3f cumat3f::operator*(float lhs)
+{
+    cumat3f ret;
+    int I;
+    for(I=0;I<9;I++)
+    {
+        ret.dat[I] = this->dat[I]*lhs;
+    }
+    return ret;
+}
+
+__host__ __device__ cumat3f cumat3f::operator/(float lhs)
+{
+    cumat3f ret;
+    int I;
+    for(I=0;I<9;I++)
+    {
+        ret.dat[I] = this->dat[I]/lhs;
+    }
+    return ret;
+}
+
+__host__ __device__ float& cumat3f::at(const int I, const int J)
+{
+    return dat[I+3*J];
+}
+        
+
+__host__ __device__ cuvect3f cumat3f::operator*(cuvect3f lhs)
+{
+    cuvect3f ret = cuvect3f(0.0,0.0,0.0);
+    int I,J;
+    for(I=0;I<3;I++)
+    {
+        for(J=0;J<3;J++)
+        {
+            ret[I] = ret[I] + this->at(I,J)*lhs[J];
+        }
+    }
+    return ret;
+}
+
+__host__ __device__ cumat3f cumat3f::operator*(cumat3f lhs)
+{
+    cumat3f ret;
+    int I,J,K;
+
+    for(I=0;I<3;I++)
+    {
+        for(J=0;J<3;J++)
+        {
+            ret(I,J) = 0.0;
+            for(K=0;K<3;K++)
+            {
+                ret(I,J) = ret(I,J) + this->at(I,K)*lhs(K,J);
+            }
+        }
+    }
+
+    return ret;
+}
+
+__host__ __device__ float cumat3f::det()
+{
+    float ret = 0.0;
+
+    ret = ret + this->at(0,0)*this->at(1,1)*this->at(2,2);
+    ret = ret + this->at(0,1)*this->at(1,2)*this->at(2,0);
+    ret = ret + this->at(0,2)*this->at(1,0)*this->at(2,1);
+    ret = ret - this->at(0,0)*this->at(1,2)*this->at(2,1);
+    ret = ret - this->at(0,1)*this->at(1,0)*this->at(2,2);
+    ret = ret - this->at(0,2)*this->at(1,1)*this->at(2,0);
+
+    return ret;
+}
+
+__host__ __device__ cumat3f cumat3f::transpose()
+{
+    cumat3f q;
+    int I,J;
+    for(I=0;I<3;I++)
+    {
+        for(J=0;J<3;J++)
+        {
+            q.at(I,J) = this->at(J,I);
+        }
+    }
+    return q;
+}
+
+__host__ __device__ cumat3f cumat3f::inverse()
+{
+    cumat3f q;
+    float dt = q.det();
+    if(dt!=0.0)
+    {
+        q(0,0) =  (this->at(1,1)*this->at(2,2)-this->at(1,2)*this->at(2,1))/dt;
+        q(0,1) = -(this->at(1,0)*this->at(2,2)-this->at(1,2)*this->at(2,0))/dt;
+        q(0,2) =  (this->at(1,0)*this->at(2,1)-this->at(1,1)*this->at(2,0))/dt;
+        q(1,0) = -(this->at(0,1)*this->at(2,2)-this->at(0,2)*this->at(2,1))/dt;
+        q(1,1) =  (this->at(0,0)*this->at(2,2)-this->at(0,2)*this->at(2,0))/dt;
+        q(1,2) = -(this->at(0,0)*this->at(2,1)-this->at(0,1)*this->at(2,0))/dt;
+        q(2,0) =  (this->at(0,1)*this->at(1,2)-this->at(0,2)*this->at(1,1))/dt;
+        q(2,1) = -(this->at(0,0)*this->at(1,2)-this->at(0,2)*this->at(1,0))/dt;
+        q(2,2) =  (this->at(0,0)*this->at(1,1)-this->at(0,1)*this->at(1,0))/dt;
+
+        q = q.transpose();
+    }
+    else
+    {
+        q(0,0) = inf;
+        q(0,1) = inf;
+        q(0,2) = inf;
+        q(1,0) = inf;
+        q(1,1) = inf;
+        q(1,2) = inf;
+        q(2,0) = inf;
+        q(2,1) = inf;
+        q(2,2) = inf;
+    }
+
+    return q;
+}
+
+};
--- a/src/amsculib2/cuvect4.cu
+++ b/src/amsculib2/cuvect4.cu
@ -0,0 +1,414 @@
+#include <amsculib2/amsculib2.hpp>
+
+namespace amscuda
+{
+
+__host__ __device__ cuvect4::cuvect4()
+{
+    x = 0.0; y = 0.0; z = 0.0; w = 0.0;
+    return;
+}
+
+__host__ __device__ cuvect4::~cuvect4()
+{
+    x = 0.0; y = 0.0; z = 0.0; w = 0.0;
+    return;
+}
+
+__host__ __device__ cuvect4::cuvect4(double _x, double _y, double _z, double _w)
+{
+    x = _x; y = _y; z = _z; w = _w;
+    return;
+}
+        
+__host__ __device__ double& cuvect4::operator[](const int I)
+{
+    if(I==0) return x;
+    else if(I==1) return y;
+    else if(I==2) return z;
+    else if(I==3) return w;
+    return x;
+}
+
+__host__ __device__ const double& cuvect4::operator[](const int I) const
+{
+    if(I==0) return x;
+    else if(I==1) return y;
+    else if(I==2) return z;
+    else if(I==3) return w;
+    return x;
+}
+
+__host__ __device__ cuvect4 cuvect4::operator+(cuvect4 lhs)
+{
+    cuvect4 ret;
+    ret.x = this->x + lhs.x;
+    ret.y = this->y + lhs.y;
+    ret.z = this->z + lhs.z;
+    ret.w = this->w + lhs.w;
+    return ret;
+}
+
+__host__ __device__ cuvect4 cuvect4::operator-(cuvect4 lhs)
+{
+    cuvect4 ret;
+    ret.x = this->x - lhs.x;
+    ret.y = this->y - lhs.y;
+    ret.z = this->z - lhs.z;
+    ret.w = this->w - lhs.w;
+    return ret;
+}
+
+__host__ __device__ cuvect4 cuvect4::operator*(double lhs)
+{
+    cuvect4 ret;
+    ret.x = this->x*lhs;
+    ret.y = this->y*lhs;
+    ret.z = this->z*lhs;
+    ret.w = this->w*lhs;
+    return ret;
+}
+
+__host__ __device__ cuvect4 cuvect4::operator/(double lhs)
+{
+    cuvect4 ret;
+    ret.x = this->x/lhs;
+    ret.y = this->y/lhs;
+    ret.z = this->z/lhs;
+    ret.w = this->w/lhs;
+    return ret;
+}
+
+__host__ __device__ cumat4::cumat4()
+{
+    int I;
+    for(I=0;I<16;I++)
+    {
+        dat[I] = 0.0;
+    }
+    return;
+}
+
+__host__ __device__ cumat4::~cumat4()
+{
+    int I;
+    for(I=0;I<16;I++)
+    {
+        dat[I] = 0.0;
+    }
+    return;
+}
+
+__host__ __device__ double& cumat4::operator[](const int I)
+{
+    return dat[I];
+}
+
+__host__ __device__ double& cumat4::operator()(const int I, const int J)
+{
+    return dat[I+4*J];
+}
+
+__host__ __device__ double& cumat4::at(const int I, const int J)
+{
+    return dat[I+4*J];
+}
+
+__host__ __device__ cumat4 cumat4::operator+(cumat4 lhs)
+{
+    cumat4 ret;
+    int I;
+    for(I=0;I<16;I++)
+    {
+        ret.dat[I] = this->dat[I] + lhs.dat[I];
+    }
+    return ret;
+}
+
+__host__ __device__ cumat4 cumat4::operator-(cumat4 lhs)
+{
+    cumat4 ret;
+    int I;
+    for(I=0;I<16;I++)
+    {
+        ret.dat[I] = this->dat[I] - lhs.dat[I];
+    }
+    return ret;
+}
+
+__host__ __device__ cumat4 cumat4::operator*(double lhs)
+{
+    cumat4 ret;
+    int I;
+    for(I=0;I<16;I++)
+    {
+        ret.dat[I] = this->dat[I]*lhs;
+    }
+    return ret;
+}
+
+__host__ __device__ cumat4 cumat4::operator/(double lhs)
+{
+    cumat4 ret;
+    int I;
+    for(I=0;I<16;I++)
+    {
+        ret.dat[I] = this->dat[I]/lhs;
+    }
+    return ret;
+}
+
+__host__ __device__ cuvect4 cumat4::operator*(cuvect4 lhs)
+{
+    cuvect4 ret = cuvect4(0.0,0.0,0.0,0.0);
+    int I,J;
+
+    for(I=0;I<4;I++)
+    {
+        for(J=0;J<4;J++)
+        {
+            ret[I] = ret[I] + this->at(I,J)*lhs[J];
+        }
+    }
+
+    return ret;
+}
+
+__host__ __device__ cumat4 cumat4::operator*(cumat4 lhs)
+{
+    cumat4 ret;
+    int I,J,K;
+    for(I=0;I<4;I++)
+    {
+        for(J=0;J<4;J++)
+        {
+            ret(I,J) = 0;
+            for(K=0;K<4;K++)
+            {
+                ret(I,J) = ret(I,J) + this->at(I,K) * lhs(K,J);
+            }
+        }
+    }
+    return ret;
+}
+
+__host__ __device__ cumat4 cumat4::transpose()
+{
+    cumat4 q;
+    int I,J;
+    for(I=0;I<4;I++)
+    {
+        for(J=0;J<4;J++)
+        {
+            q(I,J) = this->at(J,I);
+        }
+    }
+    return q;
+}
+
+__host__ __device__ double cumat4::det()
+{
+    double a00,a01,a02,a03;
+    double a10,a11,a12,a13;
+    double a20,a21,a22,a23;
+    double a30,a31,a32,a33;
+    double det;
+
+    a00 = this->at(0,0);
+    a01 = this->at(0,1);
+    a02 = this->at(0,2);
+    a03 = this->at(0,3);
+    a10 = this->at(1,0);
+    a11 = this->at(1,1);
+    a12 = this->at(1,2);
+    a13 = this->at(1,3);
+    a20 = this->at(2,0);
+    a21 = this->at(2,1);
+    a22 = this->at(2,2);
+    a23 = this->at(2,3);
+    a30 = this->at(3,0);
+    a31 = this->at(3,1);
+    a32 = this->at(3,2);
+    a33 = this->at(3,3);
+
+    det = a03*a12*a21*a30 -
+        a02*a13*a21*a30 -
+        a03*a11*a22*a30 +
+        a01*a13*a22*a30 +
+        a02*a11*a23*a30 -
+        a01*a12*a23*a30 -
+        a03*a12*a20*a31 +
+        a02*a13*a20*a31 +
+        a03*a10*a22*a31 -
+        a00*a13*a22*a31 -
+        a02*a10*a23*a31 +
+        a00*a12*a23*a31 +
+        a03*a11*a20*a32 -
+        a01*a13*a20*a32 -
+        a03*a10*a21*a32 +
+        a00*a13*a21*a32 +
+        a01*a10*a23*a32 -
+        a00*a11*a23*a32 -
+        a02*a11*a20*a33 +
+        a01*a12*a20*a33 +
+        a02*a10*a21*a33 -
+        a00*a12*a21*a33 -
+        a01*a10*a22*a33 +
+        a00*a11*a22*a33;
+
+    return det;
+}
+
+__host__ __device__ cumat4 minverse(cumat4 ma)
+{
+  cumat4 mb;
+
+  double a00,a01,a02,a03;
+  double a10,a11,a12,a13;
+  double a20,a21,a22,a23;
+  double a30,a31,a32,a33;
+
+  double b00,b01,b02,b03;
+  double b10,b11,b12,b13;
+  double b20,b21,b22,b23;
+  double b30,b31,b32,b33;
+
+  double det = 0.0;
+
+  a00 = ma.at(0,0);
+  a01 = ma.at(0,1);
+  a02 = ma.at(0,2);
+  a03 = ma.at(0,3);
+  a10 = ma.at(1,0);
+  a11 = ma.at(1,1);
+  a12 = ma.at(1,2);
+  a13 = ma.at(1,3);
+  a20 = ma.at(2,0);
+  a21 = ma.at(2,1);
+  a22 = ma.at(2,2);
+  a23 = ma.at(2,3);
+  a30 = ma.at(3,0);
+  a31 = ma.at(3,1);
+  a32 = ma.at(3,2);
+  a33 = ma.at(3,3);
+
+  det = a03*a12*a21*a30 -
+        a02*a13*a21*a30 -
+        a03*a11*a22*a30 +
+        a01*a13*a22*a30 +
+        a02*a11*a23*a30 -
+        a01*a12*a23*a30 -
+        a03*a12*a20*a31 +
+        a02*a13*a20*a31 +
+        a03*a10*a22*a31 -
+        a00*a13*a22*a31 -
+        a02*a10*a23*a31 +
+        a00*a12*a23*a31 +
+        a03*a11*a20*a32 -
+        a01*a13*a20*a32 -
+        a03*a10*a21*a32 +
+        a00*a13*a21*a32 +
+        a01*a10*a23*a32 -
+        a00*a11*a23*a32 -
+        a02*a11*a20*a33 +
+        a01*a12*a20*a33 +
+        a02*a10*a21*a33 -
+        a00*a12*a21*a33 -
+        a01*a10*a22*a33 +
+        a00*a11*a22*a33;
+
+  if(det*det>1.0E-30)
+  {
+    b00 = -a13*a22*a31 + a12*a23*a31 + a13*a21*a32 - a11*a23*a32 - a12*a21*a33 + a11*a22*a33;
+    b01 =  a03*a22*a31 - a02*a23*a31 - a03*a21*a32 + a01*a23*a32 + a02*a21*a33 - a01*a22*a33;
+    b02 = -a03*a12*a31 + a02*a13*a31 + a03*a11*a32 - a01*a13*a32 - a02*a11*a33 + a01*a12*a33;
+    b03 =  a03*a12*a21 - a02*a13*a21 - a03*a11*a22 + a01*a13*a22 + a02*a11*a23 - a01*a12*a23;
+    b10 =  a13*a22*a30 - a12*a23*a30 - a13*a20*a32 + a10*a23*a32 + a12*a20*a33 - a10*a22*a33;
+    b11 = -a03*a22*a30 + a02*a23*a30 + a03*a20*a32 - a00*a23*a32 - a02*a20*a33 + a00*a22*a33;
+    b12 =  a03*a12*a30 - a02*a13*a30 - a03*a10*a32 + a00*a13*a32 + a02*a10*a33 - a00*a12*a33;
+    b13 = -a03*a12*a20 + a02*a13*a20 + a03*a10*a22 - a00*a13*a22 - a02*a10*a23 + a00*a12*a23;
+    b20 = -a13*a21*a30 + a11*a23*a30 + a13*a20*a31 - a10*a23*a31 - a11*a20*a33 + a10*a21*a33;
+    b21 =  a03*a21*a30 - a01*a23*a30 - a03*a20*a31 + a00*a23*a31 + a01*a20*a33 - a00*a21*a33;
+    b22 = -a03*a11*a30 + a01*a13*a30 + a03*a10*a31 - a00*a13*a31 - a01*a10*a33 + a00*a11*a33;
+    b23 =  a03*a11*a20 - a01*a13*a20 - a03*a10*a21 + a00*a13*a21 + a01*a10*a23 - a00*a11*a23;
+    b30 =  a12*a21*a30 - a11*a22*a30 - a12*a20*a31 + a10*a22*a31 + a11*a20*a32 - a10*a21*a32;
+    b31 = -a02*a21*a30 + a01*a22*a30 + a02*a20*a31 - a00*a22*a31 - a01*a20*a32 + a00*a21*a32;
+    b32 =  a02*a11*a30 - a01*a12*a30 - a02*a10*a31 + a00*a12*a31 + a01*a10*a32 - a00*a11*a32;
+    b33 = -a02*a11*a20 + a01*a12*a20 + a02*a10*a21 - a00*a12*a21 - a01*a10*a22 + a00*a11*a22;
+    b00 = b00/det;
+    b01 = b01/det;
+    b02 = b02/det;
+    b03 = b03/det;
+    b10 = b10/det;
+    b11 = b11/det;
+    b12 = b12/det;
+    b13 = b13/det;
+    b20 = b20/det;
+    b21 = b21/det;
+    b22 = b22/det;
+    b23 = b23/det;
+    b30 = b30/det;
+    b31 = b31/det;
+    b32 = b32/det;
+    b33 = b33/det;
+    mb.at(0,0) = b00;
+    mb.at(0,1) = b01;
+    mb.at(0,2) = b02;
+    mb.at(0,3) = b03;
+    mb.at(1,0) = b10;
+    mb.at(1,1) = b11;
+    mb.at(1,2) = b12;
+    mb.at(1,3) = b13;
+    mb.at(2,0) = b20;
+    mb.at(2,1) = b21;
+    mb.at(2,2) = b22;
+    mb.at(2,3) = b23;
+    mb.at(3,0) = b30;
+    mb.at(3,1) = b31;
+    mb.at(3,2) = b32;
+    mb.at(3,3) = b33;
+  }
+  //this was STUPID. Gaah. Computer algebra system saves the day? I'd be surprised if this didn't end up *slower* than gaussian elimination. Don't do this again!
+  return mb;
+}
+
+__host__ __device__ cumat4 cumat4::inverse()
+{
+    return minverse(*this);
+}
+
+
+__host__ __device__ double cuvect4_dot(cuvect4 a, cuvect4 b)
+{
+    double ret = 0.0;
+    ret = a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w;
+    return ret;
+}
+
+__host__ __device__ double cuvect4_norm(cuvect4 a)
+{
+    double ret = 0.0;
+    ret = ::sqrt(cuvect4_dot(a,a));
+    return ret;
+}
+
+__host__ __device__ cuvect4 cuvect4_normalize(cuvect4 a)
+{
+    cuvect4 ret = cuvect4(0.0f,0.0f,0.0f,0.0f);
+    double nrm = cuvect4_norm(a);
+    if(nrm>0.0)
+        ret = a/nrm;
+    return ret;
+}
+
+__host__ __device__ cuvect4 cuvect4_proj(cuvect4 a, cuvect4 b)
+{
+    cuvect4 ret;
+    cuvect4 bn = cuvect4_normalize(b);
+    double d = cuvect4_dot(a,bn);
+    ret = bn*d;
+    return ret;
+}
+
+
+};
--- a/src/amsculib2/cuvect4f.cu
+++ b/src/amsculib2/cuvect4f.cu
@ -0,0 +1,417 @@
+#include <amsculib2/amsculib2.hpp>
+
+namespace amscuda
+{
+
+////////////
+//cuvect4ff//
+////////////
+
+__host__ __device__ cuvect4f::cuvect4f()
+{
+    x = 0.0; y = 0.0; z = 0.0; w = 0.0;
+    return;
+}
+
+__host__ __device__ cuvect4f::~cuvect4f()
+{
+    x = 0.0; y = 0.0; z = 0.0; w = 0.0;
+    return;
+}
+
+__host__ __device__ cuvect4f::cuvect4f(float _x, float _y, float _z, float _w)
+{
+    x = _x; y = _y; z = _z; w = _w;
+    return;
+}
+        
+__host__ __device__ float& cuvect4f::operator[](const int I)
+{
+    if(I==0) return x;
+    else if(I==1) return y;
+    else if(I==2) return z;
+    else if(I==3) return w;
+    return x;
+}
+
+__host__ __device__ const float& cuvect4f::operator[](const int I) const
+{
+    if(I==0) return x;
+    else if(I==1) return y;
+    else if(I==2) return z;
+    else if(I==3) return w;
+    return x;
+}
+
+__host__ __device__ cuvect4f cuvect4f::operator+(cuvect4f lhs)
+{
+    cuvect4f ret;
+    ret.x = this->x + lhs.x;
+    ret.y = this->y + lhs.y;
+    ret.z = this->z + lhs.z;
+    ret.w = this->w + lhs.w;
+    return ret;
+}
+
+__host__ __device__ cuvect4f cuvect4f::operator-(cuvect4f lhs)
+{
+    cuvect4f ret;
+    ret.x = this->x - lhs.x;
+    ret.y = this->y - lhs.y;
+    ret.z = this->z - lhs.z;
+    ret.w = this->w - lhs.w;
+    return ret;
+}
+
+__host__ __device__ cuvect4f cuvect4f::operator*(float lhs)
+{
+    cuvect4f ret;
+    ret.x = this->x*lhs;
+    ret.y = this->y*lhs;
+    ret.z = this->z*lhs;
+    ret.w = this->w*lhs;
+    return ret;
+}
+
+__host__ __device__ cuvect4f cuvect4f::operator/(float lhs)
+{
+    cuvect4f ret;
+    ret.x = this->x/lhs;
+    ret.y = this->y/lhs;
+    ret.z = this->z/lhs;
+    ret.w = this->w/lhs;
+    return ret;
+}
+
+__host__ __device__ cumat4f::cumat4f()
+{
+    int I;
+    for(I=0;I<16;I++)
+    {
+        dat[I] = 0.0;
+    }
+    return;
+}
+
+__host__ __device__ cumat4f::~cumat4f()
+{
+    int I;
+    for(I=0;I<16;I++)
+    {
+        dat[I] = 0.0;
+    }
+    return;
+}
+
+__host__ __device__ float& cumat4f::operator[](const int I)
+{
+    return dat[I];
+}
+
+__host__ __device__ float& cumat4f::operator()(const int I, const int J)
+{
+    return dat[I+4*J];
+}
+
+__host__ __device__ float& cumat4f::at(const int I, const int J)
+{
+    return dat[I+4*J];
+}
+
+__host__ __device__ cumat4f cumat4f::operator+(cumat4f lhs)
+{
+    cumat4f ret;
+    int I;
+    for(I=0;I<16;I++)
+    {
+        ret.dat[I] = this->dat[I] + lhs.dat[I];
+    }
+    return ret;
+}
+
+__host__ __device__ cumat4f cumat4f::operator-(cumat4f lhs)
+{
+    cumat4f ret;
+    int I;
+    for(I=0;I<16;I++)
+    {
+        ret.dat[I] = this->dat[I] - lhs.dat[I];
+    }
+    return ret;
+}
+
+__host__ __device__ cumat4f cumat4f::operator*(float lhs)
+{
+    cumat4f ret;
+    int I;
+    for(I=0;I<16;I++)
+    {
+        ret.dat[I] = this->dat[I]*lhs;
+    }
+    return ret;
+}
+
+__host__ __device__ cumat4f cumat4f::operator/(float lhs)
+{
+    cumat4f ret;
+    int I;
+    for(I=0;I<16;I++)
+    {
+        ret.dat[I] = this->dat[I]/lhs;
+    }
+    return ret;
+}
+
+__host__ __device__ cuvect4f cumat4f::operator*(cuvect4f lhs)
+{
+    cuvect4f ret = cuvect4f(0.0,0.0,0.0,0.0);
+    int I,J;
+
+    for(I=0;I<4;I++)
+    {
+        for(J=0;J<4;J++)
+        {
+            ret[I] = ret[I] + this->at(I,J)*lhs[J];
+        }
+    }
+
+    return ret;
+}
+
+__host__ __device__ cumat4f cumat4f::operator*(cumat4f lhs)
+{
+    cumat4f ret;
+    int I,J,K;
+    for(I=0;I<4;I++)
+    {
+        for(J=0;J<4;J++)
+        {
+            ret(I,J) = 0;
+            for(K=0;K<4;K++)
+            {
+                ret(I,J) = ret(I,J) + this->at(I,K) * lhs(K,J);
+            }
+        }
+    }
+    return ret;
+}
+
+__host__ __device__ cumat4f cumat4f::transpose()
+{
+    cumat4f q;
+    int I,J;
+    for(I=0;I<4;I++)
+    {
+        for(J=0;J<4;J++)
+        {
+            q(I,J) = this->at(J,I);
+        }
+    }
+    return q;
+}
+
+__host__ __device__ float cumat4f::det()
+{
+    float a00,a01,a02,a03;
+    float a10,a11,a12,a13;
+    float a20,a21,a22,a23;
+    float a30,a31,a32,a33;
+    float det;
+
+    a00 = this->at(0,0);
+    a01 = this->at(0,1);
+    a02 = this->at(0,2);
+    a03 = this->at(0,3);
+    a10 = this->at(1,0);
+    a11 = this->at(1,1);
+    a12 = this->at(1,2);
+    a13 = this->at(1,3);
+    a20 = this->at(2,0);
+    a21 = this->at(2,1);
+    a22 = this->at(2,2);
+    a23 = this->at(2,3);
+    a30 = this->at(3,0);
+    a31 = this->at(3,1);
+    a32 = this->at(3,2);
+    a33 = this->at(3,3);
+
+    det = a03*a12*a21*a30 -
+        a02*a13*a21*a30 -
+        a03*a11*a22*a30 +
+        a01*a13*a22*a30 +
+        a02*a11*a23*a30 -
+        a01*a12*a23*a30 -
+        a03*a12*a20*a31 +
+        a02*a13*a20*a31 +
+        a03*a10*a22*a31 -
+        a00*a13*a22*a31 -
+        a02*a10*a23*a31 +
+        a00*a12*a23*a31 +
+        a03*a11*a20*a32 -
+        a01*a13*a20*a32 -
+        a03*a10*a21*a32 +
+        a00*a13*a21*a32 +
+        a01*a10*a23*a32 -
+        a00*a11*a23*a32 -
+        a02*a11*a20*a33 +
+        a01*a12*a20*a33 +
+        a02*a10*a21*a33 -
+        a00*a12*a21*a33 -
+        a01*a10*a22*a33 +
+        a00*a11*a22*a33;
+
+    return det;
+}
+
+__host__ __device__ cumat4f minverse(cumat4f ma)
+{
+  cumat4f mb;
+
+  float a00,a01,a02,a03;
+  float a10,a11,a12,a13;
+  float a20,a21,a22,a23;
+  float a30,a31,a32,a33;
+
+  float b00,b01,b02,b03;
+  float b10,b11,b12,b13;
+  float b20,b21,b22,b23;
+  float b30,b31,b32,b33;
+
+  float det = 0.0;
+
+  a00 = ma.at(0,0);
+  a01 = ma.at(0,1);
+  a02 = ma.at(0,2);
+  a03 = ma.at(0,3);
+  a10 = ma.at(1,0);
+  a11 = ma.at(1,1);
+  a12 = ma.at(1,2);
+  a13 = ma.at(1,3);
+  a20 = ma.at(2,0);
+  a21 = ma.at(2,1);
+  a22 = ma.at(2,2);
+  a23 = ma.at(2,3);
+  a30 = ma.at(3,0);
+  a31 = ma.at(3,1);
+  a32 = ma.at(3,2);
+  a33 = ma.at(3,3);
+
+  det = a03*a12*a21*a30 -
+        a02*a13*a21*a30 -
+        a03*a11*a22*a30 +
+        a01*a13*a22*a30 +
+        a02*a11*a23*a30 -
+        a01*a12*a23*a30 -
+        a03*a12*a20*a31 +
+        a02*a13*a20*a31 +
+        a03*a10*a22*a31 -
+        a00*a13*a22*a31 -
+        a02*a10*a23*a31 +
+        a00*a12*a23*a31 +
+        a03*a11*a20*a32 -
+        a01*a13*a20*a32 -
+        a03*a10*a21*a32 +
+        a00*a13*a21*a32 +
+        a01*a10*a23*a32 -
+        a00*a11*a23*a32 -
+        a02*a11*a20*a33 +
+        a01*a12*a20*a33 +
+        a02*a10*a21*a33 -
+        a00*a12*a21*a33 -
+        a01*a10*a22*a33 +
+        a00*a11*a22*a33;
+
+  if(det*det>1.0E-30)
+  {
+    b00 = -a13*a22*a31 + a12*a23*a31 + a13*a21*a32 - a11*a23*a32 - a12*a21*a33 + a11*a22*a33;
+    b01 =  a03*a22*a31 - a02*a23*a31 - a03*a21*a32 + a01*a23*a32 + a02*a21*a33 - a01*a22*a33;
+    b02 = -a03*a12*a31 + a02*a13*a31 + a03*a11*a32 - a01*a13*a32 - a02*a11*a33 + a01*a12*a33;
+    b03 =  a03*a12*a21 - a02*a13*a21 - a03*a11*a22 + a01*a13*a22 + a02*a11*a23 - a01*a12*a23;
+    b10 =  a13*a22*a30 - a12*a23*a30 - a13*a20*a32 + a10*a23*a32 + a12*a20*a33 - a10*a22*a33;
+    b11 = -a03*a22*a30 + a02*a23*a30 + a03*a20*a32 - a00*a23*a32 - a02*a20*a33 + a00*a22*a33;
+    b12 =  a03*a12*a30 - a02*a13*a30 - a03*a10*a32 + a00*a13*a32 + a02*a10*a33 - a00*a12*a33;
+    b13 = -a03*a12*a20 + a02*a13*a20 + a03*a10*a22 - a00*a13*a22 - a02*a10*a23 + a00*a12*a23;
+    b20 = -a13*a21*a30 + a11*a23*a30 + a13*a20*a31 - a10*a23*a31 - a11*a20*a33 + a10*a21*a33;
+    b21 =  a03*a21*a30 - a01*a23*a30 - a03*a20*a31 + a00*a23*a31 + a01*a20*a33 - a00*a21*a33;
+    b22 = -a03*a11*a30 + a01*a13*a30 + a03*a10*a31 - a00*a13*a31 - a01*a10*a33 + a00*a11*a33;
+    b23 =  a03*a11*a20 - a01*a13*a20 - a03*a10*a21 + a00*a13*a21 + a01*a10*a23 - a00*a11*a23;
+    b30 =  a12*a21*a30 - a11*a22*a30 - a12*a20*a31 + a10*a22*a31 + a11*a20*a32 - a10*a21*a32;
+    b31 = -a02*a21*a30 + a01*a22*a30 + a02*a20*a31 - a00*a22*a31 - a01*a20*a32 + a00*a21*a32;
+    b32 =  a02*a11*a30 - a01*a12*a30 - a02*a10*a31 + a00*a12*a31 + a01*a10*a32 - a00*a11*a32;
+    b33 = -a02*a11*a20 + a01*a12*a20 + a02*a10*a21 - a00*a12*a21 - a01*a10*a22 + a00*a11*a22;
+    b00 = b00/det;
+    b01 = b01/det;
+    b02 = b02/det;
+    b03 = b03/det;
+    b10 = b10/det;
+    b11 = b11/det;
+    b12 = b12/det;
+    b13 = b13/det;
+    b20 = b20/det;
+    b21 = b21/det;
+    b22 = b22/det;
+    b23 = b23/det;
+    b30 = b30/det;
+    b31 = b31/det;
+    b32 = b32/det;
+    b33 = b33/det;
+    mb.at(0,0) = b00;
+    mb.at(0,1) = b01;
+    mb.at(0,2) = b02;
+    mb.at(0,3) = b03;
+    mb.at(1,0) = b10;
+    mb.at(1,1) = b11;
+    mb.at(1,2) = b12;
+    mb.at(1,3) = b13;
+    mb.at(2,0) = b20;
+    mb.at(2,1) = b21;
+    mb.at(2,2) = b22;
+    mb.at(2,3) = b23;
+    mb.at(3,0) = b30;
+    mb.at(3,1) = b31;
+    mb.at(3,2) = b32;
+    mb.at(3,3) = b33;
+  }
+  //this was STUPID. Gaah. Computer algebra system saves the day? I'd be surprised if this didn't end up *slower* than gaussian elimination. Don't do this again!
+  return mb;
+}
+
+__host__ __device__ cumat4f cumat4f::inverse()
+{
+    return minverse(*this);
+}
+
+
+__host__ __device__ float cuvect4f_dot(cuvect4f a, cuvect4f b)
+{
+    float ret = 0.0;
+    ret = a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w;
+    return ret;
+}
+
+__host__ __device__ float cuvect4f_norm(cuvect4f a)
+{
+    float ret = 0.0;
+    ret = ::sqrtf(cuvect4f_dot(a,a));
+    return ret;
+}
+
+__host__ __device__ cuvect4f cuvect4f_normalize(cuvect4f a)
+{
+    cuvect4f ret = cuvect4f(0.0f,0.0f,0.0f,0.0f);
+    float nrm = cuvect4f_norm(a);
+    if(nrm>0.0)
+        ret = a/nrm;
+    return ret;
+}
+
+__host__ __device__ cuvect4f cuvect4f_proj(cuvect4f a, cuvect4f b)
+{
+    cuvect4f ret;
+    cuvect4f bn = cuvect4f_normalize(b);
+    float d = cuvect4f_dot(a,bn);
+    ret = bn*d;
+    return ret;
+}
+
+}; //namespace amscuda
--- a/src/main.cu
+++ b/src/main.cu
@ -0,0 +1,27 @@
+#include <amsculib2/amsculib2.hpp>
+
+//using namespace ams;
+using namespace amscuda;
+
+int main(int argc, char* argv[])
+{
+    printf("AMSCULIB2: Cuda Library Tests.\n");
+
+    //test_amscuarray_1();
+    //test_amscumath1();
+
+    //cmp::test_cucomp64_1();
+    //cmp::test_cucomp128_1();
+
+    //test_amscuarray_2();
+
+    //test_dprg64();
+    //printf("\n");
+    //test_dprg32();
+
+    //test_dbuff_rand_dpr32();
+
+    test_amscurarray1();
+
+    return 0;
+}
--- a/src/main.o
+++ b/src/main.o
--- a/test_scripts/test_dbuff_dpr32.py
+++ b/test_scripts/test_dbuff_dpr32.py
@ -0,0 +1,76 @@
+#!/usr/bin/python3
+
+import os,sys,math
+import numpy as np
+import matplotlib.pyplot as plt
+
+#################
+## Subroutines ##
+#################
+
+def binload_float_ndarray(fp):
+    arr = np.zeros((0),dtype=np.float32,order='F')
+
+    qb = fp.read(4)
+    Nd = np.frombuffer(qb,dtype=np.int32,count=1)[0]
+    shp = np.zeros((Nd),dtype=np.int32)
+
+    piprod = 1
+    for I in range(0,Nd):
+        qb = fp.read(4)
+        shp[I] = np.frombuffer(qb,dtype=np.int32,count=1)[0]
+        piprod = piprod*shp[I]
+    
+    qb = fp.read(4*piprod)
+    arr = np.frombuffer(qb,dtype=np.float32,count=piprod)
+
+    arr = arr.reshape(shp)
+    
+    return arr;
+
+def binsave_float_ndarray(fp,arr):
+
+
+
+
+    return
+
+
+#################
+## Main Script ##
+#################
+
+def test_1():
+
+    fname = "./test_scripts/test_dbuff_rand_dpr32.bin"
+    try:
+        fp = open(fname,"rb")
+    except:
+        print("Could not open {} for reading".format(fname))
+        return
+    
+    arr1 = binload_float_ndarray(fp)
+    arr2 = binload_float_ndarray(fp)
+
+    fp.close()
+
+    plt.subplot(2,2,1)
+    plt.imshow(arr1)
+    plt.subplot(2,2,2)
+    plt.imshow(arr2)
+    plt.subplot(2,2,3)
+    plt.hist(arr1.flatten(),bins=100)
+    plt.subplot(2,2,4)
+    plt.hist(arr2.flatten(),bins=100)
+    plt.show()
+
+    print("{} {}".format(np.mean(arr2),np.std(arr2)))
+    
+
+    return
+
+if(__name__=="__main__"):
+    test_1()
+
+    exit(0)
+