Making some of my library code public.

master
Aaron 3 weeks ago
commit 64a7bfc851

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

@ -0,0 +1,639 @@
#!/usr/bin/python3
#Python3 compilation library
#Aaron M. Schinder
#29 Dec 2020
#
#Cleanup and refactor from 2017 python2 version compilation libraries
import os,sys,math,subprocess
#####################
#Directory Functions#
#####################
##flist - list all files in a given directory pth
##optional arguments:
# recurse - (T/F): Whether to recursively search for files in directory tree
# exts - (list): A list of file extensions to filter on
# normpath (T/F): whether to normalize path variables after
#filelist = flist(pth,**kwargs):
def flist(pth,**kwargs):
flst = []
if(not('recurse' in kwargs)):
recurse_ = False
else:
recurse_ = kwargs['recurse']
if(not('exts' in kwargs)):
filterexts_ = False
else:
filterexts_ = True
exts = kwargs['exts']
if(not('normpath' in kwargs)):
normpath_ = True
else:
normpath_ = kwargs['normpath']
if(not('linuxpath' in kwargs)):
linuxpath_ = False
else:
linuxpath_ = kwargs['linuxpath']
if(not('followlinks' in kwargs)):
followlinks_ = False
else:
followlinks_ = kwargs['followlinks']
dirlist = []
rawlist = os.listdir(pth)
for F in rawlist:
F2 = os.path.join(pth,F)
if(os.path.isdir(F2)):
b = (followlinks_) or ((not followlinks_) and not(os.path.islink(F2)))
if(b):
if((F2!=".")&(F2!="..")):
dirlist.append(F2)
elif(os.path.isfile(F2)):
flst.append(F2)
#Recurse through directories
if(recurse_):
for D in dirlist:
lst = flist(D,**kwargs)
for L in lst:
flst.append(L)
#Postprocess:
#Filter out all extensions except the selected ext list
if(filterexts_):
flst = filterexts(flst,exts)
#Normalize filename path according to os
if(normpath_):
flst2 = list(flst)
for I in range(0,len(flst2)):
flst[I] = os.path.normpath(flst2[I])
#If linuxpath, convert all \\ to /
#if(linuxpath_):
# flst2 = list(flst)
# for I in range(0,len(flst2)):
# flst[I] = linuxpath(flst2[I])
return flst
#Filters by extensions in a list of files
#flst = def filterexts(flst,exts):
def filterexts(flst,exts):
flst2 = []
if(isinstance(exts,str)):
exts = list([exts])
for F in flst:
b = False
for ext in exts:
if(ext[0]!='.'):
ext = '.'+ext
F2 = os.path.splitext(F)
if(len(F2)>=2):
ex = F2[1]
if(len(ex)>0):
if(ex[0]!='.'):
ex = '.'+ex
if(ex==ext):
b = True
if(b):
flst2.append(F)
return flst2
#Find a file fname, starting in pth and recursing
#Used for finding library files to link
def findfile(fname,pth,**kwargs):
fullfname = ""
flst = flist(pth,recurse=True)
for F in flst:
F2 = os.path.split(F)[1]
if(F2 == fname):
fullfname = F
return fullfname
#List to space-seperated-string
def list_to_sss(lst):
lout = ""
for I in range(0,len(lst)-1):
lout = lout + lst[I] + " "
if(len(lst)>0):
lout = lout + lst[len(lst)-1]
return lout
def strip_whitespace(strin):
strout = ""
I1 = -1
I2 = -1
for I in range(0,len(strin)):
if(strin[I]!=' ' and strin[I]!='\t' and strin[I]!='\r'and strin[I]!='\n'):
I1 = I
break
q = list(range(0,len(strin)))
q.reverse()
for I in q:
if(strin[I]!=' ' and strin[I]!='\t' and strin[I]!='\r'and strin[I]!='\n'):
I2 = I+1
break
if(I1>=0 and I2>=0):
strout = strin[I1:I2]
return strout
def sss_to_list(sss):
lout = []
l1 = sss.split(' ')
for l in l1:
l2 = strip_whitespace(l)
lout.append(l2)
return lout
def replaceext(fname,ext):
fname2 = ""
if(len(ext)>0):
if(ext[0]!='.'):
ext = '.'+ext
fname2 = os.path.splitext(fname)[0]+ext
else:
fname2 = os.path.splitext(fname)[0]
return fname2
def replaceexts(fnamelist,ext):
fname2list = []
for F in fnamelist:
F2 = replaceext(F,ext)
fname2list.append(F2)
return fname2list
# def except_contains_oldv(lst1,exc):
# lst2 = []
# for item in lst1:
# b = 1
# for item2 in exc:
# if(item.find(item2)>=0):
# b = 0
# break
# if(b==1):
# lst2.append(item)
# return lst2
#filenames must match
def except_contains(lst1,exc):
lst2 = []
for item in lst1:
b = 1
for item2 in exc:
fsplit = os.path.split(item)
fn = fsplit[len(fsplit)-1]
if(fn==item2):
b = 0
break
if(b==1):
lst2.append(item)
return lst2
##########################
##System Call Procedures##
##########################
def callproc(cmd, **kwargs):
if(not('logfile' in kwargs)):
use_lf = False
else:
logfile = kwargs['logfile']
if(logfile!=""):
fp = open(kwargs['logfile'],'a+')
use_lf = True
else:
use_lf = False
if(not('echo' in kwargs)):
echo = True
else:
echo = kwargs['echo']
if(echo):
print(cmd)
#encoding/deconding to/from bytes is necessary to use the subprocess command
#in python3.7
#However, only do this in linux
if(sys.platform!='win32'):
cmd2 = cmd.encode(encoding='utf-8')
else:
cmd2 = cmd
proc = subprocess.Popen(cmd2,stderr = subprocess.STDOUT, stdout=subprocess.PIPE, shell=True)
(out, err) = proc.communicate()
out = out.decode(encoding='utf-8')
if(echo):
print(out)
#print(err);
if(use_lf):
fp.writelines(cmd+'\n')
fp.writelines(out+'\n')
if(use_lf):
fp.close()
#######################################
##Compiler, Archive, and Linker Calls##
#######################################
def smartcompile(srcfile,objext='.o'):
mtsrc = os.path.getmtime(srcfile)
objfile = replaceext(srcfile,objext)
objexists = os.path.exists(objfile)
ret = True
if(objexists):
mtobj = os.path.getmtime(objfile)
if(mtobj>=mtsrc):
ret = False
return ret
#MSVC compiler wrapper
def msvc_compile(compilername, srcfile, **kwargs):
if(not('include' in kwargs)):
include = ''
else:
include = kwargs['include']
if(isinstance(include,list)):
include = list_to_sss(include)
if(not('flags' in kwargs)):
flags = ''
else:
flags = kwargs['flags']
if(isinstance(flags,list)):
flags = list_to_sss(flags)
if(not('objext' in kwargs)):
objext = '.obj'
else:
objext = kwargs['objext']
if(not('srcfileflag' in kwargs)):
srcfileflag = '/c'
else:
srcfileflag = kwargs['srcfileflag']
if(not('outfileflag' in kwargs)):
outfileflag = '/Fo:'
else:
outfileflag = kwargs['outfileflag']
if(not('logfile' in kwargs)):
logfile = ""
else:
logfile = kwargs['logfile']
outfile = replaceext(srcfile,objext)
ln = compilername+" "+flags+" "+" "+srcfileflag+" "+srcfile+" "+outfileflag+outfile
ln = ln + " " + include
callproc(ln,echo=True,logfile=logfile)
return
#MSVC compiler wrapper
def msvc_compile_list(compiler,srclist,**kwargs):
for S in srclist:
msvc_compile(compiler,S,**kwargs)
return
#gnu-style compiler compile: Should work with gcc, g++, gfortran
def gs_compile(compiler,srcfile,**kwargs):
if(not('include' in kwargs)):
include = ''
else:
include = kwargs['include']
if(isinstance(include,list)):
include = list_to_sss(include)
if(not('flags' in kwargs)):
flags = ''
else:
flags = kwargs['flags']
if(isinstance(flags,list)):
flags = list_to_sss(flags)
if(not('objext' in kwargs)):
objext = '.o'
else:
objext = kwargs['objext']
if(not('srcfileflag' in kwargs)):
srcfileflag = '-c'
else:
srcfileflag = kwargs['srcfileflag']
if(not('outfileflag' in kwargs)):
outfileflag = '-o'
else:
outfileflag = kwargs['outfileflag']
if(not('logfile' in kwargs)):
logfile = ""
else:
logfile = kwargs['logfile']
if(not('smartcompile' in kwargs)):
_smartcompile = True
else:
_smartcompile = kwargs['smartcompile']
#Do I want to make this thing this general?
if(not(_smartcompile) or smartcompile(srcfile,objext)):
outfile = replaceext(srcfile,objext)
ln = compiler+" "+flags+" " + outfileflag+" "+outfile+" "+srcfileflag+" "+srcfile
ln = ln + " " + include
callproc(ln,echo=True,logfile=logfile)
return
def gs_compile_list(compiler,srclist,**kwargs):
for S in srclist:
gs_compile(compiler,S,**kwargs)
return
def gs_compile_all(compiler,srcdir,srcexts,**kwargs):
if(not('recurse' in kwargs)):
recurse = True
else:
recurse = kwargs['recurse']
srcfils = flist(srcdir,exts=srcexts,recurse=recurse)
for S in srcfils:
gs_compile(compiler,S,**kwargs)
return
def gs_link_all(linker,srcpath,target,**kwargs):
if(not('objext' in kwargs)):
objext = '.o'
else:
objext = kwargs['objext']
if(not('recurse' in kwargs)):
recurse = True
else:
recurse = kwargs['recurse']
objfils = flist(srcpath,exts=objext,recurse=recurse)
oflst = list_to_sss(objfils)
gs_link_list(linker,oflst,target,**kwargs)
return
def gs_link_list(linker,objlist,target,**kwargs):
if(not('objext' in kwargs)):
objext = '.o'
else:
objext = kwargs['objext']
if(not('libdir' in kwargs)):
libdir = ''
else:
libdir = kwargs['libdir']
if(not('staticlibs' in kwargs)):
staticlibs = ''
else:
staticlibs = kwargs['staticlibs']
if(not('libflags' in kwargs)):
libflags = ''
else:
libflags = kwargs['libflags']
if(not('linkerflags' in kwargs)):
linkerflags = ''
else:
linkerflags = kwargs['linkerflags']
if(not('recurse' in kwargs)):
recurse = True
else:
recurse = kwargs['recurse']
if(not('logfile' in kwargs)):
logfile = ''
else:
logfile = kwargs['logfile']
ln = linker+" -o "+target+" "+libdir
ln = ln+" "+objlist+" "+staticlibs+" "+libflags+" "+linkerflags
callproc(ln,logfile=logfile)
return
def msvc_link_list(objlist,target,**kwargs):
linker = 'link'
if(not('objext' in kwargs)):
objext = '.obj'
else:
objext = kwargs['objext']
if(not('libdir' in kwargs)):
libdir = ''
else:
libdir = kwargs['libdir']
if(not('staticlibs' in kwargs)):
staticlibs = ''
else:
staticlibs = kwargs['staticlibs']
if(not('libflags' in kwargs)):
libflags = ''
else:
libflags = kwargs['libflags']
if(not('linkerflags' in kwargs)):
linkerflags = ''
else:
linkerflags = kwargs['linkerflags']
if(not('recurse' in kwargs)):
recurse = True
else:
recurse = kwargs['recurse']
if(not('logfile' in kwargs)):
logfile = ''
else:
logfile = kwargs['logfile']
ln = linker+" "+libdir
ln = ln+" "+objlist+" "+staticlibs+" "+linkerflags
ln = ln+" /out:"+target+" "+libflags
callproc(ln,logfile=logfile)
return
def ar_all(srcpath,arname,**kwargs):
if(not('recurse' in kwargs)):
recurse = True
else:
recurse = kwargs['recurse']
if(not('objext' in kwargs)):
objext = '.o'
else:
objext = kwargs['objext']
objlist = flist(srcpath,exts=objext,recurse=recurse)
ar_list(objlist,arname,**kwargs)
return
def msvc_lib_list(objlist,arname,**kwargs):
objlist2 = list_to_sss(objlist)
ln = "lib "+objlist2+" /out:"+arname
callproc(ln)
return
def ar_list(objlist,arname,**kwargs):
objlist2 = list_to_sss(objlist)
ln = "ar cr "+ arname+" "+objlist2
callproc(ln)
return
def ar_add_list(objlist,arname,**kwargs):
objlist2 = list_to_sss(objlist)
ln = "ar t "+arname+" "+objlist2
callproc(ln)
return
##############################
##Derived Compiler Functions##
##############################
def gcc_compile(srcfile,**kwargs):
compiler = 'gcc'
kwargs['objext'] = '.o'
#srcexts = ['.c']
gs_compile(compiler,srcfile,**kwargs)
return
def gcc_compile_all(srcdir,**kwargs):
compiler = 'gcc'
kwargs['objext'] = '.o'
srcexts = ['.c']
gs_compile_all(compiler,srcdir,srcexts,**kwargs)
return
def gcc_compile_list(srclist,**kwargs):
compiler = 'gcc'
kwargs['objext'] = '.o'
#srcexts = ['.c']
gs_compile_list(compiler,srclist,**kwargs)
return
def gpp_compile(srcfile,**kwargs):
compiler = 'g++'
kwargs['objext'] = '.o'
#srcexts = ['.c','.cpp']
gs_compile(compiler,srcfile,**kwargs)
return
def gpp_compile_all(srcdir,**kwargs):
compiler = 'g++'
kwargs['objext'] = '.o'
srcexts = ['.c','.cpp']
gs_compile_all(compiler,srcdir,srcexts,**kwargs)
return
def gpp_compile_list(srclist,**kwargs):
compiler = 'g++'
kwargs['objext'] = '.o'
#srcexts = ['.c','.cpp']
gs_compile_list(compiler,srclist,**kwargs)
return
def gfortran_compile(srcfile,**kwargs):
compiler = 'gfortran'
kwargs['objext'] = '.o'
#srcexts = ['.f','.f90','.f77']
gs_compile(compiler,srcfile,**kwargs)
return
def gfortran_compile_all(srcdir,**kwargs):
compiler = 'gfortran'
kwargs['objext'] = '.o'
srcexts = ['.f','.f90','.f77']
gs_compile_all(compiler,srcdir,srcexts,**kwargs)
return
def gfortran_compile_list(srclist,**kwargs):
compiler = 'gfortran'
kwargs['objext'] = '.o'
#srcexts = ['.f','.f90','.f77']
gs_compile_list(compiler,srclist,**kwargs)
return
def clang_compile(srcfile,**kwargs):
compiler = 'clang++'
kwargs['objext'] = '.o'
#srcexts = ['.c','.cpp']
gs_compile(compiler,srcfile,**kwargs)
return
def clang_compile_all(srcdir,**kwargs):
compiler = 'clang++'
kwargs['objext'] = '.o'
srcexts = ['.c','.cpp']
gs_compile_all(compiler,srcdir,srcexts,**kwargs)
return
def clang_compile_list(srclist,**kwargs):
compiler = 'clang++'
kwargs['objext'] = '.o'
#srcexts = ['.c','.cpp']
gs_compile_list(compiler,srclist,**kwargs)
return

@ -0,0 +1,524 @@
#!/usr/bin/python3
import os,sys,math
import subprocess
##flist - list all files in a given directory pth
##optional arguments:
# recurse - (T/F): Whether to recursively search for files in directory tree
# exts - (list): A list of file extensions to filter on
# normpath (T/F): whether to normalize path variables after
#filelist = flist(pth,**kwargs):
def flist(pth,**kwargs):
flst = []
if(not('recurse' in kwargs)):
recurse_ = False
else:
recurse_ = kwargs['recurse']
if(not('exts' in kwargs)):
filterexts_ = False
else:
filterexts_ = True
exts = kwargs['exts']
if(not('normpath' in kwargs)):
normpath_ = True
else:
normpath_ = kwargs['normpath']
if(not('linuxpath' in kwargs)):
linuxpath_ = False
else:
linuxpath_ = kwargs['linuxpath']
if(not('followlinks' in kwargs)):
followlinks_ = False
else:
followlinks_ = kwargs['followlinks']
dirlist = []
rawlist = os.listdir(pth)
for F in rawlist:
F2 = os.path.join(pth,F)
if(os.path.isdir(F2)):
b = (followlinks_) or ((not followlinks_) and not(os.path.islink(F2)))
if(b):
if((F2!=".")&(F2!="..")):
dirlist.append(F2)
elif(os.path.isfile(F2)):
flst.append(F2)
#Recurse through directories
if(recurse_):
for D in dirlist:
lst = flist(D,**kwargs)
for L in lst:
flst.append(L)
#Postprocess:
#Filter out all extensions except the selected ext list
if(filterexts_):
flst = filterexts(flst,exts)
#Normalize filename path according to os
if(normpath_):
flst2 = list(flst)
for I in range(0,len(flst2)):
flst[I] = os.path.normpath(flst2[I])
#If linuxpath, convert all \\ to /
#if(linuxpath_):
# flst2 = list(flst)
# for I in range(0,len(flst2)):
# flst[I] = linuxpath(flst2[I])
return flst
#Filters by extensions in a list of files
#flst = def filterexts(flst,exts):
def filterexts(flst,exts):
flst2 = []
if(isinstance(exts,str)):
exts = list([exts])
for F in flst:
b = False
for ext in exts:
if(ext[0]!='.'):
ext = '.'+ext
F2 = os.path.splitext(F)
if(len(F2)>=2):
ex = F2[1]
if(len(ex)>0):
if(ex[0]!='.'):
ex = '.'+ex
if(ex==ext):
b = True
if(b):
flst2.append(F)
return flst2
#Find a file fname, starting in pth and recursing
#Used for finding library files to link
def findfile(fname,pth,**kwargs):
fullfname = ""
flst = flist(pth,recurse=True)
for F in flst:
F2 = os.path.split(F)[1]
if(F2 == fname):
fullfname = F
return fullfname
def replaceext(fname,ext):
fname2 = ""
if(len(ext)>0):
if(ext[0]!='.'):
ext = '.'+ext
fname2 = os.path.splitext(fname)[0]+ext
else:
fname2 = os.path.splitext(fname)[0]
return fname2
def replaceexts(fnamelist,ext):
fname2list = []
for F in fnamelist:
F2 = replaceext(F,ext)
fname2list.append(F2)
return fname2list
#filenames must match
def except_contains(lst1,exc):
lst2 = []
for item in lst1:
b = 1
for item2 in exc:
fsplit = os.path.split(item)
fn = fsplit[len(fsplit)-1]
if(fn==item2):
b = 0
break
if(b==1):
lst2.append(item)
return lst2
##########################
##System Call Procedures##
##########################
def callproc(cmd, **kwargs):
if(not('logfile' in kwargs)):
use_lf = False
else:
logfile = kwargs['logfile']
if(logfile!=""):
fp = open(kwargs['logfile'],'a+')
use_lf = True
else:
use_lf = False
if(not('echo' in kwargs)):
echo = True
else:
echo = kwargs['echo']
if(echo):
print(cmd)
#encoding/deconding to/from bytes is necessary to use the subprocess command
#in python3.7
#However, only do this in linux
if(sys.platform!='win32'):
cmd2 = cmd.encode(encoding='utf-8')
else:
cmd2 = cmd
proc = subprocess.Popen(cmd2,stderr = subprocess.STDOUT, stdout=subprocess.PIPE, shell=True)
(out, err) = proc.communicate()
out = out.decode(encoding='utf-8')
if(echo):
print(out)
#print(err);
if(use_lf):
fp.writelines(cmd+'\n')
fp.writelines(out+'\n')
if(use_lf):
fp.close()
#List to space-seperated-string
def list_to_sss(lst):
lout = ""
for I in range(0,len(lst)-1):
lout = lout + lst[I] + " "
if(len(lst)>0):
lout = lout + lst[len(lst)-1]
return lout
#####################################
## Incremental Compilation Library ##
#####################################
#silently read lines from a text file if exists
def readtextlines(fname):
txtlns = []
if(not os.path.isfile(fname)):
return txtlns
try:
fp = open(fname,"r")
except:
return txtlns
ln = " "
while(ln!=""):
ln = fp.readline()
txtlns.append(ln)
fp.close()
return txtlns
def getincludefnfrage(includeline):
fnfrag = ""
I1 = -1
I2 = -1
for I in range(0,len(includeline)):
if(I1<0 and (includeline[I]=='<' or includeline[I]=='"')):
I1 = I
if(I1>=0 and (includeline[I]=='>' or includeline[I]=='"')):
I2 = I
break
if(I1>=0 and I2>=0):
fnfrag = includeline[I1+1:I2]
return fnfrag
#Returns the name of the source file fname (if it exists)
#and all included filenames
def getsrcandincludes(fname, incdirs):
flist = []
if(os.path.isfile(fname)):
flist.append(fname)
Ilist = 0
while(Ilist<len(flist)):
#recurse through files
f1 = flist[Ilist]
lns = readtextlines(f1)
for J in range(0,len(lns)):
if(lns[J].find("#include")>=0):
fnfrag = getincludefnfrage(lns[J])
for K in range(0,len(incdirs)):
tfn = os.path.join(incdirs[K],fnfrag)
if(os.path.isfile(tfn)):
flist.append(tfn)
break
Ilist = Ilist + 1
return flist
#Returns the name of the object file associated with the source file
#within the object store folder (if it exists)
def getobjfile(fname,objstore,objext = ".o"):
fret = ""
f1 = os.path.split(fname)[1]
f2 = f1
while(os.path.splitext(f2)[1]!=""):
f2 = os.path.splitext(f2)[0]
objext = objext.strip('.')
f3 = os.path.join(objstore,"{}.{}".format(f2,objext))
if(os.path.exists(f3)):
fret = f3
return fret
def getsrctimes(fname, incdirs):
ftimes = []
flst = getsrcandincludes(fname, incdirs)
for I in range(0,len(flst)):
f = flst[I]
mt = os.path.getmtime(f)
ftimes.append(mt)
return ftimes
def getobjtime(fname,objstore,objext=".o"):
ret = -1
fret = getobjfile(fname,objstore,objext)
if(fret!=""):
ret = os.path.getmtime(fret)
return ret
#Decide whether or not to compile source file
def decidecompile(fname,**kwargs):
ret = True
if(not os.path.isfile(fname)):
ret = False
return ret
##unpack kwargs
if("searchincdirs" in kwargs):
incdirs = kwargs["searchincdirs"]
else:
incdirs = ["./include"]
if("objext" in kwargs):
objext = kwargs["objext"]
else:
objext = ".o"
if("objstore" in kwargs):
objstore = kwargs["objstore"]
else:
objstore = "./objstore"
srclist = getsrcandincludes(fname,incdirs)
srctlist = getsrctimes(fname,incdirs)
obj = getobjfile(fname,objstore,objext)
objt = getobjtime(fname,objstore,objext)
if(obj!=""):
ret = False
for I in range(0,len(srctlist)):
if(srctlist[I]>objt):
ret = True
break
return ret
def gs_incremental_compile(compiler,srcfile,**kwargs):
if(not('include' in kwargs)):
include = ''
else:
include = kwargs['include']
if(isinstance(include,list)):
include = list_to_sss(include)
if(not('flags' in kwargs)):
flags = ''
else:
flags = kwargs['flags']
if(isinstance(flags,list)):
flags = list_to_sss(flags)
if(not('objext' in kwargs)):
objext = '.o'
else:
objext = kwargs['objext']
if(not('srcfileflag' in kwargs)):
srcfileflag = '-c'
else:
srcfileflag = kwargs['srcfileflag']
if(not('outfileflag' in kwargs)):
outfileflag = '-o'
else:
outfileflag = kwargs['outfileflag']
if(not('logfile' in kwargs)):
logfile = ""
else:
logfile = kwargs['logfile']
if(not('smartcompile' in kwargs)):
_smartcompile = True
else:
_smartcompile = kwargs['smartcompile']
#incrementalcompile
if("searchincdirs" in kwargs):
incdirs = kwargs["searchincdirs"]
else:
incdirs = ["./include"]
if("objext" in kwargs):
objext = kwargs["objext"]
else:
objext = ".o"
if("objstore" in kwargs):
objstore = kwargs["objstore"]
else:
objstore = "./objstore"
#Do I want to make this thing this general?
docompile = decidecompile(srcfile,**kwargs)
if(docompile):
f1 = os.path.split(srcfile)[1]
f2 = f1
while(os.path.splitext(f2)[1]!=""):
f2 = os.path.splitext(f2)[0]
outfile = os.path.join(objstore,"{}{}".format(f2,objext))
ln = compiler+" "+flags+" " + outfileflag+" "+outfile+" "+srcfileflag+" "+srcfile
ln = ln + " " + include
callproc(ln,echo=True,logfile=logfile)
return
def gs_incremental_compile_list(compiler,srclist,**kwargs):
for s in srclist:
gs_incremental_compile(compiler,s,**kwargs)
return
#MSVC compiler wrapper
def msvc_incremental_compile(compilername, srcfile, **kwargs):
if(not('include' in kwargs)):
include = ''
else:
include = kwargs['include']
if(isinstance(include,list)):
include = list_to_sss(include)
if(not('flags' in kwargs)):
flags = ''
else:
flags = kwargs['flags']
if(isinstance(flags,list)):
flags = list_to_sss(flags)
if(not('objext' in kwargs)):
objext = '.obj'
else:
objext = kwargs['objext']
if(not('srcfileflag' in kwargs)):
srcfileflag = '/c'
else:
srcfileflag = kwargs['srcfileflag']
if(not('outfileflag' in kwargs)):
outfileflag = '/Fo:'
else:
outfileflag = kwargs['outfileflag']
if(not('logfile' in kwargs)):
logfile = ""
else:
logfile = kwargs['logfile']
#incrementalcompile
if("searchincdirs" in kwargs):
incdirs = kwargs["searchincdirs"]
else:
incdirs = ["./include"]
# if("objext" in kwargs):
# objext = kwargs["objext"]
# else:
# objext = ".o"
if("objstore" in kwargs):
objstore = kwargs["objstore"]
else:
objstore = "./objstore"
docompile = decidecompile(srcfile,**kwargs)
if(docompile):
f1 = os.path.split(srcfile)[1]
f2 = f1
while(os.path.splitext(f2)[1]!=""):
f2 = os.path.splitext(f2)[0]
outfile = os.path.join(objstore,"{}{}".format(f2,objext))
ln = compilername+" "+flags+" "+srcfileflag+" "+srcfile+" "+outfileflag+" "+outfile
ln = ln + " " + include
callproc(ln,echo=True,logfile=logfile)
# outfile = replaceext(srcfile,objext)
# ln = compilername+" "+flags+" "+" "+srcfileflag+" "+srcfile+" "+outfileflag+outfile
# ln = ln + " " + include
callproc(ln,echo=True,logfile=logfile)
return
def msvc_incremental_compile_list(compiler,srclist,**kwargs):
for S in srclist:
msvc_incremental_compile(compiler,S,**kwargs)
return
#######################
## Main Script Tests ##
#######################
def testtimes(args):
if(len(args)>=2):
flist = getsrcandincludes(args[1],["./include"])
ftlist = getsrctimes(args[1],["./include"])
for I in range(0,len(flist)):
print("{}\t\t{}".format(flist[I],ftlist[I]))
print("associated obj file:")
fobj = getobjfile(args[1],"./objstore")
ftobj = getobjtime(args[1],"./objstore")
if(fobj!=""):
print("{}\t\t{}".format(fobj,ftobj))
else:
print("none found")
cflag = decidecompile(args[1])
print("compile? : {}".format(cflag))
return
# if(__name__ == "__main__"):
# args = sys.argv
# testtimes(args)

@ -0,0 +1,52 @@
#!/usr/bin/python3
import os,sys,subprocess,math
from complib2 import *
from complib3 import gs_incremental_compile, gs_incremental_compile_list
import shutil
#from distutils.dir_util import copy_tree as copy_tree #this version does overwrites
from shutil import copytree
libname = 'amsculib2.linux64' #prefix static library name to generate
targetname = 'test' #create this executable when compiling tests
commonincdir = "../../linux64/include"
commonlibdir = "../../linux64/lib"
localbindir = "./bin_linux64"
cc = 'nvcc' #compiler
srcexts = ['.c','.cpp','.cu']
mainsrc = ['main.cu'] #ignore these files when compiling the static library
kwargs = dict()
include = "-I./include -I{}".format(commonincdir)
kwargs['include'] = include
#-dc flag: relocatable device code - needed for device functions to link in different "execution units"
#--ptxas-options=-v
kwargs['flags'] = "-dc --compiler-options '-fPIC -O3'"
kwargs['libdir'] = "-L{} -L{}".format(localbindir,commonlibdir)
kwargs['libflags'] = "-l{}".format(libname)
kwargs['linkerflags'] = ""
kwargs['recurse'] = True
kwargs['objstore'] = "./objstore"
kwargs['searchincdirs'] = ['./include']
#find all source files, except the main project files
files = flist('./src',exts = srcexts, recurse=True)
files = except_contains(files,mainsrc)
objfiles = replaceexts(files,'.o')
objfiles_sss = list_to_sss(objfiles)
#compile all the source files in the list
#gs_compile_list(cc,files,**kwargs)
gs_incremental_compile_list(cc,files,**kwargs)
#archive all the source files into a static library
#ar_list(objfiles,'{}/lib{}.a'.format(localbindir,libname))
objlist = flist(kwargs['objstore'],exts='.o',recurse=True)
ar_list(objlist,'{}/lib{}.a'.format(localbindir,libname))
# #Push any libraries to the common lib folder
shutil.copy('{}/lib{}.a'.format(localbindir,libname),commonlibdir)
# #Copy include files to the common include folder
copytree('./include/',commonincdir+'/',dirs_exist_ok=True)

@ -0,0 +1,43 @@
#!/usr/bin/python3
import os,sys,subprocess,math
from complib2 import *
from complib3 import gs_incremental_compile, gs_incremental_compile_list
import shutil
libname = 'amsculib2.linux64' #prefix static library name to generate
targetname = 'test' #create this executable when compiling tests
commonincdir = "../../linux64/include"
commonlibdir = "../../linux64/lib"
localbindir = "./bin_linux64"
cc = 'nvcc' #compiler
srcexts = ['.c','.cpp','.cu']
mainsrc = ['main.cu'] #ignore these files when compiling the static library
kwargs = dict()
include = "-I./include -I{}".format(commonincdir)
kwargs['include'] = include
#-dc flag: relocatable device code - needed for device functions to link in different "execution units"
kwargs['flags'] = "-dc --compiler-options '-fPIC'"
kwargs['libdir'] = "-L{} -L{}".format(localbindir,commonlibdir)
kwargs['libflags'] = "-l{} -lamsculib2.linux64".format(libname)
kwargs['linkerflags'] = ""
kwargs['recurse'] = True
kwargs['objstore'] = "./objstore"
kwargs['searchincdirs'] = ['./include']
#-lamsmathlib3.linux64 -lamsstring3.linux64 -lamsmatrix_cpp.linux64 -llapack -lblas -lgfortran -lamsmathutilthread.linux64 -lamsmathutil2.linux64
#Pull required binary dynamic libraries to the bin folder
#shutil.copy('{}/libamsimg.dll.a'.format(commonlibdir),localbindir);
#shutil.copy('{}/libamsimg.dll'.format(commonlibdir),localbindir);
#shutil.copy('../../lib_winx64/glew32.dll','./bin_winx64');
#Designate source files for main test program
fsrc = ['./src/main.cu']
fobj = replaceexts(fsrc,'.o')
#Compile test programs
gs_compile_list(cc,fsrc,**kwargs)
gs_link_list(cc,list_to_sss(fobj),'{}/{}'.format(localbindir,targetname),**kwargs)

@ -0,0 +1,639 @@
#!/usr/bin/python3
#Python3 compilation library
#Aaron M. Schinder
#29 Dec 2020
#
#Cleanup and refactor from 2017 python2 version compilation libraries
import os,sys,math,subprocess
#####################
#Directory Functions#
#####################
##flist - list all files in a given directory pth
##optional arguments:
# recurse - (T/F): Whether to recursively search for files in directory tree
# exts - (list): A list of file extensions to filter on
# normpath (T/F): whether to normalize path variables after
#filelist = flist(pth,**kwargs):
def flist(pth,**kwargs):
flst = []
if(not('recurse' in kwargs)):
recurse_ = False
else:
recurse_ = kwargs['recurse']
if(not('exts' in kwargs)):
filterexts_ = False
else:
filterexts_ = True
exts = kwargs['exts']
if(not('normpath' in kwargs)):
normpath_ = True
else:
normpath_ = kwargs['normpath']
if(not('linuxpath' in kwargs)):
linuxpath_ = False
else:
linuxpath_ = kwargs['linuxpath']
if(not('followlinks' in kwargs)):
followlinks_ = False
else:
followlinks_ = kwargs['followlinks']
dirlist = []
rawlist = os.listdir(pth)
for F in rawlist:
F2 = os.path.join(pth,F)
if(os.path.isdir(F2)):
b = (followlinks_) or ((not followlinks_) and not(os.path.islink(F2)))
if(b):
if((F2!=".")&(F2!="..")):
dirlist.append(F2)
elif(os.path.isfile(F2)):
flst.append(F2)
#Recurse through directories
if(recurse_):
for D in dirlist:
lst = flist(D,**kwargs)
for L in lst:
flst.append(L)
#Postprocess:
#Filter out all extensions except the selected ext list
if(filterexts_):
flst = filterexts(flst,exts)
#Normalize filename path according to os
if(normpath_):
flst2 = list(flst)
for I in range(0,len(flst2)):
flst[I] = os.path.normpath(flst2[I])
#If linuxpath, convert all \\ to /
#if(linuxpath_):
# flst2 = list(flst)
# for I in range(0,len(flst2)):
# flst[I] = linuxpath(flst2[I])
return flst
#Filters by extensions in a list of files
#flst = def filterexts(flst,exts):
def filterexts(flst,exts):
flst2 = []
if(isinstance(exts,str)):
exts = list([exts])
for F in flst:
b = False
for ext in exts:
if(ext[0]!='.'):
ext = '.'+ext
F2 = os.path.splitext(F)
if(len(F2)>=2):
ex = F2[1]
if(len(ex)>0):
if(ex[0]!='.'):
ex = '.'+ex
if(ex==ext):
b = True
if(b):
flst2.append(F)
return flst2
#Find a file fname, starting in pth and recursing
#Used for finding library files to link
def findfile(fname,pth,**kwargs):
fullfname = ""
flst = flist(pth,recurse=True)
for F in flst:
F2 = os.path.split(F)[1]
if(F2 == fname):
fullfname = F
return fullfname
#List to space-seperated-string
def list_to_sss(lst):
lout = ""
for I in range(0,len(lst)-1):
lout = lout + lst[I] + " "
if(len(lst)>0):
lout = lout + lst[len(lst)-1]
return lout
def strip_whitespace(strin):
strout = ""
I1 = -1
I2 = -1
for I in range(0,len(strin)):
if(strin[I]!=' ' and strin[I]!='\t' and strin[I]!='\r'and strin[I]!='\n'):
I1 = I
break
q = list(range(0,len(strin)))
q.reverse()
for I in q:
if(strin[I]!=' ' and strin[I]!='\t' and strin[I]!='\r'and strin[I]!='\n'):
I2 = I+1
break
if(I1>=0 and I2>=0):
strout = strin[I1:I2]
return strout
def sss_to_list(sss):
lout = []
l1 = sss.split(' ')
for l in l1:
l2 = strip_whitespace(l)
lout.append(l2)
return lout
def replaceext(fname,ext):
fname2 = ""
if(len(ext)>0):
if(ext[0]!='.'):
ext = '.'+ext
fname2 = os.path.splitext(fname)[0]+ext
else:
fname2 = os.path.splitext(fname)[0]
return fname2
def replaceexts(fnamelist,ext):
fname2list = []
for F in fnamelist:
F2 = replaceext(F,ext)
fname2list.append(F2)
return fname2list
# def except_contains_oldv(lst1,exc):
# lst2 = []
# for item in lst1:
# b = 1
# for item2 in exc:
# if(item.find(item2)>=0):
# b = 0
# break
# if(b==1):
# lst2.append(item)
# return lst2
#filenames must match
def except_contains(lst1,exc):
lst2 = []
for item in lst1:
b = 1
for item2 in exc:
fsplit = os.path.split(item)
fn = fsplit[len(fsplit)-1]
if(fn==item2):
b = 0
break
if(b==1):
lst2.append(item)
return lst2
##########################
##System Call Procedures##
##########################
def callproc(cmd, **kwargs):
if(not('logfile' in kwargs)):
use_lf = False
else:
logfile = kwargs['logfile']
if(logfile!=""):
fp = open(kwargs['logfile'],'a+')
use_lf = True
else:
use_lf = False
if(not('echo' in kwargs)):
echo = True
else:
echo = kwargs['echo']
if(echo):
print(cmd)
#encoding/deconding to/from bytes is necessary to use the subprocess command
#in python3.7
#However, only do this in linux
if(sys.platform!='win32'):
cmd2 = cmd.encode(encoding='utf-8')
else:
cmd2 = cmd
proc = subprocess.Popen(cmd2,stderr = subprocess.STDOUT, stdout=subprocess.PIPE, shell=True)
(out, err) = proc.communicate()
out = out.decode(encoding='utf-8')
if(echo):
print(out)
#print(err);
if(use_lf):
fp.writelines(cmd+'\n')
fp.writelines(out+'\n')
if(use_lf):
fp.close()
#######################################
##Compiler, Archive, and Linker Calls##
#######################################
def smartcompile(srcfile,objext='.o'):
mtsrc = os.path.getmtime(srcfile)
objfile = replaceext(srcfile,objext)
objexists = os.path.exists(objfile)
ret = True
if(objexists):
mtobj = os.path.getmtime(objfile)
if(mtobj>=mtsrc):
ret = False
return ret
#MSVC compiler wrapper
def msvc_compile(compilername, srcfile, **kwargs):
if(not('include' in kwargs)):
include = ''
else:
include = kwargs['include']
if(isinstance(include,list)):
include = list_to_sss(include)
if(not('flags' in kwargs)):
flags = ''
else:
flags = kwargs['flags']
if(isinstance(flags,list)):
flags = list_to_sss(flags)
if(not('objext' in kwargs)):
objext = '.obj'
else:
objext = kwargs['objext']
if(not('srcfileflag' in kwargs)):
srcfileflag = '/c'
else:
srcfileflag = kwargs['srcfileflag']
if(not('outfileflag' in kwargs)):
outfileflag = '/Fo:'
else:
outfileflag = kwargs['outfileflag']
if(not('logfile' in kwargs)):
logfile = ""
else:
logfile = kwargs['logfile']
outfile = replaceext(srcfile,objext)
ln = compilername+" "+flags+" "+" "+srcfileflag+" "+srcfile+" "+outfileflag+outfile
ln = ln + " " + include
callproc(ln,echo=True,logfile=logfile)
return
#MSVC compiler wrapper
def msvc_compile_list(compiler,srclist,**kwargs):
for S in srclist:
msvc_compile(compiler,S,**kwargs)
return
#gnu-style compiler compile: Should work with gcc, g++, gfortran
def gs_compile(compiler,srcfile,**kwargs):
if(not('include' in kwargs)):
include = ''
else:
include = kwargs['include']
if(isinstance(include,list)):
include = list_to_sss(include)
if(not('flags' in kwargs)):
flags = ''
else:
flags = kwargs['flags']
if(isinstance(flags,list)):
flags = list_to_sss(flags)
if(not('objext' in kwargs)):
objext = '.o'
else:
objext = kwargs['objext']
if(not('srcfileflag' in kwargs)):
srcfileflag = '-c'
else:
srcfileflag = kwargs['srcfileflag']
if(not('outfileflag' in kwargs)):
outfileflag = '-o'
else:
outfileflag = kwargs['outfileflag']
if(not('logfile' in kwargs)):
logfile = ""
else:
logfile = kwargs['logfile']
if(not('smartcompile' in kwargs)):
_smartcompile = True
else:
_smartcompile = kwargs['smartcompile']
#Do I want to make this thing this general?
if(not(_smartcompile) or smartcompile(srcfile,objext)):
outfile = replaceext(srcfile,objext)
ln = compiler+" "+flags+" " + outfileflag+" "+outfile+" "+srcfileflag+" "+srcfile
ln = ln + " " + include
callproc(ln,echo=True,logfile=logfile)
return
def gs_compile_list(compiler,srclist,**kwargs):
for S in srclist:
gs_compile(compiler,S,**kwargs)
return
def gs_compile_all(compiler,srcdir,srcexts,**kwargs):
if(not('recurse' in kwargs)):
recurse = True
else:
recurse = kwargs['recurse']
srcfils = flist(srcdir,exts=srcexts,recurse=recurse)
for S in srcfils:
gs_compile(compiler,S,**kwargs)
return
def gs_link_all(linker,srcpath,target,**kwargs):
if(not('objext' in kwargs)):
objext = '.o'
else:
objext = kwargs['objext']
if(not('recurse' in kwargs)):
recurse = True
else:
recurse = kwargs['recurse']
objfils = flist(srcpath,exts=objext,recurse=recurse)
oflst = list_to_sss(objfils)
gs_link_list(linker,oflst,target,**kwargs)
return
def gs_link_list(linker,objlist,target,**kwargs):
if(not('objext' in kwargs)):
objext = '.o'
else:
objext = kwargs['objext']
if(not('libdir' in kwargs)):
libdir = ''
else:
libdir = kwargs['libdir']
if(not('staticlibs' in kwargs)):
staticlibs = ''
else:
staticlibs = kwargs['staticlibs']
if(not('libflags' in kwargs)):
libflags = ''
else:
libflags = kwargs['libflags']
if(not('linkerflags' in kwargs)):
linkerflags = ''
else:
linkerflags = kwargs['linkerflags']
if(not('recurse' in kwargs)):
recurse = True
else:
recurse = kwargs['recurse']
if(not('logfile' in kwargs)):
logfile = ''
else:
logfile = kwargs['logfile']
ln = linker+" -o "+target+" "+libdir
ln = ln+" "+objlist+" "+staticlibs+" "+libflags+" "+linkerflags
callproc(ln,logfile=logfile)
return
def msvc_link_list(objlist,target,**kwargs):
linker = 'link'
if(not('objext' in kwargs)):
objext = '.obj'
else:
objext = kwargs['objext']
if(not('libdir' in kwargs)):
libdir = ''
else:
libdir = kwargs['libdir']
if(not('staticlibs' in kwargs)):
staticlibs = ''
else:
staticlibs = kwargs['staticlibs']
if(not('libflags' in kwargs)):
libflags = ''
else:
libflags = kwargs['libflags']
if(not('linkerflags' in kwargs)):
linkerflags = ''
else:
linkerflags = kwargs['linkerflags']
if(not('recurse' in kwargs)):
recurse = True
else:
recurse = kwargs['recurse']
if(not('logfile' in kwargs)):
logfile = ''
else:
logfile = kwargs['logfile']
ln = linker+" "+libdir
ln = ln+" "+objlist+" "+staticlibs+" "+linkerflags
ln = ln+" /out:"+target+" "+libflags
callproc(ln,logfile=logfile)
return
def ar_all(srcpath,arname,**kwargs):
if(not('recurse' in kwargs)):
recurse = True
else:
recurse = kwargs['recurse']
if(not('objext' in kwargs)):
objext = '.o'
else:
objext = kwargs['objext']
objlist = flist(srcpath,exts=objext,recurse=recurse)
ar_list(objlist,arname,**kwargs)
return
def msvc_lib_list(objlist,arname,**kwargs):
objlist2 = list_to_sss(objlist)
ln = "lib "+objlist2+" /out:"+arname
callproc(ln)
return
def ar_list(objlist,arname,**kwargs):
objlist2 = list_to_sss(objlist)
ln = "ar cr "+ arname+" "+objlist2
callproc(ln)
return
def ar_add_list(objlist,arname,**kwargs):
objlist2 = list_to_sss(objlist)
ln = "ar t "+arname+" "+objlist2
callproc(ln)
return
##############################
##Derived Compiler Functions##
##############################
def gcc_compile(srcfile,**kwargs):
compiler = 'gcc'
kwargs['objext'] = '.o'
#srcexts = ['.c']
gs_compile(compiler,srcfile,**kwargs)
return
def gcc_compile_all(srcdir,**kwargs):
compiler = 'gcc'
kwargs['objext'] = '.o'
srcexts = ['.c']
gs_compile_all(compiler,srcdir,srcexts,**kwargs)
return
def gcc_compile_list(srclist,**kwargs):
compiler = 'gcc'
kwargs['objext'] = '.o'
#srcexts = ['.c']
gs_compile_list(compiler,srclist,**kwargs)
return
def gpp_compile(srcfile,**kwargs):
compiler = 'g++'
kwargs['objext'] = '.o'
#srcexts = ['.c','.cpp']
gs_compile(compiler,srcfile,**kwargs)
return
def gpp_compile_all(srcdir,**kwargs):
compiler = 'g++'
kwargs['objext'] = '.o'
srcexts = ['.c','.cpp']
gs_compile_all(compiler,srcdir,srcexts,**kwargs)
return
def gpp_compile_list(srclist,**kwargs):
compiler = 'g++'
kwargs['objext'] = '.o'
#srcexts = ['.c','.cpp']
gs_compile_list(compiler,srclist,**kwargs)
return
def gfortran_compile(srcfile,**kwargs):
compiler = 'gfortran'
kwargs['objext'] = '.o'
#srcexts = ['.f','.f90','.f77']
gs_compile(compiler,srcfile,**kwargs)
return
def gfortran_compile_all(srcdir,**kwargs):
compiler = 'gfortran'
kwargs['objext'] = '.o'
srcexts = ['.f','.f90','.f77']
gs_compile_all(compiler,srcdir,srcexts,**kwargs)
return
def gfortran_compile_list(srclist,**kwargs):
compiler = 'gfortran'
kwargs['objext'] = '.o'
#srcexts = ['.f','.f90','.f77']
gs_compile_list(compiler,srclist,**kwargs)
return
def clang_compile(srcfile,**kwargs):
compiler = 'clang++'
kwargs['objext'] = '.o'
#srcexts = ['.c','.cpp']
gs_compile(compiler,srcfile,**kwargs)
return
def clang_compile_all(srcdir,**kwargs):
compiler = 'clang++'
kwargs['objext'] = '.o'
srcexts = ['.c','.cpp']
gs_compile_all(compiler,srcdir,srcexts,**kwargs)
return
def clang_compile_list(srclist,**kwargs):
compiler = 'clang++'
kwargs['objext'] = '.o'
#srcexts = ['.c','.cpp']
gs_compile_list(compiler,srclist,**kwargs)
return

@ -0,0 +1,45 @@
#!/usr/bin/python3
import os,sys,subprocess,math
from complib2 import *
import shutil
#from distutils.dir_util import copy_tree as copy_tree #this version does overwrites
from shutil import copytree as copytree
libname = 'amsculib2.linux64' #prefix static library name to generate
targetname = 'test' #create this executable when compiling tests
commonincdir = "../../linux64/include"
commonlibdir = "../../linux64/lib"
localbindir = "./bin_linux64"
cc = 'nvcc' #compiler
srcexts = ['.c','.cpp','.cu']
mainsrc = ['main.c','main.cpp','main.cu'] #ignore these files when compiling the static library
kwargs = dict()
include = "-I./include -I{}".format(commonincdir)
kwargs['include'] = include
#-dc flag: relocatable device code - needed for device functions to link in different "execution units"
kwargs['flags'] = "-dc"
kwargs['libdir'] = "-L{} -L{}".format(localbindir,commonlibdir)
kwargs['libflags'] = "-l{}".format(libname)
kwargs['linkerflags'] = ""
kwargs['recurse'] = True
#find all source files, except the main project files
files = flist('./src',exts = srcexts, recurse=True)
files = except_contains(files,mainsrc)
objfiles = replaceexts(files,'.o')
objfiles_sss = list_to_sss(objfiles)
#compile all the source files in the list
gs_compile_list(cc,files,**kwargs)
#archive all the source files into a static library
ar_list(objfiles,'{}/lib{}.a'.format(localbindir,libname))
#Push any libraries to the common lib folder
shutil.copy('{}/lib{}.a'.format(localbindir,libname),commonlibdir)
#Copy include files to the common include folder
copytree('./include/',commonincdir+'/',dirs_exist_ok=True)

@ -0,0 +1,38 @@
#!/usr/bin/python3
import os,sys,subprocess,math
from complib2 import *
import shutil
libname = 'amsculib2.linux64' #prefix static library name to generate
targetname = 'test' #create this executable when compiling tests
commonincdir = "../../linux64/include"
commonlibdir = "../../linux64/lib"
localbindir = "./bin_linux64"
cc = 'nvcc' #compiler
srcexts = ['.c','.cpp','.cu']
mainsrc = ['main.c','main.cpp','main.cu'] #ignore these files when compiling the static library
kwargs = dict()
include = "-I./include -I{}".format(commonincdir)
kwargs['include'] = include
#-dc flag: relocatable device code - needed for device functions to link in different "execution units"
kwargs['flags'] = "-dc"
kwargs['libdir'] = "-L{} -L{}".format(localbindir,commonlibdir)
kwargs['libflags'] = "-l{}".format(libname)
kwargs['linkerflags'] = ""
kwargs['recurse'] = True
#Pull required binary dynamic libraries to the bin folder
#shutil.copy('{}/libamsimg.dll.a'.format(commonlibdir),localbindir);
#shutil.copy('{}/libamsimg.dll'.format(commonlibdir),localbindir);
#shutil.copy('../../lib_winx64/glew32.dll','./bin_winx64');
#Designate source files for main test program
fsrc = ['./src/main.cu']
fobj = replaceexts(fsrc,'.o')
#Compile test programs
gs_compile_list(cc,fsrc,**kwargs)
gs_link_list(cc,list_to_sss(fobj),'{}/{}'.format(localbindir,targetname),**kwargs)

@ -0,0 +1,45 @@
#!/usr/bin/python3
import os,sys,subprocess,math
from complib2 import *
import shutil
from shutil import copytree as copytree
libname = 'assetcuda.msvc64' #prefix static library name to generate
targetname = 'main' #create this executable when compiling tests
commonincdir = "../../winx64/include"
commonlibdir = "../../winx64/lib"
localbindir = "./bin_winx64"
cc = 'nvcc' #compiler
srcexts = ['.c','.cpp']
mainsrc = ['main.c','main.cpp','main.cu'] #ignore these files when compiling the static library
kwargs = dict()
include = "-I./include -I{}".format(commonincdir)
kwargs['include'] = include
kwargs['flags'] = "/O2"
kwargs['libdir'] = "/LIBPATH:{} /LIBPATH:{}".format(localbindir,commonlibdir)
kwargs['libflags'] = "-l{}".format(libname)
kwargs['linkerflags'] = ""
kwargs['recurse'] = True
#find all source files, except the main project files
files = flist('./src',exts = srcexts, recurse=True)
files = except_contains(files,mainsrc)
objfiles = replaceexts(files,'.obj')
objfiles_sss = list_to_sss(objfiles)
#compile all the source files in the list
msvc_compile_list(cc,files,**kwargs)
#gs_compile_list(cc,files,**kwargs)
#archive all the source files into a static library
#ar_list(objfiles,'{}/lib{}.a'.format(localbindir,libname))
msvc_lib_list(objfiles,'{}/lib{}.lib'.format(localbindir,libname))
#Push any libraries to the common lib folder
shutil.copy('{}/lib{}.lib'.format(localbindir,libname),commonlibdir)
#Copy include files to the common include folder
copytree('./include/',commonincdir+'/',dirs_exist_ok=True)

@ -0,0 +1,39 @@
#!/usr/bin/python3
import os,sys,subprocess,math
from complib2 import *
import shutil
from distutils.dir_util import copy_tree as copy_tree #this version does overwrites
libname = 'assetcuda.msvc64' #prefix static library name to generate
targetname = 'tests.exe' #create this executable when compiling tests
commonincdir = "../../winx64/include"
commonlibdir = "../../winx64/lib"
localbindir = "./bin_winx64"
cc = 'nvcc' #compiler
srcexts = ['.c','.cpp']
mainsrc = ['main.c','main.cpp','main.cu'] #ignore these files when compiling the static library
kwargs = dict()
include = "-I./include -I{}".format(commonincdir)
kwargs['include'] = include
kwargs['flags'] = "/O2"
kwargs['libdir'] = "/LIBPATH:{} /LIBPATH:{}".format(localbindir,commonlibdir)
#kwargs['libflags'] = "lib{}.lib libamsearthtools.msvc64.lib libamsmeshtools.msvc64.lib libamsmathlib3.msvc64.lib libamsmatrix_cpp.msvc64.lib liblapack.a libblas.a libamsstring3.msvc64.lib libamsmathutil2.msvc64.lib".format(libname)
kwargs['libflags'] = "lib{}.lib".format(libname)
kwargs['linkerflags'] = ""
kwargs['recurse'] = True
#Pull required binary dynamic libraries to the bin folder
#shutil.copy('{}/libamsimg.dll.a'.format(commonlibdir),localbindir);
#shutil.copy('{}/libamsimg.dll'.format(commonlibdir),localbindir);
#shutil.copy('../../lib_winx64/glew32.dll','./bin_winx64');
#Designate source files for main test program
fsrc = ['./src/main.cu']
fobj = replaceexts(fsrc,'.obj')
#Compile test programs
msvc_compile_list(cc,fsrc,**kwargs)
msvc_link_list(list_to_sss(fobj),'{}/{}'.format(localbindir,targetname),**kwargs)

@ -0,0 +1,44 @@
#!/usr/bin/python3
import os,sys,subprocess,math
from complib2 import *
import shutil
from distutils.dir_util import copy_tree as copy_tree #this version does overwrites
libname = 'amsculib2.msvc64' #prefix static library name to generate
targetname = 'test' #create this executable when compiling tests
commonincdir = "../../winx64/include"
commonlibdir = "../../winx64/lib"
localbindir = "./bin_winx64"
cc = 'nvcc' #compiler
srcexts = ['.c','.cpp','.cu']
mainsrc = ['main.c','main.cpp'] #ignore these files when compiling the static library
kwargs = dict()
include = "-I./include -I{}".format(commonincdir)
kwargs['include'] = include
kwargs['flags'] = "-dc"
kwargs['libdir'] = "-L{} -L{}".format(localbindir,commonlibdir)
kwargs['libflags'] = "-l{}".format(libname)
kwargs['linkerflags'] = ""
kwargs['recurse'] = True
#find all source files, except the main project files
files = flist('./src',exts = srcexts, recurse=True)
files = except_contains(files,mainsrc)
objfiles = replaceexts(files,'.o')
objfiles_sss = list_to_sss(objfiles)
#compile all the source files in the list
gs_compile_list(cc,files,**kwargs)
#archive all the source files into a static library
#ar_list(objfiles,'{}/lib{}.a'.format(localbindir,libname))
msvc_lib_list(objfiles,'{}/lib{}.lib'.format(localbindir,libname))
#Push any libraries to the common lib folder
shutil.copy('{}/lib{}.lib'.format(localbindir,libname),commonlibdir)
#Copy include files to the common include folder
copy_tree('./include/',commonincdir+'/')

@ -0,0 +1,38 @@
#!/usr/bin/python3
import os,sys,subprocess,math
from complib2 import *
import shutil
from distutils.dir_util import copy_tree as copy_tree #this version does overwrites
libname = 'amsculib2.msvc64' #prefix static library name to generate
targetname = 'test' #create this executable when compiling tests
commonincdir = "../../winx64/include"
commonlibdir = "../../winx64/lib"
localbindir = "./bin_winx64"
cc = 'nvcc' #compiler
srcexts = ['.c','.cpp','.cu']
mainsrc = ['main.c','main.cpp'] #ignore these files when compiling the static library
kwargs = dict()
include = "-I./include -I{}".format(commonincdir)
kwargs['include'] = include
kwargs['flags'] = "-dc"
kwargs['libdir'] = "-L{} -L{}".format(localbindir,commonlibdir)
kwargs['libflags'] = "-llib{}".format(libname)
kwargs['linkerflags'] = ""
kwargs['recurse'] = True
#Pull required binary dynamic libraries to the bin folder
#shutil.copy('{}/libamsimg.dll.a'.format(commonlibdir),localbindir);
#shutil.copy('{}/libamsimg.dll'.format(commonlibdir),localbindir);
#shutil.copy('../../lib_winx64/glew32.dll','./bin_winx64');
#Designate source files for main test program
fsrc = ['./src/main.cpp']
fobj = replaceexts(fsrc,'.o')
#Compile test programs
gs_compile_list(cc,fsrc,**kwargs)
gs_link_list(cc,list_to_sss(fobj),'{}/{}'.format(localbindir,targetname),**kwargs)

@ -0,0 +1,49 @@
#!/usr/bin/python3
import os,sys,subprocess,math
from complib2 import *
from complib3 import gs_incremental_compile, gs_incremental_compile_list
import shutil
from shutil import copytree
libname = 'amsculib2.msvc64' #prefix static library name to generate
targetname = 'test' #create this executable when compiling tests
commonincdir = "../../winx64/include"
commonlibdir = "../../winx64/lib"
localbindir = "./bin_winx64"
cc = 'nvcc' #compiler
srcexts = ['.c','.cpp','.cu']
mainsrc = ['main.cu'] #ignore these files when compiling the static library
kwargs = dict()
include = "-I./include -I{}".format(commonincdir)
kwargs['include'] = include
kwargs['flags'] = "-dc"
kwargs['libdir'] = "-L{} -L{}".format(localbindir,commonlibdir)
kwargs['libflags'] = "-l{}".format(libname)
kwargs['linkerflags'] = ""
kwargs['recurse'] = True
kwargs['objstore'] = "./objstore"
kwargs['searchincdirs'] = ['./include']
#find all source files, except the main project files
files = flist('./src',exts = srcexts, recurse=True)
files = except_contains(files,mainsrc)
objfiles = replaceexts(files,'.o')
objfiles_sss = list_to_sss(objfiles)
#compile all the source files in the list
#gs_compile_list(cc,files,**kwargs)
gs_incremental_compile_list(cc,files,**kwargs)
#archive all the source files into a static library
#ar_list(objfiles,'{}/lib{}.a'.format(localbindir,libname))
objlist = flist(kwargs['objstore'],exts='.o',recurse=True)
msvc_lib_list(objlist,'{}/lib{}.lib'.format(localbindir,libname))
# #Push any libraries to the common lib folder
shutil.copy('{}/lib{}.lib'.format(localbindir,libname),commonlibdir)
# #Copy include files to the common include folder
copytree('./include/',commonincdir+'/',dirs_exist_ok=True)

@ -0,0 +1,43 @@
#!/usr/bin/python3
import os,sys,subprocess,math
from complib2 import *
from complib3 import gs_incremental_compile, gs_incremental_compile_list
import shutil
from shutil import copytree
libname = 'amsculib2.msvc64' #prefix static library name to generate
targetname = 'test' #create this executable when compiling tests
commonincdir = "../../winx64/include"
commonlibdir = "../../winx64/lib"
localbindir = "./bin_winx64"
cc = 'nvcc' #compiler
srcexts = ['.c','.cpp','.cu']
mainsrc = ['main.cu'] #ignore these files when compiling the static library
kwargs = dict()
include = "-I./include -I{}".format(commonincdir)
kwargs['include'] = include
kwargs['flags'] = "-dc"
kwargs['libdir'] = "-L{} -L{}".format(localbindir,commonlibdir)
kwargs['libflags'] = "-llib{} -llibamsculib2.msvc64".format(libname)
kwargs['linkerflags'] = ""
kwargs['recurse'] = True
kwargs['objstore'] = "./objstore"
kwargs['searchincdirs'] = ['./include']
#-lamsmathlib3.linux64 -lamsstring3.linux64 -lamsmatrix_cpp.linux64 -llapack -lblas -lgfortran -lamsmathutilthread.linux64 -lamsmathutil2.linux64
#Pull required binary dynamic libraries to the bin folder
#shutil.copy('{}/libamsimg.dll.a'.format(commonlibdir),localbindir);
#shutil.copy('{}/libamsimg.dll'.format(commonlibdir),localbindir);
#shutil.copy('../../lib_winx64/glew32.dll','./bin_winx64');
#Designate source files for main test program
fsrc = ['./src/main.cu']
fobj = replaceexts(fsrc,'.o')
#Compile test programs
gs_compile_list(cc,fsrc,**kwargs)
gs_link_list(cc,list_to_sss(fobj),'{}/{}'.format(localbindir,targetname),**kwargs)

@ -0,0 +1,89 @@
#ifndef __AMSCU_COMP128_HPP__
#define __AMSCU_COMP128_HPP__
namespace amscuda
{
namespace cmp
{
class cucomp128
{
public:
double real;
double imag;
__host__ __device__ cucomp128();
__host__ __device__ ~cucomp128();
__host__ __device__ cucomp128(const cucomp128 &other);
__host__ __device__ cucomp128(const double &other);
__host__ __device__ cucomp128& operator=(cucomp128& other);
__host__ __device__ const cucomp128& operator=(const cucomp128& other);
__host__ __device__ cucomp128& operator=(double& other);
__host__ __device__ const cucomp128& operator=(const double& other);
__host__ __device__ double& operator[](int& ind);
__host__ __device__ const double& operator[](const int& ind) const;
__host__ __device__ cucomp128 operator+(const cucomp128& z);
__host__ __device__ cucomp128 operator-(const cucomp128& z);
__host__ __device__ cucomp128 operator*(const cucomp128& z);
__host__ __device__ cucomp128 operator/(const cucomp128& z);
__host__ __device__ cucomp128 operator+(const double& z);
__host__ __device__ cucomp128 operator-(const double& z);
__host__ __device__ cucomp128 operator*(const double& z);
__host__ __device__ cucomp128 operator/(const double& z);
__host__ __device__ friend cucomp128 operator-(const cucomp128& z); //negation sign
//comparison operators
__host__ __device__ bool operator==(const cucomp128& z) const;
__host__ __device__ bool operator!=(const cucomp128& z) const;
__host__ __device__ bool operator>(const cucomp128& z) const;
__host__ __device__ bool operator<(const cucomp128& z) const;
__host__ __device__ bool operator>=(const cucomp128& z) const;
__host__ __device__ bool operator<=(const cucomp128& z) const;
__host__ __device__ bool isnan() const;
__host__ __device__ bool isinf() const;
__host__ __device__ bool isreal() const;
__host__ __device__ bool isimag() const;
__host__ __device__ bool iszero() const;
__host__ __device__ double arg() const;
__host__ __device__ double mag() const;
__host__ __device__ cucomp128 conj() const;
};
__host__ __device__ double arg(cucomp128 z);
__host__ __device__ cucomp128 dtocomp(double _r, double _i);
__host__ __device__ double real(cucomp128 z);
__host__ __device__ double imag(cucomp128 z);
__host__ __device__ cucomp128 sin(cucomp128 z);
__host__ __device__ cucomp128 cos(cucomp128 z);
__host__ __device__ cucomp128 tan(cucomp128 z);
__host__ __device__ cucomp128 exp(cucomp128 z);
__host__ __device__ cucomp128 log(cucomp128 z);
__host__ __device__ double abs(cucomp128 z);
__host__ __device__ cucomp128 conj(cucomp128 z);
// //need hyperbolic trig Functions
__host__ __device__ cucomp128 cosh(cucomp128 z);
__host__ __device__ cucomp128 sinh(cucomp128 z);
__host__ __device__ cucomp128 tanh(cucomp128 z);
__host__ __device__ cucomp128 pow(cucomp128 z1, cucomp128 z2);
// //returns "complex sign" of complex number - 0, or a unit number with same argument
__host__ __device__ cucomp128 csgn(cucomp128 z);
void test_cucomp128_1();
}; //end namespace cmp
}; //end namespace amscuda
#endif

@ -0,0 +1,88 @@
#ifndef __AMSCU_COMP64_HPP__
#define __AMSCU_COMP64_HPP__
namespace amscuda
{
namespace cmp
{
class cucomp64
{
public:
float real;
float imag;
__host__ __device__ cucomp64();
__host__ __device__ ~cucomp64();
__host__ __device__ cucomp64(const cucomp64 &other);
__host__ __device__ cucomp64(const float &other);
__host__ __device__ cucomp64& operator=(cucomp64& other);
__host__ __device__ const cucomp64& operator=(const cucomp64& other);
__host__ __device__ cucomp64& operator=(float& other);
__host__ __device__ const cucomp64& operator=(const float& other);
__host__ __device__ float& operator[](int& ind);
__host__ __device__ const float& operator[](const int& ind) const;
__host__ __device__ cucomp64 operator+(const cucomp64& z);
__host__ __device__ cucomp64 operator-(const cucomp64& z);
__host__ __device__ cucomp64 operator*(const cucomp64& z);
__host__ __device__ cucomp64 operator/(const cucomp64& z);
__host__ __device__ cucomp64 operator+(const float& z);
__host__ __device__ cucomp64 operator-(const float& z);
__host__ __device__ cucomp64 operator*(const float& z);
__host__ __device__ cucomp64 operator/(const float& z);
__host__ __device__ friend cucomp64 operator-(const cucomp64& z); //negation sign
//comparison operators
__host__ __device__ bool operator==(const cucomp64& z) const;
__host__ __device__ bool operator!=(const cucomp64& z) const;
__host__ __device__ bool operator>(const cucomp64& z) const;
__host__ __device__ bool operator<(const cucomp64& z) const;
__host__ __device__ bool operator>=(const cucomp64& z) const;
__host__ __device__ bool operator<=(const cucomp64& z) const;
__host__ __device__ bool isnan() const;
__host__ __device__ bool isinf() const;
__host__ __device__ bool isreal() const;
__host__ __device__ bool isimag() const;
__host__ __device__ bool iszero() const;
__host__ __device__ float arg() const;
__host__ __device__ float mag() const;
__host__ __device__ cucomp64 conj() const;
};
__host__ __device__ float arg(cucomp64 z);
__host__ __device__ cucomp64 dtocomp64(float _r, float _i);
__host__ __device__ float real(cucomp64 z);
__host__ __device__ float imag(cucomp64 z);
__host__ __device__ cucomp64 sin(cucomp64 z);
__host__ __device__ cucomp64 cos(cucomp64 z);
__host__ __device__ cucomp64 tan(cucomp64 z);
__host__ __device__ cucomp64 exp(cucomp64 z);
__host__ __device__ cucomp64 log(cucomp64 z);
__host__ __device__ float abs(cucomp64 z);
__host__ __device__ cucomp64 conj(cucomp64 z);
// //need hyperbolic trig Functions
__host__ __device__ cucomp64 cosh(cucomp64 z);
__host__ __device__ cucomp64 sinh(cucomp64 z);
__host__ __device__ cucomp64 tanh(cucomp64 z);
__host__ __device__ cucomp64 pow(cucomp64 z1, cucomp64 z2);
// //returns "complex sign" of complex number - 0, or a unit number with same argument
__host__ __device__ cucomp64 csgn(cucomp64 z);
void test_cucomp64_1();
}; //end namespace cmp
}; //end namespace amscuda
#endif

@ -0,0 +1,40 @@
#ifndef __AMSCU_CUDAFUNCTIONS_HPP__
#define __AMSCU_CUDAFUNCTIONS_HPP__
namespace amscuda
{
// device memory operations
// I'm trying to avoid some of the boilerplate mental overhead involved
// in calling cuda functions and handling errors
//frees devbuffer if it is not already NULL, and sets devbuffer to NULL
//wrapper to cudaFree
template<typename T> int cuda_free(T **devptr);
//copies hostbuffer to devbuffer
//initializes devbuffer from NULL if devbuffer is NULL
//if overwrite is true, deletes and reallocates devbuffer on device (for resizing)
template<typename T> int buffer_copytodevice(T *hostbuffer, T **devbuffer, long N, bool overwrite);
//copies info from devbuffer to hostbuffer
//initialzies hostbuffer from NULL if NULL
//if overwrite is true, deletes and reallocates hostbuffer on host with new[] (for resizing)
template<typename T> int buffer_copyfromdevice(T *devbuffer, T **hostbuffer, long N, bool overwrite);
//wrapper for cudaMemcpy - copies an item or struct (count 1) to the device
//initializes devptr from NULL if not already initialized
template<typename T> int cuda_copytodevice(T *hostptr, T **devptr);
//wrapper for cudaMemcpy - copies an item or struct (count 1) from device
//initializes hostptr from NULL with new if not already initialized
template<typename T> int cuda_copyfromdevice(T *devptr, T **hostptr);
int cuda_errortrap(const char *msgheader);
};
#include <amsculib2/amscu_cudafunctions_impl.hpp>
#endif

@ -0,0 +1,228 @@
#ifndef __AMSCU_CUDAFUNCTIONS_IMPL_HPP__
#define __AMSCU_CUDAFUNCTIONS_IMPL_HPP__
namespace amscuda
{
//frees devbuffer if it is not already NULL, and sets devbuffer to NULL
//wrapper to cudaFree
template<typename T> int cuda_free(T **devptr)
{
int ret = 0;
cudaError_t err = cudaSuccess;
if(*devptr==NULL)
{
return ret; //devbuffer is already NULL/freed
}
err = cudaFree(*devptr);
if(err!=cudaSuccess)
{
ret = -1; //failed to free device pointer
*devptr = NULL; // - ? should only happen if I'm trying to double-free something
}
else
{
ret = 1;
*devptr = NULL;
}
return ret;
}
//copies hostbuffer to devbuffer
//initializes devbuffer from NULL if devbuffer is NULL
//if overwrite is true, deletes and reallocates devbuffer on device (for resizing)
template<typename T> int buffer_copytodevice(T *hostbuffer, T **devbuffer, long N, bool overwrite)
{
int ret = 0;
cudaError_t err = cudaSuccess;
if(N<=0)
{
ret = 0;
return ret;
}
if(hostbuffer==NULL)
{
ret = -2; //host buffer is NULL
return ret;
}
if(overwrite==1)
{
if(*devbuffer !=NULL)
{
cuda_free(devbuffer);
}
}
if(*devbuffer==NULL)
{
err = cudaMalloc(devbuffer,sizeof(T)*N);
if(err!=cudaSuccess)
{
ret = -3; //failed to allocate
*devbuffer = NULL;
return ret;
}
}
err = cudaMemcpy(*devbuffer,hostbuffer,sizeof(T)*N,cudaMemcpyHostToDevice);
if(err!=cudaSuccess)
{
ret = -4; //failed to copy
}
else
{
ret = 1;
}
return ret;
}
//copies info from devbuffer to hostbuffer
//initialzies hostbuffer from NULL if NULL
//if overwrite is true, deletes and reallocates hostbuffer on host (for resizing)
template<typename T> int buffer_copyfromdevice(T *devbuffer, T **hostbuffer, long N, bool overwrite)
{
int ret = 0;
cudaError_t err = cudaSuccess;
if(N<=0)
{
ret = 0;
return ret;
}
if(devbuffer==NULL)
{
ret = -5; //null dev buffer
return ret;
}
if(overwrite==1 && *hostbuffer!=NULL)
{
delete[] (*hostbuffer); hostbuffer = NULL;
}
if(*hostbuffer==NULL)
{
*hostbuffer = new(std::nothrow) T[N];
if(*hostbuffer==NULL)
{
ret = -6; //failed to allocate host buffer
return ret;
}
}
err = cudaMemcpy(*hostbuffer, devbuffer, sizeof(T)*N, cudaMemcpyDeviceToHost);
if(err!=cudaSuccess)
{
ret = -7; //failed to copy
}
else
{
ret = 1;
}
return ret;
}
//wrapper for cudaMemcpy - copies an item or struct (count 1) to the device
//initializes devptr from NULL if not already initialized
template<typename T> int cuda_copytodevice(T *hostptr, T **devptr)
{
int ret = 0;
cudaError_t err = cudaSuccess;
bool overwrite = 1;
if(hostptr==NULL)
{
ret = -2; //host buffer is NULL
return ret;
}
if(overwrite==1)
{
if(*devptr !=NULL)
{
cuda_free(devptr);
}
}
if(*devptr==NULL)
{
err = cudaMalloc(devptr,sizeof(T));
if(err!=cudaSuccess)
{
ret = -3; //failed to allocate
*devptr = NULL;
return ret;
}
}
err = cudaMemcpy(*devptr,hostptr,sizeof(T),cudaMemcpyHostToDevice);
if(err!=cudaSuccess)
{
ret = -4; //failed to copy
}
else
{
ret = 1;
}
return ret;
}
//wrapper for cudaMemcpy - copies an item or struct (count 1) from device
//initializes hostptr from NULL with new if not already initialized
template<typename T> int cuda_copyfromdevice(T *devptr, T **hostptr)
{
int ret = 0;
cudaError_t err = cudaSuccess;
bool overwrite = 1;
if(devptr==NULL)
{
ret = -5; //null dev buffer
return ret;
}
if(overwrite==1 && *hostptr!=NULL)
{
delete (*hostptr); hostptr = NULL;
}
if(*hostptr==NULL)
{
*hostptr = new(std::nothrow) T;
if(*hostptr==NULL)
{
ret = -6; //failed to allocate host buffer
return ret;
}
}
err = cudaMemcpy(*hostptr, devptr, sizeof(T), cudaMemcpyDeviceToHost);
if(err!=cudaSuccess)
{
ret = -7; //failed to copy
}
else
{
ret = 1;
}
return ret;
}
};
#endif

@ -0,0 +1,55 @@
#ifndef __AMSCU_RANDOM_HPP__
#define __AMSCU_RANDOM_HPP__
namespace amscuda
{
// Random Number Gerneators
// faster floating point hash function used in fractal generators
__device__ __host__ float fhash1d_su(float x);
__device__ __host__ float fhash3d_su(float x, float y, float z);
__device__ __host__ float fhash4d_su(float x, float y, float z, float w);
//////////////////////////////////////////////////
// Deterministic Pseudorandom int32_t Generator //
//////////////////////////////////////////////////
//Next seed in simple 32 bit integer deterministic psuedo-rand generator
__host__ __device__ void dpr32_nextseed(int32_t *rseed_inout);
//Simple 32 bit integer deterministic pseudo-random generator
// *not* for cryptography
// Frequency of generated floats should be uniform [0,1)
__host__ __device__ float dpr32_randf(int32_t *rseed_inout);
//box muller standard normal pseudorandom variable
__host__ __device__ float dpr32_randnf(int32_t *rseed_inout);
//////////////////////////////////////////////////
// Deterministic Pseudorandom int64_t Generator //
//////////////////////////////////////////////////
//operates without side-effects on explicit seed for threaded use
//deterministic pseudorandom number generator - takes seed and returns next seed
__host__ __device__ void dpr64_nextseed(int64_t *seedinout);
//deterministic pseudorandom number generator - takes seed and returns next seed
//returns uniformly distributed double
__host__ __device__ double dpr64_randd(int64_t *seedinout);
__host__ __device__ float dpr64_randf(int64_t *seedinout);
void test_dprg64();
void test_dprg32();
}; //end namespace amscuda
#endif

@ -0,0 +1,47 @@
#ifndef __CUARRAY_HPP__
#define __CUARRAY_HPP__
namespace amscuda
{
template<typename T> class cuarray
{
public:
int length;
T* data;
__device__ __host__ cuarray();
__device__ __host__ ~cuarray();
//Only call this on the device for thread/block local
// dynamic arrays
__device__ __host__ int resize(const int _length);
__device__ __host__ int size() const;
__device__ __host__ T& at(const int I);
__device__ __host__ const T& at(const int I) const;
__device__ __host__ T& operator[](const int I);
__device__ __host__ const T& operator[](const int I) const;
__host__ int device_send(cuarray<T> **dptr);
__host__ int _device_send_overwrite(cuarray<T> **dptr);
__host__ int _device_send_copy(cuarray<T> *dptr);
__host__ int device_pull(cuarray<T> *dptr);
__host__ int device_free(cuarray<T> **dptr);
__host__ int device_length(cuarray<T> *dptr);
__host__ T* device_data_ptr(cuarray<T> *dptr);
};
void test_cuarray();
};
#include <amsculib2/amscuarray_impl.hpp>
#endif

@ -0,0 +1,76 @@
#ifndef __AMSCUARRAY_DOPS_HPP__
#define __AMSCUARRAY_DOPS_HPP__
//Device Operations on Arrays
//
//Device Operations on Device Buffers
// dodb
namespace amscuda
{
//sum
template<typename T> T devcuarray_sum(cuarray<T> *devptr);
template<typename T> T dbuff_sum(T *devbuffer, int N);
struct dbuff_statstruct
{
public:
float min;
float max;
float mean;
float stdev;
float sum;
};
//stats (min,max,mean,stdev)
template<typename T> void dbuff_minmax(T *devbuffer, int N, T *min, T *max);
template<typename T> dbuff_statstruct dbuff_stats(T *devbuffer, int N); //
//sets all elements to setto
template<typename T> void dbuff_setall(T *devbuffer, int N, T setto, int nblocks, int nthreads);
//random device buffer functions
void dbuff_rand_dpr32(float *devbuffer, int N, int32_t *rseedinout, int nblocks, int nthreads); //
void dbuff_rand_dpr32n(float *devbuffer, int N, int32_t *rseedinout, int nblocks, int nthreads); //
void dbuff_rand_dpr64(float *devbuffer, int N, int64_t *rseedinout, int nblocks, int nthreads); //
//Elementwise device-buffer vector binary operation
//takes two input arrays ( , ) --> one output array
template<typename T1, typename T2, typename T3> void dbuff_vectorbinop(T1 *dbuf_a, T2 *dbuf_b, T3 *dbuf_out, int N, T3 (*fpnt)(T1,T2), int nblocks, int nthreads);
//Elementwise device-buffer vector two-parameter operation
//takes one input array, and a constant paramter ( ) ---> one output array
template<typename T1, typename T2, typename T3> void dbuff_vectorbinop(T1 *dbuf_a, T2 par_b, T3 *dbuf_out, int N, T3 (*fpnt)(T1,T2), int nblocks, int nthreads);
//vector_add
template<typename T> void dbuff_add(T *dbuff_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads);
template<typename T> void dbuff_add(T *dbuff_a, T par_b, T *dbuff_out, int N, int nblocks, int nthreads);
template<typename T> void dbuff_sub(T *dbuff_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads);
template<typename T> void dbuff_sub(T *dbuff_a, T par_b, T *dbuff_out, int N, int nblocks, int nthreads);
template<typename T> void dbuff_mult(T *dbuff_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads);
template<typename T> void dbuff_mult(T *dbuff_a, T par_b, T *dbuff_out, int N, int nblocks, int nthreads);
template<typename T> void dbuff_div(T *dbuff_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads);
template<typename T> void dbuff_div(T *dbuff_a, T par_b, T *dbuff_out, int N, int nblocks, int nthreads);
template<typename T> void dbuff_div(T par_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads);
// Tests //
void test_dbuff_rand_dpr32();
};
#include <amsculib2/amscuarray_dops_impl.hpp>
#endif

@ -0,0 +1,404 @@
#ifndef __AMSCUARRAY_DOPS_IMPL_HPP__
#define __AMSCUARRAY_DOPS_IMPL_HPP__
namespace amscuda
{
template<typename T> __global__ void dbuff_sum_kf(T *devbuffer, int N, T *rets)
{
int I0 = threadIdx.x + blockIdx.x*blockDim.x;
int Is = blockDim.x*gridDim.x;
int I;
T ret = (T) 0;
for(I=I0;I<N;I=I+Is)
{
ret = ret + devbuffer[I];
}
rets[I0] = ret;
}
template<typename T> T devcuarray_sum(cuarray<T> *devptr)
{
T ret = T();
cudaError_t err = cudaSuccess;
cuarray<T> ldptr;
cudaMemcpy(&ldptr,devptr,sizeof(cuarray<T>),cudaMemcpyDeviceToHost);
ret = devbuffer_sum(ldptr.data,ldptr.length);
ldptr.data = NULL;
ldptr.length=0;
return ret;
}
template<typename T> T dbuff_sum(T *dbuff, int N)
{
int I;
T ret = T();
cudaError_t err = cudaSuccess;
int nblocks;
int nthreads;
if(dbuff==NULL || N<=0)
{
return ret;
}
if(N>100)
{
nblocks = 10;
nthreads = (int)sqrt((float) (N/nblocks));
if(nthreads<=0) nthreads=1;
if(nthreads>512) nthreads=512;
}
else
{
nblocks = 1;
nthreads = 1;
}
T *rets = NULL;
T *devrets = NULL;
rets = new T[nblocks*nthreads];
cudaMalloc(&devrets,sizeof(T)*nblocks*nthreads);
dbuff_sum_kf<<<nblocks,nthreads>>>(dbuff,N,devrets);
cudaDeviceSynchronize();
err = cudaGetLastError();
if(err!=cudaSuccess)
{
printf("amscu::dbuff_sum error: %s\n",cudaGetErrorString(err));
}
cudaMemcpy(rets,devrets,sizeof(T)*nblocks*nthreads,cudaMemcpyDeviceToHost);
ret = (T)0;
for(I=0;I<nblocks*nthreads;I++)
{
ret = ret + rets[I];
}
cudaFree(devrets); devrets = NULL;
delete[] rets;
return ret;
}
template<typename T> __global__ void dbuff_minmax_kf(T *devbuffer, int N, T *maxs, T *mins)
{
int I0 = threadIdx.x + blockIdx.x*blockDim.x;
int Is = blockDim.x*gridDim.x;
int I;
for(I=I0;I<N;I=I+Is)
{
if(I==I0)
{
maxs[I0] = devbuffer[I];
mins[I0] = devbuffer[I];
}
else
{
if(devbuffer[I]>maxs[I0])
{
maxs[I0] = devbuffer[I];
}
if(devbuffer[I]<mins[I0])
{
mins[I0] = devbuffer[I];
}
}
}
return;
}
template<typename T> void dbuff_minmax(T *devbuffer, int N, T *min, T *max)
{
cudaError_t err = cudaSuccess;
int nblocks;
int nthreads;
int I;
T *maxs = NULL;
T *dev_maxs = NULL;
T *mins = NULL;
T *dev_mins = NULL;
T localmax = T(0);
T localmin = T(0);
if(devbuffer==NULL || N<=0)
{
if(min!=NULL) *min = T(0);
if(max!=NULL) *max = T(0);
return;
}
if(N>25)
{
nblocks = 25;
nthreads = (int) sqrt((float)(N/nblocks));
if(nthreads<1) nthreads = 1;
if(nthreads>512) nthreads = 512;
}
else
{
nblocks = 1;
nthreads = 1;
}
maxs = new T[nblocks*nthreads];
mins = new T[nblocks*nthreads];
cudaMalloc(&dev_maxs,nblocks*nthreads);
cudaMalloc(&dev_mins,nblocks*nthreads);
dbuff_minmax_kf<<<nblocks,nthreads>>>(devbuffer,N,dev_maxs,dev_mins);
cudaDeviceSynchronize();
err = cudaGetLastError();
if(err!=cudaSuccess)
{
printf("amscu::dbuff_minmax error: %s\n",cudaGetErrorString(err));
}
cudaMemcpy(maxs,dev_maxs,sizeof(T)*nblocks*nthreads,cudaMemcpyDeviceToHost);
cudaMemcpy(mins,dev_mins,sizeof(T)*nblocks*nthreads,cudaMemcpyDeviceToHost);
for(I=0;I<nblocks*nthreads;I++)
{
if(I==0)
{
localmax = maxs[0];
localmin = mins[0];
}
else
{
if(maxs[I]>localmax) localmax = maxs[I];
if(mins[I]<localmin) localmin = mins[I];
}
}
if(max!=NULL) *max = localmax;
if(min!=NULL) *min = localmin;
cudaFree(dev_maxs); dev_maxs = NULL;
cudaFree(dev_mins); dev_mins = NULL;
delete[] maxs; maxs = NULL;
delete[] mins; mins = NULL;
return;
}
template<typename T> __global__ void dbuff_setall_kf(T *devbuffer, int N, T setto)
{
int I0 = threadIdx.x + blockIdx.x*blockDim.x;
int Is = blockDim.x*gridDim.x;
int I;
for(I=I0;I<N;I=I+Is)
{
devbuffer[I] = setto;
}
return;
}
template<typename T> void dbuff_setall(T *devbuffer, int N, T setto, int nblocks, int nthreads)
{
cudaError_t err = cudaSuccess;
if(devbuffer==NULL || N<=0)
{
return;
}
dbuff_setall_kf<<<nblocks,nthreads>>>(devbuffer,N,setto);
cudaDeviceSynchronize();
err = cudaGetLastError();
if(err!=cudaSuccess)
{
printf("amscu::dbuff_setall error: %s\n",cudaGetErrorString(err));
}
return;
}
template<typename T1, typename T2, typename T3> __global__ void dbuff_vectorbinop_kf1(T1 *dbuf_a, T2 *dbuf_b, T3 *dbuf_out, int N, T3 (*fpnt)(T1,T2))
{
int I0 = threadIdx.x + blockIdx.x*blockDim.x;
int Is = blockDim.x*gridDim.x;
int I;
T1 a;
T2 b;
T3 c;
for(I=I0;I<N;I=I+Is)
{
a = dbuf_a[I];
b = dbuf_b[I];
c = fpnt(a,b);
dbuf_out[I] = c;
}
return;
}
template<typename T1, typename T2, typename T3> __global__ void dbuff_vectorbinop_kf2(T1 *dbuf_a, T2 par_b, T3 *dbuf_out, int N, T3 (*fpnt)(T1,T2))
{
int I0 = threadIdx.x + blockIdx.x*blockDim.x;
int Is = blockDim.x*gridDim.x;
int I;
T1 a;
T2 b;
T3 c;
for(I=I0;I<N;I=I+Is)
{
a = dbuf_a[I];
b = par_b;
c = fpnt(a,b);
dbuf_out[I] = c;
}
return;
}
//Elementwise device-buffer vector binary operation
//takes two input arrays ( , ) --> one output array
template<typename T1, typename T2, typename T3> void dbuff_vectorbinop(T1 *dbuf_a, T2 *dbuf_b, T3 *dbuf_out, int N, T3 (*fpnt)(T1,T2), int nblocks, int nthreads)
{
cudaError_t err = cudaSuccess;
if(dbuf_a == NULL || dbuf_b == NULL || dbuf_out == NULL || N<=0)
{
return;
}
dbuff_vectorbinop_kf1<<<nblocks,nthreads>>>(dbuf_a,dbuf_b,dbuf_out,N);
cudaDeviceSynchronize();
err = cudaGetLastError();
if(err!=cudaSuccess)
{
printf("amscu::devbuffer_vectorbinop error: %s\n",cudaGetErrorString(err));
}
return;
}
//Elementwise device-buffer vector two-parameter operation
//takes one input array, and a constant paramter ( ) ---> one output array
template<typename T1, typename T2, typename T3> void dbuff_vectorbinop(T1 *dbuf_a, T2 par_b, T3 *dbuf_out, int N, T3 (*fpnt)(T1,T2), int nblocks, int nthreads)
{
cudaError_t err = cudaSuccess;
if(dbuf_a == NULL || dbuf_out == NULL || N<=0)
{
return;
}
dbuff_vectorbinop_kf2<<<nblocks,nthreads>>>(dbuf_a,par_b,dbuf_out,N);
cudaDeviceSynchronize();
err = cudaGetLastError();
if(err!=cudaSuccess)
{
printf("amscu::devbuffer_vectorbinop error: %s\n",cudaGetErrorString(err));
}
return;
}
template<typename T> T dbuff_add_fn(T a, T b)
{
return a+b;
}
template<typename T> void dbuff_add(T *dbuff_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads)
{
dbuff_vectorbinop(dbuff_a,dbuff_b,dbuff_out,N,&dbuff_add_fn,nblocks,nthreads);
return;
}
template<typename T> void dbuff_add(T *dbuff_a, T par_b, T *dbuff_out, int N, int nblocks, int nthreads)
{
dbuff_vectorbinop(dbuff_a,par_b,dbuff_out,N,&dbuff_add_fn,nblocks,nthreads);
return;
}
template<typename T> T dbuff_sub_fn(T a, T b)
{
return a-b;
}
template<typename T> void dbuff_sub(T *dbuff_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads)
{
dbuff_vectorbinop(dbuff_a,dbuff_b,dbuff_out,N,&dbuff_sub_fn,nblocks,nthreads);
return;
}
template<typename T> void dbuff_sub(T *dbuff_a, T par_b, T *dbuff_out, int N, int nblocks, int nthreads)
{
dbuff_vectorbinop(dbuff_a,par_b,dbuff_out,N,&dbuff_sub_fn,nblocks,nthreads);
return;
}
template<typename T> T dbuff_mult_fn(T a, T b)
{
return a*b;
}
template<typename T> void dbuff_mult(T *dbuff_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads)
{
dbuff_vectorbinop(dbuff_a,dbuff_b,dbuff_out,N,&dbuff_mult_fn,nblocks,nthreads);
return;
}
template<typename T> void dbuff_mult(T *dbuff_a, T par_b, T *dbuff_out, int N, int nblocks, int nthreads)
{
dbuff_vectorbinop(dbuff_a,par_b,dbuff_out,N,&dbuff_mult_fn,nblocks,nthreads);
return;
}
template<typename T> T dbuff_div_fn(T a, T b)
{
return a/b;
}
template<typename T> void dbuff_div(T *dbuff_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads)
{
dbuff_vectorbinop(dbuff_a,dbuff_b,dbuff_out,N,&dbuff_div_fn,nblocks,nthreads);
return;
}
template<typename T> void dbuff_div(T *dbuff_a, T par_b, T *dbuff_out, int N, int nblocks, int nthreads)
{
dbuff_vectorbinop(dbuff_a,par_b,dbuff_out,N,&dbuff_div_fn,nblocks,nthreads);
return;
}
template<typename T> T dbuff_ldiv_fn(T a, T b)
{
return b/a;
}
template<typename T> void dbuff_div(T par_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads)
{
dbuff_vectorbinop(dbuff_b,par_a,dbuff_out,N,&dbuff_ldiv_fn,nblocks,nthreads);
return;
}
};
#endif

@ -0,0 +1,323 @@
#ifndef __CUARRAY_IMPL_HPP__
#define __CUARRAY_IMPL_HPP__
namespace amscuda
{
// New Version cuarray<T>
// simpler, less crap going on
template<typename T> __device__ __host__ cuarray<T>::cuarray()
{
length = 0;
data = NULL;
}
template<typename T> __device__ __host__ cuarray<T>::~cuarray()
{
if(data!=NULL)
{
delete[] data; data = NULL;
}
length = 0;
}
template<typename T> __device__ __host__ int cuarray<T>::resize(const int _length)
{
int ret = 0;
T *newbuffer = NULL;
if(length==_length)
{
//do nothing
ret = 1;
return ret;
}
if(_length<=0)
{
if(data!=NULL)
{
delete[] data;
data = NULL;
}
length = 0;
ret = 1;
}
newbuffer = new T[_length];
if(newbuffer==NULL)
{
ret = -1; //failed to allocate memory
return ret;
}
int I;
T def;
if(data!=NULL)
{
for(I=0;I<length&&I<_length;I++)
{
newbuffer[I] = data[I];
}
for(I=length;I<_length;I++)
{
newbuffer[I] = def;
}
delete[] data; data=NULL;
}
else
{
for(I=0;I<_length;I++)
{
newbuffer[I] = def;
}
}
data = newbuffer;
length = _length;
ret = 1;
return ret;
}
template<typename T> __host__ int cuarray<T>::device_send(cuarray<T> **dptr)
{
int ret = 0;
int dlength;
if(*dptr==NULL)
{
ret = _device_send_overwrite(dptr);
}
else
{
dlength = device_length(*dptr);
if(dlength=length)
{
ret = _device_send_copy(*dptr);
}
else
{
ret = _device_send_overwrite(dptr);
}
}
return ret;
}
template<typename T> __host__ int cuarray<T>::_device_send_overwrite(cuarray<T> **dptr)
{
int ret = 0;
cuarray<T> dlocal;
cudaError_t err = cudaSuccess;
device_free(dptr);
if(length>=0 && data!=NULL)
{
err = cudaMalloc(dptr,sizeof(cuarray<T>));
if(err==cudaSuccess)
{
err = cudaMalloc(&(dlocal.data),sizeof(T)*length);
dlocal.length = length;
if(err==cudaSuccess)
{
cudaMemcpy(*dptr,&dlocal,sizeof(cuarray<T>),cudaMemcpyHostToDevice);
if(data!=NULL)
err = cudaMemcpy(dlocal.data,data,sizeof(T)*length,cudaMemcpyHostToDevice);
else
err = cudaSuccess;
if(err==cudaSuccess)
{
ret = 1;
}
else
{
ret = -3;
}
}
else
{
ret = -2;
}
}
else
{
ret = -1;
}
}
else
{
dlocal.data = NULL;
dlocal.length = 0;
err = cudaMalloc(dptr,sizeof(cuarray<T>));
if(err==cudaSuccess)
{
cudaMemcpy(*dptr,&dlocal,sizeof(cuarray<T>),cudaMemcpyHostToDevice);
ret = 1;
}
else
{
ret = -4;
}
}
dlocal.data = NULL;
dlocal.length = -1;
return ret;
}
template<typename T> __host__ int cuarray<T>::_device_send_copy(cuarray<T> *dptr)
{
int ret = 0;
cudaError_t err = cudaSuccess;
T* ddata = NULL;
ddata = device_data_ptr(dptr);
err = cudaMemcpy(ddata,data,sizeof(T)*length,cudaMemcpyHostToDevice);
if(err==cudaSuccess)
{
ret = 1;
}
else
{
ret = -1;
}
return ret;
}
template<typename T> __host__ int cuarray<T>::device_pull(cuarray<T> *dptr)
{
int ret = 0;
int dlength;
T* ddata;
cudaError_t err;
if(dptr==NULL)
{
ret = -1; // null d pointer
return ret;
}
dlength = device_length(dptr);
if(dlength!=length)
{
this->resize(dlength);
}
ddata = device_data_ptr(dptr);
if(length>0 && data!=NULL && ddata!=NULL)
{
err = cudaMemcpy(data,dptr,length*sizeof(T),cudaMemcpyDeviceToHost);
if(err==cudaSuccess)
{
ret = 1;
}
else
{
ret = -2;
}
}
return ret;
}
template<typename T> __host__ int cuarray<T>::device_free(cuarray<T> **dptr)
{
int ret = 0;
cuarray<T> dlocal;
if(*dptr!=NULL)
{
cudaMemcpy(&dlocal,dptr,sizeof(cuarray<T>),cudaMemcpyDeviceToHost);
if(dlocal.data!=NULL)
{
cudaFree(dlocal.data);
dlocal.data = NULL;
}
cudaFree(*dptr);
*dptr = NULL;
ret = 1;
}
dlocal.data = NULL;
dlocal.length = -1;
return ret;
}
template<typename T> __host__ int cuarray<T>::device_length(cuarray<T> *dptr)
{
int ret = -1;
cuarray<T> dlocal;
if(dptr==NULL)
{
return ret;
}
cudaMemcpy(&dlocal,dptr,sizeof(cuarray<T>),cudaMemcpyDeviceToHost);
ret = dlocal.length;
dlocal.data = NULL;
dlocal.length = -1;
return ret;
}
template<typename T> __host__ T* cuarray<T>::device_data_ptr(cuarray<T> *dptr)
{
T* ret = NULL;
cuarray<T> dlocal;
if(dptr==NULL)
{
return ret;
}
cudaMemcpy(&dlocal,dptr,sizeof(cuarray<T>),cudaMemcpyDeviceToHost);
ret = dlocal.data;
dlocal.data = NULL;
dlocal.length = -1;
return ret;
}
template<typename T> __device__ __host__ int cuarray<T>::size() const
{
return this->length;
}
template<typename T> __device__ __host__ T& cuarray<T>::at(const int I)
{
return this->data[I];
}
template<typename T> __device__ __host__ const T& cuarray<T>::at(const int I) const
{
return this->data[I];
}
template<typename T> __device__ __host__ T& cuarray<T>::operator[](const int I)
{
return this->data[I];
}
template<typename T> __device__ __host__ const T& cuarray<T>::operator[](const int I) const
{
return this->data[I];
}
};
#endif

@ -0,0 +1,19 @@
#ifndef __AMSCUDA_BINARRRW_HPP__
#define __AMSCUDA_BINARRRW_HPP__
namespace amscuda
{
template<typename T> int fread_ndarray(FILE *fp, cuarray<int32_t> *shape, cuarray<T> *buffer);
template<typename T> int fwrite_ndarray(FILE *fp, const cuarray<int32_t> *shape, const cuarray<T> *buffer);
template<typename T> int fwrite_buffer(FILE *fp, const int N, const T *buffer);
template<typename T> int fread_buffer(FILE *fp, const int Nmax, const T *buffer);
}; //end namespace amscuda
#include <amsculib2/amscuda_binarrrw_impl.hpp>
#endif

@ -0,0 +1,194 @@
#ifndef __AMSCUDA_BINARRRW_IMPL_HPP__
#define __AMSCUDA_BINARRRW_IMPL_HPP__
namespace amscuda
{
template<typename T> int fread_ndarray(FILE *fp, cuarray<int32_t> *shape, cuarray<T> *buffer)
{
int ret = 1;
int I;
long piprod;
int32_t q;
int cnt;
int32_t Nd;
if(fp!=NULL)
{
if(!feof(fp))
{
cnt = fread(&Nd,sizeof(int32_t),1,fp);
if(Nd>0 && cnt>0)
{
shape->resize(Nd);
piprod = 1;
for(I=0;I<Nd;I++)
{
cnt = fread(&q,sizeof(int32_t),1,fp);
shape->at(I) = q;
if(q>0)
{
piprod = piprod*q;
}
else
{
piprod = 0;
}
}
buffer->resize(piprod);
if(piprod>0)
{
cnt = fread((buffer->data),sizeof(T),piprod,fp);
if(piprod==cnt)
{
ret = 1;
}
else
{
printf("fread_ndarray, read %d values, expecting %ld\n",cnt,piprod);
ret = 0;
}
}
}
else
{
printf("fread_ndarray: Read a number of dimensions<=0.\n");
Nd = 0;
shape->resize(0);
buffer->resize(0);
}
}
else
{
printf("fread_ndarray: fp=NULL.\n");
ret = 0;
}
}
else
{
ret = 0;
}
return ret;
}
template<typename T> int fwrite_ndarray(FILE *fp, const cuarray<int32_t> *shape, const cuarray<T> *buffer)
{
int ret = 1;
long piprod;
int I;
int32_t Nd;
if(fp==NULL)
{
ret = 0;
printf("fwrite_ndarray: fp=NULL\n");
return ret;
}
piprod = 1;
for(I=0;I<shape->size();I++)
{
if(shape->at(I)>0)
{
piprod = piprod*shape->at(I);
}
else
{
piprod = 0;
}
}
Nd = (int32_t) shape->size();
if(piprod!=buffer->size())
{
ret = 0;
printf("fwrite_ndarray: buffer is size %ld, while shape is size %ld\n",(long)buffer->size(),(long)piprod);
return ret;
}
fwrite(&Nd,sizeof(int32_t),1,fp);
if(Nd>0)
{
fwrite(shape->data,sizeof(int32_t),Nd,fp);
if(piprod>0)
{
fwrite(buffer->data,sizeof(T),buffer->size(),fp);
}
}
return ret;
}
template<typename T> int fwrite_buffer(FILE *fp, const int N, const T *buffer)
{
int ret = 0;
int Nd = 1;
if(fp==NULL)
{
ret = 0;
printf("fwrite_buffer: fp=NULL\n");
return ret;
}
fwrite(&Nd,sizeof(int32_t),1,fp);
fwrite(&N,sizeof(int32_t),1,fp);
fwrite(buffer,sizeof(T),N,fp);
return ret;
}
template<typename T> int fread_buffer(FILE *fp, const int Nmax, const T *buffer)
{
int ret = 0;
int cnt;
int32_t Nd;
int32_t *dims = NULL;
int piprod;
int32_t q;
int I;
int Nr;
if(fp==NULL) {ret = -1; return ret;}
if(feof(fp)) {ret = -2; return ret;}
cnt = fread(&Nd,sizeof(int32_t),1,fp);
if(Nd>0 && cnt>0)
{
piprod = 1;
dims = new(std::nothrow) int32_t[Nd];
for(I=0;I<Nd;I++)
{
cnt = fread(&q,sizeof(int32_t),1,fp);
dims[I] = q;
piprod = piprod*dims[I];
if(piprod==cnt)
{
ret = 1;
}
else
{
printf("fwrite_buffer, read %d values, expecting %d\n",cnt,piprod);
}
}
Nr = amscuda::min<int32_t>(Nmax,piprod);
cnt = fread(buffer,sizeof(T),Nr,fp);
}
if(dims!=NULL) {delete[] dims; dims=NULL;}
return ret;
}
}; //end namespace amscuda
#endif

@ -0,0 +1,11 @@
#ifndef __AMSCUGEOM_HPP__
#define __AMSCUGEOM_HPP__
namespace amscuda
{
}; //end namespace amscuda
#endif

@ -0,0 +1,70 @@
#ifndef __AMSCULIB2_HPP__
#define __AMSCULIB2_HPP__
//Std Lib Includes
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <stdint.h>
#include <time.h>
#include <new>
#include <cuda_runtime_api.h> //where all the cuda functions live
#include <cuda_runtime.h>
#include <cuda.h>
//Dependencies
//Predeclarations
class cuvect2;
class cuvect3;
class cuvect4;
class cuvect2f;
class cuvect3f;
class cuvect4f;
//Need a way to define the same symbols using both host and device code
//A solution was found here: https://stackoverflow.com/questions/9457572/cuda-host-and-device-using-same-constant-memory
#ifdef __CUDA_ARCH__
#define AMSCU_CONST __constant__
#else
#define AMSCU_CONST
#endif
namespace amscuda
{
//default thread and block execution
AMSCU_CONST static const int amscu_defnblocks = 256;
AMSCU_CONST static const int amscu_defnthreads = 512;
//default numthreads to execute on cpu
AMSCU_CONST static const int amscu_defcputhreads = 8;
}; //end namespace amscuda
//Components
#include <amsculib2/amscu_cudafunctions.hpp>
#include <amsculib2/amscumath.hpp>
#include <amsculib2/amscu_comp64.hpp>
#include <amsculib2/amscu_comp128.hpp>
#include <amsculib2/cuvect2.hpp>
#include <amsculib2/cuvect3.hpp>
#include <amsculib2/cuvect4.hpp>
#include <amsculib2/cuvect2f.hpp>
#include <amsculib2/cuvect3f.hpp>
#include <amsculib2/cuvect4f.hpp>
#include <amsculib2/amscugeom.hpp>
#include <amsculib2/amscuarray.hpp>
#include <amsculib2/amscuda_binarrrw.hpp>
#include <amsculib2/amscu_random.hpp>
#include <amsculib2/amscuarray_dops.hpp>
#include <amsculib2/amscurarray.cuh>
#endif

@ -0,0 +1,56 @@
#ifndef __AMSCUMATH_HPP__
#define __AMSCUMATH_HPP__
namespace amscuda
{
//Problem: These are not in the namespace
//#define nan NAN
//#define fnan (float) NAN
//#define inf INFINITY
//#define finf (float) INFINITY
//#define pi 3.1415926535897936
//These need to be the same symbol for both host and device code
AMSCU_CONST static const double nan = NAN;
AMSCU_CONST static const float fnan = (float) NAN;
AMSCU_CONST static const double inf = INFINITY;
AMSCU_CONST static const float finf = (float) INFINITY;
AMSCU_CONST static const double pi = 3.1415926535897936;
AMSCU_CONST static const float pif = 3.1415926535897936;
__host__ __device__ double dabs(double x);
__host__ __device__ float fabs(float x);
template<typename T> __host__ __device__ T abs(const T in)
{
T ret = in;
if(in<0) ret = -in;
return ret;
}
__host__ __device__ double mod(double a, double md);
__host__ __device__ float mod(float a, float md);
__host__ __device__ int mod(int x, int n);
__host__ __device__ long mod(long x, long n);
__host__ __device__ int truediv(int x, int y);
__host__ __device__ long truediv(long x, long y);
template<typename T> __host__ __device__ T min(T a, T b);
template<typename T> __host__ __device__ T max(T a, T b);
__device__ __host__ double arg(double x, double y);
__device__ __host__ void get_azel(double x, double y, double z, double *az, double *el);
void test_amscumath1();
}; //end namespace amscuda
#include <amsculib2/amscumath_impl.hpp>
#endif

@ -0,0 +1,42 @@
#ifndef __AMSCUMATH_IMPL_HPP__
#define __AMSCUMATH_IMPL_HPP__
namespace amscuda
{
template<typename T> __host__ __device__ T min(T a, T b)
{
if(a>b)
{
return b;
}
else
{
return a;
}
return a;
}
template<typename T> __host__ __device__ T max(T a, T b)
{
if(a>b)
{
return a;
}
else
{
return b;
}
return a;
}
template<> __host__ __device__ double min(double a, double b);
template<> __host__ __device__ float min(float a, float b);
template<> __host__ __device__ double max(double a, double b);
template<> __host__ __device__ float max(float a, float b);
}; //end namespace amscuda
#endif

@ -0,0 +1,66 @@
#ifndef __AMSCURARRAY_HPP__
#define __AMSCURARRAY_HPP__
namespace amscuda
{
//Cuda ragged array class
template<typename T> class curarray
{
public:
int device;
curarray* devptr; //pointer to mirror class on the device
int Narrays; //number of arrays
int *N; //dimension of each array
T** hostarrayptrs; //pointers to each array on the host - null on the device
T** devarrayptrs; //pointers to each array on the device
//the double pointer is a host pointer to device pointers on the host class
//for the device class, only the second set of arrays is in use
//the constructor and destructor set all pointers to NULL, they
// do *not* manage memory. This is done with curarray_new and curarray_delete
__device__ __host__ curarray();
__device__ __host__ ~curarray();
__host__ int push();
__host__ int pull();
//__device__ int dev_resizearray(int arraynum, int arraysize);
__host__ int resizearray(int arraynum, int arraysize);
// I may want a way to resize arrays on the device without pushing/pulling all the array contents
};
template<typename T> int curarray_new(curarray<T>** ptr, int Narrays);
template<typename T> int curarray_delete(curarray<T>** ptr);
template<typename T> int curarray_device_new(curarray<T> *hostptr);
template<typename T> int curarray_device_delete(curarray<T> *hostptr);
template<typename T> int curarray_push(curarray<T> *hostptr);
template<typename T> int curarray_pull(curarray<T> *hostptr);
//template<typename T> int curarray_host_fillall(curarray<T> *hostptr, const T &val);
//template<typename T> int curarray_device_fillall(curarray<T> *hostptr, const T &val);
//template<typename T> __host__ int curarray_deletearray(curarray<T> *hostptr, int arrayindex);
//template<typename T> __device__ int curarray_dev_deletearray(curarray<T> *devptr, int arrayindex);
//template<typename T> __host__ int curarray_allocarray(curarray<T> *hostptr, int arrayindex, int size);
//template<typename T> __device__ int curarray_dev_allocarray(curarray<T> *devptr, int arrayindex, int size);
void test_amscurarray1();
};
#include <amsculib2/amscurarray_impl.cuh>
#endif

@ -0,0 +1,529 @@
#ifndef __AMSCURARRAY_IMPL_HPP__
#define __AMSCURARRAY_IMPL_HPP__
namespace amscuda
{
template<typename T> curarray<T>::curarray()
{
device = -1;
devptr = NULL;
Narrays = 0;
N = NULL;
hostarrayptrs = NULL;
devarrayptrs = NULL;
}
template<typename T> curarray<T>::~curarray()
{
device = -1;
devptr = NULL;
Narrays = 0;
N = NULL;
hostarrayptrs = NULL;
devarrayptrs = NULL;
}
template<typename T> int curarray_new(curarray<T>** ptr, int Narrays)
{
int ret = 0;
int device;
curarray<T> *lhptr = *ptr;
cudaGetDevice(&device);
if(lhptr!=NULL)
{
curarray_delete(ptr);
}
*ptr = new(std::nothrow) curarray<T>();
lhptr = *ptr;
int I;
if(Narrays<0) Narrays=0;
lhptr->Narrays = Narrays;
lhptr->device = device;
lhptr->N = new(std::nothrow) int[Narrays];
lhptr->hostarrayptrs = new(std::nothrow) T*[Narrays];
lhptr->devarrayptrs = new(std::nothrow) T*[Narrays];
for(I=0;I<Narrays;I++)
{
lhptr->N[I] = 0;
lhptr->hostarrayptrs[I] = NULL;
lhptr->devarrayptrs[I] = NULL;
}
curarray_device_new(lhptr);
return ret;
}
template<typename T> int curarray_delete(curarray<T>** ptr)
{
int ret = 0;
curarray<T> *lptr = NULL;
int olddev;
int I;
if(*ptr==NULL)
{
return 0;
}
lptr = *ptr;
cudaGetDevice(&olddev);
cudaSetDevice(lptr->device);
if(lptr->devptr!=NULL)
{
curarray_device_delete(lptr);
}
lptr->device = -1;
for(I=0;I<lptr->Narrays;I++)
{
if(lptr->hostarrayptrs!=NULL)
{
if(lptr->hostarrayptrs[I]!=NULL)
{
delete[] lptr->hostarrayptrs[I];
lptr->hostarrayptrs[I] = NULL;
}
}
if(lptr->devarrayptrs!=NULL)
{
if(lptr->devarrayptrs[I]!=NULL)
{
//erasing device memory should have been handled in curarray_device_delete
lptr->devarrayptrs[I] = NULL;
}
}
lptr->N[I] = 0;
}
if(lptr->N != NULL) {delete[] lptr->N; lptr->N = NULL;}
if(lptr->hostarrayptrs!=NULL) {delete[] lptr->hostarrayptrs; lptr->hostarrayptrs=NULL;}
if(lptr->devarrayptrs!=NULL) {delete[] lptr->devarrayptrs; lptr->devarrayptrs=NULL;}
if(*ptr!=NULL) {delete *ptr; *ptr = NULL;}
cudaSetDevice(olddev);
return ret;
}
template<typename T> int curarray_device_new(curarray<T> *hostptr)
{
int ret = 0;
curarray<T> ldevdata;
if(hostptr==NULL) return -1;
if(hostptr->devptr!=NULL)
{
curarray_device_delete(hostptr);
}
int I;
cudaGetDevice(&(hostptr->device));
ldevdata.device = hostptr->device;
ldevdata.Narrays = hostptr->Narrays;
int Narrays = hostptr->Narrays;
cudaMalloc(&(ldevdata.N),sizeof(int)*Narrays);
cudaMemcpy(ldevdata.N,hostptr->N,sizeof(int)*Narrays,cudaMemcpyHostToDevice);
ldevdata.hostarrayptrs = NULL;
for(I=0;I<Narrays;I++)
{
if(hostptr->N[I]>0)
{
if(hostptr->devarrayptrs[I]!=NULL)
{
cudaFree(hostptr->devarrayptrs[I]);
hostptr->devarrayptrs[I] = NULL;
}
cudaMalloc(&(hostptr->devarrayptrs[I]),sizeof(T)*hostptr->N[I]);
cudaMemcpy(hostptr->devarrayptrs[I],hostptr->hostarrayptrs[I],sizeof(T)*hostptr->N[I],cudaMemcpyHostToDevice);
}
else
{
if(hostptr->devarrayptrs[I]!=NULL)
{
cudaFree(hostptr->devarrayptrs[I]);
hostptr->devarrayptrs[I] = NULL;
}
}
}
cudaMalloc(&(ldevdata.devarrayptrs),sizeof(T*)*Narrays);
cudaMemcpy(ldevdata.devarrayptrs,hostptr->devarrayptrs,sizeof(T*)*Narrays,cudaMemcpyHostToDevice);
cudaMalloc(&(hostptr->devptr),sizeof(curarray<T>));
cudaMemcpy(hostptr->devptr,&ldevdata,sizeof(curarray<T>),cudaMemcpyHostToDevice);
ret = 1;
return ret;
}
template<typename T> int curarray_device_delete(curarray<T> *hostptr)
{
int ret = 0;
curarray<T> ldevdata;
int olddev;
if(hostptr->devptr==NULL)
{
return 0;
}
cudaGetDevice(&olddev);
cudaSetDevice(hostptr->device);
cudaMemcpy(&ldevdata,hostptr->devptr,sizeof(curarray<T>),cudaMemcpyDeviceToHost);
int I;
for(I=0;I<hostptr->Narrays;I++)
{
if(hostptr->devarrayptrs[I]!=NULL)
{
cudaFree(hostptr->devarrayptrs[I]);
hostptr->devarrayptrs[I] = NULL;
}
}
cudaFree(ldevdata.devarrayptrs);
cudaFree(ldevdata.N);
cudaFree(hostptr->devptr);
hostptr->devptr = NULL;
hostptr->device = -1;
cudaSetDevice(olddev);
ret = 1;
return ret;
}
template<typename T> int curarray_push(curarray<T> *hostptr)
{
int ret = 0;
int olddev;
curarray<T> ldevdata;
T** ldevarrayptrs = NULL;
int *devN = NULL;
if(hostptr==NULL) return -1;
cudaGetDevice(&olddev);
cudaSetDevice(hostptr->device);
int Narrays = hostptr->Narrays;
cudaMemcpy(&ldevdata,hostptr->devptr,sizeof(curarray<T>),cudaMemcpyDeviceToHost);
ldevarrayptrs = new(std::nothrow) T*[Narrays];
devN = new(std::nothrow) int[Narrays];
cudaMemcpy(ldevarrayptrs,ldevdata.devarrayptrs,sizeof(T*)*Narrays,cudaMemcpyDeviceToHost);
cudaMemcpy(devN,ldevdata.N,sizeof(int)*Narrays,cudaMemcpyDeviceToHost);
int I;
for(I=0;I<Narrays;I++)
{
//check to see that host size is the same as device size, and that
//the host device pointer is the same as the device device pointer
if( (hostptr->N[I]!=devN[I]) ||
(hostptr->devarrayptrs[I] != ldevarrayptrs[I])
)
{
cudaFree(ldevarrayptrs[I]);
ldevarrayptrs[I] = NULL;
hostptr->devarrayptrs[I] = NULL;
if(hostptr->N[I]>0)
{
cudaMalloc(&(hostptr->devarrayptrs[I]),sizeof(T)*hostptr->N[I]);
ldevarrayptrs[I] = hostptr->devarrayptrs[I];
devN[I] = hostptr->N[I];
}
else
{
devN[I] = 0;
}
}
if(hostptr->N[I]>0)
{
//copy host data to device
cudaMemcpy(hostptr->devarrayptrs[I],hostptr->hostarrayptrs[I],sizeof(T)*hostptr->N[I],cudaMemcpyHostToDevice);
}
} //for each array
//rectify and copy device data structure to device
ldevdata.device = hostptr->device;
ldevdata.devptr = NULL;
ldevdata.Narrays = hostptr->Narrays; //later - logic for dealing with when this is not true
ldevdata.hostarrayptrs = NULL;
cudaMemcpy(ldevdata.N,hostptr->N,sizeof(int)*Narrays,cudaMemcpyHostToDevice);
cudaMemcpy(ldevdata.devarrayptrs,hostptr->devarrayptrs,sizeof(T*)*Narrays,cudaMemcpyHostToDevice);
cudaMemcpy(hostptr->devptr,&ldevdata,sizeof(curarray<T>),cudaMemcpyHostToDevice);
cuda_errortrap("curarray_push cuda error:");
cudaSetDevice(olddev);
delete[] ldevarrayptrs;
delete[] devN;
return ret;
}
template<typename T> int curarray_pull(curarray<T> *hostptr)
{
int ret = 0;
int olddev;
curarray<T> ldevdata;
T** ldevarrayptrs = NULL;
int *devN = NULL;
if(hostptr==NULL) return -1;
cudaGetDevice(&olddev);
cudaSetDevice(hostptr->device);
cuda_errortrap("dbg1");
int Narrays = hostptr->Narrays;
cudaMemcpy(&ldevdata,hostptr->devptr,sizeof(curarray<T>),cudaMemcpyDeviceToHost);
ldevarrayptrs = new(std::nothrow) T*[Narrays];
devN = new(std::nothrow) int[Narrays];
cuda_errortrap("dbg2");
cudaMemcpy(ldevarrayptrs,ldevdata.devarrayptrs,sizeof(T*)*Narrays,cudaMemcpyDeviceToHost);
cudaMemcpy(devN,ldevdata.N,sizeof(int)*Narrays,cudaMemcpyDeviceToHost);
cuda_errortrap("dbg3");
char dbgjnk[50];
int I;
for(I=0;I<Narrays;I++)
{
//check to see that host size is the same as device size, and that
//the host device pointer is the same as the device device pointer
if(hostptr->devarrayptrs[I] != ldevarrayptrs[I])
{
hostptr->devarrayptrs[I] = ldevarrayptrs[I];
}
if(hostptr->N[I]!=devN[I])
{
if(hostptr->hostarrayptrs[I]!=NULL)
{
delete[] hostptr->hostarrayptrs[I];
hostptr->hostarrayptrs[I] = NULL;
}
if(devN[I]>0)
{
hostptr->hostarrayptrs[I] = new(std::nothrow) T[devN[I]];
hostptr->N[I] = devN[I];
}
else
{
hostptr->N[I] = 0;
}
}
if(hostptr->hostarrayptrs[I]!=NULL && hostptr->devarrayptrs[I]!=NULL)
{
cudaMemcpy(hostptr->hostarrayptrs[I],hostptr->devarrayptrs[I],sizeof(T)*hostptr->N[I],cudaMemcpyDeviceToHost);
sprintf(dbgjnk,"%d dbg %d",I,hostptr->N[I]);
cuda_errortrap(dbgjnk);
}
} //for each array
//for the pull operation, I don't think any update of the device data structure is necessary
cudaSetDevice(olddev);
delete[] ldevarrayptrs;
delete[] devN;
return ret;
}
template<typename T> __host__ int curarray<T>::push()
{
return curarray_push(this);
}
template<typename T> __host__ int curarray<T>::pull()
{
return curarray_pull(this);
}
/*
https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#memory-allocation-and-lifetime%5B/url%5D
cudaMalloc() and cudaFree() have distinct semantics between the host and
device environments. When invoked from the host, cudaMalloc() allocates a
new region from unused device memory. When invoked from the device runtime
these functions map to device-side malloc() and free(). This implies that
within the device environment the total allocatable memory is limited to the
device malloc() heap size, which may be smaller than the available unused
device memory. Also, it is an error to invoke cudaFree() from the host
program on a pointer which was allocated by cudaMalloc() on the device
or vice-versa.
So, basically this entire function is not going to work. I'll be unable to resize within
a kernel.
*/
/*
template<typename T> __device__ int curarray<T>::dev_resizearray(int arraynum, int arraysize)
{
int ret = 0;
T* newptr = NULL;
int I;
T def;
if(arraynum>=0 && arraynum<Narrays)
{
if(N[arraynum]!=arraysize)
{
if(arraysize<=0)
{
if(devarrayptrs[arraynum]!=NULL) cudaFree(devarrayptrs[arraynum]);
devarrayptrs[arraynum] = NULL;
N[arraynum] = 0;
ret = 1;
return ret;
}
cudaMalloc(&newptr,arraysize*sizeof(T));
if(newptr!=NULL)
{
//do I want to assume there is a copy operator? (operator=)
//for now, yes - write a more restrictive class later if I don't want it
if(devarrayptrs[arraynum]!=NULL)
{
for(I=0;I<N[arraynum]&&I<arraysize;I++)
{
newptr[I] = devarrayptrs[arraynum][I];
}
}
for(I=N[arraynum];I<arraysize;I++)
{
newptr[I] = def;
}
if(devarrayptrs[arraynum]!=NULL) cudaFree(devarrayptrs[arraynum]);
devarrayptrs[arraynum] = newptr;
N[arraynum] = arraysize;
ret = 1;
}
else
{
ret = -1;
}
}
else
{
ret = 1;
}
}
return ret;
}
*/
template<typename T> __host__ int curarray<T>::resizearray(int arraynum, int arraysize)
{
int ret = 0;
T* newptr = NULL;
int I;
T def;
if(arraynum>=0 && arraynum<Narrays)
{
if(N[arraynum]!=arraysize)
{
if(arraysize<=0)
{
delete[] hostarrayptrs[arraynum];
hostarrayptrs[arraynum] = NULL;
N[arraynum] = 0;
ret = 1;
return ret;
}
newptr = new(std::nothrow) T[arraysize];
if(newptr!=NULL)
{
//do I want to assume there is a copy operator? (operator=)
//for now, yes - write a more restrictive class later if I don't want it
if(hostarrayptrs[arraynum]!=NULL)
{
for(I=0;I<N[arraynum]&&I<arraysize;I++)
{
newptr[I] = hostarrayptrs[arraynum][I];
}
}
for(I=N[arraynum];I<arraysize;I++)
{
newptr[I] = def;
}
//cudaFree(hostarrayptrs[arraynum]);
delete[] hostarrayptrs[arraynum];
hostarrayptrs[arraynum] = newptr;
N[arraynum] = arraysize;
ret = 1;
}
else
{
ret = -1;
}
}
else
{
ret = 1;
}
}
return ret;
}
};
#endif

@ -0,0 +1,84 @@
#ifndef __CUVECT2_HPP__
#define __CUVECT2_HPP__
namespace amscuda
{
class cuvect2
{
public:
double x;
double y;
__host__ __device__ cuvect2();
__host__ __device__ ~cuvect2();
__host__ __device__ cuvect2(double _x, double _y);
__host__ __device__ double& operator[](const int I);
__host__ __device__ const double& operator[](const int I) const;
__host__ __device__ cuvect2 operator+(cuvect2 lhs);
__host__ __device__ cuvect2 operator-(cuvect2 lhs);
__host__ __device__ cuvect2 operator*(double lhs);
__host__ __device__ cuvect2 operator/(double lhs);
};
class cumat2
{
public:
double dat[4];
__host__ __device__ cumat2();
__host__ __device__ ~cumat2();
__host__ __device__ double& operator[](const int I);
__host__ __device__ double& operator()(const int I, const int J);
__host__ __device__ double& at(const int I, const int J);
__host__ __device__ cumat2 operator+(cumat2 lhs);
__host__ __device__ cumat2 operator-(cumat2 lhs);
__host__ __device__ cumat2 operator*(double lhs);
__host__ __device__ cumat2 operator/(double lhs);
__host__ __device__ cuvect2 operator*(cuvect2 lhs);
__host__ __device__ cumat2 operator*(cumat2 lhs);
__host__ __device__ double det();
__host__ __device__ cumat2 transpose();
__host__ __device__ cumat2 inverse();
};
__host__ __device__ double cuvect2_dot(cuvect2 a, cuvect2 b);
__host__ __device__ double cuvect2_cross(cuvect2 a, cuvect2 b);
__host__ __device__ double cuvect2_norm(cuvect2 a);
__host__ __device__ cuvect2 cuvect2_normalize(cuvect2 a);
__host__ __device__ cuvect2 cuvect2_proj(cuvect2 a, cuvect2 b);
//2x2 matrix operations
//matrix order is assumed to be mat[I,J] = mat[I+3*J]
//transpose a 2x2 matrix in place
__host__ __device__ void mat2_transpose(double *mat2inout);
//copies src to dest
__host__ __device__ void mat2_copy(double *mat2_dest, const double *mat2_src);
//inverts mat?inout[4]
__host__ __device__ void mat2_inverse(double *mat2inout);
//rotatin matrix from angle
__host__ __device__ void mat2_rot_from_angle(double angle, double *mat2);
//multiplies c = a*b
__host__ __device__ void mat2_mult(double *mat2a, double *mat2b, double *mat2c);
// ret = a*b
__host__ __device__ cuvect2 mat2_mult(double *mat2a, cuvect2 b);
void test_cuvect2_1();
}; //end namespace amscuda
#endif

@ -0,0 +1,84 @@
#ifndef __CUVECT2F_HPP__
#define __CUVECT2F_HPP__
namespace amscuda
{
class cuvect2f
{
public:
float x;
float y;
__host__ __device__ cuvect2f();
__host__ __device__ ~cuvect2f();
__host__ __device__ cuvect2f(float _x, float _y);
__host__ __device__ float& operator[](const int I);
__host__ __device__ const float& operator[](const int I) const;
__host__ __device__ cuvect2f operator+(cuvect2f lhs);
__host__ __device__ cuvect2f operator-(cuvect2f lhs);
__host__ __device__ cuvect2f operator*(float lhs);
__host__ __device__ cuvect2f operator/(float lhs);
};
class cumat2f
{
public:
float dat[4];
__host__ __device__ cumat2f();
__host__ __device__ ~cumat2f();
__host__ __device__ float& operator[](const int I);
__host__ __device__ float& operator()(const int I, const int J);
__host__ __device__ float& at(const int I, const int J);
__host__ __device__ cumat2f operator+(cumat2f lhs);
__host__ __device__ cumat2f operator-(cumat2f lhs);
__host__ __device__ cumat2f operator*(float lhs);
__host__ __device__ cumat2f operator/(float lhs);
__host__ __device__ cuvect2f operator*(cuvect2f lhs);
__host__ __device__ cumat2f operator*(cumat2f lhs);
__host__ __device__ float det();
__host__ __device__ cumat2f transpose();
__host__ __device__ cumat2f inverse();
};
__host__ __device__ float cuvect2f_dot(cuvect2f a, cuvect2f b);
__host__ __device__ float cuvect2f_cross(cuvect2f a, cuvect2f b);
__host__ __device__ float cuvect2f_norm(cuvect2f a);
__host__ __device__ cuvect2f cuvect2f_normalize(cuvect2f a);
__host__ __device__ cuvect2f cuvect2f_proj(cuvect2f a, cuvect2f b);
//2x2 matrix operations
//matrix order is assumed to be mat[I,J] = mat[I+3*J]
//transpose a 2x2 matrix in place
__host__ __device__ void mat2f_transpose(float *mat2inout);
//copies src to dest
__host__ __device__ void mat2f_copy(float *mat2f_dest, const float *mat2f_src);
//inverts mat?inout[4]
__host__ __device__ void mat2f_inverse(float *mat2inout);
//rotatin matrix from angle
__host__ __device__ void mat2f_rot_from_angle(float angle, float *mat2);
//multiplies c = a*b
__host__ __device__ void mat2f_mult(float *mat2a, float *mat2b, float *mat2c);
// ret = a*b
__host__ __device__ cuvect2f mat2f_mult(float *mat2a, cuvect2f b);
void test_cuvect2f_1();
};
#endif

@ -0,0 +1,86 @@
#ifndef __CUVECT3_HPP__
#define __CUVECT3_HPP__
namespace amscuda
{
class cuvect3
{
public:
double x;
double y;
double z;
__host__ __device__ cuvect3();
__host__ __device__ ~cuvect3();
__host__ __device__ cuvect3(double _x, double _y, double _z);
__host__ __device__ double& operator[](const int I);
__host__ __device__ const double& operator[](const int I) const;
__host__ __device__ cuvect3 operator+(cuvect3 lhs);
__host__ __device__ cuvect3 operator-(cuvect3 lhs);
__host__ __device__ cuvect3 operator*(double lhs);
__host__ __device__ cuvect3 operator/(double lhs);
};
class cumat3
{
public:
double dat[9];
__host__ __device__ cumat3();
__host__ __device__ ~cumat3();
__host__ __device__ double& operator[](const int I);
__host__ __device__ double& operator()(const int I, const int J);
__host__ __device__ double& at(const int I, const int J);
__host__ __device__ cumat3 operator+(cumat3 lhs);
__host__ __device__ cumat3 operator-(cumat3 lhs);
__host__ __device__ cumat3 operator*(double lhs);
__host__ __device__ cumat3 operator/(double lhs);
__host__ __device__ cuvect3 operator*(cuvect3 lhs);
__host__ __device__ cumat3 operator*(cumat3 lhs);
__host__ __device__ double det();
__host__ __device__ cumat3 transpose();
__host__ __device__ cumat3 inverse();
};
__host__ __device__ double cuvect3_dot(cuvect3 a, cuvect3 b);
__host__ __device__ cuvect3 cuvect3_cross(cuvect3 a, cuvect3 b);
__host__ __device__ double cuvect3_norm(cuvect3 a);
__host__ __device__ cuvect3 cuvect3_normalize(cuvect3 a);
__host__ __device__ cuvect3 cuvect3_proj(cuvect3 a, cuvect3 b);
//3x3 matrix operations
//matrix order is assumed to be mat[I,J] = mat[I+3*J]
//transposes a 3x3 (9 element) matrix
__host__ __device__ void mat3_transpose(double *mat3inout);
//copies src to dest
__host__ __device__ void mat3_copy(double *mat3_dest, const double *mat3_src);
//returns determinant of 3x3 matrix
__host__ __device__ double mat3_det(double *mat3in);
//inverts a 3x3 (9 element) matrix
__host__ __device__ void mat3_inverse(double *mat3inout);
__host__ __device__ cuvect3 mat3_mult(double *mat3in, cuvect3 cvin);
__host__ __device__ void mat3_mult(double *matina, double *matinb, double *matout);
__host__ __device__ void mat3_hodgedual(cuvect3 vecin, double *matout);
__host__ __device__ void mat3_hodgedual(double *matin, cuvect3 vecout);
//returns direction cosine rotation matrix from axis and angle
__host__ __device__ void mat3_rot_from_axisangle(cuvect3 axis, double angle, double *matout);
__host__ void test_cudavect_logic1();
}; //end namespace amscuda
#endif

@ -0,0 +1,86 @@
#ifndef __CUVECT3F_HPP__
#define __CUVECT3F_HPP__
namespace amscuda
{
class cuvect3f
{
public:
float x;
float y;
float z;
__host__ __device__ cuvect3f();
__host__ __device__ ~cuvect3f();
__host__ __device__ cuvect3f(float _x, float _y, float _z);
__host__ __device__ float& operator[](const int I);
__host__ __device__ const float& operator[](const int I) const;
__host__ __device__ cuvect3f operator+(cuvect3f lhs);
__host__ __device__ cuvect3f operator-(cuvect3f lhs);
__host__ __device__ cuvect3f operator*(float lhs);
__host__ __device__ cuvect3f operator/(float lhs);
};
class cumat3f
{
public:
float dat[9];
__host__ __device__ cumat3f();
__host__ __device__ ~cumat3f();
__host__ __device__ float& operator[](const int I);
__host__ __device__ float& operator()(const int I, const int J);
__host__ __device__ float& at(const int I, const int J);
__host__ __device__ cumat3f operator+(cumat3f lhs);
__host__ __device__ cumat3f operator-(cumat3f lhs);
__host__ __device__ cumat3f operator*(float lhs);
__host__ __device__ cumat3f operator/(float lhs);
__host__ __device__ cuvect3f operator*(cuvect3f lhs);
__host__ __device__ cumat3f operator*(cumat3f lhs);
__host__ __device__ float det();
__host__ __device__ cumat3f transpose();
__host__ __device__ cumat3f inverse();
};
__host__ __device__ float cuvect3f_dot(cuvect3f a, cuvect3f b);
__host__ __device__ cuvect3f cuvect3f_cross(cuvect3f a, cuvect3f b);
__host__ __device__ float cuvect3f_norm(cuvect3f a);
__host__ __device__ cuvect3f cuvect3f_normalize(cuvect3f a);
__host__ __device__ cuvect3f cuvect3f_proj(cuvect3f a, cuvect3f b);
//3x3 matrix operations
//matrix order is assumed to be mat[I,J] = mat[I+3*J]
//transposes a 3x3 (9 element) matrix
__host__ __device__ void mat3f_transpose(float *mat3inout);
//copies src to dest
__host__ __device__ void mat3f_copy(float *mat3f_dest, const float *mat3f_src);
//returns determinant of 3x3 matrix
__host__ __device__ float mat3f_det(float *mat3in);
//inverts a 3x3 (9 element) matrix
__host__ __device__ void mat3f_inverse(float *mat3inout);
__host__ __device__ cuvect3f mat3f_mult(float *mat3in, cuvect3f cvin);
__host__ __device__ void mat3f_mult(float *matina, float *matinb, float *matout);
__host__ __device__ void mat3f_hodgedual(cuvect3f vecin, float *matout);
__host__ __device__ void mat3f_hodgedual(float *matin, cuvect3f vecout);
//returns direction cosine rotation matrix from axis and angle
__host__ __device__ void mat3f_rot_from_axisangle(cuvect3f axis, float angle, float *matout);
__host__ void test_cudavectf_logic1();
};
#endif

@ -0,0 +1,59 @@
#ifndef __CUVECT4_HPP__
#define __CUVECT4_HPP__
namespace amscuda
{
class cuvect4
{
public:
double x;
double y;
double z;
double w;
__host__ __device__ cuvect4();
__host__ __device__ ~cuvect4();
__host__ __device__ cuvect4(double _x, double _y, double _z, double _w);
__host__ __device__ double& operator[](const int I);
__host__ __device__ const double& operator[](const int I) const;
__host__ __device__ cuvect4 operator+(cuvect4 lhs);
__host__ __device__ cuvect4 operator-(cuvect4 lhs);
__host__ __device__ cuvect4 operator*(double lhs);
__host__ __device__ cuvect4 operator/(double lhs);
};
class cumat4
{
public:
double dat[16];
__host__ __device__ cumat4();
__host__ __device__ ~cumat4();
__host__ __device__ double& operator[](const int I);
__host__ __device__ double& operator()(const int I, const int J);
__host__ __device__ double& at(const int I, const int J);
__host__ __device__ cumat4 operator+(cumat4 lhs);
__host__ __device__ cumat4 operator-(cumat4 lhs);
__host__ __device__ cumat4 operator*(double lhs);
__host__ __device__ cumat4 operator/(double lhs);
__host__ __device__ cuvect4 operator*(cuvect4 lhs);
__host__ __device__ cumat4 operator*(cumat4 lhs);
__host__ __device__ double det();
__host__ __device__ cumat4 transpose();
__host__ __device__ cumat4 inverse();
};
__host__ __device__ double cuvect4_dot(cuvect4 a, cuvect4 b);
__host__ __device__ double cuvect4_norm(cuvect4 a);
__host__ __device__ cuvect4 cuvect4_normalize(cuvect4 a);
__host__ __device__ cuvect4 cuvect4_proj(cuvect4 a, cuvect4 b);
}; //end namespace amscuda
#endif

@ -0,0 +1,60 @@
#ifndef __CUVECT4F_HPP__
#define __CUVECT4F_HPP__
namespace amscuda
{
class cuvect4f
{
public:
float x;
float y;
float z;
float w;
__host__ __device__ cuvect4f();
__host__ __device__ ~cuvect4f();
__host__ __device__ cuvect4f(float _x, float _y, float _z, float _w);
__host__ __device__ float& operator[](const int I);
__host__ __device__ const float& operator[](const int I) const;
__host__ __device__ cuvect4f operator+(cuvect4f lhs);
__host__ __device__ cuvect4f operator-(cuvect4f lhs);
__host__ __device__ cuvect4f operator*(float lhs);
__host__ __device__ cuvect4f operator/(float lhs);
};
class cumat4f
{
public:
float dat[16];
__host__ __device__ cumat4f();
__host__ __device__ ~cumat4f();
__host__ __device__ float& operator[](const int I);
__host__ __device__ float& operator()(const int I, const int J);
__host__ __device__ float& at(const int I, const int J);
__host__ __device__ cumat4f operator+(cumat4f lhs);
__host__ __device__ cumat4f operator-(cumat4f lhs);
__host__ __device__ cumat4f operator*(float lhs);
__host__ __device__ cumat4f operator/(float lhs);
__host__ __device__ cuvect4f operator*(cuvect4f lhs);
__host__ __device__ cumat4f operator*(cumat4f lhs);
__host__ __device__ float det();
__host__ __device__ cumat4f transpose();
__host__ __device__ cumat4f inverse();
};
__host__ __device__ float cuvect4f_dot(cuvect4f a, cuvect4f b);
__host__ __device__ float cuvect4f_norm(cuvect4f a);
__host__ __device__ cuvect4f cuvect4f_normalize(cuvect4f a);
__host__ __device__ cuvect4f cuvect4f_proj(cuvect4f a, cuvect4f b);
};
#endif

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

@ -0,0 +1,23 @@
#!/usr/bin/python3
import os,sys,math;
from compscripts.complib2 import *;
args = sys.argv
if(len(args)>=2):
if(args[1]=="clean"):
obj_list = flist('./objstore',recurse=True,exts=['.o'])
for o in obj_list:
os.remove('{}'.format(o))
exit(0)
os.system('python3 ./compscripts/linux64.makelib.py')
os.system('python3 ./compscripts/linux64.maketest.py')
# obj_list = flist('./src',recurse=True,exts=['.o'])
# for o in obj_list:
# os.remove('{}'.format(o))
#os.chdir('./bin_linux64')
callproc('./bin_linux64/test')
#os.chdir('..')

@ -0,0 +1,17 @@
#!/usr/bin/python3
import os,sys,math;
from compscripts.complib2 import *;
os.system('python ./compscripts/winnvcc.makelib.py')
os.system('python ./compscripts/winnvcc.maketest.py')
obj_list = flist('./',recurse=True,exts=['.o'])
for o in obj_list:
os.remove('{}'.format(o))
#os.chdir('./bin_winx64')
callproc('.\\bin_winx64\\test.exe')
#os.chdir('..')

@ -0,0 +1,476 @@
#include <amsculib2/amsculib2.hpp>
namespace amscuda
{
namespace cmp
{
__host__ __device__ cucomp128::cucomp128()
{
real = 0.0;
imag = 0.0;
return;
}
__host__ __device__ cucomp128::~cucomp128()
{
real = 0.0;
imag = 0.0;
return;
}
__host__ __device__ cucomp128::cucomp128(const cucomp128 &other)
{
real = other.real;
imag = other.imag;
return;
}
__host__ __device__ cucomp128::cucomp128(const double &other)
{
real = other;
imag = 0.0;
return;
}
__host__ __device__ cucomp128& cucomp128::operator=(cucomp128& other)
{
real = other.real;
imag = other.imag;
return *this;
}
__host__ __device__ const cucomp128& cucomp128::operator=(const cucomp128& other)
{
this->real = other.real;
this->imag = other.imag;
return *this;
}
__host__ __device__ cucomp128& cucomp128::operator=(double& other)
{
real = other;
imag = 0.0;
return *this;
}
__host__ __device__ const cucomp128& cucomp128::operator=(const double& other)
{
this->real = other;
this->imag = 0.0;
return *this;
}
__host__ __device__ double& cucomp128::operator[](int& ind)
{
if(ind==0)
{
return this->real;
}
else
{
return this->imag;
}
}
__host__ __device__ const double& cucomp128::operator[](const int& ind) const
{
if(ind==0)
{
return this->real;
}
else
{
return this->imag;
}
}
__host__ __device__ cucomp128 cucomp128::operator+(const cucomp128& z)
{
cucomp128 ret;
ret.real = real + z.real;
ret.imag = imag + z.imag;
return ret;
}
__host__ __device__ cucomp128 cucomp128::operator-(const cucomp128& z)
{
cucomp128 ret;
ret.real = real - z.real;
ret.imag = imag - z.imag;
return ret;
}
__host__ __device__ cucomp128 cucomp128::operator*(const cucomp128& z)
{
cucomp128 ret;
ret.real = (real*z.real - imag*z.imag);
ret.imag = (imag*z.real + real*z.imag);
return ret;
}
__host__ __device__ cucomp128 cucomp128::operator/(const cucomp128& z)
{
cucomp128 ret;
double zm2 = z.real*z.real+z.imag*z.imag;
if(zm2>0.0)
{
ret.real = (this->real*z.real+this->imag*z.imag)/zm2;
ret.imag = (this->imag*z.real-this->real*z.imag)/zm2;
}
else
{
ret.real = (double) finf;
ret.imag = (double) finf;
}
return ret;
}
__host__ __device__ cucomp128 cucomp128::operator+(const double& z)
{
cucomp128 ret;
ret.real = this->real + z;
ret.imag = this->imag;
return ret;
}
__host__ __device__ cucomp128 cucomp128::operator-(const double& z)
{
cucomp128 ret;
ret.real = real-z;
ret.imag = imag;
return ret;
}
__host__ __device__ cucomp128 cucomp128::operator*(const double& z)
{
cucomp128 ret;
ret.real = real*z;
ret.imag = imag*z;
return ret;
}
__host__ __device__ cucomp128 cucomp128::operator/(const double& z)
{
cucomp128 ret;
if(z!=0.0f)
{
ret.real = real/z;
ret.imag = imag/z;
}
else
{
ret.real = finf;
ret.imag = finf;
}
return ret;
}
__host__ __device__ bool cucomp128::operator==(const cucomp128& z) const
{
bool ret = 0;
if(z.real == real && z.imag == imag)
{
ret = 1;
}
return ret;
}
__host__ __device__ bool cucomp128::operator!=(const cucomp128& z) const
{
return !(*this==z);
}
//sort first by real value, then by imaginary value
//this is done so that an ordering exists, as long as two values aren't equal
__host__ __device__ bool cucomp128::operator>(const cucomp128& z) const
{
bool ret = 0;
if(this->real>z.real)
{
ret = 1;
}
else if(this->real==z.real)
{
if(this->imag>z.imag)
{
ret = 1;
}
else
{
ret = 0;
}
}
else
{
ret = 0;
}
return ret;
}
__host__ __device__ bool cucomp128::operator<(const cucomp128& z) const
{
bool ret = 0;
if(this->real<z.real)
{
ret = 1;
}
else if(this->real==z.real)
{
if(this->imag<z.imag)
{
ret = 1;
}
else
{
ret = 0;
}
}
else
{
ret = 0;
}
return ret;
}
__host__ __device__ cucomp128 operator-(const cucomp128 &z)
{
cucomp128 ret;
ret.real = -z.real;
ret.imag = -z.imag;
return ret;
}
__host__ __device__ bool cucomp128::operator>=(const cucomp128& z) const
{
bool ret = (*this==z || *this>z);
return ret;
}
__host__ __device__ bool cucomp128::operator<=(const cucomp128& z) const
{
bool ret = (*this==z || *this<z);
return ret;
}
__host__ __device__ bool cucomp128::isnan() const
{
bool ret = 0;
if(::isnan(this->real) || ::isnan(this->imag))
{
ret = 1;
}
return ret;
}
__host__ __device__ bool cucomp128::isinf() const
{
bool ret = 0;
//calls math.h isinf()
if(::isinf(this->real) || ::isinf(this->imag))
{
ret = 1;
}
return ret;
}
__host__ __device__ bool cucomp128::isreal() const
{
bool ret = 1;
if(imag!=0.0f)
{
ret = 0;
}
return ret;
}
__host__ __device__ bool cucomp128::isimag() const
{
bool ret = 1;
if(real!=0.0f)
{
ret = 0;
}
return ret;
}
__host__ __device__ bool cucomp128::iszero() const
{
bool ret = 1;
if(real!=0.0f || imag!=0.0f)
{
ret = 0;
}
return ret;
}
__host__ __device__ double cucomp128::arg() const
{
double ret = 0.0;
ret = (double) amscuda::arg((double)real,(double)imag);
return ret;
}
__host__ __device__ double cucomp128::mag() const
{
double ret = 0.0;
ret = ::sqrt(real*real+imag*imag);
return ret;
}
__host__ __device__ cucomp128 cucomp128::conj() const
{
cucomp128 ret;
ret.real = real;
ret.imag = -imag;
return ret;
}
__host__ __device__ double arg(cucomp128 z)
{
return z.arg();
}
__host__ __device__ double abs(cucomp128 z)
{
return z.mag();
}
__host__ __device__ cucomp128 dtocomp(double _r, double _i)
{
cucomp128 ret;
ret.real = _r;
ret.imag = _i;
return ret;
}
__host__ __device__ double real(cucomp128 z)
{
return z.real;
}
__host__ __device__ double imag(cucomp128 z)
{
return z.imag;
}
__host__ __device__ cucomp128 sin(cucomp128 z)
{
cucomp128 ret;
cucomp128 im1 = dtocomp(0.0f,1.0f);
cucomp128 div = dtocomp(0.0f,2.0f);
ret = (exp(im1*z)-exp(-im1*z))/div;
return ret;
}
__host__ __device__ cucomp128 cos(cucomp128 z)
{
cucomp128 ret;
cucomp128 im1 = dtocomp(0.0f,1.0f);
cucomp128 div = dtocomp(2.0f,0.0f);
ret = (exp(im1*z)+exp(-im1*z))/div;
return ret;
}
__host__ __device__ cucomp128 tan(cucomp128 z)
{
return sin(z)/cos(z);
}
__host__ __device__ cucomp128 exp(cucomp128 z)
{
cucomp128 ret;
ret.real = ::exp(z.real)*::cos(z.imag);
ret.imag = ::exp(z.real)*::sin(z.imag);
return ret;
}
__host__ __device__ cucomp128 log(cucomp128 z)
{
cucomp128 ret;
ret.real = ::log(::sqrt(z.real*z.real+z.imag*z.imag));
ret.imag = amscuda::arg(z.real,z.imag);
return ret;
}
__host__ __device__ cucomp128 conj(cucomp128 z)
{
return z.conj();
}
__host__ __device__ cucomp128 cosh(cucomp128 z)
{
cucomp128 ret;
cucomp128 div = dtocomp(2.0f,0.0f);
ret = (exp(z)+exp(-z))/div;
return ret;
}
__host__ __device__ cucomp128 sinh(cucomp128 z)
{
cucomp128 ret;
cucomp128 div = dtocomp(2.0f,0.0f);
ret = (exp(z)-exp(-z))/div;
return ret;
}
__host__ __device__ cucomp128 tanh(cucomp128 z)
{
return sinh(z)/cosh(z);
}
__host__ __device__ cucomp128 pow(cucomp128 z1, cucomp128 z2)
{
cucomp128 ret;
if(z1.mag()>0.0)
ret = exp(z2*log(z1));
else
ret = dtocomp(0.0f,0.0f);
return ret;
}
void test_cucomp128_1()
{
cucomp128 z1;
cucomp128 a,b,c;
double d1;
double f1;
printf("sizeof double=%ld\n",(long)(8*sizeof(f1)));
printf("sizeof double=%ld\n",(long)(8*sizeof(d1)));
printf("sizeof complex=%ld\n",(long)(8*sizeof(z1)));
printf("sizeof cucomp128=%ld\n",(long)(8*sizeof(a)));
a = dtocomp(1.0,1.0);
b = dtocomp(1.0,-1.0);
printf("a=%1.4f + %1.4fi\n",a[0],a[1]);
printf("b=%1.4f + %1.4fi\n",b[0],b[1]);
c = a+b;
printf("c=a+b: c=%1.4f + %1.4fi\n",c[0],c[1]);
c = a-b;
printf("c=a-b: c=%1.4f + %1.4fi\n",c[0],c[1]);
c = a*b;
printf("c=a*b: c=%1.4f + %1.4fi\n",c[0],c[1]);
c = a/b;
printf("c=a/b: c=%1.4f + %1.4fi\n",c[0],c[1]);
f1 = abs(a);
printf("abs(a)=%1.4f\n",f1);
f1 = arg(a);
printf("abs(a)=%1.4f pi\n",f1/pi);
}
}; //end namespace cmp
}; //end namespace amscuda

@ -0,0 +1,476 @@
#include <amsculib2/amsculib2.hpp>
namespace amscuda
{
namespace cmp
{
__host__ __device__ cucomp64::cucomp64()
{
real = 0.0;
imag = 0.0;
return;
}
__host__ __device__ cucomp64::~cucomp64()
{
real = 0.0;
imag = 0.0;
return;
}
__host__ __device__ cucomp64::cucomp64(const cucomp64 &other)
{
real = other.real;
imag = other.imag;
return;
}
__host__ __device__ cucomp64::cucomp64(const float &other)
{
real = other;
imag = 0.0;
return;
}
__host__ __device__ cucomp64& cucomp64::operator=(cucomp64& other)
{
real = other.real;
imag = other.imag;
return *this;
}
__host__ __device__ const cucomp64& cucomp64::operator=(const cucomp64& other)
{
this->real = other.real;
this->imag = other.imag;
return *this;
}
__host__ __device__ cucomp64& cucomp64::operator=(float& other)
{
real = other;
imag = 0.0;
return *this;
}
__host__ __device__ const cucomp64& cucomp64::operator=(const float& other)
{
this->real = other;
this->imag = 0.0;
return *this;
}
__host__ __device__ float& cucomp64::operator[](int& ind)
{
if(ind==0)
{
return this->real;
}
else
{
return this->imag;
}
}
__host__ __device__ const float& cucomp64::operator[](const int& ind) const
{
if(ind==0)
{
return this->real;
}
else
{
return this->imag;
}
}
__host__ __device__ cucomp64 cucomp64::operator+(const cucomp64& z)
{
cucomp64 ret;
ret.real = real + z.real;
ret.imag = imag + z.imag;
return ret;
}
__host__ __device__ cucomp64 cucomp64::operator-(const cucomp64& z)
{
cucomp64 ret;
ret.real = real - z.real;
ret.imag = imag - z.imag;
return ret;
}
__host__ __device__ cucomp64 cucomp64::operator*(const cucomp64& z)
{
cucomp64 ret;
ret.real = (real*z.real - imag*z.imag);
ret.imag = (imag*z.real + real*z.imag);
return ret;
}
__host__ __device__ cucomp64 cucomp64::operator/(const cucomp64& z)
{
cucomp64 ret;
float zm2 = z.real*z.real+z.imag*z.imag;
if(zm2>0.0)
{
ret.real = (this->real*z.real+this->imag*z.imag)/zm2;
ret.imag = (this->imag*z.real-this->real*z.imag)/zm2;
}
else
{
ret.real = (float) finf;
ret.imag = (float) finf;
}
return ret;
}
__host__ __device__ cucomp64 cucomp64::operator+(const float& z)
{
cucomp64 ret;
ret.real = this->real + z;
ret.imag = this->imag;
return ret;
}
__host__ __device__ cucomp64 cucomp64::operator-(const float& z)
{
cucomp64 ret;
ret.real = real-z;
ret.imag = imag;
return ret;
}
__host__ __device__ cucomp64 cucomp64::operator*(const float& z)
{
cucomp64 ret;
ret.real = real*z;
ret.imag = imag*z;
return ret;
}
__host__ __device__ cucomp64 cucomp64::operator/(const float& z)
{
cucomp64 ret;
if(z!=0.0f)
{
ret.real = real/z;
ret.imag = imag/z;
}
else
{
ret.real = finf;
ret.imag = finf;
}
return ret;
}
__host__ __device__ bool cucomp64::operator==(const cucomp64& z) const
{
bool ret = 0;
if(z.real == real && z.imag == imag)
{
ret = 1;
}
return ret;
}
__host__ __device__ bool cucomp64::operator!=(const cucomp64& z) const
{
return !(*this==z);
}
//sort first by real value, then by imaginary value
//this is done so that an ordering exists, as long as two values aren't equal
__host__ __device__ bool cucomp64::operator>(const cucomp64& z) const
{
bool ret = 0;
if(this->real>z.real)
{
ret = 1;
}
else if(this->real==z.real)
{
if(this->imag>z.imag)
{
ret = 1;
}
else
{
ret = 0;
}
}
else
{
ret = 0;
}
return ret;
}
__host__ __device__ bool cucomp64::operator<(const cucomp64& z) const
{
bool ret = 0;
if(this->real<z.real)
{
ret = 1;
}
else if(this->real==z.real)
{
if(this->imag<z.imag)
{
ret = 1;
}
else
{
ret = 0;
}
}
else
{
ret = 0;
}
return ret;
}
__host__ __device__ cucomp64 operator-(const cucomp64 &z)
{
cucomp64 ret;
ret.real = -z.real;
ret.imag = -z.imag;
return ret;
}
__host__ __device__ bool cucomp64::operator>=(const cucomp64& z) const
{
bool ret = (*this==z || *this>z);
return ret;
}
__host__ __device__ bool cucomp64::operator<=(const cucomp64& z) const
{
bool ret = (*this==z || *this<z);
return ret;
}
__host__ __device__ bool cucomp64::isnan() const
{
bool ret = 0;
if(::isnan(this->real) || ::isnan(this->imag))
{
ret = 1;
}
return ret;
}
__host__ __device__ bool cucomp64::isinf() const
{
bool ret = 0;
//calls math.h isinf()
if(::isinf(this->real) || ::isinf(this->imag))
{
ret = 1;
}
return ret;
}
__host__ __device__ bool cucomp64::isreal() const
{
bool ret = 1;
if(imag!=0.0f)
{
ret = 0;
}
return ret;
}
__host__ __device__ bool cucomp64::isimag() const
{
bool ret = 1;
if(real!=0.0f)
{
ret = 0;
}
return ret;
}
__host__ __device__ bool cucomp64::iszero() const
{
bool ret = 1;
if(real!=0.0f || imag!=0.0f)
{
ret = 0;
}
return ret;
}
__host__ __device__ float cucomp64::arg() const
{
float ret = 0.0;
ret = (float) amscuda::arg((double)real,(double)imag);
return ret;
}
__host__ __device__ float cucomp64::mag() const
{
float ret = 0.0;
ret = ::sqrt(real*real+imag*imag);
return ret;
}
__host__ __device__ cucomp64 cucomp64::conj() const
{
cucomp64 ret;
ret.real = real;
ret.imag = -imag;
return ret;
}
__host__ __device__ float arg(cucomp64 z)
{
return z.arg();
}
__host__ __device__ float abs(cucomp64 z)
{
return z.mag();
}
__host__ __device__ cucomp64 dtocomp64(float _r, float _i)
{
cucomp64 ret;
ret.real = _r;
ret.imag = _i;
return ret;
}
__host__ __device__ float real(cucomp64 z)
{
return z.real;
}
__host__ __device__ float imag(cucomp64 z)
{
return z.imag;
}
__host__ __device__ cucomp64 sin(cucomp64 z)
{
cucomp64 ret;
cucomp64 im1 = dtocomp64(0.0f,1.0f);
cucomp64 div = dtocomp64(0.0f,2.0f);
ret = (exp(im1*z)-exp(-im1*z))/div;
return ret;
}
__host__ __device__ cucomp64 cos(cucomp64 z)
{
cucomp64 ret;
cucomp64 im1 = dtocomp64(0.0f,1.0f);
cucomp64 div = dtocomp64(2.0f,0.0f);
ret = (exp(im1*z)+exp(-im1*z))/div;
return ret;
}
__host__ __device__ cucomp64 tan(cucomp64 z)
{
return sin(z)/cos(z);
}
__host__ __device__ cucomp64 exp(cucomp64 z)
{
cucomp64 ret;
ret.real = ::exp(z.real)*::cos(z.imag);
ret.imag = ::exp(z.real)*::sin(z.imag);
return ret;
}
__host__ __device__ cucomp64 log(cucomp64 z)
{
cucomp64 ret;
ret.real = ::log(::sqrt(z.real*z.real+z.imag*z.imag));
ret.imag = amscuda::arg(z.real,z.imag);
return ret;
}
__host__ __device__ cucomp64 conj(cucomp64 z)
{
return z.conj();
}
__host__ __device__ cucomp64 cosh(cucomp64 z)
{
cucomp64 ret;
cucomp64 div = dtocomp64(2.0f,0.0f);
ret = (exp(z)+exp(-z))/div;
return ret;
}
__host__ __device__ cucomp64 sinh(cucomp64 z)
{
cucomp64 ret;
cucomp64 div = dtocomp64(2.0f,0.0f);
ret = (exp(z)-exp(-z))/div;
return ret;
}
__host__ __device__ cucomp64 tanh(cucomp64 z)
{
return sinh(z)/cosh(z);
}
__host__ __device__ cucomp64 pow(cucomp64 z1, cucomp64 z2)
{
cucomp64 ret;
if(z1.mag()>0.0)
ret = exp(z2*log(z1));
else
ret = dtocomp64(0.0f,0.0f);
return ret;
}
void test_cucomp64_1()
{
cucomp64 z1;
cucomp64 a,b,c;
double d1;
float f1;
printf("sizeof double=%ld\n",(long)(8*sizeof(f1)));
printf("sizeof double=%ld\n",(long)(8*sizeof(d1)));
printf("sizeof complex=%ld\n",(long)(8*sizeof(z1)));
printf("sizeof cucomp128=%ld\n",(long)(8*sizeof(a)));
a = dtocomp64(1.0,1.0);
b = dtocomp64(1.0,-1.0);
printf("a=%1.4f + %1.4fi\n",a[0],a[1]);
printf("b=%1.4f + %1.4fi\n",b[0],b[1]);
c = a+b;
printf("c=a+b: c=%1.4f + %1.4fi\n",c[0],c[1]);
c = a-b;
printf("c=a-b: c=%1.4f + %1.4fi\n",c[0],c[1]);
c = a*b;
printf("c=a*b: c=%1.4f + %1.4fi\n",c[0],c[1]);
c = a/b;
printf("c=a/b: c=%1.4f + %1.4fi\n",c[0],c[1]);
f1 = abs(a);
printf("abs(a)=%1.4f\n",f1);
f1 = arg(a);
printf("abs(a)=%1.4f pi\n",f1/pi);
}
}; //end namespace cmp
}; //end namespace amscuda

@ -0,0 +1,21 @@
#include <amsculib2/amsculib2.hpp>
namespace amscuda
{
int cuda_errortrap(const char *msgheader)
{
int ret = 0;
cudaError_t err = cudaSuccess;
err = cudaGetLastError();
if(err!=cudaSuccess)
{
printf("%s :%s\n",msgheader,cudaGetErrorString(err));
ret = 1;
}
return ret;
}
}; //end namespace amscuda

@ -0,0 +1,222 @@
#include <amsculib2/amsculib2.hpp>
namespace amscuda
{
__device__ __host__ float fhash1d_su(float x)
{
float ret;
ret = x*(x>0.0f) + -x*(x<0.0f); //sign without conditionals?
ret = fmodf(ret,10000.0f); //restrain domain
ret = fmodf(ret*(ret+3678.453f)+7890.453f,10000.0f);
ret = fmodf(ret*(ret+8927.2134f),10000.0f);
ret = fmodf(ret*(ret+3656.234f),10000.0f);
//ret = fmodf(ret*(ret+892.2134f),1000.0f);
//ret = fmodf(ret,1000.0f);
ret = ret/10000.0f;
return ret;
}
__device__ __host__ float fhash3d_su(float x, float y=0.0f, float z=0.0f)
{
float ret = 0.0f;
ret = fhash1d_su(z);
ret = fhash1d_su(1000.0f*ret*ret + y);
ret = fhash1d_su(1000.0f*ret*ret + x);
return ret;
}
__device__ __host__ float fhash4d_su(float x, float y=0.0f, float z=0.0f, float w=0.0f)
{
float ret = 0.0f;
ret = fhash1d_su(w);
ret = fhash1d_su(1000.0f*ret*ret + z);
ret = fhash1d_su(1000.0f*ret*ret + y);
ret = fhash1d_su(1000.0f*ret*ret + x);
return ret;
}
//////////////////////////////////////////////////
// Deterministic Pseudorandom int32_t Generator //
//////////////////////////////////////////////////
//Simple 32 bit integer deterministic pseudo-random generator
// *not* for cryptography
// Frequency of generated floats should be uniform [0,1)
AMSCU_CONST static const int32_t dpr32_mod = 1<<30-1;
AMSCU_CONST static const int32_t dpr32_mult = 25137;
//Next seed in simple 32 bit integer deterministic psuedo-rand generator
__host__ __device__ void dpr32_nextseed(int32_t *rseed_inout)
{
int32_t lseed;
if(rseed_inout!=NULL) lseed = *rseed_inout;
lseed = (lseed*dpr32_mult + 1)%dpr32_mod;
lseed = (lseed>=0)*(lseed)+(lseed<0)*(lseed+dpr32_mod); //ensure mod is positive
if(rseed_inout!=NULL) *rseed_inout = lseed;
return;
}
//Simple 32 bit integer deterministic pseudo-random generator
// *not* for cryptography
// Frequency of generated floats should be uniform [0,1)
__host__ __device__ float dpr32_randf(int32_t *rseed_inout)
{
int32_t lseed = 1;
float ret = 0.0f;
if(rseed_inout!=NULL) lseed = *rseed_inout;
dpr32_nextseed(&lseed);
ret = ((float)(lseed))/((float)dpr32_mod);
if(rseed_inout!=NULL) *rseed_inout = lseed;
return ret;
}
//box muller standard normal variable
__host__ __device__ float dpr32_randnf(int32_t *rseed_inout)
{
int32_t lseed = 1;
float ret = 0.0f;
float u1,u2;
if(rseed_inout!=NULL) lseed = *rseed_inout;
u1 = dpr32_randf(&lseed);
u2 = dpr32_randf(&lseed);
ret = ::sqrtf(-2.0f*::logf(u1))*::cosf(2.0f*pif*u2);
if(rseed_inout!=NULL) *rseed_inout = lseed;
return ret;
}
//////////////////////////////////////////////////
// Deterministic Pseudorandom int64_t Generator //
//////////////////////////////////////////////////
//"goodenough" deterministic pseudo-random number generator
//random enough for procedural applications, deterministic,
//operates without side-effects for thread safety
AMSCU_CONST const int64_t random_dpr64_mod = (2LL<<31LL)-1LL;
AMSCU_CONST const int64_t random_dpr64_mult = 1201633LL;
__host__ __device__ void dpr64_nextseed(int64_t *seedinout)
{
int64_t lseed = 0LL;
if(seedinout!=NULL) lseed = *seedinout;
lseed = (random_dpr64_mult*lseed+1LL)%random_dpr64_mod;
lseed = (lseed>=0)*(lseed)+(lseed<0)*(lseed+random_dpr64_mod);
if(seedinout!=NULL) *seedinout = lseed;
return;
}
__host__ __device__ double dpr64_randd(int64_t *seedinout)
{
double ret = 0.0;
int64_t lseed = 0LL;
if(seedinout!=NULL) lseed = *seedinout;
dpr64_nextseed(&lseed);
ret = ((double)lseed)/((double)(random_dpr64_mod-1LL));
if(seedinout!=NULL) *seedinout = lseed;
return ret;
}
__host__ __device__ float dpr64_randf(int64_t *seedinout)
{
float ret = 0.0f;
int64_t lseed = 0LL;
if(seedinout!=NULL) lseed = *seedinout;
dpr64_nextseed(&lseed);
ret = ((float)lseed)/((float)(random_dpr64_mod-1LL));
if(seedinout!=NULL) *seedinout = lseed;
return ret;
}
///////////
// Tests //
///////////
void test_dprg64()
{
printf("Tests for dprg:\n");
long I;
int64_t seed = 133LL;
double d;
float f;
cuvect3 qv;
printf("dpr64_randd test\n");
seed = 133LL;
for(I=0;I<10;I++)
{
d = dpr64_randd(&seed);
printf("seed: %lld rand: %1.4f\n",(long long)seed,d);
}
printf("\n\n");
printf("dpr64_randf test\n");
seed = 133LL;
for(I=0;I<10;I++)
{
f = dpr64_randf(&seed);
printf("seed: %lld rand: %1.4f\n",(long long)seed,f);
}
return;
}
void test_dprg32()
{
printf("Tests for dprg:\n");
long I;
int32_t seed = 133;
double d;
float f;
cuvect3 qv;
printf("dpr32_randf test\n");
seed = 133;
for(I=0;I<10;I++)
{
f = dpr32_randf(&seed);
printf("seed: %lld rand: %1.4f\n",(long long)seed,f);
}
printf("\n\ndpr32_randnf test\n");
seed = 133;
for(I=0;I<10;I++)
{
f = dpr32_randnf(&seed);
printf("seed: %lld rand: %1.4f\n",(long long)seed,f);
}
return;
}
}; //namespace amscuda

@ -0,0 +1,63 @@
#include <amsculib2/amsculib2.hpp>
namespace amscuda
{
__global__ void test_cuarray_sum_kf(cuarray<float> *dq1, float *sum)
{
int I;
*sum = 0.0f;
for(I=0;I<dq1->length;I++)
{
*sum = *sum + dq1->data[I];
}
//*sum = (float)dq1->length;
return;
}
float test_cuarray_sum(cuarray<float> *q1)
{
float ret = 0.0f;
int res;
cuarray<float> *dq1 = NULL;
float *dsum;
cudaError_t err = cudaSuccess;
cudaMalloc(&dsum,sizeof(float));
res = q1->device_send(&dq1);
printf("error: res=%d\n",res);
test_cuarray_sum_kf<<<1,1>>>(dq1,dsum);
cudaDeviceSynchronize();
err = cudaGetLastError();
if(err!=cudaSuccess)
{
printf("test_cuarray_sum Error: %s\n",cudaGetErrorString(err));
}
cudaMemcpy(&ret,dsum,sizeof(float),cudaMemcpyDeviceToHost);
q1->device_free(&dq1);
cudaFree(dsum); dsum = NULL;
return ret;
}
void test_cuarray()
{
cuarray<float> q1;
int I;
q1.resize(100);
for(I=0;I<q1.length;I++)
{
q1.data[I] = I;
}
printf("q1.length=%d\n",q1.length);
printf("sum of array: %1.6g\n",test_cuarray_sum(&q1));
}
};

@ -0,0 +1,213 @@
#include <amsculib2/amsculib2.hpp>
namespace amscuda
{
//template instantiations
template float dbuff_sum(float *devbuffer, int N);
template void dbuff_minmax(float *devbuffer, int N, float *min, float *max);
template void dbuff_setall(float *devbuffer, int N, float setto, int nblocks, int nthreads);
//fill devbuffer with random uniform numbers between 0 and 1 using int32_t based generator
__global__ void dbuff_rand_dpr32_kf(float *devbuffer, int N, int32_t *seeds)
{
int I0 = threadIdx.x + blockIdx.x*blockDim.x;
int Is = blockDim.x*gridDim.x;
int I;
int32_t lseed;
float f;
lseed = seeds[I0];
for(I=I0;I<N;I=I+Is)
{
f = dpr32_randf(&lseed);
devbuffer[I] = f;
}
return;
}
void dbuff_rand_dpr32(float *devbuffer, int N, int32_t *rseedinout, int nblocks, int nthreads)
{
cudaError_t err = cudaSuccess;
int I;
int32_t *seeds = NULL;
int32_t *devseeds = NULL;
int32_t lseed;
if(devbuffer==NULL || N<=0)
{
return;
}
seeds = new(std::nothrow) int32_t[nblocks*nthreads];
cudaMalloc(&devseeds,sizeof(int32_t)*nblocks*nthreads);
if(rseedinout!=NULL) lseed = *rseedinout; else lseed = 1;
for(I=0;I<nblocks*nthreads;I++)
{
lseed = lseed + I + 1;
dpr32_nextseed(&lseed);
seeds[I] = lseed;
}
cudaMemcpy(devseeds,seeds,sizeof(int32_t)*nblocks*nthreads,cudaMemcpyHostToDevice);
if(rseedinout!=NULL) *rseedinout = lseed;
dbuff_rand_dpr32_kf<<<nblocks,nthreads>>>(devbuffer,N,devseeds);
cudaDeviceSynchronize();
err = cudaGetLastError();
if(err!=cudaSuccess)
{
printf("amscu::dbuff_rand_dpr32 error: %s\n",cudaGetErrorString(err));
}
cudaFree(devseeds); devseeds = NULL;
delete[] seeds; seeds = NULL;
return;
}
__global__ void dbuff_rand_dpr32n_kf(float *devbuffer, int N, int32_t *seeds)
{
int I0 = threadIdx.x + blockIdx.x*blockDim.x;
int Is = blockDim.x*gridDim.x;
int I;
int32_t lseed;
float f;
lseed = seeds[I0];
for(I=I0;I<N;I=I+Is)
{
f = dpr32_randnf(&lseed);
devbuffer[I] = f;
}
return;
}
void dbuff_rand_dpr32n(float *devbuffer, int N, int32_t *rseedinout, int nblocks, int nthreads)
{
cudaError_t err = cudaSuccess;
int I;
int32_t *seeds = NULL;
int32_t *devseeds = NULL;
int32_t lseed;
if(devbuffer==NULL || N<=0)
{
return;
}
seeds = new(std::nothrow) int32_t[nblocks*nthreads];
cudaMalloc(&devseeds,sizeof(int32_t)*nblocks*nthreads);
if(rseedinout!=NULL) lseed = *rseedinout; else lseed = 1;
for(I=0;I<nblocks*nthreads;I++)
{
lseed = lseed + I + 1;
dpr32_nextseed(&lseed);
seeds[I] = lseed;
}
cudaMemcpy(devseeds,seeds,sizeof(int32_t)*nblocks*nthreads,cudaMemcpyHostToDevice);
if(rseedinout!=NULL) *rseedinout = lseed;
dbuff_rand_dpr32n_kf<<<nblocks,nthreads>>>(devbuffer,N,devseeds);
cudaDeviceSynchronize();
err = cudaGetLastError();
if(err!=cudaSuccess)
{
printf("amscu::dbuff_rand_dpr32 error: %s\n",cudaGetErrorString(err));
}
cudaFree(devseeds); devseeds = NULL;
delete[] seeds; seeds = NULL;
return;
}
void dbuff_rand_dpr64(float *devbuffer, int N, int64_t *rseedinout, int nblocks, int nthreads)
{
int I;
return;
}
///////////
// Tests //
///////////
void test_dbuff_rand_dpr32()
{
cuarray<float> data;
float *dev_data = NULL;
int Nx = 5000;
int Ny = 5000;
cuarray<int> dims;
int32_t rseed = 15;
FILE *fp = NULL;
const char *fname = "./test_scripts/test_dbuff_rand_dpr32.bin";
clock_t t0,t1,t2;
double dt0,dt1;
printf("Tests of dbuff_rand_dpr32...\n");
fp = fopen(fname,"w+");
if(fp==NULL)
{
printf("Error: Could not open %s for writing.\n",fname);
return;
}
data.resize(Nx*Ny);
dims.resize(2);
dims[0] = Nx; dims[1] = Ny;
cudaMalloc(&dev_data,Nx*Ny*sizeof(float));
t0 = clock();
dbuff_rand_dpr32(dev_data,Nx*Ny,&rseed,256,512);
t1 = clock();
cudaMemcpy(data.data,dev_data,Nx*Ny*sizeof(float),cudaMemcpyDeviceToHost);
t2 = clock();
dt0 = (double)(t1-t0)/(double)CLOCKS_PER_SEC*1000.0;
dt1 = (double)(t2-t1)/(double)CLOCKS_PER_SEC*1000.0;
printf("dbuff_rand_dpr32 exec time: %1.3f msec\n",dt0);
printf("copy (%d,%d) to host time: %1.3f msec\n",Nx,Ny,dt1);
fwrite_ndarray(fp,&dims,&data);
t0 = clock();
dbuff_rand_dpr32n(dev_data,Nx*Ny,&rseed,256,512);
t1 = clock();
cudaMemcpy(data.data,dev_data,Nx*Ny*sizeof(float),cudaMemcpyDeviceToHost);
t2 = clock();
dt0 = (double)(t1-t0)/(double)CLOCKS_PER_SEC*1000.0;
dt1 = (double)(t2-t1)/(double)CLOCKS_PER_SEC*1000.0;
printf("dbuff_rand_dpr32n exec time: %1.3f msec\n",dt0);
printf("copy (%d,%d) to host time: %1.3f msec\n",Nx,Ny,dt1);
fwrite_ndarray(fp,&dims,&data);
fclose(fp);
cudaFree(dev_data); dev_data = NULL;
}
};

@ -0,0 +1,6 @@
#include <amsculib2/amsculib2.hpp>
namespace amscuda
{
};

@ -0,0 +1,6 @@
#include <amsculib2/amsculib2.hpp>
namespace amscuda
{
};

@ -0,0 +1,269 @@
#include <amsculib2/amsculib2.hpp>
namespace amscuda
{
__host__ __device__ double dabs(double x)
{
if(x<0.0)
{
x = -x;
}
return x;
}
__host__ __device__ float fabs(float x)
{
if(x<0.0f)
{
x = -x;
}
return x;
}
__host__ __device__ double mod(double x, double md)
{
x = fmod(x,md);
if(x<0.0)
{
x = x + md;
}
return x;
}
__host__ __device__ float mod(float x, float md)
{
x = fmodf(x,md);
if(x<0.0f)
{
x = x + md;
}
return x;
}
__host__ __device__ int mod(int x, int n)
{
x = x % n;
if(x<0)
{
x = x + n;
}
return x;
}
__host__ __device__ long mod(long x, long n)
{
x = x % n;
if(x<0)
{
x = x + n;
}
return x;
}
__host__ __device__ int truediv(int x, int y)
{
int z = 0;
if(x>=0 && y>0)
{
z = x/y;
}
else if(x<0 && y>0)
{
z = -((-x)/y) - 1;
}
else if(x>=0 && y<0)
{
z = -(x/(-y)) - 1;
}
else if(x<0 && y<0)
{
z = ((-x)/(-y));
}
return z;
}
__host__ __device__ long truediv(long x, long y)
{
int z = 0;
if(x>=0 && y>0)
{
z = x/y;
}
else if(x<0 && y>0)
{
z = -((-x)/y) - 1;
}
else if(x>=0 && y<0)
{
z = -(x/(-y)) - 1;
}
else if(x<0 && y<0)
{
z = ((-x)/(-y));
}
return z;
}
template<> __host__ __device__ double min(double a, double b)
{
if(isnan(a))
{
return b;
}
else if(isnan(b))
{
return a;
}
else if(a>b)
{
return b;
}
else
{
return a;
}
}
template<> __host__ __device__ float min(float a, float b)
{
if(isnan(a))
{
return b;
}
else if(isnan(b))
{
return a;
}
else if(a>b)
{
return b;
}
else
{
return a;
}
}
template<> __host__ __device__ double max(double a, double b)
{
if(isnan(a))
{
return b;
}
else if(isnan(b))
{
return a;
}
else if(a>b)
{
return a;
}
else
{
return b;
}
}
template<> __host__ __device__ float max(float a, float b)
{
if(isnan(a))
{
return b;
}
else if(isnan(b))
{
return a;
}
else if(a>b)
{
return a;
}
else
{
return b;
}
}
__device__ __host__ double arg(double x, double y)
{
double ret = 0.0;
double z = ::sqrt(x*x+y*y);
if(z>0.0)
{
if(y<=x && y>=-x)
{
ret = asin(y/z);
}
else if(y>=x && y>=-x)
{
ret = acos(x/z);
}
else if(y>=x && y<=-x)
{
ret = pi-asin(y/z);
}
else
{
ret = 2.0*pi-acos(x/z);
}
}
if(ret<0.0)
{
ret = 2.0*pi+ret;
}
return ret;
}
__device__ __host__ void get_azel(double x, double y, double z, double *az, double *el)
{
//int ret = -2; //should never see this return
double n, rp;
n = ::sqrt(x*x+y*y+z*z);
if(n>0.0)
{
rp = ::sqrt(x*x+y*y);
if(rp>0.0)
{
//ret = 1; //nonzero vector - should work
*az = arg(x,y);
*el = ::atan(z/rp);
}
else
{
//ret = 0; //straight up or straight down
if(z>0.0)
{
*az = 0.0;
*el = pi/2.0;
}
else
{
*az = 0.0;
*el = -pi/2.0;
}
}
}
else
{
*az = 0.0;
*el = 0.0;
//ret = -1; //zero vector - no real azimuth/elevation
}
return;
}
void test_amscumath1()
{
printf("pi = %1.16f\n",amscuda::pi);
}
};

@ -0,0 +1,85 @@
#include <amsculib2/amsculib2.hpp>
namespace amscuda
{
__global__ void test_amscurarray1_kf1(curarray<int> *q)
{
int I,J,Na;
int I0 = threadIdx.x + blockIdx.x*blockDim.x;
int Is = blockDim.x*gridDim.x;
int N = q->Narrays;
for(I=I0; I<N;I = I + Is)
{
Na = q->N[I];
//printf("I:%d Na: %d\n",I,Na);
//q->dev_resizearray(I, Na);
for(J=0;J<Na;J++)
{
q->devarrayptrs[I][J] = J;
}
}
}
void test_amscurarray1()
{
int I;
cudaError_t err = cudaSuccess;
printf("test_amscurarray1:\n");
curarray<int> *qarray = NULL;
curarray_new(&qarray,100);
err = cudaGetLastError();
if(err!=cudaSuccess)
{
printf("debug error trap 1: %s\n",cudaGetErrorString(err));
}
for(I=0;I<100;I++)
{
qarray->resizearray(I,5);
}
qarray->push();
qarray->pull();
cuda_errortrap("debug: error trap 2");
for(I=0;I<5;I++)
{
printf("array[%d], size %d\n",I,qarray->N[I]);
}
//
for(I=0;I<100;I++)
{
qarray->resizearray(I,I%5);
cuda_errortrap("debug: error trap resize2");
}
qarray->push();
qarray->pull();
test_amscurarray1_kf1<<<128,1>>>(qarray->devptr);
cuda_errortrap("debug: error trap kf1");
qarray->pull();
cuda_errortrap("debug: error trap pull2");
for(I=0;I<5;I++)
{
printf("array[%d], size %d\n",I,qarray->N[I]);
}
curarray_delete(&qarray);
cuda_errortrap("debug: error trap 3");
return;
}
};

@ -0,0 +1,361 @@
#include <amsculib2/amsculib2.hpp>
namespace amscuda
{
__host__ __device__ cuvect2::cuvect2()
{
x = 0.0; y = 0.0;
return;
}
__host__ __device__ cuvect2::~cuvect2()
{
x = 0.0; y = 0.0;
return;
}
__host__ __device__ cuvect2::cuvect2(double _x, double _y)
{
x = _x; y = _y;
return;
}
__host__ __device__ double& cuvect2::operator[](const int I)
{
if(I==0) return x;
if(I==1) return y;
return x;
}
__host__ __device__ const double& cuvect2::operator[](const int I) const
{
if(I==0) return x;
if(I==1) return y;
return x;
}
__host__ __device__ cuvect2 cuvect2::operator+(cuvect2 lhs)
{
cuvect2 ret;
ret.x = x + lhs.x;
ret.y = y + lhs.y;
return ret;
}
__host__ __device__ cuvect2 cuvect2::operator-(cuvect2 lhs)
{
cuvect2 ret;
ret.x = x - lhs.x;
ret.y = y - lhs.y;
return ret;
}
__host__ __device__ cuvect2 cuvect2::operator*(double lhs)
{
cuvect2 ret;
ret.x = x*lhs;
ret.y = y*lhs;
return ret;
}
__host__ __device__ cuvect2 cuvect2::operator/(double lhs)
{
cuvect2 ret;
ret.x = x/lhs;
ret.y = y/lhs;
return ret;
}
__host__ __device__ double cuvect2_dot(cuvect2 a, cuvect2 b)
{
double ret = a.x*b.x+a.y*b.y;
return ret;
}
__host__ __device__ double cuvect2_cross(cuvect2 a, cuvect2 b)
{
double ret = a.x*b.y-a.y*b.x;
return ret;
}
__host__ __device__ double cuvect2_norm(cuvect2 a)
{
double ret = ::sqrt(a.x*a.x+a.y*a.y);
return ret;
}
__host__ __device__ cuvect2 cuvect2_normalize(cuvect2 a)
{
cuvect2 ret;
double m = cuvect2_norm(a);
if(m>0.0)
{
ret.x = a.x/m; ret.y = a.y/m;
}
else
{
ret.x = 0.0; ret.y = 0.0;
}
return ret;
}
__host__ __device__ cuvect2 cuvect2_proj(cuvect2 a, cuvect2 b)
{
cuvect2 ret;
cuvect2 bn = cuvect2_normalize(b);
double m = cuvect2_dot(a,bn);
ret = bn*m;
return ret;
}
__host__ __device__ cumat2::cumat2()
{
int I;
for(I=0;I<4;I++)
{
dat[I] = 0.0;
}
return;
}
__host__ __device__ cumat2::~cumat2()
{
int I;
for(I=0;I<4;I++)
{
dat[I] = 0.0;
}
return;
}
__host__ __device__ double& cumat2::operator[](const int I)
{
return dat[I];
}
__host__ __device__ double& cumat2::operator()(const int I, const int J)
{
return dat[I+2*J];
}
__host__ __device__ cumat2 cumat2::operator+(cumat2 lhs)
{
int I;
cumat2 ret;
for(I=0;I<4;I++)
{
ret.dat[I] = this->dat[I] + lhs.dat[I];
}
return ret;
}
__host__ __device__ cumat2 cumat2::operator-(cumat2 lhs)
{
int I;
cumat2 ret;
for(I=0;I<4;I++)
{
ret.dat[I] = this->dat[I] - lhs.dat[I];
}
return ret;
}
__host__ __device__ cumat2 cumat2::operator*(double lhs)
{
cumat2 ret;
int I;
for(I=0;I<4;I++)
{
ret.dat[I] = this->dat[I]*lhs;
}
return ret;
}
__host__ __device__ cumat2 cumat2::operator/(double lhs)
{
cumat2 ret;
int I;
for(I=0;I<4;I++)
{
ret.dat[I] = this->dat[I]/lhs;
}
return ret;
}
__host__ __device__ double& cumat2::at(const int I, const int J)
{
return dat[I+2*J];
}
__host__ __device__ cuvect2 cumat2::operator*(cuvect2 lhs)
{
cuvect2 ret = cuvect2(0.0,0.0);
int I,J;
for(I=0;I<2;I++)
{
for(J=0;J<2;J++)
{
ret[I] = ret[I] + this->at(I,J)*lhs[J];
}
}
return ret;
}
__host__ __device__ cumat2 cumat2::operator*(cumat2 lhs)
{
cumat2 ret;
int I,J,K;
for(I=0;I<2;I++)
{
for(J=0;J<2;J++)
{
ret(I,J) = 0.0;
for(K=0;K<2;K++)
{
ret(I,J) = ret(I,J) + this->at(I,K)*lhs(K,J);
}
}
}
return ret;
}
__host__ __device__ double cumat2::det()
{
double ret = 0.0;
ret = ret + this->at(0,0)*this->at(1,1);
ret = ret - this->at(1,0)*this->at(0,1);
return ret;
}
__host__ __device__ cumat2 cumat2::transpose()
{
cumat2 q;
int I,J;
for(I=0;I<2;I++)
{
for(J=0;J<2;J++)
{
q.at(I,J) = this->at(J,I);
}
}
return q;
}
__host__ __device__ cumat2 cumat2::inverse()
{
cumat2 q;
double dt = q.det();
if(dt!=0.0)
{
q(0,0) = this->at(1,1)/dt;
q(0,1) = -this->at(0,1)/dt;
q(1,0) = -this->at(1,0)/dt;
q(1,1) = this->at(0,0)/dt;
}
else
{
q(0,0) = inf;
q(0,1) = inf;
q(1,0) = inf;
q(1,1) = inf;
}
return q;
}
//2x2 matrix operations
//matrix order is assumed to be mat[I,J] = mat[I+3*J]
//transpose a 2x2 matrix in place
__host__ __device__ void mat2_transpose(double *mat2inout)
{
double mat2_in[4];
mat2_copy(mat2_in,mat2inout);
mat2inout[0+0*2] = mat2_in[0+0*2];
mat2inout[1+0*2] = mat2_in[0+1*2];
mat2inout[0+1*2] = mat2_in[1+0*2];
mat2inout[1+1*2] = mat2_in[1+1*2];
return;
}
//copies src to dest
__host__ __device__ void mat2_copy(double *mat2_dest, const double *mat2_src)
{
mat2_dest[0+0*2] = mat2_src[0+0*2];
mat2_dest[1+0*2] = mat2_src[1+0*2];
mat2_dest[0+1*2] = mat2_src[0+1*2];
mat2_dest[1+1*2] = mat2_src[1+1*2];
return;
}
//inverts mat?inout[4]
__host__ __device__ void mat2_inverse(double *mat2inout)
{
double det = mat2inout[0+0*2]*mat2inout[1+1*2]-mat2inout[0+1*2]*mat2inout[1+0*2];
double mat2in[4];
mat2_copy(mat2in,mat2inout);
mat2inout[0+0*2] = mat2inout[1+1*2]/det;
mat2inout[1+0*2] = -mat2inout[1+0*2]/det;
mat2inout[0+1*2] = -mat2inout[0+1*2]/det;
mat2inout[1+1*2] = mat2inout[0+0*2]/det;
return;
}
//rotatin matrix from angle
__host__ __device__ void mat2_rot_from_angle(double angle, double *mat2)
{
mat2[0+0*2] = ::cos(angle);
mat2[1+0*2] = ::sin(angle);
mat2[0+1*2] = -::sin(angle);
mat2[1+1*2] = ::cos(angle);
return;
}
//multiplies c = a*b
__host__ __device__ void mat2_mult(double *mat2a, double *mat2b, double *mat2c)
{
double mat2a_in[4];
double mat2b_in[4];
if(mat2a==NULL || mat2b==NULL || mat2c==NULL)
{
return;
}
mat2_copy(mat2a_in,mat2a);
mat2_copy(mat2b_in,mat2b);
mat2c[0+0*2] = mat2a_in[0+0*2]*mat2b_in[0+0*2] + mat2a_in[1+0*2]*mat2b_in[0+1*2];
mat2c[1+0*2] = mat2a_in[0+0*2]*mat2b_in[1+0*2] + mat2a_in[1+0*2]*mat2b_in[1+1*2];
mat2c[0+1*2] = mat2a_in[0+1*2]*mat2b_in[0+0*2] + mat2a_in[1+1*2]*mat2b_in[0+1*2];
mat2c[1+1*2] = mat2a_in[0+1*2]*mat2b_in[1+0*2] + mat2a_in[1+1*2]*mat2b_in[1+1*2];
return;
}
// ret = a*b
__host__ __device__ cuvect2 mat2_mult(double *mat2a, cuvect2 b)
{
cuvect2 ret;
ret.x = b.x*mat2a[0+0*2] + b.y*mat2a[1+0*2];
ret.y = b.x*mat2a[0+1*2] + b.y*mat2a[1+1*2];
return ret;
}
void test_cuvect2_1()
{
return;
}
};

@ -0,0 +1,361 @@
#include <amsculib2/amsculib2.hpp>
namespace amscuda
{
__host__ __device__ cuvect2f::cuvect2f()
{
x = 0.0; y = 0.0;
return;
}
__host__ __device__ cuvect2f::~cuvect2f()
{
x = 0.0; y = 0.0;
return;
}
__host__ __device__ cuvect2f::cuvect2f(float _x, float _y)
{
x = _x; y = _y;
return;
}
__host__ __device__ float& cuvect2f::operator[](const int I)
{
if(I==0) return x;
if(I==1) return y;
return x;
}
__host__ __device__ const float& cuvect2f::operator[](const int I) const
{
if(I==0) return x;
if(I==1) return y;
return x;
}
__host__ __device__ cuvect2f cuvect2f::operator+(cuvect2f lhs)
{
cuvect2f ret;
ret.x = x + lhs.x;
ret.y = y + lhs.y;
return ret;
}
__host__ __device__ cuvect2f cuvect2f::operator-(cuvect2f lhs)
{
cuvect2f ret;
ret.x = x - lhs.x;
ret.y = y - lhs.y;
return ret;
}
__host__ __device__ cuvect2f cuvect2f::operator*(float lhs)
{
cuvect2f ret;
ret.x = x*lhs;
ret.y = y*lhs;
return ret;
}
__host__ __device__ cuvect2f cuvect2f::operator/(float lhs)
{
cuvect2f ret;
ret.x = x/lhs;
ret.y = y/lhs;
return ret;
}
__host__ __device__ float cuvect2f_dot(cuvect2f a, cuvect2f b)
{
float ret = a.x*b.x+a.y*b.y;
return ret;
}
__host__ __device__ float cuvect2f_cross(cuvect2f a, cuvect2f b)
{
float ret = a.x*b.y-a.y*b.x;
return ret;
}
__host__ __device__ float cuvect2f_norm(cuvect2f a)
{
float ret = ::sqrtf(a.x*a.x+a.y*a.y);
return ret;
}
__host__ __device__ cuvect2f cuvect2f_normalize(cuvect2f a)
{
cuvect2f ret;
float m = cuvect2f_norm(a);
if(m>0.0)
{
ret.x = a.x/m; ret.y = a.y/m;
}
else
{
ret.x = 0.0; ret.y = 0.0;
}
return ret;
}
__host__ __device__ cuvect2f cuvect2f_proj(cuvect2f a, cuvect2f b)
{
cuvect2f ret;
cuvect2f bn = cuvect2f_normalize(b);
float m = cuvect2f_dot(a,bn);
ret = bn*m;
return ret;
}
__host__ __device__ cumat2f::cumat2f()
{
int I;
for(I=0;I<4;I++)
{
dat[I] = 0.0;
}
return;
}
__host__ __device__ cumat2f::~cumat2f()
{
int I;
for(I=0;I<4;I++)
{
dat[I] = 0.0;
}
return;
}
__host__ __device__ float& cumat2f::operator[](const int I)
{
return dat[I];
}
__host__ __device__ float& cumat2f::operator()(const int I, const int J)
{
return dat[I+2*J];
}
__host__ __device__ cumat2f cumat2f::operator+(cumat2f lhs)
{
int I;
cumat2f ret;
for(I=0;I<4;I++)
{
ret.dat[I] = this->dat[I] + lhs.dat[I];
}
return ret;
}
__host__ __device__ cumat2f cumat2f::operator-(cumat2f lhs)
{
int I;
cumat2f ret;
for(I=0;I<4;I++)
{
ret.dat[I] = this->dat[I] - lhs.dat[I];
}
return ret;
}
__host__ __device__ cumat2f cumat2f::operator*(float lhs)
{
cumat2f ret;
int I;
for(I=0;I<4;I++)
{
ret.dat[I] = this->dat[I]*lhs;
}
return ret;
}
__host__ __device__ cumat2f cumat2f::operator/(float lhs)
{
cumat2f ret;
int I;
for(I=0;I<4;I++)
{
ret.dat[I] = this->dat[I]/lhs;
}
return ret;
}
__host__ __device__ float& cumat2f::at(const int I, const int J)
{
return dat[I+2*J];
}
__host__ __device__ cuvect2f cumat2f::operator*(cuvect2f lhs)
{
cuvect2f ret = cuvect2f(0.0,0.0);
int I,J;
for(I=0;I<2;I++)
{
for(J=0;J<2;J++)
{
ret[I] = ret[I] + this->at(I,J)*lhs[J];
}
}
return ret;
}
__host__ __device__ cumat2f cumat2f::operator*(cumat2f lhs)
{
cumat2f ret;
int I,J,K;
for(I=0;I<2;I++)
{
for(J=0;J<2;J++)
{
ret(I,J) = 0.0;
for(K=0;K<2;K++)
{
ret(I,J) = ret(I,J) + this->at(I,K)*lhs(K,J);
}
}
}
return ret;
}
__host__ __device__ float cumat2f::det()
{
float ret = 0.0;
ret = ret + this->at(0,0)*this->at(1,1);
ret = ret - this->at(1,0)*this->at(0,1);
return ret;
}
__host__ __device__ cumat2f cumat2f::transpose()
{
cumat2f q;
int I,J;
for(I=0;I<2;I++)
{
for(J=0;J<2;J++)
{
q.at(I,J) = this->at(J,I);
}
}
return q;
}
__host__ __device__ cumat2f cumat2f::inverse()
{
cumat2f q;
float dt = q.det();
if(dt!=0.0)
{
q(0,0) = this->at(1,1)/dt;
q(0,1) = -this->at(0,1)/dt;
q(1,0) = -this->at(1,0)/dt;
q(1,1) = this->at(0,0)/dt;
}
else
{
q(0,0) = inf;
q(0,1) = inf;
q(1,0) = inf;
q(1,1) = inf;
}
return q;
}
//2x2 matrix operations
//matrix order is assumed to be mat[I,J] = mat[I+3*J]
//transpose a 2x2 matrix in place
__host__ __device__ void mat2f_transpose(float *mat2inout)
{
float mat2f_in[4];
mat2f_copy(mat2f_in,mat2inout);
mat2inout[0+0*2] = mat2f_in[0+0*2];
mat2inout[1+0*2] = mat2f_in[0+1*2];
mat2inout[0+1*2] = mat2f_in[1+0*2];
mat2inout[1+1*2] = mat2f_in[1+1*2];
return;
}
//copies src to dest
__host__ __device__ void mat2f_copy(float *mat2f_dest, const float *mat2f_src)
{
mat2f_dest[0+0*2] = mat2f_src[0+0*2];
mat2f_dest[1+0*2] = mat2f_src[1+0*2];
mat2f_dest[0+1*2] = mat2f_src[0+1*2];
mat2f_dest[1+1*2] = mat2f_src[1+1*2];
return;
}
//inverts mat?inout[4]
__host__ __device__ void mat2f_inverse(float *mat2inout)
{
float det = mat2inout[0+0*2]*mat2inout[1+1*2]-mat2inout[0+1*2]*mat2inout[1+0*2];
float mat2in[4];
mat2f_copy(mat2in,mat2inout);
mat2inout[0+0*2] = mat2inout[1+1*2]/det;
mat2inout[1+0*2] = -mat2inout[1+0*2]/det;
mat2inout[0+1*2] = -mat2inout[0+1*2]/det;
mat2inout[1+1*2] = mat2inout[0+0*2]/det;
return;
}
//rotatin matrix from angle
__host__ __device__ void mat2f_rot_from_angle(float angle, float *mat2)
{
mat2[0+0*2] = ::cosf(angle);
mat2[1+0*2] = ::sinf(angle);
mat2[0+1*2] = -::sinf(angle);
mat2[1+1*2] = ::cosf(angle);
return;
}
//multiplies c = a*b
__host__ __device__ void mat2f_mult(float *mat2a, float *mat2b, float *mat2c)
{
float mat2a_in[4];
float mat2b_in[4];
if(mat2a==NULL || mat2b==NULL || mat2c==NULL)
{
return;
}
mat2f_copy(mat2a_in,mat2a);
mat2f_copy(mat2b_in,mat2b);
mat2c[0+0*2] = mat2a_in[0+0*2]*mat2b_in[0+0*2] + mat2a_in[1+0*2]*mat2b_in[0+1*2];
mat2c[1+0*2] = mat2a_in[0+0*2]*mat2b_in[1+0*2] + mat2a_in[1+0*2]*mat2b_in[1+1*2];
mat2c[0+1*2] = mat2a_in[0+1*2]*mat2b_in[0+0*2] + mat2a_in[1+1*2]*mat2b_in[0+1*2];
mat2c[1+1*2] = mat2a_in[0+1*2]*mat2b_in[1+0*2] + mat2a_in[1+1*2]*mat2b_in[1+1*2];
return;
}
// ret = a*b
__host__ __device__ cuvect2f mat2f_mult(float *mat2a, cuvect2f b)
{
cuvect2f ret;
ret.x = b.x*mat2a[0+0*2] + b.y*mat2a[1+0*2];
ret.y = b.x*mat2a[0+1*2] + b.y*mat2a[1+1*2];
return ret;
}
void test_cuvect2f_1()
{
return;
}
};

@ -0,0 +1,581 @@
#include <amsculib2/amsculib2.hpp>
namespace amscuda
{
__host__ __device__ cuvect3::cuvect3()
{
x = 0.0; y = 0.0; z = 0.0;
return;
}
__host__ __device__ cuvect3::~cuvect3()
{
x = 0.0; y = 0.0; z = 0.0;
return;
}
__host__ __device__ double& cuvect3::operator[](const int I)
{
if(I==0) return x;
if(I==1) return y;
if(I==2) return z;
return x;
}
__host__ __device__ const double& cuvect3::operator[](const int I) const
{
if(I==0) return x;
if(I==1) return y;
if(I==2) return z;
return x;
}
__host__ __device__ cuvect3 cuvect3::operator+(cuvect3 lhs)
{
cuvect3 ret;
ret.x = x+lhs.x;
ret.y = y+lhs.y;
ret.z = z+lhs.z;
return ret;
}
__host__ __device__ cuvect3 cuvect3::operator-(cuvect3 lhs)
{
cuvect3 ret;
ret.x = x-lhs.x;
ret.y = y-lhs.y;
ret.z = z-lhs.z;
return ret;
}
__host__ __device__ cuvect3 cuvect3::operator*(double lhs)
{
cuvect3 ret;
ret.x = x*lhs;
ret.y = y*lhs;
ret.z = z*lhs;
return ret;
}
__host__ __device__ cuvect3 cuvect3::operator/(double lhs)
{
cuvect3 ret;
ret.x = x/lhs;
ret.y = y/lhs;
ret.z = z/lhs;
return ret;
}
__host__ __device__ cuvect3::cuvect3(double _x, double _y, double _z)
{
x = _x; y = _y; z = _z;
return;
}
__host__ __device__ double cuvect3_dot(cuvect3 a, cuvect3 b)
{
double ret = a.x*b.x+a.y*b.y+a.z*b.z;
return ret;
}
__host__ __device__ cuvect3 cuvect3_cross(cuvect3 a, cuvect3 b)
{
cuvect3 ret;
ret[0] = a[1]*b[2]-a[2]*b[1];
ret[1] = a[2]*b[0]-a[0]*b[2];
ret[2] = a[0]*b[1]-a[1]*b[0];
return ret;
}
__host__ __device__ double cuvect3_norm(cuvect3 a)
{
double ret;
ret = ::sqrt(a.x*a.x+a.y*a.y+a.z*a.z);
return ret;
}
__host__ __device__ cuvect3 cuvect3_normalize(cuvect3 a)
{
cuvect3 ret;
double m;
m = ::sqrt(a.x*a.x+a.y*a.y+a.z*a.z);
if(m>0.0)
{
ret.x = a.x/m; ret.y = a.y/m; ret.z = a.z/m;
}
else
{
ret.x = 0.0; ret.y = 0.0; ret.z = 0.0;
}
return ret;
}
__host__ __device__ cuvect3 cuvect3_proj(cuvect3 a, cuvect3 b)
{
cuvect3 ret;
cuvect3 bn = cuvect3_normalize(b);
double m = cuvect3_dot(a,bn);
ret = bn*m;
return ret;
}
//transposes a 3x3 (9 element) matrix
__host__ __device__ void mat3_transpose(double *mat3inout)
{
int I,J;
double matint[9];
for(I=0;I<9;I++)
{
matint[I] = mat3inout[I];
}
for(I=0;I<3;I++)
{
for(J=0;J<3;J++)
{
mat3inout[I+J*3] = matint[J+I*3];
}
}
return;
}
//copies src to dest
__host__ __device__ void mat3_copy(double *mat3_dest, const double *mat3_src)
{
int I;
if(mat3_dest==NULL || mat3_src==NULL)
return;
for(I=0;I<9;I++)
mat3_dest[I] = mat3_src[I];
return;
}
__host__ __device__ double mat3_det(double *mat3in)
{
double ret = 0.0;
ret = ret + mat3in[0+0*3]*mat3in[1+1*3]*mat3in[2+2*3];
ret = ret + mat3in[0+1*3]*mat3in[1+2*3]*mat3in[2+0*3];
ret = ret + mat3in[0+2*3]*mat3in[1+0*3]*mat3in[2+1*3];
ret = ret - mat3in[0+0*3]*mat3in[1+2*3]*mat3in[2+1*3];
ret = ret - mat3in[0+1*3]*mat3in[1+0*3]*mat3in[2+2*3];
ret = ret - mat3in[0+2*3]*mat3in[1+1*3]*mat3in[2+0*3];
return ret;
}
//inverts a 3x3 (9 element) matrix
__host__ __device__ void mat3_inverse(double *mat3inout)
{
int I;
double matint[9];
double det = mat3_det(mat3inout);
for(I=0;I<9;I++)
{
matint[I] = mat3inout[I];
}
mat3inout[0+0*3] = (matint[1+1*3]*matint[2+2*3]-matint[1+2*3]*matint[2+1*3])/det;
mat3inout[0+1*3] = -(matint[1+0*3]*matint[2+2*3]-matint[1+2*3]*matint[2+0*3])/det;
mat3inout[0+2*3] = (matint[1+0*3]*matint[2+1*3]-matint[1+1*3]*matint[2+0*3])/det;
mat3inout[1+0*3] = -(matint[0+1*3]*matint[2+2*3]-matint[0+2*3]*matint[2+1*3])/det;
mat3inout[1+1*3] = (matint[0+0*3]*matint[2+2*3]-matint[0+2*3]*matint[2+0*3])/det;
mat3inout[1+2*3] = -(matint[0+0*3]*matint[2+1*3]-matint[0+1*3]*matint[2+0*3])/det;
mat3inout[2+0*3] = (matint[0+1*3]*matint[1+2*3]-matint[0+2*3]*matint[1+1*3])/det;
mat3inout[2+1*3] = -(matint[0+0*3]*matint[1+2*3]-matint[0+2*3]*matint[1+0*3])/det;
mat3inout[2+2*3] = (matint[0+0*3]*matint[1+1*3]-matint[0+1*3]*matint[1+0*3])/det;
mat3_transpose(mat3inout);
return;
}
__host__ __device__ cuvect3 mat3_mult(double *mat3in, cuvect3 cvin)
{
int I,J;
cuvect3 ret;
for(I=0;I<3;I++)
{
ret[I] = 0.0;
for(J=0;J<3;J++)
{
ret[I] = ret[I] + mat3in[I+3*J]*cvin[J];
}
}
return ret;
}
__host__ __device__ void mat3_mult(double *matina, double *matinb, double *matout)
{
double wrk[9];
int I,J,K;
for(I=0;I<3;I++)
{
for(J=0;J<3;J++)
{
wrk[I+3*J] = 0.0;
}
}
for(I=0;I<3;I++)
{
for(J=0;J<3;J++)
{
for(K=0;K<3;K++)
{
wrk[I+3*K] = wrk[I+3*K] + matina[I+3*J]*matinb[J+3*K];
}
}
}
for(I=0;I<3;I++)
{
for(J=0;J<3;J++)
{
matout[I+3*J] = wrk[I+3*J];
}
}
return;
}
__host__ __device__ void mat3_hodgedual(cuvect3 vecin, double *matout)
{
matout[0 + 0*3] = 0.0f;
matout[1 + 0*3] = -vecin[2];
matout[2 + 0*3] = vecin[1];
matout[0 + 1*3] = vecin[2];
matout[1 + 1*3] = 0.0f;
matout[2 + 1*3] = -vecin[0];
matout[0 + 2*3] = -vecin[1];
matout[1 + 2*3] = vecin[0];
matout[2 + 2*3] = 0.0f;
return;
}
__host__ __device__ void mat3_hodgedual(double *matin, cuvect3 vecout)
{
vecout[0] = 0.5*(matin[1 + 2*3] - matin[2 + 1*3]);
vecout[1] = 0.5*(matin[2 + 0*3] - matin[0 + 2*3]);
vecout[2] = 0.5*(matin[0 + 1*3] - matin[1 + 0*3]);
return;
}
//returns direction cosine rotation matrix from axis and angle
__host__ __device__ void mat3_rot_from_axisangle(cuvect3 axis, double angle, double *matout)
{
int I;
double H[9];
double Hsq[9];
double II[9];
for(I=0;I<9;I++) II[I] = 0.0;
II[0+0*3] = 1.0;
II[1+1*3] = 1.0;
II[2+2*3] = 1.0;
axis = cuvect3_normalize(axis);
mat3_hodgedual(axis,H);
mat3_mult(H,H,Hsq);
for(I=0;I<9;I++)
{
matout[I] = (II[I] + Hsq[I]) + H[I]*sin(angle) - Hsq[I]*cos(angle);
}
return;
}
__host__ void test_cudavect_logic1()
{
//3 dim vector and matrix functional tests on host side
printf("3 dim vector and matrix functional tests on host side\n");
cuvect3 a,b,c;
double ma[9],mb[9],mc[9];
int I,J;
for(I=0;I<3;I++)
{
for(J=0;J<3;J++)
{
ma[I+3*J] = ((double) rand())/((double) RAND_MAX);
mb[I+3*J] = ma[I+3*J];
}
}
mat3_inverse(mb);
mat3_mult(ma,mb,mc);
for(I=0;I<3;I++)
{
for(J=0;J<3;J++)
{
printf("ma[%d,%d] = %1.3f\n",I,J,ma[I+3*J]);
}
}
for(I=0;I<3;I++)
{
for(J=0;J<3;J++)
{
printf("mb[%d,%d] = %1.3f\n",I,J,mb[I+3*J]);
}
}
for(I=0;I<3;I++)
{
for(J=0;J<3;J++)
{
printf("mc[%d,%d] = %1.3f\n",I,J,mc[I+3*J]);
}
}
a = cuvect3(1,1,1);
b = mat3_mult(ma,a);
b = mat3_mult(mb,b);
for(I=0;I<3;I++)
{
printf("a[%d] = %1.3f, b[%d] = %1.3f\n",I,a[I],I,b[I]);
}
a = cuvect3(1,0,1);
b = cuvect3(0,1,-1);
c = a+b;
for(I=0;I<3;I++)
{
printf("a[%d] = %1.3f, b[%d] = %1.3f, c[%d] = %1.3f\n",I,a[I],I,b[I],I,c[I]);
}
c = c/2.0;
for(I=0;I<3;I++)
{
printf("a[%d] = %1.3f, b[%d] = %1.3f, c[%d] = %1.3f\n",I,a[I],I,b[I],I,c[I]);
}
c = cuvect3_cross(a,b);
for(I=0;I<3;I++)
{
printf("a[%d] = %1.3f, b[%d] = %1.3f, c[%d] = %1.3f\n",I,a[I],I,b[I],I,c[I]);
}
printf("c dot a = %1.3f, c dot b = %1.3f\n",cuvect3_dot(c,a),cuvect3_dot(c,b));
printf("norm(a)=%1.3f, norm(b)=%1.3f, norm(c)=%1.3f\n",cuvect3_norm(a),cuvect3_norm(b),cuvect3_norm(c));
a = cuvect3_normalize(a);
b = cuvect3_normalize(b);
c = cuvect3_normalize(c);
for(I=0;I<3;I++)
{
printf("a[%d] = %1.3f, b[%d] = %1.3f, c[%d] = %1.3f\n",I,a[I],I,b[I],I,c[I]);
}
printf("c dot a = %1.3f, c dot b = %1.3f\n",cuvect3_dot(c,a),cuvect3_dot(c,b));
printf("norm(a)=%1.3f, norm(b)=%1.3f, norm(c)=%1.3f\n",cuvect3_norm(a),cuvect3_norm(b),cuvect3_norm(c));
return;
}
__host__ __device__ cumat3::cumat3()
{
int I;
for(I=0;I<9;I++)
{
dat[I] = 0.0;
}
return;
}
__host__ __device__ cumat3::~cumat3()
{
int I;
for(I=0;I<9;I++)
{
dat[I] = 0.0;
}
return;
}
__host__ __device__ double& cumat3::operator[](const int I)
{
return dat[I];
}
__host__ __device__ double& cumat3::operator()(const int I, const int J)
{
return dat[I+3*J];
}
__host__ __device__ cumat3 cumat3::operator+(cumat3 lhs)
{
int I;
cumat3 ret;
for(I=0;I<9;I++)
{
ret.dat[I] = this->dat[I] + lhs.dat[I];
}
return ret;
}
__host__ __device__ cumat3 cumat3::operator-(cumat3 lhs)
{
int I;
cumat3 ret;
for(I=0;I<9;I++)
{
ret.dat[I] = this->dat[I] - lhs.dat[I];
}
return ret;
}
__host__ __device__ cumat3 cumat3::operator*(double lhs)
{
cumat3 ret;
int I;
for(I=0;I<9;I++)
{
ret.dat[I] = this->dat[I]*lhs;
}
return ret;
}
__host__ __device__ cumat3 cumat3::operator/(double lhs)
{
cumat3 ret;
int I;
for(I=0;I<9;I++)
{
ret.dat[I] = this->dat[I]/lhs;
}
return ret;
}
__host__ __device__ double& cumat3::at(const int I, const int J)
{
return dat[I+3*J];
}
__host__ __device__ cuvect3 cumat3::operator*(cuvect3 lhs)
{
cuvect3 ret = cuvect3(0.0,0.0,0.0);
int I,J;
for(I=0;I<3;I++)
{
for(J=0;J<3;J++)
{
ret[I] = ret[I] + this->at(I,J)*lhs[J];
}
}
return ret;
}
__host__ __device__ cumat3 cumat3::operator*(cumat3 lhs)
{
cumat3 ret;
int I,J,K;
for(I=0;I<3;I++)
{
for(J=0;J<3;J++)
{
ret(I,J) = 0.0;
for(K=0;K<3;K++)
{
ret(I,J) = ret(I,J) + this->at(I,K)*lhs(K,J);
}
}
}
return ret;
}
__host__ __device__ double cumat3::det()
{
double ret = 0.0;
ret = ret + this->at(0,0)*this->at(1,1)*this->at(2,2);
ret = ret + this->at(0,1)*this->at(1,2)*this->at(2,0);
ret = ret + this->at(0,2)*this->at(1,0)*this->at(2,1);
ret = ret - this->at(0,0)*this->at(1,2)*this->at(2,1);
ret = ret - this->at(0,1)*this->at(1,0)*this->at(2,2);
ret = ret - this->at(0,2)*this->at(1,1)*this->at(2,0);
return ret;
}
__host__ __device__ cumat3 cumat3::transpose()
{
cumat3 q;
int I,J;
for(I=0;I<3;I++)
{
for(J=0;J<3;J++)
{
q.at(I,J) = this->at(J,I);
}
}
return q;
}
__host__ __device__ cumat3 cumat3::inverse()
{
cumat3 q;
double dt = q.det();
if(dt!=0.0)
{
q(0,0) = (this->at(1,1)*this->at(2,2)-this->at(1,2)*this->at(2,1))/dt;
q(0,1) = -(this->at(1,0)*this->at(2,2)-this->at(1,2)*this->at(2,0))/dt;
q(0,2) = (this->at(1,0)*this->at(2,1)-this->at(1,1)*this->at(2,0))/dt;
q(1,0) = -(this->at(0,1)*this->at(2,2)-this->at(0,2)*this->at(2,1))/dt;
q(1,1) = (this->at(0,0)*this->at(2,2)-this->at(0,2)*this->at(2,0))/dt;
q(1,2) = -(this->at(0,0)*this->at(2,1)-this->at(0,1)*this->at(2,0))/dt;
q(2,0) = (this->at(0,1)*this->at(1,2)-this->at(0,2)*this->at(1,1))/dt;
q(2,1) = -(this->at(0,0)*this->at(1,2)-this->at(0,2)*this->at(1,0))/dt;
q(2,2) = (this->at(0,0)*this->at(1,1)-this->at(0,1)*this->at(1,0))/dt;
q = q.transpose();
}
else
{
q(0,0) = inf;
q(0,1) = inf;
q(0,2) = inf;
q(1,0) = inf;
q(1,1) = inf;
q(1,2) = inf;
q(2,0) = inf;
q(2,1) = inf;
q(2,2) = inf;
}
return q;
}
};

@ -0,0 +1,581 @@
#include <amsculib2/amsculib2.hpp>
namespace amscuda
{
__host__ __device__ cuvect3f::cuvect3f()
{
x = 0.0; y = 0.0; z = 0.0;
return;
}
__host__ __device__ cuvect3f::~cuvect3f()
{
x = 0.0; y = 0.0; z = 0.0;
return;
}
__host__ __device__ float& cuvect3f::operator[](const int I)
{
if(I==0) return x;
if(I==1) return y;
if(I==2) return z;
return x;
}
__host__ __device__ const float& cuvect3f::operator[](const int I) const
{
if(I==0) return x;
if(I==1) return y;
if(I==2) return z;
return x;
}
__host__ __device__ cuvect3f cuvect3f::operator+(cuvect3f lhs)
{
cuvect3f ret;
ret.x = x+lhs.x;
ret.y = y+lhs.y;
ret.z = z+lhs.z;
return ret;
}
__host__ __device__ cuvect3f cuvect3f::operator-(cuvect3f lhs)
{
cuvect3f ret;
ret.x = x-lhs.x;
ret.y = y-lhs.y;
ret.z = z-lhs.z;
return ret;
}
__host__ __device__ cuvect3f cuvect3f::operator*(float lhs)
{
cuvect3f ret;
ret.x = x*lhs;
ret.y = y*lhs;
ret.z = z*lhs;
return ret;
}
__host__ __device__ cuvect3f cuvect3f::operator/(float lhs)
{
cuvect3f ret;
ret.x = x/lhs;
ret.y = y/lhs;
ret.z = z/lhs;
return ret;
}
__host__ __device__ cuvect3f::cuvect3f(float _x, float _y, float _z)
{
x = _x; y = _y; z = _z;
return;
}
__host__ __device__ float cuvect3f_dot(cuvect3f a, cuvect3f b)
{
float ret = a.x*b.x+a.y*b.y+a.z*b.z;
return ret;
}
__host__ __device__ cuvect3f cuvect3f_cross(cuvect3f a, cuvect3f b)
{
cuvect3f ret;
ret[0] = a[1]*b[2]-a[2]*b[1];
ret[1] = a[2]*b[0]-a[0]*b[2];
ret[2] = a[0]*b[1]-a[1]*b[0];
return ret;
}
__host__ __device__ float cuvect3f_norm(cuvect3f a)
{
float ret;
ret = ::sqrtf(a.x*a.x+a.y*a.y+a.z*a.z);
return ret;
}
__host__ __device__ cuvect3f cuvect3f_normalize(cuvect3f a)
{
cuvect3f ret;
float m;
m = ::sqrtf(a.x*a.x+a.y*a.y+a.z*a.z);
if(m>0.0)
{
ret.x = a.x/m; ret.y = a.y/m; ret.z = a.z/m;
}
else
{
ret.x = 0.0; ret.y = 0.0; ret.z = 0.0;
}
return ret;
}
__host__ __device__ cuvect3f cuvect3f_proj(cuvect3f a, cuvect3f b)
{
cuvect3f ret;
cuvect3f bn = cuvect3f_normalize(b);
float m = cuvect3f_dot(a,bn);
ret = bn*m;
return ret;
}
//transposes a 3x3 (9 element) matrix
__host__ __device__ void mat3f_transpose(float *mat3inout)
{
int I,J;
float matint[9];
for(I=0;I<9;I++)
{
matint[I] = mat3inout[I];
}
for(I=0;I<3;I++)
{
for(J=0;J<3;J++)
{
mat3inout[I+J*3] = matint[J+I*3];
}
}
return;
}
//copies src to dest
__host__ __device__ void mat3f_copy(float *mat3f_dest, const float *mat3f_src)
{
int I;
if(mat3f_dest==NULL || mat3f_src==NULL)
return;
for(I=0;I<9;I++)
mat3f_dest[I] = mat3f_src[I];
return;
}
__host__ __device__ float mat3f_det(float *mat3in)
{
float ret = 0.0;
ret = ret + mat3in[0+0*3]*mat3in[1+1*3]*mat3in[2+2*3];
ret = ret + mat3in[0+1*3]*mat3in[1+2*3]*mat3in[2+0*3];
ret = ret + mat3in[0+2*3]*mat3in[1+0*3]*mat3in[2+1*3];
ret = ret - mat3in[0+0*3]*mat3in[1+2*3]*mat3in[2+1*3];
ret = ret - mat3in[0+1*3]*mat3in[1+0*3]*mat3in[2+2*3];
ret = ret - mat3in[0+2*3]*mat3in[1+1*3]*mat3in[2+0*3];
return ret;
}
//inverts a 3x3 (9 element) matrix
__host__ __device__ void mat3f_inverse(float *mat3inout)
{
int I;
float matint[9];
float det = mat3f_det(mat3inout);
for(I=0;I<9;I++)
{
matint[I] = mat3inout[I];
}
mat3inout[0+0*3] = (matint[1+1*3]*matint[2+2*3]-matint[1+2*3]*matint[2+1*3])/det;
mat3inout[0+1*3] = -(matint[1+0*3]*matint[2+2*3]-matint[1+2*3]*matint[2+0*3])/det;
mat3inout[0+2*3] = (matint[1+0*3]*matint[2+1*3]-matint[1+1*3]*matint[2+0*3])/det;
mat3inout[1+0*3] = -(matint[0+1*3]*matint[2+2*3]-matint[0+2*3]*matint[2+1*3])/det;
mat3inout[1+1*3] = (matint[0+0*3]*matint[2+2*3]-matint[0+2*3]*matint[2+0*3])/det;
mat3inout[1+2*3] = -(matint[0+0*3]*matint[2+1*3]-matint[0+1*3]*matint[2+0*3])/det;
mat3inout[2+0*3] = (matint[0+1*3]*matint[1+2*3]-matint[0+2*3]*matint[1+1*3])/det;
mat3inout[2+1*3] = -(matint[0+0*3]*matint[1+2*3]-matint[0+2*3]*matint[1+0*3])/det;
mat3inout[2+2*3] = (matint[0+0*3]*matint[1+1*3]-matint[0+1*3]*matint[1+0*3])/det;
mat3f_transpose(mat3inout);
return;
}
__host__ __device__ cuvect3f mat3f_mult(float *mat3in, cuvect3f cvin)
{
int I,J;
cuvect3f ret;
for(I=0;I<3;I++)
{
ret[I] = 0.0;
for(J=0;J<3;J++)
{
ret[I] = ret[I] + mat3in[I+3*J]*cvin[J];
}
}
return ret;
}
__host__ __device__ void mat3f_mult(float *matina, float *matinb, float *matout)
{
float wrk[9];
int I,J,K;
for(I=0;I<3;I++)
{
for(J=0;J<3;J++)
{
wrk[I+3*J] = 0.0;
}
}
for(I=0;I<3;I++)
{
for(J=0;J<3;J++)
{
for(K=0;K<3;K++)
{
wrk[I+3*K] = wrk[I+3*K] + matina[I+3*J]*matinb[J+3*K];
}
}
}
for(I=0;I<3;I++)
{
for(J=0;J<3;J++)
{
matout[I+3*J] = wrk[I+3*J];
}
}
return;
}
__host__ __device__ void mat3f_hodgedual(cuvect3f vecin, float *matout)
{
matout[0 + 0*3] = 0.0f;
matout[1 + 0*3] = -vecin[2];
matout[2 + 0*3] = vecin[1];
matout[0 + 1*3] = vecin[2];
matout[1 + 1*3] = 0.0f;
matout[2 + 1*3] = -vecin[0];
matout[0 + 2*3] = -vecin[1];
matout[1 + 2*3] = vecin[0];
matout[2 + 2*3] = 0.0f;
return;
}
__host__ __device__ void mat3f_hodgedual(float *matin, cuvect3f vecout)
{
vecout[0] = 0.5*(matin[1 + 2*3] - matin[2 + 1*3]);
vecout[1] = 0.5*(matin[2 + 0*3] - matin[0 + 2*3]);
vecout[2] = 0.5*(matin[0 + 1*3] - matin[1 + 0*3]);
return;
}
//returns direction cosine rotation matrix from axis and angle
__host__ __device__ void mat3f_rot_from_axisangle(cuvect3f axis, float angle, float *matout)
{
int I;
float H[9];
float Hsq[9];
float II[9];
for(I=0;I<9;I++) II[I] = 0.0;
II[0+0*3] = 1.0;
II[1+1*3] = 1.0;
II[2+2*3] = 1.0;
axis = cuvect3f_normalize(axis);
mat3f_hodgedual(axis,H);
mat3f_mult(H,H,Hsq);
for(I=0;I<9;I++)
{
matout[I] = (II[I] + Hsq[I]) + H[I]*sinf(angle) - Hsq[I]*cosf(angle);
}
return;
}
__host__ void test_cudavectf_logic1()
{
//3 dim vector and matrix functional tests on host side
printf("3 dim vector and matrix functional tests on host side\n");
cuvect3f a,b,c;
float ma[9],mb[9],mc[9];
int I,J;
for(I=0;I<3;I++)
{
for(J=0;J<3;J++)
{
ma[I+3*J] = ((float) rand())/((float) RAND_MAX);
mb[I+3*J] = ma[I+3*J];
}
}
mat3f_inverse(mb);
mat3f_mult(ma,mb,mc);
for(I=0;I<3;I++)
{
for(J=0;J<3;J++)
{
printf("ma[%d,%d] = %1.3f\n",I,J,ma[I+3*J]);
}
}
for(I=0;I<3;I++)
{
for(J=0;J<3;J++)
{
printf("mb[%d,%d] = %1.3f\n",I,J,mb[I+3*J]);
}
}
for(I=0;I<3;I++)
{
for(J=0;J<3;J++)
{
printf("mc[%d,%d] = %1.3f\n",I,J,mc[I+3*J]);
}
}
a = cuvect3f(1,1,1);
b = mat3f_mult(ma,a);
b = mat3f_mult(mb,b);
for(I=0;I<3;I++)
{
printf("a[%d] = %1.3f, b[%d] = %1.3f\n",I,a[I],I,b[I]);
}
a = cuvect3f(1,0,1);
b = cuvect3f(0,1,-1);
c = a+b;
for(I=0;I<3;I++)
{
printf("a[%d] = %1.3f, b[%d] = %1.3f, c[%d] = %1.3f\n",I,a[I],I,b[I],I,c[I]);
}
c = c/2.0;
for(I=0;I<3;I++)
{
printf("a[%d] = %1.3f, b[%d] = %1.3f, c[%d] = %1.3f\n",I,a[I],I,b[I],I,c[I]);
}
c = cuvect3f_cross(a,b);
for(I=0;I<3;I++)
{
printf("a[%d] = %1.3f, b[%d] = %1.3f, c[%d] = %1.3f\n",I,a[I],I,b[I],I,c[I]);
}
printf("c dot a = %1.3f, c dot b = %1.3f\n",cuvect3f_dot(c,a),cuvect3f_dot(c,b));
printf("norm(a)=%1.3f, norm(b)=%1.3f, norm(c)=%1.3f\n",cuvect3f_norm(a),cuvect3f_norm(b),cuvect3f_norm(c));
a = cuvect3f_normalize(a);
b = cuvect3f_normalize(b);
c = cuvect3f_normalize(c);
for(I=0;I<3;I++)
{
printf("a[%d] = %1.3f, b[%d] = %1.3f, c[%d] = %1.3f\n",I,a[I],I,b[I],I,c[I]);
}
printf("c dot a = %1.3f, c dot b = %1.3f\n",cuvect3f_dot(c,a),cuvect3f_dot(c,b));
printf("norm(a)=%1.3f, norm(b)=%1.3f, norm(c)=%1.3f\n",cuvect3f_norm(a),cuvect3f_norm(b),cuvect3f_norm(c));
return;
}
__host__ __device__ cumat3f::cumat3f()
{
int I;
for(I=0;I<9;I++)
{
dat[I] = 0.0;
}
return;
}
__host__ __device__ cumat3f::~cumat3f()
{
int I;
for(I=0;I<9;I++)
{
dat[I] = 0.0;
}
return;
}
__host__ __device__ float& cumat3f::operator[](const int I)
{
return dat[I];
}
__host__ __device__ float& cumat3f::operator()(const int I, const int J)
{
return dat[I+3*J];
}
__host__ __device__ cumat3f cumat3f::operator+(cumat3f lhs)
{
int I;
cumat3f ret;
for(I=0;I<9;I++)
{
ret.dat[I] = this->dat[I] + lhs.dat[I];
}
return ret;
}
__host__ __device__ cumat3f cumat3f::operator-(cumat3f lhs)
{
int I;
cumat3f ret;
for(I=0;I<9;I++)
{
ret.dat[I] = this->dat[I] - lhs.dat[I];
}
return ret;
}
__host__ __device__ cumat3f cumat3f::operator*(float lhs)
{
cumat3f ret;
int I;
for(I=0;I<9;I++)
{
ret.dat[I] = this->dat[I]*lhs;
}
return ret;
}
__host__ __device__ cumat3f cumat3f::operator/(float lhs)
{
cumat3f ret;
int I;
for(I=0;I<9;I++)
{
ret.dat[I] = this->dat[I]/lhs;
}
return ret;
}
__host__ __device__ float& cumat3f::at(const int I, const int J)
{
return dat[I+3*J];
}
__host__ __device__ cuvect3f cumat3f::operator*(cuvect3f lhs)
{
cuvect3f ret = cuvect3f(0.0,0.0,0.0);
int I,J;
for(I=0;I<3;I++)
{
for(J=0;J<3;J++)
{
ret[I] = ret[I] + this->at(I,J)*lhs[J];
}
}
return ret;
}
__host__ __device__ cumat3f cumat3f::operator*(cumat3f lhs)
{
cumat3f ret;
int I,J,K;
for(I=0;I<3;I++)
{
for(J=0;J<3;J++)
{
ret(I,J) = 0.0;
for(K=0;K<3;K++)
{
ret(I,J) = ret(I,J) + this->at(I,K)*lhs(K,J);
}
}
}
return ret;
}
__host__ __device__ float cumat3f::det()
{
float ret = 0.0;
ret = ret + this->at(0,0)*this->at(1,1)*this->at(2,2);
ret = ret + this->at(0,1)*this->at(1,2)*this->at(2,0);
ret = ret + this->at(0,2)*this->at(1,0)*this->at(2,1);
ret = ret - this->at(0,0)*this->at(1,2)*this->at(2,1);
ret = ret - this->at(0,1)*this->at(1,0)*this->at(2,2);
ret = ret - this->at(0,2)*this->at(1,1)*this->at(2,0);
return ret;
}
__host__ __device__ cumat3f cumat3f::transpose()
{
cumat3f q;
int I,J;
for(I=0;I<3;I++)
{
for(J=0;J<3;J++)
{
q.at(I,J) = this->at(J,I);
}
}
return q;
}
__host__ __device__ cumat3f cumat3f::inverse()
{
cumat3f q;
float dt = q.det();
if(dt!=0.0)
{
q(0,0) = (this->at(1,1)*this->at(2,2)-this->at(1,2)*this->at(2,1))/dt;
q(0,1) = -(this->at(1,0)*this->at(2,2)-this->at(1,2)*this->at(2,0))/dt;
q(0,2) = (this->at(1,0)*this->at(2,1)-this->at(1,1)*this->at(2,0))/dt;
q(1,0) = -(this->at(0,1)*this->at(2,2)-this->at(0,2)*this->at(2,1))/dt;
q(1,1) = (this->at(0,0)*this->at(2,2)-this->at(0,2)*this->at(2,0))/dt;
q(1,2) = -(this->at(0,0)*this->at(2,1)-this->at(0,1)*this->at(2,0))/dt;
q(2,0) = (this->at(0,1)*this->at(1,2)-this->at(0,2)*this->at(1,1))/dt;
q(2,1) = -(this->at(0,0)*this->at(1,2)-this->at(0,2)*this->at(1,0))/dt;
q(2,2) = (this->at(0,0)*this->at(1,1)-this->at(0,1)*this->at(1,0))/dt;
q = q.transpose();
}
else
{
q(0,0) = inf;
q(0,1) = inf;
q(0,2) = inf;
q(1,0) = inf;
q(1,1) = inf;
q(1,2) = inf;
q(2,0) = inf;
q(2,1) = inf;
q(2,2) = inf;
}
return q;
}
};

@ -0,0 +1,414 @@
#include <amsculib2/amsculib2.hpp>
namespace amscuda
{
__host__ __device__ cuvect4::cuvect4()
{
x = 0.0; y = 0.0; z = 0.0; w = 0.0;
return;
}
__host__ __device__ cuvect4::~cuvect4()
{
x = 0.0; y = 0.0; z = 0.0; w = 0.0;
return;
}
__host__ __device__ cuvect4::cuvect4(double _x, double _y, double _z, double _w)
{
x = _x; y = _y; z = _z; w = _w;
return;
}
__host__ __device__ double& cuvect4::operator[](const int I)
{
if(I==0) return x;
else if(I==1) return y;
else if(I==2) return z;
else if(I==3) return w;
return x;
}
__host__ __device__ const double& cuvect4::operator[](const int I) const
{
if(I==0) return x;
else if(I==1) return y;
else if(I==2) return z;
else if(I==3) return w;
return x;
}
__host__ __device__ cuvect4 cuvect4::operator+(cuvect4 lhs)
{
cuvect4 ret;
ret.x = this->x + lhs.x;
ret.y = this->y + lhs.y;
ret.z = this->z + lhs.z;
ret.w = this->w + lhs.w;
return ret;
}
__host__ __device__ cuvect4 cuvect4::operator-(cuvect4 lhs)
{
cuvect4 ret;
ret.x = this->x - lhs.x;
ret.y = this->y - lhs.y;
ret.z = this->z - lhs.z;
ret.w = this->w - lhs.w;
return ret;
}
__host__ __device__ cuvect4 cuvect4::operator*(double lhs)
{
cuvect4 ret;
ret.x = this->x*lhs;
ret.y = this->y*lhs;
ret.z = this->z*lhs;
ret.w = this->w*lhs;
return ret;
}
__host__ __device__ cuvect4 cuvect4::operator/(double lhs)
{
cuvect4 ret;
ret.x = this->x/lhs;
ret.y = this->y/lhs;
ret.z = this->z/lhs;
ret.w = this->w/lhs;
return ret;
}
__host__ __device__ cumat4::cumat4()
{
int I;
for(I=0;I<16;I++)
{
dat[I] = 0.0;
}
return;
}
__host__ __device__ cumat4::~cumat4()
{
int I;
for(I=0;I<16;I++)
{
dat[I] = 0.0;
}
return;
}
__host__ __device__ double& cumat4::operator[](const int I)
{
return dat[I];
}
__host__ __device__ double& cumat4::operator()(const int I, const int J)
{
return dat[I+4*J];
}
__host__ __device__ double& cumat4::at(const int I, const int J)
{
return dat[I+4*J];
}
__host__ __device__ cumat4 cumat4::operator+(cumat4 lhs)
{
cumat4 ret;
int I;
for(I=0;I<16;I++)
{
ret.dat[I] = this->dat[I] + lhs.dat[I];
}
return ret;
}
__host__ __device__ cumat4 cumat4::operator-(cumat4 lhs)
{
cumat4 ret;
int I;
for(I=0;I<16;I++)
{
ret.dat[I] = this->dat[I] - lhs.dat[I];
}
return ret;
}
__host__ __device__ cumat4 cumat4::operator*(double lhs)
{
cumat4 ret;
int I;
for(I=0;I<16;I++)
{
ret.dat[I] = this->dat[I]*lhs;
}
return ret;
}
__host__ __device__ cumat4 cumat4::operator/(double lhs)
{
cumat4 ret;
int I;
for(I=0;I<16;I++)
{
ret.dat[I] = this->dat[I]/lhs;
}
return ret;
}
__host__ __device__ cuvect4 cumat4::operator*(cuvect4 lhs)
{
cuvect4 ret = cuvect4(0.0,0.0,0.0,0.0);
int I,J;
for(I=0;I<4;I++)
{
for(J=0;J<4;J++)
{
ret[I] = ret[I] + this->at(I,J)*lhs[J];
}
}
return ret;
}
__host__ __device__ cumat4 cumat4::operator*(cumat4 lhs)
{
cumat4 ret;
int I,J,K;
for(I=0;I<4;I++)
{
for(J=0;J<4;J++)
{
ret(I,J) = 0;
for(K=0;K<4;K++)
{
ret(I,J) = ret(I,J) + this->at(I,K) * lhs(K,J);
}
}
}
return ret;
}
__host__ __device__ cumat4 cumat4::transpose()
{
cumat4 q;
int I,J;
for(I=0;I<4;I++)
{
for(J=0;J<4;J++)
{
q(I,J) = this->at(J,I);
}
}
return q;
}
__host__ __device__ double cumat4::det()
{
double a00,a01,a02,a03;
double a10,a11,a12,a13;
double a20,a21,a22,a23;
double a30,a31,a32,a33;
double det;
a00 = this->at(0,0);
a01 = this->at(0,1);
a02 = this->at(0,2);
a03 = this->at(0,3);
a10 = this->at(1,0);
a11 = this->at(1,1);
a12 = this->at(1,2);
a13 = this->at(1,3);
a20 = this->at(2,0);
a21 = this->at(2,1);
a22 = this->at(2,2);
a23 = this->at(2,3);
a30 = this->at(3,0);
a31 = this->at(3,1);
a32 = this->at(3,2);
a33 = this->at(3,3);
det = a03*a12*a21*a30 -
a02*a13*a21*a30 -
a03*a11*a22*a30 +
a01*a13*a22*a30 +
a02*a11*a23*a30 -
a01*a12*a23*a30 -
a03*a12*a20*a31 +
a02*a13*a20*a31 +
a03*a10*a22*a31 -
a00*a13*a22*a31 -
a02*a10*a23*a31 +
a00*a12*a23*a31 +
a03*a11*a20*a32 -
a01*a13*a20*a32 -
a03*a10*a21*a32 +
a00*a13*a21*a32 +
a01*a10*a23*a32 -
a00*a11*a23*a32 -
a02*a11*a20*a33 +
a01*a12*a20*a33 +
a02*a10*a21*a33 -
a00*a12*a21*a33 -
a01*a10*a22*a33 +
a00*a11*a22*a33;
return det;
}
__host__ __device__ cumat4 minverse(cumat4 ma)
{
cumat4 mb;
double a00,a01,a02,a03;
double a10,a11,a12,a13;
double a20,a21,a22,a23;
double a30,a31,a32,a33;
double b00,b01,b02,b03;
double b10,b11,b12,b13;
double b20,b21,b22,b23;
double b30,b31,b32,b33;
double det = 0.0;
a00 = ma.at(0,0);
a01 = ma.at(0,1);
a02 = ma.at(0,2);
a03 = ma.at(0,3);
a10 = ma.at(1,0);
a11 = ma.at(1,1);
a12 = ma.at(1,2);
a13 = ma.at(1,3);
a20 = ma.at(2,0);
a21 = ma.at(2,1);
a22 = ma.at(2,2);
a23 = ma.at(2,3);
a30 = ma.at(3,0);
a31 = ma.at(3,1);
a32 = ma.at(3,2);
a33 = ma.at(3,3);
det = a03*a12*a21*a30 -
a02*a13*a21*a30 -
a03*a11*a22*a30 +
a01*a13*a22*a30 +
a02*a11*a23*a30 -
a01*a12*a23*a30 -
a03*a12*a20*a31 +
a02*a13*a20*a31 +
a03*a10*a22*a31 -
a00*a13*a22*a31 -
a02*a10*a23*a31 +
a00*a12*a23*a31 +
a03*a11*a20*a32 -
a01*a13*a20*a32 -
a03*a10*a21*a32 +
a00*a13*a21*a32 +
a01*a10*a23*a32 -
a00*a11*a23*a32 -
a02*a11*a20*a33 +
a01*a12*a20*a33 +
a02*a10*a21*a33 -
a00*a12*a21*a33 -
a01*a10*a22*a33 +
a00*a11*a22*a33;
if(det*det>1.0E-30)
{
b00 = -a13*a22*a31 + a12*a23*a31 + a13*a21*a32 - a11*a23*a32 - a12*a21*a33 + a11*a22*a33;
b01 = a03*a22*a31 - a02*a23*a31 - a03*a21*a32 + a01*a23*a32 + a02*a21*a33 - a01*a22*a33;
b02 = -a03*a12*a31 + a02*a13*a31 + a03*a11*a32 - a01*a13*a32 - a02*a11*a33 + a01*a12*a33;
b03 = a03*a12*a21 - a02*a13*a21 - a03*a11*a22 + a01*a13*a22 + a02*a11*a23 - a01*a12*a23;
b10 = a13*a22*a30 - a12*a23*a30 - a13*a20*a32 + a10*a23*a32 + a12*a20*a33 - a10*a22*a33;
b11 = -a03*a22*a30 + a02*a23*a30 + a03*a20*a32 - a00*a23*a32 - a02*a20*a33 + a00*a22*a33;
b12 = a03*a12*a30 - a02*a13*a30 - a03*a10*a32 + a00*a13*a32 + a02*a10*a33 - a00*a12*a33;
b13 = -a03*a12*a20 + a02*a13*a20 + a03*a10*a22 - a00*a13*a22 - a02*a10*a23 + a00*a12*a23;
b20 = -a13*a21*a30 + a11*a23*a30 + a13*a20*a31 - a10*a23*a31 - a11*a20*a33 + a10*a21*a33;
b21 = a03*a21*a30 - a01*a23*a30 - a03*a20*a31 + a00*a23*a31 + a01*a20*a33 - a00*a21*a33;
b22 = -a03*a11*a30 + a01*a13*a30 + a03*a10*a31 - a00*a13*a31 - a01*a10*a33 + a00*a11*a33;
b23 = a03*a11*a20 - a01*a13*a20 - a03*a10*a21 + a00*a13*a21 + a01*a10*a23 - a00*a11*a23;
b30 = a12*a21*a30 - a11*a22*a30 - a12*a20*a31 + a10*a22*a31 + a11*a20*a32 - a10*a21*a32;
b31 = -a02*a21*a30 + a01*a22*a30 + a02*a20*a31 - a00*a22*a31 - a01*a20*a32 + a00*a21*a32;
b32 = a02*a11*a30 - a01*a12*a30 - a02*a10*a31 + a00*a12*a31 + a01*a10*a32 - a00*a11*a32;
b33 = -a02*a11*a20 + a01*a12*a20 + a02*a10*a21 - a00*a12*a21 - a01*a10*a22 + a00*a11*a22;
b00 = b00/det;
b01 = b01/det;
b02 = b02/det;
b03 = b03/det;
b10 = b10/det;
b11 = b11/det;
b12 = b12/det;
b13 = b13/det;
b20 = b20/det;
b21 = b21/det;
b22 = b22/det;
b23 = b23/det;
b30 = b30/det;
b31 = b31/det;
b32 = b32/det;
b33 = b33/det;
mb.at(0,0) = b00;
mb.at(0,1) = b01;
mb.at(0,2) = b02;
mb.at(0,3) = b03;
mb.at(1,0) = b10;
mb.at(1,1) = b11;
mb.at(1,2) = b12;
mb.at(1,3) = b13;
mb.at(2,0) = b20;
mb.at(2,1) = b21;
mb.at(2,2) = b22;
mb.at(2,3) = b23;
mb.at(3,0) = b30;
mb.at(3,1) = b31;
mb.at(3,2) = b32;
mb.at(3,3) = b33;
}
//this was STUPID. Gaah. Computer algebra system saves the day? I'd be surprised if this didn't end up *slower* than gaussian elimination. Don't do this again!
return mb;
}
__host__ __device__ cumat4 cumat4::inverse()
{
return minverse(*this);
}
__host__ __device__ double cuvect4_dot(cuvect4 a, cuvect4 b)
{
double ret = 0.0;
ret = a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w;
return ret;
}
__host__ __device__ double cuvect4_norm(cuvect4 a)
{
double ret = 0.0;
ret = ::sqrt(cuvect4_dot(a,a));
return ret;
}
__host__ __device__ cuvect4 cuvect4_normalize(cuvect4 a)
{
cuvect4 ret = cuvect4(0.0f,0.0f,0.0f,0.0f);
double nrm = cuvect4_norm(a);
if(nrm>0.0)
ret = a/nrm;
return ret;
}
__host__ __device__ cuvect4 cuvect4_proj(cuvect4 a, cuvect4 b)
{
cuvect4 ret;
cuvect4 bn = cuvect4_normalize(b);
double d = cuvect4_dot(a,bn);
ret = bn*d;
return ret;
}
};

@ -0,0 +1,417 @@
#include <amsculib2/amsculib2.hpp>
namespace amscuda
{
////////////
//cuvect4ff//
////////////
__host__ __device__ cuvect4f::cuvect4f()
{
x = 0.0; y = 0.0; z = 0.0; w = 0.0;
return;
}
__host__ __device__ cuvect4f::~cuvect4f()
{
x = 0.0; y = 0.0; z = 0.0; w = 0.0;
return;
}
__host__ __device__ cuvect4f::cuvect4f(float _x, float _y, float _z, float _w)
{
x = _x; y = _y; z = _z; w = _w;
return;
}
__host__ __device__ float& cuvect4f::operator[](const int I)
{
if(I==0) return x;
else if(I==1) return y;
else if(I==2) return z;
else if(I==3) return w;
return x;
}
__host__ __device__ const float& cuvect4f::operator[](const int I) const
{
if(I==0) return x;
else if(I==1) return y;
else if(I==2) return z;
else if(I==3) return w;
return x;
}
__host__ __device__ cuvect4f cuvect4f::operator+(cuvect4f lhs)
{
cuvect4f ret;
ret.x = this->x + lhs.x;
ret.y = this->y + lhs.y;
ret.z = this->z + lhs.z;
ret.w = this->w + lhs.w;
return ret;
}
__host__ __device__ cuvect4f cuvect4f::operator-(cuvect4f lhs)
{
cuvect4f ret;
ret.x = this->x - lhs.x;
ret.y = this->y - lhs.y;
ret.z = this->z - lhs.z;
ret.w = this->w - lhs.w;
return ret;
}
__host__ __device__ cuvect4f cuvect4f::operator*(float lhs)
{
cuvect4f ret;
ret.x = this->x*lhs;
ret.y = this->y*lhs;
ret.z = this->z*lhs;
ret.w = this->w*lhs;
return ret;
}
__host__ __device__ cuvect4f cuvect4f::operator/(float lhs)
{
cuvect4f ret;
ret.x = this->x/lhs;
ret.y = this->y/lhs;
ret.z = this->z/lhs;
ret.w = this->w/lhs;
return ret;
}
__host__ __device__ cumat4f::cumat4f()
{
int I;
for(I=0;I<16;I++)
{
dat[I] = 0.0;
}
return;
}
__host__ __device__ cumat4f::~cumat4f()
{
int I;
for(I=0;I<16;I++)
{
dat[I] = 0.0;
}
return;
}
__host__ __device__ float& cumat4f::operator[](const int I)
{
return dat[I];
}
__host__ __device__ float& cumat4f::operator()(const int I, const int J)
{
return dat[I+4*J];
}
__host__ __device__ float& cumat4f::at(const int I, const int J)
{
return dat[I+4*J];
}
__host__ __device__ cumat4f cumat4f::operator+(cumat4f lhs)
{
cumat4f ret;
int I;
for(I=0;I<16;I++)
{
ret.dat[I] = this->dat[I] + lhs.dat[I];
}
return ret;
}
__host__ __device__ cumat4f cumat4f::operator-(cumat4f lhs)
{
cumat4f ret;
int I;
for(I=0;I<16;I++)
{
ret.dat[I] = this->dat[I] - lhs.dat[I];
}
return ret;
}
__host__ __device__ cumat4f cumat4f::operator*(float lhs)
{
cumat4f ret;
int I;
for(I=0;I<16;I++)
{
ret.dat[I] = this->dat[I]*lhs;
}
return ret;
}
__host__ __device__ cumat4f cumat4f::operator/(float lhs)
{
cumat4f ret;
int I;
for(I=0;I<16;I++)
{
ret.dat[I] = this->dat[I]/lhs;
}
return ret;
}
__host__ __device__ cuvect4f cumat4f::operator*(cuvect4f lhs)
{
cuvect4f ret = cuvect4f(0.0,0.0,0.0,0.0);
int I,J;
for(I=0;I<4;I++)
{
for(J=0;J<4;J++)
{
ret[I] = ret[I] + this->at(I,J)*lhs[J];
}
}
return ret;
}
__host__ __device__ cumat4f cumat4f::operator*(cumat4f lhs)
{
cumat4f ret;
int I,J,K;
for(I=0;I<4;I++)
{
for(J=0;J<4;J++)
{
ret(I,J) = 0;
for(K=0;K<4;K++)
{
ret(I,J) = ret(I,J) + this->at(I,K) * lhs(K,J);
}
}
}
return ret;
}
__host__ __device__ cumat4f cumat4f::transpose()
{
cumat4f q;
int I,J;
for(I=0;I<4;I++)
{
for(J=0;J<4;J++)
{
q(I,J) = this->at(J,I);
}
}
return q;
}
__host__ __device__ float cumat4f::det()
{
float a00,a01,a02,a03;
float a10,a11,a12,a13;
float a20,a21,a22,a23;
float a30,a31,a32,a33;
float det;
a00 = this->at(0,0);
a01 = this->at(0,1);
a02 = this->at(0,2);
a03 = this->at(0,3);
a10 = this->at(1,0);
a11 = this->at(1,1);
a12 = this->at(1,2);
a13 = this->at(1,3);
a20 = this->at(2,0);
a21 = this->at(2,1);
a22 = this->at(2,2);
a23 = this->at(2,3);
a30 = this->at(3,0);
a31 = this->at(3,1);
a32 = this->at(3,2);
a33 = this->at(3,3);
det = a03*a12*a21*a30 -
a02*a13*a21*a30 -
a03*a11*a22*a30 +
a01*a13*a22*a30 +
a02*a11*a23*a30 -
a01*a12*a23*a30 -
a03*a12*a20*a31 +
a02*a13*a20*a31 +
a03*a10*a22*a31 -
a00*a13*a22*a31 -
a02*a10*a23*a31 +
a00*a12*a23*a31 +
a03*a11*a20*a32 -
a01*a13*a20*a32 -
a03*a10*a21*a32 +
a00*a13*a21*a32 +
a01*a10*a23*a32 -
a00*a11*a23*a32 -
a02*a11*a20*a33 +
a01*a12*a20*a33 +
a02*a10*a21*a33 -
a00*a12*a21*a33 -
a01*a10*a22*a33 +
a00*a11*a22*a33;
return det;
}
__host__ __device__ cumat4f minverse(cumat4f ma)
{
cumat4f mb;
float a00,a01,a02,a03;
float a10,a11,a12,a13;
float a20,a21,a22,a23;
float a30,a31,a32,a33;
float b00,b01,b02,b03;
float b10,b11,b12,b13;
float b20,b21,b22,b23;
float b30,b31,b32,b33;
float det = 0.0;
a00 = ma.at(0,0);
a01 = ma.at(0,1);
a02 = ma.at(0,2);
a03 = ma.at(0,3);
a10 = ma.at(1,0);
a11 = ma.at(1,1);
a12 = ma.at(1,2);
a13 = ma.at(1,3);
a20 = ma.at(2,0);
a21 = ma.at(2,1);
a22 = ma.at(2,2);
a23 = ma.at(2,3);
a30 = ma.at(3,0);
a31 = ma.at(3,1);
a32 = ma.at(3,2);
a33 = ma.at(3,3);
det = a03*a12*a21*a30 -
a02*a13*a21*a30 -
a03*a11*a22*a30 +
a01*a13*a22*a30 +
a02*a11*a23*a30 -
a01*a12*a23*a30 -
a03*a12*a20*a31 +
a02*a13*a20*a31 +
a03*a10*a22*a31 -
a00*a13*a22*a31 -
a02*a10*a23*a31 +
a00*a12*a23*a31 +
a03*a11*a20*a32 -
a01*a13*a20*a32 -
a03*a10*a21*a32 +
a00*a13*a21*a32 +
a01*a10*a23*a32 -
a00*a11*a23*a32 -
a02*a11*a20*a33 +
a01*a12*a20*a33 +
a02*a10*a21*a33 -
a00*a12*a21*a33 -
a01*a10*a22*a33 +
a00*a11*a22*a33;
if(det*det>1.0E-30)
{
b00 = -a13*a22*a31 + a12*a23*a31 + a13*a21*a32 - a11*a23*a32 - a12*a21*a33 + a11*a22*a33;
b01 = a03*a22*a31 - a02*a23*a31 - a03*a21*a32 + a01*a23*a32 + a02*a21*a33 - a01*a22*a33;
b02 = -a03*a12*a31 + a02*a13*a31 + a03*a11*a32 - a01*a13*a32 - a02*a11*a33 + a01*a12*a33;
b03 = a03*a12*a21 - a02*a13*a21 - a03*a11*a22 + a01*a13*a22 + a02*a11*a23 - a01*a12*a23;
b10 = a13*a22*a30 - a12*a23*a30 - a13*a20*a32 + a10*a23*a32 + a12*a20*a33 - a10*a22*a33;
b11 = -a03*a22*a30 + a02*a23*a30 + a03*a20*a32 - a00*a23*a32 - a02*a20*a33 + a00*a22*a33;
b12 = a03*a12*a30 - a02*a13*a30 - a03*a10*a32 + a00*a13*a32 + a02*a10*a33 - a00*a12*a33;
b13 = -a03*a12*a20 + a02*a13*a20 + a03*a10*a22 - a00*a13*a22 - a02*a10*a23 + a00*a12*a23;
b20 = -a13*a21*a30 + a11*a23*a30 + a13*a20*a31 - a10*a23*a31 - a11*a20*a33 + a10*a21*a33;
b21 = a03*a21*a30 - a01*a23*a30 - a03*a20*a31 + a00*a23*a31 + a01*a20*a33 - a00*a21*a33;
b22 = -a03*a11*a30 + a01*a13*a30 + a03*a10*a31 - a00*a13*a31 - a01*a10*a33 + a00*a11*a33;
b23 = a03*a11*a20 - a01*a13*a20 - a03*a10*a21 + a00*a13*a21 + a01*a10*a23 - a00*a11*a23;
b30 = a12*a21*a30 - a11*a22*a30 - a12*a20*a31 + a10*a22*a31 + a11*a20*a32 - a10*a21*a32;
b31 = -a02*a21*a30 + a01*a22*a30 + a02*a20*a31 - a00*a22*a31 - a01*a20*a32 + a00*a21*a32;
b32 = a02*a11*a30 - a01*a12*a30 - a02*a10*a31 + a00*a12*a31 + a01*a10*a32 - a00*a11*a32;
b33 = -a02*a11*a20 + a01*a12*a20 + a02*a10*a21 - a00*a12*a21 - a01*a10*a22 + a00*a11*a22;
b00 = b00/det;
b01 = b01/det;
b02 = b02/det;
b03 = b03/det;
b10 = b10/det;
b11 = b11/det;
b12 = b12/det;
b13 = b13/det;
b20 = b20/det;
b21 = b21/det;
b22 = b22/det;
b23 = b23/det;
b30 = b30/det;
b31 = b31/det;
b32 = b32/det;
b33 = b33/det;
mb.at(0,0) = b00;
mb.at(0,1) = b01;
mb.at(0,2) = b02;
mb.at(0,3) = b03;
mb.at(1,0) = b10;
mb.at(1,1) = b11;
mb.at(1,2) = b12;
mb.at(1,3) = b13;
mb.at(2,0) = b20;
mb.at(2,1) = b21;
mb.at(2,2) = b22;
mb.at(2,3) = b23;
mb.at(3,0) = b30;
mb.at(3,1) = b31;
mb.at(3,2) = b32;
mb.at(3,3) = b33;
}
//this was STUPID. Gaah. Computer algebra system saves the day? I'd be surprised if this didn't end up *slower* than gaussian elimination. Don't do this again!
return mb;
}
__host__ __device__ cumat4f cumat4f::inverse()
{
return minverse(*this);
}
__host__ __device__ float cuvect4f_dot(cuvect4f a, cuvect4f b)
{
float ret = 0.0;
ret = a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w;
return ret;
}
__host__ __device__ float cuvect4f_norm(cuvect4f a)
{
float ret = 0.0;
ret = ::sqrtf(cuvect4f_dot(a,a));
return ret;
}
__host__ __device__ cuvect4f cuvect4f_normalize(cuvect4f a)
{
cuvect4f ret = cuvect4f(0.0f,0.0f,0.0f,0.0f);
float nrm = cuvect4f_norm(a);
if(nrm>0.0)
ret = a/nrm;
return ret;
}
__host__ __device__ cuvect4f cuvect4f_proj(cuvect4f a, cuvect4f b)
{
cuvect4f ret;
cuvect4f bn = cuvect4f_normalize(b);
float d = cuvect4f_dot(a,bn);
ret = bn*d;
return ret;
}
}; //namespace amscuda

@ -0,0 +1,27 @@
#include <amsculib2/amsculib2.hpp>
//using namespace ams;
using namespace amscuda;
int main(int argc, char* argv[])
{
printf("AMSCULIB2: Cuda Library Tests.\n");
//test_amscuarray_1();
//test_amscumath1();
//cmp::test_cucomp64_1();
//cmp::test_cucomp128_1();
//test_amscuarray_2();
//test_dprg64();
//printf("\n");
//test_dprg32();
//test_dbuff_rand_dpr32();
test_amscurarray1();
return 0;
}

Binary file not shown.

@ -0,0 +1,76 @@
#!/usr/bin/python3
import os,sys,math
import numpy as np
import matplotlib.pyplot as plt
#################
## Subroutines ##
#################
def binload_float_ndarray(fp):
arr = np.zeros((0),dtype=np.float32,order='F')
qb = fp.read(4)
Nd = np.frombuffer(qb,dtype=np.int32,count=1)[0]
shp = np.zeros((Nd),dtype=np.int32)
piprod = 1
for I in range(0,Nd):
qb = fp.read(4)
shp[I] = np.frombuffer(qb,dtype=np.int32,count=1)[0]
piprod = piprod*shp[I]
qb = fp.read(4*piprod)
arr = np.frombuffer(qb,dtype=np.float32,count=piprod)
arr = arr.reshape(shp)
return arr;
def binsave_float_ndarray(fp,arr):
return
#################
## Main Script ##
#################
def test_1():
fname = "./test_scripts/test_dbuff_rand_dpr32.bin"
try:
fp = open(fname,"rb")
except:
print("Could not open {} for reading".format(fname))
return
arr1 = binload_float_ndarray(fp)
arr2 = binload_float_ndarray(fp)
fp.close()
plt.subplot(2,2,1)
plt.imshow(arr1)
plt.subplot(2,2,2)
plt.imshow(arr2)
plt.subplot(2,2,3)
plt.hist(arr1.flatten(),bins=100)
plt.subplot(2,2,4)
plt.hist(arr2.flatten(),bins=100)
plt.show()
print("{} {}".format(np.mean(arr2),np.std(arr2)))
return
if(__name__=="__main__"):
test_1()
exit(0)
Loading…
Cancel
Save