diff --git a/LICENSE b/LICENSE index f6cb221..aa6091e 100644 --- a/LICENSE +++ b/LICENSE @@ -1 +1 @@ -Copyright Aaron M. Schinder, 2023 +Copyright Aaron M. Schinder, 2023 diff --git a/amsculib2.code-workspace b/amsculib2.code-workspace new file mode 100644 index 0000000..362d7c2 --- /dev/null +++ b/amsculib2.code-workspace @@ -0,0 +1,7 @@ +{ + "folders": [ + { + "path": "." + } + ] +} \ No newline at end of file diff --git a/backup.sh b/backup.sh new file mode 100644 index 0000000..4293c09 --- /dev/null +++ b/backup.sh @@ -0,0 +1,8 @@ +#!/bin/bash + + +rm ./test_scripts/*.bin + +tar --exclude='./data' -czvf ../amsculib2.tar.gz ./* + +scp ../amsculib2.tar.gz aschinder@amssolarempire.com:~/workspace/projects diff --git a/build/__pycache__/amsbuildlib4.cpython-310.pyc b/build/__pycache__/amsbuildlib4.cpython-310.pyc new file mode 100644 index 0000000..521d819 Binary files /dev/null and b/build/__pycache__/amsbuildlib4.cpython-310.pyc differ diff --git a/build/__pycache__/amsbuildlib4.cpython-311.pyc b/build/__pycache__/amsbuildlib4.cpython-311.pyc new file mode 100644 index 0000000..a3642e0 Binary files /dev/null and b/build/__pycache__/amsbuildlib4.cpython-311.pyc differ diff --git a/build/__pycache__/amsbuildlib4.cpython-312.pyc b/build/__pycache__/amsbuildlib4.cpython-312.pyc new file mode 100644 index 0000000..c8af139 Binary files /dev/null and b/build/__pycache__/amsbuildlib4.cpython-312.pyc differ diff --git a/build/__pycache__/amsbuildlib4.cpython-39.pyc b/build/__pycache__/amsbuildlib4.cpython-39.pyc new file mode 100644 index 0000000..4620db4 Binary files /dev/null and b/build/__pycache__/amsbuildlib4.cpython-39.pyc differ diff --git a/build/amsbuildlib4.py b/build/amsbuildlib4.py new file mode 100644 index 0000000..6275726 --- /dev/null +++ b/build/amsbuildlib4.py @@ -0,0 +1,813 @@ +#!/usr/bin/python3 + +import os,sys,math +import subprocess + +""" +Copyright Aaron M. Schinder, 2011 - MIT/BSD License + +This script contains a bunch of helper functions for generating simple, imperative, hopefully +transparent build scripts using the python language (and nothing else). + +I just want the script to do the compiling and linking operations I want it to do in the order +I want it to do it in, finding every relevant source file. + +That's it. That's what I want in a build system. +""" + +def flist(pth,**kwargs): + """ + flist - list all files in a given directory pth + optional arguments: + recurse - (T/F): Whether to recursively search for files in directory tree + exts - (list): A list of file extensions to search for, otherwise all files + normpath (T/F): whether to normalize path variables after + filelist = flist(pth,**kwargs): + """ + flst = [] + if(not('recurse' in kwargs)): + recurse_ = False + else: + recurse_ = kwargs['recurse'] + if(not('exts' in kwargs)): + filterexts_ = False + else: + filterexts_ = True + exts = kwargs['exts'] + if(not('normpath' in kwargs)): + normpath_ = True + else: + normpath_ = kwargs['normpath'] + if(not('linuxpath' in kwargs)): + linuxpath_ = False + else: + linuxpath_ = kwargs['linuxpath'] + if(not('followlinks' in kwargs)): + followlinks_ = False + else: + followlinks_ = kwargs['followlinks'] + + dirlist = [] + rawlist = os.listdir(pth) + + for F in rawlist: + F2 = os.path.join(pth,F) + if(os.path.isdir(F2)): + b = (followlinks_) or ((not followlinks_) and not(os.path.islink(F2))) + if(b): + if((F2!=".")&(F2!="..")): + dirlist.append(F2) + elif(os.path.isfile(F2)): + flst.append(F2) + + #Recurse through directories + if(recurse_): + for D in dirlist: + lst = flist(D,**kwargs) + for L in lst: + flst.append(L) + + #Postprocess: + #Filter out all extensions except the selected ext list + if(filterexts_): + flst = filterexts(flst,exts) + + #Normalize filename path according to os + if(normpath_): + flst2 = list(flst) + for I in range(0,len(flst2)): + flst[I] = os.path.normpath(flst2[I]) + + #If linuxpath, convert all \\ to / + #if(linuxpath_): + # flst2 = list(flst) + # for I in range(0,len(flst2)): + # flst[I] = linuxpath(flst2[I]) + + return flst + + +def filterexts(flst,exts): + """ + Filters by extensions in a list of files + flst = def filterexts(flst,exts): + """ + flst2 = [] + if(isinstance(exts,str)): + exts = list([exts]) + for F in flst: + b = False + for ext in exts: + if(ext[0]!='.'): + ext = '.'+ext + F2 = os.path.splitext(F) + if(len(F2)>=2): + ex = F2[1] + if(len(ex)>0): + if(ex[0]!='.'): + ex = '.'+ex + if(ex==ext): + b = True + if(b): + flst2.append(F) + + return flst2 + +#Find a file fname, starting in pth and recursing +#Used for finding library files to link +def findfile(fname,pth,**kwargs): + fullfname = "" + flst = flist(pth,recurse=True) + for F in flst: + F2 = os.path.split(F)[1] + if(F2 == fname): + fullfname = F + + return fullfname + +def replaceext(fname,ext): + fname2 = "" + if(len(ext)>0): + if(ext[0]!='.'): + ext = '.'+ext + fname2 = os.path.splitext(fname)[0]+ext + else: + fname2 = os.path.splitext(fname)[0] + return fname2 + +def replaceexts(fnamelist,ext): + """Takes a list of filenames and returns a list with the extensions replaced by ext """ + fname2list = [] + for F in fnamelist: + F2 = replaceext(F,ext) + fname2list.append(F2) + return fname2list + +def except_contains(lst1,exc): + """ + Takes a list of file names lst1, and removes filenams that match the + list of exceptions exc. Returns a list without the exceptions. + """ + lst2 = [] + for item in lst1: + b = 1 + for item2 in exc: + fsplit = os.path.split(item) + fn = fsplit[len(fsplit)-1] + if(fn==item2): + b = 0 + break + if(b==1): + lst2.append(item) + return lst2 + + +def list_to_sss(lst): + """List of strings to space-seperated-string""" + lout = "" + for I in range(0,len(lst)-1): + lout = lout + lst[I] + " " + if(len(lst)>0): + lout = lout + lst[len(lst)-1] + return lout + +########################## +##System Call Procedures## +########################## + +def callproc(cmd, **kwargs): + if(not('logfile' in kwargs)): + use_lf = False + else: + logfile = kwargs['logfile'] + if(logfile!=""): + fp = open(kwargs['logfile'],'a+') + use_lf = True + else: + use_lf = False + + if(not('echo' in kwargs)): + echo = True + else: + echo = kwargs['echo'] + + if(echo): + print(cmd) + + #encoding/deconding to/from bytes is necessary to use the subprocess command + #in python3.7 + #However, only do this in linux + if(sys.platform!='win32'): + cmd2 = cmd.encode(encoding='utf-8') + else: + cmd2 = cmd + proc = subprocess.Popen(cmd2,stderr = subprocess.STDOUT, stdout=subprocess.PIPE, shell=True) + (out, err) = proc.communicate() + + out = out.decode(encoding='utf-8') + + if(echo): + print(out) + #print(err); + if(use_lf): + fp.writelines(cmd+'\n') + fp.writelines(out+'\n') + + if(use_lf): + fp.close() + + +############################################# +## Compiler, Archive, and Linker Functions ## +############################################# + +#MSVC compiler wrapper +def msvc_compile(compilername, srcfile, **kwargs): + + if(not('include' in kwargs)): + include = '' + else: + include = kwargs['include'] + if(isinstance(include,list)): + include = list_to_sss(include) + + if(not('flags' in kwargs)): + flags = '' + else: + flags = kwargs['flags'] + if(isinstance(flags,list)): + flags = list_to_sss(flags) + + if(not('objext' in kwargs)): + objext = '.obj' + else: + objext = kwargs['objext'] + + if(not('srcfileflag' in kwargs)): + srcfileflag = '/c' + else: + srcfileflag = kwargs['srcfileflag'] + + if(not('outfileflag' in kwargs)): + outfileflag = '/Fo:' + else: + outfileflag = kwargs['outfileflag'] + if(not('logfile' in kwargs)): + logfile = "" + else: + logfile = kwargs['logfile'] + + outfile = replaceext(srcfile,objext) + ln = compilername+" "+flags+" "+" "+srcfileflag+" "+srcfile+" "+outfileflag+'"'+outfile+'"' + ln = ln + " " + include + + callproc(ln,echo=True,logfile=logfile) + + return + +#MSVC compiler wrapper +def msvc_compile_list(compiler,srclist,**kwargs): + for S in srclist: + msvc_compile(compiler,S,**kwargs) + return + +#gnu-style compiler compile: Should work with gcc, g++, gfortran +def gs_compile(compiler,srcfile,**kwargs): + if(not('include' in kwargs)): + include = '' + else: + include = kwargs['include'] + if(isinstance(include,list)): + include = list_to_sss(include) + + if(not('flags' in kwargs)): + flags = '' + else: + flags = kwargs['flags'] + if(isinstance(flags,list)): + flags = list_to_sss(flags) + + if(not('objext' in kwargs)): + objext = '.o' + else: + objext = kwargs['objext'] + + if(not('srcfileflag' in kwargs)): + srcfileflag = '-c' + else: + srcfileflag = kwargs['srcfileflag'] + + if(not('outfileflag' in kwargs)): + outfileflag = '-o' + else: + outfileflag = kwargs['outfileflag'] + + if(not('logfile' in kwargs)): + logfile = "" + else: + logfile = kwargs['logfile'] + + if(not('smartcompile' in kwargs)): + _smartcompile = True + else: + _smartcompile = kwargs['smartcompile'] + + #Do I want to make this thing this general? + + # if(not(_smartcompile) or smartcompile(srcfile,objext)): + # outfile = replaceext(srcfile,objext) + # ln = compiler+" "+flags+" " + outfileflag+" "+outfile+" "+srcfileflag+" "+srcfile + # ln = ln + " " + include + + # callproc(ln,echo=True,logfile=logfile) + + outfile = replaceext(srcfile,objext) + ln = compiler+" "+flags+" " + outfileflag+" "+outfile+" "+srcfileflag+" "+srcfile + ln = ln + " " + include + + callproc(ln,echo=True,logfile=logfile) + + return + +def gs_compile_list(compiler,srclist,**kwargs): + for S in srclist: + gs_compile(compiler,S,**kwargs) + return + +def gs_compile_all(compiler,srcdir,srcexts,**kwargs): + if(not('recurse' in kwargs)): + recurse = True + else: + recurse = kwargs['recurse'] + + srcfils = flist(srcdir,exts=srcexts,recurse=recurse) + + for S in srcfils: + gs_compile(compiler,S,**kwargs) + + return + +def gs_link_all(linker,srcpath,target,**kwargs): + + if(not('objext' in kwargs)): + objext = '.o' + else: + objext = kwargs['objext'] + + if(not('recurse' in kwargs)): + recurse = True + else: + recurse = kwargs['recurse'] + + + objfils = flist(srcpath,exts=objext,recurse=recurse) + oflst = list_to_sss(objfils) + + gs_link_list(linker,oflst,target,**kwargs) + + return + +def gs_link_list(linker,objlist,target,**kwargs): + + if(not('objext' in kwargs)): + objext = '.o' + else: + objext = kwargs['objext'] + + if(not('libdir' in kwargs)): + libdir = '' + else: + libdir = kwargs['libdir'] + + if(not('staticlibs' in kwargs)): + staticlibs = '' + else: + staticlibs = kwargs['staticlibs'] + + if(not('libflags' in kwargs)): + libflags = '' + else: + libflags = kwargs['libflags'] + + if(not('linkerflags' in kwargs)): + linkerflags = '' + else: + linkerflags = kwargs['linkerflags'] + + if(not('recurse' in kwargs)): + recurse = True + else: + recurse = kwargs['recurse'] + + if(not('logfile' in kwargs)): + logfile = '' + else: + logfile = kwargs['logfile'] + + ln = linker+" -o "+target+" "+libdir + ln = ln+" "+objlist+" "+staticlibs+" "+libflags+" "+linkerflags + + callproc(ln,logfile=logfile) + return + +def msvc_link_list(objlist,target,**kwargs): + + linker = 'link' + + if(not('objext' in kwargs)): + objext = '.obj' + else: + objext = kwargs['objext'] + + if(not('libdir' in kwargs)): + libdir = '' + else: + libdir = kwargs['libdir'] + + if(not('staticlibs' in kwargs)): + staticlibs = '' + else: + staticlibs = kwargs['staticlibs'] + + if(not('libflags' in kwargs)): + libflags = '' + else: + libflags = kwargs['libflags'] + + if(not('linkerflags' in kwargs)): + linkerflags = '' + else: + linkerflags = kwargs['linkerflags'] + + if(not('recurse' in kwargs)): + recurse = True + else: + recurse = kwargs['recurse'] + + if(not('logfile' in kwargs)): + logfile = '' + else: + logfile = kwargs['logfile'] + + ln = linker+" "+libdir + ln = ln+" "+objlist+" "+staticlibs+" "+linkerflags + ln = ln+" /out:"+target+" "+libflags + + callproc(ln,logfile=logfile) + + return + +def ar_all(srcpath,arname,**kwargs): + if(not('recurse' in kwargs)): + recurse = True + else: + recurse = kwargs['recurse'] + if(not('objext' in kwargs)): + objext = '.o' + else: + objext = kwargs['objext'] + + objlist = flist(srcpath,exts=objext,recurse=recurse) + ar_list(objlist,arname,**kwargs) + + return + +def msvc_lib_list(objlist,arname,**kwargs): + objlist2 = list_to_sss(objlist) + + ln = "lib "+objlist2+" /out:"+arname + callproc(ln) + + return + +def ar_list(objlist,arname,**kwargs): + objlist2 = list_to_sss(objlist) + + ln = "ar cr "+ arname+" "+objlist2 + callproc(ln) + + return + +def ar_add_list(objlist,arname,**kwargs): + objlist2 = list_to_sss(objlist) + + ln = "ar t "+arname+" "+objlist2 + callproc(ln) + return + +##################################### +## Incremental Compilation Library ## +##################################### + +#silently read lines from a text file if exists +def readtextlines(fname): + txtlns = [] + + if(not os.path.isfile(fname)): + return txtlns + + try: + fp = open(fname,"r") + except: + return txtlns + + ln = " " + while(ln!=""): + ln = fp.readline() + txtlns.append(ln) + + fp.close() + + return txtlns + +def getincludefnfrage(includeline): + + fnfrag = "" + I1 = -1 + I2 = -1 + + for I in range(0,len(includeline)): + if(I1<0 and (includeline[I]=='<' or includeline[I]=='"')): + I1 = I + if(I1>=0 and (includeline[I]=='>' or includeline[I]=='"')): + I2 = I + break + if(I1>=0 and I2>=0): + fnfrag = includeline[I1+1:I2] + + return fnfrag + +#Returns the name of the source file fname (if it exists) +#and all included filenames +def getsrcandincludes(fname, incdirs): + + flist = [] + if(os.path.isfile(fname)): + flist.append(fname) + + Ilist = 0 + while(Ilist=0): + fnfrag = getincludefnfrage(lns[J]) + for K in range(0,len(incdirs)): + tfn = os.path.join(incdirs[K],fnfrag) + if(os.path.isfile(tfn)): + flist.append(tfn) + break + + Ilist = Ilist + 1 + + return flist + +#Returns the name of the object file associated with the source file +#within the object store folder (if it exists) +def getobjfile(fname,objstore,objext = ".o"): + + fret = "" + f1 = os.path.split(fname)[1] + f2 = f1 + while(os.path.splitext(f2)[1]!=""): + f2 = os.path.splitext(f2)[0] + objext = objext.strip('.') + f3 = os.path.join(objstore,"{}.{}".format(f2,objext)) + if(os.path.exists(f3)): + fret = f3 + + return fret + +def getsrctimes(fname, incdirs): + + ftimes = [] + flst = getsrcandincludes(fname, incdirs) + for I in range(0,len(flst)): + f = flst[I] + mt = os.path.getmtime(f) + ftimes.append(mt) + + return ftimes + +def getobjtime(fname,objstore,objext=".o"): + ret = -1 + fret = getobjfile(fname,objstore,objext) + if(fret!=""): + ret = os.path.getmtime(fret) + + return ret + +#Decide whether or not to compile source file +def decidecompile(fname,**kwargs): + ret = True + + if(not os.path.isfile(fname)): + ret = False + return ret + + ##unpack kwargs + if("searchincdirs" in kwargs): + incdirs = kwargs["searchincdirs"] + else: + incdirs = ["./include"] + + if("objext" in kwargs): + objext = kwargs["objext"] + else: + objext = ".o" + if("objstore" in kwargs): + objstore = kwargs["objstore"] + else: + objstore = "./objstore" + + + srclist = getsrcandincludes(fname,incdirs) + srctlist = getsrctimes(fname,incdirs) + obj = getobjfile(fname,objstore,objext) + objt = getobjtime(fname,objstore,objext) + + if(obj!=""): + ret = False + for I in range(0,len(srctlist)): + if(srctlist[I]>objt): + ret = True + break + + return ret + +def gs_incremental_compile(compiler,srcfile,**kwargs): + + if(not('include' in kwargs)): + include = '' + else: + include = kwargs['include'] + if(isinstance(include,list)): + include = list_to_sss(include) + if(not('flags' in kwargs)): + flags = '' + else: + flags = kwargs['flags'] + if(isinstance(flags,list)): + flags = list_to_sss(flags) + if(not('objext' in kwargs)): + objext = '.o' + else: + objext = kwargs['objext'] + if(not('srcfileflag' in kwargs)): + srcfileflag = '-c' + else: + srcfileflag = kwargs['srcfileflag'] + if(not('outfileflag' in kwargs)): + outfileflag = '-o' + else: + outfileflag = kwargs['outfileflag'] + + if(not('logfile' in kwargs)): + logfile = "" + else: + logfile = kwargs['logfile'] + if(not('smartcompile' in kwargs)): + _smartcompile = True + else: + _smartcompile = kwargs['smartcompile'] + + #incrementalcompile + if("searchincdirs" in kwargs): + incdirs = kwargs["searchincdirs"] + else: + incdirs = ["./include"] + + if("objstore" in kwargs): + objstore = kwargs["objstore"] + else: + objstore = "./objstore" + + #Do I want to make this thing this general? + + docompile = decidecompile(srcfile,**kwargs) + + if(docompile): + f1 = os.path.split(srcfile)[1] + f2 = f1 + while(os.path.splitext(f2)[1]!=""): + f2 = os.path.splitext(f2)[0] + outfile = os.path.join(objstore,"{}{}".format(f2,objext)) + + ln = compiler+" "+flags+" " + outfileflag+" "+outfile+" "+srcfileflag+" "+srcfile + ln = ln + " " + include + + callproc(ln,echo=True,logfile=logfile) + + return + +def msvc_incremental_compile(compiler,srcfile,**kwargs): + + if(not('include' in kwargs)): + include = '' + else: + include = kwargs['include'] + if(isinstance(include,list)): + include = list_to_sss(include) + if(not('flags' in kwargs)): + flags = '' + else: + flags = kwargs['flags'] + if(isinstance(flags,list)): + flags = list_to_sss(flags) + if(not('objext' in kwargs)): + objext = '.obj' + else: + objext = kwargs['objext'] + if(not('srcfileflag' in kwargs)): + srcfileflag = '/c' + else: + srcfileflag = kwargs['srcfileflag'] + if(not('outfileflag' in kwargs)): + outfileflag = '/Fo' + else: + outfileflag = kwargs['outfileflag'] + + if(not('logfile' in kwargs)): + logfile = "" + else: + logfile = kwargs['logfile'] + + #incrementalcompile + if("searchincdirs" in kwargs): + incdirs = kwargs["searchincdirs"] + else: + incdirs = ["./include"] + + if("objstore" in kwargs): + objstore = kwargs["objstore"] + else: + objstore = "./objstore" + + #Do I want to make this thing this general? + + docompile = decidecompile(srcfile,**kwargs) + + if(docompile): + f1 = os.path.split(srcfile)[1] + f2 = f1 + while(os.path.splitext(f2)[1]!=""): + f2 = os.path.splitext(f2)[0] + outfile = os.path.join(objstore,"{}{}".format(f2,objext)) + outfile = os.path.normpath(outfile) + + ln = compiler+" "+flags+" "+srcfileflag+" "+srcfile+" "+ outfileflag+'"'+outfile+'"' + ln = ln + " " + include + + callproc(ln,echo=True,logfile=logfile) + + return + + +def gs_incremental_compile_list(compiler,srclist,**kwargs): + + for s in srclist: + gs_incremental_compile(compiler,s,**kwargs) + + return + +def msvc_incremental_compile_list(compiler,srclist,**kwargs): + + for s in srclist: + msvc_incremental_compile(compiler,s,**kwargs) + + return + +####################### +## Main Script Tests ## +####################### + +def testtimes(args): + if(len(args)>=2): + flist = getsrcandincludes(args[1],["./include"]) + ftlist = getsrctimes(args[1],["./include"]) + for I in range(0,len(flist)): + print("{}\t\t{}".format(flist[I],ftlist[I])) + + print("associated obj file:") + fobj = getobjfile(args[1],"./objstore") + ftobj = getobjtime(args[1],"./objstore") + if(fobj!=""): + print("{}\t\t{}".format(fobj,ftobj)) + else: + print("none found") + + cflag = decidecompile(args[1]) + print("compile? : {}".format(cflag)) + + + return + +# if(__name__ == "__main__"): + +# args = sys.argv +# testtimes(args) + + + + \ No newline at end of file diff --git a/build/make.linux64.lib.py b/build/make.linux64.lib.py new file mode 100644 index 0000000..07bb2fa --- /dev/null +++ b/build/make.linux64.lib.py @@ -0,0 +1,58 @@ +#!/usr/bin/python3 + +import os,sys,math +import subprocess +import shutil +from shutil import copytree + +from amsbuildlib4 import * + +libname = "amsculib2.linux64" #static library name to generate +binname = "test" #create this executable when compiling main.c or main.cpp +commondir = "../../linux64" #common directory to pul libraries and includes from +depdir = "./dependencies/linux64" #local pre-compiled dependency libraries and their includes +installdir = "../../linux64" #directory to install to when finished +builddir = "./build_linux64" + +doinstall = True #copies the build_output to the install dir when finished +cc = "nvcc" #compiler +cflags = "-dc --compiler-options '-fPIC -O3'" +libraries = "-l{}".format(libname) +libdirs = "-L{} -L{}/lib -L{}/lib".format(builddir,commondir,depdir) +linkerflags = " -Xlinker=-rpath,." +srcexts = [".c",".cpp",".cu"] +binsrc = ["main.c","main.cpp", "main.cu"] #ignore these files when compiling the static library + +#keyword list to control the compilers/linkers +kwargs = dict() +include = "-I./include -I{}/include -I{}/include".format(commondir, depdir) +kwargs["include"] = include +kwargs["flags"] = cflags +kwargs["libdir"] = libdirs +kwargs["libflags"] = libraries +kwargs["linkerflags"] = linkerflags +kwargs["recurse"] = True +kwargs["objstore"] = "{}/objstore".format(builddir) +kwargs["searchincdirs"] = "./include" + +#Find all source files, except the main project files +srcfiles = flist('./src',exts = srcexts, recurse=True) +srcfiles = except_contains(srcfiles,binsrc) + +#compile all the source files in the list +#gs_compile_list(cc,files,**kwargs) +gs_incremental_compile_list(cc,srcfiles,**kwargs) + +#archive all the source files into a static library +objlist = flist(kwargs['objstore'],exts='.o',recurse=True) +ar_list(objlist,'{}/lib{}.a'.format(builddir,libname)) + +if(doinstall): + #Push any libraries to the common lib folder + shutil.copy( + '{}/lib{}.a'.format(builddir,libname), + "{}/lib".format(installdir) + ) + + #Copy include files to the common include folder + copytree('./include/',installdir+'/include/',dirs_exist_ok=True) diff --git a/build/make.linux64.test.py b/build/make.linux64.test.py new file mode 100644 index 0000000..207b41b --- /dev/null +++ b/build/make.linux64.test.py @@ -0,0 +1,49 @@ +#!/usr/bin/python3 + +import os,sys,math +import subprocess +import shutil +from shutil import copytree + +from amsbuildlib4 import * + +libname = "amsculib2.linux64" #static library name to generate +binname = "test" #create this executable when compiling main.c or main.cpp +commondir = "../../linux64" #common directory to pul libraries and includes from +depdir = "./dependencies/linux64" #local pre-compiled dependency libraries and their includes +installdir = "../../linux64" #directory to install to when finished +builddir = "./build_linux64" + +doinstall = True #copies the build_output to the install dir when finished +cc = "nvcc" #compiler +cflags = "-dc --compiler-options '-fPIC -O3'" +libraries = "-l{}".format(libname) +libdirs = "-L{} -L{}/lib -L{}/lib".format(builddir,commondir,depdir) +linkerflags = " -Xlinker=-rpath,." +srcexts = [".c",".cpp",".cu"] +binsrc = ["main.c","main.cpp", "main.cu"] #ignore these files when compiling the static library + +#keyword list to control the compilers/linkers +kwargs = dict() +include = "-I./include -I{}/include -I{}/include".format(commondir, depdir) +kwargs["include"] = include +kwargs["flags"] = cflags +kwargs["libdir"] = libdirs +kwargs["libflags"] = libraries +kwargs["linkerflags"] = linkerflags +kwargs["recurse"] = True +kwargs["objstore"] = "{}/objstore".format(builddir) +kwargs["searchincdirs"] = "./include" + +#Pull required binary dynamic libraries to the bin folder +#shutil.copy('{}/lib/libcamsimg3.linux64.so'.format(commondir),builddir); +#shutil.copy('{}/lib/libamsimg.dll'.format(commondir),builddir); +#shutil.copy('{}/lib/glew32.dll','./bin_winx64'); + +#Designate source files for main test program +fsrc = ['./src/main.cu'] +fobj = replaceexts(fsrc,'.o') + +#Compile test programs +gs_compile_list(cc,fsrc,**kwargs) +gs_link_list(cc,list_to_sss(fobj),'{}/{}'.format(builddir,binname),**kwargs) diff --git a/build/make.msvc64.lib.py b/build/make.msvc64.lib.py new file mode 100644 index 0000000..19095fa --- /dev/null +++ b/build/make.msvc64.lib.py @@ -0,0 +1,61 @@ +#!/usr/bin/python3 + +import os,sys,math +import subprocess +import shutil +from shutil import copytree + +from amsbuildlib4 import * + +libname = "amsculib2.msvc64" #static library name to generate +binname = "test" #create this executable when compiling main.c or main.cpp +commondir = "../../winx64" #common directory to pul libraries and includes from +depdir = "./dependencies/winx64" #local pre-compiled dependency libraries and their includes +installdir = "../../winx64" #directory to install to when finished +builddir = "./build_msvc64" + +doinstall = True #copies the build_output to the install dir when finished +cc = "nvcc" #compiler +cflags = "-dc --compiler-options '-fPIC -O3'" +libraries = "-l{}".format(libname) +libdirs = "-L{} -L{}/lib -L{}/lib".format(builddir,commondir,depdir) +linkerflags = " -Xlinker=-rpath,." +srcexts = [".c",".cpp",".cu"] +binsrc = ["main.c","main.cpp","main.cu"] #ignore these files when compiling the static library + + +#keyword list to control the compilers/linkers +kwargs = dict() +include = "-I./include -I{}/include -I{}/include".format(commondir, depdir) +kwargs["include"] = include +kwargs["flags"] = cflags +kwargs["libdir"] = libdirs +kwargs["libflags"] = libraries +kwargs["linkerflags"] = linkerflags +kwargs["recurse"] = True +kwargs["objstore"] = "{}/objstore".format(builddir) +kwargs["searchincdirs"] = "./include" +kwargs["objext"] = ".obj" + + +#Find all source files, except the main project files +srcfiles = flist('./src',exts = srcexts, recurse=True) +srcfiles = except_contains(srcfiles,binsrc) + +#compile all the source files in the list +#gs_compile_list(cc,files,**kwargs) +msvc_incremental_compile_list(cc,srcfiles,**kwargs) + +#archive all the source files into a static library +objlist = flist(kwargs['objstore'],exts='.obj',recurse=True) +msvc_lib_list(objlist,'{}/lib{}.lib'.format(builddir,libname)) + +if(doinstall): + #Push any libraries to the common lib folder + shutil.copy( + '{}/lib{}.lib'.format(builddir,libname), + "{}/lib".format(installdir) + ) + + #Copy include files to the common include folder + copytree('./include/',installdir+'/include/',dirs_exist_ok=True) diff --git a/build/make.msvc64.test.py b/build/make.msvc64.test.py new file mode 100644 index 0000000..20dc30f --- /dev/null +++ b/build/make.msvc64.test.py @@ -0,0 +1,49 @@ +#!/usr/bin/python3 + +import os,sys,math +import subprocess +import shutil +from shutil import copytree + +from amsbuildlib4 import * + +libname = "amsculib2.msvc64" #static library name to generate +binname = "test.exe" #create this executable when compiling main.c or main.cpp +commondir = "../../winx64" #common directory to pul libraries and includes from +depdir = "./dependencies/winx64" #local pre-compiled dependency libraries and their includes +installdir = "../../winx64" #directory to install to when finished +builddir = "./build_msvc64" + +doinstall = False #copies the build_output to the install dir when finished +cc = "nvcc" #compiler +cflags = "-dc --compiler-options '-fPIC -O3'" +libraries = "-l{}".format(libname) +libdirs = "-L{} -L{}/lib -L{}/lib".format(builddir,commondir,depdir) +linkerflags = " -Xlinker=-rpath,." +srcexts = [".c",".cpp",".cu"] +binsrc = ["main.c","main.cpp","main.cu"] #ignore these files when compiling the static library + +#keyword list to control the compilers/linkers +kwargs = dict() +include = "-I./include -I{}/include -I{}/include".format(commondir, depdir) +kwargs["include"] = include +kwargs["flags"] = cflags +kwargs["libdir"] = libdirs +kwargs["libflags"] = libraries +kwargs["linkerflags"] = linkerflags +kwargs["recurse"] = True +kwargs["objstore"] = "{}/objstore".format(builddir) +kwargs["searchincdirs"] = "./include" + +#Pull required binary dynamic libraries to the bin folder +#shutil.copy('{}/lib/libcamsimg3.linux64.so'.format(commondir),builddir); +#shutil.copy('{}/lib/libamsimg.dll'.format(commondir),builddir); +#shutil.copy('{}/lib/glew32.dll','./bin_winx64'); + +#Designate source files for main test program +fsrc = ['./src/main.cpp'] +fobj = replaceexts(fsrc,'.obj') + +#Compile test programs +msvc_compile_list(cc,fsrc,**kwargs) +msvc_link_list(list_to_sss(fobj),'{}/{}'.format(builddir,binname),**kwargs) diff --git a/build_linux64/libamsculib2.linux64.a b/build_linux64/libamsculib2.linux64.a new file mode 100644 index 0000000..5f75b23 Binary files /dev/null and b/build_linux64/libamsculib2.linux64.a differ diff --git a/build_linux64/objstore/.placeholder b/build_linux64/objstore/.placeholder new file mode 100644 index 0000000..e69de29 diff --git a/build_linux64/objstore/amscu_comp128.o b/build_linux64/objstore/amscu_comp128.o new file mode 100644 index 0000000..39a52a5 Binary files /dev/null and b/build_linux64/objstore/amscu_comp128.o differ diff --git a/build_linux64/objstore/amscu_comp64.o b/build_linux64/objstore/amscu_comp64.o new file mode 100644 index 0000000..48de632 Binary files /dev/null and b/build_linux64/objstore/amscu_comp64.o differ diff --git a/build_linux64/objstore/amscu_cudafunctions.o b/build_linux64/objstore/amscu_cudafunctions.o new file mode 100644 index 0000000..9d06fed Binary files /dev/null and b/build_linux64/objstore/amscu_cudafunctions.o differ diff --git a/build_linux64/objstore/amscu_random.o b/build_linux64/objstore/amscu_random.o new file mode 100644 index 0000000..a6c8741 Binary files /dev/null and b/build_linux64/objstore/amscu_random.o differ diff --git a/build_linux64/objstore/amscuarray.o b/build_linux64/objstore/amscuarray.o new file mode 100644 index 0000000..38ca069 Binary files /dev/null and b/build_linux64/objstore/amscuarray.o differ diff --git a/build_linux64/objstore/amscuarray_dops.o b/build_linux64/objstore/amscuarray_dops.o new file mode 100644 index 0000000..5a7cacd Binary files /dev/null and b/build_linux64/objstore/amscuarray_dops.o differ diff --git a/build_linux64/objstore/amscugeom.o b/build_linux64/objstore/amscugeom.o new file mode 100644 index 0000000..9efd47c Binary files /dev/null and b/build_linux64/objstore/amscugeom.o differ diff --git a/build_linux64/objstore/amsculib2.o b/build_linux64/objstore/amsculib2.o new file mode 100644 index 0000000..1d1854e Binary files /dev/null and b/build_linux64/objstore/amsculib2.o differ diff --git a/build_linux64/objstore/amscumath.o b/build_linux64/objstore/amscumath.o new file mode 100644 index 0000000..fbaee69 Binary files /dev/null and b/build_linux64/objstore/amscumath.o differ diff --git a/build_linux64/objstore/amscurarray.o b/build_linux64/objstore/amscurarray.o new file mode 100644 index 0000000..152c4cc Binary files /dev/null and b/build_linux64/objstore/amscurarray.o differ diff --git a/build_linux64/objstore/cuvect2.o b/build_linux64/objstore/cuvect2.o new file mode 100644 index 0000000..3257a2a Binary files /dev/null and b/build_linux64/objstore/cuvect2.o differ diff --git a/build_linux64/objstore/cuvect2f.o b/build_linux64/objstore/cuvect2f.o new file mode 100644 index 0000000..61b3942 Binary files /dev/null and b/build_linux64/objstore/cuvect2f.o differ diff --git a/build_linux64/objstore/cuvect3.o b/build_linux64/objstore/cuvect3.o new file mode 100644 index 0000000..522f52a Binary files /dev/null and b/build_linux64/objstore/cuvect3.o differ diff --git a/build_linux64/objstore/cuvect3f.o b/build_linux64/objstore/cuvect3f.o new file mode 100644 index 0000000..0e76717 Binary files /dev/null and b/build_linux64/objstore/cuvect3f.o differ diff --git a/build_linux64/objstore/cuvect4.o b/build_linux64/objstore/cuvect4.o new file mode 100644 index 0000000..f41e97c Binary files /dev/null and b/build_linux64/objstore/cuvect4.o differ diff --git a/build_linux64/objstore/cuvect4f.o b/build_linux64/objstore/cuvect4f.o new file mode 100644 index 0000000..688a2b0 Binary files /dev/null and b/build_linux64/objstore/cuvect4f.o differ diff --git a/build_linux64/test b/build_linux64/test new file mode 100644 index 0000000..143ca5f Binary files /dev/null and b/build_linux64/test differ diff --git a/build_msvc64/objstore/.placeholder b/build_msvc64/objstore/.placeholder new file mode 100644 index 0000000..e69de29 diff --git a/include/amsculib2/amscu_comp128.hpp b/include/amsculib2/amscu_comp128.hpp index 076c0e4..05bb2c6 100644 --- a/include/amsculib2/amscu_comp128.hpp +++ b/include/amsculib2/amscu_comp128.hpp @@ -1,89 +1,89 @@ -#ifndef __AMSCU_COMP128_HPP__ -#define __AMSCU_COMP128_HPP__ - -namespace amscuda -{ -namespace cmp -{ - - class cucomp128 - { - public: - double real; - double imag; - - __host__ __device__ cucomp128(); - __host__ __device__ ~cucomp128(); - __host__ __device__ cucomp128(const cucomp128 &other); - __host__ __device__ cucomp128(const double &other); - - __host__ __device__ cucomp128& operator=(cucomp128& other); - __host__ __device__ const cucomp128& operator=(const cucomp128& other); - __host__ __device__ cucomp128& operator=(double& other); - __host__ __device__ const cucomp128& operator=(const double& other); - - __host__ __device__ double& operator[](int& ind); - __host__ __device__ const double& operator[](const int& ind) const; - - __host__ __device__ cucomp128 operator+(const cucomp128& z); - __host__ __device__ cucomp128 operator-(const cucomp128& z); - __host__ __device__ cucomp128 operator*(const cucomp128& z); - __host__ __device__ cucomp128 operator/(const cucomp128& z); - - __host__ __device__ cucomp128 operator+(const double& z); - __host__ __device__ cucomp128 operator-(const double& z); - __host__ __device__ cucomp128 operator*(const double& z); - __host__ __device__ cucomp128 operator/(const double& z); - - __host__ __device__ friend cucomp128 operator-(const cucomp128& z); //negation sign - - //comparison operators - __host__ __device__ bool operator==(const cucomp128& z) const; - __host__ __device__ bool operator!=(const cucomp128& z) const; - __host__ __device__ bool operator>(const cucomp128& z) const; - __host__ __device__ bool operator<(const cucomp128& z) const; - __host__ __device__ bool operator>=(const cucomp128& z) const; - __host__ __device__ bool operator<=(const cucomp128& z) const; - - __host__ __device__ bool isnan() const; - __host__ __device__ bool isinf() const; - - __host__ __device__ bool isreal() const; - __host__ __device__ bool isimag() const; - __host__ __device__ bool iszero() const; - __host__ __device__ double arg() const; - __host__ __device__ double mag() const; - __host__ __device__ cucomp128 conj() const; - }; - - __host__ __device__ double arg(cucomp128 z); - - __host__ __device__ cucomp128 dtocomp(double _r, double _i); - __host__ __device__ double real(cucomp128 z); - __host__ __device__ double imag(cucomp128 z); - __host__ __device__ cucomp128 sin(cucomp128 z); - __host__ __device__ cucomp128 cos(cucomp128 z); - __host__ __device__ cucomp128 tan(cucomp128 z); - __host__ __device__ cucomp128 exp(cucomp128 z); - __host__ __device__ cucomp128 log(cucomp128 z); - __host__ __device__ double abs(cucomp128 z); - __host__ __device__ cucomp128 conj(cucomp128 z); - - // //need hyperbolic trig Functions - __host__ __device__ cucomp128 cosh(cucomp128 z); - __host__ __device__ cucomp128 sinh(cucomp128 z); - __host__ __device__ cucomp128 tanh(cucomp128 z); - - __host__ __device__ cucomp128 pow(cucomp128 z1, cucomp128 z2); - - // //returns "complex sign" of complex number - 0, or a unit number with same argument - __host__ __device__ cucomp128 csgn(cucomp128 z); - -void test_cucomp128_1(); - - -}; //end namespace cmp -}; //end namespace amscuda - -#endif - +#ifndef __AMSCU_COMP128_HPP__ +#define __AMSCU_COMP128_HPP__ + +namespace amscuda +{ +namespace cmp +{ + + class cucomp128 + { + public: + double real; + double imag; + + __host__ __device__ cucomp128(); + __host__ __device__ ~cucomp128(); + __host__ __device__ cucomp128(const cucomp128 &other); + __host__ __device__ cucomp128(const double &other); + + __host__ __device__ cucomp128& operator=(cucomp128& other); + __host__ __device__ const cucomp128& operator=(const cucomp128& other); + __host__ __device__ cucomp128& operator=(double& other); + __host__ __device__ const cucomp128& operator=(const double& other); + + __host__ __device__ double& operator[](int& ind); + __host__ __device__ const double& operator[](const int& ind) const; + + __host__ __device__ cucomp128 operator+(const cucomp128& z); + __host__ __device__ cucomp128 operator-(const cucomp128& z); + __host__ __device__ cucomp128 operator*(const cucomp128& z); + __host__ __device__ cucomp128 operator/(const cucomp128& z); + + __host__ __device__ cucomp128 operator+(const double& z); + __host__ __device__ cucomp128 operator-(const double& z); + __host__ __device__ cucomp128 operator*(const double& z); + __host__ __device__ cucomp128 operator/(const double& z); + + __host__ __device__ friend cucomp128 operator-(const cucomp128& z); //negation sign + + //comparison operators + __host__ __device__ bool operator==(const cucomp128& z) const; + __host__ __device__ bool operator!=(const cucomp128& z) const; + __host__ __device__ bool operator>(const cucomp128& z) const; + __host__ __device__ bool operator<(const cucomp128& z) const; + __host__ __device__ bool operator>=(const cucomp128& z) const; + __host__ __device__ bool operator<=(const cucomp128& z) const; + + __host__ __device__ bool isnan() const; + __host__ __device__ bool isinf() const; + + __host__ __device__ bool isreal() const; + __host__ __device__ bool isimag() const; + __host__ __device__ bool iszero() const; + __host__ __device__ double arg() const; + __host__ __device__ double mag() const; + __host__ __device__ cucomp128 conj() const; + }; + + __host__ __device__ double arg(cucomp128 z); + + __host__ __device__ cucomp128 dtocomp(double _r, double _i); + __host__ __device__ double real(cucomp128 z); + __host__ __device__ double imag(cucomp128 z); + __host__ __device__ cucomp128 sin(cucomp128 z); + __host__ __device__ cucomp128 cos(cucomp128 z); + __host__ __device__ cucomp128 tan(cucomp128 z); + __host__ __device__ cucomp128 exp(cucomp128 z); + __host__ __device__ cucomp128 log(cucomp128 z); + __host__ __device__ double abs(cucomp128 z); + __host__ __device__ cucomp128 conj(cucomp128 z); + + // //need hyperbolic trig Functions + __host__ __device__ cucomp128 cosh(cucomp128 z); + __host__ __device__ cucomp128 sinh(cucomp128 z); + __host__ __device__ cucomp128 tanh(cucomp128 z); + + __host__ __device__ cucomp128 pow(cucomp128 z1, cucomp128 z2); + + // //returns "complex sign" of complex number - 0, or a unit number with same argument + __host__ __device__ cucomp128 csgn(cucomp128 z); + +void test_cucomp128_1(); + + +}; //end namespace cmp +}; //end namespace amscuda + +#endif + diff --git a/include/amsculib2/amscu_comp64.hpp b/include/amsculib2/amscu_comp64.hpp index bc4430b..53c86c0 100644 --- a/include/amsculib2/amscu_comp64.hpp +++ b/include/amsculib2/amscu_comp64.hpp @@ -1,88 +1,88 @@ -#ifndef __AMSCU_COMP64_HPP__ -#define __AMSCU_COMP64_HPP__ - -namespace amscuda -{ -namespace cmp -{ - - class cucomp64 - { - public: - float real; - float imag; - - __host__ __device__ cucomp64(); - __host__ __device__ ~cucomp64(); - __host__ __device__ cucomp64(const cucomp64 &other); - __host__ __device__ cucomp64(const float &other); - - __host__ __device__ cucomp64& operator=(cucomp64& other); - __host__ __device__ const cucomp64& operator=(const cucomp64& other); - __host__ __device__ cucomp64& operator=(float& other); - __host__ __device__ const cucomp64& operator=(const float& other); - - __host__ __device__ float& operator[](int& ind); - __host__ __device__ const float& operator[](const int& ind) const; - - __host__ __device__ cucomp64 operator+(const cucomp64& z); - __host__ __device__ cucomp64 operator-(const cucomp64& z); - __host__ __device__ cucomp64 operator*(const cucomp64& z); - __host__ __device__ cucomp64 operator/(const cucomp64& z); - - __host__ __device__ cucomp64 operator+(const float& z); - __host__ __device__ cucomp64 operator-(const float& z); - __host__ __device__ cucomp64 operator*(const float& z); - __host__ __device__ cucomp64 operator/(const float& z); - - __host__ __device__ friend cucomp64 operator-(const cucomp64& z); //negation sign - - //comparison operators - __host__ __device__ bool operator==(const cucomp64& z) const; - __host__ __device__ bool operator!=(const cucomp64& z) const; - __host__ __device__ bool operator>(const cucomp64& z) const; - __host__ __device__ bool operator<(const cucomp64& z) const; - __host__ __device__ bool operator>=(const cucomp64& z) const; - __host__ __device__ bool operator<=(const cucomp64& z) const; - - __host__ __device__ bool isnan() const; - __host__ __device__ bool isinf() const; - - __host__ __device__ bool isreal() const; - __host__ __device__ bool isimag() const; - __host__ __device__ bool iszero() const; - __host__ __device__ float arg() const; - __host__ __device__ float mag() const; - __host__ __device__ cucomp64 conj() const; - }; - - __host__ __device__ float arg(cucomp64 z); - - __host__ __device__ cucomp64 dtocomp64(float _r, float _i); - __host__ __device__ float real(cucomp64 z); - __host__ __device__ float imag(cucomp64 z); - __host__ __device__ cucomp64 sin(cucomp64 z); - __host__ __device__ cucomp64 cos(cucomp64 z); - __host__ __device__ cucomp64 tan(cucomp64 z); - __host__ __device__ cucomp64 exp(cucomp64 z); - __host__ __device__ cucomp64 log(cucomp64 z); - __host__ __device__ float abs(cucomp64 z); - __host__ __device__ cucomp64 conj(cucomp64 z); - - // //need hyperbolic trig Functions - __host__ __device__ cucomp64 cosh(cucomp64 z); - __host__ __device__ cucomp64 sinh(cucomp64 z); - __host__ __device__ cucomp64 tanh(cucomp64 z); - - __host__ __device__ cucomp64 pow(cucomp64 z1, cucomp64 z2); - - // //returns "complex sign" of complex number - 0, or a unit number with same argument - __host__ __device__ cucomp64 csgn(cucomp64 z); - -void test_cucomp64_1(); - -}; //end namespace cmp -}; //end namespace amscuda - -#endif - +#ifndef __AMSCU_COMP64_HPP__ +#define __AMSCU_COMP64_HPP__ + +namespace amscuda +{ +namespace cmp +{ + + class cucomp64 + { + public: + float real; + float imag; + + __host__ __device__ cucomp64(); + __host__ __device__ ~cucomp64(); + __host__ __device__ cucomp64(const cucomp64 &other); + __host__ __device__ cucomp64(const float &other); + + __host__ __device__ cucomp64& operator=(cucomp64& other); + __host__ __device__ const cucomp64& operator=(const cucomp64& other); + __host__ __device__ cucomp64& operator=(float& other); + __host__ __device__ const cucomp64& operator=(const float& other); + + __host__ __device__ float& operator[](int& ind); + __host__ __device__ const float& operator[](const int& ind) const; + + __host__ __device__ cucomp64 operator+(const cucomp64& z); + __host__ __device__ cucomp64 operator-(const cucomp64& z); + __host__ __device__ cucomp64 operator*(const cucomp64& z); + __host__ __device__ cucomp64 operator/(const cucomp64& z); + + __host__ __device__ cucomp64 operator+(const float& z); + __host__ __device__ cucomp64 operator-(const float& z); + __host__ __device__ cucomp64 operator*(const float& z); + __host__ __device__ cucomp64 operator/(const float& z); + + __host__ __device__ friend cucomp64 operator-(const cucomp64& z); //negation sign + + //comparison operators + __host__ __device__ bool operator==(const cucomp64& z) const; + __host__ __device__ bool operator!=(const cucomp64& z) const; + __host__ __device__ bool operator>(const cucomp64& z) const; + __host__ __device__ bool operator<(const cucomp64& z) const; + __host__ __device__ bool operator>=(const cucomp64& z) const; + __host__ __device__ bool operator<=(const cucomp64& z) const; + + __host__ __device__ bool isnan() const; + __host__ __device__ bool isinf() const; + + __host__ __device__ bool isreal() const; + __host__ __device__ bool isimag() const; + __host__ __device__ bool iszero() const; + __host__ __device__ float arg() const; + __host__ __device__ float mag() const; + __host__ __device__ cucomp64 conj() const; + }; + + __host__ __device__ float arg(cucomp64 z); + + __host__ __device__ cucomp64 dtocomp64(float _r, float _i); + __host__ __device__ float real(cucomp64 z); + __host__ __device__ float imag(cucomp64 z); + __host__ __device__ cucomp64 sin(cucomp64 z); + __host__ __device__ cucomp64 cos(cucomp64 z); + __host__ __device__ cucomp64 tan(cucomp64 z); + __host__ __device__ cucomp64 exp(cucomp64 z); + __host__ __device__ cucomp64 log(cucomp64 z); + __host__ __device__ float abs(cucomp64 z); + __host__ __device__ cucomp64 conj(cucomp64 z); + + // //need hyperbolic trig Functions + __host__ __device__ cucomp64 cosh(cucomp64 z); + __host__ __device__ cucomp64 sinh(cucomp64 z); + __host__ __device__ cucomp64 tanh(cucomp64 z); + + __host__ __device__ cucomp64 pow(cucomp64 z1, cucomp64 z2); + + // //returns "complex sign" of complex number - 0, or a unit number with same argument + __host__ __device__ cucomp64 csgn(cucomp64 z); + +void test_cucomp64_1(); + +}; //end namespace cmp +}; //end namespace amscuda + +#endif + diff --git a/include/amsculib2/amscu_cudafunctions.hpp b/include/amsculib2/amscu_cudafunctions.hpp index f3aa0da..4f18731 100644 --- a/include/amsculib2/amscu_cudafunctions.hpp +++ b/include/amsculib2/amscu_cudafunctions.hpp @@ -1,40 +1,40 @@ -#ifndef __AMSCU_CUDAFUNCTIONS_HPP__ -#define __AMSCU_CUDAFUNCTIONS_HPP__ - - -namespace amscuda -{ - // device memory operations - // I'm trying to avoid some of the boilerplate mental overhead involved - // in calling cuda functions and handling errors - - //frees devbuffer if it is not already NULL, and sets devbuffer to NULL - //wrapper to cudaFree - template int cuda_free(T **devptr); - - //copies hostbuffer to devbuffer - //initializes devbuffer from NULL if devbuffer is NULL - //if overwrite is true, deletes and reallocates devbuffer on device (for resizing) - template int buffer_copytodevice(T *hostbuffer, T **devbuffer, long N, bool overwrite); - - //copies info from devbuffer to hostbuffer - //initialzies hostbuffer from NULL if NULL - //if overwrite is true, deletes and reallocates hostbuffer on host with new[] (for resizing) - template int buffer_copyfromdevice(T *devbuffer, T **hostbuffer, long N, bool overwrite); - - //wrapper for cudaMemcpy - copies an item or struct (count 1) to the device - //initializes devptr from NULL if not already initialized - template int cuda_copytodevice(T *hostptr, T **devptr); - - //wrapper for cudaMemcpy - copies an item or struct (count 1) from device - //initializes hostptr from NULL with new if not already initialized - template int cuda_copyfromdevice(T *devptr, T **hostptr); - - int cuda_errortrap(const char *msgheader); - -}; - -#include - -#endif - +#ifndef __AMSCU_CUDAFUNCTIONS_HPP__ +#define __AMSCU_CUDAFUNCTIONS_HPP__ + + +namespace amscuda +{ + // device memory operations + // I'm trying to avoid some of the boilerplate mental overhead involved + // in calling cuda functions and handling errors + + //frees devbuffer if it is not already NULL, and sets devbuffer to NULL + //wrapper to cudaFree + template int cuda_free(T **devptr); + + //copies hostbuffer to devbuffer + //initializes devbuffer from NULL if devbuffer is NULL + //if overwrite is true, deletes and reallocates devbuffer on device (for resizing) + template int buffer_copytodevice(T *hostbuffer, T **devbuffer, long N, bool overwrite); + + //copies info from devbuffer to hostbuffer + //initialzies hostbuffer from NULL if NULL + //if overwrite is true, deletes and reallocates hostbuffer on host with new[] (for resizing) + template int buffer_copyfromdevice(T *devbuffer, T **hostbuffer, long N, bool overwrite); + + //wrapper for cudaMemcpy - copies an item or struct (count 1) to the device + //initializes devptr from NULL if not already initialized + template int cuda_copytodevice(T *hostptr, T **devptr); + + //wrapper for cudaMemcpy - copies an item or struct (count 1) from device + //initializes hostptr from NULL with new if not already initialized + template int cuda_copyfromdevice(T *devptr, T **hostptr); + + int cuda_errortrap(const char *msgheader); + +}; + +#include + +#endif + diff --git a/include/amsculib2/amscu_cudafunctions_impl.hpp b/include/amsculib2/amscu_cudafunctions_impl.hpp index 34b6382..7c3a408 100644 --- a/include/amsculib2/amscu_cudafunctions_impl.hpp +++ b/include/amsculib2/amscu_cudafunctions_impl.hpp @@ -1,228 +1,228 @@ -#ifndef __AMSCU_CUDAFUNCTIONS_IMPL_HPP__ -#define __AMSCU_CUDAFUNCTIONS_IMPL_HPP__ - -namespace amscuda -{ - -//frees devbuffer if it is not already NULL, and sets devbuffer to NULL -//wrapper to cudaFree -template int cuda_free(T **devptr) -{ - int ret = 0; - cudaError_t err = cudaSuccess; - - if(*devptr==NULL) - { - return ret; //devbuffer is already NULL/freed - } - - err = cudaFree(*devptr); - if(err!=cudaSuccess) - { - ret = -1; //failed to free device pointer - *devptr = NULL; // - ? should only happen if I'm trying to double-free something - } - else - { - ret = 1; - *devptr = NULL; - } - - return ret; -} - -//copies hostbuffer to devbuffer -//initializes devbuffer from NULL if devbuffer is NULL -//if overwrite is true, deletes and reallocates devbuffer on device (for resizing) -template int buffer_copytodevice(T *hostbuffer, T **devbuffer, long N, bool overwrite) -{ - int ret = 0; - cudaError_t err = cudaSuccess; - - if(N<=0) - { - ret = 0; - return ret; - } - - if(hostbuffer==NULL) - { - ret = -2; //host buffer is NULL - return ret; - } - - if(overwrite==1) - { - if(*devbuffer !=NULL) - { - cuda_free(devbuffer); - } - } - - if(*devbuffer==NULL) - { - err = cudaMalloc(devbuffer,sizeof(T)*N); - if(err!=cudaSuccess) - { - ret = -3; //failed to allocate - *devbuffer = NULL; - return ret; - } - } - - err = cudaMemcpy(*devbuffer,hostbuffer,sizeof(T)*N,cudaMemcpyHostToDevice); - if(err!=cudaSuccess) - { - ret = -4; //failed to copy - } - else - { - ret = 1; - } - - - return ret; -} - -//copies info from devbuffer to hostbuffer -//initialzies hostbuffer from NULL if NULL -//if overwrite is true, deletes and reallocates hostbuffer on host (for resizing) -template int buffer_copyfromdevice(T *devbuffer, T **hostbuffer, long N, bool overwrite) -{ - int ret = 0; - cudaError_t err = cudaSuccess; - - if(N<=0) - { - ret = 0; - return ret; - } - - if(devbuffer==NULL) - { - ret = -5; //null dev buffer - return ret; - } - - if(overwrite==1 && *hostbuffer!=NULL) - { - delete[] (*hostbuffer); hostbuffer = NULL; - } - - if(*hostbuffer==NULL) - { - *hostbuffer = new(std::nothrow) T[N]; - if(*hostbuffer==NULL) - { - ret = -6; //failed to allocate host buffer - return ret; - } - } - - err = cudaMemcpy(*hostbuffer, devbuffer, sizeof(T)*N, cudaMemcpyDeviceToHost); - if(err!=cudaSuccess) - { - ret = -7; //failed to copy - } - else - { - ret = 1; - } - - return ret; -} - -//wrapper for cudaMemcpy - copies an item or struct (count 1) to the device -//initializes devptr from NULL if not already initialized -template int cuda_copytodevice(T *hostptr, T **devptr) -{ - int ret = 0; - cudaError_t err = cudaSuccess; - bool overwrite = 1; - - if(hostptr==NULL) - { - ret = -2; //host buffer is NULL - return ret; - } - - if(overwrite==1) - { - if(*devptr !=NULL) - { - cuda_free(devptr); - } - } - - if(*devptr==NULL) - { - err = cudaMalloc(devptr,sizeof(T)); - if(err!=cudaSuccess) - { - ret = -3; //failed to allocate - *devptr = NULL; - return ret; - } - } - - err = cudaMemcpy(*devptr,hostptr,sizeof(T),cudaMemcpyHostToDevice); - if(err!=cudaSuccess) - { - ret = -4; //failed to copy - } - else - { - ret = 1; - } - - - return ret; -} - -//wrapper for cudaMemcpy - copies an item or struct (count 1) from device -//initializes hostptr from NULL with new if not already initialized -template int cuda_copyfromdevice(T *devptr, T **hostptr) -{ - int ret = 0; - cudaError_t err = cudaSuccess; - bool overwrite = 1; - - if(devptr==NULL) - { - ret = -5; //null dev buffer - return ret; - } - - if(overwrite==1 && *hostptr!=NULL) - { - delete (*hostptr); hostptr = NULL; - } - - if(*hostptr==NULL) - { - *hostptr = new(std::nothrow) T; - if(*hostptr==NULL) - { - ret = -6; //failed to allocate host buffer - return ret; - } - } - - err = cudaMemcpy(*hostptr, devptr, sizeof(T), cudaMemcpyDeviceToHost); - if(err!=cudaSuccess) - { - ret = -7; //failed to copy - } - else - { - ret = 1; - } - - return ret; -} - - -}; - -#endif - +#ifndef __AMSCU_CUDAFUNCTIONS_IMPL_HPP__ +#define __AMSCU_CUDAFUNCTIONS_IMPL_HPP__ + +namespace amscuda +{ + +//frees devbuffer if it is not already NULL, and sets devbuffer to NULL +//wrapper to cudaFree +template int cuda_free(T **devptr) +{ + int ret = 0; + cudaError_t err = cudaSuccess; + + if(*devptr==NULL) + { + return ret; //devbuffer is already NULL/freed + } + + err = cudaFree(*devptr); + if(err!=cudaSuccess) + { + ret = -1; //failed to free device pointer + *devptr = NULL; // - ? should only happen if I'm trying to double-free something + } + else + { + ret = 1; + *devptr = NULL; + } + + return ret; +} + +//copies hostbuffer to devbuffer +//initializes devbuffer from NULL if devbuffer is NULL +//if overwrite is true, deletes and reallocates devbuffer on device (for resizing) +template int buffer_copytodevice(T *hostbuffer, T **devbuffer, long N, bool overwrite) +{ + int ret = 0; + cudaError_t err = cudaSuccess; + + if(N<=0) + { + ret = 0; + return ret; + } + + if(hostbuffer==NULL) + { + ret = -2; //host buffer is NULL + return ret; + } + + if(overwrite==1) + { + if(*devbuffer !=NULL) + { + cuda_free(devbuffer); + } + } + + if(*devbuffer==NULL) + { + err = cudaMalloc(devbuffer,sizeof(T)*N); + if(err!=cudaSuccess) + { + ret = -3; //failed to allocate + *devbuffer = NULL; + return ret; + } + } + + err = cudaMemcpy(*devbuffer,hostbuffer,sizeof(T)*N,cudaMemcpyHostToDevice); + if(err!=cudaSuccess) + { + ret = -4; //failed to copy + } + else + { + ret = 1; + } + + + return ret; +} + +//copies info from devbuffer to hostbuffer +//initialzies hostbuffer from NULL if NULL +//if overwrite is true, deletes and reallocates hostbuffer on host (for resizing) +template int buffer_copyfromdevice(T *devbuffer, T **hostbuffer, long N, bool overwrite) +{ + int ret = 0; + cudaError_t err = cudaSuccess; + + if(N<=0) + { + ret = 0; + return ret; + } + + if(devbuffer==NULL) + { + ret = -5; //null dev buffer + return ret; + } + + if(overwrite==1 && *hostbuffer!=NULL) + { + delete[] (*hostbuffer); hostbuffer = NULL; + } + + if(*hostbuffer==NULL) + { + *hostbuffer = new(std::nothrow) T[N]; + if(*hostbuffer==NULL) + { + ret = -6; //failed to allocate host buffer + return ret; + } + } + + err = cudaMemcpy(*hostbuffer, devbuffer, sizeof(T)*N, cudaMemcpyDeviceToHost); + if(err!=cudaSuccess) + { + ret = -7; //failed to copy + } + else + { + ret = 1; + } + + return ret; +} + +//wrapper for cudaMemcpy - copies an item or struct (count 1) to the device +//initializes devptr from NULL if not already initialized +template int cuda_copytodevice(T *hostptr, T **devptr) +{ + int ret = 0; + cudaError_t err = cudaSuccess; + bool overwrite = 1; + + if(hostptr==NULL) + { + ret = -2; //host buffer is NULL + return ret; + } + + if(overwrite==1) + { + if(*devptr !=NULL) + { + cuda_free(devptr); + } + } + + if(*devptr==NULL) + { + err = cudaMalloc(devptr,sizeof(T)); + if(err!=cudaSuccess) + { + ret = -3; //failed to allocate + *devptr = NULL; + return ret; + } + } + + err = cudaMemcpy(*devptr,hostptr,sizeof(T),cudaMemcpyHostToDevice); + if(err!=cudaSuccess) + { + ret = -4; //failed to copy + } + else + { + ret = 1; + } + + + return ret; +} + +//wrapper for cudaMemcpy - copies an item or struct (count 1) from device +//initializes hostptr from NULL with new if not already initialized +template int cuda_copyfromdevice(T *devptr, T **hostptr) +{ + int ret = 0; + cudaError_t err = cudaSuccess; + bool overwrite = 1; + + if(devptr==NULL) + { + ret = -5; //null dev buffer + return ret; + } + + if(overwrite==1 && *hostptr!=NULL) + { + delete (*hostptr); hostptr = NULL; + } + + if(*hostptr==NULL) + { + *hostptr = new(std::nothrow) T; + if(*hostptr==NULL) + { + ret = -6; //failed to allocate host buffer + return ret; + } + } + + err = cudaMemcpy(*hostptr, devptr, sizeof(T), cudaMemcpyDeviceToHost); + if(err!=cudaSuccess) + { + ret = -7; //failed to copy + } + else + { + ret = 1; + } + + return ret; +} + + +}; + +#endif + diff --git a/include/amsculib2/amscu_random.hpp b/include/amsculib2/amscu_random.hpp index 5ec2fba..e974e47 100644 --- a/include/amsculib2/amscu_random.hpp +++ b/include/amsculib2/amscu_random.hpp @@ -1,55 +1,55 @@ -#ifndef __AMSCU_RANDOM_HPP__ -#define __AMSCU_RANDOM_HPP__ - -namespace amscuda -{ - -// Random Number Gerneators - - -// faster floating point hash function used in fractal generators -__device__ __host__ float fhash1d_su(float x); - -__device__ __host__ float fhash3d_su(float x, float y, float z); - -__device__ __host__ float fhash4d_su(float x, float y, float z, float w); - - -////////////////////////////////////////////////// -// Deterministic Pseudorandom int32_t Generator // -////////////////////////////////////////////////// - -//Next seed in simple 32 bit integer deterministic psuedo-rand generator -__host__ __device__ void dpr32_nextseed(int32_t *rseed_inout); - -//Simple 32 bit integer deterministic pseudo-random generator -// *not* for cryptography -// Frequency of generated floats should be uniform [0,1) -__host__ __device__ float dpr32_randf(int32_t *rseed_inout); - -//box muller standard normal pseudorandom variable -__host__ __device__ float dpr32_randnf(int32_t *rseed_inout); - -////////////////////////////////////////////////// -// Deterministic Pseudorandom int64_t Generator // -////////////////////////////////////////////////// - -//operates without side-effects on explicit seed for threaded use -//deterministic pseudorandom number generator - takes seed and returns next seed -__host__ __device__ void dpr64_nextseed(int64_t *seedinout); - -//deterministic pseudorandom number generator - takes seed and returns next seed -//returns uniformly distributed double -__host__ __device__ double dpr64_randd(int64_t *seedinout); - -__host__ __device__ float dpr64_randf(int64_t *seedinout); - - -void test_dprg64(); -void test_dprg32(); - - -}; //end namespace amscuda - -#endif - +#ifndef __AMSCU_RANDOM_HPP__ +#define __AMSCU_RANDOM_HPP__ + +namespace amscuda +{ + +// Random Number Gerneators + + +// faster floating point hash function used in fractal generators +__device__ __host__ float fhash1d_su(float x); + +__device__ __host__ float fhash3d_su(float x, float y, float z); + +__device__ __host__ float fhash4d_su(float x, float y, float z, float w); + + +////////////////////////////////////////////////// +// Deterministic Pseudorandom int32_t Generator // +////////////////////////////////////////////////// + +//Next seed in simple 32 bit integer deterministic psuedo-rand generator +__host__ __device__ void dpr32_nextseed(int32_t *rseed_inout); + +//Simple 32 bit integer deterministic pseudo-random generator +// *not* for cryptography +// Frequency of generated floats should be uniform [0,1) +__host__ __device__ float dpr32_randf(int32_t *rseed_inout); + +//box muller standard normal pseudorandom variable +__host__ __device__ float dpr32_randnf(int32_t *rseed_inout); + +////////////////////////////////////////////////// +// Deterministic Pseudorandom int64_t Generator // +////////////////////////////////////////////////// + +//operates without side-effects on explicit seed for threaded use +//deterministic pseudorandom number generator - takes seed and returns next seed +__host__ __device__ void dpr64_nextseed(int64_t *seedinout); + +//deterministic pseudorandom number generator - takes seed and returns next seed +//returns uniformly distributed double +__host__ __device__ double dpr64_randd(int64_t *seedinout); + +__host__ __device__ float dpr64_randf(int64_t *seedinout); + + +void test_dprg64(); +void test_dprg32(); + + +}; //end namespace amscuda + +#endif + diff --git a/include/amsculib2/amscuarray.hpp b/include/amsculib2/amscuarray.hpp index 9049ad2..e7edb6d 100644 --- a/include/amsculib2/amscuarray.hpp +++ b/include/amsculib2/amscuarray.hpp @@ -1,47 +1,47 @@ -#ifndef __CUARRAY_HPP__ -#define __CUARRAY_HPP__ - -namespace amscuda -{ - -template class cuarray -{ -public: - int length; - T* data; - - __device__ __host__ cuarray(); - __device__ __host__ ~cuarray(); - - //Only call this on the device for thread/block local - // dynamic arrays - __device__ __host__ int resize(const int _length); - - __device__ __host__ int size() const; - __device__ __host__ T& at(const int I); - __device__ __host__ const T& at(const int I) const; - - __device__ __host__ T& operator[](const int I); - __device__ __host__ const T& operator[](const int I) const; - - - - __host__ int device_send(cuarray **dptr); - __host__ int _device_send_overwrite(cuarray **dptr); - __host__ int _device_send_copy(cuarray *dptr); - - __host__ int device_pull(cuarray *dptr); - __host__ int device_free(cuarray **dptr); - - __host__ int device_length(cuarray *dptr); - __host__ T* device_data_ptr(cuarray *dptr); - -}; - -void test_cuarray(); - -}; - -#include - +#ifndef __CUARRAY_HPP__ +#define __CUARRAY_HPP__ + +namespace amscuda +{ + +template class cuarray +{ +public: + int length; + T* data; + + __device__ __host__ cuarray(); + __device__ __host__ ~cuarray(); + + //Only call this on the device for thread/block local + // dynamic arrays + __device__ __host__ int resize(const int _length); + + __device__ __host__ int size() const; + __device__ __host__ T& at(const int I); + __device__ __host__ const T& at(const int I) const; + + __device__ __host__ T& operator[](const int I); + __device__ __host__ const T& operator[](const int I) const; + + + + __host__ int device_send(cuarray **dptr); + __host__ int _device_send_overwrite(cuarray **dptr); + __host__ int _device_send_copy(cuarray *dptr); + + __host__ int device_pull(cuarray *dptr); + __host__ int device_free(cuarray **dptr); + + __host__ int device_length(cuarray *dptr); + __host__ T* device_data_ptr(cuarray *dptr); + +}; + +void test_cuarray(); + +}; + +#include + #endif \ No newline at end of file diff --git a/include/amsculib2/amscuarray_dops.hpp b/include/amsculib2/amscuarray_dops.hpp index 2d3f1fe..eff18a6 100644 --- a/include/amsculib2/amscuarray_dops.hpp +++ b/include/amsculib2/amscuarray_dops.hpp @@ -1,76 +1,76 @@ -#ifndef __AMSCUARRAY_DOPS_HPP__ -#define __AMSCUARRAY_DOPS_HPP__ - -//Device Operations on Arrays -// - -//Device Operations on Device Buffers -// dodb - -namespace amscuda -{ - - - //sum - template T devcuarray_sum(cuarray *devptr); - - template T dbuff_sum(T *devbuffer, int N); - - - struct dbuff_statstruct - { - public: - float min; - float max; - float mean; - float stdev; - float sum; - }; - - //stats (min,max,mean,stdev) - - template void dbuff_minmax(T *devbuffer, int N, T *min, T *max); - - template dbuff_statstruct dbuff_stats(T *devbuffer, int N); // - - //sets all elements to setto - template void dbuff_setall(T *devbuffer, int N, T setto, int nblocks, int nthreads); - - //random device buffer functions - void dbuff_rand_dpr32(float *devbuffer, int N, int32_t *rseedinout, int nblocks, int nthreads); // - void dbuff_rand_dpr32n(float *devbuffer, int N, int32_t *rseedinout, int nblocks, int nthreads); // - - - void dbuff_rand_dpr64(float *devbuffer, int N, int64_t *rseedinout, int nblocks, int nthreads); // - - //Elementwise device-buffer vector binary operation - //takes two input arrays ( , ) --> one output array - template void dbuff_vectorbinop(T1 *dbuf_a, T2 *dbuf_b, T3 *dbuf_out, int N, T3 (*fpnt)(T1,T2), int nblocks, int nthreads); - - //Elementwise device-buffer vector two-parameter operation - //takes one input array, and a constant paramter ( ) ---> one output array - template void dbuff_vectorbinop(T1 *dbuf_a, T2 par_b, T3 *dbuf_out, int N, T3 (*fpnt)(T1,T2), int nblocks, int nthreads); - - - //vector_add - template void dbuff_add(T *dbuff_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads); - template void dbuff_add(T *dbuff_a, T par_b, T *dbuff_out, int N, int nblocks, int nthreads); - template void dbuff_sub(T *dbuff_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads); - template void dbuff_sub(T *dbuff_a, T par_b, T *dbuff_out, int N, int nblocks, int nthreads); - template void dbuff_mult(T *dbuff_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads); - template void dbuff_mult(T *dbuff_a, T par_b, T *dbuff_out, int N, int nblocks, int nthreads); - template void dbuff_div(T *dbuff_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads); - template void dbuff_div(T *dbuff_a, T par_b, T *dbuff_out, int N, int nblocks, int nthreads); - template void dbuff_div(T par_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads); - - - // Tests // - - void test_dbuff_rand_dpr32(); - -}; - -#include - -#endif - +#ifndef __AMSCUARRAY_DOPS_HPP__ +#define __AMSCUARRAY_DOPS_HPP__ + +//Device Operations on Arrays +// + +//Device Operations on Device Buffers +// dodb + +namespace amscuda +{ + + + //sum + template T devcuarray_sum(cuarray *devptr); + + template T dbuff_sum(T *devbuffer, int N); + + + struct dbuff_statstruct + { + public: + float min; + float max; + float mean; + float stdev; + float sum; + }; + + //stats (min,max,mean,stdev) + + template void dbuff_minmax(T *devbuffer, int N, T *min, T *max); + + template dbuff_statstruct dbuff_stats(T *devbuffer, int N); // + + //sets all elements to setto + template void dbuff_setall(T *devbuffer, int N, T setto, int nblocks, int nthreads); + + //random device buffer functions + void dbuff_rand_dpr32(float *devbuffer, int N, int32_t *rseedinout, int nblocks, int nthreads); // + void dbuff_rand_dpr32n(float *devbuffer, int N, int32_t *rseedinout, int nblocks, int nthreads); // + + + void dbuff_rand_dpr64(float *devbuffer, int N, int64_t *rseedinout, int nblocks, int nthreads); // + + //Elementwise device-buffer vector binary operation + //takes two input arrays ( , ) --> one output array + template void dbuff_vectorbinop(T1 *dbuf_a, T2 *dbuf_b, T3 *dbuf_out, int N, T3 (*fpnt)(T1,T2), int nblocks, int nthreads); + + //Elementwise device-buffer vector two-parameter operation + //takes one input array, and a constant paramter ( ) ---> one output array + template void dbuff_vectorbinop(T1 *dbuf_a, T2 par_b, T3 *dbuf_out, int N, T3 (*fpnt)(T1,T2), int nblocks, int nthreads); + + + //vector_add + template void dbuff_add(T *dbuff_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads); + template void dbuff_add(T *dbuff_a, T par_b, T *dbuff_out, int N, int nblocks, int nthreads); + template void dbuff_sub(T *dbuff_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads); + template void dbuff_sub(T *dbuff_a, T par_b, T *dbuff_out, int N, int nblocks, int nthreads); + template void dbuff_mult(T *dbuff_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads); + template void dbuff_mult(T *dbuff_a, T par_b, T *dbuff_out, int N, int nblocks, int nthreads); + template void dbuff_div(T *dbuff_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads); + template void dbuff_div(T *dbuff_a, T par_b, T *dbuff_out, int N, int nblocks, int nthreads); + template void dbuff_div(T par_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads); + + + // Tests // + + void test_dbuff_rand_dpr32(); + +}; + +#include + +#endif + diff --git a/include/amsculib2/amscuarray_dops_impl.hpp b/include/amsculib2/amscuarray_dops_impl.hpp index 2cd7e88..b666b3f 100644 --- a/include/amsculib2/amscuarray_dops_impl.hpp +++ b/include/amsculib2/amscuarray_dops_impl.hpp @@ -1,404 +1,404 @@ -#ifndef __AMSCUARRAY_DOPS_IMPL_HPP__ -#define __AMSCUARRAY_DOPS_IMPL_HPP__ - -namespace amscuda -{ - -template __global__ void dbuff_sum_kf(T *devbuffer, int N, T *rets) -{ - int I0 = threadIdx.x + blockIdx.x*blockDim.x; - int Is = blockDim.x*gridDim.x; - int I; - - T ret = (T) 0; - for(I=I0;I T devcuarray_sum(cuarray *devptr) -{ - T ret = T(); - cudaError_t err = cudaSuccess; - - cuarray ldptr; - - cudaMemcpy(&ldptr,devptr,sizeof(cuarray),cudaMemcpyDeviceToHost); - - ret = devbuffer_sum(ldptr.data,ldptr.length); - - ldptr.data = NULL; - ldptr.length=0; - - return ret; -} - -template T dbuff_sum(T *dbuff, int N) -{ - int I; - T ret = T(); - cudaError_t err = cudaSuccess; - - int nblocks; - int nthreads; - - if(dbuff==NULL || N<=0) - { - return ret; - } - - if(N>100) - { - nblocks = 10; - nthreads = (int)sqrt((float) (N/nblocks)); - if(nthreads<=0) nthreads=1; - if(nthreads>512) nthreads=512; - } - else - { - nblocks = 1; - nthreads = 1; - } - - T *rets = NULL; - T *devrets = NULL; - - rets = new T[nblocks*nthreads]; - cudaMalloc(&devrets,sizeof(T)*nblocks*nthreads); - - dbuff_sum_kf<<>>(dbuff,N,devrets); - cudaDeviceSynchronize(); - err = cudaGetLastError(); - if(err!=cudaSuccess) - { - printf("amscu::dbuff_sum error: %s\n",cudaGetErrorString(err)); - } - - cudaMemcpy(rets,devrets,sizeof(T)*nblocks*nthreads,cudaMemcpyDeviceToHost); - - ret = (T)0; - for(I=0;I __global__ void dbuff_minmax_kf(T *devbuffer, int N, T *maxs, T *mins) -{ - int I0 = threadIdx.x + blockIdx.x*blockDim.x; - int Is = blockDim.x*gridDim.x; - int I; - - for(I=I0;Imaxs[I0]) - { - maxs[I0] = devbuffer[I]; - } - if(devbuffer[I] void dbuff_minmax(T *devbuffer, int N, T *min, T *max) -{ - cudaError_t err = cudaSuccess; - int nblocks; - int nthreads; - int I; - - T *maxs = NULL; - T *dev_maxs = NULL; - T *mins = NULL; - T *dev_mins = NULL; - - T localmax = T(0); - T localmin = T(0); - - if(devbuffer==NULL || N<=0) - { - if(min!=NULL) *min = T(0); - if(max!=NULL) *max = T(0); - return; - } - - if(N>25) - { - nblocks = 25; - nthreads = (int) sqrt((float)(N/nblocks)); - if(nthreads<1) nthreads = 1; - if(nthreads>512) nthreads = 512; - } - else - { - nblocks = 1; - nthreads = 1; - } - - maxs = new T[nblocks*nthreads]; - mins = new T[nblocks*nthreads]; - cudaMalloc(&dev_maxs,nblocks*nthreads); - cudaMalloc(&dev_mins,nblocks*nthreads); - - dbuff_minmax_kf<<>>(devbuffer,N,dev_maxs,dev_mins); - cudaDeviceSynchronize(); - err = cudaGetLastError(); - if(err!=cudaSuccess) - { - printf("amscu::dbuff_minmax error: %s\n",cudaGetErrorString(err)); - } - - cudaMemcpy(maxs,dev_maxs,sizeof(T)*nblocks*nthreads,cudaMemcpyDeviceToHost); - cudaMemcpy(mins,dev_mins,sizeof(T)*nblocks*nthreads,cudaMemcpyDeviceToHost); - - - for(I=0;Ilocalmax) localmax = maxs[I]; - if(mins[I] __global__ void dbuff_setall_kf(T *devbuffer, int N, T setto) -{ - int I0 = threadIdx.x + blockIdx.x*blockDim.x; - int Is = blockDim.x*gridDim.x; - int I; - - for(I=I0;I void dbuff_setall(T *devbuffer, int N, T setto, int nblocks, int nthreads) -{ - cudaError_t err = cudaSuccess; - - if(devbuffer==NULL || N<=0) - { - return; - } - - dbuff_setall_kf<<>>(devbuffer,N,setto); - cudaDeviceSynchronize(); - err = cudaGetLastError(); - if(err!=cudaSuccess) - { - printf("amscu::dbuff_setall error: %s\n",cudaGetErrorString(err)); - } - - return; -} - -template __global__ void dbuff_vectorbinop_kf1(T1 *dbuf_a, T2 *dbuf_b, T3 *dbuf_out, int N, T3 (*fpnt)(T1,T2)) -{ - int I0 = threadIdx.x + blockIdx.x*blockDim.x; - int Is = blockDim.x*gridDim.x; - int I; - - T1 a; - T2 b; - T3 c; - - for(I=I0;I __global__ void dbuff_vectorbinop_kf2(T1 *dbuf_a, T2 par_b, T3 *dbuf_out, int N, T3 (*fpnt)(T1,T2)) -{ - int I0 = threadIdx.x + blockIdx.x*blockDim.x; - int Is = blockDim.x*gridDim.x; - int I; - - T1 a; - T2 b; - T3 c; - - for(I=I0;I one output array -template void dbuff_vectorbinop(T1 *dbuf_a, T2 *dbuf_b, T3 *dbuf_out, int N, T3 (*fpnt)(T1,T2), int nblocks, int nthreads) -{ - cudaError_t err = cudaSuccess; - - if(dbuf_a == NULL || dbuf_b == NULL || dbuf_out == NULL || N<=0) - { - return; - } - - dbuff_vectorbinop_kf1<<>>(dbuf_a,dbuf_b,dbuf_out,N); - cudaDeviceSynchronize(); - err = cudaGetLastError(); - if(err!=cudaSuccess) - { - printf("amscu::devbuffer_vectorbinop error: %s\n",cudaGetErrorString(err)); - } - - return; -} - -//Elementwise device-buffer vector two-parameter operation -//takes one input array, and a constant paramter ( ) ---> one output array -template void dbuff_vectorbinop(T1 *dbuf_a, T2 par_b, T3 *dbuf_out, int N, T3 (*fpnt)(T1,T2), int nblocks, int nthreads) -{ - cudaError_t err = cudaSuccess; - - if(dbuf_a == NULL || dbuf_out == NULL || N<=0) - { - return; - } - - dbuff_vectorbinop_kf2<<>>(dbuf_a,par_b,dbuf_out,N); - cudaDeviceSynchronize(); - err = cudaGetLastError(); - if(err!=cudaSuccess) - { - printf("amscu::devbuffer_vectorbinop error: %s\n",cudaGetErrorString(err)); - } - - return; -} - -template T dbuff_add_fn(T a, T b) -{ - return a+b; -} - -template void dbuff_add(T *dbuff_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads) -{ - dbuff_vectorbinop(dbuff_a,dbuff_b,dbuff_out,N,&dbuff_add_fn,nblocks,nthreads); - return; -} - -template void dbuff_add(T *dbuff_a, T par_b, T *dbuff_out, int N, int nblocks, int nthreads) -{ - dbuff_vectorbinop(dbuff_a,par_b,dbuff_out,N,&dbuff_add_fn,nblocks,nthreads); - return; -} - -template T dbuff_sub_fn(T a, T b) -{ - return a-b; -} - -template void dbuff_sub(T *dbuff_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads) -{ - dbuff_vectorbinop(dbuff_a,dbuff_b,dbuff_out,N,&dbuff_sub_fn,nblocks,nthreads); - return; -} - -template void dbuff_sub(T *dbuff_a, T par_b, T *dbuff_out, int N, int nblocks, int nthreads) -{ - dbuff_vectorbinop(dbuff_a,par_b,dbuff_out,N,&dbuff_sub_fn,nblocks,nthreads); - return; -} - -template T dbuff_mult_fn(T a, T b) -{ - return a*b; -} - -template void dbuff_mult(T *dbuff_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads) -{ - dbuff_vectorbinop(dbuff_a,dbuff_b,dbuff_out,N,&dbuff_mult_fn,nblocks,nthreads); - return; -} - -template void dbuff_mult(T *dbuff_a, T par_b, T *dbuff_out, int N, int nblocks, int nthreads) -{ - dbuff_vectorbinop(dbuff_a,par_b,dbuff_out,N,&dbuff_mult_fn,nblocks,nthreads); - return; -} - -template T dbuff_div_fn(T a, T b) -{ - return a/b; -} - -template void dbuff_div(T *dbuff_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads) -{ - dbuff_vectorbinop(dbuff_a,dbuff_b,dbuff_out,N,&dbuff_div_fn,nblocks,nthreads); - return; -} - -template void dbuff_div(T *dbuff_a, T par_b, T *dbuff_out, int N, int nblocks, int nthreads) -{ - dbuff_vectorbinop(dbuff_a,par_b,dbuff_out,N,&dbuff_div_fn,nblocks,nthreads); - return; -} - -template T dbuff_ldiv_fn(T a, T b) -{ - return b/a; -} - - -template void dbuff_div(T par_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads) -{ - dbuff_vectorbinop(dbuff_b,par_a,dbuff_out,N,&dbuff_ldiv_fn,nblocks,nthreads); - return; -} - - -}; - -#endif - +#ifndef __AMSCUARRAY_DOPS_IMPL_HPP__ +#define __AMSCUARRAY_DOPS_IMPL_HPP__ + +namespace amscuda +{ + +template __global__ void dbuff_sum_kf(T *devbuffer, int N, T *rets) +{ + int I0 = threadIdx.x + blockIdx.x*blockDim.x; + int Is = blockDim.x*gridDim.x; + int I; + + T ret = (T) 0; + for(I=I0;I T devcuarray_sum(cuarray *devptr) +{ + T ret = T(); + cudaError_t err = cudaSuccess; + + cuarray ldptr; + + cudaMemcpy(&ldptr,devptr,sizeof(cuarray),cudaMemcpyDeviceToHost); + + ret = devbuffer_sum(ldptr.data,ldptr.length); + + ldptr.data = NULL; + ldptr.length=0; + + return ret; +} + +template T dbuff_sum(T *dbuff, int N) +{ + int I; + T ret = T(); + cudaError_t err = cudaSuccess; + + int nblocks; + int nthreads; + + if(dbuff==NULL || N<=0) + { + return ret; + } + + if(N>100) + { + nblocks = 10; + nthreads = (int)sqrt((float) (N/nblocks)); + if(nthreads<=0) nthreads=1; + if(nthreads>512) nthreads=512; + } + else + { + nblocks = 1; + nthreads = 1; + } + + T *rets = NULL; + T *devrets = NULL; + + rets = new T[nblocks*nthreads]; + cudaMalloc(&devrets,sizeof(T)*nblocks*nthreads); + + dbuff_sum_kf<<>>(dbuff,N,devrets); + cudaDeviceSynchronize(); + err = cudaGetLastError(); + if(err!=cudaSuccess) + { + printf("amscu::dbuff_sum error: %s\n",cudaGetErrorString(err)); + } + + cudaMemcpy(rets,devrets,sizeof(T)*nblocks*nthreads,cudaMemcpyDeviceToHost); + + ret = (T)0; + for(I=0;I __global__ void dbuff_minmax_kf(T *devbuffer, int N, T *maxs, T *mins) +{ + int I0 = threadIdx.x + blockIdx.x*blockDim.x; + int Is = blockDim.x*gridDim.x; + int I; + + for(I=I0;Imaxs[I0]) + { + maxs[I0] = devbuffer[I]; + } + if(devbuffer[I] void dbuff_minmax(T *devbuffer, int N, T *min, T *max) +{ + cudaError_t err = cudaSuccess; + int nblocks; + int nthreads; + int I; + + T *maxs = NULL; + T *dev_maxs = NULL; + T *mins = NULL; + T *dev_mins = NULL; + + T localmax = T(0); + T localmin = T(0); + + if(devbuffer==NULL || N<=0) + { + if(min!=NULL) *min = T(0); + if(max!=NULL) *max = T(0); + return; + } + + if(N>25) + { + nblocks = 25; + nthreads = (int) sqrt((float)(N/nblocks)); + if(nthreads<1) nthreads = 1; + if(nthreads>512) nthreads = 512; + } + else + { + nblocks = 1; + nthreads = 1; + } + + maxs = new T[nblocks*nthreads]; + mins = new T[nblocks*nthreads]; + cudaMalloc(&dev_maxs,nblocks*nthreads); + cudaMalloc(&dev_mins,nblocks*nthreads); + + dbuff_minmax_kf<<>>(devbuffer,N,dev_maxs,dev_mins); + cudaDeviceSynchronize(); + err = cudaGetLastError(); + if(err!=cudaSuccess) + { + printf("amscu::dbuff_minmax error: %s\n",cudaGetErrorString(err)); + } + + cudaMemcpy(maxs,dev_maxs,sizeof(T)*nblocks*nthreads,cudaMemcpyDeviceToHost); + cudaMemcpy(mins,dev_mins,sizeof(T)*nblocks*nthreads,cudaMemcpyDeviceToHost); + + + for(I=0;Ilocalmax) localmax = maxs[I]; + if(mins[I] __global__ void dbuff_setall_kf(T *devbuffer, int N, T setto) +{ + int I0 = threadIdx.x + blockIdx.x*blockDim.x; + int Is = blockDim.x*gridDim.x; + int I; + + for(I=I0;I void dbuff_setall(T *devbuffer, int N, T setto, int nblocks, int nthreads) +{ + cudaError_t err = cudaSuccess; + + if(devbuffer==NULL || N<=0) + { + return; + } + + dbuff_setall_kf<<>>(devbuffer,N,setto); + cudaDeviceSynchronize(); + err = cudaGetLastError(); + if(err!=cudaSuccess) + { + printf("amscu::dbuff_setall error: %s\n",cudaGetErrorString(err)); + } + + return; +} + +template __global__ void dbuff_vectorbinop_kf1(T1 *dbuf_a, T2 *dbuf_b, T3 *dbuf_out, int N, T3 (*fpnt)(T1,T2)) +{ + int I0 = threadIdx.x + blockIdx.x*blockDim.x; + int Is = blockDim.x*gridDim.x; + int I; + + T1 a; + T2 b; + T3 c; + + for(I=I0;I __global__ void dbuff_vectorbinop_kf2(T1 *dbuf_a, T2 par_b, T3 *dbuf_out, int N, T3 (*fpnt)(T1,T2)) +{ + int I0 = threadIdx.x + blockIdx.x*blockDim.x; + int Is = blockDim.x*gridDim.x; + int I; + + T1 a; + T2 b; + T3 c; + + for(I=I0;I one output array +template void dbuff_vectorbinop(T1 *dbuf_a, T2 *dbuf_b, T3 *dbuf_out, int N, T3 (*fpnt)(T1,T2), int nblocks, int nthreads) +{ + cudaError_t err = cudaSuccess; + + if(dbuf_a == NULL || dbuf_b == NULL || dbuf_out == NULL || N<=0) + { + return; + } + + dbuff_vectorbinop_kf1<<>>(dbuf_a,dbuf_b,dbuf_out,N); + cudaDeviceSynchronize(); + err = cudaGetLastError(); + if(err!=cudaSuccess) + { + printf("amscu::devbuffer_vectorbinop error: %s\n",cudaGetErrorString(err)); + } + + return; +} + +//Elementwise device-buffer vector two-parameter operation +//takes one input array, and a constant paramter ( ) ---> one output array +template void dbuff_vectorbinop(T1 *dbuf_a, T2 par_b, T3 *dbuf_out, int N, T3 (*fpnt)(T1,T2), int nblocks, int nthreads) +{ + cudaError_t err = cudaSuccess; + + if(dbuf_a == NULL || dbuf_out == NULL || N<=0) + { + return; + } + + dbuff_vectorbinop_kf2<<>>(dbuf_a,par_b,dbuf_out,N); + cudaDeviceSynchronize(); + err = cudaGetLastError(); + if(err!=cudaSuccess) + { + printf("amscu::devbuffer_vectorbinop error: %s\n",cudaGetErrorString(err)); + } + + return; +} + +template T dbuff_add_fn(T a, T b) +{ + return a+b; +} + +template void dbuff_add(T *dbuff_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads) +{ + dbuff_vectorbinop(dbuff_a,dbuff_b,dbuff_out,N,&dbuff_add_fn,nblocks,nthreads); + return; +} + +template void dbuff_add(T *dbuff_a, T par_b, T *dbuff_out, int N, int nblocks, int nthreads) +{ + dbuff_vectorbinop(dbuff_a,par_b,dbuff_out,N,&dbuff_add_fn,nblocks,nthreads); + return; +} + +template T dbuff_sub_fn(T a, T b) +{ + return a-b; +} + +template void dbuff_sub(T *dbuff_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads) +{ + dbuff_vectorbinop(dbuff_a,dbuff_b,dbuff_out,N,&dbuff_sub_fn,nblocks,nthreads); + return; +} + +template void dbuff_sub(T *dbuff_a, T par_b, T *dbuff_out, int N, int nblocks, int nthreads) +{ + dbuff_vectorbinop(dbuff_a,par_b,dbuff_out,N,&dbuff_sub_fn,nblocks,nthreads); + return; +} + +template T dbuff_mult_fn(T a, T b) +{ + return a*b; +} + +template void dbuff_mult(T *dbuff_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads) +{ + dbuff_vectorbinop(dbuff_a,dbuff_b,dbuff_out,N,&dbuff_mult_fn,nblocks,nthreads); + return; +} + +template void dbuff_mult(T *dbuff_a, T par_b, T *dbuff_out, int N, int nblocks, int nthreads) +{ + dbuff_vectorbinop(dbuff_a,par_b,dbuff_out,N,&dbuff_mult_fn,nblocks,nthreads); + return; +} + +template T dbuff_div_fn(T a, T b) +{ + return a/b; +} + +template void dbuff_div(T *dbuff_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads) +{ + dbuff_vectorbinop(dbuff_a,dbuff_b,dbuff_out,N,&dbuff_div_fn,nblocks,nthreads); + return; +} + +template void dbuff_div(T *dbuff_a, T par_b, T *dbuff_out, int N, int nblocks, int nthreads) +{ + dbuff_vectorbinop(dbuff_a,par_b,dbuff_out,N,&dbuff_div_fn,nblocks,nthreads); + return; +} + +template T dbuff_ldiv_fn(T a, T b) +{ + return b/a; +} + + +template void dbuff_div(T par_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads) +{ + dbuff_vectorbinop(dbuff_b,par_a,dbuff_out,N,&dbuff_ldiv_fn,nblocks,nthreads); + return; +} + + +}; + +#endif + diff --git a/include/amsculib2/amscuarray_impl.hpp b/include/amsculib2/amscuarray_impl.hpp index 813d90b..6aae6ac 100644 --- a/include/amsculib2/amscuarray_impl.hpp +++ b/include/amsculib2/amscuarray_impl.hpp @@ -1,323 +1,323 @@ -#ifndef __CUARRAY_IMPL_HPP__ -#define __CUARRAY_IMPL_HPP__ - -namespace amscuda -{ - -// New Version cuarray -// simpler, less crap going on - -template __device__ __host__ cuarray::cuarray() -{ - length = 0; - data = NULL; -} - -template __device__ __host__ cuarray::~cuarray() -{ - if(data!=NULL) - { - delete[] data; data = NULL; - } - length = 0; -} - -template __device__ __host__ int cuarray::resize(const int _length) -{ - int ret = 0; - - T *newbuffer = NULL; - - if(length==_length) - { - //do nothing - ret = 1; - return ret; - } - if(_length<=0) - { - if(data!=NULL) - { - delete[] data; - data = NULL; - } - length = 0; - ret = 1; - } - - newbuffer = new T[_length]; - if(newbuffer==NULL) - { - ret = -1; //failed to allocate memory - return ret; - } - - int I; - T def; - - if(data!=NULL) - { - for(I=0;I __host__ int cuarray::device_send(cuarray **dptr) -{ - int ret = 0; - int dlength; - - if(*dptr==NULL) - { - ret = _device_send_overwrite(dptr); - } - else - { - dlength = device_length(*dptr); - if(dlength=length) - { - ret = _device_send_copy(*dptr); - } - else - { - ret = _device_send_overwrite(dptr); - } - } - - return ret; -} - -template __host__ int cuarray::_device_send_overwrite(cuarray **dptr) -{ - int ret = 0; - cuarray dlocal; - cudaError_t err = cudaSuccess; - device_free(dptr); - - if(length>=0 && data!=NULL) - { - err = cudaMalloc(dptr,sizeof(cuarray)); - if(err==cudaSuccess) - { - err = cudaMalloc(&(dlocal.data),sizeof(T)*length); - dlocal.length = length; - - if(err==cudaSuccess) - { - cudaMemcpy(*dptr,&dlocal,sizeof(cuarray),cudaMemcpyHostToDevice); - if(data!=NULL) - err = cudaMemcpy(dlocal.data,data,sizeof(T)*length,cudaMemcpyHostToDevice); - else - err = cudaSuccess; - if(err==cudaSuccess) - { - ret = 1; - } - else - { - ret = -3; - } - } - else - { - ret = -2; - } - } - else - { - ret = -1; - } - } - else - { - dlocal.data = NULL; - dlocal.length = 0; - err = cudaMalloc(dptr,sizeof(cuarray)); - if(err==cudaSuccess) - { - cudaMemcpy(*dptr,&dlocal,sizeof(cuarray),cudaMemcpyHostToDevice); - ret = 1; - } - else - { - ret = -4; - } - } - - - dlocal.data = NULL; - dlocal.length = -1; - - return ret; -} - -template __host__ int cuarray::_device_send_copy(cuarray *dptr) -{ - int ret = 0; - cudaError_t err = cudaSuccess; - T* ddata = NULL; - ddata = device_data_ptr(dptr); - - err = cudaMemcpy(ddata,data,sizeof(T)*length,cudaMemcpyHostToDevice); - if(err==cudaSuccess) - { - ret = 1; - } - else - { - ret = -1; - } - - return ret; -} - -template __host__ int cuarray::device_pull(cuarray *dptr) -{ - int ret = 0; - int dlength; - T* ddata; - cudaError_t err; - - if(dptr==NULL) - { - ret = -1; // null d pointer - return ret; - } - - dlength = device_length(dptr); - if(dlength!=length) - { - this->resize(dlength); - } - - ddata = device_data_ptr(dptr); - - if(length>0 && data!=NULL && ddata!=NULL) - { - err = cudaMemcpy(data,dptr,length*sizeof(T),cudaMemcpyDeviceToHost); - if(err==cudaSuccess) - { - ret = 1; - } - else - { - ret = -2; - } - } - - return ret; -} - -template __host__ int cuarray::device_free(cuarray **dptr) -{ - int ret = 0; - cuarray dlocal; - - if(*dptr!=NULL) - { - cudaMemcpy(&dlocal,dptr,sizeof(cuarray),cudaMemcpyDeviceToHost); - if(dlocal.data!=NULL) - { - cudaFree(dlocal.data); - dlocal.data = NULL; - } - - cudaFree(*dptr); - *dptr = NULL; - ret = 1; - } - - dlocal.data = NULL; - dlocal.length = -1; - - return ret; -} - -template __host__ int cuarray::device_length(cuarray *dptr) -{ - int ret = -1; - cuarray dlocal; - - if(dptr==NULL) - { - return ret; - } - - cudaMemcpy(&dlocal,dptr,sizeof(cuarray),cudaMemcpyDeviceToHost); - ret = dlocal.length; - - dlocal.data = NULL; - dlocal.length = -1; - - return ret; -} - -template __host__ T* cuarray::device_data_ptr(cuarray *dptr) -{ - T* ret = NULL; - cuarray dlocal; - - if(dptr==NULL) - { - return ret; - } - - cudaMemcpy(&dlocal,dptr,sizeof(cuarray),cudaMemcpyDeviceToHost); - ret = dlocal.data; - - dlocal.data = NULL; - dlocal.length = -1; - - return ret; -} - -template __device__ __host__ int cuarray::size() const -{ - return this->length; -} - -template __device__ __host__ T& cuarray::at(const int I) -{ - return this->data[I]; -} - -template __device__ __host__ const T& cuarray::at(const int I) const -{ - return this->data[I]; -} - -template __device__ __host__ T& cuarray::operator[](const int I) -{ - return this->data[I]; -} - -template __device__ __host__ const T& cuarray::operator[](const int I) const -{ - return this->data[I]; -} - -}; - - +#ifndef __CUARRAY_IMPL_HPP__ +#define __CUARRAY_IMPL_HPP__ + +namespace amscuda +{ + +// New Version cuarray +// simpler, less crap going on + +template __device__ __host__ cuarray::cuarray() +{ + length = 0; + data = NULL; +} + +template __device__ __host__ cuarray::~cuarray() +{ + if(data!=NULL) + { + delete[] data; data = NULL; + } + length = 0; +} + +template __device__ __host__ int cuarray::resize(const int _length) +{ + int ret = 0; + + T *newbuffer = NULL; + + if(length==_length) + { + //do nothing + ret = 1; + return ret; + } + if(_length<=0) + { + if(data!=NULL) + { + delete[] data; + data = NULL; + } + length = 0; + ret = 1; + } + + newbuffer = new T[_length]; + if(newbuffer==NULL) + { + ret = -1; //failed to allocate memory + return ret; + } + + int I; + T def; + + if(data!=NULL) + { + for(I=0;I __host__ int cuarray::device_send(cuarray **dptr) +{ + int ret = 0; + int dlength; + + if(*dptr==NULL) + { + ret = _device_send_overwrite(dptr); + } + else + { + dlength = device_length(*dptr); + if(dlength=length) + { + ret = _device_send_copy(*dptr); + } + else + { + ret = _device_send_overwrite(dptr); + } + } + + return ret; +} + +template __host__ int cuarray::_device_send_overwrite(cuarray **dptr) +{ + int ret = 0; + cuarray dlocal; + cudaError_t err = cudaSuccess; + device_free(dptr); + + if(length>=0 && data!=NULL) + { + err = cudaMalloc(dptr,sizeof(cuarray)); + if(err==cudaSuccess) + { + err = cudaMalloc(&(dlocal.data),sizeof(T)*length); + dlocal.length = length; + + if(err==cudaSuccess) + { + cudaMemcpy(*dptr,&dlocal,sizeof(cuarray),cudaMemcpyHostToDevice); + if(data!=NULL) + err = cudaMemcpy(dlocal.data,data,sizeof(T)*length,cudaMemcpyHostToDevice); + else + err = cudaSuccess; + if(err==cudaSuccess) + { + ret = 1; + } + else + { + ret = -3; + } + } + else + { + ret = -2; + } + } + else + { + ret = -1; + } + } + else + { + dlocal.data = NULL; + dlocal.length = 0; + err = cudaMalloc(dptr,sizeof(cuarray)); + if(err==cudaSuccess) + { + cudaMemcpy(*dptr,&dlocal,sizeof(cuarray),cudaMemcpyHostToDevice); + ret = 1; + } + else + { + ret = -4; + } + } + + + dlocal.data = NULL; + dlocal.length = -1; + + return ret; +} + +template __host__ int cuarray::_device_send_copy(cuarray *dptr) +{ + int ret = 0; + cudaError_t err = cudaSuccess; + T* ddata = NULL; + ddata = device_data_ptr(dptr); + + err = cudaMemcpy(ddata,data,sizeof(T)*length,cudaMemcpyHostToDevice); + if(err==cudaSuccess) + { + ret = 1; + } + else + { + ret = -1; + } + + return ret; +} + +template __host__ int cuarray::device_pull(cuarray *dptr) +{ + int ret = 0; + int dlength; + T* ddata; + cudaError_t err; + + if(dptr==NULL) + { + ret = -1; // null d pointer + return ret; + } + + dlength = device_length(dptr); + if(dlength!=length) + { + this->resize(dlength); + } + + ddata = device_data_ptr(dptr); + + if(length>0 && data!=NULL && ddata!=NULL) + { + err = cudaMemcpy(data,dptr,length*sizeof(T),cudaMemcpyDeviceToHost); + if(err==cudaSuccess) + { + ret = 1; + } + else + { + ret = -2; + } + } + + return ret; +} + +template __host__ int cuarray::device_free(cuarray **dptr) +{ + int ret = 0; + cuarray dlocal; + + if(*dptr!=NULL) + { + cudaMemcpy(&dlocal,dptr,sizeof(cuarray),cudaMemcpyDeviceToHost); + if(dlocal.data!=NULL) + { + cudaFree(dlocal.data); + dlocal.data = NULL; + } + + cudaFree(*dptr); + *dptr = NULL; + ret = 1; + } + + dlocal.data = NULL; + dlocal.length = -1; + + return ret; +} + +template __host__ int cuarray::device_length(cuarray *dptr) +{ + int ret = -1; + cuarray dlocal; + + if(dptr==NULL) + { + return ret; + } + + cudaMemcpy(&dlocal,dptr,sizeof(cuarray),cudaMemcpyDeviceToHost); + ret = dlocal.length; + + dlocal.data = NULL; + dlocal.length = -1; + + return ret; +} + +template __host__ T* cuarray::device_data_ptr(cuarray *dptr) +{ + T* ret = NULL; + cuarray dlocal; + + if(dptr==NULL) + { + return ret; + } + + cudaMemcpy(&dlocal,dptr,sizeof(cuarray),cudaMemcpyDeviceToHost); + ret = dlocal.data; + + dlocal.data = NULL; + dlocal.length = -1; + + return ret; +} + +template __device__ __host__ int cuarray::size() const +{ + return this->length; +} + +template __device__ __host__ T& cuarray::at(const int I) +{ + return this->data[I]; +} + +template __device__ __host__ const T& cuarray::at(const int I) const +{ + return this->data[I]; +} + +template __device__ __host__ T& cuarray::operator[](const int I) +{ + return this->data[I]; +} + +template __device__ __host__ const T& cuarray::operator[](const int I) const +{ + return this->data[I]; +} + +}; + + #endif \ No newline at end of file diff --git a/include/amsculib2/amscuda_binarrrw.hpp b/include/amsculib2/amscuda_binarrrw.hpp index 14f3dde..0014d35 100644 --- a/include/amsculib2/amscuda_binarrrw.hpp +++ b/include/amsculib2/amscuda_binarrrw.hpp @@ -1,19 +1,19 @@ -#ifndef __AMSCUDA_BINARRRW_HPP__ -#define __AMSCUDA_BINARRRW_HPP__ - -namespace amscuda -{ - -template int fread_ndarray(FILE *fp, cuarray *shape, cuarray *buffer); -template int fwrite_ndarray(FILE *fp, const cuarray *shape, const cuarray *buffer); - -template int fwrite_buffer(FILE *fp, const int N, const T *buffer); -template int fread_buffer(FILE *fp, const int Nmax, const T *buffer); - - -}; //end namespace amscuda - -#include - -#endif - +#ifndef __AMSCUDA_BINARRRW_HPP__ +#define __AMSCUDA_BINARRRW_HPP__ + +namespace amscuda +{ + +template int fread_ndarray(FILE *fp, cuarray *shape, cuarray *buffer); +template int fwrite_ndarray(FILE *fp, const cuarray *shape, const cuarray *buffer); + +template int fwrite_buffer(FILE *fp, const int N, const T *buffer); +template int fread_buffer(FILE *fp, const int Nmax, const T *buffer); + + +}; //end namespace amscuda + +#include + +#endif + diff --git a/include/amsculib2/amscuda_binarrrw_impl.hpp b/include/amsculib2/amscuda_binarrrw_impl.hpp index 84674d1..f797f73 100644 --- a/include/amsculib2/amscuda_binarrrw_impl.hpp +++ b/include/amsculib2/amscuda_binarrrw_impl.hpp @@ -1,194 +1,194 @@ -#ifndef __AMSCUDA_BINARRRW_IMPL_HPP__ -#define __AMSCUDA_BINARRRW_IMPL_HPP__ - -namespace amscuda -{ - -template int fread_ndarray(FILE *fp, cuarray *shape, cuarray *buffer) -{ - int ret = 1; - int I; - long piprod; - int32_t q; - int cnt; - - int32_t Nd; - - if(fp!=NULL) - { - if(!feof(fp)) - { - cnt = fread(&Nd,sizeof(int32_t),1,fp); - if(Nd>0 && cnt>0) - { - shape->resize(Nd); - piprod = 1; - for(I=0;Iat(I) = q; - if(q>0) - { - piprod = piprod*q; - } - else - { - piprod = 0; - } - } - - buffer->resize(piprod); - if(piprod>0) - { - cnt = fread((buffer->data),sizeof(T),piprod,fp); - if(piprod==cnt) - { - ret = 1; - } - else - { - printf("fread_ndarray, read %d values, expecting %ld\n",cnt,piprod); - ret = 0; - } - } - } - else - { - printf("fread_ndarray: Read a number of dimensions<=0.\n"); - Nd = 0; - shape->resize(0); - buffer->resize(0); - } - } - else - { - printf("fread_ndarray: fp=NULL.\n"); - ret = 0; - } - } - else - { - ret = 0; - } - - return ret; -} - -template int fwrite_ndarray(FILE *fp, const cuarray *shape, const cuarray *buffer) -{ - int ret = 1; - long piprod; - int I; - int32_t Nd; - - if(fp==NULL) - { - ret = 0; - printf("fwrite_ndarray: fp=NULL\n"); - return ret; - } - - piprod = 1; - for(I=0;Isize();I++) - { - if(shape->at(I)>0) - { - piprod = piprod*shape->at(I); - } - else - { - piprod = 0; - } - } - - Nd = (int32_t) shape->size(); - - if(piprod!=buffer->size()) - { - ret = 0; - printf("fwrite_ndarray: buffer is size %ld, while shape is size %ld\n",(long)buffer->size(),(long)piprod); - return ret; - } - - fwrite(&Nd,sizeof(int32_t),1,fp); - if(Nd>0) - { - fwrite(shape->data,sizeof(int32_t),Nd,fp); - if(piprod>0) - { - fwrite(buffer->data,sizeof(T),buffer->size(),fp); - } - } - - return ret; -} - -template int fwrite_buffer(FILE *fp, const int N, const T *buffer) -{ - int ret = 0; - int Nd = 1; - - if(fp==NULL) - { - ret = 0; - printf("fwrite_buffer: fp=NULL\n"); - return ret; - } - - fwrite(&Nd,sizeof(int32_t),1,fp); - fwrite(&N,sizeof(int32_t),1,fp); - fwrite(buffer,sizeof(T),N,fp); - - return ret; -} - -template int fread_buffer(FILE *fp, const int Nmax, const T *buffer) -{ - int ret = 0; - - int cnt; - int32_t Nd; - int32_t *dims = NULL; - int piprod; - int32_t q; - int I; - - int Nr; - - - if(fp==NULL) {ret = -1; return ret;} - if(feof(fp)) {ret = -2; return ret;} - - cnt = fread(&Nd,sizeof(int32_t),1,fp); - if(Nd>0 && cnt>0) - { - piprod = 1; - dims = new(std::nothrow) int32_t[Nd]; - for(I=0;I(Nmax,piprod); - cnt = fread(buffer,sizeof(T),Nr,fp); - } - - if(dims!=NULL) {delete[] dims; dims=NULL;} - - return ret; -} - -}; //end namespace amscuda - -#endif - +#ifndef __AMSCUDA_BINARRRW_IMPL_HPP__ +#define __AMSCUDA_BINARRRW_IMPL_HPP__ + +namespace amscuda +{ + +template int fread_ndarray(FILE *fp, cuarray *shape, cuarray *buffer) +{ + int ret = 1; + int I; + long piprod; + int32_t q; + int cnt; + + int32_t Nd; + + if(fp!=NULL) + { + if(!feof(fp)) + { + cnt = fread(&Nd,sizeof(int32_t),1,fp); + if(Nd>0 && cnt>0) + { + shape->resize(Nd); + piprod = 1; + for(I=0;Iat(I) = q; + if(q>0) + { + piprod = piprod*q; + } + else + { + piprod = 0; + } + } + + buffer->resize(piprod); + if(piprod>0) + { + cnt = fread((buffer->data),sizeof(T),piprod,fp); + if(piprod==cnt) + { + ret = 1; + } + else + { + printf("fread_ndarray, read %d values, expecting %ld\n",cnt,piprod); + ret = 0; + } + } + } + else + { + printf("fread_ndarray: Read a number of dimensions<=0.\n"); + Nd = 0; + shape->resize(0); + buffer->resize(0); + } + } + else + { + printf("fread_ndarray: fp=NULL.\n"); + ret = 0; + } + } + else + { + ret = 0; + } + + return ret; +} + +template int fwrite_ndarray(FILE *fp, const cuarray *shape, const cuarray *buffer) +{ + int ret = 1; + long piprod; + int I; + int32_t Nd; + + if(fp==NULL) + { + ret = 0; + printf("fwrite_ndarray: fp=NULL\n"); + return ret; + } + + piprod = 1; + for(I=0;Isize();I++) + { + if(shape->at(I)>0) + { + piprod = piprod*shape->at(I); + } + else + { + piprod = 0; + } + } + + Nd = (int32_t) shape->size(); + + if(piprod!=buffer->size()) + { + ret = 0; + printf("fwrite_ndarray: buffer is size %ld, while shape is size %ld\n",(long)buffer->size(),(long)piprod); + return ret; + } + + fwrite(&Nd,sizeof(int32_t),1,fp); + if(Nd>0) + { + fwrite(shape->data,sizeof(int32_t),Nd,fp); + if(piprod>0) + { + fwrite(buffer->data,sizeof(T),buffer->size(),fp); + } + } + + return ret; +} + +template int fwrite_buffer(FILE *fp, const int N, const T *buffer) +{ + int ret = 0; + int Nd = 1; + + if(fp==NULL) + { + ret = 0; + printf("fwrite_buffer: fp=NULL\n"); + return ret; + } + + fwrite(&Nd,sizeof(int32_t),1,fp); + fwrite(&N,sizeof(int32_t),1,fp); + fwrite(buffer,sizeof(T),N,fp); + + return ret; +} + +template int fread_buffer(FILE *fp, const int Nmax, const T *buffer) +{ + int ret = 0; + + int cnt; + int32_t Nd; + int32_t *dims = NULL; + int piprod; + int32_t q; + int I; + + int Nr; + + + if(fp==NULL) {ret = -1; return ret;} + if(feof(fp)) {ret = -2; return ret;} + + cnt = fread(&Nd,sizeof(int32_t),1,fp); + if(Nd>0 && cnt>0) + { + piprod = 1; + dims = new(std::nothrow) int32_t[Nd]; + for(I=0;I(Nmax,piprod); + cnt = fread(buffer,sizeof(T),Nr,fp); + } + + if(dims!=NULL) {delete[] dims; dims=NULL;} + + return ret; +} + +}; //end namespace amscuda + +#endif + diff --git a/include/amsculib2/amscugeom.hpp b/include/amsculib2/amscugeom.hpp index 60ec3fb..d927c71 100644 --- a/include/amsculib2/amscugeom.hpp +++ b/include/amsculib2/amscugeom.hpp @@ -1,11 +1,11 @@ -#ifndef __AMSCUGEOM_HPP__ -#define __AMSCUGEOM_HPP__ - -namespace amscuda -{ - - -}; //end namespace amscuda - -#endif - +#ifndef __AMSCUGEOM_HPP__ +#define __AMSCUGEOM_HPP__ + +namespace amscuda +{ + + +}; //end namespace amscuda + +#endif + diff --git a/include/amsculib2/amsculib2.hpp b/include/amsculib2/amsculib2.hpp index 4f5e412..2c026c4 100644 --- a/include/amsculib2/amsculib2.hpp +++ b/include/amsculib2/amsculib2.hpp @@ -1,70 +1,70 @@ -#ifndef __AMSCULIB2_HPP__ -#define __AMSCULIB2_HPP__ - -//Std Lib Includes -#include -#include -#include -#include -#include -#include - -#include //where all the cuda functions live -#include -#include - -//Dependencies - -//Predeclarations -class cuvect2; -class cuvect3; -class cuvect4; -class cuvect2f; -class cuvect3f; -class cuvect4f; - -//Need a way to define the same symbols using both host and device code -//A solution was found here: https://stackoverflow.com/questions/9457572/cuda-host-and-device-using-same-constant-memory -#ifdef __CUDA_ARCH__ -#define AMSCU_CONST __constant__ -#else -#define AMSCU_CONST -#endif - -namespace amscuda -{ - - //default thread and block execution - AMSCU_CONST static const int amscu_defnblocks = 256; - AMSCU_CONST static const int amscu_defnthreads = 512; - - //default numthreads to execute on cpu - AMSCU_CONST static const int amscu_defcputhreads = 8; - -}; //end namespace amscuda - -//Components -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include - - - - -#endif - +#ifndef __AMSCULIB2_HPP__ +#define __AMSCULIB2_HPP__ + +//Std Lib Includes +#include +#include +#include +#include +#include +#include + +#include //where all the cuda functions live +#include +#include + +//Dependencies + +//Predeclarations +class cuvect2; +class cuvect3; +class cuvect4; +class cuvect2f; +class cuvect3f; +class cuvect4f; + +//Need a way to define the same symbols using both host and device code +//A solution was found here: https://stackoverflow.com/questions/9457572/cuda-host-and-device-using-same-constant-memory +#ifdef __CUDA_ARCH__ +#define AMSCU_CONST __constant__ +#else +#define AMSCU_CONST +#endif + +namespace amscuda +{ + + //default thread and block execution + AMSCU_CONST static const int amscu_defnblocks = 256; + AMSCU_CONST static const int amscu_defnthreads = 512; + + //default numthreads to execute on cpu + AMSCU_CONST static const int amscu_defcputhreads = 8; + +}; //end namespace amscuda + +//Components +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + + + + +#endif + diff --git a/include/amsculib2/amscumath.hpp b/include/amsculib2/amscumath.hpp index 574a1c2..3c8d598 100644 --- a/include/amsculib2/amscumath.hpp +++ b/include/amsculib2/amscumath.hpp @@ -1,56 +1,56 @@ -#ifndef __AMSCUMATH_HPP__ -#define __AMSCUMATH_HPP__ - -namespace amscuda -{ - - - - //Problem: These are not in the namespace - //#define nan NAN - //#define fnan (float) NAN - //#define inf INFINITY - //#define finf (float) INFINITY - //#define pi 3.1415926535897936 - - //These need to be the same symbol for both host and device code - AMSCU_CONST static const double nan = NAN; - AMSCU_CONST static const float fnan = (float) NAN; - AMSCU_CONST static const double inf = INFINITY; - AMSCU_CONST static const float finf = (float) INFINITY; - AMSCU_CONST static const double pi = 3.1415926535897936; - AMSCU_CONST static const float pif = 3.1415926535897936; - - __host__ __device__ double dabs(double x); - __host__ __device__ float fabs(float x); - - template __host__ __device__ T abs(const T in) - { - T ret = in; - if(in<0) ret = -in; - return ret; - } - - __host__ __device__ double mod(double a, double md); - __host__ __device__ float mod(float a, float md); - __host__ __device__ int mod(int x, int n); - __host__ __device__ long mod(long x, long n); - - __host__ __device__ int truediv(int x, int y); - __host__ __device__ long truediv(long x, long y); - - template __host__ __device__ T min(T a, T b); - template __host__ __device__ T max(T a, T b); - - __device__ __host__ double arg(double x, double y); - __device__ __host__ void get_azel(double x, double y, double z, double *az, double *el); - - void test_amscumath1(); - - -}; //end namespace amscuda - -#include - -#endif - +#ifndef __AMSCUMATH_HPP__ +#define __AMSCUMATH_HPP__ + +namespace amscuda +{ + + + + //Problem: These are not in the namespace + //#define nan NAN + //#define fnan (float) NAN + //#define inf INFINITY + //#define finf (float) INFINITY + //#define pi 3.1415926535897936 + + //These need to be the same symbol for both host and device code + AMSCU_CONST static const double nan = NAN; + AMSCU_CONST static const float fnan = (float) NAN; + AMSCU_CONST static const double inf = INFINITY; + AMSCU_CONST static const float finf = (float) INFINITY; + AMSCU_CONST static const double pi = 3.1415926535897936; + AMSCU_CONST static const float pif = 3.1415926535897936; + + __host__ __device__ double dabs(double x); + __host__ __device__ float fabs(float x); + + template __host__ __device__ T abs(const T in) + { + T ret = in; + if(in<0) ret = -in; + return ret; + } + + __host__ __device__ double mod(double a, double md); + __host__ __device__ float mod(float a, float md); + __host__ __device__ int mod(int x, int n); + __host__ __device__ long mod(long x, long n); + + __host__ __device__ int truediv(int x, int y); + __host__ __device__ long truediv(long x, long y); + + template __host__ __device__ T min(T a, T b); + template __host__ __device__ T max(T a, T b); + + __device__ __host__ double arg(double x, double y); + __device__ __host__ void get_azel(double x, double y, double z, double *az, double *el); + + void test_amscumath1(); + + +}; //end namespace amscuda + +#include + +#endif + diff --git a/include/amsculib2/amscumath_impl.hpp b/include/amsculib2/amscumath_impl.hpp index 4a43e06..6e4b838 100644 --- a/include/amsculib2/amscumath_impl.hpp +++ b/include/amsculib2/amscumath_impl.hpp @@ -1,42 +1,42 @@ -#ifndef __AMSCUMATH_IMPL_HPP__ -#define __AMSCUMATH_IMPL_HPP__ - -namespace amscuda -{ - -template __host__ __device__ T min(T a, T b) -{ - if(a>b) - { - return b; - } - else - { - return a; - } - return a; -} - -template __host__ __device__ T max(T a, T b) -{ - if(a>b) - { - return a; - } - else - { - return b; - } - return a; -} - -template<> __host__ __device__ double min(double a, double b); -template<> __host__ __device__ float min(float a, float b); -template<> __host__ __device__ double max(double a, double b); -template<> __host__ __device__ float max(float a, float b); - - -}; //end namespace amscuda - -#endif - +#ifndef __AMSCUMATH_IMPL_HPP__ +#define __AMSCUMATH_IMPL_HPP__ + +namespace amscuda +{ + +template __host__ __device__ T min(T a, T b) +{ + if(a>b) + { + return b; + } + else + { + return a; + } + return a; +} + +template __host__ __device__ T max(T a, T b) +{ + if(a>b) + { + return a; + } + else + { + return b; + } + return a; +} + +template<> __host__ __device__ double min(double a, double b); +template<> __host__ __device__ float min(float a, float b); +template<> __host__ __device__ double max(double a, double b); +template<> __host__ __device__ float max(float a, float b); + + +}; //end namespace amscuda + +#endif + diff --git a/include/amsculib2/amscurarray.cuh b/include/amsculib2/amscurarray.cuh index cd9ff0b..ec2a210 100644 --- a/include/amsculib2/amscurarray.cuh +++ b/include/amsculib2/amscurarray.cuh @@ -1,66 +1,66 @@ -#ifndef __AMSCURARRAY_HPP__ -#define __AMSCURARRAY_HPP__ - -namespace amscuda -{ - -//Cuda ragged array class -template class curarray -{ -public: - int device; - curarray* devptr; //pointer to mirror class on the device - - int Narrays; //number of arrays - - int *N; //dimension of each array - T** hostarrayptrs; //pointers to each array on the host - null on the device - T** devarrayptrs; //pointers to each array on the device - //the double pointer is a host pointer to device pointers on the host class - //for the device class, only the second set of arrays is in use - - //the constructor and destructor set all pointers to NULL, they - // do *not* manage memory. This is done with curarray_new and curarray_delete - __device__ __host__ curarray(); - __device__ __host__ ~curarray(); - - __host__ int push(); - __host__ int pull(); - //__device__ int dev_resizearray(int arraynum, int arraysize); - __host__ int resizearray(int arraynum, int arraysize); - // I may want a way to resize arrays on the device without pushing/pulling all the array contents - - -}; - -template int curarray_new(curarray** ptr, int Narrays); - -template int curarray_delete(curarray** ptr); - -template int curarray_device_new(curarray *hostptr); - -template int curarray_device_delete(curarray *hostptr); - -template int curarray_push(curarray *hostptr); - -template int curarray_pull(curarray *hostptr); - - -//template int curarray_host_fillall(curarray *hostptr, const T &val); -//template int curarray_device_fillall(curarray *hostptr, const T &val); - - -//template __host__ int curarray_deletearray(curarray *hostptr, int arrayindex); -//template __device__ int curarray_dev_deletearray(curarray *devptr, int arrayindex); - -//template __host__ int curarray_allocarray(curarray *hostptr, int arrayindex, int size); -//template __device__ int curarray_dev_allocarray(curarray *devptr, int arrayindex, int size); - - -void test_amscurarray1(); - -}; - -#include - +#ifndef __AMSCURARRAY_HPP__ +#define __AMSCURARRAY_HPP__ + +namespace amscuda +{ + +//Cuda ragged array class +template class curarray +{ +public: + int device; + curarray* devptr; //pointer to mirror class on the device + + int Narrays; //number of arrays + + int *N; //dimension of each array + T** hostarrayptrs; //pointers to each array on the host - null on the device + T** devarrayptrs; //pointers to each array on the device + //the double pointer is a host pointer to device pointers on the host class + //for the device class, only the second set of arrays is in use + + //the constructor and destructor set all pointers to NULL, they + // do *not* manage memory. This is done with curarray_new and curarray_delete + __device__ __host__ curarray(); + __device__ __host__ ~curarray(); + + __host__ int push(); + __host__ int pull(); + //__device__ int dev_resizearray(int arraynum, int arraysize); + __host__ int resizearray(int arraynum, int arraysize); + // I may want a way to resize arrays on the device without pushing/pulling all the array contents + + +}; + +template int curarray_new(curarray** ptr, int Narrays); + +template int curarray_delete(curarray** ptr); + +template int curarray_device_new(curarray *hostptr); + +template int curarray_device_delete(curarray *hostptr); + +template int curarray_push(curarray *hostptr); + +template int curarray_pull(curarray *hostptr); + + +//template int curarray_host_fillall(curarray *hostptr, const T &val); +//template int curarray_device_fillall(curarray *hostptr, const T &val); + + +//template __host__ int curarray_deletearray(curarray *hostptr, int arrayindex); +//template __device__ int curarray_dev_deletearray(curarray *devptr, int arrayindex); + +//template __host__ int curarray_allocarray(curarray *hostptr, int arrayindex, int size); +//template __device__ int curarray_dev_allocarray(curarray *devptr, int arrayindex, int size); + + +void test_amscurarray1(); + +}; + +#include + #endif \ No newline at end of file diff --git a/include/amsculib2/amscurarray_impl.cuh b/include/amsculib2/amscurarray_impl.cuh index d3841f7..76ba783 100644 --- a/include/amsculib2/amscurarray_impl.cuh +++ b/include/amsculib2/amscurarray_impl.cuh @@ -1,529 +1,529 @@ -#ifndef __AMSCURARRAY_IMPL_HPP__ -#define __AMSCURARRAY_IMPL_HPP__ - -namespace amscuda -{ - -template curarray::curarray() -{ - device = -1; - devptr = NULL; - Narrays = 0; - N = NULL; - hostarrayptrs = NULL; - devarrayptrs = NULL; - -} - -template curarray::~curarray() -{ - device = -1; - devptr = NULL; - Narrays = 0; - N = NULL; - hostarrayptrs = NULL; - devarrayptrs = NULL; - -} - -template int curarray_new(curarray** ptr, int Narrays) -{ - int ret = 0; - int device; - curarray *lhptr = *ptr; - - cudaGetDevice(&device); - - if(lhptr!=NULL) - { - curarray_delete(ptr); - } - - *ptr = new(std::nothrow) curarray(); - lhptr = *ptr; - - int I; - - if(Narrays<0) Narrays=0; - - lhptr->Narrays = Narrays; - lhptr->device = device; - lhptr->N = new(std::nothrow) int[Narrays]; - lhptr->hostarrayptrs = new(std::nothrow) T*[Narrays]; - lhptr->devarrayptrs = new(std::nothrow) T*[Narrays]; - - for(I=0;IN[I] = 0; - lhptr->hostarrayptrs[I] = NULL; - lhptr->devarrayptrs[I] = NULL; - } - - curarray_device_new(lhptr); - - return ret; -} - -template int curarray_delete(curarray** ptr) -{ - int ret = 0; - curarray *lptr = NULL; - int olddev; - - int I; - - if(*ptr==NULL) - { - return 0; - } - lptr = *ptr; - - cudaGetDevice(&olddev); - cudaSetDevice(lptr->device); - - - if(lptr->devptr!=NULL) - { - curarray_device_delete(lptr); - } - - lptr->device = -1; - - for(I=0;INarrays;I++) - { - if(lptr->hostarrayptrs!=NULL) - { - if(lptr->hostarrayptrs[I]!=NULL) - { - delete[] lptr->hostarrayptrs[I]; - lptr->hostarrayptrs[I] = NULL; - } - } - - if(lptr->devarrayptrs!=NULL) - { - if(lptr->devarrayptrs[I]!=NULL) - { - //erasing device memory should have been handled in curarray_device_delete - lptr->devarrayptrs[I] = NULL; - } - } - - lptr->N[I] = 0; - } - - if(lptr->N != NULL) {delete[] lptr->N; lptr->N = NULL;} - if(lptr->hostarrayptrs!=NULL) {delete[] lptr->hostarrayptrs; lptr->hostarrayptrs=NULL;} - if(lptr->devarrayptrs!=NULL) {delete[] lptr->devarrayptrs; lptr->devarrayptrs=NULL;} - - if(*ptr!=NULL) {delete *ptr; *ptr = NULL;} - - cudaSetDevice(olddev); - - return ret; -} - -template int curarray_device_new(curarray *hostptr) -{ - int ret = 0; - curarray ldevdata; - - if(hostptr==NULL) return -1; - if(hostptr->devptr!=NULL) - { - curarray_device_delete(hostptr); - } - - int I; - - cudaGetDevice(&(hostptr->device)); - - ldevdata.device = hostptr->device; - ldevdata.Narrays = hostptr->Narrays; - int Narrays = hostptr->Narrays; - - cudaMalloc(&(ldevdata.N),sizeof(int)*Narrays); - cudaMemcpy(ldevdata.N,hostptr->N,sizeof(int)*Narrays,cudaMemcpyHostToDevice); - - ldevdata.hostarrayptrs = NULL; - - for(I=0;IN[I]>0) - { - if(hostptr->devarrayptrs[I]!=NULL) - { - cudaFree(hostptr->devarrayptrs[I]); - hostptr->devarrayptrs[I] = NULL; - } - - cudaMalloc(&(hostptr->devarrayptrs[I]),sizeof(T)*hostptr->N[I]); - cudaMemcpy(hostptr->devarrayptrs[I],hostptr->hostarrayptrs[I],sizeof(T)*hostptr->N[I],cudaMemcpyHostToDevice); - } - else - { - if(hostptr->devarrayptrs[I]!=NULL) - { - cudaFree(hostptr->devarrayptrs[I]); - hostptr->devarrayptrs[I] = NULL; - } - } - } - - cudaMalloc(&(ldevdata.devarrayptrs),sizeof(T*)*Narrays); - cudaMemcpy(ldevdata.devarrayptrs,hostptr->devarrayptrs,sizeof(T*)*Narrays,cudaMemcpyHostToDevice); - - cudaMalloc(&(hostptr->devptr),sizeof(curarray)); - cudaMemcpy(hostptr->devptr,&ldevdata,sizeof(curarray),cudaMemcpyHostToDevice); - - ret = 1; - - return ret; -} - -template int curarray_device_delete(curarray *hostptr) -{ - int ret = 0; - - curarray ldevdata; - int olddev; - - if(hostptr->devptr==NULL) - { - return 0; - } - - cudaGetDevice(&olddev); - cudaSetDevice(hostptr->device); - - cudaMemcpy(&ldevdata,hostptr->devptr,sizeof(curarray),cudaMemcpyDeviceToHost); - - int I; - for(I=0;INarrays;I++) - { - if(hostptr->devarrayptrs[I]!=NULL) - { - cudaFree(hostptr->devarrayptrs[I]); - hostptr->devarrayptrs[I] = NULL; - } - } - - cudaFree(ldevdata.devarrayptrs); - cudaFree(ldevdata.N); - - cudaFree(hostptr->devptr); - hostptr->devptr = NULL; - hostptr->device = -1; - - cudaSetDevice(olddev); - - ret = 1; - return ret; -} - -template int curarray_push(curarray *hostptr) -{ - int ret = 0; - - int olddev; - - curarray ldevdata; - T** ldevarrayptrs = NULL; - int *devN = NULL; - - if(hostptr==NULL) return -1; - - cudaGetDevice(&olddev); - cudaSetDevice(hostptr->device); - - int Narrays = hostptr->Narrays; - - cudaMemcpy(&ldevdata,hostptr->devptr,sizeof(curarray),cudaMemcpyDeviceToHost); - ldevarrayptrs = new(std::nothrow) T*[Narrays]; - devN = new(std::nothrow) int[Narrays]; - - cudaMemcpy(ldevarrayptrs,ldevdata.devarrayptrs,sizeof(T*)*Narrays,cudaMemcpyDeviceToHost); - cudaMemcpy(devN,ldevdata.N,sizeof(int)*Narrays,cudaMemcpyDeviceToHost); - - - int I; - - for(I=0;IN[I]!=devN[I]) || - (hostptr->devarrayptrs[I] != ldevarrayptrs[I]) - ) - { - cudaFree(ldevarrayptrs[I]); - ldevarrayptrs[I] = NULL; - hostptr->devarrayptrs[I] = NULL; - - if(hostptr->N[I]>0) - { - cudaMalloc(&(hostptr->devarrayptrs[I]),sizeof(T)*hostptr->N[I]); - ldevarrayptrs[I] = hostptr->devarrayptrs[I]; - devN[I] = hostptr->N[I]; - } - else - { - devN[I] = 0; - } - } - if(hostptr->N[I]>0) - { - //copy host data to device - cudaMemcpy(hostptr->devarrayptrs[I],hostptr->hostarrayptrs[I],sizeof(T)*hostptr->N[I],cudaMemcpyHostToDevice); - } - } //for each array - - //rectify and copy device data structure to device - ldevdata.device = hostptr->device; - ldevdata.devptr = NULL; - ldevdata.Narrays = hostptr->Narrays; //later - logic for dealing with when this is not true - ldevdata.hostarrayptrs = NULL; - - - cudaMemcpy(ldevdata.N,hostptr->N,sizeof(int)*Narrays,cudaMemcpyHostToDevice); - cudaMemcpy(ldevdata.devarrayptrs,hostptr->devarrayptrs,sizeof(T*)*Narrays,cudaMemcpyHostToDevice); - - cudaMemcpy(hostptr->devptr,&ldevdata,sizeof(curarray),cudaMemcpyHostToDevice); - - cuda_errortrap("curarray_push cuda error:"); - - cudaSetDevice(olddev); - - delete[] ldevarrayptrs; - delete[] devN; - - return ret; -} - -template int curarray_pull(curarray *hostptr) -{ - int ret = 0; - - int olddev; - - curarray ldevdata; - T** ldevarrayptrs = NULL; - int *devN = NULL; - - if(hostptr==NULL) return -1; - - cudaGetDevice(&olddev); - cudaSetDevice(hostptr->device); - - cuda_errortrap("dbg1"); - - int Narrays = hostptr->Narrays; - - cudaMemcpy(&ldevdata,hostptr->devptr,sizeof(curarray),cudaMemcpyDeviceToHost); - ldevarrayptrs = new(std::nothrow) T*[Narrays]; - devN = new(std::nothrow) int[Narrays]; - - cuda_errortrap("dbg2"); - - cudaMemcpy(ldevarrayptrs,ldevdata.devarrayptrs,sizeof(T*)*Narrays,cudaMemcpyDeviceToHost); - cudaMemcpy(devN,ldevdata.N,sizeof(int)*Narrays,cudaMemcpyDeviceToHost); - - cuda_errortrap("dbg3"); - char dbgjnk[50]; - - int I; - for(I=0;Idevarrayptrs[I] != ldevarrayptrs[I]) - { - hostptr->devarrayptrs[I] = ldevarrayptrs[I]; - } - - if(hostptr->N[I]!=devN[I]) - { - if(hostptr->hostarrayptrs[I]!=NULL) - { - delete[] hostptr->hostarrayptrs[I]; - hostptr->hostarrayptrs[I] = NULL; - } - - if(devN[I]>0) - { - hostptr->hostarrayptrs[I] = new(std::nothrow) T[devN[I]]; - hostptr->N[I] = devN[I]; - } - else - { - hostptr->N[I] = 0; - } - } - - if(hostptr->hostarrayptrs[I]!=NULL && hostptr->devarrayptrs[I]!=NULL) - { - cudaMemcpy(hostptr->hostarrayptrs[I],hostptr->devarrayptrs[I],sizeof(T)*hostptr->N[I],cudaMemcpyDeviceToHost); - sprintf(dbgjnk,"%d dbg %d",I,hostptr->N[I]); - cuda_errortrap(dbgjnk); - } - } //for each array - - //for the pull operation, I don't think any update of the device data structure is necessary - - cudaSetDevice(olddev); - - delete[] ldevarrayptrs; - delete[] devN; - - return ret; -} - -template __host__ int curarray::push() -{ - return curarray_push(this); -} -template __host__ int curarray::pull() -{ - return curarray_pull(this); -} - - - -/* -https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#memory-allocation-and-lifetime%5B/url%5D - -cudaMalloc() and cudaFree() have distinct semantics between the host and -device environments. When invoked from the host, cudaMalloc() allocates a -new region from unused device memory. When invoked from the device runtime -these functions map to device-side malloc() and free(). This implies that -within the device environment the total allocatable memory is limited to the -device malloc() heap size, which may be smaller than the available unused -device memory. Also, it is an error to invoke cudaFree() from the host -program on a pointer which was allocated by cudaMalloc() on the device -or vice-versa. - -So, basically this entire function is not going to work. I'll be unable to resize within -a kernel. -*/ - -/* - -template __device__ int curarray::dev_resizearray(int arraynum, int arraysize) -{ - int ret = 0; - T* newptr = NULL; - int I; - T def; - - if(arraynum>=0 && arraynum __host__ int curarray::resizearray(int arraynum, int arraysize) -{ - int ret = 0; - T* newptr = NULL; - int I; - T def; - - if(arraynum>=0 && arraynum curarray::curarray() +{ + device = -1; + devptr = NULL; + Narrays = 0; + N = NULL; + hostarrayptrs = NULL; + devarrayptrs = NULL; + +} + +template curarray::~curarray() +{ + device = -1; + devptr = NULL; + Narrays = 0; + N = NULL; + hostarrayptrs = NULL; + devarrayptrs = NULL; + +} + +template int curarray_new(curarray** ptr, int Narrays) +{ + int ret = 0; + int device; + curarray *lhptr = *ptr; + + cudaGetDevice(&device); + + if(lhptr!=NULL) + { + curarray_delete(ptr); + } + + *ptr = new(std::nothrow) curarray(); + lhptr = *ptr; + + int I; + + if(Narrays<0) Narrays=0; + + lhptr->Narrays = Narrays; + lhptr->device = device; + lhptr->N = new(std::nothrow) int[Narrays]; + lhptr->hostarrayptrs = new(std::nothrow) T*[Narrays]; + lhptr->devarrayptrs = new(std::nothrow) T*[Narrays]; + + for(I=0;IN[I] = 0; + lhptr->hostarrayptrs[I] = NULL; + lhptr->devarrayptrs[I] = NULL; + } + + curarray_device_new(lhptr); + + return ret; +} + +template int curarray_delete(curarray** ptr) +{ + int ret = 0; + curarray *lptr = NULL; + int olddev; + + int I; + + if(*ptr==NULL) + { + return 0; + } + lptr = *ptr; + + cudaGetDevice(&olddev); + cudaSetDevice(lptr->device); + + + if(lptr->devptr!=NULL) + { + curarray_device_delete(lptr); + } + + lptr->device = -1; + + for(I=0;INarrays;I++) + { + if(lptr->hostarrayptrs!=NULL) + { + if(lptr->hostarrayptrs[I]!=NULL) + { + delete[] lptr->hostarrayptrs[I]; + lptr->hostarrayptrs[I] = NULL; + } + } + + if(lptr->devarrayptrs!=NULL) + { + if(lptr->devarrayptrs[I]!=NULL) + { + //erasing device memory should have been handled in curarray_device_delete + lptr->devarrayptrs[I] = NULL; + } + } + + lptr->N[I] = 0; + } + + if(lptr->N != NULL) {delete[] lptr->N; lptr->N = NULL;} + if(lptr->hostarrayptrs!=NULL) {delete[] lptr->hostarrayptrs; lptr->hostarrayptrs=NULL;} + if(lptr->devarrayptrs!=NULL) {delete[] lptr->devarrayptrs; lptr->devarrayptrs=NULL;} + + if(*ptr!=NULL) {delete *ptr; *ptr = NULL;} + + cudaSetDevice(olddev); + + return ret; +} + +template int curarray_device_new(curarray *hostptr) +{ + int ret = 0; + curarray ldevdata; + + if(hostptr==NULL) return -1; + if(hostptr->devptr!=NULL) + { + curarray_device_delete(hostptr); + } + + int I; + + cudaGetDevice(&(hostptr->device)); + + ldevdata.device = hostptr->device; + ldevdata.Narrays = hostptr->Narrays; + int Narrays = hostptr->Narrays; + + cudaMalloc(&(ldevdata.N),sizeof(int)*Narrays); + cudaMemcpy(ldevdata.N,hostptr->N,sizeof(int)*Narrays,cudaMemcpyHostToDevice); + + ldevdata.hostarrayptrs = NULL; + + for(I=0;IN[I]>0) + { + if(hostptr->devarrayptrs[I]!=NULL) + { + cudaFree(hostptr->devarrayptrs[I]); + hostptr->devarrayptrs[I] = NULL; + } + + cudaMalloc(&(hostptr->devarrayptrs[I]),sizeof(T)*hostptr->N[I]); + cudaMemcpy(hostptr->devarrayptrs[I],hostptr->hostarrayptrs[I],sizeof(T)*hostptr->N[I],cudaMemcpyHostToDevice); + } + else + { + if(hostptr->devarrayptrs[I]!=NULL) + { + cudaFree(hostptr->devarrayptrs[I]); + hostptr->devarrayptrs[I] = NULL; + } + } + } + + cudaMalloc(&(ldevdata.devarrayptrs),sizeof(T*)*Narrays); + cudaMemcpy(ldevdata.devarrayptrs,hostptr->devarrayptrs,sizeof(T*)*Narrays,cudaMemcpyHostToDevice); + + cudaMalloc(&(hostptr->devptr),sizeof(curarray)); + cudaMemcpy(hostptr->devptr,&ldevdata,sizeof(curarray),cudaMemcpyHostToDevice); + + ret = 1; + + return ret; +} + +template int curarray_device_delete(curarray *hostptr) +{ + int ret = 0; + + curarray ldevdata; + int olddev; + + if(hostptr->devptr==NULL) + { + return 0; + } + + cudaGetDevice(&olddev); + cudaSetDevice(hostptr->device); + + cudaMemcpy(&ldevdata,hostptr->devptr,sizeof(curarray),cudaMemcpyDeviceToHost); + + int I; + for(I=0;INarrays;I++) + { + if(hostptr->devarrayptrs[I]!=NULL) + { + cudaFree(hostptr->devarrayptrs[I]); + hostptr->devarrayptrs[I] = NULL; + } + } + + cudaFree(ldevdata.devarrayptrs); + cudaFree(ldevdata.N); + + cudaFree(hostptr->devptr); + hostptr->devptr = NULL; + hostptr->device = -1; + + cudaSetDevice(olddev); + + ret = 1; + return ret; +} + +template int curarray_push(curarray *hostptr) +{ + int ret = 0; + + int olddev; + + curarray ldevdata; + T** ldevarrayptrs = NULL; + int *devN = NULL; + + if(hostptr==NULL) return -1; + + cudaGetDevice(&olddev); + cudaSetDevice(hostptr->device); + + int Narrays = hostptr->Narrays; + + cudaMemcpy(&ldevdata,hostptr->devptr,sizeof(curarray),cudaMemcpyDeviceToHost); + ldevarrayptrs = new(std::nothrow) T*[Narrays]; + devN = new(std::nothrow) int[Narrays]; + + cudaMemcpy(ldevarrayptrs,ldevdata.devarrayptrs,sizeof(T*)*Narrays,cudaMemcpyDeviceToHost); + cudaMemcpy(devN,ldevdata.N,sizeof(int)*Narrays,cudaMemcpyDeviceToHost); + + + int I; + + for(I=0;IN[I]!=devN[I]) || + (hostptr->devarrayptrs[I] != ldevarrayptrs[I]) + ) + { + cudaFree(ldevarrayptrs[I]); + ldevarrayptrs[I] = NULL; + hostptr->devarrayptrs[I] = NULL; + + if(hostptr->N[I]>0) + { + cudaMalloc(&(hostptr->devarrayptrs[I]),sizeof(T)*hostptr->N[I]); + ldevarrayptrs[I] = hostptr->devarrayptrs[I]; + devN[I] = hostptr->N[I]; + } + else + { + devN[I] = 0; + } + } + if(hostptr->N[I]>0) + { + //copy host data to device + cudaMemcpy(hostptr->devarrayptrs[I],hostptr->hostarrayptrs[I],sizeof(T)*hostptr->N[I],cudaMemcpyHostToDevice); + } + } //for each array + + //rectify and copy device data structure to device + ldevdata.device = hostptr->device; + ldevdata.devptr = NULL; + ldevdata.Narrays = hostptr->Narrays; //later - logic for dealing with when this is not true + ldevdata.hostarrayptrs = NULL; + + + cudaMemcpy(ldevdata.N,hostptr->N,sizeof(int)*Narrays,cudaMemcpyHostToDevice); + cudaMemcpy(ldevdata.devarrayptrs,hostptr->devarrayptrs,sizeof(T*)*Narrays,cudaMemcpyHostToDevice); + + cudaMemcpy(hostptr->devptr,&ldevdata,sizeof(curarray),cudaMemcpyHostToDevice); + + cuda_errortrap("curarray_push cuda error:"); + + cudaSetDevice(olddev); + + delete[] ldevarrayptrs; + delete[] devN; + + return ret; +} + +template int curarray_pull(curarray *hostptr) +{ + int ret = 0; + + int olddev; + + curarray ldevdata; + T** ldevarrayptrs = NULL; + int *devN = NULL; + + if(hostptr==NULL) return -1; + + cudaGetDevice(&olddev); + cudaSetDevice(hostptr->device); + + cuda_errortrap("dbg1"); + + int Narrays = hostptr->Narrays; + + cudaMemcpy(&ldevdata,hostptr->devptr,sizeof(curarray),cudaMemcpyDeviceToHost); + ldevarrayptrs = new(std::nothrow) T*[Narrays]; + devN = new(std::nothrow) int[Narrays]; + + cuda_errortrap("dbg2"); + + cudaMemcpy(ldevarrayptrs,ldevdata.devarrayptrs,sizeof(T*)*Narrays,cudaMemcpyDeviceToHost); + cudaMemcpy(devN,ldevdata.N,sizeof(int)*Narrays,cudaMemcpyDeviceToHost); + + cuda_errortrap("dbg3"); + char dbgjnk[50]; + + int I; + for(I=0;Idevarrayptrs[I] != ldevarrayptrs[I]) + { + hostptr->devarrayptrs[I] = ldevarrayptrs[I]; + } + + if(hostptr->N[I]!=devN[I]) + { + if(hostptr->hostarrayptrs[I]!=NULL) + { + delete[] hostptr->hostarrayptrs[I]; + hostptr->hostarrayptrs[I] = NULL; + } + + if(devN[I]>0) + { + hostptr->hostarrayptrs[I] = new(std::nothrow) T[devN[I]]; + hostptr->N[I] = devN[I]; + } + else + { + hostptr->N[I] = 0; + } + } + + if(hostptr->hostarrayptrs[I]!=NULL && hostptr->devarrayptrs[I]!=NULL) + { + cudaMemcpy(hostptr->hostarrayptrs[I],hostptr->devarrayptrs[I],sizeof(T)*hostptr->N[I],cudaMemcpyDeviceToHost); + sprintf(dbgjnk,"%d dbg %d",I,hostptr->N[I]); + cuda_errortrap(dbgjnk); + } + } //for each array + + //for the pull operation, I don't think any update of the device data structure is necessary + + cudaSetDevice(olddev); + + delete[] ldevarrayptrs; + delete[] devN; + + return ret; +} + +template __host__ int curarray::push() +{ + return curarray_push(this); +} +template __host__ int curarray::pull() +{ + return curarray_pull(this); +} + + + +/* +https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#memory-allocation-and-lifetime%5B/url%5D + +cudaMalloc() and cudaFree() have distinct semantics between the host and +device environments. When invoked from the host, cudaMalloc() allocates a +new region from unused device memory. When invoked from the device runtime +these functions map to device-side malloc() and free(). This implies that +within the device environment the total allocatable memory is limited to the +device malloc() heap size, which may be smaller than the available unused +device memory. Also, it is an error to invoke cudaFree() from the host +program on a pointer which was allocated by cudaMalloc() on the device +or vice-versa. + +So, basically this entire function is not going to work. I'll be unable to resize within +a kernel. +*/ + +/* + +template __device__ int curarray::dev_resizearray(int arraynum, int arraysize) +{ + int ret = 0; + T* newptr = NULL; + int I; + T def; + + if(arraynum>=0 && arraynum __host__ int curarray::resizearray(int arraynum, int arraysize) +{ + int ret = 0; + T* newptr = NULL; + int I; + T def; + + if(arraynum>=0 && arraynum=2): + if(sys.argv[1]=="clean"): + obj_list = flist('./build_linux64',recurse=True,exts=['.o']) + for o in obj_list: + os.remove('{}'.format(o)) + exit() + +os.system('python3 ./build/make.linux64.lib.py') +os.system('python3 ./build/make.linux64.test.py') + +obj_list = flist('./src',recurse=True,exts=['.o']) +for o in obj_list: + os.remove('{}'.format(o)) + +os.chdir('./build_linux64') +callproc('./test') +os.chdir('..') diff --git a/make_mingw.py b/make_mingw.py new file mode 100644 index 0000000..8be3766 --- /dev/null +++ b/make_mingw.py @@ -0,0 +1,28 @@ +#!/usr/bin/python3 +#!/usr/bin/python3 + +import os,sys,math +from build.amsbuildlib4 import * + +if(len(sys.argv)>=2): + if(sys.argv[1]=="clean"): + obj_list = flist('./build_mingw64',recurse=True,exts=['.o']) + for o in obj_list: + os.remove('{}'.format(o)) + exit() + +os.system('python3 ./build/make.mingw64.lib.py') +os.system('python3 ./build/make.mingw64.test.py') + +obj_list = flist('./src',recurse=True,exts=['.o','.obj']) +for o in obj_list: + os.remove('{}'.format(o)) + +if(sys.platform!="win32"): + os.chdir('./build_mingw64') + callproc('wine ./test.exe') + os.chdir('..') +else: + os.chdir('./build_mingw64') + callproc('test.exe') + os.chdir('..') \ No newline at end of file diff --git a/objstore/amscu_comp128.o b/objstore/amscu_comp128.o index cb855e2..4c8b7b2 100644 Binary files a/objstore/amscu_comp128.o and b/objstore/amscu_comp128.o differ diff --git a/objstore/amscu_comp64.o b/objstore/amscu_comp64.o index 24b8dc5..abe6423 100644 Binary files a/objstore/amscu_comp64.o and b/objstore/amscu_comp64.o differ diff --git a/objstore/amscu_cudafunctions.o b/objstore/amscu_cudafunctions.o index 3d88f68..57d3ee5 100644 Binary files a/objstore/amscu_cudafunctions.o and b/objstore/amscu_cudafunctions.o differ diff --git a/objstore/amscu_random.o b/objstore/amscu_random.o index c735b3c..26e5258 100644 Binary files a/objstore/amscu_random.o and b/objstore/amscu_random.o differ diff --git a/objstore/amscuarray.o b/objstore/amscuarray.o index b555254..0593531 100644 Binary files a/objstore/amscuarray.o and b/objstore/amscuarray.o differ diff --git a/objstore/amscuarray_dops.o b/objstore/amscuarray_dops.o index 998a0bf..d29c393 100644 Binary files a/objstore/amscuarray_dops.o and b/objstore/amscuarray_dops.o differ diff --git a/objstore/amscugeom.o b/objstore/amscugeom.o index cc0d0b0..43e6a78 100644 Binary files a/objstore/amscugeom.o and b/objstore/amscugeom.o differ diff --git a/objstore/amsculib2.o b/objstore/amsculib2.o index 98de25c..57d10a1 100644 Binary files a/objstore/amsculib2.o and b/objstore/amsculib2.o differ diff --git a/objstore/amscumath.o b/objstore/amscumath.o index edc6e7c..6651b82 100644 Binary files a/objstore/amscumath.o and b/objstore/amscumath.o differ diff --git a/objstore/amscurarray.o b/objstore/amscurarray.o index 5e9cbef..2fd4b7d 100644 Binary files a/objstore/amscurarray.o and b/objstore/amscurarray.o differ diff --git a/objstore/cuvect2.o b/objstore/cuvect2.o index 6f9ff84..927c738 100644 Binary files a/objstore/cuvect2.o and b/objstore/cuvect2.o differ diff --git a/objstore/cuvect2f.o b/objstore/cuvect2f.o index d190e6b..3ebc3fa 100644 Binary files a/objstore/cuvect2f.o and b/objstore/cuvect2f.o differ diff --git a/objstore/cuvect3.o b/objstore/cuvect3.o index ee8bb16..aa5fd6d 100644 Binary files a/objstore/cuvect3.o and b/objstore/cuvect3.o differ diff --git a/objstore/cuvect3f.o b/objstore/cuvect3f.o index 8c004e1..9760022 100644 Binary files a/objstore/cuvect3f.o and b/objstore/cuvect3f.o differ diff --git a/objstore/cuvect4.o b/objstore/cuvect4.o index baa5b48..94cd5f1 100644 Binary files a/objstore/cuvect4.o and b/objstore/cuvect4.o differ diff --git a/objstore/cuvect4f.o b/objstore/cuvect4f.o index 8d900fa..b236094 100644 Binary files a/objstore/cuvect4f.o and b/objstore/cuvect4f.o differ diff --git a/old/bin_linux64/libamsculib2.linux64.a b/old/bin_linux64/libamsculib2.linux64.a new file mode 100644 index 0000000..8306912 Binary files /dev/null and b/old/bin_linux64/libamsculib2.linux64.a differ diff --git a/old/bin_linux64/test b/old/bin_linux64/test new file mode 100644 index 0000000..0791c07 Binary files /dev/null and b/old/bin_linux64/test differ diff --git a/old/bin_winx64/libamsculib2.msvc64.lib b/old/bin_winx64/libamsculib2.msvc64.lib new file mode 100644 index 0000000..86d4904 Binary files /dev/null and b/old/bin_winx64/libamsculib2.msvc64.lib differ diff --git a/old/bin_winx64/test.exe b/old/bin_winx64/test.exe new file mode 100644 index 0000000..ba98a33 Binary files /dev/null and b/old/bin_winx64/test.exe differ diff --git a/old/bin_winx64/test.exp b/old/bin_winx64/test.exp new file mode 100644 index 0000000..98ef086 Binary files /dev/null and b/old/bin_winx64/test.exp differ diff --git a/old/bin_winx64/test.lib b/old/bin_winx64/test.lib new file mode 100644 index 0000000..ba12197 Binary files /dev/null and b/old/bin_winx64/test.lib differ diff --git a/old/compscripts/__pycache__/complib2.cpython-310.pyc b/old/compscripts/__pycache__/complib2.cpython-310.pyc new file mode 100644 index 0000000..ca6a6f5 Binary files /dev/null and b/old/compscripts/__pycache__/complib2.cpython-310.pyc differ diff --git a/old/compscripts/__pycache__/complib2.cpython-312.pyc b/old/compscripts/__pycache__/complib2.cpython-312.pyc new file mode 100644 index 0000000..0daf319 Binary files /dev/null and b/old/compscripts/__pycache__/complib2.cpython-312.pyc differ diff --git a/old/compscripts/__pycache__/complib2.cpython-36.pyc b/old/compscripts/__pycache__/complib2.cpython-36.pyc new file mode 100644 index 0000000..8ecee7c Binary files /dev/null and b/old/compscripts/__pycache__/complib2.cpython-36.pyc differ diff --git a/old/compscripts/__pycache__/complib2.cpython-38.pyc b/old/compscripts/__pycache__/complib2.cpython-38.pyc new file mode 100644 index 0000000..ec396bb Binary files /dev/null and b/old/compscripts/__pycache__/complib2.cpython-38.pyc differ diff --git a/old/compscripts/__pycache__/complib2.cpython-39.pyc b/old/compscripts/__pycache__/complib2.cpython-39.pyc new file mode 100644 index 0000000..3028214 Binary files /dev/null and b/old/compscripts/__pycache__/complib2.cpython-39.pyc differ diff --git a/old/compscripts/__pycache__/complib3.cpython-310.pyc b/old/compscripts/__pycache__/complib3.cpython-310.pyc new file mode 100644 index 0000000..1bc8f4f Binary files /dev/null and b/old/compscripts/__pycache__/complib3.cpython-310.pyc differ diff --git a/old/compscripts/__pycache__/complib3.cpython-312.pyc b/old/compscripts/__pycache__/complib3.cpython-312.pyc new file mode 100644 index 0000000..749a9d2 Binary files /dev/null and b/old/compscripts/__pycache__/complib3.cpython-312.pyc differ diff --git a/old/compscripts/__pycache__/complib3.cpython-39.pyc b/old/compscripts/__pycache__/complib3.cpython-39.pyc new file mode 100644 index 0000000..7a86934 Binary files /dev/null and b/old/compscripts/__pycache__/complib3.cpython-39.pyc differ diff --git a/old/compscripts/complib2.py b/old/compscripts/complib2.py new file mode 100644 index 0000000..f9b7121 --- /dev/null +++ b/old/compscripts/complib2.py @@ -0,0 +1,639 @@ +#!/usr/bin/python3 + +#Python3 compilation library +#Aaron M. Schinder +#29 Dec 2020 +# +#Cleanup and refactor from 2017 python2 version compilation libraries + +import os,sys,math,subprocess + +##################### +#Directory Functions# +##################### + +##flist - list all files in a given directory pth +##optional arguments: +# recurse - (T/F): Whether to recursively search for files in directory tree +# exts - (list): A list of file extensions to filter on +# normpath (T/F): whether to normalize path variables after +#filelist = flist(pth,**kwargs): +def flist(pth,**kwargs): + flst = [] + if(not('recurse' in kwargs)): + recurse_ = False + else: + recurse_ = kwargs['recurse'] + if(not('exts' in kwargs)): + filterexts_ = False + else: + filterexts_ = True + exts = kwargs['exts'] + if(not('normpath' in kwargs)): + normpath_ = True + else: + normpath_ = kwargs['normpath'] + if(not('linuxpath' in kwargs)): + linuxpath_ = False + else: + linuxpath_ = kwargs['linuxpath'] + if(not('followlinks' in kwargs)): + followlinks_ = False + else: + followlinks_ = kwargs['followlinks'] + + dirlist = [] + rawlist = os.listdir(pth) + + for F in rawlist: + F2 = os.path.join(pth,F) + if(os.path.isdir(F2)): + b = (followlinks_) or ((not followlinks_) and not(os.path.islink(F2))) + if(b): + if((F2!=".")&(F2!="..")): + dirlist.append(F2) + elif(os.path.isfile(F2)): + flst.append(F2) + + #Recurse through directories + if(recurse_): + for D in dirlist: + lst = flist(D,**kwargs) + for L in lst: + flst.append(L) + + #Postprocess: + #Filter out all extensions except the selected ext list + if(filterexts_): + flst = filterexts(flst,exts) + + #Normalize filename path according to os + if(normpath_): + flst2 = list(flst) + for I in range(0,len(flst2)): + flst[I] = os.path.normpath(flst2[I]) + + #If linuxpath, convert all \\ to / + #if(linuxpath_): + # flst2 = list(flst) + # for I in range(0,len(flst2)): + # flst[I] = linuxpath(flst2[I]) + + return flst + +#Filters by extensions in a list of files +#flst = def filterexts(flst,exts): +def filterexts(flst,exts): + flst2 = [] + if(isinstance(exts,str)): + exts = list([exts]) + for F in flst: + b = False + for ext in exts: + if(ext[0]!='.'): + ext = '.'+ext + F2 = os.path.splitext(F) + if(len(F2)>=2): + ex = F2[1] + if(len(ex)>0): + if(ex[0]!='.'): + ex = '.'+ex + if(ex==ext): + b = True + if(b): + flst2.append(F) + + return flst2 + +#Find a file fname, starting in pth and recursing +#Used for finding library files to link +def findfile(fname,pth,**kwargs): + fullfname = "" + flst = flist(pth,recurse=True) + for F in flst: + F2 = os.path.split(F)[1] + if(F2 == fname): + fullfname = F + + return fullfname + +#List to space-seperated-string +def list_to_sss(lst): + lout = "" + for I in range(0,len(lst)-1): + lout = lout + lst[I] + " " + if(len(lst)>0): + lout = lout + lst[len(lst)-1] + return lout + +def strip_whitespace(strin): + strout = "" + I1 = -1 + I2 = -1 + for I in range(0,len(strin)): + if(strin[I]!=' ' and strin[I]!='\t' and strin[I]!='\r'and strin[I]!='\n'): + I1 = I + break + q = list(range(0,len(strin))) + q.reverse() + for I in q: + if(strin[I]!=' ' and strin[I]!='\t' and strin[I]!='\r'and strin[I]!='\n'): + I2 = I+1 + break + if(I1>=0 and I2>=0): + strout = strin[I1:I2] + return strout + +def sss_to_list(sss): + lout = [] + l1 = sss.split(' ') + for l in l1: + l2 = strip_whitespace(l) + lout.append(l2) + return lout + + +def replaceext(fname,ext): + fname2 = "" + if(len(ext)>0): + if(ext[0]!='.'): + ext = '.'+ext + fname2 = os.path.splitext(fname)[0]+ext + else: + fname2 = os.path.splitext(fname)[0] + return fname2 + +def replaceexts(fnamelist,ext): + fname2list = [] + for F in fnamelist: + F2 = replaceext(F,ext) + fname2list.append(F2) + return fname2list + +# def except_contains_oldv(lst1,exc): +# lst2 = [] +# for item in lst1: +# b = 1 +# for item2 in exc: +# if(item.find(item2)>=0): +# b = 0 +# break +# if(b==1): +# lst2.append(item) +# return lst2 + +#filenames must match +def except_contains(lst1,exc): + lst2 = [] + for item in lst1: + b = 1 + for item2 in exc: + fsplit = os.path.split(item) + fn = fsplit[len(fsplit)-1] + if(fn==item2): + b = 0 + break + if(b==1): + lst2.append(item) + return lst2 + +########################## +##System Call Procedures## +########################## + +def callproc(cmd, **kwargs): + if(not('logfile' in kwargs)): + use_lf = False + else: + logfile = kwargs['logfile'] + if(logfile!=""): + fp = open(kwargs['logfile'],'a+') + use_lf = True + else: + use_lf = False + + if(not('echo' in kwargs)): + echo = True + else: + echo = kwargs['echo'] + + if(echo): + print(cmd) + + #encoding/deconding to/from bytes is necessary to use the subprocess command + #in python3.7 + #However, only do this in linux + if(sys.platform!='win32'): + cmd2 = cmd.encode(encoding='utf-8') + else: + cmd2 = cmd + proc = subprocess.Popen(cmd2,stderr = subprocess.STDOUT, stdout=subprocess.PIPE, shell=True) + (out, err) = proc.communicate() + + out = out.decode(encoding='utf-8') + + if(echo): + print(out) + #print(err); + if(use_lf): + fp.writelines(cmd+'\n') + fp.writelines(out+'\n') + + if(use_lf): + fp.close() + +####################################### +##Compiler, Archive, and Linker Calls## +####################################### + +def smartcompile(srcfile,objext='.o'): + mtsrc = os.path.getmtime(srcfile) + objfile = replaceext(srcfile,objext) + objexists = os.path.exists(objfile) + ret = True + if(objexists): + mtobj = os.path.getmtime(objfile) + if(mtobj>=mtsrc): + ret = False + + return ret + +#MSVC compiler wrapper +def msvc_compile(compilername, srcfile, **kwargs): + + if(not('include' in kwargs)): + include = '' + else: + include = kwargs['include'] + if(isinstance(include,list)): + include = list_to_sss(include) + + if(not('flags' in kwargs)): + flags = '' + else: + flags = kwargs['flags'] + if(isinstance(flags,list)): + flags = list_to_sss(flags) + + if(not('objext' in kwargs)): + objext = '.obj' + else: + objext = kwargs['objext'] + + if(not('srcfileflag' in kwargs)): + srcfileflag = '/c' + else: + srcfileflag = kwargs['srcfileflag'] + + if(not('outfileflag' in kwargs)): + outfileflag = '/Fo:' + else: + outfileflag = kwargs['outfileflag'] + if(not('logfile' in kwargs)): + logfile = "" + else: + logfile = kwargs['logfile'] + + outfile = replaceext(srcfile,objext) + ln = compilername+" "+flags+" "+" "+srcfileflag+" "+srcfile+" "+outfileflag+outfile + ln = ln + " " + include + + callproc(ln,echo=True,logfile=logfile) + + return + +#MSVC compiler wrapper +def msvc_compile_list(compiler,srclist,**kwargs): + for S in srclist: + msvc_compile(compiler,S,**kwargs) + return + +#gnu-style compiler compile: Should work with gcc, g++, gfortran +def gs_compile(compiler,srcfile,**kwargs): + if(not('include' in kwargs)): + include = '' + else: + include = kwargs['include'] + if(isinstance(include,list)): + include = list_to_sss(include) + + if(not('flags' in kwargs)): + flags = '' + else: + flags = kwargs['flags'] + if(isinstance(flags,list)): + flags = list_to_sss(flags) + + if(not('objext' in kwargs)): + objext = '.o' + else: + objext = kwargs['objext'] + + if(not('srcfileflag' in kwargs)): + srcfileflag = '-c' + else: + srcfileflag = kwargs['srcfileflag'] + + if(not('outfileflag' in kwargs)): + outfileflag = '-o' + else: + outfileflag = kwargs['outfileflag'] + + if(not('logfile' in kwargs)): + logfile = "" + else: + logfile = kwargs['logfile'] + + if(not('smartcompile' in kwargs)): + _smartcompile = True + else: + _smartcompile = kwargs['smartcompile'] + + #Do I want to make this thing this general? + + if(not(_smartcompile) or smartcompile(srcfile,objext)): + outfile = replaceext(srcfile,objext) + ln = compiler+" "+flags+" " + outfileflag+" "+outfile+" "+srcfileflag+" "+srcfile + ln = ln + " " + include + + callproc(ln,echo=True,logfile=logfile) + + return + +def gs_compile_list(compiler,srclist,**kwargs): + for S in srclist: + gs_compile(compiler,S,**kwargs) + return + +def gs_compile_all(compiler,srcdir,srcexts,**kwargs): + if(not('recurse' in kwargs)): + recurse = True + else: + recurse = kwargs['recurse'] + + srcfils = flist(srcdir,exts=srcexts,recurse=recurse) + + for S in srcfils: + gs_compile(compiler,S,**kwargs) + + return + +def gs_link_all(linker,srcpath,target,**kwargs): + + if(not('objext' in kwargs)): + objext = '.o' + else: + objext = kwargs['objext'] + + if(not('recurse' in kwargs)): + recurse = True + else: + recurse = kwargs['recurse'] + + + objfils = flist(srcpath,exts=objext,recurse=recurse) + oflst = list_to_sss(objfils) + + gs_link_list(linker,oflst,target,**kwargs) + + return + +def gs_link_list(linker,objlist,target,**kwargs): + + if(not('objext' in kwargs)): + objext = '.o' + else: + objext = kwargs['objext'] + + if(not('libdir' in kwargs)): + libdir = '' + else: + libdir = kwargs['libdir'] + + if(not('staticlibs' in kwargs)): + staticlibs = '' + else: + staticlibs = kwargs['staticlibs'] + + if(not('libflags' in kwargs)): + libflags = '' + else: + libflags = kwargs['libflags'] + + if(not('linkerflags' in kwargs)): + linkerflags = '' + else: + linkerflags = kwargs['linkerflags'] + + if(not('recurse' in kwargs)): + recurse = True + else: + recurse = kwargs['recurse'] + + if(not('logfile' in kwargs)): + logfile = '' + else: + logfile = kwargs['logfile'] + + ln = linker+" -o "+target+" "+libdir + ln = ln+" "+objlist+" "+staticlibs+" "+libflags+" "+linkerflags + + callproc(ln,logfile=logfile) + return + +def msvc_link_list(objlist,target,**kwargs): + + linker = 'link' + + if(not('objext' in kwargs)): + objext = '.obj' + else: + objext = kwargs['objext'] + + if(not('libdir' in kwargs)): + libdir = '' + else: + libdir = kwargs['libdir'] + + if(not('staticlibs' in kwargs)): + staticlibs = '' + else: + staticlibs = kwargs['staticlibs'] + + if(not('libflags' in kwargs)): + libflags = '' + else: + libflags = kwargs['libflags'] + + if(not('linkerflags' in kwargs)): + linkerflags = '' + else: + linkerflags = kwargs['linkerflags'] + + if(not('recurse' in kwargs)): + recurse = True + else: + recurse = kwargs['recurse'] + + if(not('logfile' in kwargs)): + logfile = '' + else: + logfile = kwargs['logfile'] + + ln = linker+" "+libdir + ln = ln+" "+objlist+" "+staticlibs+" "+linkerflags + ln = ln+" /out:"+target+" "+libflags + + callproc(ln,logfile=logfile) + + return + +def ar_all(srcpath,arname,**kwargs): + if(not('recurse' in kwargs)): + recurse = True + else: + recurse = kwargs['recurse'] + if(not('objext' in kwargs)): + objext = '.o' + else: + objext = kwargs['objext'] + + objlist = flist(srcpath,exts=objext,recurse=recurse) + ar_list(objlist,arname,**kwargs) + + return + +def msvc_lib_list(objlist,arname,**kwargs): + objlist2 = list_to_sss(objlist) + + ln = "lib "+objlist2+" /out:"+arname + callproc(ln) + + return + +def ar_list(objlist,arname,**kwargs): + objlist2 = list_to_sss(objlist) + + ln = "ar cr "+ arname+" "+objlist2 + callproc(ln) + + return + +def ar_add_list(objlist,arname,**kwargs): + objlist2 = list_to_sss(objlist) + + ln = "ar t "+arname+" "+objlist2 + callproc(ln) + return + +############################## +##Derived Compiler Functions## +############################## + +def gcc_compile(srcfile,**kwargs): + compiler = 'gcc' + kwargs['objext'] = '.o' + #srcexts = ['.c'] + + gs_compile(compiler,srcfile,**kwargs) + + return + +def gcc_compile_all(srcdir,**kwargs): + compiler = 'gcc' + kwargs['objext'] = '.o' + srcexts = ['.c'] + + gs_compile_all(compiler,srcdir,srcexts,**kwargs) + + return + +def gcc_compile_list(srclist,**kwargs): + compiler = 'gcc' + kwargs['objext'] = '.o' + #srcexts = ['.c'] + + gs_compile_list(compiler,srclist,**kwargs) + + return + +def gpp_compile(srcfile,**kwargs): + compiler = 'g++' + kwargs['objext'] = '.o' + #srcexts = ['.c','.cpp'] + + gs_compile(compiler,srcfile,**kwargs) + + return + +def gpp_compile_all(srcdir,**kwargs): + compiler = 'g++' + kwargs['objext'] = '.o' + srcexts = ['.c','.cpp'] + + gs_compile_all(compiler,srcdir,srcexts,**kwargs) + + return + +def gpp_compile_list(srclist,**kwargs): + compiler = 'g++' + kwargs['objext'] = '.o' + #srcexts = ['.c','.cpp'] + + gs_compile_list(compiler,srclist,**kwargs) + + return + +def gfortran_compile(srcfile,**kwargs): + compiler = 'gfortran' + kwargs['objext'] = '.o' + #srcexts = ['.f','.f90','.f77'] + + gs_compile(compiler,srcfile,**kwargs) + + return + +def gfortran_compile_all(srcdir,**kwargs): + compiler = 'gfortran' + kwargs['objext'] = '.o' + srcexts = ['.f','.f90','.f77'] + + gs_compile_all(compiler,srcdir,srcexts,**kwargs) + + return + +def gfortran_compile_list(srclist,**kwargs): + compiler = 'gfortran' + kwargs['objext'] = '.o' + #srcexts = ['.f','.f90','.f77'] + + gs_compile_list(compiler,srclist,**kwargs) + + return + +def clang_compile(srcfile,**kwargs): + compiler = 'clang++' + kwargs['objext'] = '.o' + #srcexts = ['.c','.cpp'] + + gs_compile(compiler,srcfile,**kwargs) + + return + +def clang_compile_all(srcdir,**kwargs): + compiler = 'clang++' + kwargs['objext'] = '.o' + srcexts = ['.c','.cpp'] + + gs_compile_all(compiler,srcdir,srcexts,**kwargs) + + return + +def clang_compile_list(srclist,**kwargs): + compiler = 'clang++' + kwargs['objext'] = '.o' + #srcexts = ['.c','.cpp'] + + gs_compile_list(compiler,srclist,**kwargs) + + return \ No newline at end of file diff --git a/old/compscripts/complib3.py b/old/compscripts/complib3.py new file mode 100644 index 0000000..bd7d2ee --- /dev/null +++ b/old/compscripts/complib3.py @@ -0,0 +1,524 @@ +#!/usr/bin/python3 + +import os,sys,math +import subprocess + +##flist - list all files in a given directory pth +##optional arguments: +# recurse - (T/F): Whether to recursively search for files in directory tree +# exts - (list): A list of file extensions to filter on +# normpath (T/F): whether to normalize path variables after +#filelist = flist(pth,**kwargs): +def flist(pth,**kwargs): + flst = [] + if(not('recurse' in kwargs)): + recurse_ = False + else: + recurse_ = kwargs['recurse'] + if(not('exts' in kwargs)): + filterexts_ = False + else: + filterexts_ = True + exts = kwargs['exts'] + if(not('normpath' in kwargs)): + normpath_ = True + else: + normpath_ = kwargs['normpath'] + if(not('linuxpath' in kwargs)): + linuxpath_ = False + else: + linuxpath_ = kwargs['linuxpath'] + if(not('followlinks' in kwargs)): + followlinks_ = False + else: + followlinks_ = kwargs['followlinks'] + + dirlist = [] + rawlist = os.listdir(pth) + + for F in rawlist: + F2 = os.path.join(pth,F) + if(os.path.isdir(F2)): + b = (followlinks_) or ((not followlinks_) and not(os.path.islink(F2))) + if(b): + if((F2!=".")&(F2!="..")): + dirlist.append(F2) + elif(os.path.isfile(F2)): + flst.append(F2) + + #Recurse through directories + if(recurse_): + for D in dirlist: + lst = flist(D,**kwargs) + for L in lst: + flst.append(L) + + #Postprocess: + #Filter out all extensions except the selected ext list + if(filterexts_): + flst = filterexts(flst,exts) + + #Normalize filename path according to os + if(normpath_): + flst2 = list(flst) + for I in range(0,len(flst2)): + flst[I] = os.path.normpath(flst2[I]) + + #If linuxpath, convert all \\ to / + #if(linuxpath_): + # flst2 = list(flst) + # for I in range(0,len(flst2)): + # flst[I] = linuxpath(flst2[I]) + + return flst + +#Filters by extensions in a list of files +#flst = def filterexts(flst,exts): +def filterexts(flst,exts): + flst2 = [] + if(isinstance(exts,str)): + exts = list([exts]) + for F in flst: + b = False + for ext in exts: + if(ext[0]!='.'): + ext = '.'+ext + F2 = os.path.splitext(F) + if(len(F2)>=2): + ex = F2[1] + if(len(ex)>0): + if(ex[0]!='.'): + ex = '.'+ex + if(ex==ext): + b = True + if(b): + flst2.append(F) + + return flst2 + +#Find a file fname, starting in pth and recursing +#Used for finding library files to link +def findfile(fname,pth,**kwargs): + fullfname = "" + flst = flist(pth,recurse=True) + for F in flst: + F2 = os.path.split(F)[1] + if(F2 == fname): + fullfname = F + + return fullfname + +def replaceext(fname,ext): + fname2 = "" + if(len(ext)>0): + if(ext[0]!='.'): + ext = '.'+ext + fname2 = os.path.splitext(fname)[0]+ext + else: + fname2 = os.path.splitext(fname)[0] + return fname2 + +def replaceexts(fnamelist,ext): + fname2list = [] + for F in fnamelist: + F2 = replaceext(F,ext) + fname2list.append(F2) + return fname2list + +#filenames must match +def except_contains(lst1,exc): + lst2 = [] + for item in lst1: + b = 1 + for item2 in exc: + fsplit = os.path.split(item) + fn = fsplit[len(fsplit)-1] + if(fn==item2): + b = 0 + break + if(b==1): + lst2.append(item) + return lst2 + +########################## +##System Call Procedures## +########################## + +def callproc(cmd, **kwargs): + if(not('logfile' in kwargs)): + use_lf = False + else: + logfile = kwargs['logfile'] + if(logfile!=""): + fp = open(kwargs['logfile'],'a+') + use_lf = True + else: + use_lf = False + + if(not('echo' in kwargs)): + echo = True + else: + echo = kwargs['echo'] + + if(echo): + print(cmd) + + #encoding/deconding to/from bytes is necessary to use the subprocess command + #in python3.7 + #However, only do this in linux + if(sys.platform!='win32'): + cmd2 = cmd.encode(encoding='utf-8') + else: + cmd2 = cmd + proc = subprocess.Popen(cmd2,stderr = subprocess.STDOUT, stdout=subprocess.PIPE, shell=True) + (out, err) = proc.communicate() + + out = out.decode(encoding='utf-8') + + if(echo): + print(out) + #print(err); + if(use_lf): + fp.writelines(cmd+'\n') + fp.writelines(out+'\n') + + if(use_lf): + fp.close() + +#List to space-seperated-string +def list_to_sss(lst): + lout = "" + for I in range(0,len(lst)-1): + lout = lout + lst[I] + " " + if(len(lst)>0): + lout = lout + lst[len(lst)-1] + return lout + +##################################### +## Incremental Compilation Library ## +##################################### + +#silently read lines from a text file if exists +def readtextlines(fname): + txtlns = [] + + if(not os.path.isfile(fname)): + return txtlns + + try: + fp = open(fname,"r") + except: + return txtlns + + ln = " " + while(ln!=""): + ln = fp.readline() + txtlns.append(ln) + + fp.close() + + return txtlns + +def getincludefnfrage(includeline): + + fnfrag = "" + I1 = -1 + I2 = -1 + + for I in range(0,len(includeline)): + if(I1<0 and (includeline[I]=='<' or includeline[I]=='"')): + I1 = I + if(I1>=0 and (includeline[I]=='>' or includeline[I]=='"')): + I2 = I + break + if(I1>=0 and I2>=0): + fnfrag = includeline[I1+1:I2] + + return fnfrag + +#Returns the name of the source file fname (if it exists) +#and all included filenames +def getsrcandincludes(fname, incdirs): + + flist = [] + if(os.path.isfile(fname)): + flist.append(fname) + + Ilist = 0 + while(Ilist=0): + fnfrag = getincludefnfrage(lns[J]) + for K in range(0,len(incdirs)): + tfn = os.path.join(incdirs[K],fnfrag) + if(os.path.isfile(tfn)): + flist.append(tfn) + break + + Ilist = Ilist + 1 + + return flist + +#Returns the name of the object file associated with the source file +#within the object store folder (if it exists) +def getobjfile(fname,objstore,objext = ".o"): + + fret = "" + f1 = os.path.split(fname)[1] + f2 = f1 + while(os.path.splitext(f2)[1]!=""): + f2 = os.path.splitext(f2)[0] + objext = objext.strip('.') + f3 = os.path.join(objstore,"{}.{}".format(f2,objext)) + if(os.path.exists(f3)): + fret = f3 + + return fret + +def getsrctimes(fname, incdirs): + + ftimes = [] + flst = getsrcandincludes(fname, incdirs) + for I in range(0,len(flst)): + f = flst[I] + mt = os.path.getmtime(f) + ftimes.append(mt) + + return ftimes + +def getobjtime(fname,objstore,objext=".o"): + ret = -1 + fret = getobjfile(fname,objstore,objext) + if(fret!=""): + ret = os.path.getmtime(fret) + + return ret + +#Decide whether or not to compile source file +def decidecompile(fname,**kwargs): + ret = True + + if(not os.path.isfile(fname)): + ret = False + return ret + + ##unpack kwargs + if("searchincdirs" in kwargs): + incdirs = kwargs["searchincdirs"] + else: + incdirs = ["./include"] + + if("objext" in kwargs): + objext = kwargs["objext"] + else: + objext = ".o" + if("objstore" in kwargs): + objstore = kwargs["objstore"] + else: + objstore = "./objstore" + + + srclist = getsrcandincludes(fname,incdirs) + srctlist = getsrctimes(fname,incdirs) + obj = getobjfile(fname,objstore,objext) + objt = getobjtime(fname,objstore,objext) + + if(obj!=""): + ret = False + for I in range(0,len(srctlist)): + if(srctlist[I]>objt): + ret = True + break + + return ret + +def gs_incremental_compile(compiler,srcfile,**kwargs): + + if(not('include' in kwargs)): + include = '' + else: + include = kwargs['include'] + if(isinstance(include,list)): + include = list_to_sss(include) + if(not('flags' in kwargs)): + flags = '' + else: + flags = kwargs['flags'] + if(isinstance(flags,list)): + flags = list_to_sss(flags) + if(not('objext' in kwargs)): + objext = '.o' + else: + objext = kwargs['objext'] + if(not('srcfileflag' in kwargs)): + srcfileflag = '-c' + else: + srcfileflag = kwargs['srcfileflag'] + if(not('outfileflag' in kwargs)): + outfileflag = '-o' + else: + outfileflag = kwargs['outfileflag'] + + if(not('logfile' in kwargs)): + logfile = "" + else: + logfile = kwargs['logfile'] + if(not('smartcompile' in kwargs)): + _smartcompile = True + else: + _smartcompile = kwargs['smartcompile'] + + #incrementalcompile + if("searchincdirs" in kwargs): + incdirs = kwargs["searchincdirs"] + else: + incdirs = ["./include"] + + if("objext" in kwargs): + objext = kwargs["objext"] + else: + objext = ".o" + if("objstore" in kwargs): + objstore = kwargs["objstore"] + else: + objstore = "./objstore" + + #Do I want to make this thing this general? + + docompile = decidecompile(srcfile,**kwargs) + + if(docompile): + f1 = os.path.split(srcfile)[1] + f2 = f1 + while(os.path.splitext(f2)[1]!=""): + f2 = os.path.splitext(f2)[0] + outfile = os.path.join(objstore,"{}{}".format(f2,objext)) + + ln = compiler+" "+flags+" " + outfileflag+" "+outfile+" "+srcfileflag+" "+srcfile + ln = ln + " " + include + + callproc(ln,echo=True,logfile=logfile) + + return + +def gs_incremental_compile_list(compiler,srclist,**kwargs): + + for s in srclist: + gs_incremental_compile(compiler,s,**kwargs) + + return + +#MSVC compiler wrapper + +def msvc_incremental_compile(compilername, srcfile, **kwargs): + + if(not('include' in kwargs)): + include = '' + else: + include = kwargs['include'] + if(isinstance(include,list)): + include = list_to_sss(include) + + if(not('flags' in kwargs)): + flags = '' + else: + flags = kwargs['flags'] + if(isinstance(flags,list)): + flags = list_to_sss(flags) + + if(not('objext' in kwargs)): + objext = '.obj' + else: + objext = kwargs['objext'] + + if(not('srcfileflag' in kwargs)): + srcfileflag = '/c' + else: + srcfileflag = kwargs['srcfileflag'] + + if(not('outfileflag' in kwargs)): + outfileflag = '/Fo:' + else: + outfileflag = kwargs['outfileflag'] + if(not('logfile' in kwargs)): + logfile = "" + else: + logfile = kwargs['logfile'] + + #incrementalcompile + if("searchincdirs" in kwargs): + incdirs = kwargs["searchincdirs"] + else: + incdirs = ["./include"] + # if("objext" in kwargs): + # objext = kwargs["objext"] + # else: + # objext = ".o" + if("objstore" in kwargs): + objstore = kwargs["objstore"] + else: + objstore = "./objstore" + + docompile = decidecompile(srcfile,**kwargs) + + if(docompile): + f1 = os.path.split(srcfile)[1] + f2 = f1 + while(os.path.splitext(f2)[1]!=""): + f2 = os.path.splitext(f2)[0] + outfile = os.path.join(objstore,"{}{}".format(f2,objext)) + + ln = compilername+" "+flags+" "+srcfileflag+" "+srcfile+" "+outfileflag+" "+outfile + ln = ln + " " + include + + callproc(ln,echo=True,logfile=logfile) + + # outfile = replaceext(srcfile,objext) + # ln = compilername+" "+flags+" "+" "+srcfileflag+" "+srcfile+" "+outfileflag+outfile + # ln = ln + " " + include + + callproc(ln,echo=True,logfile=logfile) + + return + +def msvc_incremental_compile_list(compiler,srclist,**kwargs): + for S in srclist: + msvc_incremental_compile(compiler,S,**kwargs) + return + +####################### +## Main Script Tests ## +####################### + +def testtimes(args): + if(len(args)>=2): + flist = getsrcandincludes(args[1],["./include"]) + ftlist = getsrctimes(args[1],["./include"]) + for I in range(0,len(flist)): + print("{}\t\t{}".format(flist[I],ftlist[I])) + + print("associated obj file:") + fobj = getobjfile(args[1],"./objstore") + ftobj = getobjtime(args[1],"./objstore") + if(fobj!=""): + print("{}\t\t{}".format(fobj,ftobj)) + else: + print("none found") + + cflag = decidecompile(args[1]) + print("compile? : {}".format(cflag)) + + + return + +# if(__name__ == "__main__"): + +# args = sys.argv +# testtimes(args) + + + + \ No newline at end of file diff --git a/old/compscripts/linux64.makelib.py b/old/compscripts/linux64.makelib.py new file mode 100644 index 0000000..2d65139 --- /dev/null +++ b/old/compscripts/linux64.makelib.py @@ -0,0 +1,52 @@ +#!/usr/bin/python3 + +import os,sys,subprocess,math +from complib2 import * +from complib3 import gs_incremental_compile, gs_incremental_compile_list + +import shutil +#from distutils.dir_util import copy_tree as copy_tree #this version does overwrites +from shutil import copytree + +libname = 'amsculib2.linux64' #prefix static library name to generate +targetname = 'test' #create this executable when compiling tests +commonincdir = "../../linux64/include" +commonlibdir = "../../linux64/lib" +localbindir = "./bin_linux64" +cc = 'nvcc' #compiler +srcexts = ['.c','.cpp','.cu'] +mainsrc = ['main.cu'] #ignore these files when compiling the static library + +kwargs = dict() +include = "-I./include -I{}".format(commonincdir) +kwargs['include'] = include +#-dc flag: relocatable device code - needed for device functions to link in different "execution units" +#--ptxas-options=-v +kwargs['flags'] = "-dc --compiler-options '-fPIC -O3'" +kwargs['libdir'] = "-L{} -L{}".format(localbindir,commonlibdir) +kwargs['libflags'] = "-l{}".format(libname) +kwargs['linkerflags'] = "" +kwargs['recurse'] = True +kwargs['objstore'] = "./objstore" +kwargs['searchincdirs'] = ['./include'] + +#find all source files, except the main project files +files = flist('./src',exts = srcexts, recurse=True) +files = except_contains(files,mainsrc) +objfiles = replaceexts(files,'.o') +objfiles_sss = list_to_sss(objfiles) + +#compile all the source files in the list +#gs_compile_list(cc,files,**kwargs) +gs_incremental_compile_list(cc,files,**kwargs) + +#archive all the source files into a static library +#ar_list(objfiles,'{}/lib{}.a'.format(localbindir,libname)) +objlist = flist(kwargs['objstore'],exts='.o',recurse=True) +ar_list(objlist,'{}/lib{}.a'.format(localbindir,libname)) + +# #Push any libraries to the common lib folder +shutil.copy('{}/lib{}.a'.format(localbindir,libname),commonlibdir) + +# #Copy include files to the common include folder +copytree('./include/',commonincdir+'/',dirs_exist_ok=True) diff --git a/old/compscripts/linux64.maketest.py b/old/compscripts/linux64.maketest.py new file mode 100644 index 0000000..6482698 --- /dev/null +++ b/old/compscripts/linux64.maketest.py @@ -0,0 +1,43 @@ +#!/usr/bin/python3 + +import os,sys,subprocess,math +from complib2 import * +from complib3 import gs_incremental_compile, gs_incremental_compile_list + +import shutil + +libname = 'amsculib2.linux64' #prefix static library name to generate +targetname = 'test' #create this executable when compiling tests +commonincdir = "../../linux64/include" +commonlibdir = "../../linux64/lib" +localbindir = "./bin_linux64" +cc = 'nvcc' #compiler +srcexts = ['.c','.cpp','.cu'] +mainsrc = ['main.cu'] #ignore these files when compiling the static library + +kwargs = dict() +include = "-I./include -I{}".format(commonincdir) +kwargs['include'] = include +#-dc flag: relocatable device code - needed for device functions to link in different "execution units" +kwargs['flags'] = "-dc --compiler-options '-fPIC'" +kwargs['libdir'] = "-L{} -L{}".format(localbindir,commonlibdir) +kwargs['libflags'] = "-l{} -lamsculib2.linux64".format(libname) +kwargs['linkerflags'] = "" +kwargs['recurse'] = True +kwargs['objstore'] = "./objstore" +kwargs['searchincdirs'] = ['./include'] + +#-lamsmathlib3.linux64 -lamsstring3.linux64 -lamsmatrix_cpp.linux64 -llapack -lblas -lgfortran -lamsmathutilthread.linux64 -lamsmathutil2.linux64 + +#Pull required binary dynamic libraries to the bin folder +#shutil.copy('{}/libamsimg.dll.a'.format(commonlibdir),localbindir); +#shutil.copy('{}/libamsimg.dll'.format(commonlibdir),localbindir); +#shutil.copy('../../lib_winx64/glew32.dll','./bin_winx64'); + +#Designate source files for main test program +fsrc = ['./src/main.cu'] +fobj = replaceexts(fsrc,'.o') + +#Compile test programs +gs_compile_list(cc,fsrc,**kwargs) +gs_link_list(cc,list_to_sss(fobj),'{}/{}'.format(localbindir,targetname),**kwargs) diff --git a/old/compscripts/old/complib2.py b/old/compscripts/old/complib2.py new file mode 100644 index 0000000..f9b7121 --- /dev/null +++ b/old/compscripts/old/complib2.py @@ -0,0 +1,639 @@ +#!/usr/bin/python3 + +#Python3 compilation library +#Aaron M. Schinder +#29 Dec 2020 +# +#Cleanup and refactor from 2017 python2 version compilation libraries + +import os,sys,math,subprocess + +##################### +#Directory Functions# +##################### + +##flist - list all files in a given directory pth +##optional arguments: +# recurse - (T/F): Whether to recursively search for files in directory tree +# exts - (list): A list of file extensions to filter on +# normpath (T/F): whether to normalize path variables after +#filelist = flist(pth,**kwargs): +def flist(pth,**kwargs): + flst = [] + if(not('recurse' in kwargs)): + recurse_ = False + else: + recurse_ = kwargs['recurse'] + if(not('exts' in kwargs)): + filterexts_ = False + else: + filterexts_ = True + exts = kwargs['exts'] + if(not('normpath' in kwargs)): + normpath_ = True + else: + normpath_ = kwargs['normpath'] + if(not('linuxpath' in kwargs)): + linuxpath_ = False + else: + linuxpath_ = kwargs['linuxpath'] + if(not('followlinks' in kwargs)): + followlinks_ = False + else: + followlinks_ = kwargs['followlinks'] + + dirlist = [] + rawlist = os.listdir(pth) + + for F in rawlist: + F2 = os.path.join(pth,F) + if(os.path.isdir(F2)): + b = (followlinks_) or ((not followlinks_) and not(os.path.islink(F2))) + if(b): + if((F2!=".")&(F2!="..")): + dirlist.append(F2) + elif(os.path.isfile(F2)): + flst.append(F2) + + #Recurse through directories + if(recurse_): + for D in dirlist: + lst = flist(D,**kwargs) + for L in lst: + flst.append(L) + + #Postprocess: + #Filter out all extensions except the selected ext list + if(filterexts_): + flst = filterexts(flst,exts) + + #Normalize filename path according to os + if(normpath_): + flst2 = list(flst) + for I in range(0,len(flst2)): + flst[I] = os.path.normpath(flst2[I]) + + #If linuxpath, convert all \\ to / + #if(linuxpath_): + # flst2 = list(flst) + # for I in range(0,len(flst2)): + # flst[I] = linuxpath(flst2[I]) + + return flst + +#Filters by extensions in a list of files +#flst = def filterexts(flst,exts): +def filterexts(flst,exts): + flst2 = [] + if(isinstance(exts,str)): + exts = list([exts]) + for F in flst: + b = False + for ext in exts: + if(ext[0]!='.'): + ext = '.'+ext + F2 = os.path.splitext(F) + if(len(F2)>=2): + ex = F2[1] + if(len(ex)>0): + if(ex[0]!='.'): + ex = '.'+ex + if(ex==ext): + b = True + if(b): + flst2.append(F) + + return flst2 + +#Find a file fname, starting in pth and recursing +#Used for finding library files to link +def findfile(fname,pth,**kwargs): + fullfname = "" + flst = flist(pth,recurse=True) + for F in flst: + F2 = os.path.split(F)[1] + if(F2 == fname): + fullfname = F + + return fullfname + +#List to space-seperated-string +def list_to_sss(lst): + lout = "" + for I in range(0,len(lst)-1): + lout = lout + lst[I] + " " + if(len(lst)>0): + lout = lout + lst[len(lst)-1] + return lout + +def strip_whitespace(strin): + strout = "" + I1 = -1 + I2 = -1 + for I in range(0,len(strin)): + if(strin[I]!=' ' and strin[I]!='\t' and strin[I]!='\r'and strin[I]!='\n'): + I1 = I + break + q = list(range(0,len(strin))) + q.reverse() + for I in q: + if(strin[I]!=' ' and strin[I]!='\t' and strin[I]!='\r'and strin[I]!='\n'): + I2 = I+1 + break + if(I1>=0 and I2>=0): + strout = strin[I1:I2] + return strout + +def sss_to_list(sss): + lout = [] + l1 = sss.split(' ') + for l in l1: + l2 = strip_whitespace(l) + lout.append(l2) + return lout + + +def replaceext(fname,ext): + fname2 = "" + if(len(ext)>0): + if(ext[0]!='.'): + ext = '.'+ext + fname2 = os.path.splitext(fname)[0]+ext + else: + fname2 = os.path.splitext(fname)[0] + return fname2 + +def replaceexts(fnamelist,ext): + fname2list = [] + for F in fnamelist: + F2 = replaceext(F,ext) + fname2list.append(F2) + return fname2list + +# def except_contains_oldv(lst1,exc): +# lst2 = [] +# for item in lst1: +# b = 1 +# for item2 in exc: +# if(item.find(item2)>=0): +# b = 0 +# break +# if(b==1): +# lst2.append(item) +# return lst2 + +#filenames must match +def except_contains(lst1,exc): + lst2 = [] + for item in lst1: + b = 1 + for item2 in exc: + fsplit = os.path.split(item) + fn = fsplit[len(fsplit)-1] + if(fn==item2): + b = 0 + break + if(b==1): + lst2.append(item) + return lst2 + +########################## +##System Call Procedures## +########################## + +def callproc(cmd, **kwargs): + if(not('logfile' in kwargs)): + use_lf = False + else: + logfile = kwargs['logfile'] + if(logfile!=""): + fp = open(kwargs['logfile'],'a+') + use_lf = True + else: + use_lf = False + + if(not('echo' in kwargs)): + echo = True + else: + echo = kwargs['echo'] + + if(echo): + print(cmd) + + #encoding/deconding to/from bytes is necessary to use the subprocess command + #in python3.7 + #However, only do this in linux + if(sys.platform!='win32'): + cmd2 = cmd.encode(encoding='utf-8') + else: + cmd2 = cmd + proc = subprocess.Popen(cmd2,stderr = subprocess.STDOUT, stdout=subprocess.PIPE, shell=True) + (out, err) = proc.communicate() + + out = out.decode(encoding='utf-8') + + if(echo): + print(out) + #print(err); + if(use_lf): + fp.writelines(cmd+'\n') + fp.writelines(out+'\n') + + if(use_lf): + fp.close() + +####################################### +##Compiler, Archive, and Linker Calls## +####################################### + +def smartcompile(srcfile,objext='.o'): + mtsrc = os.path.getmtime(srcfile) + objfile = replaceext(srcfile,objext) + objexists = os.path.exists(objfile) + ret = True + if(objexists): + mtobj = os.path.getmtime(objfile) + if(mtobj>=mtsrc): + ret = False + + return ret + +#MSVC compiler wrapper +def msvc_compile(compilername, srcfile, **kwargs): + + if(not('include' in kwargs)): + include = '' + else: + include = kwargs['include'] + if(isinstance(include,list)): + include = list_to_sss(include) + + if(not('flags' in kwargs)): + flags = '' + else: + flags = kwargs['flags'] + if(isinstance(flags,list)): + flags = list_to_sss(flags) + + if(not('objext' in kwargs)): + objext = '.obj' + else: + objext = kwargs['objext'] + + if(not('srcfileflag' in kwargs)): + srcfileflag = '/c' + else: + srcfileflag = kwargs['srcfileflag'] + + if(not('outfileflag' in kwargs)): + outfileflag = '/Fo:' + else: + outfileflag = kwargs['outfileflag'] + if(not('logfile' in kwargs)): + logfile = "" + else: + logfile = kwargs['logfile'] + + outfile = replaceext(srcfile,objext) + ln = compilername+" "+flags+" "+" "+srcfileflag+" "+srcfile+" "+outfileflag+outfile + ln = ln + " " + include + + callproc(ln,echo=True,logfile=logfile) + + return + +#MSVC compiler wrapper +def msvc_compile_list(compiler,srclist,**kwargs): + for S in srclist: + msvc_compile(compiler,S,**kwargs) + return + +#gnu-style compiler compile: Should work with gcc, g++, gfortran +def gs_compile(compiler,srcfile,**kwargs): + if(not('include' in kwargs)): + include = '' + else: + include = kwargs['include'] + if(isinstance(include,list)): + include = list_to_sss(include) + + if(not('flags' in kwargs)): + flags = '' + else: + flags = kwargs['flags'] + if(isinstance(flags,list)): + flags = list_to_sss(flags) + + if(not('objext' in kwargs)): + objext = '.o' + else: + objext = kwargs['objext'] + + if(not('srcfileflag' in kwargs)): + srcfileflag = '-c' + else: + srcfileflag = kwargs['srcfileflag'] + + if(not('outfileflag' in kwargs)): + outfileflag = '-o' + else: + outfileflag = kwargs['outfileflag'] + + if(not('logfile' in kwargs)): + logfile = "" + else: + logfile = kwargs['logfile'] + + if(not('smartcompile' in kwargs)): + _smartcompile = True + else: + _smartcompile = kwargs['smartcompile'] + + #Do I want to make this thing this general? + + if(not(_smartcompile) or smartcompile(srcfile,objext)): + outfile = replaceext(srcfile,objext) + ln = compiler+" "+flags+" " + outfileflag+" "+outfile+" "+srcfileflag+" "+srcfile + ln = ln + " " + include + + callproc(ln,echo=True,logfile=logfile) + + return + +def gs_compile_list(compiler,srclist,**kwargs): + for S in srclist: + gs_compile(compiler,S,**kwargs) + return + +def gs_compile_all(compiler,srcdir,srcexts,**kwargs): + if(not('recurse' in kwargs)): + recurse = True + else: + recurse = kwargs['recurse'] + + srcfils = flist(srcdir,exts=srcexts,recurse=recurse) + + for S in srcfils: + gs_compile(compiler,S,**kwargs) + + return + +def gs_link_all(linker,srcpath,target,**kwargs): + + if(not('objext' in kwargs)): + objext = '.o' + else: + objext = kwargs['objext'] + + if(not('recurse' in kwargs)): + recurse = True + else: + recurse = kwargs['recurse'] + + + objfils = flist(srcpath,exts=objext,recurse=recurse) + oflst = list_to_sss(objfils) + + gs_link_list(linker,oflst,target,**kwargs) + + return + +def gs_link_list(linker,objlist,target,**kwargs): + + if(not('objext' in kwargs)): + objext = '.o' + else: + objext = kwargs['objext'] + + if(not('libdir' in kwargs)): + libdir = '' + else: + libdir = kwargs['libdir'] + + if(not('staticlibs' in kwargs)): + staticlibs = '' + else: + staticlibs = kwargs['staticlibs'] + + if(not('libflags' in kwargs)): + libflags = '' + else: + libflags = kwargs['libflags'] + + if(not('linkerflags' in kwargs)): + linkerflags = '' + else: + linkerflags = kwargs['linkerflags'] + + if(not('recurse' in kwargs)): + recurse = True + else: + recurse = kwargs['recurse'] + + if(not('logfile' in kwargs)): + logfile = '' + else: + logfile = kwargs['logfile'] + + ln = linker+" -o "+target+" "+libdir + ln = ln+" "+objlist+" "+staticlibs+" "+libflags+" "+linkerflags + + callproc(ln,logfile=logfile) + return + +def msvc_link_list(objlist,target,**kwargs): + + linker = 'link' + + if(not('objext' in kwargs)): + objext = '.obj' + else: + objext = kwargs['objext'] + + if(not('libdir' in kwargs)): + libdir = '' + else: + libdir = kwargs['libdir'] + + if(not('staticlibs' in kwargs)): + staticlibs = '' + else: + staticlibs = kwargs['staticlibs'] + + if(not('libflags' in kwargs)): + libflags = '' + else: + libflags = kwargs['libflags'] + + if(not('linkerflags' in kwargs)): + linkerflags = '' + else: + linkerflags = kwargs['linkerflags'] + + if(not('recurse' in kwargs)): + recurse = True + else: + recurse = kwargs['recurse'] + + if(not('logfile' in kwargs)): + logfile = '' + else: + logfile = kwargs['logfile'] + + ln = linker+" "+libdir + ln = ln+" "+objlist+" "+staticlibs+" "+linkerflags + ln = ln+" /out:"+target+" "+libflags + + callproc(ln,logfile=logfile) + + return + +def ar_all(srcpath,arname,**kwargs): + if(not('recurse' in kwargs)): + recurse = True + else: + recurse = kwargs['recurse'] + if(not('objext' in kwargs)): + objext = '.o' + else: + objext = kwargs['objext'] + + objlist = flist(srcpath,exts=objext,recurse=recurse) + ar_list(objlist,arname,**kwargs) + + return + +def msvc_lib_list(objlist,arname,**kwargs): + objlist2 = list_to_sss(objlist) + + ln = "lib "+objlist2+" /out:"+arname + callproc(ln) + + return + +def ar_list(objlist,arname,**kwargs): + objlist2 = list_to_sss(objlist) + + ln = "ar cr "+ arname+" "+objlist2 + callproc(ln) + + return + +def ar_add_list(objlist,arname,**kwargs): + objlist2 = list_to_sss(objlist) + + ln = "ar t "+arname+" "+objlist2 + callproc(ln) + return + +############################## +##Derived Compiler Functions## +############################## + +def gcc_compile(srcfile,**kwargs): + compiler = 'gcc' + kwargs['objext'] = '.o' + #srcexts = ['.c'] + + gs_compile(compiler,srcfile,**kwargs) + + return + +def gcc_compile_all(srcdir,**kwargs): + compiler = 'gcc' + kwargs['objext'] = '.o' + srcexts = ['.c'] + + gs_compile_all(compiler,srcdir,srcexts,**kwargs) + + return + +def gcc_compile_list(srclist,**kwargs): + compiler = 'gcc' + kwargs['objext'] = '.o' + #srcexts = ['.c'] + + gs_compile_list(compiler,srclist,**kwargs) + + return + +def gpp_compile(srcfile,**kwargs): + compiler = 'g++' + kwargs['objext'] = '.o' + #srcexts = ['.c','.cpp'] + + gs_compile(compiler,srcfile,**kwargs) + + return + +def gpp_compile_all(srcdir,**kwargs): + compiler = 'g++' + kwargs['objext'] = '.o' + srcexts = ['.c','.cpp'] + + gs_compile_all(compiler,srcdir,srcexts,**kwargs) + + return + +def gpp_compile_list(srclist,**kwargs): + compiler = 'g++' + kwargs['objext'] = '.o' + #srcexts = ['.c','.cpp'] + + gs_compile_list(compiler,srclist,**kwargs) + + return + +def gfortran_compile(srcfile,**kwargs): + compiler = 'gfortran' + kwargs['objext'] = '.o' + #srcexts = ['.f','.f90','.f77'] + + gs_compile(compiler,srcfile,**kwargs) + + return + +def gfortran_compile_all(srcdir,**kwargs): + compiler = 'gfortran' + kwargs['objext'] = '.o' + srcexts = ['.f','.f90','.f77'] + + gs_compile_all(compiler,srcdir,srcexts,**kwargs) + + return + +def gfortran_compile_list(srclist,**kwargs): + compiler = 'gfortran' + kwargs['objext'] = '.o' + #srcexts = ['.f','.f90','.f77'] + + gs_compile_list(compiler,srclist,**kwargs) + + return + +def clang_compile(srcfile,**kwargs): + compiler = 'clang++' + kwargs['objext'] = '.o' + #srcexts = ['.c','.cpp'] + + gs_compile(compiler,srcfile,**kwargs) + + return + +def clang_compile_all(srcdir,**kwargs): + compiler = 'clang++' + kwargs['objext'] = '.o' + srcexts = ['.c','.cpp'] + + gs_compile_all(compiler,srcdir,srcexts,**kwargs) + + return + +def clang_compile_list(srclist,**kwargs): + compiler = 'clang++' + kwargs['objext'] = '.o' + #srcexts = ['.c','.cpp'] + + gs_compile_list(compiler,srclist,**kwargs) + + return \ No newline at end of file diff --git a/old/compscripts/old/linux64.makelib.py b/old/compscripts/old/linux64.makelib.py new file mode 100644 index 0000000..4f685f0 --- /dev/null +++ b/old/compscripts/old/linux64.makelib.py @@ -0,0 +1,45 @@ +#!/usr/bin/python3 + +import os,sys,subprocess,math +from complib2 import * + +import shutil +#from distutils.dir_util import copy_tree as copy_tree #this version does overwrites +from shutil import copytree as copytree + +libname = 'amsculib2.linux64' #prefix static library name to generate +targetname = 'test' #create this executable when compiling tests +commonincdir = "../../linux64/include" +commonlibdir = "../../linux64/lib" +localbindir = "./bin_linux64" +cc = 'nvcc' #compiler +srcexts = ['.c','.cpp','.cu'] +mainsrc = ['main.c','main.cpp','main.cu'] #ignore these files when compiling the static library + +kwargs = dict() +include = "-I./include -I{}".format(commonincdir) +kwargs['include'] = include +#-dc flag: relocatable device code - needed for device functions to link in different "execution units" +kwargs['flags'] = "-dc" +kwargs['libdir'] = "-L{} -L{}".format(localbindir,commonlibdir) +kwargs['libflags'] = "-l{}".format(libname) +kwargs['linkerflags'] = "" +kwargs['recurse'] = True + +#find all source files, except the main project files +files = flist('./src',exts = srcexts, recurse=True) +files = except_contains(files,mainsrc) +objfiles = replaceexts(files,'.o') +objfiles_sss = list_to_sss(objfiles) + +#compile all the source files in the list +gs_compile_list(cc,files,**kwargs) + +#archive all the source files into a static library +ar_list(objfiles,'{}/lib{}.a'.format(localbindir,libname)) + +#Push any libraries to the common lib folder +shutil.copy('{}/lib{}.a'.format(localbindir,libname),commonlibdir) + +#Copy include files to the common include folder +copytree('./include/',commonincdir+'/',dirs_exist_ok=True) diff --git a/old/compscripts/old/linux64.maketest.py b/old/compscripts/old/linux64.maketest.py new file mode 100644 index 0000000..034f08f --- /dev/null +++ b/old/compscripts/old/linux64.maketest.py @@ -0,0 +1,38 @@ +#!/usr/bin/python3 + +import os,sys,subprocess,math +from complib2 import * + +import shutil + +libname = 'amsculib2.linux64' #prefix static library name to generate +targetname = 'test' #create this executable when compiling tests +commonincdir = "../../linux64/include" +commonlibdir = "../../linux64/lib" +localbindir = "./bin_linux64" +cc = 'nvcc' #compiler +srcexts = ['.c','.cpp','.cu'] +mainsrc = ['main.c','main.cpp','main.cu'] #ignore these files when compiling the static library + +kwargs = dict() +include = "-I./include -I{}".format(commonincdir) +kwargs['include'] = include +#-dc flag: relocatable device code - needed for device functions to link in different "execution units" +kwargs['flags'] = "-dc" +kwargs['libdir'] = "-L{} -L{}".format(localbindir,commonlibdir) +kwargs['libflags'] = "-l{}".format(libname) +kwargs['linkerflags'] = "" +kwargs['recurse'] = True + +#Pull required binary dynamic libraries to the bin folder +#shutil.copy('{}/libamsimg.dll.a'.format(commonlibdir),localbindir); +#shutil.copy('{}/libamsimg.dll'.format(commonlibdir),localbindir); +#shutil.copy('../../lib_winx64/glew32.dll','./bin_winx64'); + +#Designate source files for main test program +fsrc = ['./src/main.cu'] +fobj = replaceexts(fsrc,'.o') + +#Compile test programs +gs_compile_list(cc,fsrc,**kwargs) +gs_link_list(cc,list_to_sss(fobj),'{}/{}'.format(localbindir,targetname),**kwargs) diff --git a/old/compscripts/old/msvc.makelib.py b/old/compscripts/old/msvc.makelib.py new file mode 100644 index 0000000..a457d2d --- /dev/null +++ b/old/compscripts/old/msvc.makelib.py @@ -0,0 +1,45 @@ +#!/usr/bin/python3 + +import os,sys,subprocess,math +from complib2 import * + +import shutil +from shutil import copytree as copytree + +libname = 'assetcuda.msvc64' #prefix static library name to generate +targetname = 'main' #create this executable when compiling tests +commonincdir = "../../winx64/include" +commonlibdir = "../../winx64/lib" +localbindir = "./bin_winx64" +cc = 'nvcc' #compiler +srcexts = ['.c','.cpp'] +mainsrc = ['main.c','main.cpp','main.cu'] #ignore these files when compiling the static library + +kwargs = dict() +include = "-I./include -I{}".format(commonincdir) +kwargs['include'] = include +kwargs['flags'] = "/O2" +kwargs['libdir'] = "/LIBPATH:{} /LIBPATH:{}".format(localbindir,commonlibdir) +kwargs['libflags'] = "-l{}".format(libname) +kwargs['linkerflags'] = "" +kwargs['recurse'] = True + +#find all source files, except the main project files +files = flist('./src',exts = srcexts, recurse=True) +files = except_contains(files,mainsrc) +objfiles = replaceexts(files,'.obj') +objfiles_sss = list_to_sss(objfiles) + +#compile all the source files in the list +msvc_compile_list(cc,files,**kwargs) +#gs_compile_list(cc,files,**kwargs) + +#archive all the source files into a static library +#ar_list(objfiles,'{}/lib{}.a'.format(localbindir,libname)) +msvc_lib_list(objfiles,'{}/lib{}.lib'.format(localbindir,libname)) + +#Push any libraries to the common lib folder +shutil.copy('{}/lib{}.lib'.format(localbindir,libname),commonlibdir) + +#Copy include files to the common include folder +copytree('./include/',commonincdir+'/',dirs_exist_ok=True) diff --git a/old/compscripts/old/msvc.maketest.py b/old/compscripts/old/msvc.maketest.py new file mode 100644 index 0000000..5cb0533 --- /dev/null +++ b/old/compscripts/old/msvc.maketest.py @@ -0,0 +1,39 @@ +#!/usr/bin/python3 + +import os,sys,subprocess,math +from complib2 import * + +import shutil +from distutils.dir_util import copy_tree as copy_tree #this version does overwrites + +libname = 'assetcuda.msvc64' #prefix static library name to generate +targetname = 'tests.exe' #create this executable when compiling tests +commonincdir = "../../winx64/include" +commonlibdir = "../../winx64/lib" +localbindir = "./bin_winx64" +cc = 'nvcc' #compiler +srcexts = ['.c','.cpp'] +mainsrc = ['main.c','main.cpp','main.cu'] #ignore these files when compiling the static library + +kwargs = dict() +include = "-I./include -I{}".format(commonincdir) +kwargs['include'] = include +kwargs['flags'] = "/O2" +kwargs['libdir'] = "/LIBPATH:{} /LIBPATH:{}".format(localbindir,commonlibdir) +#kwargs['libflags'] = "lib{}.lib libamsearthtools.msvc64.lib libamsmeshtools.msvc64.lib libamsmathlib3.msvc64.lib libamsmatrix_cpp.msvc64.lib liblapack.a libblas.a libamsstring3.msvc64.lib libamsmathutil2.msvc64.lib".format(libname) +kwargs['libflags'] = "lib{}.lib".format(libname) +kwargs['linkerflags'] = "" +kwargs['recurse'] = True + +#Pull required binary dynamic libraries to the bin folder +#shutil.copy('{}/libamsimg.dll.a'.format(commonlibdir),localbindir); +#shutil.copy('{}/libamsimg.dll'.format(commonlibdir),localbindir); +#shutil.copy('../../lib_winx64/glew32.dll','./bin_winx64'); + +#Designate source files for main test program +fsrc = ['./src/main.cu'] +fobj = replaceexts(fsrc,'.obj') + +#Compile test programs +msvc_compile_list(cc,fsrc,**kwargs) +msvc_link_list(list_to_sss(fobj),'{}/{}'.format(localbindir,targetname),**kwargs) diff --git a/old/compscripts/old/winnvcc.makelib.py b/old/compscripts/old/winnvcc.makelib.py new file mode 100644 index 0000000..c9010a8 --- /dev/null +++ b/old/compscripts/old/winnvcc.makelib.py @@ -0,0 +1,44 @@ +#!/usr/bin/python3 + +import os,sys,subprocess,math +from complib2 import * + +import shutil +from distutils.dir_util import copy_tree as copy_tree #this version does overwrites + +libname = 'amsculib2.msvc64' #prefix static library name to generate +targetname = 'test' #create this executable when compiling tests +commonincdir = "../../winx64/include" +commonlibdir = "../../winx64/lib" +localbindir = "./bin_winx64" +cc = 'nvcc' #compiler +srcexts = ['.c','.cpp','.cu'] +mainsrc = ['main.c','main.cpp'] #ignore these files when compiling the static library + +kwargs = dict() +include = "-I./include -I{}".format(commonincdir) +kwargs['include'] = include +kwargs['flags'] = "-dc" +kwargs['libdir'] = "-L{} -L{}".format(localbindir,commonlibdir) +kwargs['libflags'] = "-l{}".format(libname) +kwargs['linkerflags'] = "" +kwargs['recurse'] = True + +#find all source files, except the main project files +files = flist('./src',exts = srcexts, recurse=True) +files = except_contains(files,mainsrc) +objfiles = replaceexts(files,'.o') +objfiles_sss = list_to_sss(objfiles) + +#compile all the source files in the list +gs_compile_list(cc,files,**kwargs) + +#archive all the source files into a static library +#ar_list(objfiles,'{}/lib{}.a'.format(localbindir,libname)) +msvc_lib_list(objfiles,'{}/lib{}.lib'.format(localbindir,libname)) + +#Push any libraries to the common lib folder +shutil.copy('{}/lib{}.lib'.format(localbindir,libname),commonlibdir) + +#Copy include files to the common include folder +copy_tree('./include/',commonincdir+'/') diff --git a/old/compscripts/old/winnvcc.maketest.py b/old/compscripts/old/winnvcc.maketest.py new file mode 100644 index 0000000..1914884 --- /dev/null +++ b/old/compscripts/old/winnvcc.maketest.py @@ -0,0 +1,38 @@ +#!/usr/bin/python3 + +import os,sys,subprocess,math +from complib2 import * + +import shutil +from distutils.dir_util import copy_tree as copy_tree #this version does overwrites + +libname = 'amsculib2.msvc64' #prefix static library name to generate +targetname = 'test' #create this executable when compiling tests +commonincdir = "../../winx64/include" +commonlibdir = "../../winx64/lib" +localbindir = "./bin_winx64" +cc = 'nvcc' #compiler +srcexts = ['.c','.cpp','.cu'] +mainsrc = ['main.c','main.cpp'] #ignore these files when compiling the static library + +kwargs = dict() +include = "-I./include -I{}".format(commonincdir) +kwargs['include'] = include +kwargs['flags'] = "-dc" +kwargs['libdir'] = "-L{} -L{}".format(localbindir,commonlibdir) +kwargs['libflags'] = "-llib{}".format(libname) +kwargs['linkerflags'] = "" +kwargs['recurse'] = True + +#Pull required binary dynamic libraries to the bin folder +#shutil.copy('{}/libamsimg.dll.a'.format(commonlibdir),localbindir); +#shutil.copy('{}/libamsimg.dll'.format(commonlibdir),localbindir); +#shutil.copy('../../lib_winx64/glew32.dll','./bin_winx64'); + +#Designate source files for main test program +fsrc = ['./src/main.cpp'] +fobj = replaceexts(fsrc,'.o') + +#Compile test programs +gs_compile_list(cc,fsrc,**kwargs) +gs_link_list(cc,list_to_sss(fobj),'{}/{}'.format(localbindir,targetname),**kwargs) diff --git a/old/compscripts/winnvcc.makelib.py b/old/compscripts/winnvcc.makelib.py new file mode 100644 index 0000000..7cfc6f1 --- /dev/null +++ b/old/compscripts/winnvcc.makelib.py @@ -0,0 +1,49 @@ +#!/usr/bin/python3 + +import os,sys,subprocess,math +from complib2 import * +from complib3 import gs_incremental_compile, gs_incremental_compile_list + +import shutil +from shutil import copytree + +libname = 'amsculib2.msvc64' #prefix static library name to generate +targetname = 'test' #create this executable when compiling tests +commonincdir = "../../winx64/include" +commonlibdir = "../../winx64/lib" +localbindir = "./bin_winx64" +cc = 'nvcc' #compiler +srcexts = ['.c','.cpp','.cu'] +mainsrc = ['main.cu'] #ignore these files when compiling the static library + +kwargs = dict() +include = "-I./include -I{}".format(commonincdir) +kwargs['include'] = include +kwargs['flags'] = "-dc" +kwargs['libdir'] = "-L{} -L{}".format(localbindir,commonlibdir) +kwargs['libflags'] = "-l{}".format(libname) +kwargs['linkerflags'] = "" +kwargs['recurse'] = True +kwargs['objstore'] = "./objstore" +kwargs['searchincdirs'] = ['./include'] + +#find all source files, except the main project files +files = flist('./src',exts = srcexts, recurse=True) +files = except_contains(files,mainsrc) +objfiles = replaceexts(files,'.o') +objfiles_sss = list_to_sss(objfiles) + +#compile all the source files in the list +#gs_compile_list(cc,files,**kwargs) +gs_incremental_compile_list(cc,files,**kwargs) + +#archive all the source files into a static library +#ar_list(objfiles,'{}/lib{}.a'.format(localbindir,libname)) +objlist = flist(kwargs['objstore'],exts='.o',recurse=True) +msvc_lib_list(objlist,'{}/lib{}.lib'.format(localbindir,libname)) + +# #Push any libraries to the common lib folder +shutil.copy('{}/lib{}.lib'.format(localbindir,libname),commonlibdir) + +# #Copy include files to the common include folder +copytree('./include/',commonincdir+'/',dirs_exist_ok=True) diff --git a/old/compscripts/winnvcc.maketest.py b/old/compscripts/winnvcc.maketest.py new file mode 100644 index 0000000..8622faa --- /dev/null +++ b/old/compscripts/winnvcc.maketest.py @@ -0,0 +1,43 @@ +#!/usr/bin/python3 + +import os,sys,subprocess,math +from complib2 import * +from complib3 import gs_incremental_compile, gs_incremental_compile_list + +import shutil +from shutil import copytree + +libname = 'amsculib2.msvc64' #prefix static library name to generate +targetname = 'test' #create this executable when compiling tests +commonincdir = "../../winx64/include" +commonlibdir = "../../winx64/lib" +localbindir = "./bin_winx64" +cc = 'nvcc' #compiler +srcexts = ['.c','.cpp','.cu'] +mainsrc = ['main.cu'] #ignore these files when compiling the static library + +kwargs = dict() +include = "-I./include -I{}".format(commonincdir) +kwargs['include'] = include +kwargs['flags'] = "-dc" +kwargs['libdir'] = "-L{} -L{}".format(localbindir,commonlibdir) +kwargs['libflags'] = "-llib{} -llibamsculib2.msvc64".format(libname) +kwargs['linkerflags'] = "" +kwargs['recurse'] = True +kwargs['objstore'] = "./objstore" +kwargs['searchincdirs'] = ['./include'] + +#-lamsmathlib3.linux64 -lamsstring3.linux64 -lamsmatrix_cpp.linux64 -llapack -lblas -lgfortran -lamsmathutilthread.linux64 -lamsmathutil2.linux64 + +#Pull required binary dynamic libraries to the bin folder +#shutil.copy('{}/libamsimg.dll.a'.format(commonlibdir),localbindir); +#shutil.copy('{}/libamsimg.dll'.format(commonlibdir),localbindir); +#shutil.copy('../../lib_winx64/glew32.dll','./bin_winx64'); + +#Designate source files for main test program +fsrc = ['./src/main.cu'] +fobj = replaceexts(fsrc,'.o') + +#Compile test programs +gs_compile_list(cc,fsrc,**kwargs) +gs_link_list(cc,list_to_sss(fobj),'{}/{}'.format(localbindir,targetname),**kwargs) diff --git a/old/run.py b/old/run.py new file mode 100644 index 0000000..943ac72 --- /dev/null +++ b/old/run.py @@ -0,0 +1,23 @@ +#!/usr/bin/python3 + +import os,sys,math; +from compscripts.complib2 import *; + +args = sys.argv +if(len(args)>=2): + if(args[1]=="clean"): + obj_list = flist('./objstore',recurse=True,exts=['.o']) + for o in obj_list: + os.remove('{}'.format(o)) + exit(0) + +os.system('python3 ./compscripts/linux64.makelib.py') +os.system('python3 ./compscripts/linux64.maketest.py') + +# obj_list = flist('./src',recurse=True,exts=['.o']) +# for o in obj_list: +# os.remove('{}'.format(o)) + +#os.chdir('./bin_linux64') +callproc('./bin_linux64/test') +#os.chdir('..') \ No newline at end of file diff --git a/old/run3.py b/old/run3.py new file mode 100644 index 0000000..75fb9f7 --- /dev/null +++ b/old/run3.py @@ -0,0 +1,17 @@ +#!/usr/bin/python3 + +import os,sys,math; +from compscripts.complib2 import *; + + +os.system('python ./compscripts/winnvcc.makelib.py') +os.system('python ./compscripts/winnvcc.maketest.py') + +obj_list = flist('./',recurse=True,exts=['.o']) +for o in obj_list: + os.remove('{}'.format(o)) + +#os.chdir('./bin_winx64') +callproc('.\\bin_winx64\\test.exe') +#os.chdir('..') + diff --git a/pull.sh b/pull.sh new file mode 100644 index 0000000..ea05dcd --- /dev/null +++ b/pull.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +scp aschinder@amssolarempire.com:workspace/projects/amsculib2.tar.gz ../ +tar xzvf ../amsculib2.tar.gz ./ diff --git a/src/amsculib2/amscu_comp128.cu b/src/amsculib2/amscu_comp128.cu index 859fc4d..7c4de7a 100644 --- a/src/amsculib2/amscu_comp128.cu +++ b/src/amsculib2/amscu_comp128.cu @@ -1,476 +1,476 @@ -#include - -namespace amscuda -{ -namespace cmp -{ - -__host__ __device__ cucomp128::cucomp128() -{ - real = 0.0; - imag = 0.0; - return; -} - -__host__ __device__ cucomp128::~cucomp128() -{ - real = 0.0; - imag = 0.0; - return; -} - -__host__ __device__ cucomp128::cucomp128(const cucomp128 &other) -{ - real = other.real; - imag = other.imag; - - return; -} - -__host__ __device__ cucomp128::cucomp128(const double &other) -{ - real = other; - imag = 0.0; - return; -} - -__host__ __device__ cucomp128& cucomp128::operator=(cucomp128& other) -{ - real = other.real; - imag = other.imag; - return *this; -} - -__host__ __device__ const cucomp128& cucomp128::operator=(const cucomp128& other) -{ - this->real = other.real; - this->imag = other.imag; - return *this; -} - -__host__ __device__ cucomp128& cucomp128::operator=(double& other) -{ - real = other; - imag = 0.0; - return *this; -} - -__host__ __device__ const cucomp128& cucomp128::operator=(const double& other) -{ - this->real = other; - this->imag = 0.0; - return *this; -} - -__host__ __device__ double& cucomp128::operator[](int& ind) -{ - if(ind==0) - { - return this->real; - } - else - { - return this->imag; - } -} - -__host__ __device__ const double& cucomp128::operator[](const int& ind) const -{ - if(ind==0) - { - return this->real; - } - else - { - return this->imag; - } -} - -__host__ __device__ cucomp128 cucomp128::operator+(const cucomp128& z) -{ - cucomp128 ret; - ret.real = real + z.real; - ret.imag = imag + z.imag; - return ret; -} - -__host__ __device__ cucomp128 cucomp128::operator-(const cucomp128& z) -{ - cucomp128 ret; - ret.real = real - z.real; - ret.imag = imag - z.imag; - return ret; -} - -__host__ __device__ cucomp128 cucomp128::operator*(const cucomp128& z) -{ - cucomp128 ret; - ret.real = (real*z.real - imag*z.imag); - ret.imag = (imag*z.real + real*z.imag); - return ret; -} - -__host__ __device__ cucomp128 cucomp128::operator/(const cucomp128& z) -{ - cucomp128 ret; - double zm2 = z.real*z.real+z.imag*z.imag; - - if(zm2>0.0) - { - ret.real = (this->real*z.real+this->imag*z.imag)/zm2; - ret.imag = (this->imag*z.real-this->real*z.imag)/zm2; - } - else - { - ret.real = (double) finf; - ret.imag = (double) finf; - } - - return ret; -} - -__host__ __device__ cucomp128 cucomp128::operator+(const double& z) -{ - cucomp128 ret; - ret.real = this->real + z; - ret.imag = this->imag; - return ret; -} -__host__ __device__ cucomp128 cucomp128::operator-(const double& z) -{ - cucomp128 ret; - ret.real = real-z; - ret.imag = imag; - return ret; -} -__host__ __device__ cucomp128 cucomp128::operator*(const double& z) -{ - cucomp128 ret; - ret.real = real*z; - ret.imag = imag*z; - return ret; -} -__host__ __device__ cucomp128 cucomp128::operator/(const double& z) -{ - cucomp128 ret; - if(z!=0.0f) - { - ret.real = real/z; - ret.imag = imag/z; - } - else - { - ret.real = finf; - ret.imag = finf; - } - - return ret; -} - -__host__ __device__ bool cucomp128::operator==(const cucomp128& z) const -{ - bool ret = 0; - if(z.real == real && z.imag == imag) - { - ret = 1; - } - return ret; -} - -__host__ __device__ bool cucomp128::operator!=(const cucomp128& z) const -{ - return !(*this==z); -} - -//sort first by real value, then by imaginary value -//this is done so that an ordering exists, as long as two values aren't equal -__host__ __device__ bool cucomp128::operator>(const cucomp128& z) const -{ - bool ret = 0; - if(this->real>z.real) - { - ret = 1; - } - else if(this->real==z.real) - { - if(this->imag>z.imag) - { - ret = 1; - } - else - { - ret = 0; - } - } - else - { - ret = 0; - } - return ret; -} - -__host__ __device__ bool cucomp128::operator<(const cucomp128& z) const -{ - bool ret = 0; - if(this->realreal==z.real) - { - if(this->imag=(const cucomp128& z) const -{ - bool ret = (*this==z || *this>z); - return ret; -} - -__host__ __device__ bool cucomp128::operator<=(const cucomp128& z) const -{ - bool ret = (*this==z || *thisreal) || ::isnan(this->imag)) - { - ret = 1; - } - return ret; -} - -__host__ __device__ bool cucomp128::isinf() const -{ - bool ret = 0; - //calls math.h isinf() - if(::isinf(this->real) || ::isinf(this->imag)) - { - ret = 1; - } - return ret; -} - -__host__ __device__ bool cucomp128::isreal() const -{ - bool ret = 1; - if(imag!=0.0f) - { - ret = 0; - } - return ret; -} - -__host__ __device__ bool cucomp128::isimag() const -{ - bool ret = 1; - if(real!=0.0f) - { - ret = 0; - } - return ret; -} - -__host__ __device__ bool cucomp128::iszero() const -{ - bool ret = 1; - if(real!=0.0f || imag!=0.0f) - { - ret = 0; - } - return ret; -} - -__host__ __device__ double cucomp128::arg() const -{ - double ret = 0.0; - ret = (double) amscuda::arg((double)real,(double)imag); - return ret; -} - -__host__ __device__ double cucomp128::mag() const -{ - double ret = 0.0; - ret = ::sqrt(real*real+imag*imag); - return ret; -} - -__host__ __device__ cucomp128 cucomp128::conj() const -{ - cucomp128 ret; - ret.real = real; - ret.imag = -imag; - return ret; -} - -__host__ __device__ double arg(cucomp128 z) -{ - return z.arg(); -} - -__host__ __device__ double abs(cucomp128 z) -{ - return z.mag(); -} - -__host__ __device__ cucomp128 dtocomp(double _r, double _i) -{ - cucomp128 ret; - ret.real = _r; - ret.imag = _i; - return ret; -} - -__host__ __device__ double real(cucomp128 z) -{ - return z.real; -} - -__host__ __device__ double imag(cucomp128 z) -{ - return z.imag; -} - -__host__ __device__ cucomp128 sin(cucomp128 z) -{ - cucomp128 ret; - cucomp128 im1 = dtocomp(0.0f,1.0f); - cucomp128 div = dtocomp(0.0f,2.0f); - - ret = (exp(im1*z)-exp(-im1*z))/div; - - return ret; -} - -__host__ __device__ cucomp128 cos(cucomp128 z) -{ - cucomp128 ret; - cucomp128 im1 = dtocomp(0.0f,1.0f); - cucomp128 div = dtocomp(2.0f,0.0f); - - ret = (exp(im1*z)+exp(-im1*z))/div; - - return ret; -} - -__host__ __device__ cucomp128 tan(cucomp128 z) -{ - return sin(z)/cos(z); -} - -__host__ __device__ cucomp128 exp(cucomp128 z) -{ - cucomp128 ret; - ret.real = ::exp(z.real)*::cos(z.imag); - ret.imag = ::exp(z.real)*::sin(z.imag); - return ret; -} - -__host__ __device__ cucomp128 log(cucomp128 z) -{ - cucomp128 ret; - ret.real = ::log(::sqrt(z.real*z.real+z.imag*z.imag)); - ret.imag = amscuda::arg(z.real,z.imag); - return ret; -} - -__host__ __device__ cucomp128 conj(cucomp128 z) -{ - return z.conj(); -} - -__host__ __device__ cucomp128 cosh(cucomp128 z) -{ - cucomp128 ret; - cucomp128 div = dtocomp(2.0f,0.0f); - - ret = (exp(z)+exp(-z))/div; - - return ret; -} - -__host__ __device__ cucomp128 sinh(cucomp128 z) -{ - cucomp128 ret; - cucomp128 div = dtocomp(2.0f,0.0f); - - ret = (exp(z)-exp(-z))/div; - - return ret; -} - -__host__ __device__ cucomp128 tanh(cucomp128 z) -{ - return sinh(z)/cosh(z); -} - -__host__ __device__ cucomp128 pow(cucomp128 z1, cucomp128 z2) -{ - cucomp128 ret; - if(z1.mag()>0.0) - ret = exp(z2*log(z1)); - else - ret = dtocomp(0.0f,0.0f); - return ret; -} - - -void test_cucomp128_1() -{ - cucomp128 z1; - cucomp128 a,b,c; - double d1; - double f1; - - printf("sizeof double=%ld\n",(long)(8*sizeof(f1))); - printf("sizeof double=%ld\n",(long)(8*sizeof(d1))); - printf("sizeof complex=%ld\n",(long)(8*sizeof(z1))); - printf("sizeof cucomp128=%ld\n",(long)(8*sizeof(a))); - - a = dtocomp(1.0,1.0); - b = dtocomp(1.0,-1.0); - - printf("a=%1.4f + %1.4fi\n",a[0],a[1]); - printf("b=%1.4f + %1.4fi\n",b[0],b[1]); - c = a+b; - printf("c=a+b: c=%1.4f + %1.4fi\n",c[0],c[1]); - c = a-b; - printf("c=a-b: c=%1.4f + %1.4fi\n",c[0],c[1]); - c = a*b; - printf("c=a*b: c=%1.4f + %1.4fi\n",c[0],c[1]); - c = a/b; - printf("c=a/b: c=%1.4f + %1.4fi\n",c[0],c[1]); - f1 = abs(a); - printf("abs(a)=%1.4f\n",f1); - f1 = arg(a); - printf("abs(a)=%1.4f pi\n",f1/pi); - - -} - -}; //end namespace cmp +#include + +namespace amscuda +{ +namespace cmp +{ + +__host__ __device__ cucomp128::cucomp128() +{ + real = 0.0; + imag = 0.0; + return; +} + +__host__ __device__ cucomp128::~cucomp128() +{ + real = 0.0; + imag = 0.0; + return; +} + +__host__ __device__ cucomp128::cucomp128(const cucomp128 &other) +{ + real = other.real; + imag = other.imag; + + return; +} + +__host__ __device__ cucomp128::cucomp128(const double &other) +{ + real = other; + imag = 0.0; + return; +} + +__host__ __device__ cucomp128& cucomp128::operator=(cucomp128& other) +{ + real = other.real; + imag = other.imag; + return *this; +} + +__host__ __device__ const cucomp128& cucomp128::operator=(const cucomp128& other) +{ + this->real = other.real; + this->imag = other.imag; + return *this; +} + +__host__ __device__ cucomp128& cucomp128::operator=(double& other) +{ + real = other; + imag = 0.0; + return *this; +} + +__host__ __device__ const cucomp128& cucomp128::operator=(const double& other) +{ + this->real = other; + this->imag = 0.0; + return *this; +} + +__host__ __device__ double& cucomp128::operator[](int& ind) +{ + if(ind==0) + { + return this->real; + } + else + { + return this->imag; + } +} + +__host__ __device__ const double& cucomp128::operator[](const int& ind) const +{ + if(ind==0) + { + return this->real; + } + else + { + return this->imag; + } +} + +__host__ __device__ cucomp128 cucomp128::operator+(const cucomp128& z) +{ + cucomp128 ret; + ret.real = real + z.real; + ret.imag = imag + z.imag; + return ret; +} + +__host__ __device__ cucomp128 cucomp128::operator-(const cucomp128& z) +{ + cucomp128 ret; + ret.real = real - z.real; + ret.imag = imag - z.imag; + return ret; +} + +__host__ __device__ cucomp128 cucomp128::operator*(const cucomp128& z) +{ + cucomp128 ret; + ret.real = (real*z.real - imag*z.imag); + ret.imag = (imag*z.real + real*z.imag); + return ret; +} + +__host__ __device__ cucomp128 cucomp128::operator/(const cucomp128& z) +{ + cucomp128 ret; + double zm2 = z.real*z.real+z.imag*z.imag; + + if(zm2>0.0) + { + ret.real = (this->real*z.real+this->imag*z.imag)/zm2; + ret.imag = (this->imag*z.real-this->real*z.imag)/zm2; + } + else + { + ret.real = (double) finf; + ret.imag = (double) finf; + } + + return ret; +} + +__host__ __device__ cucomp128 cucomp128::operator+(const double& z) +{ + cucomp128 ret; + ret.real = this->real + z; + ret.imag = this->imag; + return ret; +} +__host__ __device__ cucomp128 cucomp128::operator-(const double& z) +{ + cucomp128 ret; + ret.real = real-z; + ret.imag = imag; + return ret; +} +__host__ __device__ cucomp128 cucomp128::operator*(const double& z) +{ + cucomp128 ret; + ret.real = real*z; + ret.imag = imag*z; + return ret; +} +__host__ __device__ cucomp128 cucomp128::operator/(const double& z) +{ + cucomp128 ret; + if(z!=0.0f) + { + ret.real = real/z; + ret.imag = imag/z; + } + else + { + ret.real = finf; + ret.imag = finf; + } + + return ret; +} + +__host__ __device__ bool cucomp128::operator==(const cucomp128& z) const +{ + bool ret = 0; + if(z.real == real && z.imag == imag) + { + ret = 1; + } + return ret; +} + +__host__ __device__ bool cucomp128::operator!=(const cucomp128& z) const +{ + return !(*this==z); +} + +//sort first by real value, then by imaginary value +//this is done so that an ordering exists, as long as two values aren't equal +__host__ __device__ bool cucomp128::operator>(const cucomp128& z) const +{ + bool ret = 0; + if(this->real>z.real) + { + ret = 1; + } + else if(this->real==z.real) + { + if(this->imag>z.imag) + { + ret = 1; + } + else + { + ret = 0; + } + } + else + { + ret = 0; + } + return ret; +} + +__host__ __device__ bool cucomp128::operator<(const cucomp128& z) const +{ + bool ret = 0; + if(this->realreal==z.real) + { + if(this->imag=(const cucomp128& z) const +{ + bool ret = (*this==z || *this>z); + return ret; +} + +__host__ __device__ bool cucomp128::operator<=(const cucomp128& z) const +{ + bool ret = (*this==z || *thisreal) || ::isnan(this->imag)) + { + ret = 1; + } + return ret; +} + +__host__ __device__ bool cucomp128::isinf() const +{ + bool ret = 0; + //calls math.h isinf() + if(::isinf(this->real) || ::isinf(this->imag)) + { + ret = 1; + } + return ret; +} + +__host__ __device__ bool cucomp128::isreal() const +{ + bool ret = 1; + if(imag!=0.0f) + { + ret = 0; + } + return ret; +} + +__host__ __device__ bool cucomp128::isimag() const +{ + bool ret = 1; + if(real!=0.0f) + { + ret = 0; + } + return ret; +} + +__host__ __device__ bool cucomp128::iszero() const +{ + bool ret = 1; + if(real!=0.0f || imag!=0.0f) + { + ret = 0; + } + return ret; +} + +__host__ __device__ double cucomp128::arg() const +{ + double ret = 0.0; + ret = (double) amscuda::arg((double)real,(double)imag); + return ret; +} + +__host__ __device__ double cucomp128::mag() const +{ + double ret = 0.0; + ret = ::sqrt(real*real+imag*imag); + return ret; +} + +__host__ __device__ cucomp128 cucomp128::conj() const +{ + cucomp128 ret; + ret.real = real; + ret.imag = -imag; + return ret; +} + +__host__ __device__ double arg(cucomp128 z) +{ + return z.arg(); +} + +__host__ __device__ double abs(cucomp128 z) +{ + return z.mag(); +} + +__host__ __device__ cucomp128 dtocomp(double _r, double _i) +{ + cucomp128 ret; + ret.real = _r; + ret.imag = _i; + return ret; +} + +__host__ __device__ double real(cucomp128 z) +{ + return z.real; +} + +__host__ __device__ double imag(cucomp128 z) +{ + return z.imag; +} + +__host__ __device__ cucomp128 sin(cucomp128 z) +{ + cucomp128 ret; + cucomp128 im1 = dtocomp(0.0f,1.0f); + cucomp128 div = dtocomp(0.0f,2.0f); + + ret = (exp(im1*z)-exp(-im1*z))/div; + + return ret; +} + +__host__ __device__ cucomp128 cos(cucomp128 z) +{ + cucomp128 ret; + cucomp128 im1 = dtocomp(0.0f,1.0f); + cucomp128 div = dtocomp(2.0f,0.0f); + + ret = (exp(im1*z)+exp(-im1*z))/div; + + return ret; +} + +__host__ __device__ cucomp128 tan(cucomp128 z) +{ + return sin(z)/cos(z); +} + +__host__ __device__ cucomp128 exp(cucomp128 z) +{ + cucomp128 ret; + ret.real = ::exp(z.real)*::cos(z.imag); + ret.imag = ::exp(z.real)*::sin(z.imag); + return ret; +} + +__host__ __device__ cucomp128 log(cucomp128 z) +{ + cucomp128 ret; + ret.real = ::log(::sqrt(z.real*z.real+z.imag*z.imag)); + ret.imag = amscuda::arg(z.real,z.imag); + return ret; +} + +__host__ __device__ cucomp128 conj(cucomp128 z) +{ + return z.conj(); +} + +__host__ __device__ cucomp128 cosh(cucomp128 z) +{ + cucomp128 ret; + cucomp128 div = dtocomp(2.0f,0.0f); + + ret = (exp(z)+exp(-z))/div; + + return ret; +} + +__host__ __device__ cucomp128 sinh(cucomp128 z) +{ + cucomp128 ret; + cucomp128 div = dtocomp(2.0f,0.0f); + + ret = (exp(z)-exp(-z))/div; + + return ret; +} + +__host__ __device__ cucomp128 tanh(cucomp128 z) +{ + return sinh(z)/cosh(z); +} + +__host__ __device__ cucomp128 pow(cucomp128 z1, cucomp128 z2) +{ + cucomp128 ret; + if(z1.mag()>0.0) + ret = exp(z2*log(z1)); + else + ret = dtocomp(0.0f,0.0f); + return ret; +} + + +void test_cucomp128_1() +{ + cucomp128 z1; + cucomp128 a,b,c; + double d1; + double f1; + + printf("sizeof double=%ld\n",(long)(8*sizeof(f1))); + printf("sizeof double=%ld\n",(long)(8*sizeof(d1))); + printf("sizeof complex=%ld\n",(long)(8*sizeof(z1))); + printf("sizeof cucomp128=%ld\n",(long)(8*sizeof(a))); + + a = dtocomp(1.0,1.0); + b = dtocomp(1.0,-1.0); + + printf("a=%1.4f + %1.4fi\n",a[0],a[1]); + printf("b=%1.4f + %1.4fi\n",b[0],b[1]); + c = a+b; + printf("c=a+b: c=%1.4f + %1.4fi\n",c[0],c[1]); + c = a-b; + printf("c=a-b: c=%1.4f + %1.4fi\n",c[0],c[1]); + c = a*b; + printf("c=a*b: c=%1.4f + %1.4fi\n",c[0],c[1]); + c = a/b; + printf("c=a/b: c=%1.4f + %1.4fi\n",c[0],c[1]); + f1 = abs(a); + printf("abs(a)=%1.4f\n",f1); + f1 = arg(a); + printf("abs(a)=%1.4f pi\n",f1/pi); + + +} + +}; //end namespace cmp }; //end namespace amscuda \ No newline at end of file diff --git a/src/amsculib2/amscu_comp64.cu b/src/amsculib2/amscu_comp64.cu index 6599074..b5d3dde 100644 --- a/src/amsculib2/amscu_comp64.cu +++ b/src/amsculib2/amscu_comp64.cu @@ -1,476 +1,476 @@ -#include - -namespace amscuda -{ -namespace cmp -{ - -__host__ __device__ cucomp64::cucomp64() -{ - real = 0.0; - imag = 0.0; - return; -} - -__host__ __device__ cucomp64::~cucomp64() -{ - real = 0.0; - imag = 0.0; - return; -} - -__host__ __device__ cucomp64::cucomp64(const cucomp64 &other) -{ - real = other.real; - imag = other.imag; - - return; -} - -__host__ __device__ cucomp64::cucomp64(const float &other) -{ - real = other; - imag = 0.0; - return; -} - -__host__ __device__ cucomp64& cucomp64::operator=(cucomp64& other) -{ - real = other.real; - imag = other.imag; - return *this; -} - -__host__ __device__ const cucomp64& cucomp64::operator=(const cucomp64& other) -{ - this->real = other.real; - this->imag = other.imag; - return *this; -} - -__host__ __device__ cucomp64& cucomp64::operator=(float& other) -{ - real = other; - imag = 0.0; - return *this; -} - -__host__ __device__ const cucomp64& cucomp64::operator=(const float& other) -{ - this->real = other; - this->imag = 0.0; - return *this; -} - -__host__ __device__ float& cucomp64::operator[](int& ind) -{ - if(ind==0) - { - return this->real; - } - else - { - return this->imag; - } -} - -__host__ __device__ const float& cucomp64::operator[](const int& ind) const -{ - if(ind==0) - { - return this->real; - } - else - { - return this->imag; - } -} - -__host__ __device__ cucomp64 cucomp64::operator+(const cucomp64& z) -{ - cucomp64 ret; - ret.real = real + z.real; - ret.imag = imag + z.imag; - return ret; -} - -__host__ __device__ cucomp64 cucomp64::operator-(const cucomp64& z) -{ - cucomp64 ret; - ret.real = real - z.real; - ret.imag = imag - z.imag; - return ret; -} - -__host__ __device__ cucomp64 cucomp64::operator*(const cucomp64& z) -{ - cucomp64 ret; - ret.real = (real*z.real - imag*z.imag); - ret.imag = (imag*z.real + real*z.imag); - return ret; -} - -__host__ __device__ cucomp64 cucomp64::operator/(const cucomp64& z) -{ - cucomp64 ret; - float zm2 = z.real*z.real+z.imag*z.imag; - - if(zm2>0.0) - { - ret.real = (this->real*z.real+this->imag*z.imag)/zm2; - ret.imag = (this->imag*z.real-this->real*z.imag)/zm2; - } - else - { - ret.real = (float) finf; - ret.imag = (float) finf; - } - - return ret; -} - -__host__ __device__ cucomp64 cucomp64::operator+(const float& z) -{ - cucomp64 ret; - ret.real = this->real + z; - ret.imag = this->imag; - return ret; -} -__host__ __device__ cucomp64 cucomp64::operator-(const float& z) -{ - cucomp64 ret; - ret.real = real-z; - ret.imag = imag; - return ret; -} -__host__ __device__ cucomp64 cucomp64::operator*(const float& z) -{ - cucomp64 ret; - ret.real = real*z; - ret.imag = imag*z; - return ret; -} -__host__ __device__ cucomp64 cucomp64::operator/(const float& z) -{ - cucomp64 ret; - if(z!=0.0f) - { - ret.real = real/z; - ret.imag = imag/z; - } - else - { - ret.real = finf; - ret.imag = finf; - } - - return ret; -} - -__host__ __device__ bool cucomp64::operator==(const cucomp64& z) const -{ - bool ret = 0; - if(z.real == real && z.imag == imag) - { - ret = 1; - } - return ret; -} - -__host__ __device__ bool cucomp64::operator!=(const cucomp64& z) const -{ - return !(*this==z); -} - -//sort first by real value, then by imaginary value -//this is done so that an ordering exists, as long as two values aren't equal -__host__ __device__ bool cucomp64::operator>(const cucomp64& z) const -{ - bool ret = 0; - if(this->real>z.real) - { - ret = 1; - } - else if(this->real==z.real) - { - if(this->imag>z.imag) - { - ret = 1; - } - else - { - ret = 0; - } - } - else - { - ret = 0; - } - return ret; -} - -__host__ __device__ bool cucomp64::operator<(const cucomp64& z) const -{ - bool ret = 0; - if(this->realreal==z.real) - { - if(this->imag=(const cucomp64& z) const -{ - bool ret = (*this==z || *this>z); - return ret; -} - -__host__ __device__ bool cucomp64::operator<=(const cucomp64& z) const -{ - bool ret = (*this==z || *thisreal) || ::isnan(this->imag)) - { - ret = 1; - } - return ret; -} - -__host__ __device__ bool cucomp64::isinf() const -{ - bool ret = 0; - //calls math.h isinf() - if(::isinf(this->real) || ::isinf(this->imag)) - { - ret = 1; - } - return ret; -} - -__host__ __device__ bool cucomp64::isreal() const -{ - bool ret = 1; - if(imag!=0.0f) - { - ret = 0; - } - return ret; -} - -__host__ __device__ bool cucomp64::isimag() const -{ - bool ret = 1; - if(real!=0.0f) - { - ret = 0; - } - return ret; -} - -__host__ __device__ bool cucomp64::iszero() const -{ - bool ret = 1; - if(real!=0.0f || imag!=0.0f) - { - ret = 0; - } - return ret; -} - -__host__ __device__ float cucomp64::arg() const -{ - float ret = 0.0; - ret = (float) amscuda::arg((double)real,(double)imag); - return ret; -} - -__host__ __device__ float cucomp64::mag() const -{ - float ret = 0.0; - ret = ::sqrt(real*real+imag*imag); - return ret; -} - -__host__ __device__ cucomp64 cucomp64::conj() const -{ - cucomp64 ret; - ret.real = real; - ret.imag = -imag; - return ret; -} - -__host__ __device__ float arg(cucomp64 z) -{ - return z.arg(); -} - -__host__ __device__ float abs(cucomp64 z) -{ - return z.mag(); -} - -__host__ __device__ cucomp64 dtocomp64(float _r, float _i) -{ - cucomp64 ret; - ret.real = _r; - ret.imag = _i; - return ret; -} - -__host__ __device__ float real(cucomp64 z) -{ - return z.real; -} - -__host__ __device__ float imag(cucomp64 z) -{ - return z.imag; -} - -__host__ __device__ cucomp64 sin(cucomp64 z) -{ - cucomp64 ret; - cucomp64 im1 = dtocomp64(0.0f,1.0f); - cucomp64 div = dtocomp64(0.0f,2.0f); - - ret = (exp(im1*z)-exp(-im1*z))/div; - - return ret; -} - -__host__ __device__ cucomp64 cos(cucomp64 z) -{ - cucomp64 ret; - cucomp64 im1 = dtocomp64(0.0f,1.0f); - cucomp64 div = dtocomp64(2.0f,0.0f); - - ret = (exp(im1*z)+exp(-im1*z))/div; - - return ret; -} - -__host__ __device__ cucomp64 tan(cucomp64 z) -{ - return sin(z)/cos(z); -} - -__host__ __device__ cucomp64 exp(cucomp64 z) -{ - cucomp64 ret; - ret.real = ::exp(z.real)*::cos(z.imag); - ret.imag = ::exp(z.real)*::sin(z.imag); - return ret; -} - -__host__ __device__ cucomp64 log(cucomp64 z) -{ - cucomp64 ret; - ret.real = ::log(::sqrt(z.real*z.real+z.imag*z.imag)); - ret.imag = amscuda::arg(z.real,z.imag); - return ret; -} - -__host__ __device__ cucomp64 conj(cucomp64 z) -{ - return z.conj(); -} - -__host__ __device__ cucomp64 cosh(cucomp64 z) -{ - cucomp64 ret; - cucomp64 div = dtocomp64(2.0f,0.0f); - - ret = (exp(z)+exp(-z))/div; - - return ret; -} - -__host__ __device__ cucomp64 sinh(cucomp64 z) -{ - cucomp64 ret; - cucomp64 div = dtocomp64(2.0f,0.0f); - - ret = (exp(z)-exp(-z))/div; - - return ret; -} - -__host__ __device__ cucomp64 tanh(cucomp64 z) -{ - return sinh(z)/cosh(z); -} - -__host__ __device__ cucomp64 pow(cucomp64 z1, cucomp64 z2) -{ - cucomp64 ret; - if(z1.mag()>0.0) - ret = exp(z2*log(z1)); - else - ret = dtocomp64(0.0f,0.0f); - return ret; -} - - -void test_cucomp64_1() -{ - cucomp64 z1; - cucomp64 a,b,c; - double d1; - float f1; - - printf("sizeof double=%ld\n",(long)(8*sizeof(f1))); - printf("sizeof double=%ld\n",(long)(8*sizeof(d1))); - printf("sizeof complex=%ld\n",(long)(8*sizeof(z1))); - printf("sizeof cucomp128=%ld\n",(long)(8*sizeof(a))); - - a = dtocomp64(1.0,1.0); - b = dtocomp64(1.0,-1.0); - - printf("a=%1.4f + %1.4fi\n",a[0],a[1]); - printf("b=%1.4f + %1.4fi\n",b[0],b[1]); - c = a+b; - printf("c=a+b: c=%1.4f + %1.4fi\n",c[0],c[1]); - c = a-b; - printf("c=a-b: c=%1.4f + %1.4fi\n",c[0],c[1]); - c = a*b; - printf("c=a*b: c=%1.4f + %1.4fi\n",c[0],c[1]); - c = a/b; - printf("c=a/b: c=%1.4f + %1.4fi\n",c[0],c[1]); - f1 = abs(a); - printf("abs(a)=%1.4f\n",f1); - f1 = arg(a); - printf("abs(a)=%1.4f pi\n",f1/pi); - - -} - -}; //end namespace cmp +#include + +namespace amscuda +{ +namespace cmp +{ + +__host__ __device__ cucomp64::cucomp64() +{ + real = 0.0; + imag = 0.0; + return; +} + +__host__ __device__ cucomp64::~cucomp64() +{ + real = 0.0; + imag = 0.0; + return; +} + +__host__ __device__ cucomp64::cucomp64(const cucomp64 &other) +{ + real = other.real; + imag = other.imag; + + return; +} + +__host__ __device__ cucomp64::cucomp64(const float &other) +{ + real = other; + imag = 0.0; + return; +} + +__host__ __device__ cucomp64& cucomp64::operator=(cucomp64& other) +{ + real = other.real; + imag = other.imag; + return *this; +} + +__host__ __device__ const cucomp64& cucomp64::operator=(const cucomp64& other) +{ + this->real = other.real; + this->imag = other.imag; + return *this; +} + +__host__ __device__ cucomp64& cucomp64::operator=(float& other) +{ + real = other; + imag = 0.0; + return *this; +} + +__host__ __device__ const cucomp64& cucomp64::operator=(const float& other) +{ + this->real = other; + this->imag = 0.0; + return *this; +} + +__host__ __device__ float& cucomp64::operator[](int& ind) +{ + if(ind==0) + { + return this->real; + } + else + { + return this->imag; + } +} + +__host__ __device__ const float& cucomp64::operator[](const int& ind) const +{ + if(ind==0) + { + return this->real; + } + else + { + return this->imag; + } +} + +__host__ __device__ cucomp64 cucomp64::operator+(const cucomp64& z) +{ + cucomp64 ret; + ret.real = real + z.real; + ret.imag = imag + z.imag; + return ret; +} + +__host__ __device__ cucomp64 cucomp64::operator-(const cucomp64& z) +{ + cucomp64 ret; + ret.real = real - z.real; + ret.imag = imag - z.imag; + return ret; +} + +__host__ __device__ cucomp64 cucomp64::operator*(const cucomp64& z) +{ + cucomp64 ret; + ret.real = (real*z.real - imag*z.imag); + ret.imag = (imag*z.real + real*z.imag); + return ret; +} + +__host__ __device__ cucomp64 cucomp64::operator/(const cucomp64& z) +{ + cucomp64 ret; + float zm2 = z.real*z.real+z.imag*z.imag; + + if(zm2>0.0) + { + ret.real = (this->real*z.real+this->imag*z.imag)/zm2; + ret.imag = (this->imag*z.real-this->real*z.imag)/zm2; + } + else + { + ret.real = (float) finf; + ret.imag = (float) finf; + } + + return ret; +} + +__host__ __device__ cucomp64 cucomp64::operator+(const float& z) +{ + cucomp64 ret; + ret.real = this->real + z; + ret.imag = this->imag; + return ret; +} +__host__ __device__ cucomp64 cucomp64::operator-(const float& z) +{ + cucomp64 ret; + ret.real = real-z; + ret.imag = imag; + return ret; +} +__host__ __device__ cucomp64 cucomp64::operator*(const float& z) +{ + cucomp64 ret; + ret.real = real*z; + ret.imag = imag*z; + return ret; +} +__host__ __device__ cucomp64 cucomp64::operator/(const float& z) +{ + cucomp64 ret; + if(z!=0.0f) + { + ret.real = real/z; + ret.imag = imag/z; + } + else + { + ret.real = finf; + ret.imag = finf; + } + + return ret; +} + +__host__ __device__ bool cucomp64::operator==(const cucomp64& z) const +{ + bool ret = 0; + if(z.real == real && z.imag == imag) + { + ret = 1; + } + return ret; +} + +__host__ __device__ bool cucomp64::operator!=(const cucomp64& z) const +{ + return !(*this==z); +} + +//sort first by real value, then by imaginary value +//this is done so that an ordering exists, as long as two values aren't equal +__host__ __device__ bool cucomp64::operator>(const cucomp64& z) const +{ + bool ret = 0; + if(this->real>z.real) + { + ret = 1; + } + else if(this->real==z.real) + { + if(this->imag>z.imag) + { + ret = 1; + } + else + { + ret = 0; + } + } + else + { + ret = 0; + } + return ret; +} + +__host__ __device__ bool cucomp64::operator<(const cucomp64& z) const +{ + bool ret = 0; + if(this->realreal==z.real) + { + if(this->imag=(const cucomp64& z) const +{ + bool ret = (*this==z || *this>z); + return ret; +} + +__host__ __device__ bool cucomp64::operator<=(const cucomp64& z) const +{ + bool ret = (*this==z || *thisreal) || ::isnan(this->imag)) + { + ret = 1; + } + return ret; +} + +__host__ __device__ bool cucomp64::isinf() const +{ + bool ret = 0; + //calls math.h isinf() + if(::isinf(this->real) || ::isinf(this->imag)) + { + ret = 1; + } + return ret; +} + +__host__ __device__ bool cucomp64::isreal() const +{ + bool ret = 1; + if(imag!=0.0f) + { + ret = 0; + } + return ret; +} + +__host__ __device__ bool cucomp64::isimag() const +{ + bool ret = 1; + if(real!=0.0f) + { + ret = 0; + } + return ret; +} + +__host__ __device__ bool cucomp64::iszero() const +{ + bool ret = 1; + if(real!=0.0f || imag!=0.0f) + { + ret = 0; + } + return ret; +} + +__host__ __device__ float cucomp64::arg() const +{ + float ret = 0.0; + ret = (float) amscuda::arg((double)real,(double)imag); + return ret; +} + +__host__ __device__ float cucomp64::mag() const +{ + float ret = 0.0; + ret = ::sqrt(real*real+imag*imag); + return ret; +} + +__host__ __device__ cucomp64 cucomp64::conj() const +{ + cucomp64 ret; + ret.real = real; + ret.imag = -imag; + return ret; +} + +__host__ __device__ float arg(cucomp64 z) +{ + return z.arg(); +} + +__host__ __device__ float abs(cucomp64 z) +{ + return z.mag(); +} + +__host__ __device__ cucomp64 dtocomp64(float _r, float _i) +{ + cucomp64 ret; + ret.real = _r; + ret.imag = _i; + return ret; +} + +__host__ __device__ float real(cucomp64 z) +{ + return z.real; +} + +__host__ __device__ float imag(cucomp64 z) +{ + return z.imag; +} + +__host__ __device__ cucomp64 sin(cucomp64 z) +{ + cucomp64 ret; + cucomp64 im1 = dtocomp64(0.0f,1.0f); + cucomp64 div = dtocomp64(0.0f,2.0f); + + ret = (exp(im1*z)-exp(-im1*z))/div; + + return ret; +} + +__host__ __device__ cucomp64 cos(cucomp64 z) +{ + cucomp64 ret; + cucomp64 im1 = dtocomp64(0.0f,1.0f); + cucomp64 div = dtocomp64(2.0f,0.0f); + + ret = (exp(im1*z)+exp(-im1*z))/div; + + return ret; +} + +__host__ __device__ cucomp64 tan(cucomp64 z) +{ + return sin(z)/cos(z); +} + +__host__ __device__ cucomp64 exp(cucomp64 z) +{ + cucomp64 ret; + ret.real = ::exp(z.real)*::cos(z.imag); + ret.imag = ::exp(z.real)*::sin(z.imag); + return ret; +} + +__host__ __device__ cucomp64 log(cucomp64 z) +{ + cucomp64 ret; + ret.real = ::log(::sqrt(z.real*z.real+z.imag*z.imag)); + ret.imag = amscuda::arg(z.real,z.imag); + return ret; +} + +__host__ __device__ cucomp64 conj(cucomp64 z) +{ + return z.conj(); +} + +__host__ __device__ cucomp64 cosh(cucomp64 z) +{ + cucomp64 ret; + cucomp64 div = dtocomp64(2.0f,0.0f); + + ret = (exp(z)+exp(-z))/div; + + return ret; +} + +__host__ __device__ cucomp64 sinh(cucomp64 z) +{ + cucomp64 ret; + cucomp64 div = dtocomp64(2.0f,0.0f); + + ret = (exp(z)-exp(-z))/div; + + return ret; +} + +__host__ __device__ cucomp64 tanh(cucomp64 z) +{ + return sinh(z)/cosh(z); +} + +__host__ __device__ cucomp64 pow(cucomp64 z1, cucomp64 z2) +{ + cucomp64 ret; + if(z1.mag()>0.0) + ret = exp(z2*log(z1)); + else + ret = dtocomp64(0.0f,0.0f); + return ret; +} + + +void test_cucomp64_1() +{ + cucomp64 z1; + cucomp64 a,b,c; + double d1; + float f1; + + printf("sizeof double=%ld\n",(long)(8*sizeof(f1))); + printf("sizeof double=%ld\n",(long)(8*sizeof(d1))); + printf("sizeof complex=%ld\n",(long)(8*sizeof(z1))); + printf("sizeof cucomp128=%ld\n",(long)(8*sizeof(a))); + + a = dtocomp64(1.0,1.0); + b = dtocomp64(1.0,-1.0); + + printf("a=%1.4f + %1.4fi\n",a[0],a[1]); + printf("b=%1.4f + %1.4fi\n",b[0],b[1]); + c = a+b; + printf("c=a+b: c=%1.4f + %1.4fi\n",c[0],c[1]); + c = a-b; + printf("c=a-b: c=%1.4f + %1.4fi\n",c[0],c[1]); + c = a*b; + printf("c=a*b: c=%1.4f + %1.4fi\n",c[0],c[1]); + c = a/b; + printf("c=a/b: c=%1.4f + %1.4fi\n",c[0],c[1]); + f1 = abs(a); + printf("abs(a)=%1.4f\n",f1); + f1 = arg(a); + printf("abs(a)=%1.4f pi\n",f1/pi); + + +} + +}; //end namespace cmp }; //end namespace amscuda \ No newline at end of file diff --git a/src/amsculib2/amscu_cudafunctions.cu b/src/amsculib2/amscu_cudafunctions.cu index 7573ae0..52d1329 100644 --- a/src/amsculib2/amscu_cudafunctions.cu +++ b/src/amsculib2/amscu_cudafunctions.cu @@ -1,21 +1,21 @@ -#include - -namespace amscuda -{ - -int cuda_errortrap(const char *msgheader) -{ - int ret = 0; - cudaError_t err = cudaSuccess; - - err = cudaGetLastError(); - if(err!=cudaSuccess) - { - printf("%s :%s\n",msgheader,cudaGetErrorString(err)); - ret = 1; - } - - return ret; -} - +#include + +namespace amscuda +{ + +int cuda_errortrap(const char *msgheader) +{ + int ret = 0; + cudaError_t err = cudaSuccess; + + err = cudaGetLastError(); + if(err!=cudaSuccess) + { + printf("%s :%s\n",msgheader,cudaGetErrorString(err)); + ret = 1; + } + + return ret; +} + }; //end namespace amscuda \ No newline at end of file diff --git a/src/amsculib2/amscu_random.cu b/src/amsculib2/amscu_random.cu index 9e36e80..c7e1942 100644 --- a/src/amsculib2/amscu_random.cu +++ b/src/amsculib2/amscu_random.cu @@ -1,222 +1,222 @@ -#include - -namespace amscuda -{ - -__device__ __host__ float fhash1d_su(float x) -{ - float ret; - ret = x*(x>0.0f) + -x*(x<0.0f); //sign without conditionals? - ret = fmodf(ret,10000.0f); //restrain domain - ret = fmodf(ret*(ret+3678.453f)+7890.453f,10000.0f); - ret = fmodf(ret*(ret+8927.2134f),10000.0f); - ret = fmodf(ret*(ret+3656.234f),10000.0f); - //ret = fmodf(ret*(ret+892.2134f),1000.0f); - //ret = fmodf(ret,1000.0f); - ret = ret/10000.0f; - return ret; -} - - -__device__ __host__ float fhash3d_su(float x, float y=0.0f, float z=0.0f) -{ - float ret = 0.0f; - - ret = fhash1d_su(z); - ret = fhash1d_su(1000.0f*ret*ret + y); - ret = fhash1d_su(1000.0f*ret*ret + x); - - return ret; -} - -__device__ __host__ float fhash4d_su(float x, float y=0.0f, float z=0.0f, float w=0.0f) -{ - float ret = 0.0f; - - ret = fhash1d_su(w); - ret = fhash1d_su(1000.0f*ret*ret + z); - ret = fhash1d_su(1000.0f*ret*ret + y); - ret = fhash1d_su(1000.0f*ret*ret + x); - - return ret; -} - -////////////////////////////////////////////////// -// Deterministic Pseudorandom int32_t Generator // -////////////////////////////////////////////////// - -//Simple 32 bit integer deterministic pseudo-random generator -// *not* for cryptography -// Frequency of generated floats should be uniform [0,1) - -AMSCU_CONST static const int32_t dpr32_mod = 1<<30-1; -AMSCU_CONST static const int32_t dpr32_mult = 25137; - -//Next seed in simple 32 bit integer deterministic psuedo-rand generator -__host__ __device__ void dpr32_nextseed(int32_t *rseed_inout) -{ - int32_t lseed; - if(rseed_inout!=NULL) lseed = *rseed_inout; - - lseed = (lseed*dpr32_mult + 1)%dpr32_mod; - lseed = (lseed>=0)*(lseed)+(lseed<0)*(lseed+dpr32_mod); //ensure mod is positive - - if(rseed_inout!=NULL) *rseed_inout = lseed; - - return; -} - -//Simple 32 bit integer deterministic pseudo-random generator -// *not* for cryptography -// Frequency of generated floats should be uniform [0,1) -__host__ __device__ float dpr32_randf(int32_t *rseed_inout) -{ - int32_t lseed = 1; - float ret = 0.0f; - - if(rseed_inout!=NULL) lseed = *rseed_inout; - - dpr32_nextseed(&lseed); - ret = ((float)(lseed))/((float)dpr32_mod); - - if(rseed_inout!=NULL) *rseed_inout = lseed; - - return ret; -} - -//box muller standard normal variable -__host__ __device__ float dpr32_randnf(int32_t *rseed_inout) -{ - int32_t lseed = 1; - float ret = 0.0f; - float u1,u2; - - if(rseed_inout!=NULL) lseed = *rseed_inout; - - u1 = dpr32_randf(&lseed); - u2 = dpr32_randf(&lseed); - - ret = ::sqrtf(-2.0f*::logf(u1))*::cosf(2.0f*pif*u2); - - if(rseed_inout!=NULL) *rseed_inout = lseed; - - return ret; -} - -////////////////////////////////////////////////// -// Deterministic Pseudorandom int64_t Generator // -////////////////////////////////////////////////// - -//"goodenough" deterministic pseudo-random number generator -//random enough for procedural applications, deterministic, -//operates without side-effects for thread safety - -AMSCU_CONST const int64_t random_dpr64_mod = (2LL<<31LL)-1LL; -AMSCU_CONST const int64_t random_dpr64_mult = 1201633LL; - -__host__ __device__ void dpr64_nextseed(int64_t *seedinout) -{ - int64_t lseed = 0LL; - if(seedinout!=NULL) lseed = *seedinout; - - lseed = (random_dpr64_mult*lseed+1LL)%random_dpr64_mod; - lseed = (lseed>=0)*(lseed)+(lseed<0)*(lseed+random_dpr64_mod); - - if(seedinout!=NULL) *seedinout = lseed; - - return; -} - -__host__ __device__ double dpr64_randd(int64_t *seedinout) -{ - double ret = 0.0; - int64_t lseed = 0LL; - - if(seedinout!=NULL) lseed = *seedinout; - - dpr64_nextseed(&lseed); - ret = ((double)lseed)/((double)(random_dpr64_mod-1LL)); - - if(seedinout!=NULL) *seedinout = lseed; - - return ret; -} - -__host__ __device__ float dpr64_randf(int64_t *seedinout) -{ - float ret = 0.0f; - int64_t lseed = 0LL; - - if(seedinout!=NULL) lseed = *seedinout; - - dpr64_nextseed(&lseed); - ret = ((float)lseed)/((float)(random_dpr64_mod-1LL)); - - if(seedinout!=NULL) *seedinout = lseed; - - return ret; -} - -/////////// -// Tests // -/////////// - -void test_dprg64() -{ - printf("Tests for dprg:\n"); - long I; - int64_t seed = 133LL; - double d; - float f; - cuvect3 qv; - - printf("dpr64_randd test\n"); - seed = 133LL; - for(I=0;I<10;I++) - { - d = dpr64_randd(&seed); - printf("seed: %lld rand: %1.4f\n",(long long)seed,d); - } - - printf("\n\n"); - printf("dpr64_randf test\n"); - seed = 133LL; - for(I=0;I<10;I++) - { - f = dpr64_randf(&seed); - printf("seed: %lld rand: %1.4f\n",(long long)seed,f); - } - - return; -} - -void test_dprg32() -{ - printf("Tests for dprg:\n"); - long I; - int32_t seed = 133; - double d; - float f; - cuvect3 qv; - - printf("dpr32_randf test\n"); - seed = 133; - for(I=0;I<10;I++) - { - f = dpr32_randf(&seed); - printf("seed: %lld rand: %1.4f\n",(long long)seed,f); - } - - printf("\n\ndpr32_randnf test\n"); - seed = 133; - for(I=0;I<10;I++) - { - f = dpr32_randnf(&seed); - printf("seed: %lld rand: %1.4f\n",(long long)seed,f); - } - - return; -} - - +#include + +namespace amscuda +{ + +__device__ __host__ float fhash1d_su(float x) +{ + float ret; + ret = x*(x>0.0f) + -x*(x<0.0f); //sign without conditionals? + ret = fmodf(ret,10000.0f); //restrain domain + ret = fmodf(ret*(ret+3678.453f)+7890.453f,10000.0f); + ret = fmodf(ret*(ret+8927.2134f),10000.0f); + ret = fmodf(ret*(ret+3656.234f),10000.0f); + //ret = fmodf(ret*(ret+892.2134f),1000.0f); + //ret = fmodf(ret,1000.0f); + ret = ret/10000.0f; + return ret; +} + + +__device__ __host__ float fhash3d_su(float x, float y=0.0f, float z=0.0f) +{ + float ret = 0.0f; + + ret = fhash1d_su(z); + ret = fhash1d_su(1000.0f*ret*ret + y); + ret = fhash1d_su(1000.0f*ret*ret + x); + + return ret; +} + +__device__ __host__ float fhash4d_su(float x, float y=0.0f, float z=0.0f, float w=0.0f) +{ + float ret = 0.0f; + + ret = fhash1d_su(w); + ret = fhash1d_su(1000.0f*ret*ret + z); + ret = fhash1d_su(1000.0f*ret*ret + y); + ret = fhash1d_su(1000.0f*ret*ret + x); + + return ret; +} + +////////////////////////////////////////////////// +// Deterministic Pseudorandom int32_t Generator // +////////////////////////////////////////////////// + +//Simple 32 bit integer deterministic pseudo-random generator +// *not* for cryptography +// Frequency of generated floats should be uniform [0,1) + +AMSCU_CONST static const int32_t dpr32_mod = 1<<30-1; +AMSCU_CONST static const int32_t dpr32_mult = 25137; + +//Next seed in simple 32 bit integer deterministic psuedo-rand generator +__host__ __device__ void dpr32_nextseed(int32_t *rseed_inout) +{ + int32_t lseed; + if(rseed_inout!=NULL) lseed = *rseed_inout; + + lseed = (lseed*dpr32_mult + 1)%dpr32_mod; + lseed = (lseed>=0)*(lseed)+(lseed<0)*(lseed+dpr32_mod); //ensure mod is positive + + if(rseed_inout!=NULL) *rseed_inout = lseed; + + return; +} + +//Simple 32 bit integer deterministic pseudo-random generator +// *not* for cryptography +// Frequency of generated floats should be uniform [0,1) +__host__ __device__ float dpr32_randf(int32_t *rseed_inout) +{ + int32_t lseed = 1; + float ret = 0.0f; + + if(rseed_inout!=NULL) lseed = *rseed_inout; + + dpr32_nextseed(&lseed); + ret = ((float)(lseed))/((float)dpr32_mod); + + if(rseed_inout!=NULL) *rseed_inout = lseed; + + return ret; +} + +//box muller standard normal variable +__host__ __device__ float dpr32_randnf(int32_t *rseed_inout) +{ + int32_t lseed = 1; + float ret = 0.0f; + float u1,u2; + + if(rseed_inout!=NULL) lseed = *rseed_inout; + + u1 = dpr32_randf(&lseed); + u2 = dpr32_randf(&lseed); + + ret = ::sqrtf(-2.0f*::logf(u1))*::cosf(2.0f*pif*u2); + + if(rseed_inout!=NULL) *rseed_inout = lseed; + + return ret; +} + +////////////////////////////////////////////////// +// Deterministic Pseudorandom int64_t Generator // +////////////////////////////////////////////////// + +//"goodenough" deterministic pseudo-random number generator +//random enough for procedural applications, deterministic, +//operates without side-effects for thread safety + +AMSCU_CONST const int64_t random_dpr64_mod = (2LL<<31LL)-1LL; +AMSCU_CONST const int64_t random_dpr64_mult = 1201633LL; + +__host__ __device__ void dpr64_nextseed(int64_t *seedinout) +{ + int64_t lseed = 0LL; + if(seedinout!=NULL) lseed = *seedinout; + + lseed = (random_dpr64_mult*lseed+1LL)%random_dpr64_mod; + lseed = (lseed>=0)*(lseed)+(lseed<0)*(lseed+random_dpr64_mod); + + if(seedinout!=NULL) *seedinout = lseed; + + return; +} + +__host__ __device__ double dpr64_randd(int64_t *seedinout) +{ + double ret = 0.0; + int64_t lseed = 0LL; + + if(seedinout!=NULL) lseed = *seedinout; + + dpr64_nextseed(&lseed); + ret = ((double)lseed)/((double)(random_dpr64_mod-1LL)); + + if(seedinout!=NULL) *seedinout = lseed; + + return ret; +} + +__host__ __device__ float dpr64_randf(int64_t *seedinout) +{ + float ret = 0.0f; + int64_t lseed = 0LL; + + if(seedinout!=NULL) lseed = *seedinout; + + dpr64_nextseed(&lseed); + ret = ((float)lseed)/((float)(random_dpr64_mod-1LL)); + + if(seedinout!=NULL) *seedinout = lseed; + + return ret; +} + +/////////// +// Tests // +/////////// + +void test_dprg64() +{ + printf("Tests for dprg:\n"); + long I; + int64_t seed = 133LL; + double d; + float f; + cuvect3 qv; + + printf("dpr64_randd test\n"); + seed = 133LL; + for(I=0;I<10;I++) + { + d = dpr64_randd(&seed); + printf("seed: %lld rand: %1.4f\n",(long long)seed,d); + } + + printf("\n\n"); + printf("dpr64_randf test\n"); + seed = 133LL; + for(I=0;I<10;I++) + { + f = dpr64_randf(&seed); + printf("seed: %lld rand: %1.4f\n",(long long)seed,f); + } + + return; +} + +void test_dprg32() +{ + printf("Tests for dprg:\n"); + long I; + int32_t seed = 133; + double d; + float f; + cuvect3 qv; + + printf("dpr32_randf test\n"); + seed = 133; + for(I=0;I<10;I++) + { + f = dpr32_randf(&seed); + printf("seed: %lld rand: %1.4f\n",(long long)seed,f); + } + + printf("\n\ndpr32_randnf test\n"); + seed = 133; + for(I=0;I<10;I++) + { + f = dpr32_randnf(&seed); + printf("seed: %lld rand: %1.4f\n",(long long)seed,f); + } + + return; +} + + }; //namespace amscuda \ No newline at end of file diff --git a/src/amsculib2/amscuarray.cu b/src/amsculib2/amscuarray.cu index 81b746e..91fc95c 100644 --- a/src/amsculib2/amscuarray.cu +++ b/src/amsculib2/amscuarray.cu @@ -1,63 +1,63 @@ -#include - -namespace amscuda -{ - -__global__ void test_cuarray_sum_kf(cuarray *dq1, float *sum) -{ - int I; - *sum = 0.0f; - for(I=0;Ilength;I++) - { - *sum = *sum + dq1->data[I]; - } - //*sum = (float)dq1->length; - return; -} - -float test_cuarray_sum(cuarray *q1) -{ - float ret = 0.0f; - int res; - cuarray *dq1 = NULL; - float *dsum; - cudaError_t err = cudaSuccess; - - cudaMalloc(&dsum,sizeof(float)); - res = q1->device_send(&dq1); - printf("error: res=%d\n",res); - test_cuarray_sum_kf<<<1,1>>>(dq1,dsum); - cudaDeviceSynchronize(); - - err = cudaGetLastError(); - if(err!=cudaSuccess) - { - printf("test_cuarray_sum Error: %s\n",cudaGetErrorString(err)); - } - - cudaMemcpy(&ret,dsum,sizeof(float),cudaMemcpyDeviceToHost); - - q1->device_free(&dq1); - cudaFree(dsum); dsum = NULL; - - return ret; -} - -void test_cuarray() -{ - cuarray q1; - - int I; - - q1.resize(100); - for(I=0;I + +namespace amscuda +{ + +__global__ void test_cuarray_sum_kf(cuarray *dq1, float *sum) +{ + int I; + *sum = 0.0f; + for(I=0;Ilength;I++) + { + *sum = *sum + dq1->data[I]; + } + //*sum = (float)dq1->length; + return; +} + +float test_cuarray_sum(cuarray *q1) +{ + float ret = 0.0f; + int res; + cuarray *dq1 = NULL; + float *dsum; + cudaError_t err = cudaSuccess; + + cudaMalloc(&dsum,sizeof(float)); + res = q1->device_send(&dq1); + printf("error: res=%d\n",res); + test_cuarray_sum_kf<<<1,1>>>(dq1,dsum); + cudaDeviceSynchronize(); + + err = cudaGetLastError(); + if(err!=cudaSuccess) + { + printf("test_cuarray_sum Error: %s\n",cudaGetErrorString(err)); + } + + cudaMemcpy(&ret,dsum,sizeof(float),cudaMemcpyDeviceToHost); + + q1->device_free(&dq1); + cudaFree(dsum); dsum = NULL; + + return ret; +} + +void test_cuarray() +{ + cuarray q1; + + int I; + + q1.resize(100); + for(I=0;I - -namespace amscuda -{ - -//template instantiations -template float dbuff_sum(float *devbuffer, int N); -template void dbuff_minmax(float *devbuffer, int N, float *min, float *max); - - -template void dbuff_setall(float *devbuffer, int N, float setto, int nblocks, int nthreads); - - - - -//fill devbuffer with random uniform numbers between 0 and 1 using int32_t based generator -__global__ void dbuff_rand_dpr32_kf(float *devbuffer, int N, int32_t *seeds) -{ - int I0 = threadIdx.x + blockIdx.x*blockDim.x; - int Is = blockDim.x*gridDim.x; - int I; - - int32_t lseed; - float f; - - lseed = seeds[I0]; - for(I=I0;I>>(devbuffer,N,devseeds); - cudaDeviceSynchronize(); - err = cudaGetLastError(); - if(err!=cudaSuccess) - { - printf("amscu::dbuff_rand_dpr32 error: %s\n",cudaGetErrorString(err)); - } - - cudaFree(devseeds); devseeds = NULL; - delete[] seeds; seeds = NULL; - - return; -} - -__global__ void dbuff_rand_dpr32n_kf(float *devbuffer, int N, int32_t *seeds) -{ - int I0 = threadIdx.x + blockIdx.x*blockDim.x; - int Is = blockDim.x*gridDim.x; - int I; - - int32_t lseed; - float f; - - lseed = seeds[I0]; - for(I=I0;I>>(devbuffer,N,devseeds); - cudaDeviceSynchronize(); - err = cudaGetLastError(); - if(err!=cudaSuccess) - { - printf("amscu::dbuff_rand_dpr32 error: %s\n",cudaGetErrorString(err)); - } - - cudaFree(devseeds); devseeds = NULL; - delete[] seeds; seeds = NULL; - - return; -} - - - -void dbuff_rand_dpr64(float *devbuffer, int N, int64_t *rseedinout, int nblocks, int nthreads) -{ - int I; - - return; -} - - - -/////////// -// Tests // -/////////// - -void test_dbuff_rand_dpr32() -{ - cuarray data; - float *dev_data = NULL; - int Nx = 5000; - int Ny = 5000; - cuarray dims; - int32_t rseed = 15; - FILE *fp = NULL; - const char *fname = "./test_scripts/test_dbuff_rand_dpr32.bin"; - - clock_t t0,t1,t2; - double dt0,dt1; - - printf("Tests of dbuff_rand_dpr32...\n"); - - fp = fopen(fname,"w+"); - if(fp==NULL) - { - printf("Error: Could not open %s for writing.\n",fname); - return; - } - - data.resize(Nx*Ny); - dims.resize(2); - dims[0] = Nx; dims[1] = Ny; - cudaMalloc(&dev_data,Nx*Ny*sizeof(float)); - - t0 = clock(); - dbuff_rand_dpr32(dev_data,Nx*Ny,&rseed,256,512); - t1 = clock(); - cudaMemcpy(data.data,dev_data,Nx*Ny*sizeof(float),cudaMemcpyDeviceToHost); - t2 = clock(); - - dt0 = (double)(t1-t0)/(double)CLOCKS_PER_SEC*1000.0; - dt1 = (double)(t2-t1)/(double)CLOCKS_PER_SEC*1000.0; - printf("dbuff_rand_dpr32 exec time: %1.3f msec\n",dt0); - printf("copy (%d,%d) to host time: %1.3f msec\n",Nx,Ny,dt1); - - fwrite_ndarray(fp,&dims,&data); - - t0 = clock(); - dbuff_rand_dpr32n(dev_data,Nx*Ny,&rseed,256,512); - t1 = clock(); - cudaMemcpy(data.data,dev_data,Nx*Ny*sizeof(float),cudaMemcpyDeviceToHost); - t2 = clock(); - - dt0 = (double)(t1-t0)/(double)CLOCKS_PER_SEC*1000.0; - dt1 = (double)(t2-t1)/(double)CLOCKS_PER_SEC*1000.0; - printf("dbuff_rand_dpr32n exec time: %1.3f msec\n",dt0); - printf("copy (%d,%d) to host time: %1.3f msec\n",Nx,Ny,dt1); - - fwrite_ndarray(fp,&dims,&data); - - fclose(fp); - - cudaFree(dev_data); dev_data = NULL; - -} - -}; - +#include + +namespace amscuda +{ + +//template instantiations +template float dbuff_sum(float *devbuffer, int N); +template void dbuff_minmax(float *devbuffer, int N, float *min, float *max); + + +template void dbuff_setall(float *devbuffer, int N, float setto, int nblocks, int nthreads); + + + + +//fill devbuffer with random uniform numbers between 0 and 1 using int32_t based generator +__global__ void dbuff_rand_dpr32_kf(float *devbuffer, int N, int32_t *seeds) +{ + int I0 = threadIdx.x + blockIdx.x*blockDim.x; + int Is = blockDim.x*gridDim.x; + int I; + + int32_t lseed; + float f; + + lseed = seeds[I0]; + for(I=I0;I>>(devbuffer,N,devseeds); + cudaDeviceSynchronize(); + err = cudaGetLastError(); + if(err!=cudaSuccess) + { + printf("amscu::dbuff_rand_dpr32 error: %s\n",cudaGetErrorString(err)); + } + + cudaFree(devseeds); devseeds = NULL; + delete[] seeds; seeds = NULL; + + return; +} + +__global__ void dbuff_rand_dpr32n_kf(float *devbuffer, int N, int32_t *seeds) +{ + int I0 = threadIdx.x + blockIdx.x*blockDim.x; + int Is = blockDim.x*gridDim.x; + int I; + + int32_t lseed; + float f; + + lseed = seeds[I0]; + for(I=I0;I>>(devbuffer,N,devseeds); + cudaDeviceSynchronize(); + err = cudaGetLastError(); + if(err!=cudaSuccess) + { + printf("amscu::dbuff_rand_dpr32 error: %s\n",cudaGetErrorString(err)); + } + + cudaFree(devseeds); devseeds = NULL; + delete[] seeds; seeds = NULL; + + return; +} + + + +void dbuff_rand_dpr64(float *devbuffer, int N, int64_t *rseedinout, int nblocks, int nthreads) +{ + int I; + + return; +} + + + +/////////// +// Tests // +/////////// + +void test_dbuff_rand_dpr32() +{ + cuarray data; + float *dev_data = NULL; + int Nx = 5000; + int Ny = 5000; + cuarray dims; + int32_t rseed = 15; + FILE *fp = NULL; + const char *fname = "./test_scripts/test_dbuff_rand_dpr32.bin"; + + clock_t t0,t1,t2; + double dt0,dt1; + + printf("Tests of dbuff_rand_dpr32...\n"); + + fp = fopen(fname,"w+"); + if(fp==NULL) + { + printf("Error: Could not open %s for writing.\n",fname); + return; + } + + data.resize(Nx*Ny); + dims.resize(2); + dims[0] = Nx; dims[1] = Ny; + cudaMalloc(&dev_data,Nx*Ny*sizeof(float)); + + t0 = clock(); + dbuff_rand_dpr32(dev_data,Nx*Ny,&rseed,256,512); + t1 = clock(); + cudaMemcpy(data.data,dev_data,Nx*Ny*sizeof(float),cudaMemcpyDeviceToHost); + t2 = clock(); + + dt0 = (double)(t1-t0)/(double)CLOCKS_PER_SEC*1000.0; + dt1 = (double)(t2-t1)/(double)CLOCKS_PER_SEC*1000.0; + printf("dbuff_rand_dpr32 exec time: %1.3f msec\n",dt0); + printf("copy (%d,%d) to host time: %1.3f msec\n",Nx,Ny,dt1); + + fwrite_ndarray(fp,&dims,&data); + + t0 = clock(); + dbuff_rand_dpr32n(dev_data,Nx*Ny,&rseed,256,512); + t1 = clock(); + cudaMemcpy(data.data,dev_data,Nx*Ny*sizeof(float),cudaMemcpyDeviceToHost); + t2 = clock(); + + dt0 = (double)(t1-t0)/(double)CLOCKS_PER_SEC*1000.0; + dt1 = (double)(t2-t1)/(double)CLOCKS_PER_SEC*1000.0; + printf("dbuff_rand_dpr32n exec time: %1.3f msec\n",dt0); + printf("copy (%d,%d) to host time: %1.3f msec\n",Nx,Ny,dt1); + + fwrite_ndarray(fp,&dims,&data); + + fclose(fp); + + cudaFree(dev_data); dev_data = NULL; + +} + +}; + diff --git a/src/amsculib2/amscugeom.cu b/src/amsculib2/amscugeom.cu index f2d5b97..3ffc878 100644 --- a/src/amsculib2/amscugeom.cu +++ b/src/amsculib2/amscugeom.cu @@ -1,6 +1,6 @@ -#include - -namespace amscuda -{ - +#include + +namespace amscuda +{ + }; \ No newline at end of file diff --git a/src/amsculib2/amsculib2.cu b/src/amsculib2/amsculib2.cu index f2d5b97..3ffc878 100644 --- a/src/amsculib2/amsculib2.cu +++ b/src/amsculib2/amsculib2.cu @@ -1,6 +1,6 @@ -#include - -namespace amscuda -{ - +#include + +namespace amscuda +{ + }; \ No newline at end of file diff --git a/src/amsculib2/amscumath.cu b/src/amsculib2/amscumath.cu index 154bbca..e082fff 100644 --- a/src/amsculib2/amscumath.cu +++ b/src/amsculib2/amscumath.cu @@ -1,269 +1,269 @@ -#include - -namespace amscuda -{ - - __host__ __device__ double dabs(double x) - { - if(x<0.0) - { - x = -x; - } - return x; - } - - __host__ __device__ float fabs(float x) - { - if(x<0.0f) - { - x = -x; - } - return x; - } - - __host__ __device__ double mod(double x, double md) - { - x = fmod(x,md); - if(x<0.0) - { - x = x + md; - } - return x; - } - - __host__ __device__ float mod(float x, float md) - { - x = fmodf(x,md); - if(x<0.0f) - { - x = x + md; - } - return x; - } - - __host__ __device__ int mod(int x, int n) - { - x = x % n; - if(x<0) - { - x = x + n; - } - return x; - } - - __host__ __device__ long mod(long x, long n) - { - x = x % n; - if(x<0) - { - x = x + n; - } - return x; - } - - __host__ __device__ int truediv(int x, int y) - { - int z = 0; - if(x>=0 && y>0) - { - z = x/y; - } - else if(x<0 && y>0) - { - z = -((-x)/y) - 1; - } - else if(x>=0 && y<0) - { - z = -(x/(-y)) - 1; - } - else if(x<0 && y<0) - { - z = ((-x)/(-y)); - } - - return z; - } - - __host__ __device__ long truediv(long x, long y) - { - int z = 0; - if(x>=0 && y>0) - { - z = x/y; - } - else if(x<0 && y>0) - { - z = -((-x)/y) - 1; - } - else if(x>=0 && y<0) - { - z = -(x/(-y)) - 1; - } - else if(x<0 && y<0) - { - z = ((-x)/(-y)); - } - - return z; - } - - - template<> __host__ __device__ double min(double a, double b) - { - if(isnan(a)) - { - return b; - } - else if(isnan(b)) - { - return a; - } - else if(a>b) - { - return b; - } - else - { - return a; - } - } - - template<> __host__ __device__ float min(float a, float b) - { - if(isnan(a)) - { - return b; - } - else if(isnan(b)) - { - return a; - } - else if(a>b) - { - return b; - } - else - { - return a; - } - } - - template<> __host__ __device__ double max(double a, double b) - { - if(isnan(a)) - { - return b; - } - else if(isnan(b)) - { - return a; - } - else if(a>b) - { - return a; - } - else - { - return b; - } - } - - template<> __host__ __device__ float max(float a, float b) - { - if(isnan(a)) - { - return b; - } - else if(isnan(b)) - { - return a; - } - else if(a>b) - { - return a; - } - else - { - return b; - } - } - - __device__ __host__ double arg(double x, double y) - { - double ret = 0.0; - double z = ::sqrt(x*x+y*y); - - if(z>0.0) - { - if(y<=x && y>=-x) - { - ret = asin(y/z); - } - else if(y>=x && y>=-x) - { - ret = acos(x/z); - } - else if(y>=x && y<=-x) - { - ret = pi-asin(y/z); - } - else - { - ret = 2.0*pi-acos(x/z); - } - } - - if(ret<0.0) - { - ret = 2.0*pi+ret; - } - - return ret; - } - - __device__ __host__ void get_azel(double x, double y, double z, double *az, double *el) - { - //int ret = -2; //should never see this return - double n, rp; - n = ::sqrt(x*x+y*y+z*z); - if(n>0.0) - { - rp = ::sqrt(x*x+y*y); - if(rp>0.0) - { - //ret = 1; //nonzero vector - should work - *az = arg(x,y); - *el = ::atan(z/rp); - } - else - { - //ret = 0; //straight up or straight down - if(z>0.0) - { - *az = 0.0; - *el = pi/2.0; - } - else - { - *az = 0.0; - *el = -pi/2.0; - } - } - } - else - { - *az = 0.0; - *el = 0.0; - //ret = -1; //zero vector - no real azimuth/elevation - } - - return; - } - - - void test_amscumath1() - { - printf("pi = %1.16f\n",amscuda::pi); - } - +#include + +namespace amscuda +{ + + __host__ __device__ double dabs(double x) + { + if(x<0.0) + { + x = -x; + } + return x; + } + + __host__ __device__ float fabs(float x) + { + if(x<0.0f) + { + x = -x; + } + return x; + } + + __host__ __device__ double mod(double x, double md) + { + x = fmod(x,md); + if(x<0.0) + { + x = x + md; + } + return x; + } + + __host__ __device__ float mod(float x, float md) + { + x = fmodf(x,md); + if(x<0.0f) + { + x = x + md; + } + return x; + } + + __host__ __device__ int mod(int x, int n) + { + x = x % n; + if(x<0) + { + x = x + n; + } + return x; + } + + __host__ __device__ long mod(long x, long n) + { + x = x % n; + if(x<0) + { + x = x + n; + } + return x; + } + + __host__ __device__ int truediv(int x, int y) + { + int z = 0; + if(x>=0 && y>0) + { + z = x/y; + } + else if(x<0 && y>0) + { + z = -((-x)/y) - 1; + } + else if(x>=0 && y<0) + { + z = -(x/(-y)) - 1; + } + else if(x<0 && y<0) + { + z = ((-x)/(-y)); + } + + return z; + } + + __host__ __device__ long truediv(long x, long y) + { + int z = 0; + if(x>=0 && y>0) + { + z = x/y; + } + else if(x<0 && y>0) + { + z = -((-x)/y) - 1; + } + else if(x>=0 && y<0) + { + z = -(x/(-y)) - 1; + } + else if(x<0 && y<0) + { + z = ((-x)/(-y)); + } + + return z; + } + + + template<> __host__ __device__ double min(double a, double b) + { + if(isnan(a)) + { + return b; + } + else if(isnan(b)) + { + return a; + } + else if(a>b) + { + return b; + } + else + { + return a; + } + } + + template<> __host__ __device__ float min(float a, float b) + { + if(isnan(a)) + { + return b; + } + else if(isnan(b)) + { + return a; + } + else if(a>b) + { + return b; + } + else + { + return a; + } + } + + template<> __host__ __device__ double max(double a, double b) + { + if(isnan(a)) + { + return b; + } + else if(isnan(b)) + { + return a; + } + else if(a>b) + { + return a; + } + else + { + return b; + } + } + + template<> __host__ __device__ float max(float a, float b) + { + if(isnan(a)) + { + return b; + } + else if(isnan(b)) + { + return a; + } + else if(a>b) + { + return a; + } + else + { + return b; + } + } + + __device__ __host__ double arg(double x, double y) + { + double ret = 0.0; + double z = ::sqrt(x*x+y*y); + + if(z>0.0) + { + if(y<=x && y>=-x) + { + ret = asin(y/z); + } + else if(y>=x && y>=-x) + { + ret = acos(x/z); + } + else if(y>=x && y<=-x) + { + ret = pi-asin(y/z); + } + else + { + ret = 2.0*pi-acos(x/z); + } + } + + if(ret<0.0) + { + ret = 2.0*pi+ret; + } + + return ret; + } + + __device__ __host__ void get_azel(double x, double y, double z, double *az, double *el) + { + //int ret = -2; //should never see this return + double n, rp; + n = ::sqrt(x*x+y*y+z*z); + if(n>0.0) + { + rp = ::sqrt(x*x+y*y); + if(rp>0.0) + { + //ret = 1; //nonzero vector - should work + *az = arg(x,y); + *el = ::atan(z/rp); + } + else + { + //ret = 0; //straight up or straight down + if(z>0.0) + { + *az = 0.0; + *el = pi/2.0; + } + else + { + *az = 0.0; + *el = -pi/2.0; + } + } + } + else + { + *az = 0.0; + *el = 0.0; + //ret = -1; //zero vector - no real azimuth/elevation + } + + return; + } + + + void test_amscumath1() + { + printf("pi = %1.16f\n",amscuda::pi); + } + }; \ No newline at end of file diff --git a/src/amsculib2/amscurarray.cu b/src/amsculib2/amscurarray.cu index 8b764ff..e2b0e17 100644 --- a/src/amsculib2/amscurarray.cu +++ b/src/amsculib2/amscurarray.cu @@ -1,85 +1,85 @@ -#include - -namespace amscuda -{ - -__global__ void test_amscurarray1_kf1(curarray *q) -{ - int I,J,Na; - int I0 = threadIdx.x + blockIdx.x*blockDim.x; - int Is = blockDim.x*gridDim.x; - int N = q->Narrays; - - for(I=I0; IN[I]; - - //printf("I:%d Na: %d\n",I,Na); - //q->dev_resizearray(I, Na); - for(J=0;Jdevarrayptrs[I][J] = J; - } - } -} - -void test_amscurarray1() -{ - int I; - cudaError_t err = cudaSuccess; - - printf("test_amscurarray1:\n"); - curarray *qarray = NULL; - - curarray_new(&qarray,100); - err = cudaGetLastError(); - if(err!=cudaSuccess) - { - printf("debug error trap 1: %s\n",cudaGetErrorString(err)); - } - - for(I=0;I<100;I++) - { - qarray->resizearray(I,5); - } - - qarray->push(); - qarray->pull(); - - cuda_errortrap("debug: error trap 2"); - - for(I=0;I<5;I++) - { - printf("array[%d], size %d\n",I,qarray->N[I]); - } - - // - for(I=0;I<100;I++) - { - qarray->resizearray(I,I%5); - cuda_errortrap("debug: error trap resize2"); - } - - qarray->push(); - qarray->pull(); - test_amscurarray1_kf1<<<128,1>>>(qarray->devptr); - - cuda_errortrap("debug: error trap kf1"); - - qarray->pull(); - cuda_errortrap("debug: error trap pull2"); - - - for(I=0;I<5;I++) - { - printf("array[%d], size %d\n",I,qarray->N[I]); - } - - curarray_delete(&qarray); - - cuda_errortrap("debug: error trap 3"); - - return; -} - +#include + +namespace amscuda +{ + +__global__ void test_amscurarray1_kf1(curarray *q) +{ + int I,J,Na; + int I0 = threadIdx.x + blockIdx.x*blockDim.x; + int Is = blockDim.x*gridDim.x; + int N = q->Narrays; + + for(I=I0; IN[I]; + + //printf("I:%d Na: %d\n",I,Na); + //q->dev_resizearray(I, Na); + for(J=0;Jdevarrayptrs[I][J] = J; + } + } +} + +void test_amscurarray1() +{ + int I; + cudaError_t err = cudaSuccess; + + printf("test_amscurarray1:\n"); + curarray *qarray = NULL; + + curarray_new(&qarray,100); + err = cudaGetLastError(); + if(err!=cudaSuccess) + { + printf("debug error trap 1: %s\n",cudaGetErrorString(err)); + } + + for(I=0;I<100;I++) + { + qarray->resizearray(I,5); + } + + qarray->push(); + qarray->pull(); + + cuda_errortrap("debug: error trap 2"); + + for(I=0;I<5;I++) + { + printf("array[%d], size %d\n",I,qarray->N[I]); + } + + // + for(I=0;I<100;I++) + { + qarray->resizearray(I,I%5); + cuda_errortrap("debug: error trap resize2"); + } + + qarray->push(); + qarray->pull(); + test_amscurarray1_kf1<<<128,1>>>(qarray->devptr); + + cuda_errortrap("debug: error trap kf1"); + + qarray->pull(); + cuda_errortrap("debug: error trap pull2"); + + + for(I=0;I<5;I++) + { + printf("array[%d], size %d\n",I,qarray->N[I]); + } + + curarray_delete(&qarray); + + cuda_errortrap("debug: error trap 3"); + + return; +} + }; \ No newline at end of file diff --git a/src/amsculib2/cuvect2.cu b/src/amsculib2/cuvect2.cu index 246eff8..581318a 100644 --- a/src/amsculib2/cuvect2.cu +++ b/src/amsculib2/cuvect2.cu @@ -1,361 +1,361 @@ -#include - -namespace amscuda -{ - - __host__ __device__ cuvect2::cuvect2() - { - x = 0.0; y = 0.0; - return; - } - - __host__ __device__ cuvect2::~cuvect2() - { - x = 0.0; y = 0.0; - return; - } - - __host__ __device__ cuvect2::cuvect2(double _x, double _y) - { - x = _x; y = _y; - return; - } - - __host__ __device__ double& cuvect2::operator[](const int I) - { - if(I==0) return x; - if(I==1) return y; - return x; - } - - __host__ __device__ const double& cuvect2::operator[](const int I) const - { - if(I==0) return x; - if(I==1) return y; - return x; - } - - __host__ __device__ cuvect2 cuvect2::operator+(cuvect2 lhs) - { - cuvect2 ret; - ret.x = x + lhs.x; - ret.y = y + lhs.y; - return ret; - } - - __host__ __device__ cuvect2 cuvect2::operator-(cuvect2 lhs) - { - cuvect2 ret; - ret.x = x - lhs.x; - ret.y = y - lhs.y; - return ret; - } - __host__ __device__ cuvect2 cuvect2::operator*(double lhs) - { - cuvect2 ret; - ret.x = x*lhs; - ret.y = y*lhs; - return ret; - } - - __host__ __device__ cuvect2 cuvect2::operator/(double lhs) - { - cuvect2 ret; - ret.x = x/lhs; - ret.y = y/lhs; - return ret; - } - - __host__ __device__ double cuvect2_dot(cuvect2 a, cuvect2 b) - { - double ret = a.x*b.x+a.y*b.y; - return ret; - } - - __host__ __device__ double cuvect2_cross(cuvect2 a, cuvect2 b) - { - double ret = a.x*b.y-a.y*b.x; - return ret; - } - - __host__ __device__ double cuvect2_norm(cuvect2 a) - { - double ret = ::sqrt(a.x*a.x+a.y*a.y); - return ret; - } - - __host__ __device__ cuvect2 cuvect2_normalize(cuvect2 a) - { - cuvect2 ret; - double m = cuvect2_norm(a); - if(m>0.0) - { - ret.x = a.x/m; ret.y = a.y/m; - } - else - { - ret.x = 0.0; ret.y = 0.0; - } - return ret; - } - - __host__ __device__ cuvect2 cuvect2_proj(cuvect2 a, cuvect2 b) - { - cuvect2 ret; - cuvect2 bn = cuvect2_normalize(b); - double m = cuvect2_dot(a,bn); - ret = bn*m; - return ret; - } - - -__host__ __device__ cumat2::cumat2() -{ - int I; - for(I=0;I<4;I++) - { - dat[I] = 0.0; - } - return; -} - -__host__ __device__ cumat2::~cumat2() -{ - int I; - for(I=0;I<4;I++) - { - dat[I] = 0.0; - } - return; -} - -__host__ __device__ double& cumat2::operator[](const int I) -{ - return dat[I]; -} - -__host__ __device__ double& cumat2::operator()(const int I, const int J) -{ - return dat[I+2*J]; -} - -__host__ __device__ cumat2 cumat2::operator+(cumat2 lhs) -{ - int I; - cumat2 ret; - for(I=0;I<4;I++) - { - ret.dat[I] = this->dat[I] + lhs.dat[I]; - } - return ret; -} - -__host__ __device__ cumat2 cumat2::operator-(cumat2 lhs) -{ - int I; - cumat2 ret; - for(I=0;I<4;I++) - { - ret.dat[I] = this->dat[I] - lhs.dat[I]; - } - return ret; -} - -__host__ __device__ cumat2 cumat2::operator*(double lhs) -{ - cumat2 ret; - int I; - for(I=0;I<4;I++) - { - ret.dat[I] = this->dat[I]*lhs; - } - return ret; -} - -__host__ __device__ cumat2 cumat2::operator/(double lhs) -{ - cumat2 ret; - int I; - for(I=0;I<4;I++) - { - ret.dat[I] = this->dat[I]/lhs; - } - return ret; -} - -__host__ __device__ double& cumat2::at(const int I, const int J) -{ - return dat[I+2*J]; -} - - -__host__ __device__ cuvect2 cumat2::operator*(cuvect2 lhs) -{ - cuvect2 ret = cuvect2(0.0,0.0); - int I,J; - for(I=0;I<2;I++) - { - for(J=0;J<2;J++) - { - ret[I] = ret[I] + this->at(I,J)*lhs[J]; - } - } - return ret; -} - -__host__ __device__ cumat2 cumat2::operator*(cumat2 lhs) -{ - cumat2 ret; - int I,J,K; - - for(I=0;I<2;I++) - { - for(J=0;J<2;J++) - { - ret(I,J) = 0.0; - for(K=0;K<2;K++) - { - ret(I,J) = ret(I,J) + this->at(I,K)*lhs(K,J); - } - } - } - - return ret; -} - -__host__ __device__ double cumat2::det() -{ - double ret = 0.0; - ret = ret + this->at(0,0)*this->at(1,1); - ret = ret - this->at(1,0)*this->at(0,1); - return ret; -} - -__host__ __device__ cumat2 cumat2::transpose() -{ - cumat2 q; - int I,J; - for(I=0;I<2;I++) - { - for(J=0;J<2;J++) - { - q.at(I,J) = this->at(J,I); - } - } - return q; -} - -__host__ __device__ cumat2 cumat2::inverse() -{ - cumat2 q; - double dt = q.det(); - if(dt!=0.0) - { - q(0,0) = this->at(1,1)/dt; - q(0,1) = -this->at(0,1)/dt; - q(1,0) = -this->at(1,0)/dt; - q(1,1) = this->at(0,0)/dt; - } - else - { - q(0,0) = inf; - q(0,1) = inf; - q(1,0) = inf; - q(1,1) = inf; - } - - return q; -} - -//2x2 matrix operations -//matrix order is assumed to be mat[I,J] = mat[I+3*J] - -//transpose a 2x2 matrix in place -__host__ __device__ void mat2_transpose(double *mat2inout) -{ - double mat2_in[4]; - - mat2_copy(mat2_in,mat2inout); - mat2inout[0+0*2] = mat2_in[0+0*2]; - mat2inout[1+0*2] = mat2_in[0+1*2]; - mat2inout[0+1*2] = mat2_in[1+0*2]; - mat2inout[1+1*2] = mat2_in[1+1*2]; - - return; -} - -//copies src to dest -__host__ __device__ void mat2_copy(double *mat2_dest, const double *mat2_src) -{ - mat2_dest[0+0*2] = mat2_src[0+0*2]; - mat2_dest[1+0*2] = mat2_src[1+0*2]; - mat2_dest[0+1*2] = mat2_src[0+1*2]; - mat2_dest[1+1*2] = mat2_src[1+1*2]; - - return; -} - -//inverts mat?inout[4] -__host__ __device__ void mat2_inverse(double *mat2inout) -{ - double det = mat2inout[0+0*2]*mat2inout[1+1*2]-mat2inout[0+1*2]*mat2inout[1+0*2]; - double mat2in[4]; - - mat2_copy(mat2in,mat2inout); - mat2inout[0+0*2] = mat2inout[1+1*2]/det; - mat2inout[1+0*2] = -mat2inout[1+0*2]/det; - mat2inout[0+1*2] = -mat2inout[0+1*2]/det; - mat2inout[1+1*2] = mat2inout[0+0*2]/det; - - return; -} - -//rotatin matrix from angle -__host__ __device__ void mat2_rot_from_angle(double angle, double *mat2) -{ - mat2[0+0*2] = ::cos(angle); - mat2[1+0*2] = ::sin(angle); - mat2[0+1*2] = -::sin(angle); - mat2[1+1*2] = ::cos(angle); - return; -} - -//multiplies c = a*b -__host__ __device__ void mat2_mult(double *mat2a, double *mat2b, double *mat2c) -{ - double mat2a_in[4]; - double mat2b_in[4]; - - if(mat2a==NULL || mat2b==NULL || mat2c==NULL) - { - return; - } - - mat2_copy(mat2a_in,mat2a); - mat2_copy(mat2b_in,mat2b); - - mat2c[0+0*2] = mat2a_in[0+0*2]*mat2b_in[0+0*2] + mat2a_in[1+0*2]*mat2b_in[0+1*2]; - mat2c[1+0*2] = mat2a_in[0+0*2]*mat2b_in[1+0*2] + mat2a_in[1+0*2]*mat2b_in[1+1*2]; - mat2c[0+1*2] = mat2a_in[0+1*2]*mat2b_in[0+0*2] + mat2a_in[1+1*2]*mat2b_in[0+1*2]; - mat2c[1+1*2] = mat2a_in[0+1*2]*mat2b_in[1+0*2] + mat2a_in[1+1*2]*mat2b_in[1+1*2]; - - return; -} - -// ret = a*b -__host__ __device__ cuvect2 mat2_mult(double *mat2a, cuvect2 b) -{ - cuvect2 ret; - ret.x = b.x*mat2a[0+0*2] + b.y*mat2a[1+0*2]; - ret.y = b.x*mat2a[0+1*2] + b.y*mat2a[1+1*2]; - return ret; -} - -void test_cuvect2_1() -{ - - - return; -} - +#include + +namespace amscuda +{ + + __host__ __device__ cuvect2::cuvect2() + { + x = 0.0; y = 0.0; + return; + } + + __host__ __device__ cuvect2::~cuvect2() + { + x = 0.0; y = 0.0; + return; + } + + __host__ __device__ cuvect2::cuvect2(double _x, double _y) + { + x = _x; y = _y; + return; + } + + __host__ __device__ double& cuvect2::operator[](const int I) + { + if(I==0) return x; + if(I==1) return y; + return x; + } + + __host__ __device__ const double& cuvect2::operator[](const int I) const + { + if(I==0) return x; + if(I==1) return y; + return x; + } + + __host__ __device__ cuvect2 cuvect2::operator+(cuvect2 lhs) + { + cuvect2 ret; + ret.x = x + lhs.x; + ret.y = y + lhs.y; + return ret; + } + + __host__ __device__ cuvect2 cuvect2::operator-(cuvect2 lhs) + { + cuvect2 ret; + ret.x = x - lhs.x; + ret.y = y - lhs.y; + return ret; + } + __host__ __device__ cuvect2 cuvect2::operator*(double lhs) + { + cuvect2 ret; + ret.x = x*lhs; + ret.y = y*lhs; + return ret; + } + + __host__ __device__ cuvect2 cuvect2::operator/(double lhs) + { + cuvect2 ret; + ret.x = x/lhs; + ret.y = y/lhs; + return ret; + } + + __host__ __device__ double cuvect2_dot(cuvect2 a, cuvect2 b) + { + double ret = a.x*b.x+a.y*b.y; + return ret; + } + + __host__ __device__ double cuvect2_cross(cuvect2 a, cuvect2 b) + { + double ret = a.x*b.y-a.y*b.x; + return ret; + } + + __host__ __device__ double cuvect2_norm(cuvect2 a) + { + double ret = ::sqrt(a.x*a.x+a.y*a.y); + return ret; + } + + __host__ __device__ cuvect2 cuvect2_normalize(cuvect2 a) + { + cuvect2 ret; + double m = cuvect2_norm(a); + if(m>0.0) + { + ret.x = a.x/m; ret.y = a.y/m; + } + else + { + ret.x = 0.0; ret.y = 0.0; + } + return ret; + } + + __host__ __device__ cuvect2 cuvect2_proj(cuvect2 a, cuvect2 b) + { + cuvect2 ret; + cuvect2 bn = cuvect2_normalize(b); + double m = cuvect2_dot(a,bn); + ret = bn*m; + return ret; + } + + +__host__ __device__ cumat2::cumat2() +{ + int I; + for(I=0;I<4;I++) + { + dat[I] = 0.0; + } + return; +} + +__host__ __device__ cumat2::~cumat2() +{ + int I; + for(I=0;I<4;I++) + { + dat[I] = 0.0; + } + return; +} + +__host__ __device__ double& cumat2::operator[](const int I) +{ + return dat[I]; +} + +__host__ __device__ double& cumat2::operator()(const int I, const int J) +{ + return dat[I+2*J]; +} + +__host__ __device__ cumat2 cumat2::operator+(cumat2 lhs) +{ + int I; + cumat2 ret; + for(I=0;I<4;I++) + { + ret.dat[I] = this->dat[I] + lhs.dat[I]; + } + return ret; +} + +__host__ __device__ cumat2 cumat2::operator-(cumat2 lhs) +{ + int I; + cumat2 ret; + for(I=0;I<4;I++) + { + ret.dat[I] = this->dat[I] - lhs.dat[I]; + } + return ret; +} + +__host__ __device__ cumat2 cumat2::operator*(double lhs) +{ + cumat2 ret; + int I; + for(I=0;I<4;I++) + { + ret.dat[I] = this->dat[I]*lhs; + } + return ret; +} + +__host__ __device__ cumat2 cumat2::operator/(double lhs) +{ + cumat2 ret; + int I; + for(I=0;I<4;I++) + { + ret.dat[I] = this->dat[I]/lhs; + } + return ret; +} + +__host__ __device__ double& cumat2::at(const int I, const int J) +{ + return dat[I+2*J]; +} + + +__host__ __device__ cuvect2 cumat2::operator*(cuvect2 lhs) +{ + cuvect2 ret = cuvect2(0.0,0.0); + int I,J; + for(I=0;I<2;I++) + { + for(J=0;J<2;J++) + { + ret[I] = ret[I] + this->at(I,J)*lhs[J]; + } + } + return ret; +} + +__host__ __device__ cumat2 cumat2::operator*(cumat2 lhs) +{ + cumat2 ret; + int I,J,K; + + for(I=0;I<2;I++) + { + for(J=0;J<2;J++) + { + ret(I,J) = 0.0; + for(K=0;K<2;K++) + { + ret(I,J) = ret(I,J) + this->at(I,K)*lhs(K,J); + } + } + } + + return ret; +} + +__host__ __device__ double cumat2::det() +{ + double ret = 0.0; + ret = ret + this->at(0,0)*this->at(1,1); + ret = ret - this->at(1,0)*this->at(0,1); + return ret; +} + +__host__ __device__ cumat2 cumat2::transpose() +{ + cumat2 q; + int I,J; + for(I=0;I<2;I++) + { + for(J=0;J<2;J++) + { + q.at(I,J) = this->at(J,I); + } + } + return q; +} + +__host__ __device__ cumat2 cumat2::inverse() +{ + cumat2 q; + double dt = q.det(); + if(dt!=0.0) + { + q(0,0) = this->at(1,1)/dt; + q(0,1) = -this->at(0,1)/dt; + q(1,0) = -this->at(1,0)/dt; + q(1,1) = this->at(0,0)/dt; + } + else + { + q(0,0) = inf; + q(0,1) = inf; + q(1,0) = inf; + q(1,1) = inf; + } + + return q; +} + +//2x2 matrix operations +//matrix order is assumed to be mat[I,J] = mat[I+3*J] + +//transpose a 2x2 matrix in place +__host__ __device__ void mat2_transpose(double *mat2inout) +{ + double mat2_in[4]; + + mat2_copy(mat2_in,mat2inout); + mat2inout[0+0*2] = mat2_in[0+0*2]; + mat2inout[1+0*2] = mat2_in[0+1*2]; + mat2inout[0+1*2] = mat2_in[1+0*2]; + mat2inout[1+1*2] = mat2_in[1+1*2]; + + return; +} + +//copies src to dest +__host__ __device__ void mat2_copy(double *mat2_dest, const double *mat2_src) +{ + mat2_dest[0+0*2] = mat2_src[0+0*2]; + mat2_dest[1+0*2] = mat2_src[1+0*2]; + mat2_dest[0+1*2] = mat2_src[0+1*2]; + mat2_dest[1+1*2] = mat2_src[1+1*2]; + + return; +} + +//inverts mat?inout[4] +__host__ __device__ void mat2_inverse(double *mat2inout) +{ + double det = mat2inout[0+0*2]*mat2inout[1+1*2]-mat2inout[0+1*2]*mat2inout[1+0*2]; + double mat2in[4]; + + mat2_copy(mat2in,mat2inout); + mat2inout[0+0*2] = mat2inout[1+1*2]/det; + mat2inout[1+0*2] = -mat2inout[1+0*2]/det; + mat2inout[0+1*2] = -mat2inout[0+1*2]/det; + mat2inout[1+1*2] = mat2inout[0+0*2]/det; + + return; +} + +//rotatin matrix from angle +__host__ __device__ void mat2_rot_from_angle(double angle, double *mat2) +{ + mat2[0+0*2] = ::cos(angle); + mat2[1+0*2] = ::sin(angle); + mat2[0+1*2] = -::sin(angle); + mat2[1+1*2] = ::cos(angle); + return; +} + +//multiplies c = a*b +__host__ __device__ void mat2_mult(double *mat2a, double *mat2b, double *mat2c) +{ + double mat2a_in[4]; + double mat2b_in[4]; + + if(mat2a==NULL || mat2b==NULL || mat2c==NULL) + { + return; + } + + mat2_copy(mat2a_in,mat2a); + mat2_copy(mat2b_in,mat2b); + + mat2c[0+0*2] = mat2a_in[0+0*2]*mat2b_in[0+0*2] + mat2a_in[1+0*2]*mat2b_in[0+1*2]; + mat2c[1+0*2] = mat2a_in[0+0*2]*mat2b_in[1+0*2] + mat2a_in[1+0*2]*mat2b_in[1+1*2]; + mat2c[0+1*2] = mat2a_in[0+1*2]*mat2b_in[0+0*2] + mat2a_in[1+1*2]*mat2b_in[0+1*2]; + mat2c[1+1*2] = mat2a_in[0+1*2]*mat2b_in[1+0*2] + mat2a_in[1+1*2]*mat2b_in[1+1*2]; + + return; +} + +// ret = a*b +__host__ __device__ cuvect2 mat2_mult(double *mat2a, cuvect2 b) +{ + cuvect2 ret; + ret.x = b.x*mat2a[0+0*2] + b.y*mat2a[1+0*2]; + ret.y = b.x*mat2a[0+1*2] + b.y*mat2a[1+1*2]; + return ret; +} + +void test_cuvect2_1() +{ + + + return; +} + }; \ No newline at end of file diff --git a/src/amsculib2/cuvect2f.cu b/src/amsculib2/cuvect2f.cu index 891d39c..240ed6a 100644 --- a/src/amsculib2/cuvect2f.cu +++ b/src/amsculib2/cuvect2f.cu @@ -1,361 +1,369 @@ -#include - -namespace amscuda -{ - - __host__ __device__ cuvect2f::cuvect2f() - { - x = 0.0; y = 0.0; - return; - } - - __host__ __device__ cuvect2f::~cuvect2f() - { - x = 0.0; y = 0.0; - return; - } - - __host__ __device__ cuvect2f::cuvect2f(float _x, float _y) - { - x = _x; y = _y; - return; - } - - __host__ __device__ float& cuvect2f::operator[](const int I) - { - if(I==0) return x; - if(I==1) return y; - return x; - } - - __host__ __device__ const float& cuvect2f::operator[](const int I) const - { - if(I==0) return x; - if(I==1) return y; - return x; - } - - __host__ __device__ cuvect2f cuvect2f::operator+(cuvect2f lhs) - { - cuvect2f ret; - ret.x = x + lhs.x; - ret.y = y + lhs.y; - return ret; - } - - __host__ __device__ cuvect2f cuvect2f::operator-(cuvect2f lhs) - { - cuvect2f ret; - ret.x = x - lhs.x; - ret.y = y - lhs.y; - return ret; - } - __host__ __device__ cuvect2f cuvect2f::operator*(float lhs) - { - cuvect2f ret; - ret.x = x*lhs; - ret.y = y*lhs; - return ret; - } - - __host__ __device__ cuvect2f cuvect2f::operator/(float lhs) - { - cuvect2f ret; - ret.x = x/lhs; - ret.y = y/lhs; - return ret; - } - - __host__ __device__ float cuvect2f_dot(cuvect2f a, cuvect2f b) - { - float ret = a.x*b.x+a.y*b.y; - return ret; - } - - __host__ __device__ float cuvect2f_cross(cuvect2f a, cuvect2f b) - { - float ret = a.x*b.y-a.y*b.x; - return ret; - } - - __host__ __device__ float cuvect2f_norm(cuvect2f a) - { - float ret = ::sqrtf(a.x*a.x+a.y*a.y); - return ret; - } - - __host__ __device__ cuvect2f cuvect2f_normalize(cuvect2f a) - { - cuvect2f ret; - float m = cuvect2f_norm(a); - if(m>0.0) - { - ret.x = a.x/m; ret.y = a.y/m; - } - else - { - ret.x = 0.0; ret.y = 0.0; - } - return ret; - } - - __host__ __device__ cuvect2f cuvect2f_proj(cuvect2f a, cuvect2f b) - { - cuvect2f ret; - cuvect2f bn = cuvect2f_normalize(b); - float m = cuvect2f_dot(a,bn); - ret = bn*m; - return ret; - } - - -__host__ __device__ cumat2f::cumat2f() -{ - int I; - for(I=0;I<4;I++) - { - dat[I] = 0.0; - } - return; -} - -__host__ __device__ cumat2f::~cumat2f() -{ - int I; - for(I=0;I<4;I++) - { - dat[I] = 0.0; - } - return; -} - -__host__ __device__ float& cumat2f::operator[](const int I) -{ - return dat[I]; -} - -__host__ __device__ float& cumat2f::operator()(const int I, const int J) -{ - return dat[I+2*J]; -} - -__host__ __device__ cumat2f cumat2f::operator+(cumat2f lhs) -{ - int I; - cumat2f ret; - for(I=0;I<4;I++) - { - ret.dat[I] = this->dat[I] + lhs.dat[I]; - } - return ret; -} - -__host__ __device__ cumat2f cumat2f::operator-(cumat2f lhs) -{ - int I; - cumat2f ret; - for(I=0;I<4;I++) - { - ret.dat[I] = this->dat[I] - lhs.dat[I]; - } - return ret; -} - -__host__ __device__ cumat2f cumat2f::operator*(float lhs) -{ - cumat2f ret; - int I; - for(I=0;I<4;I++) - { - ret.dat[I] = this->dat[I]*lhs; - } - return ret; -} - -__host__ __device__ cumat2f cumat2f::operator/(float lhs) -{ - cumat2f ret; - int I; - for(I=0;I<4;I++) - { - ret.dat[I] = this->dat[I]/lhs; - } - return ret; -} - -__host__ __device__ float& cumat2f::at(const int I, const int J) -{ - return dat[I+2*J]; -} - - -__host__ __device__ cuvect2f cumat2f::operator*(cuvect2f lhs) -{ - cuvect2f ret = cuvect2f(0.0,0.0); - int I,J; - for(I=0;I<2;I++) - { - for(J=0;J<2;J++) - { - ret[I] = ret[I] + this->at(I,J)*lhs[J]; - } - } - return ret; -} - -__host__ __device__ cumat2f cumat2f::operator*(cumat2f lhs) -{ - cumat2f ret; - int I,J,K; - - for(I=0;I<2;I++) - { - for(J=0;J<2;J++) - { - ret(I,J) = 0.0; - for(K=0;K<2;K++) - { - ret(I,J) = ret(I,J) + this->at(I,K)*lhs(K,J); - } - } - } - - return ret; -} - -__host__ __device__ float cumat2f::det() -{ - float ret = 0.0; - ret = ret + this->at(0,0)*this->at(1,1); - ret = ret - this->at(1,0)*this->at(0,1); - return ret; -} - -__host__ __device__ cumat2f cumat2f::transpose() -{ - cumat2f q; - int I,J; - for(I=0;I<2;I++) - { - for(J=0;J<2;J++) - { - q.at(I,J) = this->at(J,I); - } - } - return q; -} - -__host__ __device__ cumat2f cumat2f::inverse() -{ - cumat2f q; - float dt = q.det(); - if(dt!=0.0) - { - q(0,0) = this->at(1,1)/dt; - q(0,1) = -this->at(0,1)/dt; - q(1,0) = -this->at(1,0)/dt; - q(1,1) = this->at(0,0)/dt; - } - else - { - q(0,0) = inf; - q(0,1) = inf; - q(1,0) = inf; - q(1,1) = inf; - } - - return q; -} - -//2x2 matrix operations -//matrix order is assumed to be mat[I,J] = mat[I+3*J] - -//transpose a 2x2 matrix in place -__host__ __device__ void mat2f_transpose(float *mat2inout) -{ - float mat2f_in[4]; - - mat2f_copy(mat2f_in,mat2inout); - mat2inout[0+0*2] = mat2f_in[0+0*2]; - mat2inout[1+0*2] = mat2f_in[0+1*2]; - mat2inout[0+1*2] = mat2f_in[1+0*2]; - mat2inout[1+1*2] = mat2f_in[1+1*2]; - - return; -} - -//copies src to dest -__host__ __device__ void mat2f_copy(float *mat2f_dest, const float *mat2f_src) -{ - mat2f_dest[0+0*2] = mat2f_src[0+0*2]; - mat2f_dest[1+0*2] = mat2f_src[1+0*2]; - mat2f_dest[0+1*2] = mat2f_src[0+1*2]; - mat2f_dest[1+1*2] = mat2f_src[1+1*2]; - - return; -} - -//inverts mat?inout[4] -__host__ __device__ void mat2f_inverse(float *mat2inout) -{ - float det = mat2inout[0+0*2]*mat2inout[1+1*2]-mat2inout[0+1*2]*mat2inout[1+0*2]; - float mat2in[4]; - - mat2f_copy(mat2in,mat2inout); - mat2inout[0+0*2] = mat2inout[1+1*2]/det; - mat2inout[1+0*2] = -mat2inout[1+0*2]/det; - mat2inout[0+1*2] = -mat2inout[0+1*2]/det; - mat2inout[1+1*2] = mat2inout[0+0*2]/det; - - return; -} - -//rotatin matrix from angle -__host__ __device__ void mat2f_rot_from_angle(float angle, float *mat2) -{ - mat2[0+0*2] = ::cosf(angle); - mat2[1+0*2] = ::sinf(angle); - mat2[0+1*2] = -::sinf(angle); - mat2[1+1*2] = ::cosf(angle); - return; -} - -//multiplies c = a*b -__host__ __device__ void mat2f_mult(float *mat2a, float *mat2b, float *mat2c) -{ - float mat2a_in[4]; - float mat2b_in[4]; - - if(mat2a==NULL || mat2b==NULL || mat2c==NULL) - { - return; - } - - mat2f_copy(mat2a_in,mat2a); - mat2f_copy(mat2b_in,mat2b); - - mat2c[0+0*2] = mat2a_in[0+0*2]*mat2b_in[0+0*2] + mat2a_in[1+0*2]*mat2b_in[0+1*2]; - mat2c[1+0*2] = mat2a_in[0+0*2]*mat2b_in[1+0*2] + mat2a_in[1+0*2]*mat2b_in[1+1*2]; - mat2c[0+1*2] = mat2a_in[0+1*2]*mat2b_in[0+0*2] + mat2a_in[1+1*2]*mat2b_in[0+1*2]; - mat2c[1+1*2] = mat2a_in[0+1*2]*mat2b_in[1+0*2] + mat2a_in[1+1*2]*mat2b_in[1+1*2]; - - return; -} - -// ret = a*b -__host__ __device__ cuvect2f mat2f_mult(float *mat2a, cuvect2f b) -{ - cuvect2f ret; - ret.x = b.x*mat2a[0+0*2] + b.y*mat2a[1+0*2]; - ret.y = b.x*mat2a[0+1*2] + b.y*mat2a[1+1*2]; - return ret; -} - -void test_cuvect2f_1() -{ - - - return; -} - +#include + +namespace amscuda +{ + + __host__ __device__ cuvect2f::cuvect2f() + { + x = 0.0; y = 0.0; + return; + } + + __host__ __device__ cuvect2f::~cuvect2f() + { + x = 0.0; y = 0.0; + return; + } + + __host__ __device__ cuvect2f::cuvect2f(float _x, float _y) + { + x = _x; y = _y; + return; + } + + __host__ __device__ float& cuvect2f::operator[](const int I) + { + if(I==0) return x; + if(I==1) return y; + return x; + } + + __host__ __device__ const float& cuvect2f::operator[](const int I) const + { + if(I==0) return x; + if(I==1) return y; + return x; + } + + __host__ __device__ cuvect2f cuvect2f::operator+(cuvect2f lhs) + { + cuvect2f ret; + ret.x = x + lhs.x; + ret.y = y + lhs.y; + return ret; + } + + __host__ __device__ cuvect2f cuvect2f::operator-(cuvect2f lhs) + { + cuvect2f ret; + ret.x = x - lhs.x; + ret.y = y - lhs.y; + return ret; + } + __host__ __device__ cuvect2f cuvect2f::operator*(float lhs) + { + cuvect2f ret; + ret.x = x*lhs; + ret.y = y*lhs; + return ret; + } + + __host__ __device__ cuvect2f cuvect2f::operator/(float lhs) + { + cuvect2f ret; + ret.x = x/lhs; + ret.y = y/lhs; + return ret; + } + + __host__ __device__ float cuvect2f_dot(cuvect2f a, cuvect2f b) + { + float ret = a.x*b.x+a.y*b.y; + return ret; + } + + __host__ __device__ float cuvect2f_cross(cuvect2f a, cuvect2f b) + { + float ret = a.x*b.y-a.y*b.x; + return ret; + } + + __host__ __device__ float cuvect2f_norm(cuvect2f a) + { + float ret = ::sqrtf(a.x*a.x+a.y*a.y); + return ret; + } + + __host__ __device__ cuvect2f cuvect2f_normalize(cuvect2f a) + { + cuvect2f ret; + float m = cuvect2f_norm(a); + if(m>0.0) + { + ret.x = a.x/m; ret.y = a.y/m; + } + else + { + ret.x = 0.0; ret.y = 0.0; + } + return ret; + } + + __host__ __device__ cuvect2f cuvect2f_proj(cuvect2f a, cuvect2f b) + { + cuvect2f ret; + cuvect2f bn = cuvect2f_normalize(b); + float m = cuvect2f_dot(a,bn); + ret = bn*m; + return ret; + } + + +__host__ __device__ cumat2f::cumat2f() +{ + int I; + for(I=0;I<4;I++) + { + dat[I] = 0.0; + } + return; +} + +__host__ __device__ cumat2f::~cumat2f() +{ + int I; + for(I=0;I<4;I++) + { + dat[I] = 0.0; + } + return; +} + +__host__ __device__ float& cumat2f::operator[](const int I) +{ + return dat[I]; +} + +__host__ __device__ float& cumat2f::operator()(const int I, const int J) +{ + return dat[I+2*J]; +} + +__host__ __device__ cumat2f cumat2f::operator+(cumat2f lhs) +{ + int I; + cumat2f ret; + for(I=0;I<4;I++) + { + ret.dat[I] = this->dat[I] + lhs.dat[I]; + } + return ret; +} + +__host__ __device__ cumat2f cumat2f::operator-(cumat2f lhs) +{ + int I; + cumat2f ret; + for(I=0;I<4;I++) + { + ret.dat[I] = this->dat[I] - lhs.dat[I]; + } + return ret; +} + +__host__ __device__ cumat2f cumat2f::operator*(float lhs) +{ + cumat2f ret; + int I; + for(I=0;I<4;I++) + { + ret.dat[I] = this->dat[I]*lhs; + } + return ret; +} + +__host__ __device__ cumat2f cumat2f::operator/(float lhs) +{ + cumat2f ret; + int I; + for(I=0;I<4;I++) + { + ret.dat[I] = this->dat[I]/lhs; + } + return ret; +} + +__host__ __device__ float& cumat2f::at(const int I, const int J) +{ + return dat[I+2*J]; +} + + +__host__ __device__ cuvect2f cumat2f::operator*(cuvect2f lhs) +{ + cuvect2f ret = cuvect2f(0.0,0.0); + int I,J; + for(I=0;I<2;I++) + { + for(J=0;J<2;J++) + { + ret[I] = ret[I] + this->at(I,J)*lhs[J]; + } + } + return ret; +} + +__host__ __device__ cumat2f cumat2f::operator*(cumat2f lhs) +{ + cumat2f ret; + int I,J,K; + + for(I=0;I<2;I++) + { + for(J=0;J<2;J++) + { + ret(I,J) = 0.0; + for(K=0;K<2;K++) + { + ret(I,J) = ret(I,J) + this->at(I,K)*lhs(K,J); + } + } + } + + return ret; +} + +__host__ __device__ float cumat2f::det() +{ + float ret = 0.0; + ret = ret + this->at(0,0)*this->at(1,1); + ret = ret - this->at(1,0)*this->at(0,1); + return ret; +} + +__host__ __device__ cumat2f cumat2f::transpose() +{ + cumat2f q; + int I,J; + for(I=0;I<2;I++) + { + for(J=0;J<2;J++) + { + q.at(I,J) = this->at(J,I); + } + } + return q; +} + +__host__ __device__ cumat2f cumat2f::inverse() +{ + cumat2f q; + float dt = q.det(); + if(dt!=0.0) + { + q(0,0) = this->at(1,1)/dt; + q(0,1) = -this->at(0,1)/dt; + q(1,0) = -this->at(1,0)/dt; + q(1,1) = this->at(0,0)/dt; + } + else + { + q(0,0) = inf; + q(0,1) = inf; + q(1,0) = inf; + q(1,1) = inf; + } + + return q; +} + +//2x2 matrix operations +//matrix order is assumed to be mat[I,J] = mat[I+3*J] + +//transpose a 2x2 matrix in place +__host__ __device__ void mat2f_transpose(float *mat2inout) +{ + float mat2f_in[4]; + + mat2f_copy(mat2f_in,mat2inout); + mat2inout[0+0*2] = mat2f_in[0+0*2]; + mat2inout[1+0*2] = mat2f_in[0+1*2]; + mat2inout[0+1*2] = mat2f_in[1+0*2]; + mat2inout[1+1*2] = mat2f_in[1+1*2]; + + return; +} + +//copies src to dest +__host__ __device__ void mat2f_copy(float *mat2f_dest, const float *mat2f_src) +{ + mat2f_dest[0+0*2] = mat2f_src[0+0*2]; + mat2f_dest[1+0*2] = mat2f_src[1+0*2]; + mat2f_dest[0+1*2] = mat2f_src[0+1*2]; + mat2f_dest[1+1*2] = mat2f_src[1+1*2]; + + return; +} + +//inverts mat?inout[4] +__host__ __device__ void mat2f_inverse(float *mat2inout) +{ + float det = mat2inout[0+0*2]*mat2inout[1+1*2]-mat2inout[0+1*2]*mat2inout[1+0*2]; + float mat2in[4]; + + mat2f_copy(mat2in,mat2inout); + mat2inout[0+0*2] = mat2inout[1+1*2]/det; + mat2inout[1+0*2] = -mat2inout[1+0*2]/det; + mat2inout[0+1*2] = -mat2inout[0+1*2]/det; + mat2inout[1+1*2] = mat2inout[0+0*2]/det; + + return; +} + +//rotatin matrix from angle +__host__ __device__ void mat2f_rot_from_angle(float angle, float *mat2) +{ + mat2[0+0*2] = ::cosf(angle); + mat2[1+0*2] = ::sinf(angle); + mat2[0+1*2] = -::sinf(angle); + mat2[1+1*2] = ::cosf(angle); + return; +} + +//multiplies c = a*b +__host__ __device__ void mat2f_mult(float *mat2a, float *mat2b, float *mat2c) +{ + float mat2a_in[4]; + float mat2b_in[4]; + + if(mat2a==NULL || mat2b==NULL || mat2c==NULL) + { + return; + } + + mat2f_copy(mat2a_in,mat2a); + mat2f_copy(mat2b_in,mat2b); + + mat2c[0+0*2] = mat2a_in[0+0*2]*mat2b_in[0+0*2] + mat2a_in[1+0*2]*mat2b_in[0+1*2]; + mat2c[1+0*2] = mat2a_in[0+0*2]*mat2b_in[1+0*2] + mat2a_in[1+0*2]*mat2b_in[1+1*2]; + mat2c[0+1*2] = mat2a_in[0+1*2]*mat2b_in[0+0*2] + mat2a_in[1+1*2]*mat2b_in[0+1*2]; + mat2c[1+1*2] = mat2a_in[0+1*2]*mat2b_in[1+0*2] + mat2a_in[1+1*2]*mat2b_in[1+1*2]; + + return; +} + +// ret = a*b +__host__ __device__ cuvect2f mat2f_mult(float *mat2a, cuvect2f b) +{ + cuvect2f ret; + ret.x = b.x*mat2a[0+0*2] + b.y*mat2a[1+0*2]; + ret.y = b.x*mat2a[0+1*2] + b.y*mat2a[1+1*2]; + return ret; +} + +__host__ __device__ cuvect2f operator-(cuvect2f rhs) +{ + cuvect2f ret; + ret[0] = -rhs[0]; + ret[1] = -rhs[1]; + return ret; +} + +void test_cuvect2f_1() +{ + + + return; +} + }; \ No newline at end of file diff --git a/src/amsculib2/cuvect3.cu b/src/amsculib2/cuvect3.cu index a4858c7..34566f2 100644 --- a/src/amsculib2/cuvect3.cu +++ b/src/amsculib2/cuvect3.cu @@ -1,581 +1,581 @@ -#include - -namespace amscuda -{ - - __host__ __device__ cuvect3::cuvect3() - { - x = 0.0; y = 0.0; z = 0.0; - return; - } - - __host__ __device__ cuvect3::~cuvect3() - { - x = 0.0; y = 0.0; z = 0.0; - return; - } - - __host__ __device__ double& cuvect3::operator[](const int I) - { - if(I==0) return x; - if(I==1) return y; - if(I==2) return z; - return x; - } - - __host__ __device__ const double& cuvect3::operator[](const int I) const - { - if(I==0) return x; - if(I==1) return y; - if(I==2) return z; - return x; - } - - __host__ __device__ cuvect3 cuvect3::operator+(cuvect3 lhs) - { - cuvect3 ret; - ret.x = x+lhs.x; - ret.y = y+lhs.y; - ret.z = z+lhs.z; - - return ret; - } - - __host__ __device__ cuvect3 cuvect3::operator-(cuvect3 lhs) - { - cuvect3 ret; - ret.x = x-lhs.x; - ret.y = y-lhs.y; - ret.z = z-lhs.z; - - return ret; - } - - __host__ __device__ cuvect3 cuvect3::operator*(double lhs) - { - cuvect3 ret; - ret.x = x*lhs; - ret.y = y*lhs; - ret.z = z*lhs; - return ret; - } - - __host__ __device__ cuvect3 cuvect3::operator/(double lhs) - { - cuvect3 ret; - ret.x = x/lhs; - ret.y = y/lhs; - ret.z = z/lhs; - return ret; - } - - __host__ __device__ cuvect3::cuvect3(double _x, double _y, double _z) - { - x = _x; y = _y; z = _z; - return; - } - - - __host__ __device__ double cuvect3_dot(cuvect3 a, cuvect3 b) - { - double ret = a.x*b.x+a.y*b.y+a.z*b.z; - - return ret; - } - - __host__ __device__ cuvect3 cuvect3_cross(cuvect3 a, cuvect3 b) - { - cuvect3 ret; - ret[0] = a[1]*b[2]-a[2]*b[1]; - ret[1] = a[2]*b[0]-a[0]*b[2]; - ret[2] = a[0]*b[1]-a[1]*b[0]; - - return ret; - } - - __host__ __device__ double cuvect3_norm(cuvect3 a) - { - double ret; - ret = ::sqrt(a.x*a.x+a.y*a.y+a.z*a.z); - return ret; - } - - __host__ __device__ cuvect3 cuvect3_normalize(cuvect3 a) - { - cuvect3 ret; - double m; - m = ::sqrt(a.x*a.x+a.y*a.y+a.z*a.z); - if(m>0.0) - { - ret.x = a.x/m; ret.y = a.y/m; ret.z = a.z/m; - } - else - { - ret.x = 0.0; ret.y = 0.0; ret.z = 0.0; - } - - return ret; - } - - __host__ __device__ cuvect3 cuvect3_proj(cuvect3 a, cuvect3 b) - { - cuvect3 ret; - cuvect3 bn = cuvect3_normalize(b); - double m = cuvect3_dot(a,bn); - ret = bn*m; - return ret; - } - -//transposes a 3x3 (9 element) matrix -__host__ __device__ void mat3_transpose(double *mat3inout) -{ - int I,J; - double matint[9]; - for(I=0;I<9;I++) - { - matint[I] = mat3inout[I]; - } - - for(I=0;I<3;I++) - { - for(J=0;J<3;J++) - { - mat3inout[I+J*3] = matint[J+I*3]; - } - } - - return; -} - -//copies src to dest -__host__ __device__ void mat3_copy(double *mat3_dest, const double *mat3_src) -{ - int I; - if(mat3_dest==NULL || mat3_src==NULL) - return; - - for(I=0;I<9;I++) - mat3_dest[I] = mat3_src[I]; - - return; -} - - -__host__ __device__ double mat3_det(double *mat3in) -{ - double ret = 0.0; - - ret = ret + mat3in[0+0*3]*mat3in[1+1*3]*mat3in[2+2*3]; - ret = ret + mat3in[0+1*3]*mat3in[1+2*3]*mat3in[2+0*3]; - ret = ret + mat3in[0+2*3]*mat3in[1+0*3]*mat3in[2+1*3]; - ret = ret - mat3in[0+0*3]*mat3in[1+2*3]*mat3in[2+1*3]; - ret = ret - mat3in[0+1*3]*mat3in[1+0*3]*mat3in[2+2*3]; - ret = ret - mat3in[0+2*3]*mat3in[1+1*3]*mat3in[2+0*3]; - - return ret; -} - -//inverts a 3x3 (9 element) matrix -__host__ __device__ void mat3_inverse(double *mat3inout) -{ - int I; - double matint[9]; - double det = mat3_det(mat3inout); - - for(I=0;I<9;I++) - { - matint[I] = mat3inout[I]; - } - - mat3inout[0+0*3] = (matint[1+1*3]*matint[2+2*3]-matint[1+2*3]*matint[2+1*3])/det; - mat3inout[0+1*3] = -(matint[1+0*3]*matint[2+2*3]-matint[1+2*3]*matint[2+0*3])/det; - mat3inout[0+2*3] = (matint[1+0*3]*matint[2+1*3]-matint[1+1*3]*matint[2+0*3])/det; - mat3inout[1+0*3] = -(matint[0+1*3]*matint[2+2*3]-matint[0+2*3]*matint[2+1*3])/det; - mat3inout[1+1*3] = (matint[0+0*3]*matint[2+2*3]-matint[0+2*3]*matint[2+0*3])/det; - mat3inout[1+2*3] = -(matint[0+0*3]*matint[2+1*3]-matint[0+1*3]*matint[2+0*3])/det; - mat3inout[2+0*3] = (matint[0+1*3]*matint[1+2*3]-matint[0+2*3]*matint[1+1*3])/det; - mat3inout[2+1*3] = -(matint[0+0*3]*matint[1+2*3]-matint[0+2*3]*matint[1+0*3])/det; - mat3inout[2+2*3] = (matint[0+0*3]*matint[1+1*3]-matint[0+1*3]*matint[1+0*3])/det; - - mat3_transpose(mat3inout); - - return; -} - -__host__ __device__ cuvect3 mat3_mult(double *mat3in, cuvect3 cvin) -{ - int I,J; - cuvect3 ret; - for(I=0;I<3;I++) - { - ret[I] = 0.0; - for(J=0;J<3;J++) - { - ret[I] = ret[I] + mat3in[I+3*J]*cvin[J]; - } - } - - return ret; -} - -__host__ __device__ void mat3_mult(double *matina, double *matinb, double *matout) -{ - double wrk[9]; - int I,J,K; - - for(I=0;I<3;I++) - { - for(J=0;J<3;J++) - { - wrk[I+3*J] = 0.0; - } - } - - for(I=0;I<3;I++) - { - for(J=0;J<3;J++) - { - for(K=0;K<3;K++) - { - wrk[I+3*K] = wrk[I+3*K] + matina[I+3*J]*matinb[J+3*K]; - } - } - } - - for(I=0;I<3;I++) - { - for(J=0;J<3;J++) - { - matout[I+3*J] = wrk[I+3*J]; - } - } - - return; -} - -__host__ __device__ void mat3_hodgedual(cuvect3 vecin, double *matout) -{ - matout[0 + 0*3] = 0.0f; - matout[1 + 0*3] = -vecin[2]; - matout[2 + 0*3] = vecin[1]; - - matout[0 + 1*3] = vecin[2]; - matout[1 + 1*3] = 0.0f; - matout[2 + 1*3] = -vecin[0]; - - matout[0 + 2*3] = -vecin[1]; - matout[1 + 2*3] = vecin[0]; - matout[2 + 2*3] = 0.0f; - return; -} - -__host__ __device__ void mat3_hodgedual(double *matin, cuvect3 vecout) -{ - vecout[0] = 0.5*(matin[1 + 2*3] - matin[2 + 1*3]); - vecout[1] = 0.5*(matin[2 + 0*3] - matin[0 + 2*3]); - vecout[2] = 0.5*(matin[0 + 1*3] - matin[1 + 0*3]); - - return; -} - -//returns direction cosine rotation matrix from axis and angle -__host__ __device__ void mat3_rot_from_axisangle(cuvect3 axis, double angle, double *matout) -{ - int I; - double H[9]; - double Hsq[9]; - double II[9]; - - for(I=0;I<9;I++) II[I] = 0.0; - II[0+0*3] = 1.0; - II[1+1*3] = 1.0; - II[2+2*3] = 1.0; - - axis = cuvect3_normalize(axis); - - mat3_hodgedual(axis,H); - mat3_mult(H,H,Hsq); - - for(I=0;I<9;I++) - { - matout[I] = (II[I] + Hsq[I]) + H[I]*sin(angle) - Hsq[I]*cos(angle); - } - - return; -} - - - -__host__ void test_cudavect_logic1() -{ - //3 dim vector and matrix functional tests on host side - - printf("3 dim vector and matrix functional tests on host side\n"); - - cuvect3 a,b,c; - double ma[9],mb[9],mc[9]; - - int I,J; - - for(I=0;I<3;I++) - { - for(J=0;J<3;J++) - { - ma[I+3*J] = ((double) rand())/((double) RAND_MAX); - mb[I+3*J] = ma[I+3*J]; - } - } - - mat3_inverse(mb); - mat3_mult(ma,mb,mc); - - for(I=0;I<3;I++) - { - for(J=0;J<3;J++) - { - printf("ma[%d,%d] = %1.3f\n",I,J,ma[I+3*J]); - } - } - for(I=0;I<3;I++) - { - for(J=0;J<3;J++) - { - printf("mb[%d,%d] = %1.3f\n",I,J,mb[I+3*J]); - } - } - for(I=0;I<3;I++) - { - for(J=0;J<3;J++) - { - printf("mc[%d,%d] = %1.3f\n",I,J,mc[I+3*J]); - } - } - - a = cuvect3(1,1,1); - b = mat3_mult(ma,a); - b = mat3_mult(mb,b); - - for(I=0;I<3;I++) - { - printf("a[%d] = %1.3f, b[%d] = %1.3f\n",I,a[I],I,b[I]); - } - - a = cuvect3(1,0,1); - b = cuvect3(0,1,-1); - c = a+b; - - for(I=0;I<3;I++) - { - printf("a[%d] = %1.3f, b[%d] = %1.3f, c[%d] = %1.3f\n",I,a[I],I,b[I],I,c[I]); - } - - c = c/2.0; - - for(I=0;I<3;I++) - { - printf("a[%d] = %1.3f, b[%d] = %1.3f, c[%d] = %1.3f\n",I,a[I],I,b[I],I,c[I]); - } - - c = cuvect3_cross(a,b); - - for(I=0;I<3;I++) - { - printf("a[%d] = %1.3f, b[%d] = %1.3f, c[%d] = %1.3f\n",I,a[I],I,b[I],I,c[I]); - } - - printf("c dot a = %1.3f, c dot b = %1.3f\n",cuvect3_dot(c,a),cuvect3_dot(c,b)); - - printf("norm(a)=%1.3f, norm(b)=%1.3f, norm(c)=%1.3f\n",cuvect3_norm(a),cuvect3_norm(b),cuvect3_norm(c)); - a = cuvect3_normalize(a); - b = cuvect3_normalize(b); - c = cuvect3_normalize(c); - - for(I=0;I<3;I++) - { - printf("a[%d] = %1.3f, b[%d] = %1.3f, c[%d] = %1.3f\n",I,a[I],I,b[I],I,c[I]); - } - printf("c dot a = %1.3f, c dot b = %1.3f\n",cuvect3_dot(c,a),cuvect3_dot(c,b)); - printf("norm(a)=%1.3f, norm(b)=%1.3f, norm(c)=%1.3f\n",cuvect3_norm(a),cuvect3_norm(b),cuvect3_norm(c)); - - return; -} - - - -__host__ __device__ cumat3::cumat3() -{ - int I; - for(I=0;I<9;I++) - { - dat[I] = 0.0; - } - return; -} - -__host__ __device__ cumat3::~cumat3() -{ - int I; - for(I=0;I<9;I++) - { - dat[I] = 0.0; - } - return; -} - -__host__ __device__ double& cumat3::operator[](const int I) -{ - return dat[I]; -} - -__host__ __device__ double& cumat3::operator()(const int I, const int J) -{ - return dat[I+3*J]; -} - -__host__ __device__ cumat3 cumat3::operator+(cumat3 lhs) -{ - int I; - cumat3 ret; - for(I=0;I<9;I++) - { - ret.dat[I] = this->dat[I] + lhs.dat[I]; - } - return ret; -} - -__host__ __device__ cumat3 cumat3::operator-(cumat3 lhs) -{ - int I; - cumat3 ret; - for(I=0;I<9;I++) - { - ret.dat[I] = this->dat[I] - lhs.dat[I]; - } - return ret; -} - -__host__ __device__ cumat3 cumat3::operator*(double lhs) -{ - cumat3 ret; - int I; - for(I=0;I<9;I++) - { - ret.dat[I] = this->dat[I]*lhs; - } - return ret; -} - -__host__ __device__ cumat3 cumat3::operator/(double lhs) -{ - cumat3 ret; - int I; - for(I=0;I<9;I++) - { - ret.dat[I] = this->dat[I]/lhs; - } - return ret; -} - -__host__ __device__ double& cumat3::at(const int I, const int J) -{ - return dat[I+3*J]; -} - - -__host__ __device__ cuvect3 cumat3::operator*(cuvect3 lhs) -{ - cuvect3 ret = cuvect3(0.0,0.0,0.0); - int I,J; - for(I=0;I<3;I++) - { - for(J=0;J<3;J++) - { - ret[I] = ret[I] + this->at(I,J)*lhs[J]; - } - } - return ret; -} - -__host__ __device__ cumat3 cumat3::operator*(cumat3 lhs) -{ - cumat3 ret; - int I,J,K; - - for(I=0;I<3;I++) - { - for(J=0;J<3;J++) - { - ret(I,J) = 0.0; - for(K=0;K<3;K++) - { - ret(I,J) = ret(I,J) + this->at(I,K)*lhs(K,J); - } - } - } - - return ret; -} - -__host__ __device__ double cumat3::det() -{ - double ret = 0.0; - - ret = ret + this->at(0,0)*this->at(1,1)*this->at(2,2); - ret = ret + this->at(0,1)*this->at(1,2)*this->at(2,0); - ret = ret + this->at(0,2)*this->at(1,0)*this->at(2,1); - ret = ret - this->at(0,0)*this->at(1,2)*this->at(2,1); - ret = ret - this->at(0,1)*this->at(1,0)*this->at(2,2); - ret = ret - this->at(0,2)*this->at(1,1)*this->at(2,0); - - return ret; -} - -__host__ __device__ cumat3 cumat3::transpose() -{ - cumat3 q; - int I,J; - for(I=0;I<3;I++) - { - for(J=0;J<3;J++) - { - q.at(I,J) = this->at(J,I); - } - } - return q; -} - -__host__ __device__ cumat3 cumat3::inverse() -{ - cumat3 q; - double dt = q.det(); - if(dt!=0.0) - { - q(0,0) = (this->at(1,1)*this->at(2,2)-this->at(1,2)*this->at(2,1))/dt; - q(0,1) = -(this->at(1,0)*this->at(2,2)-this->at(1,2)*this->at(2,0))/dt; - q(0,2) = (this->at(1,0)*this->at(2,1)-this->at(1,1)*this->at(2,0))/dt; - q(1,0) = -(this->at(0,1)*this->at(2,2)-this->at(0,2)*this->at(2,1))/dt; - q(1,1) = (this->at(0,0)*this->at(2,2)-this->at(0,2)*this->at(2,0))/dt; - q(1,2) = -(this->at(0,0)*this->at(2,1)-this->at(0,1)*this->at(2,0))/dt; - q(2,0) = (this->at(0,1)*this->at(1,2)-this->at(0,2)*this->at(1,1))/dt; - q(2,1) = -(this->at(0,0)*this->at(1,2)-this->at(0,2)*this->at(1,0))/dt; - q(2,2) = (this->at(0,0)*this->at(1,1)-this->at(0,1)*this->at(1,0))/dt; - - q = q.transpose(); - } - else - { - q(0,0) = inf; - q(0,1) = inf; - q(0,2) = inf; - q(1,0) = inf; - q(1,1) = inf; - q(1,2) = inf; - q(2,0) = inf; - q(2,1) = inf; - q(2,2) = inf; - } - - return q; -} - +#include + +namespace amscuda +{ + + __host__ __device__ cuvect3::cuvect3() + { + x = 0.0; y = 0.0; z = 0.0; + return; + } + + __host__ __device__ cuvect3::~cuvect3() + { + x = 0.0; y = 0.0; z = 0.0; + return; + } + + __host__ __device__ double& cuvect3::operator[](const int I) + { + if(I==0) return x; + if(I==1) return y; + if(I==2) return z; + return x; + } + + __host__ __device__ const double& cuvect3::operator[](const int I) const + { + if(I==0) return x; + if(I==1) return y; + if(I==2) return z; + return x; + } + + __host__ __device__ cuvect3 cuvect3::operator+(cuvect3 lhs) + { + cuvect3 ret; + ret.x = x+lhs.x; + ret.y = y+lhs.y; + ret.z = z+lhs.z; + + return ret; + } + + __host__ __device__ cuvect3 cuvect3::operator-(cuvect3 lhs) + { + cuvect3 ret; + ret.x = x-lhs.x; + ret.y = y-lhs.y; + ret.z = z-lhs.z; + + return ret; + } + + __host__ __device__ cuvect3 cuvect3::operator*(double lhs) + { + cuvect3 ret; + ret.x = x*lhs; + ret.y = y*lhs; + ret.z = z*lhs; + return ret; + } + + __host__ __device__ cuvect3 cuvect3::operator/(double lhs) + { + cuvect3 ret; + ret.x = x/lhs; + ret.y = y/lhs; + ret.z = z/lhs; + return ret; + } + + __host__ __device__ cuvect3::cuvect3(double _x, double _y, double _z) + { + x = _x; y = _y; z = _z; + return; + } + + + __host__ __device__ double cuvect3_dot(cuvect3 a, cuvect3 b) + { + double ret = a.x*b.x+a.y*b.y+a.z*b.z; + + return ret; + } + + __host__ __device__ cuvect3 cuvect3_cross(cuvect3 a, cuvect3 b) + { + cuvect3 ret; + ret[0] = a[1]*b[2]-a[2]*b[1]; + ret[1] = a[2]*b[0]-a[0]*b[2]; + ret[2] = a[0]*b[1]-a[1]*b[0]; + + return ret; + } + + __host__ __device__ double cuvect3_norm(cuvect3 a) + { + double ret; + ret = ::sqrt(a.x*a.x+a.y*a.y+a.z*a.z); + return ret; + } + + __host__ __device__ cuvect3 cuvect3_normalize(cuvect3 a) + { + cuvect3 ret; + double m; + m = ::sqrt(a.x*a.x+a.y*a.y+a.z*a.z); + if(m>0.0) + { + ret.x = a.x/m; ret.y = a.y/m; ret.z = a.z/m; + } + else + { + ret.x = 0.0; ret.y = 0.0; ret.z = 0.0; + } + + return ret; + } + + __host__ __device__ cuvect3 cuvect3_proj(cuvect3 a, cuvect3 b) + { + cuvect3 ret; + cuvect3 bn = cuvect3_normalize(b); + double m = cuvect3_dot(a,bn); + ret = bn*m; + return ret; + } + +//transposes a 3x3 (9 element) matrix +__host__ __device__ void mat3_transpose(double *mat3inout) +{ + int I,J; + double matint[9]; + for(I=0;I<9;I++) + { + matint[I] = mat3inout[I]; + } + + for(I=0;I<3;I++) + { + for(J=0;J<3;J++) + { + mat3inout[I+J*3] = matint[J+I*3]; + } + } + + return; +} + +//copies src to dest +__host__ __device__ void mat3_copy(double *mat3_dest, const double *mat3_src) +{ + int I; + if(mat3_dest==NULL || mat3_src==NULL) + return; + + for(I=0;I<9;I++) + mat3_dest[I] = mat3_src[I]; + + return; +} + + +__host__ __device__ double mat3_det(double *mat3in) +{ + double ret = 0.0; + + ret = ret + mat3in[0+0*3]*mat3in[1+1*3]*mat3in[2+2*3]; + ret = ret + mat3in[0+1*3]*mat3in[1+2*3]*mat3in[2+0*3]; + ret = ret + mat3in[0+2*3]*mat3in[1+0*3]*mat3in[2+1*3]; + ret = ret - mat3in[0+0*3]*mat3in[1+2*3]*mat3in[2+1*3]; + ret = ret - mat3in[0+1*3]*mat3in[1+0*3]*mat3in[2+2*3]; + ret = ret - mat3in[0+2*3]*mat3in[1+1*3]*mat3in[2+0*3]; + + return ret; +} + +//inverts a 3x3 (9 element) matrix +__host__ __device__ void mat3_inverse(double *mat3inout) +{ + int I; + double matint[9]; + double det = mat3_det(mat3inout); + + for(I=0;I<9;I++) + { + matint[I] = mat3inout[I]; + } + + mat3inout[0+0*3] = (matint[1+1*3]*matint[2+2*3]-matint[1+2*3]*matint[2+1*3])/det; + mat3inout[0+1*3] = -(matint[1+0*3]*matint[2+2*3]-matint[1+2*3]*matint[2+0*3])/det; + mat3inout[0+2*3] = (matint[1+0*3]*matint[2+1*3]-matint[1+1*3]*matint[2+0*3])/det; + mat3inout[1+0*3] = -(matint[0+1*3]*matint[2+2*3]-matint[0+2*3]*matint[2+1*3])/det; + mat3inout[1+1*3] = (matint[0+0*3]*matint[2+2*3]-matint[0+2*3]*matint[2+0*3])/det; + mat3inout[1+2*3] = -(matint[0+0*3]*matint[2+1*3]-matint[0+1*3]*matint[2+0*3])/det; + mat3inout[2+0*3] = (matint[0+1*3]*matint[1+2*3]-matint[0+2*3]*matint[1+1*3])/det; + mat3inout[2+1*3] = -(matint[0+0*3]*matint[1+2*3]-matint[0+2*3]*matint[1+0*3])/det; + mat3inout[2+2*3] = (matint[0+0*3]*matint[1+1*3]-matint[0+1*3]*matint[1+0*3])/det; + + mat3_transpose(mat3inout); + + return; +} + +__host__ __device__ cuvect3 mat3_mult(double *mat3in, cuvect3 cvin) +{ + int I,J; + cuvect3 ret; + for(I=0;I<3;I++) + { + ret[I] = 0.0; + for(J=0;J<3;J++) + { + ret[I] = ret[I] + mat3in[I+3*J]*cvin[J]; + } + } + + return ret; +} + +__host__ __device__ void mat3_mult(double *matina, double *matinb, double *matout) +{ + double wrk[9]; + int I,J,K; + + for(I=0;I<3;I++) + { + for(J=0;J<3;J++) + { + wrk[I+3*J] = 0.0; + } + } + + for(I=0;I<3;I++) + { + for(J=0;J<3;J++) + { + for(K=0;K<3;K++) + { + wrk[I+3*K] = wrk[I+3*K] + matina[I+3*J]*matinb[J+3*K]; + } + } + } + + for(I=0;I<3;I++) + { + for(J=0;J<3;J++) + { + matout[I+3*J] = wrk[I+3*J]; + } + } + + return; +} + +__host__ __device__ void mat3_hodgedual(cuvect3 vecin, double *matout) +{ + matout[0 + 0*3] = 0.0f; + matout[1 + 0*3] = -vecin[2]; + matout[2 + 0*3] = vecin[1]; + + matout[0 + 1*3] = vecin[2]; + matout[1 + 1*3] = 0.0f; + matout[2 + 1*3] = -vecin[0]; + + matout[0 + 2*3] = -vecin[1]; + matout[1 + 2*3] = vecin[0]; + matout[2 + 2*3] = 0.0f; + return; +} + +__host__ __device__ void mat3_hodgedual(double *matin, cuvect3 vecout) +{ + vecout[0] = 0.5*(matin[1 + 2*3] - matin[2 + 1*3]); + vecout[1] = 0.5*(matin[2 + 0*3] - matin[0 + 2*3]); + vecout[2] = 0.5*(matin[0 + 1*3] - matin[1 + 0*3]); + + return; +} + +//returns direction cosine rotation matrix from axis and angle +__host__ __device__ void mat3_rot_from_axisangle(cuvect3 axis, double angle, double *matout) +{ + int I; + double H[9]; + double Hsq[9]; + double II[9]; + + for(I=0;I<9;I++) II[I] = 0.0; + II[0+0*3] = 1.0; + II[1+1*3] = 1.0; + II[2+2*3] = 1.0; + + axis = cuvect3_normalize(axis); + + mat3_hodgedual(axis,H); + mat3_mult(H,H,Hsq); + + for(I=0;I<9;I++) + { + matout[I] = (II[I] + Hsq[I]) + H[I]*sin(angle) - Hsq[I]*cos(angle); + } + + return; +} + + + +__host__ void test_cudavect_logic1() +{ + //3 dim vector and matrix functional tests on host side + + printf("3 dim vector and matrix functional tests on host side\n"); + + cuvect3 a,b,c; + double ma[9],mb[9],mc[9]; + + int I,J; + + for(I=0;I<3;I++) + { + for(J=0;J<3;J++) + { + ma[I+3*J] = ((double) rand())/((double) RAND_MAX); + mb[I+3*J] = ma[I+3*J]; + } + } + + mat3_inverse(mb); + mat3_mult(ma,mb,mc); + + for(I=0;I<3;I++) + { + for(J=0;J<3;J++) + { + printf("ma[%d,%d] = %1.3f\n",I,J,ma[I+3*J]); + } + } + for(I=0;I<3;I++) + { + for(J=0;J<3;J++) + { + printf("mb[%d,%d] = %1.3f\n",I,J,mb[I+3*J]); + } + } + for(I=0;I<3;I++) + { + for(J=0;J<3;J++) + { + printf("mc[%d,%d] = %1.3f\n",I,J,mc[I+3*J]); + } + } + + a = cuvect3(1,1,1); + b = mat3_mult(ma,a); + b = mat3_mult(mb,b); + + for(I=0;I<3;I++) + { + printf("a[%d] = %1.3f, b[%d] = %1.3f\n",I,a[I],I,b[I]); + } + + a = cuvect3(1,0,1); + b = cuvect3(0,1,-1); + c = a+b; + + for(I=0;I<3;I++) + { + printf("a[%d] = %1.3f, b[%d] = %1.3f, c[%d] = %1.3f\n",I,a[I],I,b[I],I,c[I]); + } + + c = c/2.0; + + for(I=0;I<3;I++) + { + printf("a[%d] = %1.3f, b[%d] = %1.3f, c[%d] = %1.3f\n",I,a[I],I,b[I],I,c[I]); + } + + c = cuvect3_cross(a,b); + + for(I=0;I<3;I++) + { + printf("a[%d] = %1.3f, b[%d] = %1.3f, c[%d] = %1.3f\n",I,a[I],I,b[I],I,c[I]); + } + + printf("c dot a = %1.3f, c dot b = %1.3f\n",cuvect3_dot(c,a),cuvect3_dot(c,b)); + + printf("norm(a)=%1.3f, norm(b)=%1.3f, norm(c)=%1.3f\n",cuvect3_norm(a),cuvect3_norm(b),cuvect3_norm(c)); + a = cuvect3_normalize(a); + b = cuvect3_normalize(b); + c = cuvect3_normalize(c); + + for(I=0;I<3;I++) + { + printf("a[%d] = %1.3f, b[%d] = %1.3f, c[%d] = %1.3f\n",I,a[I],I,b[I],I,c[I]); + } + printf("c dot a = %1.3f, c dot b = %1.3f\n",cuvect3_dot(c,a),cuvect3_dot(c,b)); + printf("norm(a)=%1.3f, norm(b)=%1.3f, norm(c)=%1.3f\n",cuvect3_norm(a),cuvect3_norm(b),cuvect3_norm(c)); + + return; +} + + + +__host__ __device__ cumat3::cumat3() +{ + int I; + for(I=0;I<9;I++) + { + dat[I] = 0.0; + } + return; +} + +__host__ __device__ cumat3::~cumat3() +{ + int I; + for(I=0;I<9;I++) + { + dat[I] = 0.0; + } + return; +} + +__host__ __device__ double& cumat3::operator[](const int I) +{ + return dat[I]; +} + +__host__ __device__ double& cumat3::operator()(const int I, const int J) +{ + return dat[I+3*J]; +} + +__host__ __device__ cumat3 cumat3::operator+(cumat3 lhs) +{ + int I; + cumat3 ret; + for(I=0;I<9;I++) + { + ret.dat[I] = this->dat[I] + lhs.dat[I]; + } + return ret; +} + +__host__ __device__ cumat3 cumat3::operator-(cumat3 lhs) +{ + int I; + cumat3 ret; + for(I=0;I<9;I++) + { + ret.dat[I] = this->dat[I] - lhs.dat[I]; + } + return ret; +} + +__host__ __device__ cumat3 cumat3::operator*(double lhs) +{ + cumat3 ret; + int I; + for(I=0;I<9;I++) + { + ret.dat[I] = this->dat[I]*lhs; + } + return ret; +} + +__host__ __device__ cumat3 cumat3::operator/(double lhs) +{ + cumat3 ret; + int I; + for(I=0;I<9;I++) + { + ret.dat[I] = this->dat[I]/lhs; + } + return ret; +} + +__host__ __device__ double& cumat3::at(const int I, const int J) +{ + return dat[I+3*J]; +} + + +__host__ __device__ cuvect3 cumat3::operator*(cuvect3 lhs) +{ + cuvect3 ret = cuvect3(0.0,0.0,0.0); + int I,J; + for(I=0;I<3;I++) + { + for(J=0;J<3;J++) + { + ret[I] = ret[I] + this->at(I,J)*lhs[J]; + } + } + return ret; +} + +__host__ __device__ cumat3 cumat3::operator*(cumat3 lhs) +{ + cumat3 ret; + int I,J,K; + + for(I=0;I<3;I++) + { + for(J=0;J<3;J++) + { + ret(I,J) = 0.0; + for(K=0;K<3;K++) + { + ret(I,J) = ret(I,J) + this->at(I,K)*lhs(K,J); + } + } + } + + return ret; +} + +__host__ __device__ double cumat3::det() +{ + double ret = 0.0; + + ret = ret + this->at(0,0)*this->at(1,1)*this->at(2,2); + ret = ret + this->at(0,1)*this->at(1,2)*this->at(2,0); + ret = ret + this->at(0,2)*this->at(1,0)*this->at(2,1); + ret = ret - this->at(0,0)*this->at(1,2)*this->at(2,1); + ret = ret - this->at(0,1)*this->at(1,0)*this->at(2,2); + ret = ret - this->at(0,2)*this->at(1,1)*this->at(2,0); + + return ret; +} + +__host__ __device__ cumat3 cumat3::transpose() +{ + cumat3 q; + int I,J; + for(I=0;I<3;I++) + { + for(J=0;J<3;J++) + { + q.at(I,J) = this->at(J,I); + } + } + return q; +} + +__host__ __device__ cumat3 cumat3::inverse() +{ + cumat3 q; + double dt = q.det(); + if(dt!=0.0) + { + q(0,0) = (this->at(1,1)*this->at(2,2)-this->at(1,2)*this->at(2,1))/dt; + q(0,1) = -(this->at(1,0)*this->at(2,2)-this->at(1,2)*this->at(2,0))/dt; + q(0,2) = (this->at(1,0)*this->at(2,1)-this->at(1,1)*this->at(2,0))/dt; + q(1,0) = -(this->at(0,1)*this->at(2,2)-this->at(0,2)*this->at(2,1))/dt; + q(1,1) = (this->at(0,0)*this->at(2,2)-this->at(0,2)*this->at(2,0))/dt; + q(1,2) = -(this->at(0,0)*this->at(2,1)-this->at(0,1)*this->at(2,0))/dt; + q(2,0) = (this->at(0,1)*this->at(1,2)-this->at(0,2)*this->at(1,1))/dt; + q(2,1) = -(this->at(0,0)*this->at(1,2)-this->at(0,2)*this->at(1,0))/dt; + q(2,2) = (this->at(0,0)*this->at(1,1)-this->at(0,1)*this->at(1,0))/dt; + + q = q.transpose(); + } + else + { + q(0,0) = inf; + q(0,1) = inf; + q(0,2) = inf; + q(1,0) = inf; + q(1,1) = inf; + q(1,2) = inf; + q(2,0) = inf; + q(2,1) = inf; + q(2,2) = inf; + } + + return q; +} + }; \ No newline at end of file diff --git a/src/amsculib2/cuvect3f.cu b/src/amsculib2/cuvect3f.cu index 55bf631..71913d2 100644 --- a/src/amsculib2/cuvect3f.cu +++ b/src/amsculib2/cuvect3f.cu @@ -1,581 +1,590 @@ -#include - -namespace amscuda -{ - - __host__ __device__ cuvect3f::cuvect3f() - { - x = 0.0; y = 0.0; z = 0.0; - return; - } - - __host__ __device__ cuvect3f::~cuvect3f() - { - x = 0.0; y = 0.0; z = 0.0; - return; - } - - __host__ __device__ float& cuvect3f::operator[](const int I) - { - if(I==0) return x; - if(I==1) return y; - if(I==2) return z; - return x; - } - - __host__ __device__ const float& cuvect3f::operator[](const int I) const - { - if(I==0) return x; - if(I==1) return y; - if(I==2) return z; - return x; - } - - __host__ __device__ cuvect3f cuvect3f::operator+(cuvect3f lhs) - { - cuvect3f ret; - ret.x = x+lhs.x; - ret.y = y+lhs.y; - ret.z = z+lhs.z; - - return ret; - } - - __host__ __device__ cuvect3f cuvect3f::operator-(cuvect3f lhs) - { - cuvect3f ret; - ret.x = x-lhs.x; - ret.y = y-lhs.y; - ret.z = z-lhs.z; - - return ret; - } - - __host__ __device__ cuvect3f cuvect3f::operator*(float lhs) - { - cuvect3f ret; - ret.x = x*lhs; - ret.y = y*lhs; - ret.z = z*lhs; - return ret; - } - - __host__ __device__ cuvect3f cuvect3f::operator/(float lhs) - { - cuvect3f ret; - ret.x = x/lhs; - ret.y = y/lhs; - ret.z = z/lhs; - return ret; - } - - __host__ __device__ cuvect3f::cuvect3f(float _x, float _y, float _z) - { - x = _x; y = _y; z = _z; - return; - } - - - __host__ __device__ float cuvect3f_dot(cuvect3f a, cuvect3f b) - { - float ret = a.x*b.x+a.y*b.y+a.z*b.z; - - return ret; - } - - __host__ __device__ cuvect3f cuvect3f_cross(cuvect3f a, cuvect3f b) - { - cuvect3f ret; - ret[0] = a[1]*b[2]-a[2]*b[1]; - ret[1] = a[2]*b[0]-a[0]*b[2]; - ret[2] = a[0]*b[1]-a[1]*b[0]; - - return ret; - } - - __host__ __device__ float cuvect3f_norm(cuvect3f a) - { - float ret; - ret = ::sqrtf(a.x*a.x+a.y*a.y+a.z*a.z); - return ret; - } - - __host__ __device__ cuvect3f cuvect3f_normalize(cuvect3f a) - { - cuvect3f ret; - float m; - m = ::sqrtf(a.x*a.x+a.y*a.y+a.z*a.z); - if(m>0.0) - { - ret.x = a.x/m; ret.y = a.y/m; ret.z = a.z/m; - } - else - { - ret.x = 0.0; ret.y = 0.0; ret.z = 0.0; - } - - return ret; - } - - __host__ __device__ cuvect3f cuvect3f_proj(cuvect3f a, cuvect3f b) - { - cuvect3f ret; - cuvect3f bn = cuvect3f_normalize(b); - float m = cuvect3f_dot(a,bn); - ret = bn*m; - return ret; - } - -//transposes a 3x3 (9 element) matrix -__host__ __device__ void mat3f_transpose(float *mat3inout) -{ - int I,J; - float matint[9]; - for(I=0;I<9;I++) - { - matint[I] = mat3inout[I]; - } - - for(I=0;I<3;I++) - { - for(J=0;J<3;J++) - { - mat3inout[I+J*3] = matint[J+I*3]; - } - } - - return; -} - -//copies src to dest -__host__ __device__ void mat3f_copy(float *mat3f_dest, const float *mat3f_src) -{ - int I; - if(mat3f_dest==NULL || mat3f_src==NULL) - return; - - for(I=0;I<9;I++) - mat3f_dest[I] = mat3f_src[I]; - - return; -} - - -__host__ __device__ float mat3f_det(float *mat3in) -{ - float ret = 0.0; - - ret = ret + mat3in[0+0*3]*mat3in[1+1*3]*mat3in[2+2*3]; - ret = ret + mat3in[0+1*3]*mat3in[1+2*3]*mat3in[2+0*3]; - ret = ret + mat3in[0+2*3]*mat3in[1+0*3]*mat3in[2+1*3]; - ret = ret - mat3in[0+0*3]*mat3in[1+2*3]*mat3in[2+1*3]; - ret = ret - mat3in[0+1*3]*mat3in[1+0*3]*mat3in[2+2*3]; - ret = ret - mat3in[0+2*3]*mat3in[1+1*3]*mat3in[2+0*3]; - - return ret; -} - -//inverts a 3x3 (9 element) matrix -__host__ __device__ void mat3f_inverse(float *mat3inout) -{ - int I; - float matint[9]; - float det = mat3f_det(mat3inout); - - for(I=0;I<9;I++) - { - matint[I] = mat3inout[I]; - } - - mat3inout[0+0*3] = (matint[1+1*3]*matint[2+2*3]-matint[1+2*3]*matint[2+1*3])/det; - mat3inout[0+1*3] = -(matint[1+0*3]*matint[2+2*3]-matint[1+2*3]*matint[2+0*3])/det; - mat3inout[0+2*3] = (matint[1+0*3]*matint[2+1*3]-matint[1+1*3]*matint[2+0*3])/det; - mat3inout[1+0*3] = -(matint[0+1*3]*matint[2+2*3]-matint[0+2*3]*matint[2+1*3])/det; - mat3inout[1+1*3] = (matint[0+0*3]*matint[2+2*3]-matint[0+2*3]*matint[2+0*3])/det; - mat3inout[1+2*3] = -(matint[0+0*3]*matint[2+1*3]-matint[0+1*3]*matint[2+0*3])/det; - mat3inout[2+0*3] = (matint[0+1*3]*matint[1+2*3]-matint[0+2*3]*matint[1+1*3])/det; - mat3inout[2+1*3] = -(matint[0+0*3]*matint[1+2*3]-matint[0+2*3]*matint[1+0*3])/det; - mat3inout[2+2*3] = (matint[0+0*3]*matint[1+1*3]-matint[0+1*3]*matint[1+0*3])/det; - - mat3f_transpose(mat3inout); - - return; -} - -__host__ __device__ cuvect3f mat3f_mult(float *mat3in, cuvect3f cvin) -{ - int I,J; - cuvect3f ret; - for(I=0;I<3;I++) - { - ret[I] = 0.0; - for(J=0;J<3;J++) - { - ret[I] = ret[I] + mat3in[I+3*J]*cvin[J]; - } - } - - return ret; -} - -__host__ __device__ void mat3f_mult(float *matina, float *matinb, float *matout) -{ - float wrk[9]; - int I,J,K; - - for(I=0;I<3;I++) - { - for(J=0;J<3;J++) - { - wrk[I+3*J] = 0.0; - } - } - - for(I=0;I<3;I++) - { - for(J=0;J<3;J++) - { - for(K=0;K<3;K++) - { - wrk[I+3*K] = wrk[I+3*K] + matina[I+3*J]*matinb[J+3*K]; - } - } - } - - for(I=0;I<3;I++) - { - for(J=0;J<3;J++) - { - matout[I+3*J] = wrk[I+3*J]; - } - } - - return; -} - -__host__ __device__ void mat3f_hodgedual(cuvect3f vecin, float *matout) -{ - matout[0 + 0*3] = 0.0f; - matout[1 + 0*3] = -vecin[2]; - matout[2 + 0*3] = vecin[1]; - - matout[0 + 1*3] = vecin[2]; - matout[1 + 1*3] = 0.0f; - matout[2 + 1*3] = -vecin[0]; - - matout[0 + 2*3] = -vecin[1]; - matout[1 + 2*3] = vecin[0]; - matout[2 + 2*3] = 0.0f; - return; -} - -__host__ __device__ void mat3f_hodgedual(float *matin, cuvect3f vecout) -{ - vecout[0] = 0.5*(matin[1 + 2*3] - matin[2 + 1*3]); - vecout[1] = 0.5*(matin[2 + 0*3] - matin[0 + 2*3]); - vecout[2] = 0.5*(matin[0 + 1*3] - matin[1 + 0*3]); - - return; -} - -//returns direction cosine rotation matrix from axis and angle -__host__ __device__ void mat3f_rot_from_axisangle(cuvect3f axis, float angle, float *matout) -{ - int I; - float H[9]; - float Hsq[9]; - float II[9]; - - for(I=0;I<9;I++) II[I] = 0.0; - II[0+0*3] = 1.0; - II[1+1*3] = 1.0; - II[2+2*3] = 1.0; - - axis = cuvect3f_normalize(axis); - - mat3f_hodgedual(axis,H); - mat3f_mult(H,H,Hsq); - - for(I=0;I<9;I++) - { - matout[I] = (II[I] + Hsq[I]) + H[I]*sinf(angle) - Hsq[I]*cosf(angle); - } - - return; -} - - - -__host__ void test_cudavectf_logic1() -{ - //3 dim vector and matrix functional tests on host side - - printf("3 dim vector and matrix functional tests on host side\n"); - - cuvect3f a,b,c; - float ma[9],mb[9],mc[9]; - - int I,J; - - for(I=0;I<3;I++) - { - for(J=0;J<3;J++) - { - ma[I+3*J] = ((float) rand())/((float) RAND_MAX); - mb[I+3*J] = ma[I+3*J]; - } - } - - mat3f_inverse(mb); - mat3f_mult(ma,mb,mc); - - for(I=0;I<3;I++) - { - for(J=0;J<3;J++) - { - printf("ma[%d,%d] = %1.3f\n",I,J,ma[I+3*J]); - } - } - for(I=0;I<3;I++) - { - for(J=0;J<3;J++) - { - printf("mb[%d,%d] = %1.3f\n",I,J,mb[I+3*J]); - } - } - for(I=0;I<3;I++) - { - for(J=0;J<3;J++) - { - printf("mc[%d,%d] = %1.3f\n",I,J,mc[I+3*J]); - } - } - - a = cuvect3f(1,1,1); - b = mat3f_mult(ma,a); - b = mat3f_mult(mb,b); - - for(I=0;I<3;I++) - { - printf("a[%d] = %1.3f, b[%d] = %1.3f\n",I,a[I],I,b[I]); - } - - a = cuvect3f(1,0,1); - b = cuvect3f(0,1,-1); - c = a+b; - - for(I=0;I<3;I++) - { - printf("a[%d] = %1.3f, b[%d] = %1.3f, c[%d] = %1.3f\n",I,a[I],I,b[I],I,c[I]); - } - - c = c/2.0; - - for(I=0;I<3;I++) - { - printf("a[%d] = %1.3f, b[%d] = %1.3f, c[%d] = %1.3f\n",I,a[I],I,b[I],I,c[I]); - } - - c = cuvect3f_cross(a,b); - - for(I=0;I<3;I++) - { - printf("a[%d] = %1.3f, b[%d] = %1.3f, c[%d] = %1.3f\n",I,a[I],I,b[I],I,c[I]); - } - - printf("c dot a = %1.3f, c dot b = %1.3f\n",cuvect3f_dot(c,a),cuvect3f_dot(c,b)); - - printf("norm(a)=%1.3f, norm(b)=%1.3f, norm(c)=%1.3f\n",cuvect3f_norm(a),cuvect3f_norm(b),cuvect3f_norm(c)); - a = cuvect3f_normalize(a); - b = cuvect3f_normalize(b); - c = cuvect3f_normalize(c); - - for(I=0;I<3;I++) - { - printf("a[%d] = %1.3f, b[%d] = %1.3f, c[%d] = %1.3f\n",I,a[I],I,b[I],I,c[I]); - } - printf("c dot a = %1.3f, c dot b = %1.3f\n",cuvect3f_dot(c,a),cuvect3f_dot(c,b)); - printf("norm(a)=%1.3f, norm(b)=%1.3f, norm(c)=%1.3f\n",cuvect3f_norm(a),cuvect3f_norm(b),cuvect3f_norm(c)); - - return; -} - - - -__host__ __device__ cumat3f::cumat3f() -{ - int I; - for(I=0;I<9;I++) - { - dat[I] = 0.0; - } - return; -} - -__host__ __device__ cumat3f::~cumat3f() -{ - int I; - for(I=0;I<9;I++) - { - dat[I] = 0.0; - } - return; -} - -__host__ __device__ float& cumat3f::operator[](const int I) -{ - return dat[I]; -} - -__host__ __device__ float& cumat3f::operator()(const int I, const int J) -{ - return dat[I+3*J]; -} - -__host__ __device__ cumat3f cumat3f::operator+(cumat3f lhs) -{ - int I; - cumat3f ret; - for(I=0;I<9;I++) - { - ret.dat[I] = this->dat[I] + lhs.dat[I]; - } - return ret; -} - -__host__ __device__ cumat3f cumat3f::operator-(cumat3f lhs) -{ - int I; - cumat3f ret; - for(I=0;I<9;I++) - { - ret.dat[I] = this->dat[I] - lhs.dat[I]; - } - return ret; -} - -__host__ __device__ cumat3f cumat3f::operator*(float lhs) -{ - cumat3f ret; - int I; - for(I=0;I<9;I++) - { - ret.dat[I] = this->dat[I]*lhs; - } - return ret; -} - -__host__ __device__ cumat3f cumat3f::operator/(float lhs) -{ - cumat3f ret; - int I; - for(I=0;I<9;I++) - { - ret.dat[I] = this->dat[I]/lhs; - } - return ret; -} - -__host__ __device__ float& cumat3f::at(const int I, const int J) -{ - return dat[I+3*J]; -} - - -__host__ __device__ cuvect3f cumat3f::operator*(cuvect3f lhs) -{ - cuvect3f ret = cuvect3f(0.0,0.0,0.0); - int I,J; - for(I=0;I<3;I++) - { - for(J=0;J<3;J++) - { - ret[I] = ret[I] + this->at(I,J)*lhs[J]; - } - } - return ret; -} - -__host__ __device__ cumat3f cumat3f::operator*(cumat3f lhs) -{ - cumat3f ret; - int I,J,K; - - for(I=0;I<3;I++) - { - for(J=0;J<3;J++) - { - ret(I,J) = 0.0; - for(K=0;K<3;K++) - { - ret(I,J) = ret(I,J) + this->at(I,K)*lhs(K,J); - } - } - } - - return ret; -} - -__host__ __device__ float cumat3f::det() -{ - float ret = 0.0; - - ret = ret + this->at(0,0)*this->at(1,1)*this->at(2,2); - ret = ret + this->at(0,1)*this->at(1,2)*this->at(2,0); - ret = ret + this->at(0,2)*this->at(1,0)*this->at(2,1); - ret = ret - this->at(0,0)*this->at(1,2)*this->at(2,1); - ret = ret - this->at(0,1)*this->at(1,0)*this->at(2,2); - ret = ret - this->at(0,2)*this->at(1,1)*this->at(2,0); - - return ret; -} - -__host__ __device__ cumat3f cumat3f::transpose() -{ - cumat3f q; - int I,J; - for(I=0;I<3;I++) - { - for(J=0;J<3;J++) - { - q.at(I,J) = this->at(J,I); - } - } - return q; -} - -__host__ __device__ cumat3f cumat3f::inverse() -{ - cumat3f q; - float dt = q.det(); - if(dt!=0.0) - { - q(0,0) = (this->at(1,1)*this->at(2,2)-this->at(1,2)*this->at(2,1))/dt; - q(0,1) = -(this->at(1,0)*this->at(2,2)-this->at(1,2)*this->at(2,0))/dt; - q(0,2) = (this->at(1,0)*this->at(2,1)-this->at(1,1)*this->at(2,0))/dt; - q(1,0) = -(this->at(0,1)*this->at(2,2)-this->at(0,2)*this->at(2,1))/dt; - q(1,1) = (this->at(0,0)*this->at(2,2)-this->at(0,2)*this->at(2,0))/dt; - q(1,2) = -(this->at(0,0)*this->at(2,1)-this->at(0,1)*this->at(2,0))/dt; - q(2,0) = (this->at(0,1)*this->at(1,2)-this->at(0,2)*this->at(1,1))/dt; - q(2,1) = -(this->at(0,0)*this->at(1,2)-this->at(0,2)*this->at(1,0))/dt; - q(2,2) = (this->at(0,0)*this->at(1,1)-this->at(0,1)*this->at(1,0))/dt; - - q = q.transpose(); - } - else - { - q(0,0) = inf; - q(0,1) = inf; - q(0,2) = inf; - q(1,0) = inf; - q(1,1) = inf; - q(1,2) = inf; - q(2,0) = inf; - q(2,1) = inf; - q(2,2) = inf; - } - - return q; -} - +#include + +namespace amscuda +{ + + __host__ __device__ cuvect3f::cuvect3f() + { + x = 0.0; y = 0.0; z = 0.0; + return; + } + + __host__ __device__ cuvect3f::~cuvect3f() + { + x = 0.0; y = 0.0; z = 0.0; + return; + } + + __host__ __device__ float& cuvect3f::operator[](const int I) + { + if(I==0) return x; + if(I==1) return y; + if(I==2) return z; + return x; + } + + __host__ __device__ const float& cuvect3f::operator[](const int I) const + { + if(I==0) return x; + if(I==1) return y; + if(I==2) return z; + return x; + } + + __host__ __device__ cuvect3f cuvect3f::operator+(cuvect3f lhs) + { + cuvect3f ret; + ret.x = x+lhs.x; + ret.y = y+lhs.y; + ret.z = z+lhs.z; + + return ret; + } + + __host__ __device__ cuvect3f cuvect3f::operator-(cuvect3f lhs) + { + cuvect3f ret; + ret.x = x-lhs.x; + ret.y = y-lhs.y; + ret.z = z-lhs.z; + + return ret; + } + + __host__ __device__ cuvect3f cuvect3f::operator*(float lhs) + { + cuvect3f ret; + ret.x = x*lhs; + ret.y = y*lhs; + ret.z = z*lhs; + return ret; + } + + __host__ __device__ cuvect3f cuvect3f::operator/(float lhs) + { + cuvect3f ret; + ret.x = x/lhs; + ret.y = y/lhs; + ret.z = z/lhs; + return ret; + } + + __host__ __device__ cuvect3f::cuvect3f(float _x, float _y, float _z) + { + x = _x; y = _y; z = _z; + return; + } + + + __host__ __device__ float cuvect3f_dot(cuvect3f a, cuvect3f b) + { + float ret = a.x*b.x+a.y*b.y+a.z*b.z; + + return ret; + } + + __host__ __device__ cuvect3f cuvect3f_cross(cuvect3f a, cuvect3f b) + { + cuvect3f ret; + ret[0] = a[1]*b[2]-a[2]*b[1]; + ret[1] = a[2]*b[0]-a[0]*b[2]; + ret[2] = a[0]*b[1]-a[1]*b[0]; + + return ret; + } + + __host__ __device__ float cuvect3f_norm(cuvect3f a) + { + float ret; + ret = ::sqrtf(a.x*a.x+a.y*a.y+a.z*a.z); + return ret; + } + + __host__ __device__ cuvect3f cuvect3f_normalize(cuvect3f a) + { + cuvect3f ret; + float m; + m = ::sqrtf(a.x*a.x+a.y*a.y+a.z*a.z); + if(m>0.0) + { + ret.x = a.x/m; ret.y = a.y/m; ret.z = a.z/m; + } + else + { + ret.x = 0.0; ret.y = 0.0; ret.z = 0.0; + } + + return ret; + } + + __host__ __device__ cuvect3f cuvect3f_proj(cuvect3f a, cuvect3f b) + { + cuvect3f ret; + cuvect3f bn = cuvect3f_normalize(b); + float m = cuvect3f_dot(a,bn); + ret = bn*m; + return ret; + } + +//transposes a 3x3 (9 element) matrix +__host__ __device__ void mat3f_transpose(float *mat3inout) +{ + int I,J; + float matint[9]; + for(I=0;I<9;I++) + { + matint[I] = mat3inout[I]; + } + + for(I=0;I<3;I++) + { + for(J=0;J<3;J++) + { + mat3inout[I+J*3] = matint[J+I*3]; + } + } + + return; +} + +//copies src to dest +__host__ __device__ void mat3f_copy(float *mat3f_dest, const float *mat3f_src) +{ + int I; + if(mat3f_dest==NULL || mat3f_src==NULL) + return; + + for(I=0;I<9;I++) + mat3f_dest[I] = mat3f_src[I]; + + return; +} + + +__host__ __device__ float mat3f_det(float *mat3in) +{ + float ret = 0.0; + + ret = ret + mat3in[0+0*3]*mat3in[1+1*3]*mat3in[2+2*3]; + ret = ret + mat3in[0+1*3]*mat3in[1+2*3]*mat3in[2+0*3]; + ret = ret + mat3in[0+2*3]*mat3in[1+0*3]*mat3in[2+1*3]; + ret = ret - mat3in[0+0*3]*mat3in[1+2*3]*mat3in[2+1*3]; + ret = ret - mat3in[0+1*3]*mat3in[1+0*3]*mat3in[2+2*3]; + ret = ret - mat3in[0+2*3]*mat3in[1+1*3]*mat3in[2+0*3]; + + return ret; +} + +//inverts a 3x3 (9 element) matrix +__host__ __device__ void mat3f_inverse(float *mat3inout) +{ + int I; + float matint[9]; + float det = mat3f_det(mat3inout); + + for(I=0;I<9;I++) + { + matint[I] = mat3inout[I]; + } + + mat3inout[0+0*3] = (matint[1+1*3]*matint[2+2*3]-matint[1+2*3]*matint[2+1*3])/det; + mat3inout[0+1*3] = -(matint[1+0*3]*matint[2+2*3]-matint[1+2*3]*matint[2+0*3])/det; + mat3inout[0+2*3] = (matint[1+0*3]*matint[2+1*3]-matint[1+1*3]*matint[2+0*3])/det; + mat3inout[1+0*3] = -(matint[0+1*3]*matint[2+2*3]-matint[0+2*3]*matint[2+1*3])/det; + mat3inout[1+1*3] = (matint[0+0*3]*matint[2+2*3]-matint[0+2*3]*matint[2+0*3])/det; + mat3inout[1+2*3] = -(matint[0+0*3]*matint[2+1*3]-matint[0+1*3]*matint[2+0*3])/det; + mat3inout[2+0*3] = (matint[0+1*3]*matint[1+2*3]-matint[0+2*3]*matint[1+1*3])/det; + mat3inout[2+1*3] = -(matint[0+0*3]*matint[1+2*3]-matint[0+2*3]*matint[1+0*3])/det; + mat3inout[2+2*3] = (matint[0+0*3]*matint[1+1*3]-matint[0+1*3]*matint[1+0*3])/det; + + mat3f_transpose(mat3inout); + + return; +} + +__host__ __device__ cuvect3f mat3f_mult(float *mat3in, cuvect3f cvin) +{ + int I,J; + cuvect3f ret; + for(I=0;I<3;I++) + { + ret[I] = 0.0; + for(J=0;J<3;J++) + { + ret[I] = ret[I] + mat3in[I+3*J]*cvin[J]; + } + } + + return ret; +} + +__host__ __device__ void mat3f_mult(float *matina, float *matinb, float *matout) +{ + float wrk[9]; + int I,J,K; + + for(I=0;I<3;I++) + { + for(J=0;J<3;J++) + { + wrk[I+3*J] = 0.0; + } + } + + for(I=0;I<3;I++) + { + for(J=0;J<3;J++) + { + for(K=0;K<3;K++) + { + wrk[I+3*K] = wrk[I+3*K] + matina[I+3*J]*matinb[J+3*K]; + } + } + } + + for(I=0;I<3;I++) + { + for(J=0;J<3;J++) + { + matout[I+3*J] = wrk[I+3*J]; + } + } + + return; +} + +__host__ __device__ void mat3f_hodgedual(cuvect3f vecin, float *matout) +{ + matout[0 + 0*3] = 0.0f; + matout[1 + 0*3] = -vecin[2]; + matout[2 + 0*3] = vecin[1]; + + matout[0 + 1*3] = vecin[2]; + matout[1 + 1*3] = 0.0f; + matout[2 + 1*3] = -vecin[0]; + + matout[0 + 2*3] = -vecin[1]; + matout[1 + 2*3] = vecin[0]; + matout[2 + 2*3] = 0.0f; + return; +} + +__host__ __device__ void mat3f_hodgedual(float *matin, cuvect3f vecout) +{ + vecout[0] = 0.5*(matin[1 + 2*3] - matin[2 + 1*3]); + vecout[1] = 0.5*(matin[2 + 0*3] - matin[0 + 2*3]); + vecout[2] = 0.5*(matin[0 + 1*3] - matin[1 + 0*3]); + + return; +} + +//returns direction cosine rotation matrix from axis and angle +__host__ __device__ void mat3f_rot_from_axisangle(cuvect3f axis, float angle, float *matout) +{ + int I; + float H[9]; + float Hsq[9]; + float II[9]; + + for(I=0;I<9;I++) II[I] = 0.0; + II[0+0*3] = 1.0; + II[1+1*3] = 1.0; + II[2+2*3] = 1.0; + + axis = cuvect3f_normalize(axis); + + mat3f_hodgedual(axis,H); + mat3f_mult(H,H,Hsq); + + for(I=0;I<9;I++) + { + matout[I] = (II[I] + Hsq[I]) + H[I]*sinf(angle) - Hsq[I]*cosf(angle); + } + + return; +} + + + +__host__ void test_cudavectf_logic1() +{ + //3 dim vector and matrix functional tests on host side + + printf("3 dim vector and matrix functional tests on host side\n"); + + cuvect3f a,b,c; + float ma[9],mb[9],mc[9]; + + int I,J; + + for(I=0;I<3;I++) + { + for(J=0;J<3;J++) + { + ma[I+3*J] = ((float) rand())/((float) RAND_MAX); + mb[I+3*J] = ma[I+3*J]; + } + } + + mat3f_inverse(mb); + mat3f_mult(ma,mb,mc); + + for(I=0;I<3;I++) + { + for(J=0;J<3;J++) + { + printf("ma[%d,%d] = %1.3f\n",I,J,ma[I+3*J]); + } + } + for(I=0;I<3;I++) + { + for(J=0;J<3;J++) + { + printf("mb[%d,%d] = %1.3f\n",I,J,mb[I+3*J]); + } + } + for(I=0;I<3;I++) + { + for(J=0;J<3;J++) + { + printf("mc[%d,%d] = %1.3f\n",I,J,mc[I+3*J]); + } + } + + a = cuvect3f(1,1,1); + b = mat3f_mult(ma,a); + b = mat3f_mult(mb,b); + + for(I=0;I<3;I++) + { + printf("a[%d] = %1.3f, b[%d] = %1.3f\n",I,a[I],I,b[I]); + } + + a = cuvect3f(1,0,1); + b = cuvect3f(0,1,-1); + c = a+b; + + for(I=0;I<3;I++) + { + printf("a[%d] = %1.3f, b[%d] = %1.3f, c[%d] = %1.3f\n",I,a[I],I,b[I],I,c[I]); + } + + c = c/2.0; + + for(I=0;I<3;I++) + { + printf("a[%d] = %1.3f, b[%d] = %1.3f, c[%d] = %1.3f\n",I,a[I],I,b[I],I,c[I]); + } + + c = cuvect3f_cross(a,b); + + for(I=0;I<3;I++) + { + printf("a[%d] = %1.3f, b[%d] = %1.3f, c[%d] = %1.3f\n",I,a[I],I,b[I],I,c[I]); + } + + printf("c dot a = %1.3f, c dot b = %1.3f\n",cuvect3f_dot(c,a),cuvect3f_dot(c,b)); + + printf("norm(a)=%1.3f, norm(b)=%1.3f, norm(c)=%1.3f\n",cuvect3f_norm(a),cuvect3f_norm(b),cuvect3f_norm(c)); + a = cuvect3f_normalize(a); + b = cuvect3f_normalize(b); + c = cuvect3f_normalize(c); + + for(I=0;I<3;I++) + { + printf("a[%d] = %1.3f, b[%d] = %1.3f, c[%d] = %1.3f\n",I,a[I],I,b[I],I,c[I]); + } + printf("c dot a = %1.3f, c dot b = %1.3f\n",cuvect3f_dot(c,a),cuvect3f_dot(c,b)); + printf("norm(a)=%1.3f, norm(b)=%1.3f, norm(c)=%1.3f\n",cuvect3f_norm(a),cuvect3f_norm(b),cuvect3f_norm(c)); + + return; +} + + + +__host__ __device__ cumat3f::cumat3f() +{ + int I; + for(I=0;I<9;I++) + { + dat[I] = 0.0; + } + return; +} + +__host__ __device__ cumat3f::~cumat3f() +{ + int I; + for(I=0;I<9;I++) + { + dat[I] = 0.0; + } + return; +} + +__host__ __device__ float& cumat3f::operator[](const int I) +{ + return dat[I]; +} + +__host__ __device__ float& cumat3f::operator()(const int I, const int J) +{ + return dat[I+3*J]; +} + +__host__ __device__ cumat3f cumat3f::operator+(cumat3f lhs) +{ + int I; + cumat3f ret; + for(I=0;I<9;I++) + { + ret.dat[I] = this->dat[I] + lhs.dat[I]; + } + return ret; +} + +__host__ __device__ cumat3f cumat3f::operator-(cumat3f lhs) +{ + int I; + cumat3f ret; + for(I=0;I<9;I++) + { + ret.dat[I] = this->dat[I] - lhs.dat[I]; + } + return ret; +} + +__host__ __device__ cumat3f cumat3f::operator*(float lhs) +{ + cumat3f ret; + int I; + for(I=0;I<9;I++) + { + ret.dat[I] = this->dat[I]*lhs; + } + return ret; +} + +__host__ __device__ cumat3f cumat3f::operator/(float lhs) +{ + cumat3f ret; + int I; + for(I=0;I<9;I++) + { + ret.dat[I] = this->dat[I]/lhs; + } + return ret; +} + +__host__ __device__ float& cumat3f::at(const int I, const int J) +{ + return dat[I+3*J]; +} + + +__host__ __device__ cuvect3f cumat3f::operator*(cuvect3f lhs) +{ + cuvect3f ret = cuvect3f(0.0,0.0,0.0); + int I,J; + for(I=0;I<3;I++) + { + for(J=0;J<3;J++) + { + ret[I] = ret[I] + this->at(I,J)*lhs[J]; + } + } + return ret; +} + +__host__ __device__ cumat3f cumat3f::operator*(cumat3f lhs) +{ + cumat3f ret; + int I,J,K; + + for(I=0;I<3;I++) + { + for(J=0;J<3;J++) + { + ret(I,J) = 0.0; + for(K=0;K<3;K++) + { + ret(I,J) = ret(I,J) + this->at(I,K)*lhs(K,J); + } + } + } + + return ret; +} + +__host__ __device__ float cumat3f::det() +{ + float ret = 0.0; + + ret = ret + this->at(0,0)*this->at(1,1)*this->at(2,2); + ret = ret + this->at(0,1)*this->at(1,2)*this->at(2,0); + ret = ret + this->at(0,2)*this->at(1,0)*this->at(2,1); + ret = ret - this->at(0,0)*this->at(1,2)*this->at(2,1); + ret = ret - this->at(0,1)*this->at(1,0)*this->at(2,2); + ret = ret - this->at(0,2)*this->at(1,1)*this->at(2,0); + + return ret; +} + +__host__ __device__ cumat3f cumat3f::transpose() +{ + cumat3f q; + int I,J; + for(I=0;I<3;I++) + { + for(J=0;J<3;J++) + { + q.at(I,J) = this->at(J,I); + } + } + return q; +} + +__host__ __device__ cumat3f cumat3f::inverse() +{ + cumat3f q; + float dt = q.det(); + if(dt!=0.0) + { + q(0,0) = (this->at(1,1)*this->at(2,2)-this->at(1,2)*this->at(2,1))/dt; + q(0,1) = -(this->at(1,0)*this->at(2,2)-this->at(1,2)*this->at(2,0))/dt; + q(0,2) = (this->at(1,0)*this->at(2,1)-this->at(1,1)*this->at(2,0))/dt; + q(1,0) = -(this->at(0,1)*this->at(2,2)-this->at(0,2)*this->at(2,1))/dt; + q(1,1) = (this->at(0,0)*this->at(2,2)-this->at(0,2)*this->at(2,0))/dt; + q(1,2) = -(this->at(0,0)*this->at(2,1)-this->at(0,1)*this->at(2,0))/dt; + q(2,0) = (this->at(0,1)*this->at(1,2)-this->at(0,2)*this->at(1,1))/dt; + q(2,1) = -(this->at(0,0)*this->at(1,2)-this->at(0,2)*this->at(1,0))/dt; + q(2,2) = (this->at(0,0)*this->at(1,1)-this->at(0,1)*this->at(1,0))/dt; + + q = q.transpose(); + } + else + { + q(0,0) = inf; + q(0,1) = inf; + q(0,2) = inf; + q(1,0) = inf; + q(1,1) = inf; + q(1,2) = inf; + q(2,0) = inf; + q(2,1) = inf; + q(2,2) = inf; + } + + return q; +} + +__host__ __device__ cuvect3f operator-(cuvect3f rhs) +{ + cuvect3f ret; + ret[0] = -rhs[0]; + ret[1] = -rhs[1]; + ret[2] = -rhs[2]; + return ret; +} + }; \ No newline at end of file diff --git a/src/amsculib2/cuvect4.cu b/src/amsculib2/cuvect4.cu index f6bced8..bc2655f 100644 --- a/src/amsculib2/cuvect4.cu +++ b/src/amsculib2/cuvect4.cu @@ -1,414 +1,414 @@ -#include - -namespace amscuda -{ - -__host__ __device__ cuvect4::cuvect4() -{ - x = 0.0; y = 0.0; z = 0.0; w = 0.0; - return; -} - -__host__ __device__ cuvect4::~cuvect4() -{ - x = 0.0; y = 0.0; z = 0.0; w = 0.0; - return; -} - -__host__ __device__ cuvect4::cuvect4(double _x, double _y, double _z, double _w) -{ - x = _x; y = _y; z = _z; w = _w; - return; -} - -__host__ __device__ double& cuvect4::operator[](const int I) -{ - if(I==0) return x; - else if(I==1) return y; - else if(I==2) return z; - else if(I==3) return w; - return x; -} - -__host__ __device__ const double& cuvect4::operator[](const int I) const -{ - if(I==0) return x; - else if(I==1) return y; - else if(I==2) return z; - else if(I==3) return w; - return x; -} - -__host__ __device__ cuvect4 cuvect4::operator+(cuvect4 lhs) -{ - cuvect4 ret; - ret.x = this->x + lhs.x; - ret.y = this->y + lhs.y; - ret.z = this->z + lhs.z; - ret.w = this->w + lhs.w; - return ret; -} - -__host__ __device__ cuvect4 cuvect4::operator-(cuvect4 lhs) -{ - cuvect4 ret; - ret.x = this->x - lhs.x; - ret.y = this->y - lhs.y; - ret.z = this->z - lhs.z; - ret.w = this->w - lhs.w; - return ret; -} - -__host__ __device__ cuvect4 cuvect4::operator*(double lhs) -{ - cuvect4 ret; - ret.x = this->x*lhs; - ret.y = this->y*lhs; - ret.z = this->z*lhs; - ret.w = this->w*lhs; - return ret; -} - -__host__ __device__ cuvect4 cuvect4::operator/(double lhs) -{ - cuvect4 ret; - ret.x = this->x/lhs; - ret.y = this->y/lhs; - ret.z = this->z/lhs; - ret.w = this->w/lhs; - return ret; -} - -__host__ __device__ cumat4::cumat4() -{ - int I; - for(I=0;I<16;I++) - { - dat[I] = 0.0; - } - return; -} - -__host__ __device__ cumat4::~cumat4() -{ - int I; - for(I=0;I<16;I++) - { - dat[I] = 0.0; - } - return; -} - -__host__ __device__ double& cumat4::operator[](const int I) -{ - return dat[I]; -} - -__host__ __device__ double& cumat4::operator()(const int I, const int J) -{ - return dat[I+4*J]; -} - -__host__ __device__ double& cumat4::at(const int I, const int J) -{ - return dat[I+4*J]; -} - -__host__ __device__ cumat4 cumat4::operator+(cumat4 lhs) -{ - cumat4 ret; - int I; - for(I=0;I<16;I++) - { - ret.dat[I] = this->dat[I] + lhs.dat[I]; - } - return ret; -} - -__host__ __device__ cumat4 cumat4::operator-(cumat4 lhs) -{ - cumat4 ret; - int I; - for(I=0;I<16;I++) - { - ret.dat[I] = this->dat[I] - lhs.dat[I]; - } - return ret; -} - -__host__ __device__ cumat4 cumat4::operator*(double lhs) -{ - cumat4 ret; - int I; - for(I=0;I<16;I++) - { - ret.dat[I] = this->dat[I]*lhs; - } - return ret; -} - -__host__ __device__ cumat4 cumat4::operator/(double lhs) -{ - cumat4 ret; - int I; - for(I=0;I<16;I++) - { - ret.dat[I] = this->dat[I]/lhs; - } - return ret; -} - -__host__ __device__ cuvect4 cumat4::operator*(cuvect4 lhs) -{ - cuvect4 ret = cuvect4(0.0,0.0,0.0,0.0); - int I,J; - - for(I=0;I<4;I++) - { - for(J=0;J<4;J++) - { - ret[I] = ret[I] + this->at(I,J)*lhs[J]; - } - } - - return ret; -} - -__host__ __device__ cumat4 cumat4::operator*(cumat4 lhs) -{ - cumat4 ret; - int I,J,K; - for(I=0;I<4;I++) - { - for(J=0;J<4;J++) - { - ret(I,J) = 0; - for(K=0;K<4;K++) - { - ret(I,J) = ret(I,J) + this->at(I,K) * lhs(K,J); - } - } - } - return ret; -} - -__host__ __device__ cumat4 cumat4::transpose() -{ - cumat4 q; - int I,J; - for(I=0;I<4;I++) - { - for(J=0;J<4;J++) - { - q(I,J) = this->at(J,I); - } - } - return q; -} - -__host__ __device__ double cumat4::det() -{ - double a00,a01,a02,a03; - double a10,a11,a12,a13; - double a20,a21,a22,a23; - double a30,a31,a32,a33; - double det; - - a00 = this->at(0,0); - a01 = this->at(0,1); - a02 = this->at(0,2); - a03 = this->at(0,3); - a10 = this->at(1,0); - a11 = this->at(1,1); - a12 = this->at(1,2); - a13 = this->at(1,3); - a20 = this->at(2,0); - a21 = this->at(2,1); - a22 = this->at(2,2); - a23 = this->at(2,3); - a30 = this->at(3,0); - a31 = this->at(3,1); - a32 = this->at(3,2); - a33 = this->at(3,3); - - det = a03*a12*a21*a30 - - a02*a13*a21*a30 - - a03*a11*a22*a30 + - a01*a13*a22*a30 + - a02*a11*a23*a30 - - a01*a12*a23*a30 - - a03*a12*a20*a31 + - a02*a13*a20*a31 + - a03*a10*a22*a31 - - a00*a13*a22*a31 - - a02*a10*a23*a31 + - a00*a12*a23*a31 + - a03*a11*a20*a32 - - a01*a13*a20*a32 - - a03*a10*a21*a32 + - a00*a13*a21*a32 + - a01*a10*a23*a32 - - a00*a11*a23*a32 - - a02*a11*a20*a33 + - a01*a12*a20*a33 + - a02*a10*a21*a33 - - a00*a12*a21*a33 - - a01*a10*a22*a33 + - a00*a11*a22*a33; - - return det; -} - -__host__ __device__ cumat4 minverse(cumat4 ma) -{ - cumat4 mb; - - double a00,a01,a02,a03; - double a10,a11,a12,a13; - double a20,a21,a22,a23; - double a30,a31,a32,a33; - - double b00,b01,b02,b03; - double b10,b11,b12,b13; - double b20,b21,b22,b23; - double b30,b31,b32,b33; - - double det = 0.0; - - a00 = ma.at(0,0); - a01 = ma.at(0,1); - a02 = ma.at(0,2); - a03 = ma.at(0,3); - a10 = ma.at(1,0); - a11 = ma.at(1,1); - a12 = ma.at(1,2); - a13 = ma.at(1,3); - a20 = ma.at(2,0); - a21 = ma.at(2,1); - a22 = ma.at(2,2); - a23 = ma.at(2,3); - a30 = ma.at(3,0); - a31 = ma.at(3,1); - a32 = ma.at(3,2); - a33 = ma.at(3,3); - - det = a03*a12*a21*a30 - - a02*a13*a21*a30 - - a03*a11*a22*a30 + - a01*a13*a22*a30 + - a02*a11*a23*a30 - - a01*a12*a23*a30 - - a03*a12*a20*a31 + - a02*a13*a20*a31 + - a03*a10*a22*a31 - - a00*a13*a22*a31 - - a02*a10*a23*a31 + - a00*a12*a23*a31 + - a03*a11*a20*a32 - - a01*a13*a20*a32 - - a03*a10*a21*a32 + - a00*a13*a21*a32 + - a01*a10*a23*a32 - - a00*a11*a23*a32 - - a02*a11*a20*a33 + - a01*a12*a20*a33 + - a02*a10*a21*a33 - - a00*a12*a21*a33 - - a01*a10*a22*a33 + - a00*a11*a22*a33; - - if(det*det>1.0E-30) - { - b00 = -a13*a22*a31 + a12*a23*a31 + a13*a21*a32 - a11*a23*a32 - a12*a21*a33 + a11*a22*a33; - b01 = a03*a22*a31 - a02*a23*a31 - a03*a21*a32 + a01*a23*a32 + a02*a21*a33 - a01*a22*a33; - b02 = -a03*a12*a31 + a02*a13*a31 + a03*a11*a32 - a01*a13*a32 - a02*a11*a33 + a01*a12*a33; - b03 = a03*a12*a21 - a02*a13*a21 - a03*a11*a22 + a01*a13*a22 + a02*a11*a23 - a01*a12*a23; - b10 = a13*a22*a30 - a12*a23*a30 - a13*a20*a32 + a10*a23*a32 + a12*a20*a33 - a10*a22*a33; - b11 = -a03*a22*a30 + a02*a23*a30 + a03*a20*a32 - a00*a23*a32 - a02*a20*a33 + a00*a22*a33; - b12 = a03*a12*a30 - a02*a13*a30 - a03*a10*a32 + a00*a13*a32 + a02*a10*a33 - a00*a12*a33; - b13 = -a03*a12*a20 + a02*a13*a20 + a03*a10*a22 - a00*a13*a22 - a02*a10*a23 + a00*a12*a23; - b20 = -a13*a21*a30 + a11*a23*a30 + a13*a20*a31 - a10*a23*a31 - a11*a20*a33 + a10*a21*a33; - b21 = a03*a21*a30 - a01*a23*a30 - a03*a20*a31 + a00*a23*a31 + a01*a20*a33 - a00*a21*a33; - b22 = -a03*a11*a30 + a01*a13*a30 + a03*a10*a31 - a00*a13*a31 - a01*a10*a33 + a00*a11*a33; - b23 = a03*a11*a20 - a01*a13*a20 - a03*a10*a21 + a00*a13*a21 + a01*a10*a23 - a00*a11*a23; - b30 = a12*a21*a30 - a11*a22*a30 - a12*a20*a31 + a10*a22*a31 + a11*a20*a32 - a10*a21*a32; - b31 = -a02*a21*a30 + a01*a22*a30 + a02*a20*a31 - a00*a22*a31 - a01*a20*a32 + a00*a21*a32; - b32 = a02*a11*a30 - a01*a12*a30 - a02*a10*a31 + a00*a12*a31 + a01*a10*a32 - a00*a11*a32; - b33 = -a02*a11*a20 + a01*a12*a20 + a02*a10*a21 - a00*a12*a21 - a01*a10*a22 + a00*a11*a22; - b00 = b00/det; - b01 = b01/det; - b02 = b02/det; - b03 = b03/det; - b10 = b10/det; - b11 = b11/det; - b12 = b12/det; - b13 = b13/det; - b20 = b20/det; - b21 = b21/det; - b22 = b22/det; - b23 = b23/det; - b30 = b30/det; - b31 = b31/det; - b32 = b32/det; - b33 = b33/det; - mb.at(0,0) = b00; - mb.at(0,1) = b01; - mb.at(0,2) = b02; - mb.at(0,3) = b03; - mb.at(1,0) = b10; - mb.at(1,1) = b11; - mb.at(1,2) = b12; - mb.at(1,3) = b13; - mb.at(2,0) = b20; - mb.at(2,1) = b21; - mb.at(2,2) = b22; - mb.at(2,3) = b23; - mb.at(3,0) = b30; - mb.at(3,1) = b31; - mb.at(3,2) = b32; - mb.at(3,3) = b33; - } - //this was STUPID. Gaah. Computer algebra system saves the day? I'd be surprised if this didn't end up *slower* than gaussian elimination. Don't do this again! - return mb; -} - -__host__ __device__ cumat4 cumat4::inverse() -{ - return minverse(*this); -} - - -__host__ __device__ double cuvect4_dot(cuvect4 a, cuvect4 b) -{ - double ret = 0.0; - ret = a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w; - return ret; -} - -__host__ __device__ double cuvect4_norm(cuvect4 a) -{ - double ret = 0.0; - ret = ::sqrt(cuvect4_dot(a,a)); - return ret; -} - -__host__ __device__ cuvect4 cuvect4_normalize(cuvect4 a) -{ - cuvect4 ret = cuvect4(0.0f,0.0f,0.0f,0.0f); - double nrm = cuvect4_norm(a); - if(nrm>0.0) - ret = a/nrm; - return ret; -} - -__host__ __device__ cuvect4 cuvect4_proj(cuvect4 a, cuvect4 b) -{ - cuvect4 ret; - cuvect4 bn = cuvect4_normalize(b); - double d = cuvect4_dot(a,bn); - ret = bn*d; - return ret; -} - - +#include + +namespace amscuda +{ + +__host__ __device__ cuvect4::cuvect4() +{ + x = 0.0; y = 0.0; z = 0.0; w = 0.0; + return; +} + +__host__ __device__ cuvect4::~cuvect4() +{ + x = 0.0; y = 0.0; z = 0.0; w = 0.0; + return; +} + +__host__ __device__ cuvect4::cuvect4(double _x, double _y, double _z, double _w) +{ + x = _x; y = _y; z = _z; w = _w; + return; +} + +__host__ __device__ double& cuvect4::operator[](const int I) +{ + if(I==0) return x; + else if(I==1) return y; + else if(I==2) return z; + else if(I==3) return w; + return x; +} + +__host__ __device__ const double& cuvect4::operator[](const int I) const +{ + if(I==0) return x; + else if(I==1) return y; + else if(I==2) return z; + else if(I==3) return w; + return x; +} + +__host__ __device__ cuvect4 cuvect4::operator+(cuvect4 lhs) +{ + cuvect4 ret; + ret.x = this->x + lhs.x; + ret.y = this->y + lhs.y; + ret.z = this->z + lhs.z; + ret.w = this->w + lhs.w; + return ret; +} + +__host__ __device__ cuvect4 cuvect4::operator-(cuvect4 lhs) +{ + cuvect4 ret; + ret.x = this->x - lhs.x; + ret.y = this->y - lhs.y; + ret.z = this->z - lhs.z; + ret.w = this->w - lhs.w; + return ret; +} + +__host__ __device__ cuvect4 cuvect4::operator*(double lhs) +{ + cuvect4 ret; + ret.x = this->x*lhs; + ret.y = this->y*lhs; + ret.z = this->z*lhs; + ret.w = this->w*lhs; + return ret; +} + +__host__ __device__ cuvect4 cuvect4::operator/(double lhs) +{ + cuvect4 ret; + ret.x = this->x/lhs; + ret.y = this->y/lhs; + ret.z = this->z/lhs; + ret.w = this->w/lhs; + return ret; +} + +__host__ __device__ cumat4::cumat4() +{ + int I; + for(I=0;I<16;I++) + { + dat[I] = 0.0; + } + return; +} + +__host__ __device__ cumat4::~cumat4() +{ + int I; + for(I=0;I<16;I++) + { + dat[I] = 0.0; + } + return; +} + +__host__ __device__ double& cumat4::operator[](const int I) +{ + return dat[I]; +} + +__host__ __device__ double& cumat4::operator()(const int I, const int J) +{ + return dat[I+4*J]; +} + +__host__ __device__ double& cumat4::at(const int I, const int J) +{ + return dat[I+4*J]; +} + +__host__ __device__ cumat4 cumat4::operator+(cumat4 lhs) +{ + cumat4 ret; + int I; + for(I=0;I<16;I++) + { + ret.dat[I] = this->dat[I] + lhs.dat[I]; + } + return ret; +} + +__host__ __device__ cumat4 cumat4::operator-(cumat4 lhs) +{ + cumat4 ret; + int I; + for(I=0;I<16;I++) + { + ret.dat[I] = this->dat[I] - lhs.dat[I]; + } + return ret; +} + +__host__ __device__ cumat4 cumat4::operator*(double lhs) +{ + cumat4 ret; + int I; + for(I=0;I<16;I++) + { + ret.dat[I] = this->dat[I]*lhs; + } + return ret; +} + +__host__ __device__ cumat4 cumat4::operator/(double lhs) +{ + cumat4 ret; + int I; + for(I=0;I<16;I++) + { + ret.dat[I] = this->dat[I]/lhs; + } + return ret; +} + +__host__ __device__ cuvect4 cumat4::operator*(cuvect4 lhs) +{ + cuvect4 ret = cuvect4(0.0,0.0,0.0,0.0); + int I,J; + + for(I=0;I<4;I++) + { + for(J=0;J<4;J++) + { + ret[I] = ret[I] + this->at(I,J)*lhs[J]; + } + } + + return ret; +} + +__host__ __device__ cumat4 cumat4::operator*(cumat4 lhs) +{ + cumat4 ret; + int I,J,K; + for(I=0;I<4;I++) + { + for(J=0;J<4;J++) + { + ret(I,J) = 0; + for(K=0;K<4;K++) + { + ret(I,J) = ret(I,J) + this->at(I,K) * lhs(K,J); + } + } + } + return ret; +} + +__host__ __device__ cumat4 cumat4::transpose() +{ + cumat4 q; + int I,J; + for(I=0;I<4;I++) + { + for(J=0;J<4;J++) + { + q(I,J) = this->at(J,I); + } + } + return q; +} + +__host__ __device__ double cumat4::det() +{ + double a00,a01,a02,a03; + double a10,a11,a12,a13; + double a20,a21,a22,a23; + double a30,a31,a32,a33; + double det; + + a00 = this->at(0,0); + a01 = this->at(0,1); + a02 = this->at(0,2); + a03 = this->at(0,3); + a10 = this->at(1,0); + a11 = this->at(1,1); + a12 = this->at(1,2); + a13 = this->at(1,3); + a20 = this->at(2,0); + a21 = this->at(2,1); + a22 = this->at(2,2); + a23 = this->at(2,3); + a30 = this->at(3,0); + a31 = this->at(3,1); + a32 = this->at(3,2); + a33 = this->at(3,3); + + det = a03*a12*a21*a30 - + a02*a13*a21*a30 - + a03*a11*a22*a30 + + a01*a13*a22*a30 + + a02*a11*a23*a30 - + a01*a12*a23*a30 - + a03*a12*a20*a31 + + a02*a13*a20*a31 + + a03*a10*a22*a31 - + a00*a13*a22*a31 - + a02*a10*a23*a31 + + a00*a12*a23*a31 + + a03*a11*a20*a32 - + a01*a13*a20*a32 - + a03*a10*a21*a32 + + a00*a13*a21*a32 + + a01*a10*a23*a32 - + a00*a11*a23*a32 - + a02*a11*a20*a33 + + a01*a12*a20*a33 + + a02*a10*a21*a33 - + a00*a12*a21*a33 - + a01*a10*a22*a33 + + a00*a11*a22*a33; + + return det; +} + +__host__ __device__ cumat4 minverse(cumat4 ma) +{ + cumat4 mb; + + double a00,a01,a02,a03; + double a10,a11,a12,a13; + double a20,a21,a22,a23; + double a30,a31,a32,a33; + + double b00,b01,b02,b03; + double b10,b11,b12,b13; + double b20,b21,b22,b23; + double b30,b31,b32,b33; + + double det = 0.0; + + a00 = ma.at(0,0); + a01 = ma.at(0,1); + a02 = ma.at(0,2); + a03 = ma.at(0,3); + a10 = ma.at(1,0); + a11 = ma.at(1,1); + a12 = ma.at(1,2); + a13 = ma.at(1,3); + a20 = ma.at(2,0); + a21 = ma.at(2,1); + a22 = ma.at(2,2); + a23 = ma.at(2,3); + a30 = ma.at(3,0); + a31 = ma.at(3,1); + a32 = ma.at(3,2); + a33 = ma.at(3,3); + + det = a03*a12*a21*a30 - + a02*a13*a21*a30 - + a03*a11*a22*a30 + + a01*a13*a22*a30 + + a02*a11*a23*a30 - + a01*a12*a23*a30 - + a03*a12*a20*a31 + + a02*a13*a20*a31 + + a03*a10*a22*a31 - + a00*a13*a22*a31 - + a02*a10*a23*a31 + + a00*a12*a23*a31 + + a03*a11*a20*a32 - + a01*a13*a20*a32 - + a03*a10*a21*a32 + + a00*a13*a21*a32 + + a01*a10*a23*a32 - + a00*a11*a23*a32 - + a02*a11*a20*a33 + + a01*a12*a20*a33 + + a02*a10*a21*a33 - + a00*a12*a21*a33 - + a01*a10*a22*a33 + + a00*a11*a22*a33; + + if(det*det>1.0E-30) + { + b00 = -a13*a22*a31 + a12*a23*a31 + a13*a21*a32 - a11*a23*a32 - a12*a21*a33 + a11*a22*a33; + b01 = a03*a22*a31 - a02*a23*a31 - a03*a21*a32 + a01*a23*a32 + a02*a21*a33 - a01*a22*a33; + b02 = -a03*a12*a31 + a02*a13*a31 + a03*a11*a32 - a01*a13*a32 - a02*a11*a33 + a01*a12*a33; + b03 = a03*a12*a21 - a02*a13*a21 - a03*a11*a22 + a01*a13*a22 + a02*a11*a23 - a01*a12*a23; + b10 = a13*a22*a30 - a12*a23*a30 - a13*a20*a32 + a10*a23*a32 + a12*a20*a33 - a10*a22*a33; + b11 = -a03*a22*a30 + a02*a23*a30 + a03*a20*a32 - a00*a23*a32 - a02*a20*a33 + a00*a22*a33; + b12 = a03*a12*a30 - a02*a13*a30 - a03*a10*a32 + a00*a13*a32 + a02*a10*a33 - a00*a12*a33; + b13 = -a03*a12*a20 + a02*a13*a20 + a03*a10*a22 - a00*a13*a22 - a02*a10*a23 + a00*a12*a23; + b20 = -a13*a21*a30 + a11*a23*a30 + a13*a20*a31 - a10*a23*a31 - a11*a20*a33 + a10*a21*a33; + b21 = a03*a21*a30 - a01*a23*a30 - a03*a20*a31 + a00*a23*a31 + a01*a20*a33 - a00*a21*a33; + b22 = -a03*a11*a30 + a01*a13*a30 + a03*a10*a31 - a00*a13*a31 - a01*a10*a33 + a00*a11*a33; + b23 = a03*a11*a20 - a01*a13*a20 - a03*a10*a21 + a00*a13*a21 + a01*a10*a23 - a00*a11*a23; + b30 = a12*a21*a30 - a11*a22*a30 - a12*a20*a31 + a10*a22*a31 + a11*a20*a32 - a10*a21*a32; + b31 = -a02*a21*a30 + a01*a22*a30 + a02*a20*a31 - a00*a22*a31 - a01*a20*a32 + a00*a21*a32; + b32 = a02*a11*a30 - a01*a12*a30 - a02*a10*a31 + a00*a12*a31 + a01*a10*a32 - a00*a11*a32; + b33 = -a02*a11*a20 + a01*a12*a20 + a02*a10*a21 - a00*a12*a21 - a01*a10*a22 + a00*a11*a22; + b00 = b00/det; + b01 = b01/det; + b02 = b02/det; + b03 = b03/det; + b10 = b10/det; + b11 = b11/det; + b12 = b12/det; + b13 = b13/det; + b20 = b20/det; + b21 = b21/det; + b22 = b22/det; + b23 = b23/det; + b30 = b30/det; + b31 = b31/det; + b32 = b32/det; + b33 = b33/det; + mb.at(0,0) = b00; + mb.at(0,1) = b01; + mb.at(0,2) = b02; + mb.at(0,3) = b03; + mb.at(1,0) = b10; + mb.at(1,1) = b11; + mb.at(1,2) = b12; + mb.at(1,3) = b13; + mb.at(2,0) = b20; + mb.at(2,1) = b21; + mb.at(2,2) = b22; + mb.at(2,3) = b23; + mb.at(3,0) = b30; + mb.at(3,1) = b31; + mb.at(3,2) = b32; + mb.at(3,3) = b33; + } + //this was STUPID. Gaah. Computer algebra system saves the day? I'd be surprised if this didn't end up *slower* than gaussian elimination. Don't do this again! + return mb; +} + +__host__ __device__ cumat4 cumat4::inverse() +{ + return minverse(*this); +} + + +__host__ __device__ double cuvect4_dot(cuvect4 a, cuvect4 b) +{ + double ret = 0.0; + ret = a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w; + return ret; +} + +__host__ __device__ double cuvect4_norm(cuvect4 a) +{ + double ret = 0.0; + ret = ::sqrt(cuvect4_dot(a,a)); + return ret; +} + +__host__ __device__ cuvect4 cuvect4_normalize(cuvect4 a) +{ + cuvect4 ret = cuvect4(0.0f,0.0f,0.0f,0.0f); + double nrm = cuvect4_norm(a); + if(nrm>0.0) + ret = a/nrm; + return ret; +} + +__host__ __device__ cuvect4 cuvect4_proj(cuvect4 a, cuvect4 b) +{ + cuvect4 ret; + cuvect4 bn = cuvect4_normalize(b); + double d = cuvect4_dot(a,bn); + ret = bn*d; + return ret; +} + + }; \ No newline at end of file diff --git a/src/amsculib2/cuvect4f.cu b/src/amsculib2/cuvect4f.cu index ba170a2..6701afa 100644 --- a/src/amsculib2/cuvect4f.cu +++ b/src/amsculib2/cuvect4f.cu @@ -1,417 +1,427 @@ -#include - -namespace amscuda -{ - -//////////// -//cuvect4ff// -//////////// - -__host__ __device__ cuvect4f::cuvect4f() -{ - x = 0.0; y = 0.0; z = 0.0; w = 0.0; - return; -} - -__host__ __device__ cuvect4f::~cuvect4f() -{ - x = 0.0; y = 0.0; z = 0.0; w = 0.0; - return; -} - -__host__ __device__ cuvect4f::cuvect4f(float _x, float _y, float _z, float _w) -{ - x = _x; y = _y; z = _z; w = _w; - return; -} - -__host__ __device__ float& cuvect4f::operator[](const int I) -{ - if(I==0) return x; - else if(I==1) return y; - else if(I==2) return z; - else if(I==3) return w; - return x; -} - -__host__ __device__ const float& cuvect4f::operator[](const int I) const -{ - if(I==0) return x; - else if(I==1) return y; - else if(I==2) return z; - else if(I==3) return w; - return x; -} - -__host__ __device__ cuvect4f cuvect4f::operator+(cuvect4f lhs) -{ - cuvect4f ret; - ret.x = this->x + lhs.x; - ret.y = this->y + lhs.y; - ret.z = this->z + lhs.z; - ret.w = this->w + lhs.w; - return ret; -} - -__host__ __device__ cuvect4f cuvect4f::operator-(cuvect4f lhs) -{ - cuvect4f ret; - ret.x = this->x - lhs.x; - ret.y = this->y - lhs.y; - ret.z = this->z - lhs.z; - ret.w = this->w - lhs.w; - return ret; -} - -__host__ __device__ cuvect4f cuvect4f::operator*(float lhs) -{ - cuvect4f ret; - ret.x = this->x*lhs; - ret.y = this->y*lhs; - ret.z = this->z*lhs; - ret.w = this->w*lhs; - return ret; -} - -__host__ __device__ cuvect4f cuvect4f::operator/(float lhs) -{ - cuvect4f ret; - ret.x = this->x/lhs; - ret.y = this->y/lhs; - ret.z = this->z/lhs; - ret.w = this->w/lhs; - return ret; -} - -__host__ __device__ cumat4f::cumat4f() -{ - int I; - for(I=0;I<16;I++) - { - dat[I] = 0.0; - } - return; -} - -__host__ __device__ cumat4f::~cumat4f() -{ - int I; - for(I=0;I<16;I++) - { - dat[I] = 0.0; - } - return; -} - -__host__ __device__ float& cumat4f::operator[](const int I) -{ - return dat[I]; -} - -__host__ __device__ float& cumat4f::operator()(const int I, const int J) -{ - return dat[I+4*J]; -} - -__host__ __device__ float& cumat4f::at(const int I, const int J) -{ - return dat[I+4*J]; -} - -__host__ __device__ cumat4f cumat4f::operator+(cumat4f lhs) -{ - cumat4f ret; - int I; - for(I=0;I<16;I++) - { - ret.dat[I] = this->dat[I] + lhs.dat[I]; - } - return ret; -} - -__host__ __device__ cumat4f cumat4f::operator-(cumat4f lhs) -{ - cumat4f ret; - int I; - for(I=0;I<16;I++) - { - ret.dat[I] = this->dat[I] - lhs.dat[I]; - } - return ret; -} - -__host__ __device__ cumat4f cumat4f::operator*(float lhs) -{ - cumat4f ret; - int I; - for(I=0;I<16;I++) - { - ret.dat[I] = this->dat[I]*lhs; - } - return ret; -} - -__host__ __device__ cumat4f cumat4f::operator/(float lhs) -{ - cumat4f ret; - int I; - for(I=0;I<16;I++) - { - ret.dat[I] = this->dat[I]/lhs; - } - return ret; -} - -__host__ __device__ cuvect4f cumat4f::operator*(cuvect4f lhs) -{ - cuvect4f ret = cuvect4f(0.0,0.0,0.0,0.0); - int I,J; - - for(I=0;I<4;I++) - { - for(J=0;J<4;J++) - { - ret[I] = ret[I] + this->at(I,J)*lhs[J]; - } - } - - return ret; -} - -__host__ __device__ cumat4f cumat4f::operator*(cumat4f lhs) -{ - cumat4f ret; - int I,J,K; - for(I=0;I<4;I++) - { - for(J=0;J<4;J++) - { - ret(I,J) = 0; - for(K=0;K<4;K++) - { - ret(I,J) = ret(I,J) + this->at(I,K) * lhs(K,J); - } - } - } - return ret; -} - -__host__ __device__ cumat4f cumat4f::transpose() -{ - cumat4f q; - int I,J; - for(I=0;I<4;I++) - { - for(J=0;J<4;J++) - { - q(I,J) = this->at(J,I); - } - } - return q; -} - -__host__ __device__ float cumat4f::det() -{ - float a00,a01,a02,a03; - float a10,a11,a12,a13; - float a20,a21,a22,a23; - float a30,a31,a32,a33; - float det; - - a00 = this->at(0,0); - a01 = this->at(0,1); - a02 = this->at(0,2); - a03 = this->at(0,3); - a10 = this->at(1,0); - a11 = this->at(1,1); - a12 = this->at(1,2); - a13 = this->at(1,3); - a20 = this->at(2,0); - a21 = this->at(2,1); - a22 = this->at(2,2); - a23 = this->at(2,3); - a30 = this->at(3,0); - a31 = this->at(3,1); - a32 = this->at(3,2); - a33 = this->at(3,3); - - det = a03*a12*a21*a30 - - a02*a13*a21*a30 - - a03*a11*a22*a30 + - a01*a13*a22*a30 + - a02*a11*a23*a30 - - a01*a12*a23*a30 - - a03*a12*a20*a31 + - a02*a13*a20*a31 + - a03*a10*a22*a31 - - a00*a13*a22*a31 - - a02*a10*a23*a31 + - a00*a12*a23*a31 + - a03*a11*a20*a32 - - a01*a13*a20*a32 - - a03*a10*a21*a32 + - a00*a13*a21*a32 + - a01*a10*a23*a32 - - a00*a11*a23*a32 - - a02*a11*a20*a33 + - a01*a12*a20*a33 + - a02*a10*a21*a33 - - a00*a12*a21*a33 - - a01*a10*a22*a33 + - a00*a11*a22*a33; - - return det; -} - -__host__ __device__ cumat4f minverse(cumat4f ma) -{ - cumat4f mb; - - float a00,a01,a02,a03; - float a10,a11,a12,a13; - float a20,a21,a22,a23; - float a30,a31,a32,a33; - - float b00,b01,b02,b03; - float b10,b11,b12,b13; - float b20,b21,b22,b23; - float b30,b31,b32,b33; - - float det = 0.0; - - a00 = ma.at(0,0); - a01 = ma.at(0,1); - a02 = ma.at(0,2); - a03 = ma.at(0,3); - a10 = ma.at(1,0); - a11 = ma.at(1,1); - a12 = ma.at(1,2); - a13 = ma.at(1,3); - a20 = ma.at(2,0); - a21 = ma.at(2,1); - a22 = ma.at(2,2); - a23 = ma.at(2,3); - a30 = ma.at(3,0); - a31 = ma.at(3,1); - a32 = ma.at(3,2); - a33 = ma.at(3,3); - - det = a03*a12*a21*a30 - - a02*a13*a21*a30 - - a03*a11*a22*a30 + - a01*a13*a22*a30 + - a02*a11*a23*a30 - - a01*a12*a23*a30 - - a03*a12*a20*a31 + - a02*a13*a20*a31 + - a03*a10*a22*a31 - - a00*a13*a22*a31 - - a02*a10*a23*a31 + - a00*a12*a23*a31 + - a03*a11*a20*a32 - - a01*a13*a20*a32 - - a03*a10*a21*a32 + - a00*a13*a21*a32 + - a01*a10*a23*a32 - - a00*a11*a23*a32 - - a02*a11*a20*a33 + - a01*a12*a20*a33 + - a02*a10*a21*a33 - - a00*a12*a21*a33 - - a01*a10*a22*a33 + - a00*a11*a22*a33; - - if(det*det>1.0E-30) - { - b00 = -a13*a22*a31 + a12*a23*a31 + a13*a21*a32 - a11*a23*a32 - a12*a21*a33 + a11*a22*a33; - b01 = a03*a22*a31 - a02*a23*a31 - a03*a21*a32 + a01*a23*a32 + a02*a21*a33 - a01*a22*a33; - b02 = -a03*a12*a31 + a02*a13*a31 + a03*a11*a32 - a01*a13*a32 - a02*a11*a33 + a01*a12*a33; - b03 = a03*a12*a21 - a02*a13*a21 - a03*a11*a22 + a01*a13*a22 + a02*a11*a23 - a01*a12*a23; - b10 = a13*a22*a30 - a12*a23*a30 - a13*a20*a32 + a10*a23*a32 + a12*a20*a33 - a10*a22*a33; - b11 = -a03*a22*a30 + a02*a23*a30 + a03*a20*a32 - a00*a23*a32 - a02*a20*a33 + a00*a22*a33; - b12 = a03*a12*a30 - a02*a13*a30 - a03*a10*a32 + a00*a13*a32 + a02*a10*a33 - a00*a12*a33; - b13 = -a03*a12*a20 + a02*a13*a20 + a03*a10*a22 - a00*a13*a22 - a02*a10*a23 + a00*a12*a23; - b20 = -a13*a21*a30 + a11*a23*a30 + a13*a20*a31 - a10*a23*a31 - a11*a20*a33 + a10*a21*a33; - b21 = a03*a21*a30 - a01*a23*a30 - a03*a20*a31 + a00*a23*a31 + a01*a20*a33 - a00*a21*a33; - b22 = -a03*a11*a30 + a01*a13*a30 + a03*a10*a31 - a00*a13*a31 - a01*a10*a33 + a00*a11*a33; - b23 = a03*a11*a20 - a01*a13*a20 - a03*a10*a21 + a00*a13*a21 + a01*a10*a23 - a00*a11*a23; - b30 = a12*a21*a30 - a11*a22*a30 - a12*a20*a31 + a10*a22*a31 + a11*a20*a32 - a10*a21*a32; - b31 = -a02*a21*a30 + a01*a22*a30 + a02*a20*a31 - a00*a22*a31 - a01*a20*a32 + a00*a21*a32; - b32 = a02*a11*a30 - a01*a12*a30 - a02*a10*a31 + a00*a12*a31 + a01*a10*a32 - a00*a11*a32; - b33 = -a02*a11*a20 + a01*a12*a20 + a02*a10*a21 - a00*a12*a21 - a01*a10*a22 + a00*a11*a22; - b00 = b00/det; - b01 = b01/det; - b02 = b02/det; - b03 = b03/det; - b10 = b10/det; - b11 = b11/det; - b12 = b12/det; - b13 = b13/det; - b20 = b20/det; - b21 = b21/det; - b22 = b22/det; - b23 = b23/det; - b30 = b30/det; - b31 = b31/det; - b32 = b32/det; - b33 = b33/det; - mb.at(0,0) = b00; - mb.at(0,1) = b01; - mb.at(0,2) = b02; - mb.at(0,3) = b03; - mb.at(1,0) = b10; - mb.at(1,1) = b11; - mb.at(1,2) = b12; - mb.at(1,3) = b13; - mb.at(2,0) = b20; - mb.at(2,1) = b21; - mb.at(2,2) = b22; - mb.at(2,3) = b23; - mb.at(3,0) = b30; - mb.at(3,1) = b31; - mb.at(3,2) = b32; - mb.at(3,3) = b33; - } - //this was STUPID. Gaah. Computer algebra system saves the day? I'd be surprised if this didn't end up *slower* than gaussian elimination. Don't do this again! - return mb; -} - -__host__ __device__ cumat4f cumat4f::inverse() -{ - return minverse(*this); -} - - -__host__ __device__ float cuvect4f_dot(cuvect4f a, cuvect4f b) -{ - float ret = 0.0; - ret = a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w; - return ret; -} - -__host__ __device__ float cuvect4f_norm(cuvect4f a) -{ - float ret = 0.0; - ret = ::sqrtf(cuvect4f_dot(a,a)); - return ret; -} - -__host__ __device__ cuvect4f cuvect4f_normalize(cuvect4f a) -{ - cuvect4f ret = cuvect4f(0.0f,0.0f,0.0f,0.0f); - float nrm = cuvect4f_norm(a); - if(nrm>0.0) - ret = a/nrm; - return ret; -} - -__host__ __device__ cuvect4f cuvect4f_proj(cuvect4f a, cuvect4f b) -{ - cuvect4f ret; - cuvect4f bn = cuvect4f_normalize(b); - float d = cuvect4f_dot(a,bn); - ret = bn*d; - return ret; -} - -}; //namespace amscuda +#include + +namespace amscuda +{ + +//////////// +//cuvect4ff// +//////////// + +__host__ __device__ cuvect4f::cuvect4f() +{ + x = 0.0; y = 0.0; z = 0.0; w = 0.0; + return; +} + +__host__ __device__ cuvect4f::~cuvect4f() +{ + x = 0.0; y = 0.0; z = 0.0; w = 0.0; + return; +} + +__host__ __device__ cuvect4f::cuvect4f(float _x, float _y, float _z, float _w) +{ + x = _x; y = _y; z = _z; w = _w; + return; +} + +__host__ __device__ float& cuvect4f::operator[](const int I) +{ + if(I==0) return x; + else if(I==1) return y; + else if(I==2) return z; + else if(I==3) return w; + return x; +} + +__host__ __device__ const float& cuvect4f::operator[](const int I) const +{ + if(I==0) return x; + else if(I==1) return y; + else if(I==2) return z; + else if(I==3) return w; + return x; +} + +__host__ __device__ cuvect4f cuvect4f::operator+(cuvect4f lhs) +{ + cuvect4f ret; + ret.x = this->x + lhs.x; + ret.y = this->y + lhs.y; + ret.z = this->z + lhs.z; + ret.w = this->w + lhs.w; + return ret; +} + +__host__ __device__ cuvect4f cuvect4f::operator-(cuvect4f lhs) +{ + cuvect4f ret; + ret.x = this->x - lhs.x; + ret.y = this->y - lhs.y; + ret.z = this->z - lhs.z; + ret.w = this->w - lhs.w; + return ret; +} + +__host__ __device__ cuvect4f cuvect4f::operator*(float lhs) +{ + cuvect4f ret; + ret.x = this->x*lhs; + ret.y = this->y*lhs; + ret.z = this->z*lhs; + ret.w = this->w*lhs; + return ret; +} + +__host__ __device__ cuvect4f cuvect4f::operator/(float lhs) +{ + cuvect4f ret; + ret.x = this->x/lhs; + ret.y = this->y/lhs; + ret.z = this->z/lhs; + ret.w = this->w/lhs; + return ret; +} + +__host__ __device__ cumat4f::cumat4f() +{ + int I; + for(I=0;I<16;I++) + { + dat[I] = 0.0; + } + return; +} + +__host__ __device__ cumat4f::~cumat4f() +{ + int I; + for(I=0;I<16;I++) + { + dat[I] = 0.0; + } + return; +} + +__host__ __device__ float& cumat4f::operator[](const int I) +{ + return dat[I]; +} + +__host__ __device__ float& cumat4f::operator()(const int I, const int J) +{ + return dat[I+4*J]; +} + +__host__ __device__ float& cumat4f::at(const int I, const int J) +{ + return dat[I+4*J]; +} + +__host__ __device__ cumat4f cumat4f::operator+(cumat4f lhs) +{ + cumat4f ret; + int I; + for(I=0;I<16;I++) + { + ret.dat[I] = this->dat[I] + lhs.dat[I]; + } + return ret; +} + +__host__ __device__ cumat4f cumat4f::operator-(cumat4f lhs) +{ + cumat4f ret; + int I; + for(I=0;I<16;I++) + { + ret.dat[I] = this->dat[I] - lhs.dat[I]; + } + return ret; +} + +__host__ __device__ cumat4f cumat4f::operator*(float lhs) +{ + cumat4f ret; + int I; + for(I=0;I<16;I++) + { + ret.dat[I] = this->dat[I]*lhs; + } + return ret; +} + +__host__ __device__ cumat4f cumat4f::operator/(float lhs) +{ + cumat4f ret; + int I; + for(I=0;I<16;I++) + { + ret.dat[I] = this->dat[I]/lhs; + } + return ret; +} + +__host__ __device__ cuvect4f cumat4f::operator*(cuvect4f lhs) +{ + cuvect4f ret = cuvect4f(0.0,0.0,0.0,0.0); + int I,J; + + for(I=0;I<4;I++) + { + for(J=0;J<4;J++) + { + ret[I] = ret[I] + this->at(I,J)*lhs[J]; + } + } + + return ret; +} + +__host__ __device__ cumat4f cumat4f::operator*(cumat4f lhs) +{ + cumat4f ret; + int I,J,K; + for(I=0;I<4;I++) + { + for(J=0;J<4;J++) + { + ret(I,J) = 0; + for(K=0;K<4;K++) + { + ret(I,J) = ret(I,J) + this->at(I,K) * lhs(K,J); + } + } + } + return ret; +} + +__host__ __device__ cumat4f cumat4f::transpose() +{ + cumat4f q; + int I,J; + for(I=0;I<4;I++) + { + for(J=0;J<4;J++) + { + q(I,J) = this->at(J,I); + } + } + return q; +} + +__host__ __device__ float cumat4f::det() +{ + float a00,a01,a02,a03; + float a10,a11,a12,a13; + float a20,a21,a22,a23; + float a30,a31,a32,a33; + float det; + + a00 = this->at(0,0); + a01 = this->at(0,1); + a02 = this->at(0,2); + a03 = this->at(0,3); + a10 = this->at(1,0); + a11 = this->at(1,1); + a12 = this->at(1,2); + a13 = this->at(1,3); + a20 = this->at(2,0); + a21 = this->at(2,1); + a22 = this->at(2,2); + a23 = this->at(2,3); + a30 = this->at(3,0); + a31 = this->at(3,1); + a32 = this->at(3,2); + a33 = this->at(3,3); + + det = a03*a12*a21*a30 - + a02*a13*a21*a30 - + a03*a11*a22*a30 + + a01*a13*a22*a30 + + a02*a11*a23*a30 - + a01*a12*a23*a30 - + a03*a12*a20*a31 + + a02*a13*a20*a31 + + a03*a10*a22*a31 - + a00*a13*a22*a31 - + a02*a10*a23*a31 + + a00*a12*a23*a31 + + a03*a11*a20*a32 - + a01*a13*a20*a32 - + a03*a10*a21*a32 + + a00*a13*a21*a32 + + a01*a10*a23*a32 - + a00*a11*a23*a32 - + a02*a11*a20*a33 + + a01*a12*a20*a33 + + a02*a10*a21*a33 - + a00*a12*a21*a33 - + a01*a10*a22*a33 + + a00*a11*a22*a33; + + return det; +} + +__host__ __device__ cumat4f minverse(cumat4f ma) +{ + cumat4f mb; + + float a00,a01,a02,a03; + float a10,a11,a12,a13; + float a20,a21,a22,a23; + float a30,a31,a32,a33; + + float b00,b01,b02,b03; + float b10,b11,b12,b13; + float b20,b21,b22,b23; + float b30,b31,b32,b33; + + float det = 0.0; + + a00 = ma.at(0,0); + a01 = ma.at(0,1); + a02 = ma.at(0,2); + a03 = ma.at(0,3); + a10 = ma.at(1,0); + a11 = ma.at(1,1); + a12 = ma.at(1,2); + a13 = ma.at(1,3); + a20 = ma.at(2,0); + a21 = ma.at(2,1); + a22 = ma.at(2,2); + a23 = ma.at(2,3); + a30 = ma.at(3,0); + a31 = ma.at(3,1); + a32 = ma.at(3,2); + a33 = ma.at(3,3); + + det = a03*a12*a21*a30 - + a02*a13*a21*a30 - + a03*a11*a22*a30 + + a01*a13*a22*a30 + + a02*a11*a23*a30 - + a01*a12*a23*a30 - + a03*a12*a20*a31 + + a02*a13*a20*a31 + + a03*a10*a22*a31 - + a00*a13*a22*a31 - + a02*a10*a23*a31 + + a00*a12*a23*a31 + + a03*a11*a20*a32 - + a01*a13*a20*a32 - + a03*a10*a21*a32 + + a00*a13*a21*a32 + + a01*a10*a23*a32 - + a00*a11*a23*a32 - + a02*a11*a20*a33 + + a01*a12*a20*a33 + + a02*a10*a21*a33 - + a00*a12*a21*a33 - + a01*a10*a22*a33 + + a00*a11*a22*a33; + + if(det*det>1.0E-30) + { + b00 = -a13*a22*a31 + a12*a23*a31 + a13*a21*a32 - a11*a23*a32 - a12*a21*a33 + a11*a22*a33; + b01 = a03*a22*a31 - a02*a23*a31 - a03*a21*a32 + a01*a23*a32 + a02*a21*a33 - a01*a22*a33; + b02 = -a03*a12*a31 + a02*a13*a31 + a03*a11*a32 - a01*a13*a32 - a02*a11*a33 + a01*a12*a33; + b03 = a03*a12*a21 - a02*a13*a21 - a03*a11*a22 + a01*a13*a22 + a02*a11*a23 - a01*a12*a23; + b10 = a13*a22*a30 - a12*a23*a30 - a13*a20*a32 + a10*a23*a32 + a12*a20*a33 - a10*a22*a33; + b11 = -a03*a22*a30 + a02*a23*a30 + a03*a20*a32 - a00*a23*a32 - a02*a20*a33 + a00*a22*a33; + b12 = a03*a12*a30 - a02*a13*a30 - a03*a10*a32 + a00*a13*a32 + a02*a10*a33 - a00*a12*a33; + b13 = -a03*a12*a20 + a02*a13*a20 + a03*a10*a22 - a00*a13*a22 - a02*a10*a23 + a00*a12*a23; + b20 = -a13*a21*a30 + a11*a23*a30 + a13*a20*a31 - a10*a23*a31 - a11*a20*a33 + a10*a21*a33; + b21 = a03*a21*a30 - a01*a23*a30 - a03*a20*a31 + a00*a23*a31 + a01*a20*a33 - a00*a21*a33; + b22 = -a03*a11*a30 + a01*a13*a30 + a03*a10*a31 - a00*a13*a31 - a01*a10*a33 + a00*a11*a33; + b23 = a03*a11*a20 - a01*a13*a20 - a03*a10*a21 + a00*a13*a21 + a01*a10*a23 - a00*a11*a23; + b30 = a12*a21*a30 - a11*a22*a30 - a12*a20*a31 + a10*a22*a31 + a11*a20*a32 - a10*a21*a32; + b31 = -a02*a21*a30 + a01*a22*a30 + a02*a20*a31 - a00*a22*a31 - a01*a20*a32 + a00*a21*a32; + b32 = a02*a11*a30 - a01*a12*a30 - a02*a10*a31 + a00*a12*a31 + a01*a10*a32 - a00*a11*a32; + b33 = -a02*a11*a20 + a01*a12*a20 + a02*a10*a21 - a00*a12*a21 - a01*a10*a22 + a00*a11*a22; + b00 = b00/det; + b01 = b01/det; + b02 = b02/det; + b03 = b03/det; + b10 = b10/det; + b11 = b11/det; + b12 = b12/det; + b13 = b13/det; + b20 = b20/det; + b21 = b21/det; + b22 = b22/det; + b23 = b23/det; + b30 = b30/det; + b31 = b31/det; + b32 = b32/det; + b33 = b33/det; + mb.at(0,0) = b00; + mb.at(0,1) = b01; + mb.at(0,2) = b02; + mb.at(0,3) = b03; + mb.at(1,0) = b10; + mb.at(1,1) = b11; + mb.at(1,2) = b12; + mb.at(1,3) = b13; + mb.at(2,0) = b20; + mb.at(2,1) = b21; + mb.at(2,2) = b22; + mb.at(2,3) = b23; + mb.at(3,0) = b30; + mb.at(3,1) = b31; + mb.at(3,2) = b32; + mb.at(3,3) = b33; + } + //this was STUPID. Gaah. Computer algebra system saves the day? I'd be surprised if this didn't end up *slower* than gaussian elimination. Don't do this again! + return mb; +} + +__host__ __device__ cumat4f cumat4f::inverse() +{ + return minverse(*this); +} + + +__host__ __device__ float cuvect4f_dot(cuvect4f a, cuvect4f b) +{ + float ret = 0.0; + ret = a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w; + return ret; +} + +__host__ __device__ float cuvect4f_norm(cuvect4f a) +{ + float ret = 0.0; + ret = ::sqrtf(cuvect4f_dot(a,a)); + return ret; +} + +__host__ __device__ cuvect4f cuvect4f_normalize(cuvect4f a) +{ + cuvect4f ret = cuvect4f(0.0f,0.0f,0.0f,0.0f); + float nrm = cuvect4f_norm(a); + if(nrm>0.0) + ret = a/nrm; + return ret; +} + +__host__ __device__ cuvect4f cuvect4f_proj(cuvect4f a, cuvect4f b) +{ + cuvect4f ret; + cuvect4f bn = cuvect4f_normalize(b); + float d = cuvect4f_dot(a,bn); + ret = bn*d; + return ret; +} + +__host__ __device__ cuvect4f operator-(cuvect4f rhs) +{ + cuvect4f ret; + ret[0] = -rhs[0]; + ret[1] = -rhs[1]; + ret[2] = -rhs[2]; + ret[3] = -rhs[3]; + return ret; +} + +}; //namespace amscuda diff --git a/src/main.cu b/src/main.cu index 5cb987d..6bfd347 100644 --- a/src/main.cu +++ b/src/main.cu @@ -1,27 +1,27 @@ -#include - -//using namespace ams; -using namespace amscuda; - -int main(int argc, char* argv[]) -{ - printf("AMSCULIB2: Cuda Library Tests.\n"); - - //test_amscuarray_1(); - //test_amscumath1(); - - //cmp::test_cucomp64_1(); - //cmp::test_cucomp128_1(); - - //test_amscuarray_2(); - - //test_dprg64(); - //printf("\n"); - //test_dprg32(); - - //test_dbuff_rand_dpr32(); - - test_amscurarray1(); - - return 0; +#include + +//using namespace ams; +using namespace amscuda; + +int main(int argc, char* argv[]) +{ + printf("AMSCULIB2: Cuda Library Tests.\n"); + + //test_amscuarray_1(); + //test_amscumath1(); + + //cmp::test_cucomp64_1(); + //cmp::test_cucomp128_1(); + + //test_amscuarray_2(); + + //test_dprg64(); + //printf("\n"); + //test_dprg32(); + + //test_dbuff_rand_dpr32(); + + //test_amscurarray1(); + + return 0; } \ No newline at end of file diff --git a/src/main.o b/src/main.o deleted file mode 100644 index 878d8a2..0000000 Binary files a/src/main.o and /dev/null differ diff --git a/test_scripts/test_dbuff_dpr32.py b/test_scripts/test_dbuff_dpr32.py index c9bad9c..2fb817e 100644 --- a/test_scripts/test_dbuff_dpr32.py +++ b/test_scripts/test_dbuff_dpr32.py @@ -1,76 +1,76 @@ -#!/usr/bin/python3 - -import os,sys,math -import numpy as np -import matplotlib.pyplot as plt - -################# -## Subroutines ## -################# - -def binload_float_ndarray(fp): - arr = np.zeros((0),dtype=np.float32,order='F') - - qb = fp.read(4) - Nd = np.frombuffer(qb,dtype=np.int32,count=1)[0] - shp = np.zeros((Nd),dtype=np.int32) - - piprod = 1 - for I in range(0,Nd): - qb = fp.read(4) - shp[I] = np.frombuffer(qb,dtype=np.int32,count=1)[0] - piprod = piprod*shp[I] - - qb = fp.read(4*piprod) - arr = np.frombuffer(qb,dtype=np.float32,count=piprod) - - arr = arr.reshape(shp) - - return arr; - -def binsave_float_ndarray(fp,arr): - - - - - return - - -################# -## Main Script ## -################# - -def test_1(): - - fname = "./test_scripts/test_dbuff_rand_dpr32.bin" - try: - fp = open(fname,"rb") - except: - print("Could not open {} for reading".format(fname)) - return - - arr1 = binload_float_ndarray(fp) - arr2 = binload_float_ndarray(fp) - - fp.close() - - plt.subplot(2,2,1) - plt.imshow(arr1) - plt.subplot(2,2,2) - plt.imshow(arr2) - plt.subplot(2,2,3) - plt.hist(arr1.flatten(),bins=100) - plt.subplot(2,2,4) - plt.hist(arr2.flatten(),bins=100) - plt.show() - - print("{} {}".format(np.mean(arr2),np.std(arr2))) - - - return - -if(__name__=="__main__"): - test_1() - - exit(0) - +#!/usr/bin/python3 + +import os,sys,math +import numpy as np +import matplotlib.pyplot as plt + +################# +## Subroutines ## +################# + +def binload_float_ndarray(fp): + arr = np.zeros((0),dtype=np.float32,order='F') + + qb = fp.read(4) + Nd = np.frombuffer(qb,dtype=np.int32,count=1)[0] + shp = np.zeros((Nd),dtype=np.int32) + + piprod = 1 + for I in range(0,Nd): + qb = fp.read(4) + shp[I] = np.frombuffer(qb,dtype=np.int32,count=1)[0] + piprod = piprod*shp[I] + + qb = fp.read(4*piprod) + arr = np.frombuffer(qb,dtype=np.float32,count=piprod) + + arr = arr.reshape(shp) + + return arr; + +def binsave_float_ndarray(fp,arr): + + + + + return + + +################# +## Main Script ## +################# + +def test_1(): + + fname = "./test_scripts/test_dbuff_rand_dpr32.bin" + try: + fp = open(fname,"rb") + except: + print("Could not open {} for reading".format(fname)) + return + + arr1 = binload_float_ndarray(fp) + arr2 = binload_float_ndarray(fp) + + fp.close() + + plt.subplot(2,2,1) + plt.imshow(arr1) + plt.subplot(2,2,2) + plt.imshow(arr2) + plt.subplot(2,2,3) + plt.hist(arr1.flatten(),bins=100) + plt.subplot(2,2,4) + plt.hist(arr2.flatten(),bins=100) + plt.show() + + print("{} {}".format(np.mean(arr2),np.std(arr2))) + + + return + +if(__name__=="__main__"): + test_1() + + exit(0) +