Making some of my library code public.

This commit is contained in:
2025-02-04 21:59:29 -05:00
commit 64a7bfc851
85 changed files with 9729 additions and 0 deletions

View File

@ -0,0 +1,89 @@
#ifndef __AMSCU_COMP128_HPP__
#define __AMSCU_COMP128_HPP__
namespace amscuda
{
namespace cmp
{
//Complex number made of two double-precision components.
//Every member function is declared for both host and device code;
//only declarations are given here, the definitions live in another file.
class cucomp128
{
public:
double real; //real component
double imag; //imaginary component
//construction / destruction
__host__ __device__ cucomp128();
__host__ __device__ ~cucomp128();
__host__ __device__ cucomp128(const cucomp128 &other);
//not marked explicit, so a double converts implicitly to cucomp128
//(presumably with a zero imaginary part - TODO confirm against the definition)
__host__ __device__ cucomp128(const double &other);
//assignment from another complex value or from a double
__host__ __device__ cucomp128& operator=(cucomp128& other);
__host__ __device__ const cucomp128& operator=(const cucomp128& other);
__host__ __device__ cucomp128& operator=(double& other);
__host__ __device__ const cucomp128& operator=(const double& other);
//component access by index (presumably 0 -> real, 1 -> imag - TODO confirm)
//NOTE(review): the non-const overload takes int&, so a literal index such as z[0]
//cannot bind to it and resolves to the const overload instead
__host__ __device__ double& operator[](int& ind);
__host__ __device__ const double& operator[](const int& ind) const;
//arithmetic with another complex value
//NOTE(review): these are non-const members, so the left operand must be non-const
__host__ __device__ cucomp128 operator+(const cucomp128& z);
__host__ __device__ cucomp128 operator-(const cucomp128& z);
__host__ __device__ cucomp128 operator*(const cucomp128& z);
__host__ __device__ cucomp128 operator/(const cucomp128& z);
//arithmetic with a real scalar on the right-hand side
__host__ __device__ cucomp128 operator+(const double& z);
__host__ __device__ cucomp128 operator-(const double& z);
__host__ __device__ cucomp128 operator*(const double& z);
__host__ __device__ cucomp128 operator/(const double& z);
__host__ __device__ friend cucomp128 operator-(const cucomp128& z); //unary negation
//comparison operators
//NOTE(review): the ordering rule used by >, <, >=, <= is not visible in this header
__host__ __device__ bool operator==(const cucomp128& z) const;
__host__ __device__ bool operator!=(const cucomp128& z) const;
__host__ __device__ bool operator>(const cucomp128& z) const;
__host__ __device__ bool operator<(const cucomp128& z) const;
__host__ __device__ bool operator>=(const cucomp128& z) const;
__host__ __device__ bool operator<=(const cucomp128& z) const;
//classification predicates (exact criteria are defined with the implementation)
__host__ __device__ bool isnan() const;
__host__ __device__ bool isinf() const;
__host__ __device__ bool isreal() const;
__host__ __device__ bool isimag() const;
__host__ __device__ bool iszero() const;
//polar quantities and conjugate
__host__ __device__ double arg() const;
__host__ __device__ double mag() const;
__host__ __device__ cucomp128 conj() const;
};
//Free functions on cucomp128 values (arguments are passed by value).
//Only the declarations are visible here.
__host__ __device__ double arg(cucomp128 z);
//builds a complex value from two doubles (presumably real part, imaginary part)
__host__ __device__ cucomp128 dtocomp(double _r, double _i);
__host__ __device__ double real(cucomp128 z);
__host__ __device__ double imag(cucomp128 z);
//elementary functions of a complex argument
__host__ __device__ cucomp128 sin(cucomp128 z);
__host__ __device__ cucomp128 cos(cucomp128 z);
__host__ __device__ cucomp128 tan(cucomp128 z);
__host__ __device__ cucomp128 exp(cucomp128 z);
__host__ __device__ cucomp128 log(cucomp128 z);
__host__ __device__ double abs(cucomp128 z);
__host__ __device__ cucomp128 conj(cucomp128 z);
//hyperbolic functions
__host__ __device__ cucomp128 cosh(cucomp128 z);
__host__ __device__ cucomp128 sinh(cucomp128 z);
__host__ __device__ cucomp128 tanh(cucomp128 z);
//two-argument power (presumably z1 raised to z2 - TODO confirm)
__host__ __device__ cucomp128 pow(cucomp128 z1, cucomp128 z2);
//returns "complex sign" of complex number - 0, or a unit number with same argument
__host__ __device__ cucomp128 csgn(cucomp128 z);
//test routine; declared without __device__, so host only
void test_cucomp128_1();
}; //end namespace cmp
}; //end namespace amscuda
#endif

View File

@ -0,0 +1,88 @@
#ifndef __AMSCU_COMP64_HPP__
#define __AMSCU_COMP64_HPP__
namespace amscuda
{
namespace cmp
{
//Complex number made of two single-precision components.
//Every member function is declared for both host and device code;
//only declarations are given here, the definitions live in another file.
class cucomp64
{
public:
float real; //real component
float imag; //imaginary component
//construction / destruction
__host__ __device__ cucomp64();
__host__ __device__ ~cucomp64();
__host__ __device__ cucomp64(const cucomp64 &other);
//not marked explicit, so a float converts implicitly to cucomp64
//(presumably with a zero imaginary part - TODO confirm against the definition)
__host__ __device__ cucomp64(const float &other);
//assignment from another complex value or from a float
__host__ __device__ cucomp64& operator=(cucomp64& other);
__host__ __device__ const cucomp64& operator=(const cucomp64& other);
__host__ __device__ cucomp64& operator=(float& other);
__host__ __device__ const cucomp64& operator=(const float& other);
//component access by index (presumably 0 -> real, 1 -> imag - TODO confirm)
//NOTE(review): the non-const overload takes int&, so a literal index such as z[0]
//cannot bind to it and resolves to the const overload instead
__host__ __device__ float& operator[](int& ind);
__host__ __device__ const float& operator[](const int& ind) const;
//arithmetic with another complex value
//NOTE(review): these are non-const members, so the left operand must be non-const
__host__ __device__ cucomp64 operator+(const cucomp64& z);
__host__ __device__ cucomp64 operator-(const cucomp64& z);
__host__ __device__ cucomp64 operator*(const cucomp64& z);
__host__ __device__ cucomp64 operator/(const cucomp64& z);
//arithmetic with a real scalar on the right-hand side
__host__ __device__ cucomp64 operator+(const float& z);
__host__ __device__ cucomp64 operator-(const float& z);
__host__ __device__ cucomp64 operator*(const float& z);
__host__ __device__ cucomp64 operator/(const float& z);
__host__ __device__ friend cucomp64 operator-(const cucomp64& z); //unary negation
//comparison operators
//NOTE(review): the ordering rule used by >, <, >=, <= is not visible in this header
__host__ __device__ bool operator==(const cucomp64& z) const;
__host__ __device__ bool operator!=(const cucomp64& z) const;
__host__ __device__ bool operator>(const cucomp64& z) const;
__host__ __device__ bool operator<(const cucomp64& z) const;
__host__ __device__ bool operator>=(const cucomp64& z) const;
__host__ __device__ bool operator<=(const cucomp64& z) const;
//classification predicates (exact criteria are defined with the implementation)
__host__ __device__ bool isnan() const;
__host__ __device__ bool isinf() const;
__host__ __device__ bool isreal() const;
__host__ __device__ bool isimag() const;
__host__ __device__ bool iszero() const;
//polar quantities and conjugate
__host__ __device__ float arg() const;
__host__ __device__ float mag() const;
__host__ __device__ cucomp64 conj() const;
};
//Free functions on cucomp64 values (arguments are passed by value).
//Only the declarations are visible here.
__host__ __device__ float arg(cucomp64 z);
//builds a complex value from two floats (presumably real part, imaginary part)
//NOTE(review): the double-precision counterpart is named dtocomp, without a size suffix
__host__ __device__ cucomp64 dtocomp64(float _r, float _i);
__host__ __device__ float real(cucomp64 z);
__host__ __device__ float imag(cucomp64 z);
//elementary functions of a complex argument
__host__ __device__ cucomp64 sin(cucomp64 z);
__host__ __device__ cucomp64 cos(cucomp64 z);
__host__ __device__ cucomp64 tan(cucomp64 z);
__host__ __device__ cucomp64 exp(cucomp64 z);
__host__ __device__ cucomp64 log(cucomp64 z);
__host__ __device__ float abs(cucomp64 z);
__host__ __device__ cucomp64 conj(cucomp64 z);
//hyperbolic functions
__host__ __device__ cucomp64 cosh(cucomp64 z);
__host__ __device__ cucomp64 sinh(cucomp64 z);
__host__ __device__ cucomp64 tanh(cucomp64 z);
//two-argument power (presumably z1 raised to z2 - TODO confirm)
__host__ __device__ cucomp64 pow(cucomp64 z1, cucomp64 z2);
//returns "complex sign" of complex number - 0, or a unit number with same argument
__host__ __device__ cucomp64 csgn(cucomp64 z);
//test routine; declared without __device__, so host only
void test_cucomp64_1();
}; //end namespace cmp
}; //end namespace amscuda
#endif

View File

@ -0,0 +1,40 @@
#ifndef __AMSCU_CUDAFUNCTIONS_HPP__
#define __AMSCU_CUDAFUNCTIONS_HPP__
namespace amscuda
{
// device memory operations
// I'm trying to avoid some of the boilerplate mental overhead involved
// in calling cuda functions and handling errors
// All wrappers report status through an int: 1 on success, 0 when there was
// nothing to do, negative values on failure.
//frees *devptr if it is not already NULL, and sets *devptr to NULL
//wrapper to cudaFree
//returns 0 if *devptr was already NULL, 1 if freed, -1 if cudaFree failed
template<typename T> int cuda_free(T **devptr);
//copies N elements of hostbuffer to *devbuffer
//allocates *devbuffer if it is NULL
//if overwrite is true, frees and reallocates *devbuffer on the device (for resizing)
//returns 0 if N<=0, 1 on success, -2 null hostbuffer, -3 allocation failed, -4 copy failed
template<typename T> int buffer_copytodevice(T *hostbuffer, T **devbuffer, long N, bool overwrite);
//copies N elements from devbuffer to *hostbuffer
//allocates *hostbuffer with new[] if it is NULL
//if overwrite is true, deletes and reallocates *hostbuffer on host with new[] (for resizing)
//returns 0 if N<=0, 1 on success, -5 null devbuffer, -6 allocation failed, -7 copy failed
template<typename T> int buffer_copyfromdevice(T *devbuffer, T **hostbuffer, long N, bool overwrite);
//wrapper for cudaMemcpy - copies an item or struct (count 1) to the device
//any existing *devptr is freed and a fresh device allocation is made
//returns 1 on success, -2 null hostptr, -3 allocation failed, -4 copy failed
template<typename T> int cuda_copytodevice(T *hostptr, T **devptr);
//wrapper for cudaMemcpy - copies an item or struct (count 1) from device
//any existing *hostptr is deleted and a fresh object is allocated with new
//returns 1 on success, -5 null devptr, -6 allocation failed, -7 copy failed
template<typename T> int cuda_copyfromdevice(T *devptr, T **hostptr);
//defined elsewhere; presumably reports a pending CUDA error prefixed by msgheader - TODO confirm
int cuda_errortrap(const char *msgheader);
};
#include <amsculib2/amscu_cudafunctions_impl.hpp>
#endif

View File

@ -0,0 +1,228 @@
#ifndef __AMSCU_CUDAFUNCTIONS_IMPL_HPP__
#define __AMSCU_CUDAFUNCTIONS_IMPL_HPP__
namespace amscuda
{
//Frees the device allocation *devptr refers to and resets *devptr to NULL.
//Wrapper to cudaFree.
//  devptr : address of the device pointer to release
//  returns: 0  nothing to free (devptr itself or *devptr is NULL)
//           1  freed successfully
//          -1  cudaFree reported an error (*devptr is reset to NULL anyway)
template<typename T> int cuda_free(T **devptr)
{
    //guard the handle itself as well as the pointer it holds
    if(devptr==NULL || *devptr==NULL)
    {
        return 0;
    }
    const cudaError_t err = cudaFree(*devptr);
    //never keep a pointer that was handed to cudaFree, whatever the outcome
    *devptr = NULL;
    return (err==cudaSuccess) ? 1 : -1;
}
//Uploads N elements from hostbuffer into the device buffer *devbuffer.
//A NULL *devbuffer is allocated on the device first; when overwrite is set an
//existing *devbuffer is released and allocated again (for resizing).
//  returns: 0  nothing to do (N<=0)
//           1  success
//          -2  hostbuffer is NULL
//          -3  device allocation failed
//          -4  host-to-device copy failed
template<typename T> int buffer_copytodevice(T *hostbuffer, T **devbuffer, long N, bool overwrite)
{
    if(N<=0)
    {
        return 0;
    }
    if(hostbuffer==NULL)
    {
        return -2; //host buffer is NULL
    }
    //resize request: drop whatever is currently allocated
    if(overwrite && *devbuffer!=NULL)
    {
        cudaFree(*devbuffer);
        *devbuffer = NULL;
    }
    if(*devbuffer==NULL)
    {
        const cudaError_t allocstatus = cudaMalloc(devbuffer,sizeof(T)*N);
        if(allocstatus!=cudaSuccess)
        {
            *devbuffer = NULL;
            return -3; //failed to allocate
        }
    }
    const cudaError_t copystatus = cudaMemcpy(*devbuffer,hostbuffer,sizeof(T)*N,cudaMemcpyHostToDevice);
    return (copystatus==cudaSuccess) ? 1 : -4;
}
//Copies N elements from the device buffer devbuffer into the host buffer *hostbuffer.
//A NULL *hostbuffer is allocated with new[]; when overwrite is true an existing
//*hostbuffer is deleted and reallocated first (for resizing).
//  returns: 0  nothing to do (N<=0)
//           1  success
//          -5  devbuffer is NULL
//          -6  host allocation failed
//          -7  device-to-host copy failed
template<typename T> int buffer_copyfromdevice(T *devbuffer, T **hostbuffer, long N, bool overwrite)
{
    if(N<=0)
    {
        return 0;
    }
    if(devbuffer==NULL)
    {
        return -5; //null dev buffer
    }
    if(overwrite && *hostbuffer!=NULL)
    {
        delete[] (*hostbuffer);
        //reset the caller's pointer, not the local pointer-to-pointer:
        //nulling hostbuffer itself made the check below dereference NULL
        *hostbuffer = NULL;
    }
    if(*hostbuffer==NULL)
    {
        *hostbuffer = new(std::nothrow) T[N];
        if(*hostbuffer==NULL)
        {
            return -6; //failed to allocate host buffer
        }
    }
    const cudaError_t err = cudaMemcpy(*hostbuffer, devbuffer, sizeof(T)*N, cudaMemcpyDeviceToHost);
    return (err==cudaSuccess) ? 1 : -7;
}
//Uploads a single object (count 1) from hostptr to the device.
//Any device object already referenced by *devptr is released first, then a
//fresh device allocation of sizeof(T) is made and filled.
//  returns: 1  success
//          -2  hostptr is NULL
//          -3  device allocation failed
//          -4  host-to-device copy failed
template<typename T> int cuda_copytodevice(T *hostptr, T **devptr)
{
    if(hostptr==NULL)
    {
        return -2; //host buffer is NULL
    }
    //this wrapper always replaces the destination
    if(*devptr!=NULL)
    {
        cudaFree(*devptr);
        *devptr = NULL;
    }
    if(*devptr==NULL)
    {
        const cudaError_t allocstatus = cudaMalloc(devptr,sizeof(T));
        if(allocstatus!=cudaSuccess)
        {
            *devptr = NULL;
            return -3; //failed to allocate
        }
    }
    const cudaError_t copystatus = cudaMemcpy(*devptr,hostptr,sizeof(T),cudaMemcpyHostToDevice);
    return (copystatus==cudaSuccess) ? 1 : -4;
}
//Downloads a single object (count 1) from devptr into *hostptr.
//Any host object already referenced by *hostptr is deleted first, then a fresh
//object is allocated with new and filled from the device.
//  returns: 1  success
//          -5  devptr is NULL
//          -6  host allocation failed
//          -7  device-to-host copy failed
template<typename T> int cuda_copyfromdevice(T *devptr, T **hostptr)
{
    if(devptr==NULL)
    {
        return -5; //null dev buffer
    }
    //this wrapper always replaces the destination
    if(*hostptr!=NULL)
    {
        delete (*hostptr);
        //reset the caller's pointer, not the local pointer-to-pointer:
        //nulling hostptr itself made the check below dereference NULL
        *hostptr = NULL;
    }
    if(*hostptr==NULL)
    {
        *hostptr = new(std::nothrow) T;
        if(*hostptr==NULL)
        {
            return -6; //failed to allocate host buffer
        }
    }
    const cudaError_t err = cudaMemcpy(*hostptr, devptr, sizeof(T), cudaMemcpyDeviceToHost);
    return (err==cudaSuccess) ? 1 : -7;
}
};
#endif

View File

@ -0,0 +1,55 @@
#ifndef __AMSCU_RANDOM_HPP__
#define __AMSCU_RANDOM_HPP__
namespace amscuda
{
// Random Number Generators
// (declarations only - the definitions are in another file)
// faster floating point hash function used in fractal generators
// variants taking 1, 3 and 4 float coordinates and returning one float
__device__ __host__ float fhash1d_su(float x);
__device__ __host__ float fhash3d_su(float x, float y, float z);
__device__ __host__ float fhash4d_su(float x, float y, float z, float w);
//////////////////////////////////////////////////
// Deterministic Pseudorandom int32_t Generator //
//////////////////////////////////////////////////
//Next seed in simple 32 bit integer deterministic pseudo-random generator
//the seed is read from and written back to *rseed_inout
__host__ __device__ void dpr32_nextseed(int32_t *rseed_inout);
//Simple 32 bit integer deterministic pseudo-random generator
// *not* for cryptography
// Frequency of generated floats should be uniform [0,1)
__host__ __device__ float dpr32_randf(int32_t *rseed_inout);
//Box-Muller standard normal pseudorandom variable
__host__ __device__ float dpr32_randnf(int32_t *rseed_inout);
//////////////////////////////////////////////////
// Deterministic Pseudorandom int64_t Generator //
//////////////////////////////////////////////////
//operates without side-effects on explicit seed for threaded use
//deterministic pseudorandom number generator - takes seed and returns next seed
__host__ __device__ void dpr64_nextseed(int64_t *seedinout);
//deterministic pseudorandom number generator - takes seed and returns next seed
//returns uniformly distributed double
__host__ __device__ double dpr64_randd(int64_t *seedinout);
//float-returning counterpart of dpr64_randd (presumably the same distribution - TODO confirm)
__host__ __device__ float dpr64_randf(int64_t *seedinout);
//test routines; declared without __device__, so host only
void test_dprg64();
void test_dprg32();
}; //end namespace amscuda
#endif

View File

@ -0,0 +1,47 @@
#ifndef __CUARRAY_HPP__
#define __CUARRAY_HPP__
namespace amscuda
{
//Dynamically sized array usable from host and device code.
//The object owns data (allocated with new[] by resize, released with delete[]
//in the destructor). The device_* members manage a mirror of this array that
//lives in device memory and is addressed through a cuarray<T>* device pointer.
//NOTE(review): no copy constructor or copy assignment is declared while the
//destructor deletes data, so copying a cuarray by value would share (and
//double-delete) the buffer - confirm copies are never made.
template<typename T> class cuarray
{
public:
int length; //number of elements in data
T* data; //element storage, NULL when empty
__device__ __host__ cuarray();
__device__ __host__ ~cuarray();
//Only call this on the device for thread/block local
// dynamic arrays
//changes the element count to _length, keeping the leading elements
__device__ __host__ int resize(const int _length);
__device__ __host__ int size() const; //returns length
//element access; none of these perform bounds checking
__device__ __host__ T& at(const int I);
__device__ __host__ const T& at(const int I) const;
__device__ __host__ T& operator[](const int I);
__device__ __host__ const T& operator[](const int I) const;
//uploads this array to the device object *dptr, creating it when *dptr is NULL
__host__ int device_send(cuarray<T> **dptr);
//helper of device_send: frees *dptr and builds a new device object and buffer
__host__ int _device_send_overwrite(cuarray<T> **dptr);
//helper of device_send: copies the elements into the existing device buffer
__host__ int _device_send_copy(cuarray<T> *dptr);
//downloads the device array dptr into this object, resizing if needed
__host__ int device_pull(cuarray<T> *dptr);
//releases the device object *dptr and its buffer, then sets *dptr to NULL
__host__ int device_free(cuarray<T> **dptr);
//reads the length field of the device object (-1 when dptr is NULL)
__host__ int device_length(cuarray<T> *dptr);
//reads the data pointer (a device address) of the device object
__host__ T* device_data_ptr(cuarray<T> *dptr);
};
//test routine; declared without __device__, so host only
void test_cuarray();
};
#include <amsculib2/amscuarray_impl.hpp>
#endif

View File

@ -0,0 +1,76 @@
#ifndef __AMSCUARRAY_DOPS_HPP__
#define __AMSCUARRAY_DOPS_HPP__
//Device Operations on Arrays
//
//Device Operations on Device Buffers
// dodb
namespace amscuda
{
//sum
//sums the elements of the device-resident cuarray devptr points to
template<typename T> T devcuarray_sum(cuarray<T> *devptr);
//sums the N elements of the device buffer devbuffer; returns T() for NULL or N<=0
template<typename T> T dbuff_sum(T *devbuffer, int N);
//Summary statistics of a buffer, all stored as float.
//It is the return type of dbuff_stats.
struct dbuff_statstruct
{
public:
float min; //smallest element
float max; //largest element
float mean; //arithmetic mean
float stdev; //standard deviation (population vs. sample convention not visible here)
float sum; //sum of all elements
};
//stats (min,max,mean,stdev)
//writes the smallest/largest of the N device elements to *min / *max (either may be NULL)
template<typename T> void dbuff_minmax(T *devbuffer, int N, T *min, T *max);
//NOTE(review): no definition of dbuff_stats appears in the impl header shown - confirm where it lives
template<typename T> dbuff_statstruct dbuff_stats(T *devbuffer, int N); //
//sets all elements to setto
//nblocks / nthreads are the kernel launch dimensions
template<typename T> void dbuff_setall(T *devbuffer, int N, T setto, int nblocks, int nthreads);
//random device buffer functions
//(defined elsewhere; presumably fill devbuffer using the dpr32 / dpr64 generators - TODO confirm)
void dbuff_rand_dpr32(float *devbuffer, int N, int32_t *rseedinout, int nblocks, int nthreads); //
void dbuff_rand_dpr32n(float *devbuffer, int N, int32_t *rseedinout, int nblocks, int nthreads); //
void dbuff_rand_dpr64(float *devbuffer, int N, int64_t *rseedinout, int nblocks, int nthreads); //
//Elementwise device-buffer vector binary operation
//takes two input arrays (dbuf_a, dbuf_b) --> one output array (dbuf_out)
template<typename T1, typename T2, typename T3> void dbuff_vectorbinop(T1 *dbuf_a, T2 *dbuf_b, T3 *dbuf_out, int N, T3 (*fpnt)(T1,T2), int nblocks, int nthreads);
//Elementwise device-buffer vector two-parameter operation
//takes one input array, and a constant parameter (dbuf_a, par_b) ---> one output array (dbuf_out)
template<typename T1, typename T2, typename T3> void dbuff_vectorbinop(T1 *dbuf_a, T2 par_b, T3 *dbuf_out, int N, T3 (*fpnt)(T1,T2), int nblocks, int nthreads);
//elementwise arithmetic; each operation has a buffer-buffer and a buffer-scalar form
//vector_add
template<typename T> void dbuff_add(T *dbuff_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads);
template<typename T> void dbuff_add(T *dbuff_a, T par_b, T *dbuff_out, int N, int nblocks, int nthreads);
//vector_sub
template<typename T> void dbuff_sub(T *dbuff_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads);
template<typename T> void dbuff_sub(T *dbuff_a, T par_b, T *dbuff_out, int N, int nblocks, int nthreads);
//vector_mult
template<typename T> void dbuff_mult(T *dbuff_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads);
template<typename T> void dbuff_mult(T *dbuff_a, T par_b, T *dbuff_out, int N, int nblocks, int nthreads);
//vector_div: buffer/buffer, buffer/scalar and scalar/buffer
template<typename T> void dbuff_div(T *dbuff_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads);
template<typename T> void dbuff_div(T *dbuff_a, T par_b, T *dbuff_out, int N, int nblocks, int nthreads);
template<typename T> void dbuff_div(T par_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads);
// Tests //
void test_dbuff_rand_dpr32();
};
#include <amsculib2/amscuarray_dops_impl.hpp>
#endif

View File

@ -0,0 +1,404 @@
#ifndef __AMSCUARRAY_DOPS_IMPL_HPP__
#define __AMSCUARRAY_DOPS_IMPL_HPP__
namespace amscuda
{
//Kernel: partial sums of devbuffer[0..N).
//Each thread walks the buffer in a grid-stride loop, adds up the elements it
//visits and stores its partial sum in rets[global thread index].
//rets must hold one element per launched thread (gridDim.x*blockDim.x).
template<typename T> __global__ void dbuff_sum_kf(T *devbuffer, int N, T *rets)
{
    const int tid = threadIdx.x + blockIdx.x*blockDim.x;
    const int stride = blockDim.x*gridDim.x;
    T partial = (T) 0;
    for(int idx=tid; idx<N; idx=idx+stride)
    {
        partial = partial + devbuffer[idx];
    }
    //written unconditionally, so threads with no elements store zero
    rets[tid] = partial;
}
//Sums the elements of a cuarray that lives in device memory.
//  devptr : device address of the cuarray<T> object
//  returns: the sum, or T() if devptr is NULL or the object could not be read
template<typename T> T devcuarray_sum(cuarray<T> *devptr)
{
    T ret = T();
    if(devptr==NULL)
    {
        return ret;
    }
    //host-side image of the device object; after the copy its data member
    //holds a DEVICE address and must never reach the destructor's delete[]
    cuarray<T> ldptr;
    const cudaError_t err = cudaMemcpy(&ldptr,devptr,sizeof(cuarray<T>),cudaMemcpyDeviceToHost);
    if(err==cudaSuccess)
    {
        //the buffer reduction is dbuff_sum (devbuffer_sum does not exist)
        ret = dbuff_sum(ldptr.data,ldptr.length);
    }
    else
    {
        printf("amscu::devcuarray_sum error: %s\n",cudaGetErrorString(err));
    }
    ldptr.data = NULL;
    ldptr.length=0;
    return ret;
}
//Sums the N elements of the device buffer dbuff.
//The launch configuration is chosen internally (10 blocks of up to 512 threads
//for N>100, a single thread otherwise); every thread produces one partial sum
//and the partial sums are added up on the host.
//  returns: the sum; T() if dbuff is NULL, N<=0 or a CUDA call failed
//           (failures are reported with printf)
template<typename T> T dbuff_sum(T *dbuff, int N)
{
    T ret = T();
    cudaError_t err = cudaSuccess;
    int nblocks = 1;
    int nthreads = 1;
    if(dbuff==NULL || N<=0)
    {
        return ret;
    }
    if(N>100)
    {
        nblocks = 10;
        nthreads = (int)sqrt((float) (N/nblocks));
        if(nthreads<=0) nthreads=1;
        if(nthreads>512) nthreads=512;
    }
    const int npartials = nblocks*nthreads; //one partial sum per launched thread
    T *rets = new T[npartials];
    T *devrets = NULL;
    err = cudaMalloc(&devrets,sizeof(T)*npartials);
    if(err==cudaSuccess)
    {
        dbuff_sum_kf<<<nblocks,nthreads>>>(dbuff,N,devrets);
        err = cudaGetLastError(); //launch errors
        if(err==cudaSuccess)
        {
            err = cudaDeviceSynchronize(); //execution errors
        }
    }
    if(err==cudaSuccess)
    {
        err = cudaMemcpy(rets,devrets,sizeof(T)*npartials,cudaMemcpyDeviceToHost);
    }
    if(err==cudaSuccess)
    {
        //only reduce when rets really holds the device results
        ret = (T)0;
        for(int I=0;I<npartials;I++)
        {
            ret = ret + rets[I];
        }
    }
    else
    {
        printf("amscu::dbuff_sum error: %s\n",cudaGetErrorString(err));
    }
    cudaFree(devrets); devrets = NULL;
    delete[] rets;
    return ret;
}
//Kernel: per-thread minimum and maximum of devbuffer[0..N).
//Each thread walks the buffer in a grid-stride loop and stores the largest and
//smallest element it visited in maxs[global thread index] / mins[global thread index].
//A thread whose index is >= N visits nothing and leaves its two slots untouched.
template<typename T> __global__ void dbuff_minmax_kf(T *devbuffer, int N, T *maxs, T *mins)
{
    const int tid = threadIdx.x + blockIdx.x*blockDim.x;
    const int stride = blockDim.x*gridDim.x;
    if(tid>=N)
    {
        return;
    }
    //seed both extremes with the first element this thread owns
    T hi = devbuffer[tid];
    T lo = devbuffer[tid];
    for(int idx=tid+stride; idx<N; idx=idx+stride)
    {
        if(devbuffer[idx]>hi)
        {
            hi = devbuffer[idx];
        }
        if(devbuffer[idx]<lo)
        {
            lo = devbuffer[idx];
        }
    }
    maxs[tid] = hi;
    mins[tid] = lo;
}
//Finds the smallest and largest of the N elements of the device buffer devbuffer.
//  min, max : host outputs; either may be NULL to skip it
//For NULL devbuffer, N<=0 or a failed CUDA call both outputs are set to T(0)
//(failures are reported with printf).
//The launch configuration is chosen internally: 25 blocks of up to 512 threads
//for N>25, a single thread otherwise. In both cases the number of launched
//threads does not exceed N, so every per-thread slot is written by the kernel.
template<typename T> void dbuff_minmax(T *devbuffer, int N, T *min, T *max)
{
    cudaError_t err = cudaSuccess;
    int nblocks = 1;
    int nthreads = 1;
    int I;
    T *maxs = NULL;
    T *dev_maxs = NULL;
    T *mins = NULL;
    T *dev_mins = NULL;
    T localmax = T(0);
    T localmin = T(0);
    if(devbuffer==NULL || N<=0)
    {
        if(min!=NULL) *min = T(0);
        if(max!=NULL) *max = T(0);
        return;
    }
    if(N>25)
    {
        nblocks = 25;
        nthreads = (int) sqrt((float)(N/nblocks));
        if(nthreads<1) nthreads = 1;
        if(nthreads>512) nthreads = 512;
    }
    const int nslots = nblocks*nthreads; //one max and one min per launched thread
    maxs = new T[nslots];
    mins = new T[nslots];
    //sizes are in bytes: the element size was missing, so the kernel wrote
    //past the end of both device buffers
    err = cudaMalloc(&dev_maxs,sizeof(T)*nslots);
    if(err==cudaSuccess)
    {
        err = cudaMalloc(&dev_mins,sizeof(T)*nslots);
    }
    if(err==cudaSuccess)
    {
        dbuff_minmax_kf<<<nblocks,nthreads>>>(devbuffer,N,dev_maxs,dev_mins);
        err = cudaGetLastError(); //launch errors
        if(err==cudaSuccess)
        {
            err = cudaDeviceSynchronize(); //execution errors
        }
    }
    if(err==cudaSuccess)
    {
        err = cudaMemcpy(maxs,dev_maxs,sizeof(T)*nslots,cudaMemcpyDeviceToHost);
    }
    if(err==cudaSuccess)
    {
        err = cudaMemcpy(mins,dev_mins,sizeof(T)*nslots,cudaMemcpyDeviceToHost);
    }
    if(err==cudaSuccess)
    {
        //reduce the per-thread results on the host
        localmax = maxs[0];
        localmin = mins[0];
        for(I=1;I<nslots;I++)
        {
            if(maxs[I]>localmax) localmax = maxs[I];
            if(mins[I]<localmin) localmin = mins[I];
        }
    }
    else
    {
        printf("amscu::dbuff_minmax error: %s\n",cudaGetErrorString(err));
    }
    if(max!=NULL) *max = localmax;
    if(min!=NULL) *min = localmin;
    cudaFree(dev_maxs); dev_maxs = NULL;
    cudaFree(dev_mins); dev_mins = NULL;
    delete[] maxs; maxs = NULL;
    delete[] mins; mins = NULL;
    return;
}
//Kernel: writes setto into every element of devbuffer[0..N).
//Uses a grid-stride loop, so any launch configuration covers the whole buffer.
template<typename T> __global__ void dbuff_setall_kf(T *devbuffer, int N, T setto)
{
    const int stride = blockDim.x*gridDim.x;
    int idx = threadIdx.x + blockIdx.x*blockDim.x;
    while(idx<N)
    {
        devbuffer[idx] = setto;
        idx = idx + stride;
    }
}
//Sets all N elements of the device buffer devbuffer to setto.
//  nblocks, nthreads : kernel launch dimensions
//Does nothing for a NULL buffer or N<=0. Waits for the kernel to finish and
//prints a message if CUDA reports an error.
template<typename T> void dbuff_setall(T *devbuffer, int N, T setto, int nblocks, int nthreads)
{
    const bool nothing_to_do = (devbuffer==NULL) || (N<=0);
    if(nothing_to_do)
    {
        return;
    }
    dbuff_setall_kf<<<nblocks,nthreads>>>(devbuffer,N,setto);
    cudaDeviceSynchronize();
    const cudaError_t status = cudaGetLastError();
    if(status!=cudaSuccess)
    {
        printf("amscu::dbuff_setall error: %s\n",cudaGetErrorString(status));
    }
}
//Kernel: dbuf_out[i] = fpnt(dbuf_a[i], dbuf_b[i]) for i in [0,N).
//Uses a grid-stride loop, so any launch configuration covers the whole range.
//fpnt is called from device code here.
template<typename T1, typename T2, typename T3> __global__ void dbuff_vectorbinop_kf1(T1 *dbuf_a, T2 *dbuf_b, T3 *dbuf_out, int N, T3 (*fpnt)(T1,T2))
{
    const int tid = threadIdx.x + blockIdx.x*blockDim.x;
    const int stride = blockDim.x*gridDim.x;
    for(int idx=tid; idx<N; idx=idx+stride)
    {
        T1 lhs;
        T2 rhs;
        T3 result;
        lhs = dbuf_a[idx];
        rhs = dbuf_b[idx];
        result = fpnt(lhs,rhs);
        dbuf_out[idx] = result;
    }
}
//Kernel: dbuf_out[i] = fpnt(dbuf_a[i], par_b) for i in [0,N).
//Uses a grid-stride loop, so any launch configuration covers the whole range.
//fpnt is called from device code here.
template<typename T1, typename T2, typename T3> __global__ void dbuff_vectorbinop_kf2(T1 *dbuf_a, T2 par_b, T3 *dbuf_out, int N, T3 (*fpnt)(T1,T2))
{
    const int tid = threadIdx.x + blockIdx.x*blockDim.x;
    const int stride = blockDim.x*gridDim.x;
    for(int idx=tid; idx<N; idx=idx+stride)
    {
        T1 lhs;
        T2 rhs;
        T3 result;
        lhs = dbuf_a[idx];
        rhs = par_b; //same constant for every element
        result = fpnt(lhs,rhs);
        dbuf_out[idx] = result;
    }
}
//Elementwise device-buffer vector binary operation
//takes two input arrays (dbuf_a, dbuf_b) --> one output array:
//  dbuf_out[i] = fpnt(dbuf_a[i], dbuf_b[i]) for i in [0,N)
//  nblocks, nthreads : kernel launch dimensions
//Does nothing if any pointer (including fpnt) is NULL or N<=0. Waits for the
//kernel to finish and prints a message if CUDA reports an error.
//NOTE(review): fpnt is invoked inside the kernel, so it has to be an address
//that is callable from device code; the address of an ordinary host function
//taken in host code is not - confirm how callers obtain the pointer.
template<typename T1, typename T2, typename T3> void dbuff_vectorbinop(T1 *dbuf_a, T2 *dbuf_b, T3 *dbuf_out, int N, T3 (*fpnt)(T1,T2), int nblocks, int nthreads)
{
    cudaError_t err = cudaSuccess;
    if(dbuf_a == NULL || dbuf_b == NULL || dbuf_out == NULL || fpnt == NULL || N<=0)
    {
        return;
    }
    //the kernel takes the operation as its fifth argument; it was missing from the launch
    dbuff_vectorbinop_kf1<<<nblocks,nthreads>>>(dbuf_a,dbuf_b,dbuf_out,N,fpnt);
    cudaDeviceSynchronize();
    err = cudaGetLastError();
    if(err!=cudaSuccess)
    {
        printf("amscu::devbuffer_vectorbinop error: %s\n",cudaGetErrorString(err));
    }
    return;
}
//Elementwise device-buffer vector two-parameter operation
//takes one input array and a constant parameter (dbuf_a, par_b) ---> one output array:
//  dbuf_out[i] = fpnt(dbuf_a[i], par_b) for i in [0,N)
//  nblocks, nthreads : kernel launch dimensions
//Does nothing if any pointer (including fpnt) is NULL or N<=0. Waits for the
//kernel to finish and prints a message if CUDA reports an error.
//NOTE(review): fpnt is invoked inside the kernel, so it has to be an address
//that is callable from device code; the address of an ordinary host function
//taken in host code is not - confirm how callers obtain the pointer.
template<typename T1, typename T2, typename T3> void dbuff_vectorbinop(T1 *dbuf_a, T2 par_b, T3 *dbuf_out, int N, T3 (*fpnt)(T1,T2), int nblocks, int nthreads)
{
    cudaError_t err = cudaSuccess;
    if(dbuf_a == NULL || dbuf_out == NULL || fpnt == NULL || N<=0)
    {
        return;
    }
    //the kernel takes the operation as its fifth argument; it was missing from the launch
    dbuff_vectorbinop_kf2<<<nblocks,nthreads>>>(dbuf_a,par_b,dbuf_out,N,fpnt);
    cudaDeviceSynchronize();
    err = cudaGetLastError();
    if(err!=cudaSuccess)
    {
        printf("amscu::devbuffer_vectorbinop error: %s\n",cudaGetErrorString(err));
    }
    return;
}
//Binary operation handed to dbuff_vectorbinop by dbuff_add: returns a+b.
//NOTE(review): declared without __device__ although the pointer to it is
//called from a kernel - confirm this is usable there.
template<typename T> T dbuff_add_fn(T a, T b)
{
    T result = a+b;
    return result;
}
//Elementwise addition of two device buffers through dbuff_vectorbinop:
//  dbuff_out[i] = dbuff_a[i] + dbuff_b[i] for i in [0,N)
//nblocks / nthreads are the kernel launch dimensions.
template<typename T> void dbuff_add(T *dbuff_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads)
{
    T (*op)(T,T) = &dbuff_add_fn;
    dbuff_vectorbinop(dbuff_a,dbuff_b,dbuff_out,N,op,nblocks,nthreads);
}
//Adds the constant par_b to every element of a device buffer through dbuff_vectorbinop:
//  dbuff_out[i] = dbuff_a[i] + par_b for i in [0,N)
template<typename T> void dbuff_add(T *dbuff_a, T par_b, T *dbuff_out, int N, int nblocks, int nthreads)
{
    T (*op)(T,T) = &dbuff_add_fn;
    dbuff_vectorbinop(dbuff_a,par_b,dbuff_out,N,op,nblocks,nthreads);
}
//Binary operation handed to dbuff_vectorbinop by dbuff_sub: returns a-b.
//NOTE(review): declared without __device__ although the pointer to it is
//called from a kernel - confirm this is usable there.
template<typename T> T dbuff_sub_fn(T a, T b)
{
    T result = a-b;
    return result;
}
//Elementwise subtraction of two device buffers through dbuff_vectorbinop:
//  dbuff_out[i] = dbuff_a[i] - dbuff_b[i] for i in [0,N)
//nblocks / nthreads are the kernel launch dimensions.
template<typename T> void dbuff_sub(T *dbuff_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads)
{
    T (*op)(T,T) = &dbuff_sub_fn;
    dbuff_vectorbinop(dbuff_a,dbuff_b,dbuff_out,N,op,nblocks,nthreads);
}
//Subtracts the constant par_b from every element of a device buffer through dbuff_vectorbinop:
//  dbuff_out[i] = dbuff_a[i] - par_b for i in [0,N)
template<typename T> void dbuff_sub(T *dbuff_a, T par_b, T *dbuff_out, int N, int nblocks, int nthreads)
{
    T (*op)(T,T) = &dbuff_sub_fn;
    dbuff_vectorbinop(dbuff_a,par_b,dbuff_out,N,op,nblocks,nthreads);
}
//Binary operation handed to dbuff_vectorbinop by dbuff_mult: returns a*b.
//NOTE(review): declared without __device__ although the pointer to it is
//called from a kernel - confirm this is usable there.
template<typename T> T dbuff_mult_fn(T a, T b)
{
    T result = a*b;
    return result;
}
//Elementwise multiplication of two device buffers through dbuff_vectorbinop:
//  dbuff_out[i] = dbuff_a[i] * dbuff_b[i] for i in [0,N)
//nblocks / nthreads are the kernel launch dimensions.
template<typename T> void dbuff_mult(T *dbuff_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads)
{
    T (*op)(T,T) = &dbuff_mult_fn;
    dbuff_vectorbinop(dbuff_a,dbuff_b,dbuff_out,N,op,nblocks,nthreads);
}
//Multiplies every element of a device buffer by the constant par_b through dbuff_vectorbinop:
//  dbuff_out[i] = dbuff_a[i] * par_b for i in [0,N)
template<typename T> void dbuff_mult(T *dbuff_a, T par_b, T *dbuff_out, int N, int nblocks, int nthreads)
{
    T (*op)(T,T) = &dbuff_mult_fn;
    dbuff_vectorbinop(dbuff_a,par_b,dbuff_out,N,op,nblocks,nthreads);
}
//Binary operation handed to dbuff_vectorbinop by dbuff_div: returns a/b.
//No check for a zero divisor is made.
//NOTE(review): declared without __device__ although the pointer to it is
//called from a kernel - confirm this is usable there.
template<typename T> T dbuff_div_fn(T a, T b)
{
    T result = a/b;
    return result;
}
//Elementwise division of two device buffers through dbuff_vectorbinop:
//  dbuff_out[i] = dbuff_a[i] / dbuff_b[i] for i in [0,N)
//nblocks / nthreads are the kernel launch dimensions.
template<typename T> void dbuff_div(T *dbuff_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads)
{
    T (*op)(T,T) = &dbuff_div_fn;
    dbuff_vectorbinop(dbuff_a,dbuff_b,dbuff_out,N,op,nblocks,nthreads);
}
//Divides every element of a device buffer by the constant par_b through dbuff_vectorbinop:
//  dbuff_out[i] = dbuff_a[i] / par_b for i in [0,N)
template<typename T> void dbuff_div(T *dbuff_a, T par_b, T *dbuff_out, int N, int nblocks, int nthreads)
{
    T (*op)(T,T) = &dbuff_div_fn;
    dbuff_vectorbinop(dbuff_a,par_b,dbuff_out,N,op,nblocks,nthreads);
}
//Binary operation with the operands swapped: returns b/a.
//Used by the scalar/buffer form of dbuff_div, where the buffer element arrives
//as a and the constant as b. No check for a zero divisor is made.
//NOTE(review): declared without __device__ although the pointer to it is
//called from a kernel - confirm this is usable there.
template<typename T> T dbuff_ldiv_fn(T a, T b)
{
    T result = b/a;
    return result;
}
//Divides the constant par_a by every element of a device buffer through dbuff_vectorbinop:
//  dbuff_out[i] = par_a / dbuff_b[i] for i in [0,N)
//The buffer is passed as the array operand and par_a as the constant, with
//dbuff_ldiv_fn swapping them back into numerator/denominator order.
template<typename T> void dbuff_div(T par_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads)
{
    T (*op)(T,T) = &dbuff_ldiv_fn;
    dbuff_vectorbinop(dbuff_b,par_a,dbuff_out,N,op,nblocks,nthreads);
}
};
#endif

View File

@ -0,0 +1,323 @@
#ifndef __CUARRAY_IMPL_HPP__
#define __CUARRAY_IMPL_HPP__
namespace amscuda
{
// New Version cuarray<T>
// simpler, less going on
//Constructs an empty array: no storage and a length of zero.
template<typename T> __device__ __host__ cuarray<T>::cuarray()
    : length(0), data(NULL)
{
}
//Releases the element storage with delete[] (if any) and leaves the object empty.
//data must therefore never hold a device (cudaMalloc) address at destruction.
template<typename T> __device__ __host__ cuarray<T>::~cuarray()
{
    if(data!=NULL)
    {
        delete[] data;
    }
    data = NULL;
    length = 0;
}
//Changes the number of elements to _length.
//Existing elements are kept up to the smaller of the old and new length; any
//additional elements are set to a value-initialised T. A request for _length<=0
//releases the storage and leaves an empty array.
//  returns: 1  success (including "already that size")
//          -1  allocation failed
template<typename T> __device__ __host__ int cuarray<T>::resize(const int _length)
{
    if(length==_length)
    {
        return 1; //do nothing
    }
    if(_length<=0)
    {
        if(data!=NULL)
        {
            delete[] data;
            data = NULL;
        }
        length = 0;
        //stop here: the original fell through and went on to allocate
        //new T[_length] with a zero or negative size
        return 1;
    }
    T *newbuffer = new T[_length];
    if(newbuffer==NULL)
    {
        return -1; //failed to allocate memory
    }
    //value-initialised filler, so padding elements are never indeterminate
    const T def = T();
    //number of old elements that survive the resize
    int ncopy = 0;
    if(data!=NULL)
    {
        ncopy = (length<_length) ? length : _length;
        if(ncopy<0) ncopy = 0;
    }
    int I;
    for(I=0;I<ncopy;I++)
    {
        newbuffer[I] = data[I];
    }
    for(I=ncopy;I<_length;I++)
    {
        newbuffer[I] = def;
    }
    if(data!=NULL)
    {
        delete[] data;
    }
    data = newbuffer;
    length = _length;
    return 1;
}
//Uploads this array to the device object *dptr.
//  - *dptr is NULL: a device object and buffer are created
//  - the device array has the same length: only the elements are copied
//  - otherwise: the device object is freed and rebuilt with the new length
//  returns: the status of the helper that did the work (1 on success,
//           negative on failure)
template<typename T> __host__ int cuarray<T>::device_send(cuarray<T> **dptr)
{
    int ret = 0;
    if(*dptr==NULL)
    {
        ret = _device_send_overwrite(dptr);
    }
    else
    {
        const int dlength = device_length(*dptr);
        //comparison, not assignment: "dlength=length" took the copy path for
        //every non-zero length, whatever size the device buffer really had
        if(dlength==length)
        {
            ret = _device_send_copy(*dptr);
        }
        else
        {
            ret = _device_send_overwrite(dptr);
        }
    }
    return ret;
}
//Replaces the device object *dptr with a fresh copy of this array.
//Any existing device object is released with device_free, then a cuarray<T>
//object and (when this array has data) an element buffer are allocated on the
//device and filled. On failure every device allocation made here is released
//again and *dptr is left NULL.
//  returns: 1  success
//          -1  could not allocate the device object
//          -2  could not allocate the device element buffer
//          -3  a host-to-device copy failed
//          -4  could not allocate the device object (empty-array case)
template<typename T> __host__ int cuarray<T>::_device_send_overwrite(cuarray<T> **dptr)
{
    int ret = 0;
    //host-side image of the device object: its data member holds a DEVICE
    //address while in use and is reset to NULL before the destructor runs
    cuarray<T> dlocal;
    cudaError_t err = cudaSuccess;
    const bool havedata = (length>=0 && data!=NULL);
    device_free(dptr);
    err = cudaMalloc(dptr,sizeof(cuarray<T>));
    if(err!=cudaSuccess)
    {
        *dptr = NULL;
        ret = havedata ? -1 : -4;
    }
    else if(havedata)
    {
        err = cudaMalloc(&(dlocal.data),sizeof(T)*length);
        if(err!=cudaSuccess)
        {
            dlocal.data = NULL;
            //do not leave an uninitialised device object behind
            cudaFree(*dptr);
            *dptr = NULL;
            ret = -2;
        }
        else
        {
            dlocal.length = length;
            err = cudaMemcpy(*dptr,&dlocal,sizeof(cuarray<T>),cudaMemcpyHostToDevice);
            if(err==cudaSuccess)
            {
                err = cudaMemcpy(dlocal.data,data,sizeof(T)*length,cudaMemcpyHostToDevice);
            }
            if(err==cudaSuccess)
            {
                ret = 1;
            }
            else
            {
                cudaFree(dlocal.data);
                cudaFree(*dptr);
                *dptr = NULL;
                ret = -3;
            }
        }
    }
    else
    {
        //empty array: the device object carries a NULL buffer and length 0
        dlocal.data = NULL;
        dlocal.length = 0;
        err = cudaMemcpy(*dptr,&dlocal,sizeof(cuarray<T>),cudaMemcpyHostToDevice);
        if(err==cudaSuccess)
        {
            ret = 1;
        }
        else
        {
            cudaFree(*dptr);
            *dptr = NULL;
            ret = -3;
        }
    }
    //the device address must not reach dlocal's destructor (it calls delete[])
    dlocal.data = NULL;
    dlocal.length = -1;
    return ret;
}
//Copies this array's elements into the element buffer of the existing device
//object dptr. The device buffer is assumed to hold at least length elements;
//no size check is made here.
//  returns: 1 on success, -1 if cudaMemcpy failed
template<typename T> __host__ int cuarray<T>::_device_send_copy(cuarray<T> *dptr)
{
    //device address of the element buffer, read back from the device object
    T* const devdata = device_data_ptr(dptr);
    const cudaError_t status = cudaMemcpy(devdata,data,sizeof(T)*length,cudaMemcpyHostToDevice);
    return (status==cudaSuccess) ? 1 : -1;
}
//Downloads the device array dptr into this object.
//This array is resized to the device array's length when they differ, then
//the elements are copied from the device element buffer.
//  returns: 1  elements copied
//           0  nothing copied (empty array or no device buffer)
//          -1  dptr is NULL
//          -2  cudaMemcpy failed
template<typename T> __host__ int cuarray<T>::device_pull(cuarray<T> *dptr)
{
    int ret = 0;
    if(dptr==NULL)
    {
        return -1; // null d pointer
    }
    const int dlength = device_length(dptr);
    if(dlength!=length)
    {
        this->resize(dlength);
    }
    T* const ddata = device_data_ptr(dptr);
    if(length>0 && data!=NULL && ddata!=NULL)
    {
        //copy from the element buffer (ddata); the original copied from dptr,
        //i.e. the raw bytes of the device cuarray object itself
        const cudaError_t err = cudaMemcpy(data,ddata,length*sizeof(T),cudaMemcpyDeviceToHost);
        ret = (err==cudaSuccess) ? 1 : -2;
    }
    return ret;
}
//Releases the device object *dptr together with its element buffer and sets
//*dptr to NULL.
//  returns: 1 if a device object was released, 0 if there was nothing to free
template<typename T> __host__ int cuarray<T>::device_free(cuarray<T> **dptr)
{
    int ret = 0;
    //host-side image of the device object; its data member holds a DEVICE
    //address after the copy and is reset before the destructor runs
    cuarray<T> dlocal;
    if(dptr!=NULL && *dptr!=NULL)
    {
        //read the object *dptr points to; the original passed dptr, which is
        //the host address of the pointer variable, not a device address
        const cudaError_t err = cudaMemcpy(&dlocal,*dptr,sizeof(cuarray<T>),cudaMemcpyDeviceToHost);
        if(err==cudaSuccess && dlocal.data!=NULL)
        {
            cudaFree(dlocal.data);
        }
        dlocal.data = NULL;
        cudaFree(*dptr);
        *dptr = NULL;
        ret = 1;
    }
    dlocal.data = NULL;
    dlocal.length = -1;
    return ret;
}
//Reads the length field of the device object dptr.
//  returns: the device array's length, or -1 if dptr is NULL
//The result of the copy is not checked; if it fails the value of a freshly
//constructed (empty) object is what gets returned.
template<typename T> __host__ int cuarray<T>::device_length(cuarray<T> *dptr)
{
    cuarray<T> mirror; //host-side image of the device object
    if(dptr==NULL)
    {
        return -1;
    }
    cudaMemcpy(&mirror,dptr,sizeof(cuarray<T>),cudaMemcpyDeviceToHost);
    const int devlen = mirror.length;
    //the image holds a device address: clear it before the destructor runs
    mirror.data = NULL;
    mirror.length = -1;
    return devlen;
}
//Reads the data pointer stored in the device object dptr.
//  returns: the device address of the element buffer, or NULL if dptr is NULL
//The returned pointer refers to device memory and must not be dereferenced on the host.
template<typename T> __host__ T* cuarray<T>::device_data_ptr(cuarray<T> *dptr)
{
    cuarray<T> mirror; //host-side image of the device object
    if(dptr==NULL)
    {
        return NULL;
    }
    cudaMemcpy(&mirror,dptr,sizeof(cuarray<T>),cudaMemcpyDeviceToHost);
    T* const devdata = mirror.data;
    //the image holds a device address: clear it before the destructor runs
    mirror.data = NULL;
    mirror.length = -1;
    return devdata;
}
//Returns the number of elements (the length member).
template<typename T> __device__ __host__ int cuarray<T>::size() const
{
    const int count = length;
    return count;
}
//Returns a reference to element I. No bounds checking is performed.
template<typename T> __device__ __host__ T& cuarray<T>::at(const int I)
{
    return *(data + I);
}
//Read-only access to element I. No bounds checking is performed.
template<typename T> __device__ __host__ const T& cuarray<T>::at(const int I) const
{
    return *(data + I);
}
//Returns a reference to element I. No bounds checking is performed.
template<typename T> __device__ __host__ T& cuarray<T>::operator[](const int I)
{
    return *(data + I);
}
//Read-only access to element I. No bounds checking is performed.
template<typename T> __device__ __host__ const T& cuarray<T>::operator[](const int I) const
{
    return *(data + I);
}
};
#endif

View File

@ -0,0 +1,19 @@
#ifndef __AMSCUDA_BINARRRW_HPP__
#define __AMSCUDA_BINARRRW_HPP__
namespace amscuda
{
//Binary array file format used by these functions:
//  int32 Nd (number of dimensions), Nd x int32 dimension sizes, then the elements.
//reads one n-dimensional array: dimension sizes go to *shape, elements to *buffer
template<typename T> int fread_ndarray(FILE *fp, cuarray<int32_t> *shape, cuarray<T> *buffer);
//writes one n-dimensional array; the product of the shape entries must equal buffer->size()
template<typename T> int fwrite_ndarray(FILE *fp, const cuarray<int32_t> *shape, const cuarray<T> *buffer);
//writes N elements of buffer as a one-dimensional array
template<typename T> int fwrite_buffer(FILE *fp, const int N, const T *buffer);
//reads an array and stores at most Nmax elements in buffer
//NOTE(review): buffer is the destination of the read yet is declared const T* - confirm intent
template<typename T> int fread_buffer(FILE *fp, const int Nmax, const T *buffer);
}; //end namespace amscuda
#include <amsculib2/amscuda_binarrrw_impl.hpp>
#endif

View File

@ -0,0 +1,194 @@
#ifndef __AMSCUDA_BINARRRW_IMPL_HPP__
#define __AMSCUDA_BINARRRW_IMPL_HPP__
namespace amscuda
{
//Reads one n-dimensional array from fp.
//File layout: int32 Nd, Nd x int32 dimension sizes, then the elements.
//  shape  : resized to Nd and filled with the dimension sizes
//  buffer : resized to the product of the dimensions (0 if any dimension is <=0)
//           and filled with the elements
//  returns: 1  array read (also when the file holds Nd<=0; both arrays are then emptied)
//           0  fp is NULL, the stream is at end of file, or a read came up short
template<typename T> int fread_ndarray(FILE *fp, cuarray<int32_t> *shape, cuarray<T> *buffer)
{
    int I;
    int cnt;
    long piprod = 1;
    int32_t q = 0;
    int32_t Nd = 0;
    if(fp==NULL)
    {
        //this message used to be printed on the end-of-file branch instead
        printf("fread_ndarray: fp=NULL.\n");
        return 0;
    }
    if(feof(fp))
    {
        return 0;
    }
    //check the read count before trusting Nd
    cnt = (int) fread(&Nd,sizeof(int32_t),1,fp);
    if(cnt<1 || Nd<=0)
    {
        printf("fread_ndarray: Read a number of dimensions<=0.\n");
        shape->resize(0);
        buffer->resize(0);
        //a header that could not be read at all is a failure, not an empty array
        return (cnt<1) ? 0 : 1;
    }
    shape->resize(Nd);
    for(I=0;I<Nd;I++)
    {
        cnt = (int) fread(&q,sizeof(int32_t),1,fp);
        if(cnt<1)
        {
            printf("fread_ndarray: could not read dimension %d\n",I);
            shape->resize(0);
            buffer->resize(0);
            return 0;
        }
        shape->at(I) = q;
        //any non-positive dimension makes the whole array empty
        piprod = (q>0) ? piprod*q : 0;
    }
    //cuarray lengths are int: refuse element counts that do not fit
    if((long)((int)piprod)!=piprod)
    {
        printf("fread_ndarray: element count %ld is too large\n",piprod);
        buffer->resize(0);
        return 0;
    }
    buffer->resize((int)piprod);
    if(piprod>0)
    {
        cnt = (int) fread((buffer->data),sizeof(T),piprod,fp);
        if(piprod!=cnt)
        {
            printf("fread_ndarray, read %d values, expecting %ld\n",cnt,piprod);
            return 0;
        }
    }
    return 1;
}
//Writes one n-dimensional array to fp.
//File layout: int32 Nd, Nd x int32 dimension sizes, then the elements.
//The product of the shape entries (0 if any entry is <=0) must equal
//buffer->size(); otherwise nothing is written.
//  returns: 1 after writing, 0 if fp is NULL or shape and buffer disagree
//The results of the fwrite calls are not checked.
template<typename T> int fwrite_ndarray(FILE *fp, const cuarray<int32_t> *shape, const cuarray<T> *buffer)
{
    if(fp==NULL)
    {
        printf("fwrite_ndarray: fp=NULL\n");
        return 0;
    }
    //element count implied by the shape
    long piprod = 1;
    for(int dim=0; dim<shape->size(); dim++)
    {
        const int32_t extent = shape->at(dim);
        piprod = (extent>0) ? piprod*extent : 0;
    }
    const int32_t Nd = (int32_t) shape->size();
    if(piprod!=buffer->size())
    {
        printf("fwrite_ndarray: buffer is size %ld, while shape is size %ld\n",(long)buffer->size(),(long)piprod);
        return 0;
    }
    fwrite(&Nd,sizeof(int32_t),1,fp);
    if(Nd>0)
    {
        fwrite(shape->data,sizeof(int32_t),Nd,fp);
        if(piprod>0)
        {
            fwrite(buffer->data,sizeof(T),buffer->size(),fp);
        }
    }
    return 1;
}
//Writes N elements of buffer to fp as a one-dimensional array.
//File layout: int32 1 (number of dimensions), int32 N, then the N elements.
//  returns: 1  everything was written
//           0  fp is NULL, N is negative, buffer is NULL with N>0, or a write came up short
template<typename T> int fwrite_buffer(FILE *fp, const int N, const T *buffer)
{
    //fixed-width header fields, matching the sizeof(int32_t) passed to fwrite
    const int32_t Nd = 1;
    const int32_t len = (int32_t) N;
    if(fp==NULL)
    {
        printf("fwrite_buffer: fp=NULL\n");
        return 0;
    }
    if(N<0 || (N>0 && buffer==NULL))
    {
        return 0;
    }
    if(fwrite(&Nd,sizeof(int32_t),1,fp)!=1)
    {
        return 0;
    }
    if(fwrite(&len,sizeof(int32_t),1,fp)!=1)
    {
        return 0;
    }
    if(N>0 && fwrite(buffer,sizeof(T),(size_t)N,fp)!=(size_t)N)
    {
        return 0;
    }
    //success is reported as 1, like fwrite_ndarray; the original returned 0 on every path
    return 1;
}
//Reads an array from fp and stores at most Nmax of its elements in buffer.
//File layout: int32 Nd, Nd x int32 dimension sizes, then the elements.
//The element count is the product of the dimensions (0 if any is <=0); when it
//exceeds Nmax only the first Nmax elements are read and the remainder is left
//unread in the stream.
//  returns: 1  the expected number of elements was read (or the array is empty)
//           0  header/dimensions unreadable, Nd<=0, no room (Nmax<=0),
//              buffer is NULL, or the element read came up short
//          -1  fp is NULL
//          -2  the stream is already at end of file
//NOTE(review): buffer is the destination of the read although the signature
//declares it const T*; the qualifier is cast away here to keep the declared
//interface. The pointed-to storage must be writable.
template<typename T> int fread_buffer(FILE *fp, const int Nmax, const T *buffer)
{
    int cnt;
    int I;
    int Nr;
    long piprod = 1;
    int32_t Nd = 0;
    int32_t q = 0;
    if(fp==NULL) {return -1;}
    if(feof(fp)) {return -2;}
    //check the read count before trusting Nd
    cnt = (int) fread(&Nd,sizeof(int32_t),1,fp);
    if(cnt<1 || Nd<=0)
    {
        return 0;
    }
    for(I=0;I<Nd;I++)
    {
        cnt = (int) fread(&q,sizeof(int32_t),1,fp);
        if(cnt<1)
        {
            return 0;
        }
        //only the product is needed, so the sizes are not stored
        piprod = (q>0) ? piprod*q : 0;
    }
    if(piprod==0)
    {
        return 1; //empty array: nothing to read
    }
    Nr = (piprod<(long)Nmax) ? (int)piprod : Nmax;
    if(Nr<=0 || buffer==NULL)
    {
        return 0;
    }
    cnt = (int) fread(const_cast<T*>(buffer),sizeof(T),(size_t)Nr,fp);
    if(cnt!=Nr)
    {
        printf("fread_buffer, read %d values, expecting %d\n",cnt,Nr);
        return 0;
    }
    return 1;
}
}; //end namespace amscuda
#endif

View File

@ -0,0 +1,11 @@
#ifndef __AMSCUGEOM_HPP__
#define __AMSCUGEOM_HPP__
namespace amscuda
{
}; //end namespace amscuda
#endif

View File

@ -0,0 +1,70 @@
#ifndef __AMSCULIB2_HPP__
#define __AMSCULIB2_HPP__
//Std Lib Includes
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <stdint.h>
#include <time.h>
#include <new>
#include <cuda_runtime_api.h> //where all the cuda functions live
#include <cuda_runtime.h>
#include <cuda.h>
//Dependencies
//Predeclarations
//Forward declarations of the vector classes. They are declared inside
//namespace amscuda because that is where the component headers included
//below define them; declaring them at global scope would instead introduce
//unrelated incomplete types (::cuvect2 etc.) that make the names ambiguous
//after "using namespace amscuda".
namespace amscuda
{
    class cuvect2;
    class cuvect3;
    class cuvect4;
    class cuvect2f;
    class cuvect3f;
    class cuvect4f;
}
//Need a way to define the same symbols using both host and device code
//A solution was found here: https://stackoverflow.com/questions/9457572/cuda-host-and-device-using-same-constant-memory
//AMSCU_CONST expands to __constant__ when __CUDA_ARCH__ is defined and to
//nothing otherwise, so a declaration prefixed with it is spelled the same way
//in both cases.
#ifdef __CUDA_ARCH__
#define AMSCU_CONST __constant__
#else
#define AMSCU_CONST
#endif
namespace amscuda
{
//default thread and block execution
//(library-wide default launch sizes: 256 blocks, 512 threads)
AMSCU_CONST static const int amscu_defnblocks = 256;
AMSCU_CONST static const int amscu_defnthreads = 512;
//default numthreads to execute on cpu
AMSCU_CONST static const int amscu_defcputhreads = 8;
}; //end namespace amscuda
//Components
#include <amsculib2/amscu_cudafunctions.hpp>
#include <amsculib2/amscumath.hpp>
#include <amsculib2/amscu_comp64.hpp>
#include <amsculib2/amscu_comp128.hpp>
#include <amsculib2/cuvect2.hpp>
#include <amsculib2/cuvect3.hpp>
#include <amsculib2/cuvect4.hpp>
#include <amsculib2/cuvect2f.hpp>
#include <amsculib2/cuvect3f.hpp>
#include <amsculib2/cuvect4f.hpp>
#include <amsculib2/amscugeom.hpp>
#include <amsculib2/amscuarray.hpp>
#include <amsculib2/amscuda_binarrrw.hpp>
#include <amsculib2/amscu_random.hpp>
#include <amsculib2/amscuarray_dops.hpp>
#include <amsculib2/amscurarray.cuh>
#endif

View File

@ -0,0 +1,56 @@
#ifndef __AMSCUMATH_HPP__
#define __AMSCUMATH_HPP__
namespace amscuda
{
//Problem: These are not in the namespace
//#define nan NAN
//#define fnan (float) NAN
//#define inf INFINITY
//#define finf (float) INFINITY
//#define pi 3.1415926535897936
//These need to be the same symbol for both host and device code
//Not-a-number and infinity in double and single precision.
AMSCU_CONST static const double nan = NAN;
AMSCU_CONST static const float fnan = (float) NAN;
AMSCU_CONST static const double inf = INFINITY;
AMSCU_CONST static const float finf = (float) INFINITY;
//pi is written with more digits than a double holds so that the literal
//rounds to the double nearest to pi. The previous literal 3.1415926535897936
//rounded to the next double above it (one ULP too large).
AMSCU_CONST static const double pi = 3.14159265358979323846;
AMSCU_CONST static const float pif = 3.14159265358979323846f;
//Double- and single-precision scalar helpers, callable from host and device.
//Only the declarations are here; presumably these return the absolute value
//of x - confirm against the implementation file.
__host__ __device__ double dabs(double x);
__host__ __device__ float fabs(float x);
//Generic absolute value: yields -in when in compares less than zero, and in
//itself otherwise. T must support comparison with 0 and unary minus.
template<typename T> __host__ __device__ T abs(const T in)
{
    if(in<0)
    {
        return -in;
    }
    return in;
}
//Modulus overloads for double, float, int and long operands.
//NOTE(review): the sign convention for negative operands is decided by the
//implementation, which is not in this header - confirm before relying on it.
__host__ __device__ double mod(double a, double md);
__host__ __device__ float mod(float a, float md);
__host__ __device__ int mod(int x, int n);
__host__ __device__ long mod(long x, long n);
//Integer division helpers.
//NOTE(review): how these differ from the built-in / operator (e.g. rounding
//direction for negative operands) is not visible here - confirm.
__host__ __device__ int truediv(int x, int y);
__host__ __device__ long truediv(long x, long y);
//Generic minimum / maximum; the primary templates are defined in
//amscumath_impl.hpp, which is included at the end of this header.
template<typename T> __host__ __device__ T min(T a, T b);
template<typename T> __host__ __device__ T max(T a, T b);
//Angle helpers: arg takes a 2D point, get_azel takes a 3D point and writes
//two results through az and el.
//NOTE(review): presumably radians, with az/el meaning azimuth/elevation -
//units and angle ranges are not visible here, confirm in the implementation.
__device__ __host__ double arg(double x, double y);
__device__ __host__ void get_azel(double x, double y, double z, double *az, double *el);
//Host-only test routine for this module (no execution-space qualifier).
void test_amscumath1();
}; //end namespace amscuda
#include <amsculib2/amscumath_impl.hpp>
#endif

View File

@ -0,0 +1,42 @@
#ifndef __AMSCUMATH_IMPL_HPP__
#define __AMSCUMATH_IMPL_HPP__
namespace amscuda
{
//Generic minimum: returns b when a>b and a otherwise, so a is returned for
//ties and whenever the comparison is false.
template<typename T> __host__ __device__ T min(T a, T b)
{
    return (a>b) ? b : a;
}
//Generic maximum: returns a when a>b and b otherwise, so b is returned for
//ties and whenever the comparison is false.
template<typename T> __host__ __device__ T max(T a, T b)
{
    return (a>b) ? a : b;
}
//Explicit specializations of min/max for double and float. Only declared
//here; their definitions live outside this header.
//NOTE(review): presumably they differ from the generic templates in how NaN
//is handled - not visible here, confirm in the implementation file.
template<> __host__ __device__ double min(double a, double b);
template<> __host__ __device__ float min(float a, float b);
template<> __host__ __device__ double max(double a, double b);
template<> __host__ __device__ float max(float a, float b);
}; //end namespace amscuda
#endif

View File

@ -0,0 +1,66 @@
#ifndef __AMSCURARRAY_HPP__
#define __AMSCURARRAY_HPP__
namespace amscuda
{
//Cuda ragged array class
//Holds Narrays independent arrays of T, each with its own length N[i], on the
//host, together with a mirror of the same structure in device memory.
//Objects are created and destroyed with curarray_new / curarray_delete; the
//member layout is copied to and from the device as raw bytes, so it must not
//be reordered.
template<typename T> class curarray
{
public:
int device; //CUDA device the mirror lives on; -1 when unset
curarray* devptr; //pointer to mirror class on the device
int Narrays; //number of arrays
int *N; //dimension of each array
T** hostarrayptrs; //pointers to each array on the host - null on the device
T** devarrayptrs; //pointers to each array on the device
//the double pointer is a host pointer to device pointers on the host class
//for the device class, only the second set of arrays is in use
//the constructor and destructor set all pointers to NULL, they
// do *not* manage memory. This is done with curarray_new and curarray_delete
__device__ __host__ curarray();
__device__ __host__ ~curarray();
//push/pull forward to curarray_push / curarray_pull on this object
__host__ int push();
__host__ int pull();
//__device__ int dev_resizearray(int arraynum, int arraysize);
//resizes host array number arraynum to arraysize elements (host side only)
__host__ int resizearray(int arraynum, int arraysize);
// I may want a way to resize arrays on the device without pushing/pulling all the array contents
};
//Lifetime management of the host object (and, through it, the device mirror).
//*ptr receives / gives up the heap-allocated curarray.
template<typename T> int curarray_new(curarray<T>** ptr, int Narrays);
template<typename T> int curarray_delete(curarray<T>** ptr);
//Creation / destruction of the device mirror of an existing host object.
template<typename T> int curarray_device_new(curarray<T> *hostptr);
template<typename T> int curarray_device_delete(curarray<T> *hostptr);
//push copies the host arrays to the device, pull copies the device arrays
//back to the host.
template<typename T> int curarray_push(curarray<T> *hostptr);
template<typename T> int curarray_pull(curarray<T> *hostptr);
//template<typename T> int curarray_host_fillall(curarray<T> *hostptr, const T &val);
//template<typename T> int curarray_device_fillall(curarray<T> *hostptr, const T &val);
//template<typename T> __host__ int curarray_deletearray(curarray<T> *hostptr, int arrayindex);
//template<typename T> __device__ int curarray_dev_deletearray(curarray<T> *devptr, int arrayindex);
//template<typename T> __host__ int curarray_allocarray(curarray<T> *hostptr, int arrayindex, int size);
//template<typename T> __device__ int curarray_dev_allocarray(curarray<T> *devptr, int arrayindex, int size);
//Host-only test routine for this module.
void test_amscurarray1();
};
#include <amsculib2/amscurarray_impl.cuh>
#endif

View File

@ -0,0 +1,529 @@
#ifndef __AMSCURARRAY_IMPL_HPP__
#define __AMSCURARRAY_IMPL_HPP__
namespace amscuda
{
//Default constructor: puts every member in its empty state (device -1, zero
//arrays, all pointers NULL). It allocates nothing.
//The execution-space qualifiers repeat those on the in-class declaration so
//that declaration and definition agree.
template<typename T> __device__ __host__ curarray<T>::curarray()
    : device(-1),
      devptr(NULL),
      Narrays(0),
      N(NULL),
      hostarrayptrs(NULL),
      devarrayptrs(NULL)
{
}
//Destructor: only resets the members to their empty state. It deliberately
//frees nothing, which is what allows temporary host-side copies of the device
//structure to go out of scope without releasing the memory they point to.
//The execution-space qualifiers repeat those on the in-class declaration so
//that declaration and definition agree.
template<typename T> __device__ __host__ curarray<T>::~curarray()
{
    device = -1;
    devptr = NULL;
    Narrays = 0;
    N = NULL;
    hostarrayptrs = NULL;
    devarrayptrs = NULL;
}
//Creates a new host-side curarray with Narrays empty arrays and builds its
//device mirror on the current CUDA device.
//  ptr     - in/out: an existing object in *ptr is deleted first; on return
//            *ptr holds the new object, or NULL if allocation failed
//  Narrays - number of arrays; negative values are treated as 0
//Returns 0 on success and -1 when ptr is NULL or a host allocation fails.
template<typename T> int curarray_new(curarray<T>** ptr, int Narrays)
{
    if(ptr==NULL)
    {
        return -1;
    }

    int device = -1;
    cudaGetDevice(&device);

    if(*ptr!=NULL)
    {
        curarray_delete(ptr); //leaves *ptr == NULL
    }
    if(Narrays<0) Narrays=0;

    curarray<T> *fresh = new(std::nothrow) curarray<T>();
    if(fresh==NULL)
    {
        return -1;
    }
    fresh->device = device;
    fresh->N = new(std::nothrow) int[Narrays];
    fresh->hostarrayptrs = new(std::nothrow) T*[Narrays];
    fresh->devarrayptrs = new(std::nothrow) T*[Narrays];
    if(fresh->N==NULL || fresh->hostarrayptrs==NULL || fresh->devarrayptrs==NULL)
    {
        //roll back whatever was allocated; the destructor frees nothing itself
        delete[] fresh->N;
        delete[] fresh->hostarrayptrs;
        delete[] fresh->devarrayptrs;
        delete fresh;
        *ptr = NULL;
        return -1;
    }

    fresh->Narrays = Narrays;
    for(int I=0;I<Narrays;I++)
    {
        fresh->N[I] = 0;
        fresh->hostarrayptrs[I] = NULL;
        fresh->devarrayptrs[I] = NULL;
    }

    *ptr = fresh;
    curarray_device_new(fresh);
    return 0;
}
//Destroys a curarray created by curarray_new: releases the device mirror,
//every host array, the bookkeeping tables and the object itself, then sets
//*ptr to NULL. The CUDA device that was current on entry is restored.
//Returns 0 (also when ptr or *ptr is NULL, in which case nothing is done).
template<typename T> int curarray_delete(curarray<T>** ptr)
{
    if(ptr==NULL || *ptr==NULL)
    {
        return 0;
    }
    curarray<T> *victim = *ptr;

    int olddev = -1;
    const bool haveolddev = (cudaGetDevice(&olddev)==cudaSuccess);
    if(victim->device>=0)
    {
        cudaSetDevice(victim->device);
    }

    if(victim->devptr!=NULL)
    {
        curarray_device_delete(victim);
    }
    victim->device = -1;

    if(victim->hostarrayptrs!=NULL)
    {
        for(int I=0;I<victim->Narrays;I++)
        {
            delete[] victim->hostarrayptrs[I];
            victim->hostarrayptrs[I] = NULL;
        }
    }
    //erasing device memory should have been handled in curarray_device_delete;
    //the host-side table of device pointers is simply released below

    delete[] victim->N;
    victim->N = NULL;
    delete[] victim->hostarrayptrs;
    victim->hostarrayptrs = NULL;
    delete[] victim->devarrayptrs;
    victim->devarrayptrs = NULL;

    delete victim;
    *ptr = NULL;

    if(haveolddev)
    {
        cudaSetDevice(olddev);
    }
    return 0;
}
//Builds the device mirror of a host curarray on the current CUDA device:
//allocates the device copy of N, one device buffer per non-empty array
//(filled from the matching host array), the device table of array pointers
//and finally the device-resident curarray structure itself.
//An existing mirror is deleted first.
//Returns 1 on success and -1 when hostptr is NULL, the host tables are
//missing, or any CUDA allocation/copy fails. On failure everything allocated
//here is released again and hostptr->devptr is left NULL.
template<typename T> int curarray_device_new(curarray<T> *hostptr)
{
    if(hostptr==NULL) return -1;
    if(hostptr->devptr!=NULL)
    {
        curarray_device_delete(hostptr);
    }
    cudaGetDevice(&(hostptr->device));

    const int Narrays = hostptr->Narrays;
    if(Narrays<0) return -1;
    if(Narrays>0 && (hostptr->N==NULL || hostptr->hostarrayptrs==NULL || hostptr->devarrayptrs==NULL))
    {
        return -1;
    }

    //host-side staging copy of the structure that will live on the device
    curarray<T> mirror;
    mirror.device = hostptr->device;
    mirror.Narrays = Narrays;
    mirror.hostarrayptrs = NULL;

    cudaError_t err = cudaSuccess;
    int I;

    //drop any device buffers still recorded on the host side
    for(I=0;I<Narrays;I++)
    {
        if(hostptr->devarrayptrs[I]!=NULL)
        {
            cudaFree(hostptr->devarrayptrs[I]);
            hostptr->devarrayptrs[I] = NULL;
        }
    }

    if(Narrays>0)
    {
        err = cudaMalloc(&(mirror.N),sizeof(int)*Narrays);
        if(err!=cudaSuccess)
        {
            mirror.N = NULL;
        }
        else
        {
            err = cudaMemcpy(mirror.N,hostptr->N,sizeof(int)*Narrays,cudaMemcpyHostToDevice);
        }
    }

    for(I=0;I<Narrays && err==cudaSuccess;I++)
    {
        const int len = hostptr->N[I];
        if(len<=0) continue;
        err = cudaMalloc(&(hostptr->devarrayptrs[I]),sizeof(T)*len);
        if(err!=cudaSuccess)
        {
            hostptr->devarrayptrs[I] = NULL;
            break;
        }
        if(hostptr->hostarrayptrs[I]!=NULL)
        {
            err = cudaMemcpy(hostptr->devarrayptrs[I],hostptr->hostarrayptrs[I],sizeof(T)*len,cudaMemcpyHostToDevice);
        }
    }

    if(err==cudaSuccess && Narrays>0)
    {
        err = cudaMalloc(&(mirror.devarrayptrs),sizeof(T*)*Narrays);
        if(err!=cudaSuccess)
        {
            mirror.devarrayptrs = NULL;
        }
        else
        {
            err = cudaMemcpy(mirror.devarrayptrs,hostptr->devarrayptrs,sizeof(T*)*Narrays,cudaMemcpyHostToDevice);
        }
    }

    if(err==cudaSuccess)
    {
        err = cudaMalloc(&(hostptr->devptr),sizeof(curarray<T>));
        if(err!=cudaSuccess)
        {
            hostptr->devptr = NULL;
        }
        else
        {
            err = cudaMemcpy(hostptr->devptr,&mirror,sizeof(curarray<T>),cudaMemcpyHostToDevice);
        }
    }

    if(err!=cudaSuccess)
    {
        //undo the partial construction so nothing leaks and no dangling
        //device pointer remains visible through hostptr
        for(I=0;I<Narrays;I++)
        {
            if(hostptr->devarrayptrs[I]!=NULL)
            {
                cudaFree(hostptr->devarrayptrs[I]);
                hostptr->devarrayptrs[I] = NULL;
            }
        }
        cudaFree(mirror.devarrayptrs);
        cudaFree(mirror.N);
        if(hostptr->devptr!=NULL)
        {
            cudaFree(hostptr->devptr);
            hostptr->devptr = NULL;
        }
        return -1;
    }
    return 1;
}
//Releases the device mirror of a host curarray: every per-array device
//buffer, the device copies of the N and pointer tables, and the
//device-resident structure itself. Afterwards hostptr->devptr is NULL and
//hostptr->device is -1. The CUDA device current on entry is restored.
//Returns 1 when a mirror was deleted, 0 when there was none, and -1 when
//hostptr is NULL.
template<typename T> int curarray_device_delete(curarray<T> *hostptr)
{
    if(hostptr==NULL)
    {
        return -1;
    }
    if(hostptr->devptr==NULL)
    {
        return 0;
    }

    int olddev;
    cudaGetDevice(&olddev);
    cudaSetDevice(hostptr->device);

    //fetch the device structure to learn where its tables live
    curarray<T> mirror;
    const cudaError_t fetched = cudaMemcpy(&mirror,hostptr->devptr,sizeof(curarray<T>),cudaMemcpyDeviceToHost);

    if(hostptr->devarrayptrs!=NULL)
    {
        for(int I=0;I<hostptr->Narrays;I++)
        {
            if(hostptr->devarrayptrs[I]!=NULL)
            {
                cudaFree(hostptr->devarrayptrs[I]);
                hostptr->devarrayptrs[I] = NULL;
            }
        }
    }

    //only free the table pointers if they were actually read back
    if(fetched==cudaSuccess)
    {
        cudaFree(mirror.devarrayptrs);
        cudaFree(mirror.N);
    }
    cudaFree(hostptr->devptr);
    hostptr->devptr = NULL;
    hostptr->device = -1;

    cudaSetDevice(olddev);
    return 1;
}
//Copies the host arrays to the device mirror.
//For every array whose size or device pointer on the host differs from what
//the device structure records, the device buffer is freed and re-allocated
//with the host size; then the host contents are copied over. Finally the
//device copies of N, the pointer table and the structure are refreshed.
//Returns 0 on success and -1 when hostptr is NULL, there is no device mirror,
//a host allocation fails, or the device tables cannot be read or a device
//buffer cannot be allocated.
template<typename T> int curarray_push(curarray<T> *hostptr)
{
    if(hostptr==NULL) return -1;
    if(hostptr->devptr==NULL) return -1; //no mirror to push to
    const int Narrays = hostptr->Narrays;
    if(Narrays<0) return -1;
    if(Narrays>0 && (hostptr->N==NULL || hostptr->hostarrayptrs==NULL || hostptr->devarrayptrs==NULL))
    {
        return -1;
    }

    int olddev;
    cudaGetDevice(&olddev);
    cudaSetDevice(hostptr->device);

    int ret = 0;
    curarray<T> mirror;
    T **devtable = new(std::nothrow) T*[Narrays];
    int *devN = new(std::nothrow) int[Narrays];

    //The shadow tables must be completely filled before they are used: they
    //decide which device pointers get freed below.
    bool ok = (devtable!=NULL && devN!=NULL);
    if(ok)
    {
        ok = (cudaMemcpy(&mirror,hostptr->devptr,sizeof(curarray<T>),cudaMemcpyDeviceToHost)==cudaSuccess);
    }
    if(ok && Narrays>0)
    {
        ok = (cudaMemcpy(devtable,mirror.devarrayptrs,sizeof(T*)*Narrays,cudaMemcpyDeviceToHost)==cudaSuccess)
          && (cudaMemcpy(devN,mirror.N,sizeof(int)*Narrays,cudaMemcpyDeviceToHost)==cudaSuccess);
    }

    if(ok)
    {
        for(int I=0;I<Narrays;I++)
        {
            //check to see that host size is the same as device size, and that
            //the host device pointer is the same as the device device pointer
            if( (hostptr->N[I]!=devN[I]) ||
                (hostptr->devarrayptrs[I] != devtable[I]) )
            {
                cudaFree(devtable[I]);
                devtable[I] = NULL;
                hostptr->devarrayptrs[I] = NULL;
                devN[I] = 0;
                if(hostptr->N[I]>0)
                {
                    if(cudaMalloc(&(hostptr->devarrayptrs[I]),sizeof(T)*hostptr->N[I])==cudaSuccess)
                    {
                        devtable[I] = hostptr->devarrayptrs[I];
                        devN[I] = hostptr->N[I];
                    }
                    else
                    {
                        hostptr->devarrayptrs[I] = NULL;
                        ret = -1;
                    }
                }
            }
            if(hostptr->N[I]>0 && hostptr->devarrayptrs[I]!=NULL && hostptr->hostarrayptrs[I]!=NULL)
            {
                //copy host data to device
                cudaMemcpy(hostptr->devarrayptrs[I],hostptr->hostarrayptrs[I],sizeof(T)*hostptr->N[I],cudaMemcpyHostToDevice);
            }
        } //for each array

        //rectify and copy device data structure to device
        mirror.device = hostptr->device;
        mirror.devptr = NULL;
        mirror.Narrays = hostptr->Narrays; //later - logic for dealing with when this is not true
        mirror.hostarrayptrs = NULL;
        if(Narrays>0)
        {
            cudaMemcpy(mirror.N,hostptr->N,sizeof(int)*Narrays,cudaMemcpyHostToDevice);
            cudaMemcpy(mirror.devarrayptrs,hostptr->devarrayptrs,sizeof(T*)*Narrays,cudaMemcpyHostToDevice);
        }
        cudaMemcpy(hostptr->devptr,&mirror,sizeof(curarray<T>),cudaMemcpyHostToDevice);
    }
    else
    {
        ret = -1;
    }

    cuda_errortrap("curarray_push cuda error:");
    cudaSetDevice(olddev);
    delete[] devtable;
    delete[] devN;
    return ret;
}
//Copies the device arrays back to the host.
//The host's record of each device pointer is replaced by the one stored on
//the device; host arrays whose size differs from the device size are
//re-allocated with the device size; then the contents are copied to the host.
//The device structure itself is not modified.
//Returns 0 on success and -1 when hostptr is NULL, there is no device mirror,
//a host allocation fails, or the device tables cannot be read.
template<typename T> int curarray_pull(curarray<T> *hostptr)
{
    if(hostptr==NULL) return -1;
    if(hostptr->devptr==NULL) return -1; //no mirror to pull from
    const int Narrays = hostptr->Narrays;
    if(Narrays<0) return -1;
    if(Narrays>0 && (hostptr->N==NULL || hostptr->hostarrayptrs==NULL || hostptr->devarrayptrs==NULL))
    {
        return -1;
    }

    int olddev;
    cudaGetDevice(&olddev);
    cudaSetDevice(hostptr->device);
    cuda_errortrap("dbg1");

    curarray<T> mirror;
    T **devtable = new(std::nothrow) T*[Narrays];
    int *devN = new(std::nothrow) int[Narrays];

    //The shadow tables must be completely filled before they are used.
    bool ok = (devtable!=NULL && devN!=NULL);
    if(ok)
    {
        ok = (cudaMemcpy(&mirror,hostptr->devptr,sizeof(curarray<T>),cudaMemcpyDeviceToHost)==cudaSuccess);
    }
    cuda_errortrap("dbg2");
    if(ok && Narrays>0)
    {
        ok = (cudaMemcpy(devtable,mirror.devarrayptrs,sizeof(T*)*Narrays,cudaMemcpyDeviceToHost)==cudaSuccess)
          && (cudaMemcpy(devN,mirror.N,sizeof(int)*Narrays,cudaMemcpyDeviceToHost)==cudaSuccess);
    }
    cuda_errortrap("dbg3");

    int ret = ok ? 0 : -1;
    char dbgjnk[50];
    for(int I=0;ok && I<Narrays;I++)
    {
        //the device's record of its own pointers is authoritative
        hostptr->devarrayptrs[I] = devtable[I];

        if(hostptr->N[I]!=devN[I])
        {
            delete[] hostptr->hostarrayptrs[I];
            hostptr->hostarrayptrs[I] = NULL;
            hostptr->N[I] = 0;
            if(devN[I]>0)
            {
                hostptr->hostarrayptrs[I] = new(std::nothrow) T[devN[I]];
                if(hostptr->hostarrayptrs[I]!=NULL)
                {
                    hostptr->N[I] = devN[I];
                }
                else
                {
                    //keep N[I]==0 so the size never claims storage that
                    //does not exist
                    ret = -1;
                }
            }
        }

        if(hostptr->hostarrayptrs[I]!=NULL && hostptr->devarrayptrs[I]!=NULL)
        {
            cudaMemcpy(hostptr->hostarrayptrs[I],hostptr->devarrayptrs[I],sizeof(T)*hostptr->N[I],cudaMemcpyDeviceToHost);
            snprintf(dbgjnk,sizeof(dbgjnk),"%d dbg %d",I,hostptr->N[I]);
            cuda_errortrap(dbgjnk);
        }
    } //for each array

    //for the pull operation, I don't think any update of the device data structure is necessary
    cudaSetDevice(olddev);
    delete[] devtable;
    delete[] devN;
    return ret;
}
//Member convenience wrapper: pushes this object's host arrays to the device
//by delegating to curarray_push and handing back its status unchanged.
template<typename T> __host__ int curarray<T>::push()
{
    const int status = curarray_push(this);
    return status;
}
//Member convenience wrapper: pulls the device arrays back into this object
//by delegating to curarray_pull and handing back its status unchanged.
template<typename T> __host__ int curarray<T>::pull()
{
    const int status = curarray_pull(this);
    return status;
}
/*
https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#memory-allocation-and-lifetime
cudaMalloc() and cudaFree() have distinct semantics between the host and
device environments. When invoked from the host, cudaMalloc() allocates a
new region from unused device memory. When invoked from the device runtime
these functions map to device-side malloc() and free(). This implies that
within the device environment the total allocatable memory is limited to the
device malloc() heap size, which may be smaller than the available unused
device memory. Also, it is an error to invoke cudaFree() from the host
program on a pointer which was allocated by cudaMalloc() on the device
or vice-versa.
So, basically this entire function is not going to work. I'll be unable to resize within
a kernel.
*/
/*
template<typename T> __device__ int curarray<T>::dev_resizearray(int arraynum, int arraysize)
{
int ret = 0;
T* newptr = NULL;
int I;
T def;
if(arraynum>=0 && arraynum<Narrays)
{
if(N[arraynum]!=arraysize)
{
if(arraysize<=0)
{
if(devarrayptrs[arraynum]!=NULL) cudaFree(devarrayptrs[arraynum]);
devarrayptrs[arraynum] = NULL;
N[arraynum] = 0;
ret = 1;
return ret;
}
cudaMalloc(&newptr,arraysize*sizeof(T));
if(newptr!=NULL)
{
//do I want to assume there is a copy operator? (operator=)
//for now, yes - write a more restrictive class later if I don't want it
if(devarrayptrs[arraynum]!=NULL)
{
for(I=0;I<N[arraynum]&&I<arraysize;I++)
{
newptr[I] = devarrayptrs[arraynum][I];
}
}
for(I=N[arraynum];I<arraysize;I++)
{
newptr[I] = def;
}
if(devarrayptrs[arraynum]!=NULL) cudaFree(devarrayptrs[arraynum]);
devarrayptrs[arraynum] = newptr;
N[arraynum] = arraysize;
ret = 1;
}
else
{
ret = -1;
}
}
else
{
ret = 1;
}
}
return ret;
}
*/
//Resizes host array number arraynum to arraysize elements (host side only;
//the device mirror is not touched by this function).
//Existing elements are kept up to the smaller of the old and new size; new
//elements are set to a value-initialized T (zero for arithmetic types).
//An arraysize <=0 frees the array.
//Returns 1 on success (including "already that size"), 0 when arraynum is out
//of range, and -1 when the new storage cannot be allocated, in which case the
//array is left unchanged.
template<typename T> __host__ int curarray<T>::resizearray(int arraynum, int arraysize)
{
    if(arraynum<0 || arraynum>=Narrays)
    {
        return 0;
    }
    if(N[arraynum]==arraysize)
    {
        return 1;
    }
    if(arraysize<=0)
    {
        delete[] hostarrayptrs[arraynum];
        hostarrayptrs[arraynum] = NULL;
        N[arraynum] = 0;
        return 1;
    }

    T *grown = new(std::nothrow) T[arraysize];
    if(grown==NULL)
    {
        return -1;
    }

    //do I want to assume there is a copy operator? (operator=)
    //for now, yes - write a more restrictive class later if I don't want it
    const T *old = hostarrayptrs[arraynum];
    int kept = 0; //number of elements carried over from the old array
    if(old!=NULL && N[arraynum]>0)
    {
        kept = (N[arraynum]<arraysize) ? N[arraynum] : arraysize;
    }
    for(int I=0;I<kept;I++)
    {
        grown[I] = old[I];
    }

    //value-initialized fill, so new elements never come from an uninitialized
    //temporary
    const T def = T();
    for(int I=kept;I<arraysize;I++)
    {
        grown[I] = def;
    }

    delete[] hostarrayptrs[arraynum];
    hostarrayptrs[arraynum] = grown;
    N[arraynum] = arraysize;
    return 1;
}
};
#endif

View File

@ -0,0 +1,84 @@
#ifndef __CUVECT2_HPP__
#define __CUVECT2_HPP__
namespace amscuda
{
//Two-component double-precision vector with public members x and y.
//Every member function is declared for both host and device code; the
//definitions are not in this header.
//NOTE(review): the arithmetic operators are non-const members and name their
//argument "lhs" although it is the right-hand operand of the expression.
class cuvect2
{
public:
double x;
double y;
__host__ __device__ cuvect2();
__host__ __device__ ~cuvect2();
__host__ __device__ cuvect2(double _x, double _y);
//component access by index (presumably 0->x, 1->y; confirm in implementation)
__host__ __device__ double& operator[](const int I);
__host__ __device__ const double& operator[](const int I) const;
//vector +/- vector and vector */ scalar; each returns a new vector by value
__host__ __device__ cuvect2 operator+(cuvect2 lhs);
__host__ __device__ cuvect2 operator-(cuvect2 lhs);
__host__ __device__ cuvect2 operator*(double lhs);
__host__ __device__ cuvect2 operator/(double lhs);
};
//2x2 double-precision matrix stored in the flat array dat[4].
//Every member function is declared for both host and device code; the
//definitions are not in this header.
//NOTE(review): the mapping from (I,J) to an index in dat is fixed by the
//implementation - confirm there before indexing dat directly.
class cumat2
{
public:
double dat[4];
__host__ __device__ cumat2();
__host__ __device__ ~cumat2();
//element access: flat index, or (row I, column J) via operator() / at()
__host__ __device__ double& operator[](const int I);
__host__ __device__ double& operator()(const int I, const int J);
__host__ __device__ double& at(const int I, const int J);
//matrix +/- matrix and matrix */ scalar
__host__ __device__ cumat2 operator+(cumat2 lhs);
__host__ __device__ cumat2 operator-(cumat2 lhs);
__host__ __device__ cumat2 operator*(double lhs);
__host__ __device__ cumat2 operator/(double lhs);
//matrix*vector and matrix*matrix products
__host__ __device__ cuvect2 operator*(cuvect2 lhs);
__host__ __device__ cumat2 operator*(cumat2 lhs);
//determinant, transpose and inverse; transpose() and inverse() return a new
//matrix by value
__host__ __device__ double det();
__host__ __device__ cumat2 transpose();
__host__ __device__ cumat2 inverse();
};
//Free functions on cuvect2. Note that cuvect2_cross returns a scalar.
__host__ __device__ double cuvect2_dot(cuvect2 a, cuvect2 b);
__host__ __device__ double cuvect2_cross(cuvect2 a, cuvect2 b);
__host__ __device__ double cuvect2_norm(cuvect2 a);
__host__ __device__ cuvect2 cuvect2_normalize(cuvect2 a);
//NOTE(review): whether this projects a onto b or b onto a is not visible
//here - confirm in the implementation.
__host__ __device__ cuvect2 cuvect2_proj(cuvect2 a, cuvect2 b);
//2x2 matrix operations
//matrix order is assumed to be mat[I,J] = mat[I+3*J]
//NOTE(review): the stride 3 above looks copied from the 3x3 header; a 2x2
//matrix held in 4 elements would presumably use I+2*J - confirm against the
//implementation.
//transpose a 2x2 matrix in place
__host__ __device__ void mat2_transpose(double *mat2inout);
//copies src to dest
__host__ __device__ void mat2_copy(double *mat2_dest, const double *mat2_src);
//inverts mat2inout[4] in place
__host__ __device__ void mat2_inverse(double *mat2inout);
//rotation matrix from angle
__host__ __device__ void mat2_rot_from_angle(double angle, double *mat2);
//multiplies c = a*b
__host__ __device__ void mat2_mult(double *mat2a, double *mat2b, double *mat2c);
// ret = a*b
__host__ __device__ cuvect2 mat2_mult(double *mat2a, cuvect2 b);
//Host-only test routine for this module.
void test_cuvect2_1();
}; //end namespace amscuda
#endif

View File

@ -0,0 +1,84 @@
#ifndef __CUVECT2F_HPP__
#define __CUVECT2F_HPP__
namespace amscuda
{
//Two-component single-precision vector with public members x and y.
//Every member function is declared for both host and device code; the
//definitions are not in this header.
//NOTE(review): the arithmetic operators are non-const members and name their
//argument "lhs" although it is the right-hand operand of the expression.
class cuvect2f
{
public:
float x;
float y;
__host__ __device__ cuvect2f();
__host__ __device__ ~cuvect2f();
__host__ __device__ cuvect2f(float _x, float _y);
//component access by index (presumably 0->x, 1->y; confirm in implementation)
__host__ __device__ float& operator[](const int I);
__host__ __device__ const float& operator[](const int I) const;
//vector +/- vector and vector */ scalar; each returns a new vector by value
__host__ __device__ cuvect2f operator+(cuvect2f lhs);
__host__ __device__ cuvect2f operator-(cuvect2f lhs);
__host__ __device__ cuvect2f operator*(float lhs);
__host__ __device__ cuvect2f operator/(float lhs);
};
//2x2 single-precision matrix stored in the flat array dat[4].
//Every member function is declared for both host and device code; the
//definitions are not in this header.
//NOTE(review): the mapping from (I,J) to an index in dat is fixed by the
//implementation - confirm there before indexing dat directly.
class cumat2f
{
public:
float dat[4];
__host__ __device__ cumat2f();
__host__ __device__ ~cumat2f();
//element access: flat index, or (row I, column J) via operator() / at()
__host__ __device__ float& operator[](const int I);
__host__ __device__ float& operator()(const int I, const int J);
__host__ __device__ float& at(const int I, const int J);
//matrix +/- matrix and matrix */ scalar
__host__ __device__ cumat2f operator+(cumat2f lhs);
__host__ __device__ cumat2f operator-(cumat2f lhs);
__host__ __device__ cumat2f operator*(float lhs);
__host__ __device__ cumat2f operator/(float lhs);
//matrix*vector and matrix*matrix products
__host__ __device__ cuvect2f operator*(cuvect2f lhs);
__host__ __device__ cumat2f operator*(cumat2f lhs);
//determinant, transpose and inverse; transpose() and inverse() return a new
//matrix by value
__host__ __device__ float det();
__host__ __device__ cumat2f transpose();
__host__ __device__ cumat2f inverse();
};
//Free functions on cuvect2f. Note that cuvect2f_cross returns a scalar.
__host__ __device__ float cuvect2f_dot(cuvect2f a, cuvect2f b);
__host__ __device__ float cuvect2f_cross(cuvect2f a, cuvect2f b);
__host__ __device__ float cuvect2f_norm(cuvect2f a);
__host__ __device__ cuvect2f cuvect2f_normalize(cuvect2f a);
//NOTE(review): whether this projects a onto b or b onto a is not visible
//here - confirm in the implementation.
__host__ __device__ cuvect2f cuvect2f_proj(cuvect2f a, cuvect2f b);
//2x2 matrix operations
//matrix order is assumed to be mat[I,J] = mat[I+3*J]
//NOTE(review): the stride 3 above looks copied from the 3x3 header; a 2x2
//matrix held in 4 elements would presumably use I+2*J - confirm against the
//implementation.
//transpose a 2x2 matrix in place
__host__ __device__ void mat2f_transpose(float *mat2inout);
//copies src to dest
__host__ __device__ void mat2f_copy(float *mat2f_dest, const float *mat2f_src);
//inverts mat2inout[4] in place
__host__ __device__ void mat2f_inverse(float *mat2inout);
//rotation matrix from angle
__host__ __device__ void mat2f_rot_from_angle(float angle, float *mat2);
//multiplies c = a*b
__host__ __device__ void mat2f_mult(float *mat2a, float *mat2b, float *mat2c);
// ret = a*b
__host__ __device__ cuvect2f mat2f_mult(float *mat2a, cuvect2f b);
//Host-only test routine for this module.
void test_cuvect2f_1();
};
#endif

View File

@ -0,0 +1,86 @@
#ifndef __CUVECT3_HPP__
#define __CUVECT3_HPP__
namespace amscuda
{
//Three-component double-precision vector with public members x, y and z.
//Every member function is declared for both host and device code; the
//definitions are not in this header.
//NOTE(review): the arithmetic operators are non-const members and name their
//argument "lhs" although it is the right-hand operand of the expression.
class cuvect3
{
public:
double x;
double y;
double z;
__host__ __device__ cuvect3();
__host__ __device__ ~cuvect3();
__host__ __device__ cuvect3(double _x, double _y, double _z);
//component access by index (presumably 0->x, 1->y, 2->z; confirm in implementation)
__host__ __device__ double& operator[](const int I);
__host__ __device__ const double& operator[](const int I) const;
//vector +/- vector and vector */ scalar; each returns a new vector by value
__host__ __device__ cuvect3 operator+(cuvect3 lhs);
__host__ __device__ cuvect3 operator-(cuvect3 lhs);
__host__ __device__ cuvect3 operator*(double lhs);
__host__ __device__ cuvect3 operator/(double lhs);
};
//3x3 double-precision matrix stored in the flat array dat[9].
//Every member function is declared for both host and device code; the
//definitions are not in this header.
//NOTE(review): a comment further down this header gives the element order as
//mat[I,J] = mat[I+3*J]; presumably dat follows the same order - confirm.
class cumat3
{
public:
double dat[9];
__host__ __device__ cumat3();
__host__ __device__ ~cumat3();
//element access: flat index, or (row I, column J) via operator() / at()
__host__ __device__ double& operator[](const int I);
__host__ __device__ double& operator()(const int I, const int J);
__host__ __device__ double& at(const int I, const int J);
//matrix +/- matrix and matrix */ scalar
__host__ __device__ cumat3 operator+(cumat3 lhs);
__host__ __device__ cumat3 operator-(cumat3 lhs);
__host__ __device__ cumat3 operator*(double lhs);
__host__ __device__ cumat3 operator/(double lhs);
//matrix*vector and matrix*matrix products
__host__ __device__ cuvect3 operator*(cuvect3 lhs);
__host__ __device__ cumat3 operator*(cumat3 lhs);
//determinant, transpose and inverse; transpose() and inverse() return a new
//matrix by value
__host__ __device__ double det();
__host__ __device__ cumat3 transpose();
__host__ __device__ cumat3 inverse();
};
//Free functions on cuvect3.
__host__ __device__ double cuvect3_dot(cuvect3 a, cuvect3 b);
__host__ __device__ cuvect3 cuvect3_cross(cuvect3 a, cuvect3 b);
__host__ __device__ double cuvect3_norm(cuvect3 a);
__host__ __device__ cuvect3 cuvect3_normalize(cuvect3 a);
//NOTE(review): whether this projects a onto b or b onto a is not visible
//here - confirm in the implementation.
__host__ __device__ cuvect3 cuvect3_proj(cuvect3 a, cuvect3 b);
//3x3 matrix operations
//matrix order is assumed to be mat[I,J] = mat[I+3*J]
//transposes a 3x3 (9 element) matrix
__host__ __device__ void mat3_transpose(double *mat3inout);
//copies src to dest
__host__ __device__ void mat3_copy(double *mat3_dest, const double *mat3_src);
//returns determinant of 3x3 matrix
__host__ __device__ double mat3_det(double *mat3in);
//inverts a 3x3 (9 element) matrix
__host__ __device__ void mat3_inverse(double *mat3inout);
//matrix*vector (returned by value) and matrix*matrix (written to matout)
__host__ __device__ cuvect3 mat3_mult(double *mat3in, cuvect3 cvin);
__host__ __device__ void mat3_mult(double *matina, double *matinb, double *matout);
//conversions between a vector and a 3x3 matrix (Hodge dual, going by the name)
//NOTE(review): the second overload takes vecout by value, so a result written
//to it cannot reach the caller - confirm intent against the implementation.
__host__ __device__ void mat3_hodgedual(cuvect3 vecin, double *matout);
__host__ __device__ void mat3_hodgedual(double *matin, cuvect3 vecout);
//returns direction cosine rotation matrix from axis and angle
__host__ __device__ void mat3_rot_from_axisangle(cuvect3 axis, double angle, double *matout);
//Host-only test routine for this module.
__host__ void test_cudavect_logic1();
}; //end namespace amscuda
#endif

View File

@ -0,0 +1,86 @@
#ifndef __CUVECT3F_HPP__
#define __CUVECT3F_HPP__
namespace amscuda
{
//Three-component single-precision vector with public members x, y and z.
//Every member function is declared for both host and device code; the
//definitions are not in this header.
//NOTE(review): the arithmetic operators are non-const members and name their
//argument "lhs" although it is the right-hand operand of the expression.
class cuvect3f
{
public:
float x;
float y;
float z;
__host__ __device__ cuvect3f();
__host__ __device__ ~cuvect3f();
__host__ __device__ cuvect3f(float _x, float _y, float _z);
//component access by index (presumably 0->x, 1->y, 2->z; confirm in implementation)
__host__ __device__ float& operator[](const int I);
__host__ __device__ const float& operator[](const int I) const;
//vector +/- vector and vector */ scalar; each returns a new vector by value
__host__ __device__ cuvect3f operator+(cuvect3f lhs);
__host__ __device__ cuvect3f operator-(cuvect3f lhs);
__host__ __device__ cuvect3f operator*(float lhs);
__host__ __device__ cuvect3f operator/(float lhs);
};
//3x3 single-precision matrix stored in the flat array dat[9].
//Every member function is declared for both host and device code; the
//definitions are not in this header.
//NOTE(review): a comment further down this header gives the element order as
//mat[I,J] = mat[I+3*J]; presumably dat follows the same order - confirm.
class cumat3f
{
public:
float dat[9];
__host__ __device__ cumat3f();
__host__ __device__ ~cumat3f();
//element access: flat index, or (row I, column J) via operator() / at()
__host__ __device__ float& operator[](const int I);
__host__ __device__ float& operator()(const int I, const int J);
__host__ __device__ float& at(const int I, const int J);
//matrix +/- matrix and matrix */ scalar
__host__ __device__ cumat3f operator+(cumat3f lhs);
__host__ __device__ cumat3f operator-(cumat3f lhs);
__host__ __device__ cumat3f operator*(float lhs);
__host__ __device__ cumat3f operator/(float lhs);
//matrix*vector and matrix*matrix products
__host__ __device__ cuvect3f operator*(cuvect3f lhs);
__host__ __device__ cumat3f operator*(cumat3f lhs);
//determinant, transpose and inverse; transpose() and inverse() return a new
//matrix by value
__host__ __device__ float det();
__host__ __device__ cumat3f transpose();
__host__ __device__ cumat3f inverse();
};
//Free functions on cuvect3f.
__host__ __device__ float cuvect3f_dot(cuvect3f a, cuvect3f b);
__host__ __device__ cuvect3f cuvect3f_cross(cuvect3f a, cuvect3f b);
__host__ __device__ float cuvect3f_norm(cuvect3f a);
__host__ __device__ cuvect3f cuvect3f_normalize(cuvect3f a);
//NOTE(review): whether this projects a onto b or b onto a is not visible
//here - confirm in the implementation.
__host__ __device__ cuvect3f cuvect3f_proj(cuvect3f a, cuvect3f b);
//3x3 matrix operations
//matrix order is assumed to be mat[I,J] = mat[I+3*J]
//transposes a 3x3 (9 element) matrix
__host__ __device__ void mat3f_transpose(float *mat3inout);
//copies src to dest
__host__ __device__ void mat3f_copy(float *mat3f_dest, const float *mat3f_src);
//returns determinant of 3x3 matrix
__host__ __device__ float mat3f_det(float *mat3in);
//inverts a 3x3 (9 element) matrix
__host__ __device__ void mat3f_inverse(float *mat3inout);
//matrix*vector (returned by value) and matrix*matrix (written to matout)
__host__ __device__ cuvect3f mat3f_mult(float *mat3in, cuvect3f cvin);
__host__ __device__ void mat3f_mult(float *matina, float *matinb, float *matout);
//conversions between a vector and a 3x3 matrix (Hodge dual, going by the name)
//NOTE(review): the second overload takes vecout by value, so a result written
//to it cannot reach the caller - confirm intent against the implementation.
__host__ __device__ void mat3f_hodgedual(cuvect3f vecin, float *matout);
__host__ __device__ void mat3f_hodgedual(float *matin, cuvect3f vecout);
//returns direction cosine rotation matrix from axis and angle
__host__ __device__ void mat3f_rot_from_axisangle(cuvect3f axis, float angle, float *matout);
//Host-only test routine for this module.
__host__ void test_cudavectf_logic1();
};
#endif

View File

@ -0,0 +1,59 @@
#ifndef __CUVECT4_HPP__
#define __CUVECT4_HPP__
namespace amscuda
{
//Four-component double-precision vector with public members x, y, z and w.
//Every member function is declared for both host and device code; the
//definitions are not in this header.
//NOTE(review): the arithmetic operators are non-const members and name their
//argument "lhs" although it is the right-hand operand of the expression.
class cuvect4
{
public:
double x;
double y;
double z;
double w;
__host__ __device__ cuvect4();
__host__ __device__ ~cuvect4();
__host__ __device__ cuvect4(double _x, double _y, double _z, double _w);
//component access by index (presumably 0->x ... 3->w; confirm in implementation)
__host__ __device__ double& operator[](const int I);
__host__ __device__ const double& operator[](const int I) const;
//vector +/- vector and vector */ scalar; each returns a new vector by value
__host__ __device__ cuvect4 operator+(cuvect4 lhs);
__host__ __device__ cuvect4 operator-(cuvect4 lhs);
__host__ __device__ cuvect4 operator*(double lhs);
__host__ __device__ cuvect4 operator/(double lhs);
};
//4x4 double-precision matrix stored in the flat array dat[16].
//Every member function is declared for both host and device code; the
//definitions are not in this header.
//NOTE(review): the mapping from (I,J) to an index in dat is fixed by the
//implementation - confirm there before indexing dat directly.
class cumat4
{
public:
double dat[16];
__host__ __device__ cumat4();
__host__ __device__ ~cumat4();
//element access: flat index, or (row I, column J) via operator() / at()
__host__ __device__ double& operator[](const int I);
__host__ __device__ double& operator()(const int I, const int J);
__host__ __device__ double& at(const int I, const int J);
//matrix +/- matrix and matrix */ scalar
__host__ __device__ cumat4 operator+(cumat4 lhs);
__host__ __device__ cumat4 operator-(cumat4 lhs);
__host__ __device__ cumat4 operator*(double lhs);
__host__ __device__ cumat4 operator/(double lhs);
//matrix*vector and matrix*matrix products
__host__ __device__ cuvect4 operator*(cuvect4 lhs);
__host__ __device__ cumat4 operator*(cumat4 lhs);
//determinant, transpose and inverse; transpose() and inverse() return a new
//matrix by value
__host__ __device__ double det();
__host__ __device__ cumat4 transpose();
__host__ __device__ cumat4 inverse();
};
//Free functions on cuvect4 (no cross product is declared for 4-vectors).
//NOTE(review): whether cuvect4_proj projects a onto b or b onto a is not
//visible here - confirm in the implementation.
__host__ __device__ double cuvect4_dot(cuvect4 a, cuvect4 b);
__host__ __device__ double cuvect4_norm(cuvect4 a);
__host__ __device__ cuvect4 cuvect4_normalize(cuvect4 a);
__host__ __device__ cuvect4 cuvect4_proj(cuvect4 a, cuvect4 b);
}; //end namespace amscuda
#endif

View File

@ -0,0 +1,60 @@
#ifndef __CUVECT4F_HPP__
#define __CUVECT4F_HPP__
namespace amscuda
{
//Four-component single-precision vector with public members x, y, z and w.
//Every member function is declared for both host and device code; the
//definitions are not in this header.
//NOTE(review): the arithmetic operators are non-const members and name their
//argument "lhs" although it is the right-hand operand of the expression.
class cuvect4f
{
public:
float x;
float y;
float z;
float w;
__host__ __device__ cuvect4f();
__host__ __device__ ~cuvect4f();
__host__ __device__ cuvect4f(float _x, float _y, float _z, float _w);
//component access by index (presumably 0->x ... 3->w; confirm in implementation)
__host__ __device__ float& operator[](const int I);
__host__ __device__ const float& operator[](const int I) const;
//vector +/- vector and vector */ scalar; each returns a new vector by value
__host__ __device__ cuvect4f operator+(cuvect4f lhs);
__host__ __device__ cuvect4f operator-(cuvect4f lhs);
__host__ __device__ cuvect4f operator*(float lhs);
__host__ __device__ cuvect4f operator/(float lhs);
};
//4x4 single-precision matrix stored in the flat array dat[16].
//Every member function is declared for both host and device code; the
//definitions are not in this header.
//NOTE(review): the mapping from (I,J) to an index in dat is fixed by the
//implementation - confirm there before indexing dat directly.
class cumat4f
{
public:
float dat[16];
__host__ __device__ cumat4f();
__host__ __device__ ~cumat4f();
//element access: flat index, or (row I, column J) via operator() / at()
__host__ __device__ float& operator[](const int I);
__host__ __device__ float& operator()(const int I, const int J);
__host__ __device__ float& at(const int I, const int J);
//matrix +/- matrix and matrix */ scalar
__host__ __device__ cumat4f operator+(cumat4f lhs);
__host__ __device__ cumat4f operator-(cumat4f lhs);
__host__ __device__ cumat4f operator*(float lhs);
__host__ __device__ cumat4f operator/(float lhs);
//matrix*vector and matrix*matrix products
__host__ __device__ cuvect4f operator*(cuvect4f lhs);
__host__ __device__ cumat4f operator*(cumat4f lhs);
//determinant, transpose and inverse; transpose() and inverse() return a new
//matrix by value
__host__ __device__ float det();
__host__ __device__ cumat4f transpose();
__host__ __device__ cumat4f inverse();
};
//Free functions on cuvect4f (no cross product is declared for 4-vectors).
//NOTE(review): whether cuvect4f_proj projects a onto b or b onto a is not
//visible here - confirm in the implementation.
__host__ __device__ float cuvect4f_dot(cuvect4f a, cuvect4f b);
__host__ __device__ float cuvect4f_norm(cuvect4f a);
__host__ __device__ cuvect4f cuvect4f_normalize(cuvect4f a);
__host__ __device__ cuvect4f cuvect4f_proj(cuvect4f a, cuvect4f b);
};
#endif