cuda library updates

This commit is contained in:
2026-02-20 11:46:15 -05:00
parent c9f6307fc2
commit 3128d5dd19
122 changed files with 10842 additions and 7434 deletions

View File

@ -1,89 +1,89 @@
#ifndef __AMSCU_COMP128_HPP__
#define __AMSCU_COMP128_HPP__
// Double-precision complex arithmetic usable from both host and device code.
namespace amscuda
{
namespace cmp
{
// Complex number stored as a pair of doubles (real part, imaginary part).
// Every member is declared __host__ __device__; this header carries only the
// declarations, the definitions are provided elsewhere.
class cucomp128
{
public:
double real; // real component
double imag; // imaginary component
// construction / destruction
__host__ __device__ cucomp128();
__host__ __device__ ~cucomp128();
__host__ __device__ cucomp128(const cucomp128 &other);
// conversion from a real scalar
// NOTE(review): presumably leaves the imaginary part at zero - confirm in the implementation
__host__ __device__ cucomp128(const double &other);
// assignment from another complex value or from a real scalar
// (both const and non-const reference overloads are declared)
__host__ __device__ cucomp128& operator=(cucomp128& other);
__host__ __device__ const cucomp128& operator=(const cucomp128& other);
__host__ __device__ cucomp128& operator=(double& other);
__host__ __device__ const cucomp128& operator=(const double& other);
// component access by index
// NOTE(review): presumably 0 selects real and 1 selects imag - confirm in the implementation
// the non-const overload takes int&, so an rvalue index (e.g. a literal) selects the const overload
__host__ __device__ double& operator[](int& ind);
__host__ __device__ const double& operator[](const int& ind) const;
// binary arithmetic with a complex right-hand operand
// these members are not declared const, so the left operand must be a non-const object
__host__ __device__ cucomp128 operator+(const cucomp128& z);
__host__ __device__ cucomp128 operator-(const cucomp128& z);
__host__ __device__ cucomp128 operator*(const cucomp128& z);
__host__ __device__ cucomp128 operator/(const cucomp128& z);
// binary arithmetic with a real right-hand operand
__host__ __device__ cucomp128 operator+(const double& z);
__host__ __device__ cucomp128 operator-(const double& z);
__host__ __device__ cucomp128 operator*(const double& z);
__host__ __device__ cucomp128 operator/(const double& z);
__host__ __device__ friend cucomp128 operator-(const cucomp128& z); //unary negation (friend, non-member)
//comparison operators
// NOTE(review): the ordering used by <, >, <=, >= for complex values is defined in the implementation, not visible here
__host__ __device__ bool operator==(const cucomp128& z) const;
__host__ __device__ bool operator!=(const cucomp128& z) const;
__host__ __device__ bool operator>(const cucomp128& z) const;
__host__ __device__ bool operator<(const cucomp128& z) const;
__host__ __device__ bool operator>=(const cucomp128& z) const;
__host__ __device__ bool operator<=(const cucomp128& z) const;
// classification predicates (read-only)
__host__ __device__ bool isnan() const;
__host__ __device__ bool isinf() const;
__host__ __device__ bool isreal() const;
__host__ __device__ bool isimag() const;
__host__ __device__ bool iszero() const;
// argument, magnitude and complex conjugate of this value (by name; definitions are not in this header)
__host__ __device__ double arg() const;
__host__ __device__ double mag() const;
__host__ __device__ cucomp128 conj() const;
};
// free-function interface; all take their arguments by value
__host__ __device__ double arg(cucomp128 z);
// builds a cucomp128 from two doubles
// NOTE(review): presumably _r is the real part and _i the imaginary part - confirm
__host__ __device__ cucomp128 dtocomp(double _r, double _i);
__host__ __device__ double real(cucomp128 z);
__host__ __device__ double imag(cucomp128 z);
// elementary functions of a complex argument
__host__ __device__ cucomp128 sin(cucomp128 z);
__host__ __device__ cucomp128 cos(cucomp128 z);
__host__ __device__ cucomp128 tan(cucomp128 z);
__host__ __device__ cucomp128 exp(cucomp128 z);
__host__ __device__ cucomp128 log(cucomp128 z);
__host__ __device__ double abs(cucomp128 z);
__host__ __device__ cucomp128 conj(cucomp128 z);
// hyperbolic trig functions
__host__ __device__ cucomp128 cosh(cucomp128 z);
__host__ __device__ cucomp128 sinh(cucomp128 z);
__host__ __device__ cucomp128 tanh(cucomp128 z);
// complex power taking two complex arguments
__host__ __device__ cucomp128 pow(cucomp128 z1, cucomp128 z2);
// returns "complex sign" of complex number - 0, or a unit number with same argument
__host__ __device__ cucomp128 csgn(cucomp128 z);
// test routine; carries no execution-space qualifier, so it is host-only
void test_cucomp128_1();
}; //end namespace cmp
}; //end namespace amscuda
#endif
#ifndef __AMSCU_COMP128_HPP__
#define __AMSCU_COMP128_HPP__
// Double-precision complex arithmetic usable from both host and device code.
namespace amscuda
{
namespace cmp
{
// Complex number stored as a pair of doubles (real part, imaginary part).
// Every member is declared __host__ __device__; this header carries only the
// declarations, the definitions are provided elsewhere.
class cucomp128
{
public:
double real; // real component
double imag; // imaginary component
// construction / destruction
__host__ __device__ cucomp128();
__host__ __device__ ~cucomp128();
__host__ __device__ cucomp128(const cucomp128 &other);
// conversion from a real scalar
// NOTE(review): presumably leaves the imaginary part at zero - confirm in the implementation
__host__ __device__ cucomp128(const double &other);
// assignment from another complex value or from a real scalar
// (both const and non-const reference overloads are declared)
__host__ __device__ cucomp128& operator=(cucomp128& other);
__host__ __device__ const cucomp128& operator=(const cucomp128& other);
__host__ __device__ cucomp128& operator=(double& other);
__host__ __device__ const cucomp128& operator=(const double& other);
// component access by index
// NOTE(review): presumably 0 selects real and 1 selects imag - confirm in the implementation
// the non-const overload takes int&, so an rvalue index (e.g. a literal) selects the const overload
__host__ __device__ double& operator[](int& ind);
__host__ __device__ const double& operator[](const int& ind) const;
// binary arithmetic with a complex right-hand operand
// these members are not declared const, so the left operand must be a non-const object
__host__ __device__ cucomp128 operator+(const cucomp128& z);
__host__ __device__ cucomp128 operator-(const cucomp128& z);
__host__ __device__ cucomp128 operator*(const cucomp128& z);
__host__ __device__ cucomp128 operator/(const cucomp128& z);
// binary arithmetic with a real right-hand operand
__host__ __device__ cucomp128 operator+(const double& z);
__host__ __device__ cucomp128 operator-(const double& z);
__host__ __device__ cucomp128 operator*(const double& z);
__host__ __device__ cucomp128 operator/(const double& z);
__host__ __device__ friend cucomp128 operator-(const cucomp128& z); //unary negation (friend, non-member)
//comparison operators
// NOTE(review): the ordering used by <, >, <=, >= for complex values is defined in the implementation, not visible here
__host__ __device__ bool operator==(const cucomp128& z) const;
__host__ __device__ bool operator!=(const cucomp128& z) const;
__host__ __device__ bool operator>(const cucomp128& z) const;
__host__ __device__ bool operator<(const cucomp128& z) const;
__host__ __device__ bool operator>=(const cucomp128& z) const;
__host__ __device__ bool operator<=(const cucomp128& z) const;
// classification predicates (read-only)
__host__ __device__ bool isnan() const;
__host__ __device__ bool isinf() const;
__host__ __device__ bool isreal() const;
__host__ __device__ bool isimag() const;
__host__ __device__ bool iszero() const;
// argument, magnitude and complex conjugate of this value (by name; definitions are not in this header)
__host__ __device__ double arg() const;
__host__ __device__ double mag() const;
__host__ __device__ cucomp128 conj() const;
};
// free-function interface; all take their arguments by value
__host__ __device__ double arg(cucomp128 z);
// builds a cucomp128 from two doubles
// NOTE(review): presumably _r is the real part and _i the imaginary part - confirm
__host__ __device__ cucomp128 dtocomp(double _r, double _i);
__host__ __device__ double real(cucomp128 z);
__host__ __device__ double imag(cucomp128 z);
// elementary functions of a complex argument
__host__ __device__ cucomp128 sin(cucomp128 z);
__host__ __device__ cucomp128 cos(cucomp128 z);
__host__ __device__ cucomp128 tan(cucomp128 z);
__host__ __device__ cucomp128 exp(cucomp128 z);
__host__ __device__ cucomp128 log(cucomp128 z);
__host__ __device__ double abs(cucomp128 z);
__host__ __device__ cucomp128 conj(cucomp128 z);
// hyperbolic trig functions
__host__ __device__ cucomp128 cosh(cucomp128 z);
__host__ __device__ cucomp128 sinh(cucomp128 z);
__host__ __device__ cucomp128 tanh(cucomp128 z);
// complex power taking two complex arguments
__host__ __device__ cucomp128 pow(cucomp128 z1, cucomp128 z2);
// returns "complex sign" of complex number - 0, or a unit number with same argument
__host__ __device__ cucomp128 csgn(cucomp128 z);
// test routine; carries no execution-space qualifier, so it is host-only
void test_cucomp128_1();
}; //end namespace cmp
}; //end namespace amscuda
#endif

View File

@ -1,88 +1,88 @@
#ifndef __AMSCU_COMP64_HPP__
#define __AMSCU_COMP64_HPP__
// Single-precision complex arithmetic usable from both host and device code.
namespace amscuda
{
namespace cmp
{
// Complex number stored as a pair of floats (real part, imaginary part).
// Every member is declared __host__ __device__; this header carries only the
// declarations, the definitions are provided elsewhere.
class cucomp64
{
public:
float real; // real component
float imag; // imaginary component
// construction / destruction
__host__ __device__ cucomp64();
__host__ __device__ ~cucomp64();
__host__ __device__ cucomp64(const cucomp64 &other);
// conversion from a real scalar
// NOTE(review): presumably leaves the imaginary part at zero - confirm in the implementation
__host__ __device__ cucomp64(const float &other);
// assignment from another complex value or from a real scalar
// (both const and non-const reference overloads are declared)
__host__ __device__ cucomp64& operator=(cucomp64& other);
__host__ __device__ const cucomp64& operator=(const cucomp64& other);
__host__ __device__ cucomp64& operator=(float& other);
__host__ __device__ const cucomp64& operator=(const float& other);
// component access by index
// NOTE(review): presumably 0 selects real and 1 selects imag - confirm in the implementation
// the non-const overload takes int&, so an rvalue index (e.g. a literal) selects the const overload
__host__ __device__ float& operator[](int& ind);
__host__ __device__ const float& operator[](const int& ind) const;
// binary arithmetic with a complex right-hand operand
// these members are not declared const, so the left operand must be a non-const object
__host__ __device__ cucomp64 operator+(const cucomp64& z);
__host__ __device__ cucomp64 operator-(const cucomp64& z);
__host__ __device__ cucomp64 operator*(const cucomp64& z);
__host__ __device__ cucomp64 operator/(const cucomp64& z);
// binary arithmetic with a real right-hand operand
__host__ __device__ cucomp64 operator+(const float& z);
__host__ __device__ cucomp64 operator-(const float& z);
__host__ __device__ cucomp64 operator*(const float& z);
__host__ __device__ cucomp64 operator/(const float& z);
__host__ __device__ friend cucomp64 operator-(const cucomp64& z); //unary negation (friend, non-member)
//comparison operators
// NOTE(review): the ordering used by <, >, <=, >= for complex values is defined in the implementation, not visible here
__host__ __device__ bool operator==(const cucomp64& z) const;
__host__ __device__ bool operator!=(const cucomp64& z) const;
__host__ __device__ bool operator>(const cucomp64& z) const;
__host__ __device__ bool operator<(const cucomp64& z) const;
__host__ __device__ bool operator>=(const cucomp64& z) const;
__host__ __device__ bool operator<=(const cucomp64& z) const;
// classification predicates (read-only)
__host__ __device__ bool isnan() const;
__host__ __device__ bool isinf() const;
__host__ __device__ bool isreal() const;
__host__ __device__ bool isimag() const;
__host__ __device__ bool iszero() const;
// argument, magnitude and complex conjugate of this value (by name; definitions are not in this header)
__host__ __device__ float arg() const;
__host__ __device__ float mag() const;
__host__ __device__ cucomp64 conj() const;
};
// free-function interface; all take their arguments by value
__host__ __device__ float arg(cucomp64 z);
// builds a cucomp64 from two floats
// NOTE(review): presumably _r is the real part and _i the imaginary part - confirm
__host__ __device__ cucomp64 dtocomp64(float _r, float _i);
__host__ __device__ float real(cucomp64 z);
__host__ __device__ float imag(cucomp64 z);
// elementary functions of a complex argument
__host__ __device__ cucomp64 sin(cucomp64 z);
__host__ __device__ cucomp64 cos(cucomp64 z);
__host__ __device__ cucomp64 tan(cucomp64 z);
__host__ __device__ cucomp64 exp(cucomp64 z);
__host__ __device__ cucomp64 log(cucomp64 z);
__host__ __device__ float abs(cucomp64 z);
__host__ __device__ cucomp64 conj(cucomp64 z);
// hyperbolic trig functions
__host__ __device__ cucomp64 cosh(cucomp64 z);
__host__ __device__ cucomp64 sinh(cucomp64 z);
__host__ __device__ cucomp64 tanh(cucomp64 z);
// complex power taking two complex arguments
__host__ __device__ cucomp64 pow(cucomp64 z1, cucomp64 z2);
// returns "complex sign" of complex number - 0, or a unit number with same argument
__host__ __device__ cucomp64 csgn(cucomp64 z);
// test routine; carries no execution-space qualifier, so it is host-only
void test_cucomp64_1();
}; //end namespace cmp
}; //end namespace amscuda
#endif
#ifndef __AMSCU_COMP64_HPP__
#define __AMSCU_COMP64_HPP__
// Single-precision complex arithmetic usable from both host and device code.
namespace amscuda
{
namespace cmp
{
// Complex number stored as a pair of floats (real part, imaginary part).
// Every member is declared __host__ __device__; this header carries only the
// declarations, the definitions are provided elsewhere.
class cucomp64
{
public:
float real; // real component
float imag; // imaginary component
// construction / destruction
__host__ __device__ cucomp64();
__host__ __device__ ~cucomp64();
__host__ __device__ cucomp64(const cucomp64 &other);
// conversion from a real scalar
// NOTE(review): presumably leaves the imaginary part at zero - confirm in the implementation
__host__ __device__ cucomp64(const float &other);
// assignment from another complex value or from a real scalar
// (both const and non-const reference overloads are declared)
__host__ __device__ cucomp64& operator=(cucomp64& other);
__host__ __device__ const cucomp64& operator=(const cucomp64& other);
__host__ __device__ cucomp64& operator=(float& other);
__host__ __device__ const cucomp64& operator=(const float& other);
// component access by index
// NOTE(review): presumably 0 selects real and 1 selects imag - confirm in the implementation
// the non-const overload takes int&, so an rvalue index (e.g. a literal) selects the const overload
__host__ __device__ float& operator[](int& ind);
__host__ __device__ const float& operator[](const int& ind) const;
// binary arithmetic with a complex right-hand operand
// these members are not declared const, so the left operand must be a non-const object
__host__ __device__ cucomp64 operator+(const cucomp64& z);
__host__ __device__ cucomp64 operator-(const cucomp64& z);
__host__ __device__ cucomp64 operator*(const cucomp64& z);
__host__ __device__ cucomp64 operator/(const cucomp64& z);
// binary arithmetic with a real right-hand operand
__host__ __device__ cucomp64 operator+(const float& z);
__host__ __device__ cucomp64 operator-(const float& z);
__host__ __device__ cucomp64 operator*(const float& z);
__host__ __device__ cucomp64 operator/(const float& z);
__host__ __device__ friend cucomp64 operator-(const cucomp64& z); //unary negation (friend, non-member)
//comparison operators
// NOTE(review): the ordering used by <, >, <=, >= for complex values is defined in the implementation, not visible here
__host__ __device__ bool operator==(const cucomp64& z) const;
__host__ __device__ bool operator!=(const cucomp64& z) const;
__host__ __device__ bool operator>(const cucomp64& z) const;
__host__ __device__ bool operator<(const cucomp64& z) const;
__host__ __device__ bool operator>=(const cucomp64& z) const;
__host__ __device__ bool operator<=(const cucomp64& z) const;
// classification predicates (read-only)
__host__ __device__ bool isnan() const;
__host__ __device__ bool isinf() const;
__host__ __device__ bool isreal() const;
__host__ __device__ bool isimag() const;
__host__ __device__ bool iszero() const;
// argument, magnitude and complex conjugate of this value (by name; definitions are not in this header)
__host__ __device__ float arg() const;
__host__ __device__ float mag() const;
__host__ __device__ cucomp64 conj() const;
};
// free-function interface; all take their arguments by value
__host__ __device__ float arg(cucomp64 z);
// builds a cucomp64 from two floats
// NOTE(review): presumably _r is the real part and _i the imaginary part - confirm
__host__ __device__ cucomp64 dtocomp64(float _r, float _i);
__host__ __device__ float real(cucomp64 z);
__host__ __device__ float imag(cucomp64 z);
// elementary functions of a complex argument
__host__ __device__ cucomp64 sin(cucomp64 z);
__host__ __device__ cucomp64 cos(cucomp64 z);
__host__ __device__ cucomp64 tan(cucomp64 z);
__host__ __device__ cucomp64 exp(cucomp64 z);
__host__ __device__ cucomp64 log(cucomp64 z);
__host__ __device__ float abs(cucomp64 z);
__host__ __device__ cucomp64 conj(cucomp64 z);
// hyperbolic trig functions
__host__ __device__ cucomp64 cosh(cucomp64 z);
__host__ __device__ cucomp64 sinh(cucomp64 z);
__host__ __device__ cucomp64 tanh(cucomp64 z);
// complex power taking two complex arguments
__host__ __device__ cucomp64 pow(cucomp64 z1, cucomp64 z2);
// returns "complex sign" of complex number - 0, or a unit number with same argument
__host__ __device__ cucomp64 csgn(cucomp64 z);
// test routine; carries no execution-space qualifier, so it is host-only
void test_cucomp64_1();
}; //end namespace cmp
}; //end namespace amscuda
#endif

View File

@ -1,40 +1,40 @@
#ifndef __AMSCU_CUDAFUNCTIONS_HPP__
#define __AMSCU_CUDAFUNCTIONS_HPP__
namespace amscuda
{
// device memory operations
// Thin wrappers around cudaMalloc / cudaMemcpy / cudaFree that fold the usual
// allocate-if-needed and error-check boilerplate into one call.
// All wrappers return an int status: 1 on success, 0 when there was nothing
// to do, and a negative code on failure (listed per function below).
//frees *devptr if it is not already NULL, and sets *devptr to NULL
//wrapper to cudaFree
//returns 0 (already NULL), 1 (freed), -1 (cudaFree failed; *devptr is still set to NULL)
template<typename T> int cuda_free(T **devptr);
//copies N elements from hostbuffer to *devbuffer
//allocates *devbuffer with cudaMalloc if it is NULL
//if overwrite is true, frees and reallocates *devbuffer on device (for resizing)
//returns 0 (N<=0), 1 (ok), -2 (hostbuffer NULL), -3 (allocation failed), -4 (copy failed)
template<typename T> int buffer_copytodevice(T *hostbuffer, T **devbuffer, long N, bool overwrite);
//copies N elements from devbuffer to *hostbuffer
//allocates *hostbuffer with new[] if it is NULL
//if overwrite is true, deletes and reallocates hostbuffer on host with new[] (for resizing)
//returns 0 (N<=0), 1 (ok), -5 (devbuffer NULL), -6 (host allocation failed), -7 (copy failed)
template<typename T> int buffer_copyfromdevice(T *devbuffer, T **hostbuffer, long N, bool overwrite);
//wrapper for cudaMemcpy - copies an item or struct (count 1) to the device
//any existing *devptr allocation is freed and a new one is made before the copy
//returns 1 (ok), -2 (hostptr NULL), -3 (allocation failed), -4 (copy failed)
template<typename T> int cuda_copytodevice(T *hostptr, T **devptr);
//wrapper for cudaMemcpy - copies an item or struct (count 1) from device
//allocates *hostptr with new if it is NULL
//returns 1 (ok), -5 (devptr NULL), -6 (host allocation failed), -7 (copy failed)
template<typename T> int cuda_copyfromdevice(T *devptr, T **hostptr);
// NOTE(review): presumably reports a pending CUDA error prefixed with msgheader;
// the definition is not in this header - confirm the return convention there
int cuda_errortrap(const char *msgheader);
};
#include <amsculib2/amscu_cudafunctions_impl.hpp>
#endif
#ifndef __AMSCU_CUDAFUNCTIONS_HPP__
#define __AMSCU_CUDAFUNCTIONS_HPP__
namespace amscuda
{
// device memory operations
// Thin wrappers around cudaMalloc / cudaMemcpy / cudaFree that fold the usual
// allocate-if-needed and error-check boilerplate into one call.
// All wrappers return an int status: 1 on success, 0 when there was nothing
// to do, and a negative code on failure (listed per function below).
//frees *devptr if it is not already NULL, and sets *devptr to NULL
//wrapper to cudaFree
//returns 0 (already NULL), 1 (freed), -1 (cudaFree failed; *devptr is still set to NULL)
template<typename T> int cuda_free(T **devptr);
//copies N elements from hostbuffer to *devbuffer
//allocates *devbuffer with cudaMalloc if it is NULL
//if overwrite is true, frees and reallocates *devbuffer on device (for resizing)
//returns 0 (N<=0), 1 (ok), -2 (hostbuffer NULL), -3 (allocation failed), -4 (copy failed)
template<typename T> int buffer_copytodevice(T *hostbuffer, T **devbuffer, long N, bool overwrite);
//copies N elements from devbuffer to *hostbuffer
//allocates *hostbuffer with new[] if it is NULL
//if overwrite is true, deletes and reallocates hostbuffer on host with new[] (for resizing)
//returns 0 (N<=0), 1 (ok), -5 (devbuffer NULL), -6 (host allocation failed), -7 (copy failed)
template<typename T> int buffer_copyfromdevice(T *devbuffer, T **hostbuffer, long N, bool overwrite);
//wrapper for cudaMemcpy - copies an item or struct (count 1) to the device
//any existing *devptr allocation is freed and a new one is made before the copy
//returns 1 (ok), -2 (hostptr NULL), -3 (allocation failed), -4 (copy failed)
template<typename T> int cuda_copytodevice(T *hostptr, T **devptr);
//wrapper for cudaMemcpy - copies an item or struct (count 1) from device
//allocates *hostptr with new if it is NULL
//returns 1 (ok), -5 (devptr NULL), -6 (host allocation failed), -7 (copy failed)
template<typename T> int cuda_copyfromdevice(T *devptr, T **hostptr);
// NOTE(review): presumably reports a pending CUDA error prefixed with msgheader;
// the definition is not in this header - confirm the return convention there
int cuda_errortrap(const char *msgheader);
};
#include <amsculib2/amscu_cudafunctions_impl.hpp>
#endif

View File

@ -1,228 +1,228 @@
#ifndef __AMSCU_CUDAFUNCTIONS_IMPL_HPP__
#define __AMSCU_CUDAFUNCTIONS_IMPL_HPP__
namespace amscuda
{
//Releases the device allocation referenced by *devptr and resets *devptr to NULL.
//Wrapper around cudaFree.
//  devptr : address of the device pointer to release (must itself be non-NULL)
//returns  0 : *devptr was already NULL, nothing was done
//         1 : cudaFree succeeded
//        -1 : cudaFree reported an error
template<typename T> int cuda_free(T **devptr)
{
    if(*devptr==NULL)
    {
        return 0; //already released
    }
    const cudaError_t status = cudaFree(*devptr);
    //the pointer is cleared on both outcomes; a failure here is expected only
    //when the pointer had already been released
    *devptr = NULL;
    return (status==cudaSuccess) ? 1 : -1;
}
//Copies N elements of type T from hostbuffer into the device buffer *devbuffer.
//  hostbuffer : source array on the host
//  devbuffer  : address of the device pointer; allocated here when *devbuffer is NULL
//  N          : element count
//  overwrite  : when true an existing *devbuffer is freed and reallocated (resize);
//               when false an existing *devbuffer is reused as-is, with no capacity check
//returns  0 : N<=0, nothing done
//         1 : success
//        -2 : hostbuffer is NULL
//        -3 : cudaMalloc failed (*devbuffer is left NULL)
//        -4 : cudaMemcpy failed
template<typename T> int buffer_copytodevice(T *hostbuffer, T **devbuffer, long N, bool overwrite)
{
    if(N<=0)
    {
        return 0;
    }
    if(hostbuffer==NULL)
    {
        return -2; //host buffer is NULL
    }
    if(overwrite && *devbuffer!=NULL)
    {
        cuda_free(devbuffer); //leaves *devbuffer NULL so the allocation below runs
    }
    if(*devbuffer==NULL)
    {
        const cudaError_t allocstatus = cudaMalloc(devbuffer,sizeof(T)*N);
        if(allocstatus!=cudaSuccess)
        {
            *devbuffer = NULL;
            return -3; //failed to allocate
        }
    }
    const cudaError_t copystatus = cudaMemcpy(*devbuffer,hostbuffer,sizeof(T)*N,cudaMemcpyHostToDevice);
    return (copystatus==cudaSuccess) ? 1 : -4;
}
//Copies N elements of type T from the device buffer devbuffer into *hostbuffer.
//  devbuffer  : source array on the device
//  hostbuffer : address of the host pointer; allocated here with new[] when *hostbuffer is NULL
//  N          : element count
//  overwrite  : when true an existing *hostbuffer is delete[]d and reallocated (resize);
//               when false an existing *hostbuffer is reused as-is, with no capacity check
//returns  0 : N<=0, nothing done
//         1 : success
//        -5 : devbuffer is NULL
//        -6 : host allocation failed (*hostbuffer is left NULL)
//        -7 : cudaMemcpy failed
template<typename T> int buffer_copyfromdevice(T *devbuffer, T **hostbuffer, long N, bool overwrite)
{
    if(N<=0)
    {
        return 0;
    }
    if(devbuffer==NULL)
    {
        return -5; //null dev buffer
    }
    if(overwrite && *hostbuffer!=NULL)
    {
        delete[] (*hostbuffer);
        //clear the caller's pointer, not the local handle: nulling the handle
        //itself would make the dereference below a null-pointer access and
        //leave the caller holding a dangling pointer
        *hostbuffer = NULL;
    }
    if(*hostbuffer==NULL)
    {
        *hostbuffer = new(std::nothrow) T[N];
        if(*hostbuffer==NULL)
        {
            return -6; //failed to allocate host buffer
        }
    }
    const cudaError_t copystatus = cudaMemcpy(*hostbuffer, devbuffer, sizeof(T)*N, cudaMemcpyDeviceToHost);
    return (copystatus==cudaSuccess) ? 1 : -7;
}
//Copies a single object of type T from the host to the device.
//Any allocation already held in *devptr is released and a fresh one is made,
//so the device pointer value changes on every successful call.
//  hostptr : object on the host to copy
//  devptr  : address of the device pointer that receives the new allocation
//returns  1 : success
//        -2 : hostptr is NULL
//        -3 : cudaMalloc failed (*devptr is left NULL)
//        -4 : cudaMemcpy failed
template<typename T> int cuda_copytodevice(T *hostptr, T **devptr)
{
    if(hostptr==NULL)
    {
        return -2; //host buffer is NULL
    }
    //the previous allocation is always replaced
    if(*devptr!=NULL)
    {
        cuda_free(devptr);
    }
    if(*devptr==NULL)
    {
        const cudaError_t allocstatus = cudaMalloc(devptr,sizeof(T));
        if(allocstatus!=cudaSuccess)
        {
            *devptr = NULL;
            return -3; //failed to allocate
        }
    }
    const cudaError_t copystatus = cudaMemcpy(*devptr,hostptr,sizeof(T),cudaMemcpyHostToDevice);
    return (copystatus==cudaSuccess) ? 1 : -4;
}
//Copies a single object of type T from the device to the host.
//Any object already held in *hostptr is deleted and a fresh one is allocated
//with new before the copy.
//  devptr  : object on the device to copy
//  hostptr : address of the host pointer that receives the new object
//returns  1 : success
//        -5 : devptr is NULL
//        -6 : host allocation failed (*hostptr is left NULL)
//        -7 : cudaMemcpy failed
template<typename T> int cuda_copyfromdevice(T *devptr, T **hostptr)
{
    if(devptr==NULL)
    {
        return -5; //null dev buffer
    }
    if(*hostptr!=NULL)
    {
        delete (*hostptr);
        //clear the caller's pointer, not the local handle: nulling the handle
        //itself would make the dereference below a null-pointer access and
        //leave the caller holding a dangling pointer
        *hostptr = NULL;
    }
    if(*hostptr==NULL)
    {
        *hostptr = new(std::nothrow) T;
        if(*hostptr==NULL)
        {
            return -6; //failed to allocate host buffer
        }
    }
    const cudaError_t copystatus = cudaMemcpy(*hostptr, devptr, sizeof(T), cudaMemcpyDeviceToHost);
    return (copystatus==cudaSuccess) ? 1 : -7;
}
};
#endif
#ifndef __AMSCU_CUDAFUNCTIONS_IMPL_HPP__
#define __AMSCU_CUDAFUNCTIONS_IMPL_HPP__
namespace amscuda
{
//Releases the device allocation referenced by *devptr and resets *devptr to NULL.
//Wrapper around cudaFree.
//  devptr : address of the device pointer to release (must itself be non-NULL)
//returns  0 : *devptr was already NULL, nothing was done
//         1 : cudaFree succeeded
//        -1 : cudaFree reported an error
template<typename T> int cuda_free(T **devptr)
{
    if(*devptr==NULL)
    {
        return 0; //already released
    }
    const cudaError_t status = cudaFree(*devptr);
    //the pointer is cleared on both outcomes; a failure here is expected only
    //when the pointer had already been released
    *devptr = NULL;
    return (status==cudaSuccess) ? 1 : -1;
}
//Copies N elements of type T from hostbuffer into the device buffer *devbuffer.
//  hostbuffer : source array on the host
//  devbuffer  : address of the device pointer; allocated here when *devbuffer is NULL
//  N          : element count
//  overwrite  : when true an existing *devbuffer is freed and reallocated (resize);
//               when false an existing *devbuffer is reused as-is, with no capacity check
//returns  0 : N<=0, nothing done
//         1 : success
//        -2 : hostbuffer is NULL
//        -3 : cudaMalloc failed (*devbuffer is left NULL)
//        -4 : cudaMemcpy failed
template<typename T> int buffer_copytodevice(T *hostbuffer, T **devbuffer, long N, bool overwrite)
{
    if(N<=0)
    {
        return 0;
    }
    if(hostbuffer==NULL)
    {
        return -2; //host buffer is NULL
    }
    if(overwrite && *devbuffer!=NULL)
    {
        cuda_free(devbuffer); //leaves *devbuffer NULL so the allocation below runs
    }
    if(*devbuffer==NULL)
    {
        const cudaError_t allocstatus = cudaMalloc(devbuffer,sizeof(T)*N);
        if(allocstatus!=cudaSuccess)
        {
            *devbuffer = NULL;
            return -3; //failed to allocate
        }
    }
    const cudaError_t copystatus = cudaMemcpy(*devbuffer,hostbuffer,sizeof(T)*N,cudaMemcpyHostToDevice);
    return (copystatus==cudaSuccess) ? 1 : -4;
}
//Copies N elements of type T from the device buffer devbuffer into *hostbuffer.
//  devbuffer  : source array on the device
//  hostbuffer : address of the host pointer; allocated here with new[] when *hostbuffer is NULL
//  N          : element count
//  overwrite  : when true an existing *hostbuffer is delete[]d and reallocated (resize);
//               when false an existing *hostbuffer is reused as-is, with no capacity check
//returns  0 : N<=0, nothing done
//         1 : success
//        -5 : devbuffer is NULL
//        -6 : host allocation failed (*hostbuffer is left NULL)
//        -7 : cudaMemcpy failed
template<typename T> int buffer_copyfromdevice(T *devbuffer, T **hostbuffer, long N, bool overwrite)
{
    if(N<=0)
    {
        return 0;
    }
    if(devbuffer==NULL)
    {
        return -5; //null dev buffer
    }
    if(overwrite && *hostbuffer!=NULL)
    {
        delete[] (*hostbuffer);
        //clear the caller's pointer, not the local handle: nulling the handle
        //itself would make the dereference below a null-pointer access and
        //leave the caller holding a dangling pointer
        *hostbuffer = NULL;
    }
    if(*hostbuffer==NULL)
    {
        *hostbuffer = new(std::nothrow) T[N];
        if(*hostbuffer==NULL)
        {
            return -6; //failed to allocate host buffer
        }
    }
    const cudaError_t copystatus = cudaMemcpy(*hostbuffer, devbuffer, sizeof(T)*N, cudaMemcpyDeviceToHost);
    return (copystatus==cudaSuccess) ? 1 : -7;
}
//Copies a single object of type T from the host to the device.
//Any allocation already held in *devptr is released and a fresh one is made,
//so the device pointer value changes on every successful call.
//  hostptr : object on the host to copy
//  devptr  : address of the device pointer that receives the new allocation
//returns  1 : success
//        -2 : hostptr is NULL
//        -3 : cudaMalloc failed (*devptr is left NULL)
//        -4 : cudaMemcpy failed
template<typename T> int cuda_copytodevice(T *hostptr, T **devptr)
{
    if(hostptr==NULL)
    {
        return -2; //host buffer is NULL
    }
    //the previous allocation is always replaced
    if(*devptr!=NULL)
    {
        cuda_free(devptr);
    }
    if(*devptr==NULL)
    {
        const cudaError_t allocstatus = cudaMalloc(devptr,sizeof(T));
        if(allocstatus!=cudaSuccess)
        {
            *devptr = NULL;
            return -3; //failed to allocate
        }
    }
    const cudaError_t copystatus = cudaMemcpy(*devptr,hostptr,sizeof(T),cudaMemcpyHostToDevice);
    return (copystatus==cudaSuccess) ? 1 : -4;
}
//Copies a single object of type T from the device to the host.
//Any object already held in *hostptr is deleted and a fresh one is allocated
//with new before the copy.
//  devptr  : object on the device to copy
//  hostptr : address of the host pointer that receives the new object
//returns  1 : success
//        -5 : devptr is NULL
//        -6 : host allocation failed (*hostptr is left NULL)
//        -7 : cudaMemcpy failed
template<typename T> int cuda_copyfromdevice(T *devptr, T **hostptr)
{
    if(devptr==NULL)
    {
        return -5; //null dev buffer
    }
    if(*hostptr!=NULL)
    {
        delete (*hostptr);
        //clear the caller's pointer, not the local handle: nulling the handle
        //itself would make the dereference below a null-pointer access and
        //leave the caller holding a dangling pointer
        *hostptr = NULL;
    }
    if(*hostptr==NULL)
    {
        *hostptr = new(std::nothrow) T;
        if(*hostptr==NULL)
        {
            return -6; //failed to allocate host buffer
        }
    }
    const cudaError_t copystatus = cudaMemcpy(*hostptr, devptr, sizeof(T), cudaMemcpyDeviceToHost);
    return (copystatus==cudaSuccess) ? 1 : -7;
}
};
#endif

View File

@ -1,55 +1,55 @@
#ifndef __AMSCU_RANDOM_HPP__
#define __AMSCU_RANDOM_HPP__
// Declarations of the library's hash and pseudorandom helpers.
// NOTE(review): int32_t / int64_t are used below but no header is included here;
// presumably <cstdint> (or equivalent) is included before this file - confirm
namespace amscuda
{
// Random Number Generators
// faster floating point hash function used in fractal generators
// (1-, 3- and 4-argument variants; each returns a single float)
__device__ __host__ float fhash1d_su(float x);
__device__ __host__ float fhash3d_su(float x, float y, float z);
__device__ __host__ float fhash4d_su(float x, float y, float z, float w);
//////////////////////////////////////////////////
// Deterministic Pseudorandom int32_t Generator //
//////////////////////////////////////////////////
//Next seed in simple 32 bit integer deterministic pseudo-rand generator
// the seed is passed by pointer and updated in place
__host__ __device__ void dpr32_nextseed(int32_t *rseed_inout);
//Simple 32 bit integer deterministic pseudo-random generator
// *not* for cryptography
// Frequency of generated floats should be uniform [0,1)
__host__ __device__ float dpr32_randf(int32_t *rseed_inout);
//box muller standard normal pseudorandom variable
__host__ __device__ float dpr32_randnf(int32_t *rseed_inout);
//////////////////////////////////////////////////
// Deterministic Pseudorandom int64_t Generator //
//////////////////////////////////////////////////
//operates without side-effects on explicit seed for threaded use
//deterministic pseudorandom number generator - takes seed and returns next seed
__host__ __device__ void dpr64_nextseed(int64_t *seedinout);
//deterministic pseudorandom number generator - takes seed and returns next seed
//returns uniformly distributed double
__host__ __device__ double dpr64_randd(int64_t *seedinout);
// float-returning counterpart of dpr64_randd
// NOTE(review): presumably the same distribution as dpr64_randd - confirm in the implementation
__host__ __device__ float dpr64_randf(int64_t *seedinout);
// test routines; no execution-space qualifier, so host-only
void test_dprg64();
void test_dprg32();
}; //end namespace amscuda
#endif
#ifndef __AMSCU_RANDOM_HPP__
#define __AMSCU_RANDOM_HPP__
// Declarations of the library's hash and pseudorandom helpers.
// NOTE(review): int32_t / int64_t are used below but no header is included here;
// presumably <cstdint> (or equivalent) is included before this file - confirm
namespace amscuda
{
// Random Number Generators
// faster floating point hash function used in fractal generators
// (1-, 3- and 4-argument variants; each returns a single float)
__device__ __host__ float fhash1d_su(float x);
__device__ __host__ float fhash3d_su(float x, float y, float z);
__device__ __host__ float fhash4d_su(float x, float y, float z, float w);
//////////////////////////////////////////////////
// Deterministic Pseudorandom int32_t Generator //
//////////////////////////////////////////////////
//Next seed in simple 32 bit integer deterministic pseudo-rand generator
// the seed is passed by pointer and updated in place
__host__ __device__ void dpr32_nextseed(int32_t *rseed_inout);
//Simple 32 bit integer deterministic pseudo-random generator
// *not* for cryptography
// Frequency of generated floats should be uniform [0,1)
__host__ __device__ float dpr32_randf(int32_t *rseed_inout);
//box muller standard normal pseudorandom variable
__host__ __device__ float dpr32_randnf(int32_t *rseed_inout);
//////////////////////////////////////////////////
// Deterministic Pseudorandom int64_t Generator //
//////////////////////////////////////////////////
//operates without side-effects on explicit seed for threaded use
//deterministic pseudorandom number generator - takes seed and returns next seed
__host__ __device__ void dpr64_nextseed(int64_t *seedinout);
//deterministic pseudorandom number generator - takes seed and returns next seed
//returns uniformly distributed double
__host__ __device__ double dpr64_randd(int64_t *seedinout);
// float-returning counterpart of dpr64_randd
// NOTE(review): presumably the same distribution as dpr64_randd - confirm in the implementation
__host__ __device__ float dpr64_randf(int64_t *seedinout);
// test routines; no execution-space qualifier, so host-only
void test_dprg64();
void test_dprg32();
}; //end namespace amscuda
#endif

View File

@ -1,47 +1,47 @@
#ifndef __CUARRAY_HPP__
#define __CUARRAY_HPP__
namespace amscuda
{
// Simple dynamic array of T holding an element count and a raw data pointer.
// Element access and resize are callable from host and device; the device_*
// members are host-only helpers that take a pointer to a cuarray<T>.
// Member definitions are in amscuarray_impl.hpp (not shown in this header).
template<typename T> class cuarray
{
public:
int length; // element count
T* data;    // element storage
__device__ __host__ cuarray();
__device__ __host__ ~cuarray();
//Only call this on the device for thread/block local
// dynamic arrays
__device__ __host__ int resize(const int _length);
__device__ __host__ int size() const;
// element access (mutable and read-only overloads)
// NOTE(review): whether at() bounds-checks while operator[] does not is decided in the implementation - confirm
__device__ __host__ T& at(const int I);
__device__ __host__ const T& at(const int I) const;
__device__ __host__ T& operator[](const int I);
__device__ __host__ const T& operator[](const int I) const;
// host-side transfer helpers
// NOTE(review): dptr presumably refers to a device-resident cuarray<T>; the
// exact allocation/ownership rules and int return codes are in the implementation - confirm
__host__ int device_send(cuarray<T> **dptr);
__host__ int _device_send_overwrite(cuarray<T> **dptr);
__host__ int _device_send_copy(cuarray<T> *dptr);
__host__ int device_pull(cuarray<T> *dptr);
__host__ int device_free(cuarray<T> **dptr);
__host__ int device_length(cuarray<T> *dptr);
__host__ T* device_data_ptr(cuarray<T> *dptr);
};
// test routine; no execution-space qualifier, so host-only
void test_cuarray();
};
#include <amsculib2/amscuarray_impl.hpp>
#ifndef __CUARRAY_HPP__
#define __CUARRAY_HPP__
namespace amscuda
{
// Simple dynamic array of T holding an element count and a raw data pointer.
// Element access and resize are callable from host and device; the device_*
// members are host-only helpers that take a pointer to a cuarray<T>.
// Member definitions are in amscuarray_impl.hpp (not shown in this header).
template<typename T> class cuarray
{
public:
int length; // element count
T* data;    // element storage
__device__ __host__ cuarray();
__device__ __host__ ~cuarray();
//Only call this on the device for thread/block local
// dynamic arrays
__device__ __host__ int resize(const int _length);
__device__ __host__ int size() const;
// element access (mutable and read-only overloads)
// NOTE(review): whether at() bounds-checks while operator[] does not is decided in the implementation - confirm
__device__ __host__ T& at(const int I);
__device__ __host__ const T& at(const int I) const;
__device__ __host__ T& operator[](const int I);
__device__ __host__ const T& operator[](const int I) const;
// host-side transfer helpers
// NOTE(review): dptr presumably refers to a device-resident cuarray<T>; the
// exact allocation/ownership rules and int return codes are in the implementation - confirm
__host__ int device_send(cuarray<T> **dptr);
__host__ int _device_send_overwrite(cuarray<T> **dptr);
__host__ int _device_send_copy(cuarray<T> *dptr);
__host__ int device_pull(cuarray<T> *dptr);
__host__ int device_free(cuarray<T> **dptr);
__host__ int device_length(cuarray<T> *dptr);
__host__ T* device_data_ptr(cuarray<T> *dptr);
};
// test routine; no execution-space qualifier, so host-only
void test_cuarray();
};
#include <amsculib2/amscuarray_impl.hpp>
#endif

View File

@ -1,76 +1,76 @@
#ifndef __AMSCUARRAY_DOPS_HPP__
#define __AMSCUARRAY_DOPS_HPP__
//Device Operations on Arrays
//
//Device Operations on Device Buffers
// dodb
// Host-callable operations on device buffers. None of the functions below
// carries an execution-space qualifier, so all are host functions.
// NOTE(review): the nblocks / nthreads parameters are presumably the kernel
// launch configuration used internally - confirm in amscuarray_dops_impl.hpp
namespace amscuda
{
//sum
// devcuarray_sum takes a pointer to a cuarray<T>; dbuff_sum takes a raw buffer of N elements
template<typename T> T devcuarray_sum(cuarray<T> *devptr);
template<typename T> T dbuff_sum(T *devbuffer, int N);
// Result record for dbuff_stats; every field is a float regardless of the buffer's element type
struct dbuff_statstruct
{
public:
float min;
float max;
float mean;
float stdev;
float sum;
};
//stats (min,max,mean,stdev)
// dbuff_minmax writes its results through the min / max pointers
template<typename T> void dbuff_minmax(T *devbuffer, int N, T *min, T *max);
template<typename T> dbuff_statstruct dbuff_stats(T *devbuffer, int N); //
//sets all elements to setto
template<typename T> void dbuff_setall(T *devbuffer, int N, T setto, int nblocks, int nthreads);
//random device buffer functions
// NOTE(review): presumably fill devbuffer with N pseudorandom floats and advance
// the seed passed through rseedinout - confirm in the implementation
void dbuff_rand_dpr32(float *devbuffer, int N, int32_t *rseedinout, int nblocks, int nthreads); //
void dbuff_rand_dpr32n(float *devbuffer, int N, int32_t *rseedinout, int nblocks, int nthreads); //
void dbuff_rand_dpr64(float *devbuffer, int N, int64_t *rseedinout, int nblocks, int nthreads); //
//Elementwise device-buffer vector binary operation
//takes two input arrays ( , ) --> one output array
// fpnt is the binary function applied to each pair of elements
template<typename T1, typename T2, typename T3> void dbuff_vectorbinop(T1 *dbuf_a, T2 *dbuf_b, T3 *dbuf_out, int N, T3 (*fpnt)(T1,T2), int nblocks, int nthreads);
//Elementwise device-buffer vector two-parameter operation
//takes one input array, and a constant parameter ( ) ---> one output array
template<typename T1, typename T2, typename T3> void dbuff_vectorbinop(T1 *dbuf_a, T2 par_b, T3 *dbuf_out, int N, T3 (*fpnt)(T1,T2), int nblocks, int nthreads);
//elementwise add / sub / mult / div
// each operation has a buffer-buffer overload and a buffer-scalar overload;
// dbuff_div additionally has a scalar-buffer overload (par_a, dbuff_b)
template<typename T> void dbuff_add(T *dbuff_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads);
template<typename T> void dbuff_add(T *dbuff_a, T par_b, T *dbuff_out, int N, int nblocks, int nthreads);
template<typename T> void dbuff_sub(T *dbuff_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads);
template<typename T> void dbuff_sub(T *dbuff_a, T par_b, T *dbuff_out, int N, int nblocks, int nthreads);
template<typename T> void dbuff_mult(T *dbuff_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads);
template<typename T> void dbuff_mult(T *dbuff_a, T par_b, T *dbuff_out, int N, int nblocks, int nthreads);
template<typename T> void dbuff_div(T *dbuff_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads);
template<typename T> void dbuff_div(T *dbuff_a, T par_b, T *dbuff_out, int N, int nblocks, int nthreads);
template<typename T> void dbuff_div(T par_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads);
// Tests //
void test_dbuff_rand_dpr32();
};
#include <amsculib2/amscuarray_dops_impl.hpp>
#endif
#ifndef __AMSCUARRAY_DOPS_HPP__
#define __AMSCUARRAY_DOPS_HPP__
//Device Operations on Arrays
//
//Device Operations on Device Buffers
// dodb
// Host-callable operations on device buffers. None of the functions below
// carries an execution-space qualifier, so all are host functions.
// NOTE(review): the nblocks / nthreads parameters are presumably the kernel
// launch configuration used internally - confirm in amscuarray_dops_impl.hpp
namespace amscuda
{
//sum
// devcuarray_sum takes a pointer to a cuarray<T>; dbuff_sum takes a raw buffer of N elements
template<typename T> T devcuarray_sum(cuarray<T> *devptr);
template<typename T> T dbuff_sum(T *devbuffer, int N);
// Result record for dbuff_stats; every field is a float regardless of the buffer's element type
struct dbuff_statstruct
{
public:
float min;
float max;
float mean;
float stdev;
float sum;
};
//stats (min,max,mean,stdev)
// dbuff_minmax writes its results through the min / max pointers
template<typename T> void dbuff_minmax(T *devbuffer, int N, T *min, T *max);
template<typename T> dbuff_statstruct dbuff_stats(T *devbuffer, int N); //
//sets all elements to setto
template<typename T> void dbuff_setall(T *devbuffer, int N, T setto, int nblocks, int nthreads);
//random device buffer functions
// NOTE(review): presumably fill devbuffer with N pseudorandom floats and advance
// the seed passed through rseedinout - confirm in the implementation
void dbuff_rand_dpr32(float *devbuffer, int N, int32_t *rseedinout, int nblocks, int nthreads); //
void dbuff_rand_dpr32n(float *devbuffer, int N, int32_t *rseedinout, int nblocks, int nthreads); //
void dbuff_rand_dpr64(float *devbuffer, int N, int64_t *rseedinout, int nblocks, int nthreads); //
//Elementwise device-buffer vector binary operation
//takes two input arrays ( , ) --> one output array
// fpnt is the binary function applied to each pair of elements
template<typename T1, typename T2, typename T3> void dbuff_vectorbinop(T1 *dbuf_a, T2 *dbuf_b, T3 *dbuf_out, int N, T3 (*fpnt)(T1,T2), int nblocks, int nthreads);
//Elementwise device-buffer vector two-parameter operation
//takes one input array, and a constant parameter ( ) ---> one output array
template<typename T1, typename T2, typename T3> void dbuff_vectorbinop(T1 *dbuf_a, T2 par_b, T3 *dbuf_out, int N, T3 (*fpnt)(T1,T2), int nblocks, int nthreads);
//elementwise add / sub / mult / div
// each operation has a buffer-buffer overload and a buffer-scalar overload;
// dbuff_div additionally has a scalar-buffer overload (par_a, dbuff_b)
template<typename T> void dbuff_add(T *dbuff_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads);
template<typename T> void dbuff_add(T *dbuff_a, T par_b, T *dbuff_out, int N, int nblocks, int nthreads);
template<typename T> void dbuff_sub(T *dbuff_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads);
template<typename T> void dbuff_sub(T *dbuff_a, T par_b, T *dbuff_out, int N, int nblocks, int nthreads);
template<typename T> void dbuff_mult(T *dbuff_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads);
template<typename T> void dbuff_mult(T *dbuff_a, T par_b, T *dbuff_out, int N, int nblocks, int nthreads);
template<typename T> void dbuff_div(T *dbuff_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads);
template<typename T> void dbuff_div(T *dbuff_a, T par_b, T *dbuff_out, int N, int nblocks, int nthreads);
template<typename T> void dbuff_div(T par_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads);
// Tests //
void test_dbuff_rand_dpr32();
};
#include <amsculib2/amscuarray_dops_impl.hpp>
#endif

View File

@ -1,404 +1,404 @@
#ifndef __AMSCUARRAY_DOPS_IMPL_HPP__
#define __AMSCUARRAY_DOPS_IMPL_HPP__
namespace amscuda
{
// Kernel: every thread adds up the elements of devbuffer it visits (starting at
// its global thread index and advancing by the total number of launched
// threads) and stores that partial sum in rets[global thread index].
// rets needs one slot per launched thread (gridDim.x*blockDim.x).
template<typename T> __global__ void dbuff_sum_kf(T *devbuffer, int N, T *rets)
{
    const int tid = threadIdx.x + blockIdx.x*blockDim.x;
    const int stride = blockDim.x*gridDim.x;
    T partial = (T) 0;
    int idx = tid;
    while(idx<N)
    {
        partial = partial + devbuffer[idx];
        idx = idx + stride;
    }
    // threads that visited nothing still publish a zero
    rets[tid] = partial;
}
// Sums the elements of a cuarray<T> whose struct is stored in device memory.
//   devptr : device pointer to a cuarray<T>
// Returns the sum of its elements, or T() when devptr is NULL or the struct
// could not be copied back to the host.
template<typename T> T devcuarray_sum(cuarray<T> *devptr)
{
    T ret = T();
    cudaError_t err = cudaSuccess;
    cuarray<T> ldptr;

    if(devptr==NULL)
    {
        return ret;
    }

    // Shallow copy of the device-side struct; afterwards ldptr.data holds a
    // DEVICE address and must not be dereferenced or delete[]'d on the host.
    err = cudaMemcpy(&ldptr,devptr,sizeof(cuarray<T>),cudaMemcpyDeviceToHost);
    if(err==cudaSuccess)
    {
        // was devbuffer_sum(), which is not declared anywhere in this library
        ret = dbuff_sum(ldptr.data,ldptr.length);
    }
    else
    {
        printf("amscu::devcuarray_sum error: %s\n",cudaGetErrorString(err));
    }

    // detach the device pointer before ~cuarray() runs on ldptr
    ldptr.data = NULL;
    ldptr.length=0;
    return ret;
}
// Sums the N elements of the device buffer dbuff.
// A kernel produces one partial sum per launched thread; the partial sums are
// copied to the host and added up there.
// Returns T() for a NULL / empty buffer and also when any CUDA call fails
// (the failure is reported on stdout).
template<typename T> T dbuff_sum(T *dbuff, int N)
{
    int I;
    T ret = T();
    cudaError_t err = cudaSuccess;
    int nblocks = 1;
    int nthreads = 1;
    int npartials;
    T *rets = NULL;
    T *devrets = NULL;

    if(dbuff==NULL || N<=0)
    {
        return ret;
    }

    // launch heuristic: 10 blocks of about sqrt(N/10) threads (1..512) for
    // larger inputs, a single thread otherwise
    if(N>100)
    {
        nblocks = 10;
        nthreads = (int)sqrt((float) (N/nblocks));
        if(nthreads<=0) nthreads=1;
        if(nthreads>512) nthreads=512;
    }
    npartials = nblocks*nthreads;

    err = cudaMalloc(&devrets,sizeof(T)*npartials);
    if(err!=cudaSuccess)
    {
        printf("amscu::dbuff_sum error: %s\n",cudaGetErrorString(err));
        return ret;
    }
    rets = new T[npartials];

    dbuff_sum_kf<<<nblocks,nthreads>>>(dbuff,N,devrets);
    cudaDeviceSynchronize();
    err = cudaGetLastError();
    if(err==cudaSuccess)
    {
        err = cudaMemcpy(rets,devrets,sizeof(T)*npartials,cudaMemcpyDeviceToHost);
    }

    if(err==cudaSuccess)
    {
        ret = (T)0;
        for(I=0;I<npartials;I++)
        {
            ret = ret + rets[I];
        }
    }
    else
    {
        // do not add up partial sums that were never produced / copied
        printf("amscu::dbuff_sum error: %s\n",cudaGetErrorString(err));
    }

    cudaFree(devrets); devrets = NULL;
    delete[] rets;
    return ret;
}
// Kernel: every thread tracks the largest and smallest element among the
// entries of devbuffer it visits (global thread index, then steps of the total
// thread count) in maxs[global thread index] / mins[global thread index].
// A thread whose index is >= N visits nothing and leaves its slots unwritten.
template<typename T> __global__ void dbuff_minmax_kf(T *devbuffer, int N, T *maxs, T *mins)
{
    const int tid = threadIdx.x + blockIdx.x*blockDim.x;
    const int stride = blockDim.x*gridDim.x;

    if(tid>=N)
    {
        return;
    }

    // the first visited element seeds both extremes
    maxs[tid] = devbuffer[tid];
    mins[tid] = devbuffer[tid];

    for(int idx=tid+stride; idx<N; idx=idx+stride)
    {
        if(devbuffer[idx]>maxs[tid])
        {
            maxs[tid] = devbuffer[idx];
        }
        if(devbuffer[idx]<mins[tid])
        {
            mins[tid] = devbuffer[idx];
        }
    }
    return;
}
// Finds the smallest and largest of the N elements in the device buffer.
//   min, max : host output pointers; either may be NULL to skip that result.
// A kernel computes per-thread extremes which are then reduced on the host.
// For a NULL / empty buffer, or when a CUDA call fails (reported on stdout),
// both outputs are set to T(0).
template<typename T> void dbuff_minmax(T *devbuffer, int N, T *min, T *max)
{
    cudaError_t err = cudaSuccess;
    int nblocks = 1;
    int nthreads = 1;
    int nslots;
    int I;
    T *maxs = NULL;
    T *dev_maxs = NULL;
    T *mins = NULL;
    T *dev_mins = NULL;
    T localmax = T(0);
    T localmin = T(0);

    if(devbuffer==NULL || N<=0)
    {
        if(min!=NULL) *min = T(0);
        if(max!=NULL) *max = T(0);
        return;
    }

    // launch heuristic: 25 blocks of about sqrt(N/25) threads (1..512) for
    // larger inputs, a single thread otherwise. The thread count never
    // exceeds N, so every per-thread slot gets written by the kernel.
    if(N>25)
    {
        nblocks = 25;
        nthreads = (int) sqrt((float)(N/nblocks));
        if(nthreads<1) nthreads = 1;
        if(nthreads>512) nthreads = 512;
    }
    nslots = nblocks*nthreads;

    // cudaMalloc takes a size in BYTES; the element count alone was passed
    // before, leaving the buffers too small for any T wider than one byte.
    err = cudaMalloc(&dev_maxs,sizeof(T)*nslots);
    if(err==cudaSuccess)
    {
        err = cudaMalloc(&dev_mins,sizeof(T)*nslots);
    }
    if(err!=cudaSuccess)
    {
        printf("amscu::dbuff_minmax error: %s\n",cudaGetErrorString(err));
        cudaFree(dev_maxs); dev_maxs = NULL;
        if(max!=NULL) *max = localmax;
        if(min!=NULL) *min = localmin;
        return;
    }

    maxs = new T[nslots];
    mins = new T[nslots];

    dbuff_minmax_kf<<<nblocks,nthreads>>>(devbuffer,N,dev_maxs,dev_mins);
    cudaDeviceSynchronize();
    err = cudaGetLastError();
    if(err==cudaSuccess)
    {
        err = cudaMemcpy(maxs,dev_maxs,sizeof(T)*nslots,cudaMemcpyDeviceToHost);
    }
    if(err==cudaSuccess)
    {
        err = cudaMemcpy(mins,dev_mins,sizeof(T)*nslots,cudaMemcpyDeviceToHost);
    }

    if(err==cudaSuccess)
    {
        // host-side reduction of the per-thread extremes
        localmax = maxs[0];
        localmin = mins[0];
        for(I=1;I<nslots;I++)
        {
            if(maxs[I]>localmax) localmax = maxs[I];
            if(mins[I]<localmin) localmin = mins[I];
        }
    }
    else
    {
        printf("amscu::dbuff_minmax error: %s\n",cudaGetErrorString(err));
    }

    if(max!=NULL) *max = localmax;
    if(min!=NULL) *min = localmin;

    cudaFree(dev_maxs); dev_maxs = NULL;
    cudaFree(dev_mins); dev_mins = NULL;
    delete[] maxs; maxs = NULL;
    delete[] mins; mins = NULL;
    return;
}
// Kernel: writes setto into every element of devbuffer. Each thread starts at
// its global thread index and advances by the total number of launched threads.
template<typename T> __global__ void dbuff_setall_kf(T *devbuffer, int N, T setto)
{
    const int first = threadIdx.x + blockIdx.x*blockDim.x;
    const int step = blockDim.x*gridDim.x;
    int k = first;
    while(k<N)
    {
        devbuffer[k] = setto;
        k = k + step;
    }
    return;
}
// Sets all N elements of the device buffer to setto.
//   nblocks, nthreads : kernel launch configuration, used as given.
// Does nothing for a NULL or empty buffer; a CUDA error is printed to stdout.
template<typename T> void dbuff_setall(T *devbuffer, int N, T setto, int nblocks, int nthreads)
{
    if(devbuffer!=NULL && N>0)
    {
        dbuff_setall_kf<<<nblocks,nthreads>>>(devbuffer,N,setto);
        cudaDeviceSynchronize();
        const cudaError_t status = cudaGetLastError();
        if(status!=cudaSuccess)
        {
            printf("amscu::dbuff_setall error: %s\n",cudaGetErrorString(status));
        }
    }
    return;
}
// Kernel: dbuf_out[k] = fpnt(dbuf_a[k], dbuf_b[k]) for k in [0,N). Each thread
// starts at its global thread index and advances by the total thread count.
// fpnt is called from device code.
template<typename T1, typename T2, typename T3> __global__ void dbuff_vectorbinop_kf1(T1 *dbuf_a, T2 *dbuf_b, T3 *dbuf_out, int N, T3 (*fpnt)(T1,T2))
{
    const int first = threadIdx.x + blockIdx.x*blockDim.x;
    const int step = blockDim.x*gridDim.x;
    for(int k=first; k<N; k=k+step)
    {
        T1 lhs = dbuf_a[k];
        T2 rhs = dbuf_b[k];
        T3 result = fpnt(lhs,rhs);
        dbuf_out[k] = result;
    }
    return;
}
// Kernel: dbuf_out[k] = fpnt(dbuf_a[k], par_b) for k in [0,N), with par_b the
// same for every element. Each thread starts at its global thread index and
// advances by the total thread count. fpnt is called from device code.
template<typename T1, typename T2, typename T3> __global__ void dbuff_vectorbinop_kf2(T1 *dbuf_a, T2 par_b, T3 *dbuf_out, int N, T3 (*fpnt)(T1,T2))
{
    const int first = threadIdx.x + blockIdx.x*blockDim.x;
    const int step = blockDim.x*gridDim.x;
    for(int k=first; k<N; k=k+step)
    {
        T1 lhs = dbuf_a[k];
        T2 rhs = par_b;
        T3 result = fpnt(lhs,rhs);
        dbuf_out[k] = result;
    }
    return;
}
//Elementwise device-buffer vector binary operation
//takes two input arrays ( , ) --> one output array
//  dbuf_out[i] = fpnt(dbuf_a[i], dbuf_b[i]) for i in [0,N)
//  nblocks, nthreads : kernel launch configuration, used as given.
//Does nothing if any buffer is NULL or N<=0; a CUDA error is printed to stdout.
//NOTE(review): fpnt is invoked inside the kernel, so it has to be an address
//that is callable from device code; the address of an ordinary host function
//is not - confirm how callers obtain the pointer.
template<typename T1, typename T2, typename T3> void dbuff_vectorbinop(T1 *dbuf_a, T2 *dbuf_b, T3 *dbuf_out, int N, T3 (*fpnt)(T1,T2), int nblocks, int nthreads)
{
    cudaError_t err = cudaSuccess;

    if(dbuf_a == NULL || dbuf_b == NULL || dbuf_out == NULL || N<=0)
    {
        return;
    }

    // the kernel takes five arguments; fpnt was missing from the launch
    dbuff_vectorbinop_kf1<<<nblocks,nthreads>>>(dbuf_a,dbuf_b,dbuf_out,N,fpnt);
    cudaDeviceSynchronize();
    err = cudaGetLastError();
    if(err!=cudaSuccess)
    {
        printf("amscu::devbuffer_vectorbinop error: %s\n",cudaGetErrorString(err));
    }

    return;
}
//Elementwise device-buffer vector two-parameter operation
//takes one input array, and a constant parameter ( ) ---> one output array
//  dbuf_out[i] = fpnt(dbuf_a[i], par_b) for i in [0,N)
//  nblocks, nthreads : kernel launch configuration, used as given.
//Does nothing if a buffer is NULL or N<=0; a CUDA error is printed to stdout.
//NOTE(review): fpnt is invoked inside the kernel, so it has to be an address
//that is callable from device code; the address of an ordinary host function
//is not - confirm how callers obtain the pointer.
template<typename T1, typename T2, typename T3> void dbuff_vectorbinop(T1 *dbuf_a, T2 par_b, T3 *dbuf_out, int N, T3 (*fpnt)(T1,T2), int nblocks, int nthreads)
{
    cudaError_t err = cudaSuccess;

    if(dbuf_a == NULL || dbuf_out == NULL || N<=0)
    {
        return;
    }

    // the kernel takes five arguments; fpnt was missing from the launch
    dbuff_vectorbinop_kf2<<<nblocks,nthreads>>>(dbuf_a,par_b,dbuf_out,N,fpnt);
    cudaDeviceSynchronize();
    err = cudaGetLastError();
    if(err!=cudaSuccess)
    {
        printf("amscu::devbuffer_vectorbinop error: %s\n",cudaGetErrorString(err));
    }

    return;
}
// Binary functor used by dbuff_add: returns a+b.
// NOTE(review): declared without a __device__ qualifier, yet its address is
// passed to dbuff_vectorbinop, whose kernels call it - confirm this is valid.
template<typename T> T dbuff_add_fn(T a, T b)
{
return a+b;
}
// Elementwise sum of two device buffers: dbuff_out[i] = dbuff_a[i] + dbuff_b[i].
// Forwards to the array/array dbuff_vectorbinop with dbuff_add_fn.
template<typename T> void dbuff_add(T *dbuff_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads)
{
dbuff_vectorbinop(dbuff_a,dbuff_b,dbuff_out,N,&dbuff_add_fn,nblocks,nthreads);
return;
}
// Adds the constant par_b to every element: dbuff_out[i] = dbuff_a[i] + par_b.
// Forwards to the array/scalar dbuff_vectorbinop with dbuff_add_fn.
template<typename T> void dbuff_add(T *dbuff_a, T par_b, T *dbuff_out, int N, int nblocks, int nthreads)
{
dbuff_vectorbinop(dbuff_a,par_b,dbuff_out,N,&dbuff_add_fn,nblocks,nthreads);
return;
}
// Binary functor used by dbuff_sub: returns a-b.
// NOTE(review): declared without a __device__ qualifier, yet its address is
// passed to dbuff_vectorbinop, whose kernels call it - confirm this is valid.
template<typename T> T dbuff_sub_fn(T a, T b)
{
return a-b;
}
// Elementwise difference of two device buffers: dbuff_out[i] = dbuff_a[i] - dbuff_b[i].
// Forwards to the array/array dbuff_vectorbinop with dbuff_sub_fn.
template<typename T> void dbuff_sub(T *dbuff_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads)
{
dbuff_vectorbinop(dbuff_a,dbuff_b,dbuff_out,N,&dbuff_sub_fn,nblocks,nthreads);
return;
}
// Subtracts the constant par_b from every element: dbuff_out[i] = dbuff_a[i] - par_b.
// Forwards to the array/scalar dbuff_vectorbinop with dbuff_sub_fn.
template<typename T> void dbuff_sub(T *dbuff_a, T par_b, T *dbuff_out, int N, int nblocks, int nthreads)
{
dbuff_vectorbinop(dbuff_a,par_b,dbuff_out,N,&dbuff_sub_fn,nblocks,nthreads);
return;
}
// Binary functor used by dbuff_mult: returns a*b.
// NOTE(review): declared without a __device__ qualifier, yet its address is
// passed to dbuff_vectorbinop, whose kernels call it - confirm this is valid.
template<typename T> T dbuff_mult_fn(T a, T b)
{
return a*b;
}
// Elementwise product of two device buffers: dbuff_out[i] = dbuff_a[i] * dbuff_b[i].
// Forwards to the array/array dbuff_vectorbinop with dbuff_mult_fn.
template<typename T> void dbuff_mult(T *dbuff_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads)
{
dbuff_vectorbinop(dbuff_a,dbuff_b,dbuff_out,N,&dbuff_mult_fn,nblocks,nthreads);
return;
}
// Multiplies every element by the constant par_b: dbuff_out[i] = dbuff_a[i] * par_b.
// Forwards to the array/scalar dbuff_vectorbinop with dbuff_mult_fn.
template<typename T> void dbuff_mult(T *dbuff_a, T par_b, T *dbuff_out, int N, int nblocks, int nthreads)
{
dbuff_vectorbinop(dbuff_a,par_b,dbuff_out,N,&dbuff_mult_fn,nblocks,nthreads);
return;
}
// Binary functor used by dbuff_div: returns a/b (no check for a zero divisor).
// NOTE(review): declared without a __device__ qualifier, yet its address is
// passed to dbuff_vectorbinop, whose kernels call it - confirm this is valid.
template<typename T> T dbuff_div_fn(T a, T b)
{
return a/b;
}
// Elementwise quotient of two device buffers: dbuff_out[i] = dbuff_a[i] / dbuff_b[i].
// Forwards to the array/array dbuff_vectorbinop with dbuff_div_fn.
template<typename T> void dbuff_div(T *dbuff_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads)
{
dbuff_vectorbinop(dbuff_a,dbuff_b,dbuff_out,N,&dbuff_div_fn,nblocks,nthreads);
return;
}
// Divides every element by the constant par_b: dbuff_out[i] = dbuff_a[i] / par_b.
// Forwards to the array/scalar dbuff_vectorbinop with dbuff_div_fn.
template<typename T> void dbuff_div(T *dbuff_a, T par_b, T *dbuff_out, int N, int nblocks, int nthreads)
{
dbuff_vectorbinop(dbuff_a,par_b,dbuff_out,N,&dbuff_div_fn,nblocks,nthreads);
return;
}
// "Left division" functor: returns b/a, i.e. the operands of dbuff_div_fn
// swapped (no check for a zero divisor).
// NOTE(review): declared without a __device__ qualifier, yet its address is
// passed to dbuff_vectorbinop, whose kernels call it - confirm this is valid.
template<typename T> T dbuff_ldiv_fn(T a, T b)
{
return b/a;
}
// Divides the constant par_a by every element: dbuff_out[i] = par_a / dbuff_b[i].
// The buffer is passed as the first operand and par_a as the second, and
// dbuff_ldiv_fn (which returns second/first) restores the intended order.
template<typename T> void dbuff_div(T par_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads)
{
dbuff_vectorbinop(dbuff_b,par_a,dbuff_out,N,&dbuff_ldiv_fn,nblocks,nthreads);
return;
}
};
#endif
#ifndef __AMSCUARRAY_DOPS_IMPL_HPP__
#define __AMSCUARRAY_DOPS_IMPL_HPP__
namespace amscuda
{
template<typename T> __global__ void dbuff_sum_kf(T *devbuffer, int N, T *rets)
{
int I0 = threadIdx.x + blockIdx.x*blockDim.x;
int Is = blockDim.x*gridDim.x;
int I;
T ret = (T) 0;
for(I=I0;I<N;I=I+Is)
{
ret = ret + devbuffer[I];
}
rets[I0] = ret;
}
// Sums the elements of a cuarray<T> whose struct is stored in device memory.
//   devptr : device pointer to a cuarray<T>
// Returns the sum of its elements, or T() when devptr is NULL or the struct
// could not be copied back to the host.
template<typename T> T devcuarray_sum(cuarray<T> *devptr)
{
    T ret = T();
    cudaError_t err = cudaSuccess;
    cuarray<T> ldptr;

    if(devptr==NULL)
    {
        return ret;
    }

    // Shallow copy of the device-side struct; afterwards ldptr.data holds a
    // DEVICE address and must not be dereferenced or delete[]'d on the host.
    err = cudaMemcpy(&ldptr,devptr,sizeof(cuarray<T>),cudaMemcpyDeviceToHost);
    if(err==cudaSuccess)
    {
        // was devbuffer_sum(), which is not declared anywhere in this library
        ret = dbuff_sum(ldptr.data,ldptr.length);
    }
    else
    {
        printf("amscu::devcuarray_sum error: %s\n",cudaGetErrorString(err));
    }

    // detach the device pointer before ~cuarray() runs on ldptr
    ldptr.data = NULL;
    ldptr.length=0;
    return ret;
}
// Sums the N elements of the device buffer dbuff.
// A kernel produces one partial sum per launched thread; the partial sums are
// copied to the host and added up there.
// Returns T() for a NULL / empty buffer and also when any CUDA call fails
// (the failure is reported on stdout).
template<typename T> T dbuff_sum(T *dbuff, int N)
{
    int I;
    T ret = T();
    cudaError_t err = cudaSuccess;
    int nblocks = 1;
    int nthreads = 1;
    int npartials;
    T *rets = NULL;
    T *devrets = NULL;

    if(dbuff==NULL || N<=0)
    {
        return ret;
    }

    // launch heuristic: 10 blocks of about sqrt(N/10) threads (1..512) for
    // larger inputs, a single thread otherwise
    if(N>100)
    {
        nblocks = 10;
        nthreads = (int)sqrt((float) (N/nblocks));
        if(nthreads<=0) nthreads=1;
        if(nthreads>512) nthreads=512;
    }
    npartials = nblocks*nthreads;

    err = cudaMalloc(&devrets,sizeof(T)*npartials);
    if(err!=cudaSuccess)
    {
        printf("amscu::dbuff_sum error: %s\n",cudaGetErrorString(err));
        return ret;
    }
    rets = new T[npartials];

    dbuff_sum_kf<<<nblocks,nthreads>>>(dbuff,N,devrets);
    cudaDeviceSynchronize();
    err = cudaGetLastError();
    if(err==cudaSuccess)
    {
        err = cudaMemcpy(rets,devrets,sizeof(T)*npartials,cudaMemcpyDeviceToHost);
    }

    if(err==cudaSuccess)
    {
        ret = (T)0;
        for(I=0;I<npartials;I++)
        {
            ret = ret + rets[I];
        }
    }
    else
    {
        // do not add up partial sums that were never produced / copied
        printf("amscu::dbuff_sum error: %s\n",cudaGetErrorString(err));
    }

    cudaFree(devrets); devrets = NULL;
    delete[] rets;
    return ret;
}
template<typename T> __global__ void dbuff_minmax_kf(T *devbuffer, int N, T *maxs, T *mins)
{
int I0 = threadIdx.x + blockIdx.x*blockDim.x;
int Is = blockDim.x*gridDim.x;
int I;
for(I=I0;I<N;I=I+Is)
{
if(I==I0)
{
maxs[I0] = devbuffer[I];
mins[I0] = devbuffer[I];
}
else
{
if(devbuffer[I]>maxs[I0])
{
maxs[I0] = devbuffer[I];
}
if(devbuffer[I]<mins[I0])
{
mins[I0] = devbuffer[I];
}
}
}
return;
}
// Finds the smallest and largest of the N elements in the device buffer.
//   min, max : host output pointers; either may be NULL to skip that result.
// A kernel computes per-thread extremes which are then reduced on the host.
// For a NULL / empty buffer, or when a CUDA call fails (reported on stdout),
// both outputs are set to T(0).
template<typename T> void dbuff_minmax(T *devbuffer, int N, T *min, T *max)
{
    cudaError_t err = cudaSuccess;
    int nblocks = 1;
    int nthreads = 1;
    int nslots;
    int I;
    T *maxs = NULL;
    T *dev_maxs = NULL;
    T *mins = NULL;
    T *dev_mins = NULL;
    T localmax = T(0);
    T localmin = T(0);

    if(devbuffer==NULL || N<=0)
    {
        if(min!=NULL) *min = T(0);
        if(max!=NULL) *max = T(0);
        return;
    }

    // launch heuristic: 25 blocks of about sqrt(N/25) threads (1..512) for
    // larger inputs, a single thread otherwise. The thread count never
    // exceeds N, so every per-thread slot gets written by the kernel.
    if(N>25)
    {
        nblocks = 25;
        nthreads = (int) sqrt((float)(N/nblocks));
        if(nthreads<1) nthreads = 1;
        if(nthreads>512) nthreads = 512;
    }
    nslots = nblocks*nthreads;

    // cudaMalloc takes a size in BYTES; the element count alone was passed
    // before, leaving the buffers too small for any T wider than one byte.
    err = cudaMalloc(&dev_maxs,sizeof(T)*nslots);
    if(err==cudaSuccess)
    {
        err = cudaMalloc(&dev_mins,sizeof(T)*nslots);
    }
    if(err!=cudaSuccess)
    {
        printf("amscu::dbuff_minmax error: %s\n",cudaGetErrorString(err));
        cudaFree(dev_maxs); dev_maxs = NULL;
        if(max!=NULL) *max = localmax;
        if(min!=NULL) *min = localmin;
        return;
    }

    maxs = new T[nslots];
    mins = new T[nslots];

    dbuff_minmax_kf<<<nblocks,nthreads>>>(devbuffer,N,dev_maxs,dev_mins);
    cudaDeviceSynchronize();
    err = cudaGetLastError();
    if(err==cudaSuccess)
    {
        err = cudaMemcpy(maxs,dev_maxs,sizeof(T)*nslots,cudaMemcpyDeviceToHost);
    }
    if(err==cudaSuccess)
    {
        err = cudaMemcpy(mins,dev_mins,sizeof(T)*nslots,cudaMemcpyDeviceToHost);
    }

    if(err==cudaSuccess)
    {
        // host-side reduction of the per-thread extremes
        localmax = maxs[0];
        localmin = mins[0];
        for(I=1;I<nslots;I++)
        {
            if(maxs[I]>localmax) localmax = maxs[I];
            if(mins[I]<localmin) localmin = mins[I];
        }
    }
    else
    {
        printf("amscu::dbuff_minmax error: %s\n",cudaGetErrorString(err));
    }

    if(max!=NULL) *max = localmax;
    if(min!=NULL) *min = localmin;

    cudaFree(dev_maxs); dev_maxs = NULL;
    cudaFree(dev_mins); dev_mins = NULL;
    delete[] maxs; maxs = NULL;
    delete[] mins; mins = NULL;
    return;
}
template<typename T> __global__ void dbuff_setall_kf(T *devbuffer, int N, T setto)
{
int I0 = threadIdx.x + blockIdx.x*blockDim.x;
int Is = blockDim.x*gridDim.x;
int I;
for(I=I0;I<N;I=I+Is)
{
devbuffer[I] = setto;
}
return;
}
template<typename T> void dbuff_setall(T *devbuffer, int N, T setto, int nblocks, int nthreads)
{
cudaError_t err = cudaSuccess;
if(devbuffer==NULL || N<=0)
{
return;
}
dbuff_setall_kf<<<nblocks,nthreads>>>(devbuffer,N,setto);
cudaDeviceSynchronize();
err = cudaGetLastError();
if(err!=cudaSuccess)
{
printf("amscu::dbuff_setall error: %s\n",cudaGetErrorString(err));
}
return;
}
template<typename T1, typename T2, typename T3> __global__ void dbuff_vectorbinop_kf1(T1 *dbuf_a, T2 *dbuf_b, T3 *dbuf_out, int N, T3 (*fpnt)(T1,T2))
{
int I0 = threadIdx.x + blockIdx.x*blockDim.x;
int Is = blockDim.x*gridDim.x;
int I;
T1 a;
T2 b;
T3 c;
for(I=I0;I<N;I=I+Is)
{
a = dbuf_a[I];
b = dbuf_b[I];
c = fpnt(a,b);
dbuf_out[I] = c;
}
return;
}
template<typename T1, typename T2, typename T3> __global__ void dbuff_vectorbinop_kf2(T1 *dbuf_a, T2 par_b, T3 *dbuf_out, int N, T3 (*fpnt)(T1,T2))
{
int I0 = threadIdx.x + blockIdx.x*blockDim.x;
int Is = blockDim.x*gridDim.x;
int I;
T1 a;
T2 b;
T3 c;
for(I=I0;I<N;I=I+Is)
{
a = dbuf_a[I];
b = par_b;
c = fpnt(a,b);
dbuf_out[I] = c;
}
return;
}
//Elementwise device-buffer vector binary operation
//takes two input arrays ( , ) --> one output array
//  dbuf_out[i] = fpnt(dbuf_a[i], dbuf_b[i]) for i in [0,N)
//  nblocks, nthreads : kernel launch configuration, used as given.
//Does nothing if any buffer is NULL or N<=0; a CUDA error is printed to stdout.
//NOTE(review): fpnt is invoked inside the kernel, so it has to be an address
//that is callable from device code; the address of an ordinary host function
//is not - confirm how callers obtain the pointer.
template<typename T1, typename T2, typename T3> void dbuff_vectorbinop(T1 *dbuf_a, T2 *dbuf_b, T3 *dbuf_out, int N, T3 (*fpnt)(T1,T2), int nblocks, int nthreads)
{
    cudaError_t err = cudaSuccess;

    if(dbuf_a == NULL || dbuf_b == NULL || dbuf_out == NULL || N<=0)
    {
        return;
    }

    // the kernel takes five arguments; fpnt was missing from the launch
    dbuff_vectorbinop_kf1<<<nblocks,nthreads>>>(dbuf_a,dbuf_b,dbuf_out,N,fpnt);
    cudaDeviceSynchronize();
    err = cudaGetLastError();
    if(err!=cudaSuccess)
    {
        printf("amscu::devbuffer_vectorbinop error: %s\n",cudaGetErrorString(err));
    }

    return;
}
//Elementwise device-buffer vector two-parameter operation
//takes one input array, and a constant parameter ( ) ---> one output array
//  dbuf_out[i] = fpnt(dbuf_a[i], par_b) for i in [0,N)
//  nblocks, nthreads : kernel launch configuration, used as given.
//Does nothing if a buffer is NULL or N<=0; a CUDA error is printed to stdout.
//NOTE(review): fpnt is invoked inside the kernel, so it has to be an address
//that is callable from device code; the address of an ordinary host function
//is not - confirm how callers obtain the pointer.
template<typename T1, typename T2, typename T3> void dbuff_vectorbinop(T1 *dbuf_a, T2 par_b, T3 *dbuf_out, int N, T3 (*fpnt)(T1,T2), int nblocks, int nthreads)
{
    cudaError_t err = cudaSuccess;

    if(dbuf_a == NULL || dbuf_out == NULL || N<=0)
    {
        return;
    }

    // the kernel takes five arguments; fpnt was missing from the launch
    dbuff_vectorbinop_kf2<<<nblocks,nthreads>>>(dbuf_a,par_b,dbuf_out,N,fpnt);
    cudaDeviceSynchronize();
    err = cudaGetLastError();
    if(err!=cudaSuccess)
    {
        printf("amscu::devbuffer_vectorbinop error: %s\n",cudaGetErrorString(err));
    }

    return;
}
template<typename T> T dbuff_add_fn(T a, T b)
{
return a+b;
}
template<typename T> void dbuff_add(T *dbuff_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads)
{
dbuff_vectorbinop(dbuff_a,dbuff_b,dbuff_out,N,&dbuff_add_fn,nblocks,nthreads);
return;
}
template<typename T> void dbuff_add(T *dbuff_a, T par_b, T *dbuff_out, int N, int nblocks, int nthreads)
{
dbuff_vectorbinop(dbuff_a,par_b,dbuff_out,N,&dbuff_add_fn,nblocks,nthreads);
return;
}
template<typename T> T dbuff_sub_fn(T a, T b)
{
return a-b;
}
template<typename T> void dbuff_sub(T *dbuff_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads)
{
dbuff_vectorbinop(dbuff_a,dbuff_b,dbuff_out,N,&dbuff_sub_fn,nblocks,nthreads);
return;
}
template<typename T> void dbuff_sub(T *dbuff_a, T par_b, T *dbuff_out, int N, int nblocks, int nthreads)
{
dbuff_vectorbinop(dbuff_a,par_b,dbuff_out,N,&dbuff_sub_fn,nblocks,nthreads);
return;
}
template<typename T> T dbuff_mult_fn(T a, T b)
{
return a*b;
}
template<typename T> void dbuff_mult(T *dbuff_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads)
{
dbuff_vectorbinop(dbuff_a,dbuff_b,dbuff_out,N,&dbuff_mult_fn,nblocks,nthreads);
return;
}
template<typename T> void dbuff_mult(T *dbuff_a, T par_b, T *dbuff_out, int N, int nblocks, int nthreads)
{
dbuff_vectorbinop(dbuff_a,par_b,dbuff_out,N,&dbuff_mult_fn,nblocks,nthreads);
return;
}
template<typename T> T dbuff_div_fn(T a, T b)
{
return a/b;
}
template<typename T> void dbuff_div(T *dbuff_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads)
{
dbuff_vectorbinop(dbuff_a,dbuff_b,dbuff_out,N,&dbuff_div_fn,nblocks,nthreads);
return;
}
template<typename T> void dbuff_div(T *dbuff_a, T par_b, T *dbuff_out, int N, int nblocks, int nthreads)
{
dbuff_vectorbinop(dbuff_a,par_b,dbuff_out,N,&dbuff_div_fn,nblocks,nthreads);
return;
}
template<typename T> T dbuff_ldiv_fn(T a, T b)
{
return b/a;
}
template<typename T> void dbuff_div(T par_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads)
{
dbuff_vectorbinop(dbuff_b,par_a,dbuff_out,N,&dbuff_ldiv_fn,nblocks,nthreads);
return;
}
};
#endif

View File

@ -1,323 +1,323 @@
#ifndef __CUARRAY_IMPL_HPP__
#define __CUARRAY_IMPL_HPP__
namespace amscuda
{
// New Version cuarray<T>
// simpler, less crap going on
// Default constructor: an empty array (length 0, no storage).
template<typename T> __device__ __host__ cuarray<T>::cuarray()
{
length = 0;
data = NULL;
}
// Destructor: releases the storage with delete[] and resets the length.
// Because data is always delete[]'d here, a shallow copy whose data member
// holds a cudaMalloc'd address must have data set to NULL before it goes out
// of scope (the device_* helpers in this file do exactly that).
template<typename T> __device__ __host__ cuarray<T>::~cuarray()
{
if(data!=NULL)
{
delete[] data; data = NULL;
}
length = 0;
}
// Changes the number of elements to _length.
// Existing elements are kept up to the smaller of the old and new length; new
// elements are set to a value-initialised T. A request of <=0 elements frees
// the storage and leaves the array empty.
// Returns 1 on success, -1 if the new storage could not be allocated (the
// array is then left unchanged).
template<typename T> __device__ __host__ int cuarray<T>::resize(const int _length)
{
    int I;
    int ncopy = 0;
    T *newbuffer = NULL;

    if(length==_length)
    {
        //do nothing
        return 1;
    }

    if(_length<=0)
    {
        if(data!=NULL)
        {
            delete[] data;
            data = NULL;
        }
        length = 0;
        // this return was missing: execution used to fall through to
        // new T[_length] with a zero or negative element count
        return 1;
    }

    newbuffer = new T[_length];
    if(newbuffer==NULL)
    {
        return -1; //failed to allocate memory
    }

    // value-initialised filler (the old "T def;" was indeterminate for
    // built-in element types)
    T def = T();

    if(data!=NULL)
    {
        ncopy = (length<_length) ? length : _length;
        if(ncopy<0) ncopy = 0;
        for(I=0;I<ncopy;I++)
        {
            newbuffer[I] = data[I];
        }
        delete[] data; data=NULL;
    }
    for(I=ncopy;I<_length;I++)
    {
        newbuffer[I] = def;
    }

    data = newbuffer;
    length = _length;
    return 1;
}
// Copies this host array to the device.
//   dptr : address of a device pointer to a cuarray<T>. If *dptr is NULL, or
//          the device array has a different length, the device copy is
//          (re)allocated and *dptr updated; otherwise only the element data
//          is overwritten in place.
// Returns 1 on success, a negative value on failure (-5 for a NULL dptr,
// otherwise the code of the helper that performed the transfer).
template<typename T> __host__ int cuarray<T>::device_send(cuarray<T> **dptr)
{
    int ret = 0;
    int dlength;

    if(dptr==NULL)
    {
        return -5;
    }

    if(*dptr==NULL)
    {
        ret = _device_send_overwrite(dptr);
    }
    else
    {
        dlength = device_length(*dptr);
        // was "dlength=length": an assignment, so the in-place copy ran for
        // every non-empty host array regardless of the device-side size
        if(dlength==length)
        {
            ret = _device_send_copy(*dptr);
        }
        else
        {
            ret = _device_send_overwrite(dptr);
        }
    }

    return ret;
}
// Frees whatever *dptr refers to, then allocates a new device-side cuarray<T>
// struct plus element buffer and copies this array into it.
// Returns 1 on success; -1 struct allocation failed, -2 element buffer
// allocation failed, -3 a host-to-device copy failed, -4 failure while
// creating an empty device array. On any failure nothing stays allocated and
// *dptr is NULL.
template<typename T> __host__ int cuarray<T>::_device_send_overwrite(cuarray<T> **dptr)
{
    int ret = 0;
    cuarray<T> dlocal; // host staging copy of the device struct; its data member holds a device address
    cudaError_t err = cudaSuccess;

    device_free(dptr);

    if(length>=0 && data!=NULL)
    {
        err = cudaMalloc(dptr,sizeof(cuarray<T>));
        if(err!=cudaSuccess)
        {
            *dptr = NULL;
            ret = -1;
        }
        else
        {
            err = cudaMalloc(&(dlocal.data),sizeof(T)*length);
            dlocal.length = length;
            if(err!=cudaSuccess)
            {
                dlocal.data = NULL;
                ret = -2;
            }
            else
            {
                err = cudaMemcpy(*dptr,&dlocal,sizeof(cuarray<T>),cudaMemcpyHostToDevice);
                if(err==cudaSuccess)
                {
                    err = cudaMemcpy(dlocal.data,data,sizeof(T)*length,cudaMemcpyHostToDevice);
                }
                ret = (err==cudaSuccess) ? 1 : -3;
            }

            if(ret!=1)
            {
                // do not hand back a half-initialised device struct
                cudaFree(dlocal.data);
                cudaFree(*dptr);
                *dptr = NULL;
            }
        }
    }
    else
    {
        // nothing to send: create an empty device array
        dlocal.data = NULL;
        dlocal.length = 0;
        err = cudaMalloc(dptr,sizeof(cuarray<T>));
        if(err==cudaSuccess)
        {
            err = cudaMemcpy(*dptr,&dlocal,sizeof(cuarray<T>),cudaMemcpyHostToDevice);
            if(err==cudaSuccess)
            {
                ret = 1;
            }
            else
            {
                cudaFree(*dptr);
                *dptr = NULL;
                ret = -4;
            }
        }
        else
        {
            *dptr = NULL;
            ret = -4;
        }
    }

    // detach the device address before ~cuarray() runs on dlocal
    dlocal.data = NULL;
    dlocal.length = -1;

    return ret;
}
// Overwrites the element buffer of an existing device array with this array's
// elements (length elements are copied; no reallocation takes place).
// Returns 1 if the copy succeeded, -1 otherwise.
template<typename T> __host__ int cuarray<T>::_device_send_copy(cuarray<T> *dptr)
{
    T* const target = device_data_ptr(dptr);
    const cudaError_t status = cudaMemcpy(target,data,sizeof(T)*length,cudaMemcpyHostToDevice);
    return (status==cudaSuccess) ? 1 : -1;
}
// Copies a device array into this host array, resizing this array to the
// device array's length first.
//   dptr : device pointer to a cuarray<T>
// Returns 1 when elements were copied, 0 when there was nothing to copy,
// -1 for a NULL dptr, -2 if the device-to-host copy failed, -3 if this array
// could not be resized to the device length.
template<typename T> __host__ int cuarray<T>::device_pull(cuarray<T> *dptr)
{
    int ret = 0;
    int dlength;
    T* ddata;
    cudaError_t err;

    if(dptr==NULL)
    {
        ret = -1; // null d pointer
        return ret;
    }

    dlength = device_length(dptr);
    if(dlength!=length)
    {
        this->resize(dlength);
    }
    if(dlength>0 && length!=dlength)
    {
        // resize failed: copying "length" elements could over-read the device buffer
        ret = -3;
        return ret;
    }

    ddata = device_data_ptr(dptr);
    if(length>0 && data!=NULL && ddata!=NULL)
    {
        // the source is the element buffer (ddata); it used to be dptr, the
        // address of the device-side struct itself
        err = cudaMemcpy(data,ddata,length*sizeof(T),cudaMemcpyDeviceToHost);
        if(err==cudaSuccess)
        {
            ret = 1;
        }
        else
        {
            ret = -2;
        }
    }

    return ret;
}
// Frees a device array: its element buffer and the device-side struct, then
// sets *dptr to NULL.
// Returns 1 if something was freed, 0 if dptr or *dptr was NULL.
template<typename T> __host__ int cuarray<T>::device_free(cuarray<T> **dptr)
{
    int ret = 0;
    cuarray<T> dlocal;
    cudaError_t err = cudaSuccess;

    if(dptr!=NULL && *dptr!=NULL)
    {
        // read the struct from the device address *dptr; the source used to be
        // dptr (a host cuarray**), so the element buffer was never found
        err = cudaMemcpy(&dlocal,*dptr,sizeof(cuarray<T>),cudaMemcpyDeviceToHost);
        if(err==cudaSuccess && dlocal.data!=NULL)
        {
            cudaFree(dlocal.data);
        }
        dlocal.data = NULL;

        cudaFree(*dptr);
        *dptr = NULL;
        ret = 1;
    }

    // detach before ~cuarray() runs on dlocal
    dlocal.data = NULL;
    dlocal.length = -1;

    return ret;
}
// Reads the length member of a device-side cuarray<T>.
// Returns -1 when dptr is NULL. The result of the copy itself is not checked.
template<typename T> __host__ int cuarray<T>::device_length(cuarray<T> *dptr)
{
    if(dptr==NULL)
    {
        return -1;
    }

    cuarray<T> shadow;
    cudaMemcpy(&shadow,dptr,sizeof(cuarray<T>),cudaMemcpyDeviceToHost);
    const int devlen = shadow.length;

    // shadow.data is a device address: detach it before the destructor runs
    shadow.data = NULL;
    shadow.length = -1;
    return devlen;
}
// Reads the data member (the element buffer address) of a device-side
// cuarray<T>. Returns NULL when dptr is NULL. The result of the copy itself is
// not checked.
template<typename T> __host__ T* cuarray<T>::device_data_ptr(cuarray<T> *dptr)
{
    if(dptr==NULL)
    {
        return NULL;
    }

    cuarray<T> shadow;
    cudaMemcpy(&shadow,dptr,sizeof(cuarray<T>),cudaMemcpyDeviceToHost);
    T* const devdata = shadow.data;

    // detach the device address before the destructor runs on shadow
    shadow.data = NULL;
    shadow.length = -1;
    return devdata;
}
// Number of elements currently held.
template<typename T> __device__ __host__ int cuarray<T>::size() const
{
return this->length;
}
// Mutable access to element I. The index is not range-checked.
template<typename T> __device__ __host__ T& cuarray<T>::at(const int I)
{
return this->data[I];
}
// Read-only access to element I. The index is not range-checked.
template<typename T> __device__ __host__ const T& cuarray<T>::at(const int I) const
{
return this->data[I];
}
// Mutable subscript access to element I. The index is not range-checked.
template<typename T> __device__ __host__ T& cuarray<T>::operator[](const int I)
{
return this->data[I];
}
// Read-only subscript access to element I. The index is not range-checked.
template<typename T> __device__ __host__ const T& cuarray<T>::operator[](const int I) const
{
return this->data[I];
}
};
#ifndef __CUARRAY_IMPL_HPP__
#define __CUARRAY_IMPL_HPP__
namespace amscuda
{
// New Version cuarray<T>
// simpler, less crap going on
template<typename T> __device__ __host__ cuarray<T>::cuarray()
{
length = 0;
data = NULL;
}
template<typename T> __device__ __host__ cuarray<T>::~cuarray()
{
if(data!=NULL)
{
delete[] data; data = NULL;
}
length = 0;
}
// Changes the number of elements to _length.
// Existing elements are kept up to the smaller of the old and new length; new
// elements are set to a value-initialised T. A request of <=0 elements frees
// the storage and leaves the array empty.
// Returns 1 on success, -1 if the new storage could not be allocated (the
// array is then left unchanged).
template<typename T> __device__ __host__ int cuarray<T>::resize(const int _length)
{
    int I;
    int ncopy = 0;
    T *newbuffer = NULL;

    if(length==_length)
    {
        //do nothing
        return 1;
    }

    if(_length<=0)
    {
        if(data!=NULL)
        {
            delete[] data;
            data = NULL;
        }
        length = 0;
        // this return was missing: execution used to fall through to
        // new T[_length] with a zero or negative element count
        return 1;
    }

    newbuffer = new T[_length];
    if(newbuffer==NULL)
    {
        return -1; //failed to allocate memory
    }

    // value-initialised filler (the old "T def;" was indeterminate for
    // built-in element types)
    T def = T();

    if(data!=NULL)
    {
        ncopy = (length<_length) ? length : _length;
        if(ncopy<0) ncopy = 0;
        for(I=0;I<ncopy;I++)
        {
            newbuffer[I] = data[I];
        }
        delete[] data; data=NULL;
    }
    for(I=ncopy;I<_length;I++)
    {
        newbuffer[I] = def;
    }

    data = newbuffer;
    length = _length;
    return 1;
}
// Copies this host array to the device.
//   dptr : address of a device pointer to a cuarray<T>. If *dptr is NULL, or
//          the device array has a different length, the device copy is
//          (re)allocated and *dptr updated; otherwise only the element data
//          is overwritten in place.
// Returns 1 on success, a negative value on failure (-5 for a NULL dptr,
// otherwise the code of the helper that performed the transfer).
template<typename T> __host__ int cuarray<T>::device_send(cuarray<T> **dptr)
{
    int ret = 0;
    int dlength;

    if(dptr==NULL)
    {
        return -5;
    }

    if(*dptr==NULL)
    {
        ret = _device_send_overwrite(dptr);
    }
    else
    {
        dlength = device_length(*dptr);
        // was "dlength=length": an assignment, so the in-place copy ran for
        // every non-empty host array regardless of the device-side size
        if(dlength==length)
        {
            ret = _device_send_copy(*dptr);
        }
        else
        {
            ret = _device_send_overwrite(dptr);
        }
    }

    return ret;
}
// Frees whatever *dptr refers to, then allocates a new device-side cuarray<T>
// struct plus element buffer and copies this array into it.
// Returns 1 on success; -1 struct allocation failed, -2 element buffer
// allocation failed, -3 a host-to-device copy failed, -4 failure while
// creating an empty device array. On any failure nothing stays allocated and
// *dptr is NULL.
template<typename T> __host__ int cuarray<T>::_device_send_overwrite(cuarray<T> **dptr)
{
    int ret = 0;
    cuarray<T> dlocal; // host staging copy of the device struct; its data member holds a device address
    cudaError_t err = cudaSuccess;

    device_free(dptr);

    if(length>=0 && data!=NULL)
    {
        err = cudaMalloc(dptr,sizeof(cuarray<T>));
        if(err!=cudaSuccess)
        {
            *dptr = NULL;
            ret = -1;
        }
        else
        {
            err = cudaMalloc(&(dlocal.data),sizeof(T)*length);
            dlocal.length = length;
            if(err!=cudaSuccess)
            {
                dlocal.data = NULL;
                ret = -2;
            }
            else
            {
                err = cudaMemcpy(*dptr,&dlocal,sizeof(cuarray<T>),cudaMemcpyHostToDevice);
                if(err==cudaSuccess)
                {
                    err = cudaMemcpy(dlocal.data,data,sizeof(T)*length,cudaMemcpyHostToDevice);
                }
                ret = (err==cudaSuccess) ? 1 : -3;
            }

            if(ret!=1)
            {
                // do not hand back a half-initialised device struct
                cudaFree(dlocal.data);
                cudaFree(*dptr);
                *dptr = NULL;
            }
        }
    }
    else
    {
        // nothing to send: create an empty device array
        dlocal.data = NULL;
        dlocal.length = 0;
        err = cudaMalloc(dptr,sizeof(cuarray<T>));
        if(err==cudaSuccess)
        {
            err = cudaMemcpy(*dptr,&dlocal,sizeof(cuarray<T>),cudaMemcpyHostToDevice);
            if(err==cudaSuccess)
            {
                ret = 1;
            }
            else
            {
                cudaFree(*dptr);
                *dptr = NULL;
                ret = -4;
            }
        }
        else
        {
            *dptr = NULL;
            ret = -4;
        }
    }

    // detach the device address before ~cuarray() runs on dlocal
    dlocal.data = NULL;
    dlocal.length = -1;

    return ret;
}
template<typename T> __host__ int cuarray<T>::_device_send_copy(cuarray<T> *dptr)
{
int ret = 0;
cudaError_t err = cudaSuccess;
T* ddata = NULL;
ddata = device_data_ptr(dptr);
err = cudaMemcpy(ddata,data,sizeof(T)*length,cudaMemcpyHostToDevice);
if(err==cudaSuccess)
{
ret = 1;
}
else
{
ret = -1;
}
return ret;
}
// Copies a device array into this host array, resizing this array to the
// device array's length first.
//   dptr : device pointer to a cuarray<T>
// Returns 1 when elements were copied, 0 when there was nothing to copy,
// -1 for a NULL dptr, -2 if the device-to-host copy failed, -3 if this array
// could not be resized to the device length.
template<typename T> __host__ int cuarray<T>::device_pull(cuarray<T> *dptr)
{
    int ret = 0;
    int dlength;
    T* ddata;
    cudaError_t err;

    if(dptr==NULL)
    {
        ret = -1; // null d pointer
        return ret;
    }

    dlength = device_length(dptr);
    if(dlength!=length)
    {
        this->resize(dlength);
    }
    if(dlength>0 && length!=dlength)
    {
        // resize failed: copying "length" elements could over-read the device buffer
        ret = -3;
        return ret;
    }

    ddata = device_data_ptr(dptr);
    if(length>0 && data!=NULL && ddata!=NULL)
    {
        // the source is the element buffer (ddata); it used to be dptr, the
        // address of the device-side struct itself
        err = cudaMemcpy(data,ddata,length*sizeof(T),cudaMemcpyDeviceToHost);
        if(err==cudaSuccess)
        {
            ret = 1;
        }
        else
        {
            ret = -2;
        }
    }

    return ret;
}
// Frees a device array: its element buffer and the device-side struct, then
// sets *dptr to NULL.
// Returns 1 if something was freed, 0 if dptr or *dptr was NULL.
template<typename T> __host__ int cuarray<T>::device_free(cuarray<T> **dptr)
{
    int ret = 0;
    cuarray<T> dlocal;
    cudaError_t err = cudaSuccess;

    if(dptr!=NULL && *dptr!=NULL)
    {
        // read the struct from the device address *dptr; the source used to be
        // dptr (a host cuarray**), so the element buffer was never found
        err = cudaMemcpy(&dlocal,*dptr,sizeof(cuarray<T>),cudaMemcpyDeviceToHost);
        if(err==cudaSuccess && dlocal.data!=NULL)
        {
            cudaFree(dlocal.data);
        }
        dlocal.data = NULL;

        cudaFree(*dptr);
        *dptr = NULL;
        ret = 1;
    }

    // detach before ~cuarray() runs on dlocal
    dlocal.data = NULL;
    dlocal.length = -1;

    return ret;
}
template<typename T> __host__ int cuarray<T>::device_length(cuarray<T> *dptr)
{
int ret = -1;
cuarray<T> dlocal;
if(dptr==NULL)
{
return ret;
}
cudaMemcpy(&dlocal,dptr,sizeof(cuarray<T>),cudaMemcpyDeviceToHost);
ret = dlocal.length;
dlocal.data = NULL;
dlocal.length = -1;
return ret;
}
template<typename T> __host__ T* cuarray<T>::device_data_ptr(cuarray<T> *dptr)
{
T* ret = NULL;
cuarray<T> dlocal;
if(dptr==NULL)
{
return ret;
}
cudaMemcpy(&dlocal,dptr,sizeof(cuarray<T>),cudaMemcpyDeviceToHost);
ret = dlocal.data;
dlocal.data = NULL;
dlocal.length = -1;
return ret;
}
template<typename T> __device__ __host__ int cuarray<T>::size() const
{
return this->length;
}
template<typename T> __device__ __host__ T& cuarray<T>::at(const int I)
{
return this->data[I];
}
template<typename T> __device__ __host__ const T& cuarray<T>::at(const int I) const
{
return this->data[I];
}
template<typename T> __device__ __host__ T& cuarray<T>::operator[](const int I)
{
return this->data[I];
}
template<typename T> __device__ __host__ const T& cuarray<T>::operator[](const int I) const
{
return this->data[I];
}
};
#endif

View File

@ -1,19 +1,19 @@
#ifndef __AMSCUDA_BINARRRW_HPP__
#define __AMSCUDA_BINARRRW_HPP__
// Binary array file I/O. Record layout as written by fwrite_ndarray:
// an int32 number of dimensions, that many int32 extents, then the elements
// as raw bytes (sizeof(T) each).
namespace amscuda
{
// Reads one record into shape (the extents) and buffer (the elements), resizing both.
template<typename T> int fread_ndarray(FILE *fp, cuarray<int32_t> *shape, cuarray<T> *buffer);
// Writes one record; the product of the extents in shape must equal buffer->size().
template<typename T> int fwrite_ndarray(FILE *fp, const cuarray<int32_t> *shape, const cuarray<T> *buffer);
// Writes a raw buffer of N elements as a one-dimensional record (1, N, elements).
template<typename T> int fwrite_buffer(FILE *fp, const int N, const T *buffer);
// Reads a record into a caller-supplied buffer of capacity Nmax.
// NOTE(review): buffer is the destination of the read but is declared const T* - confirm intent.
template<typename T> int fread_buffer(FILE *fp, const int Nmax, const T *buffer);
}; //end namespace amscuda
#include <amsculib2/amscuda_binarrrw_impl.hpp>
#endif
#ifndef __AMSCUDA_BINARRRW_HPP__
#define __AMSCUDA_BINARRRW_HPP__
namespace amscuda
{
template<typename T> int fread_ndarray(FILE *fp, cuarray<int32_t> *shape, cuarray<T> *buffer);
template<typename T> int fwrite_ndarray(FILE *fp, const cuarray<int32_t> *shape, const cuarray<T> *buffer);
template<typename T> int fwrite_buffer(FILE *fp, const int N, const T *buffer);
template<typename T> int fread_buffer(FILE *fp, const int Nmax, const T *buffer);
}; //end namespace amscuda
#include <amsculib2/amscuda_binarrrw_impl.hpp>
#endif

View File

@ -1,194 +1,194 @@
#ifndef __AMSCUDA_BINARRRW_IMPL_HPP__
#define __AMSCUDA_BINARRRW_IMPL_HPP__
namespace amscuda
{
// Reads one n-dimensional array record from fp: an int32 dimension count, that
// many int32 extents, then the elements as raw bytes.
//   shape  : resized to the number of dimensions and filled with the extents
//   buffer : resized to the product of the extents (0 if any extent is <=0)
//            and filled with the elements
// Returns 1 on success (also when the record declares <=0 dimensions, in which
// case shape and buffer are emptied), 0 on a NULL argument, end of file, a
// truncated record, an element count that does not fit an int, or an
// allocation failure.
template<typename T> int fread_ndarray(FILE *fp, cuarray<int32_t> *shape, cuarray<T> *buffer)
{
    int ret = 1;
    int I;
    long piprod;
    int32_t q;
    size_t cnt;
    int32_t Nd = 0;

    if(fp==NULL)
    {
        // this message used to be printed in the end-of-file branch instead
        printf("fread_ndarray: fp=NULL.\n");
        return 0;
    }
    if(shape==NULL || buffer==NULL)
    {
        return 0;
    }
    if(feof(fp))
    {
        return 0;
    }

    cnt = fread(&Nd,sizeof(int32_t),1,fp);
    if(cnt<1)
    {
        // no header available (Nd was previously inspected uninitialised here)
        printf("fread_ndarray: Read a number of dimensions<=0.\n");
        shape->resize(0);
        buffer->resize(0);
        return 0;
    }
    if(Nd<=0)
    {
        printf("fread_ndarray: Read a number of dimensions<=0.\n");
        shape->resize(0);
        buffer->resize(0);
        return ret;
    }

    shape->resize(Nd);
    if(shape->size()!=Nd)
    {
        buffer->resize(0);
        return 0;
    }

    piprod = 1;
    for(I=0;I<Nd;I++)
    {
        q = 0;
        cnt = fread(&q,sizeof(int32_t),1,fp);
        if(cnt<1)
        {
            // truncated header: treat the missing extent as 0
            ret = 0;
        }
        shape->at(I) = q;
        if(q>0)
        {
            piprod = piprod*q;
        }
        else
        {
            piprod = 0;
        }
        if(piprod>2147483647L)
        {
            // cuarray lengths are int; refuse records that do not fit
            ret = 0;
            piprod = 0;
        }
    }
    if(ret==0)
    {
        printf("fread_ndarray: could not read a valid shape of %d dimensions\n",(int)Nd);
        buffer->resize(0);
        return ret;
    }

    buffer->resize((int)piprod);
    if(piprod>0)
    {
        if(buffer->size()!=(int)piprod)
        {
            return 0;
        }
        cnt = fread((buffer->data),sizeof(T),(size_t)piprod,fp);
        if((long)cnt==piprod)
        {
            ret = 1;
        }
        else
        {
            printf("fread_ndarray, read %d values, expecting %ld\n",(int)cnt,piprod);
            ret = 0;
        }
    }

    return ret;
}
// Writes one n-dimensional array record to fp: the int32 dimension count, the
// int32 extents, then the elements as raw bytes.
// The product of the extents (0 if any extent is <=0) has to equal
// buffer->size(); otherwise nothing is written.
// Returns 1 when the record was emitted, 0 for a NULL fp or a size mismatch.
// The results of the individual fwrite calls are not checked.
template<typename T> int fwrite_ndarray(FILE *fp, const cuarray<int32_t> *shape, const cuarray<T> *buffer)
{
    if(fp==NULL)
    {
        printf("fwrite_ndarray: fp=NULL\n");
        return 0;
    }

    // number of elements implied by the shape
    long nelem = 1;
    for(int d=0; d<shape->size(); d++)
    {
        nelem = (shape->at(d)>0) ? nelem*shape->at(d) : 0;
    }

    const int32_t ndims = (int32_t) shape->size();
    if(nelem!=buffer->size())
    {
        printf("fwrite_ndarray: buffer is size %ld, while shape is size %ld\n",(long)buffer->size(),(long)nelem);
        return 0;
    }

    fwrite(&ndims,sizeof(int32_t),1,fp);
    if(ndims>0)
    {
        fwrite(shape->data,sizeof(int32_t),ndims,fp);
        if(nelem>0)
        {
            fwrite(buffer->data,sizeof(T),buffer->size(),fp);
        }
    }
    return 1;
}
// Writes a raw buffer as a one-dimensional array record: int32 1 (number of
// dimensions), int32 N (the extent), then N elements as raw bytes.
//   N      : number of elements; must be >=0
//   buffer : source elements; may be NULL only when N is 0
// Returns 1 when the whole record was written, 0 otherwise (NULL fp, invalid
// arguments, or a short write). Previously 0 was returned unconditionally.
template<typename T> int fwrite_buffer(FILE *fp, const int N, const T *buffer)
{
    const int32_t Nd = 1;
    const int32_t len = (int32_t) N;

    if(fp==NULL)
    {
        printf("fwrite_buffer: fp=NULL\n");
        return 0;
    }
    if(N<0 || (N>0 && buffer==NULL))
    {
        printf("fwrite_buffer: invalid buffer\n");
        return 0;
    }

    if(fwrite(&Nd,sizeof(int32_t),1,fp)!=1) return 0;
    if(fwrite(&len,sizeof(int32_t),1,fp)!=1) return 0;
    if(N>0 && fwrite(buffer,sizeof(T),(size_t)N,fp)!=(size_t)N) return 0;

    return 1;
}
// Reads one array record (int32 dimension count, int32 extents, raw elements)
// into a caller-supplied flat buffer.
//   Nmax   : capacity of buffer in elements; at most Nmax elements are read,
//            any further elements of the record are left unread in the stream
//   buffer : destination. It is declared const T* in the public interface but
//            is written to, so it must refer to writable storage.
// Returns 1 when min(Nmax, record size) elements were read, 0 on a missing or
// truncated record or a NULL buffer, -1 for a NULL fp, -2 at end of file.
template<typename T> int fread_buffer(FILE *fp, const int Nmax, const T *buffer)
{
    int ret = 0;
    size_t cnt;
    int32_t Nd = 0;
    int32_t q;
    long long piprod;
    int I;
    int Nr;

    if(fp==NULL) {ret = -1; return ret;}
    if(feof(fp)) {ret = -2; return ret;}

    cnt = fread(&Nd,sizeof(int32_t),1,fp);
    if(cnt<1 || Nd<=0)
    {
        return ret;
    }

    // element count of the record = product of the extents
    piprod = 1;
    for(I=0;I<Nd;I++)
    {
        q = 0;
        cnt = fread(&q,sizeof(int32_t),1,fp);
        if(cnt<1)
        {
            printf("fread_buffer: truncated header\n");
            return ret;
        }
        piprod = (q>0) ? piprod*q : 0;
        // keep the running product representable; anything this large
        // exceeds every possible Nmax anyway
        if(piprod>2147483647LL) piprod = 2147483647LL;
    }

    Nr = (piprod<(long long)Nmax) ? (int)piprod : Nmax;
    if(Nr<=0)
    {
        // nothing to transfer
        ret = 1;
        return ret;
    }
    if(buffer==NULL)
    {
        return ret;
    }

    // fread needs a writable destination; a const T* cannot be passed as is
    cnt = fread(const_cast<T*>(buffer),sizeof(T),(size_t)Nr,fp);
    if(cnt==(size_t)Nr)
    {
        ret = 1;
    }
    else
    {
        printf("fread_buffer, read %d values, expecting %d\n",(int)cnt,Nr);
    }

    return ret;
}
}; //end namespace amscuda
#endif
#ifndef __AMSCUDA_BINARRRW_IMPL_HPP__
#define __AMSCUDA_BINARRRW_IMPL_HPP__
namespace amscuda
{
// Reads one n-dimensional array record from fp: an int32 dimension count, that
// many int32 extents, then the elements as raw bytes.
//   shape  : resized to the number of dimensions and filled with the extents
//   buffer : resized to the product of the extents (0 if any extent is <=0)
//            and filled with the elements
// Returns 1 on success (also when the record declares <=0 dimensions, in which
// case shape and buffer are emptied), 0 on a NULL argument, end of file, a
// truncated record, an element count that does not fit an int, or an
// allocation failure.
template<typename T> int fread_ndarray(FILE *fp, cuarray<int32_t> *shape, cuarray<T> *buffer)
{
    int ret = 1;
    int I;
    long piprod;
    int32_t q;
    size_t cnt;
    int32_t Nd = 0;

    if(fp==NULL)
    {
        // this message used to be printed in the end-of-file branch instead
        printf("fread_ndarray: fp=NULL.\n");
        return 0;
    }
    if(shape==NULL || buffer==NULL)
    {
        return 0;
    }
    if(feof(fp))
    {
        return 0;
    }

    cnt = fread(&Nd,sizeof(int32_t),1,fp);
    if(cnt<1)
    {
        // no header available (Nd was previously inspected uninitialised here)
        printf("fread_ndarray: Read a number of dimensions<=0.\n");
        shape->resize(0);
        buffer->resize(0);
        return 0;
    }
    if(Nd<=0)
    {
        printf("fread_ndarray: Read a number of dimensions<=0.\n");
        shape->resize(0);
        buffer->resize(0);
        return ret;
    }

    shape->resize(Nd);
    if(shape->size()!=Nd)
    {
        buffer->resize(0);
        return 0;
    }

    piprod = 1;
    for(I=0;I<Nd;I++)
    {
        q = 0;
        cnt = fread(&q,sizeof(int32_t),1,fp);
        if(cnt<1)
        {
            // truncated header: treat the missing extent as 0
            ret = 0;
        }
        shape->at(I) = q;
        if(q>0)
        {
            piprod = piprod*q;
        }
        else
        {
            piprod = 0;
        }
        if(piprod>2147483647L)
        {
            // cuarray lengths are int; refuse records that do not fit
            ret = 0;
            piprod = 0;
        }
    }
    if(ret==0)
    {
        printf("fread_ndarray: could not read a valid shape of %d dimensions\n",(int)Nd);
        buffer->resize(0);
        return ret;
    }

    buffer->resize((int)piprod);
    if(piprod>0)
    {
        if(buffer->size()!=(int)piprod)
        {
            return 0;
        }
        cnt = fread((buffer->data),sizeof(T),(size_t)piprod,fp);
        if((long)cnt==piprod)
        {
            ret = 1;
        }
        else
        {
            printf("fread_ndarray, read %d values, expecting %ld\n",(int)cnt,piprod);
            ret = 0;
        }
    }

    return ret;
}
template<typename T> int fwrite_ndarray(FILE *fp, const cuarray<int32_t> *shape, const cuarray<T> *buffer)
{
int ret = 1;
long piprod;
int I;
int32_t Nd;
if(fp==NULL)
{
ret = 0;
printf("fwrite_ndarray: fp=NULL\n");
return ret;
}
piprod = 1;
for(I=0;I<shape->size();I++)
{
if(shape->at(I)>0)
{
piprod = piprod*shape->at(I);
}
else
{
piprod = 0;
}
}
Nd = (int32_t) shape->size();
if(piprod!=buffer->size())
{
ret = 0;
printf("fwrite_ndarray: buffer is size %ld, while shape is size %ld\n",(long)buffer->size(),(long)piprod);
return ret;
}
fwrite(&Nd,sizeof(int32_t),1,fp);
if(Nd>0)
{
fwrite(shape->data,sizeof(int32_t),Nd,fp);
if(piprod>0)
{
fwrite(buffer->data,sizeof(T),buffer->size(),fp);
}
}
return ret;
}
// Writes a raw buffer as a one-dimensional array record: int32 1 (number of
// dimensions), int32 N (the extent), then N elements as raw bytes.
//   N      : number of elements; must be >=0
//   buffer : source elements; may be NULL only when N is 0
// Returns 1 when the whole record was written, 0 otherwise (NULL fp, invalid
// arguments, or a short write). Previously 0 was returned unconditionally.
template<typename T> int fwrite_buffer(FILE *fp, const int N, const T *buffer)
{
    const int32_t Nd = 1;
    const int32_t len = (int32_t) N;

    if(fp==NULL)
    {
        printf("fwrite_buffer: fp=NULL\n");
        return 0;
    }
    if(N<0 || (N>0 && buffer==NULL))
    {
        printf("fwrite_buffer: invalid buffer\n");
        return 0;
    }

    if(fwrite(&Nd,sizeof(int32_t),1,fp)!=1) return 0;
    if(fwrite(&len,sizeof(int32_t),1,fp)!=1) return 0;
    if(N>0 && fwrite(buffer,sizeof(T),(size_t)N,fp)!=(size_t)N) return 0;

    return 1;
}
// Reads one array record (int32 dimension count, int32 extents, raw elements)
// into a caller-supplied flat buffer.
//   Nmax   : capacity of buffer in elements; at most Nmax elements are read,
//            any further elements of the record are left unread in the stream
//   buffer : destination. It is declared const T* in the public interface but
//            is written to, so it must refer to writable storage.
// Returns 1 when min(Nmax, record size) elements were read, 0 on a missing or
// truncated record or a NULL buffer, -1 for a NULL fp, -2 at end of file.
template<typename T> int fread_buffer(FILE *fp, const int Nmax, const T *buffer)
{
    int ret = 0;
    size_t cnt;
    int32_t Nd = 0;
    int32_t q;
    long long piprod;
    int I;
    int Nr;

    if(fp==NULL) {ret = -1; return ret;}
    if(feof(fp)) {ret = -2; return ret;}

    cnt = fread(&Nd,sizeof(int32_t),1,fp);
    if(cnt<1 || Nd<=0)
    {
        return ret;
    }

    // element count of the record = product of the extents
    piprod = 1;
    for(I=0;I<Nd;I++)
    {
        q = 0;
        cnt = fread(&q,sizeof(int32_t),1,fp);
        if(cnt<1)
        {
            printf("fread_buffer: truncated header\n");
            return ret;
        }
        piprod = (q>0) ? piprod*q : 0;
        // keep the running product representable; anything this large
        // exceeds every possible Nmax anyway
        if(piprod>2147483647LL) piprod = 2147483647LL;
    }

    Nr = (piprod<(long long)Nmax) ? (int)piprod : Nmax;
    if(Nr<=0)
    {
        // nothing to transfer
        ret = 1;
        return ret;
    }
    if(buffer==NULL)
    {
        return ret;
    }

    // fread needs a writable destination; a const T* cannot be passed as is
    cnt = fread(const_cast<T*>(buffer),sizeof(T),(size_t)Nr,fp);
    if(cnt==(size_t)Nr)
    {
        ret = 1;
    }
    else
    {
        printf("fread_buffer, read %d values, expecting %d\n",(int)cnt,Nr);
    }

    return ret;
}
}; //end namespace amscuda
#endif

View File

@ -1,11 +1,11 @@
#ifndef __AMSCUGEOM_HPP__
#define __AMSCUGEOM_HPP__
//This header currently adds no declarations to the amscuda namespace.
namespace amscuda
{
}; //end namespace amscuda
#endif
#ifndef __AMSCUGEOM_HPP__
#define __AMSCUGEOM_HPP__
//This header currently adds no declarations to the amscuda namespace.
namespace amscuda
{
}; //end namespace amscuda
#endif

View File

@ -1,70 +1,70 @@
#ifndef __AMSCULIB2_HPP__
#define __AMSCULIB2_HPP__
//Std Lib Includes
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <stdint.h>
#include <time.h>
#include <new>
#include <cuda_runtime_api.h> //where all the cuda functions live
#include <cuda_runtime.h>
#include <cuda.h>
//Dependencies
//Predeclarations
//NOTE(review): these forward declarations are at global scope (namespace
//amscuda is only opened further down), so they declare ::cuvect2, ::cuvect3,
//and so on. The component headers appear to define classes with these names
//inside namespace amscuda, which would be different types - confirm whether
//these declarations were meant to live inside the namespace.
class cuvect2;
class cuvect3;
class cuvect4;
class cuvect2f;
class cuvect3f;
class cuvect4f;
//Need a way to define the same symbols using both host and device code
//A solution was found here: https://stackoverflow.com/questions/9457572/cuda-host-and-device-using-same-constant-memory
#ifdef __CUDA_ARCH__
#define AMSCU_CONST __constant__
#else
#define AMSCU_CONST
#endif
//Library-wide default execution sizes.
//AMSCU_CONST (defined just above) expands to __constant__ when __CUDA_ARCH__
//is defined and to nothing otherwise, so the same names are usable from both
//host and device code.
namespace amscuda
{
//default thread and block execution
AMSCU_CONST static const int amscu_defnblocks = 256;
AMSCU_CONST static const int amscu_defnthreads = 512;
//default numthreads to execute on cpu
AMSCU_CONST static const int amscu_defcputhreads = 8;
}; //end namespace amscuda
//Components
#include <amsculib2/amscu_cudafunctions.hpp>
#include <amsculib2/amscumath.hpp>
#include <amsculib2/amscu_comp64.hpp>
#include <amsculib2/amscu_comp128.hpp>
#include <amsculib2/cuvect2.hpp>
#include <amsculib2/cuvect3.hpp>
#include <amsculib2/cuvect4.hpp>
#include <amsculib2/cuvect2f.hpp>
#include <amsculib2/cuvect3f.hpp>
#include <amsculib2/cuvect4f.hpp>
#include <amsculib2/amscugeom.hpp>
#include <amsculib2/amscuarray.hpp>
#include <amsculib2/amscuda_binarrrw.hpp>
#include <amsculib2/amscu_random.hpp>
#include <amsculib2/amscuarray_dops.hpp>
#include <amsculib2/amscurarray.cuh>
#endif
#ifndef __AMSCULIB2_HPP__
#define __AMSCULIB2_HPP__
//Std Lib Includes
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <stdint.h>
#include <time.h>
#include <new>
#include <cuda_runtime_api.h> //where all the cuda functions live
#include <cuda_runtime.h>
#include <cuda.h>
//Dependencies
//Predeclarations
//NOTE(review): these forward declarations are at global scope (namespace
//amscuda is only opened further down), so they declare ::cuvect2, ::cuvect3,
//and so on. The component headers appear to define classes with these names
//inside namespace amscuda, which would be different types - confirm whether
//these declarations were meant to live inside the namespace.
class cuvect2;
class cuvect3;
class cuvect4;
class cuvect2f;
class cuvect3f;
class cuvect4f;
//Need a way to define the same symbols using both host and device code
//A solution was found here: https://stackoverflow.com/questions/9457572/cuda-host-and-device-using-same-constant-memory
#ifdef __CUDA_ARCH__
#define AMSCU_CONST __constant__
#else
#define AMSCU_CONST
#endif
//Library-wide default execution sizes.
//AMSCU_CONST (defined just above) expands to __constant__ when __CUDA_ARCH__
//is defined and to nothing otherwise, so the same names are usable from both
//host and device code.
namespace amscuda
{
//default thread and block execution
AMSCU_CONST static const int amscu_defnblocks = 256;
AMSCU_CONST static const int amscu_defnthreads = 512;
//default numthreads to execute on cpu
AMSCU_CONST static const int amscu_defcputhreads = 8;
}; //end namespace amscuda
//Components
#include <amsculib2/amscu_cudafunctions.hpp>
#include <amsculib2/amscumath.hpp>
#include <amsculib2/amscu_comp64.hpp>
#include <amsculib2/amscu_comp128.hpp>
#include <amsculib2/cuvect2.hpp>
#include <amsculib2/cuvect3.hpp>
#include <amsculib2/cuvect4.hpp>
#include <amsculib2/cuvect2f.hpp>
#include <amsculib2/cuvect3f.hpp>
#include <amsculib2/cuvect4f.hpp>
#include <amsculib2/amscugeom.hpp>
#include <amsculib2/amscuarray.hpp>
#include <amsculib2/amscuda_binarrrw.hpp>
#include <amsculib2/amscu_random.hpp>
#include <amsculib2/amscuarray_dops.hpp>
#include <amsculib2/amscurarray.cuh>
#endif

View File

@ -1,56 +1,56 @@
#ifndef __AMSCUMATH_HPP__
#define __AMSCUMATH_HPP__
//Scalar math constants and helper declarations for amsculib2.
//Everything marked __host__ __device__ is callable from host and device code.
namespace amscuda
{
//Problem: These are not in the namespace
//#define nan NAN
//#define fnan (float) NAN
//#define inf INFINITY
//#define finf (float) INFINITY
//#define pi 3.1415926535897936
//These need to be the same symbol for both host and device code
AMSCU_CONST static const double nan = NAN;
AMSCU_CONST static const float fnan = (float) NAN;
AMSCU_CONST static const double inf = INFINITY;
AMSCU_CONST static const float finf = (float) INFINITY;
//pi to more digits than a double holds, so the literal rounds to the double
//nearest to pi. The previous literal 3.1415926535897936 rounded to the next
//representable double above it (one ulp too large).
AMSCU_CONST static const double pi = 3.14159265358979323846;
//single precision pi; the f suffix avoids a double-to-float conversion
AMSCU_CONST static const float pif = 3.14159265358979323846f;
//absolute value for double and float (defined elsewhere)
__host__ __device__ double dabs(double x);
__host__ __device__ float fabs(float x);
//generic absolute value: returns -in when in<0, otherwise in
template<typename T> __host__ __device__ T abs(const T in)
{
T ret = in;
if(in<0) ret = -in;
return ret;
}
//modulus overloads (defined elsewhere)
//NOTE(review): the sign convention for negative arguments is not visible
//here - confirm in the implementation
__host__ __device__ double mod(double a, double md);
__host__ __device__ float mod(float a, float md);
__host__ __device__ int mod(int x, int n);
__host__ __device__ long mod(long x, long n);
//integer division helpers (defined elsewhere)
//NOTE(review): presumably floor-style division for negative operands - confirm
__host__ __device__ int truediv(int x, int y);
__host__ __device__ long truediv(long x, long y);
//generic min/max; the definitions are in amscumath_impl.hpp
template<typename T> __host__ __device__ T min(T a, T b);
template<typename T> __host__ __device__ T max(T a, T b);
//NOTE(review): presumably the polar angle of the point (x,y) - confirm
__device__ __host__ double arg(double x, double y);
//writes two results through az and el for the vector (x,y,z)
//NOTE(review): presumably azimuth and elevation angles - confirm units
__device__ __host__ void get_azel(double x, double y, double z, double *az, double *el);
//host-side test routine
void test_amscumath1();
}; //end namespace amscuda
#include <amsculib2/amscumath_impl.hpp>
#endif
#ifndef __AMSCUMATH_HPP__
#define __AMSCUMATH_HPP__
//Scalar math constants and helper declarations for amsculib2.
//Everything marked __host__ __device__ is callable from host and device code.
namespace amscuda
{
//Problem: These are not in the namespace
//#define nan NAN
//#define fnan (float) NAN
//#define inf INFINITY
//#define finf (float) INFINITY
//#define pi 3.1415926535897936
//These need to be the same symbol for both host and device code
AMSCU_CONST static const double nan = NAN;
AMSCU_CONST static const float fnan = (float) NAN;
AMSCU_CONST static const double inf = INFINITY;
AMSCU_CONST static const float finf = (float) INFINITY;
//pi to more digits than a double holds, so the literal rounds to the double
//nearest to pi. The previous literal 3.1415926535897936 rounded to the next
//representable double above it (one ulp too large).
AMSCU_CONST static const double pi = 3.14159265358979323846;
//single precision pi; the f suffix avoids a double-to-float conversion
AMSCU_CONST static const float pif = 3.14159265358979323846f;
//absolute value for double and float (defined elsewhere)
__host__ __device__ double dabs(double x);
__host__ __device__ float fabs(float x);
//generic absolute value: returns -in when in<0, otherwise in
template<typename T> __host__ __device__ T abs(const T in)
{
T ret = in;
if(in<0) ret = -in;
return ret;
}
//modulus overloads (defined elsewhere)
//NOTE(review): the sign convention for negative arguments is not visible
//here - confirm in the implementation
__host__ __device__ double mod(double a, double md);
__host__ __device__ float mod(float a, float md);
__host__ __device__ int mod(int x, int n);
__host__ __device__ long mod(long x, long n);
//integer division helpers (defined elsewhere)
//NOTE(review): presumably floor-style division for negative operands - confirm
__host__ __device__ int truediv(int x, int y);
__host__ __device__ long truediv(long x, long y);
//generic min/max; the definitions are in amscumath_impl.hpp
template<typename T> __host__ __device__ T min(T a, T b);
template<typename T> __host__ __device__ T max(T a, T b);
//NOTE(review): presumably the polar angle of the point (x,y) - confirm
__device__ __host__ double arg(double x, double y);
//writes two results through az and el for the vector (x,y,z)
//NOTE(review): presumably azimuth and elevation angles - confirm units
__device__ __host__ void get_azel(double x, double y, double z, double *az, double *el);
//host-side test routine
void test_amscumath1();
}; //end namespace amscuda
#include <amsculib2/amscumath_impl.hpp>
#endif

View File

@ -1,42 +1,42 @@
#ifndef __AMSCUMATH_IMPL_HPP__
#define __AMSCUMATH_IMPL_HPP__
namespace amscuda
{
//Generic minimum: yields b only when a>b, otherwise a
//(so a is returned when the comparison is false, including for equal values)
template<typename T> __host__ __device__ T min(T a, T b)
{
return (a>b) ? b : a;
}
//Generic maximum: yields a only when a>b, otherwise b
template<typename T> __host__ __device__ T max(T a, T b)
{
return (a>b) ? a : b;
}
//explicit specializations for floating point types; defined elsewhere
template<> __host__ __device__ double min(double a, double b);
template<> __host__ __device__ float min(float a, float b);
template<> __host__ __device__ double max(double a, double b);
template<> __host__ __device__ float max(float a, float b);
}; //end namespace amscuda
#endif
#ifndef __AMSCUMATH_IMPL_HPP__
#define __AMSCUMATH_IMPL_HPP__
namespace amscuda
{
//Generic minimum: yields b only when a>b, otherwise a
//(so a is returned when the comparison is false, including for equal values)
template<typename T> __host__ __device__ T min(T a, T b)
{
return (a>b) ? b : a;
}
//Generic maximum: yields a only when a>b, otherwise b
template<typename T> __host__ __device__ T max(T a, T b)
{
return (a>b) ? a : b;
}
//explicit specializations for floating point types; defined elsewhere
template<> __host__ __device__ double min(double a, double b);
template<> __host__ __device__ float min(float a, float b);
template<> __host__ __device__ double max(double a, double b);
template<> __host__ __device__ float max(float a, float b);
}; //end namespace amscuda
#endif

View File

@ -1,66 +1,66 @@
#ifndef __AMSCURARRAY_HPP__
#define __AMSCURARRAY_HPP__
namespace amscuda
{
//Cuda ragged array class
//Holds Narrays separate arrays of element type T, each with its own length,
//with one set of pointers for host storage and one for device storage.
template<typename T> class curarray
{
public:
int device; //NOTE(review): presumably the CUDA device index - confirm
curarray* devptr; //pointer to mirror class on the device
int Narrays; //number of arrays
int *N; //dimension of each array
T** hostarrayptrs; //pointers to each array on the host - null on the device
T** devarrayptrs; //pointers to each array on the device
//the double pointer is a host pointer to device pointers on the host class
//for the device class, only the second set of arrays is in use
//the constructor and destructor set all pointers to NULL, they
// do *not* manage memory. This is done with curarray_new and curarray_delete
__device__ __host__ curarray();
__device__ __host__ ~curarray();
//host-only transfer operations
//NOTE(review): presumably push copies host->device and pull copies
//device->host; the meaning of the int return is not visible here - confirm
__host__ int push();
__host__ int pull();
//__device__ int dev_resizearray(int arraynum, int arraysize);
//host-only: change the length of array number arraynum to arraysize
__host__ int resizearray(int arraynum, int arraysize);
// I may want a way to resize arrays on the device without pushing/pulling all the array contents
};
//Free functions that manage curarray storage (see the note on the
//constructor and destructor above); definitions are in amscurarray_impl.cuh
template<typename T> int curarray_new(curarray<T>** ptr, int Narrays);
template<typename T> int curarray_delete(curarray<T>** ptr);
template<typename T> int curarray_device_new(curarray<T> *hostptr);
template<typename T> int curarray_device_delete(curarray<T> *hostptr);
template<typename T> int curarray_push(curarray<T> *hostptr);
template<typename T> int curarray_pull(curarray<T> *hostptr);
//template<typename T> int curarray_host_fillall(curarray<T> *hostptr, const T &val);
//template<typename T> int curarray_device_fillall(curarray<T> *hostptr, const T &val);
//template<typename T> __host__ int curarray_deletearray(curarray<T> *hostptr, int arrayindex);
//template<typename T> __device__ int curarray_dev_deletearray(curarray<T> *devptr, int arrayindex);
//template<typename T> __host__ int curarray_allocarray(curarray<T> *hostptr, int arrayindex, int size);
//template<typename T> __device__ int curarray_dev_allocarray(curarray<T> *devptr, int arrayindex, int size);
//host-side test routine
void test_amscurarray1();
};
#include <amsculib2/amscurarray_impl.cuh>
#ifndef __AMSCURARRAY_HPP__
#define __AMSCURARRAY_HPP__
namespace amscuda
{
//Cuda ragged array class
//Holds Narrays separate arrays of element type T, each with its own length,
//with one set of pointers for host storage and one for device storage.
template<typename T> class curarray
{
public:
int device; //NOTE(review): presumably the CUDA device index - confirm
curarray* devptr; //pointer to mirror class on the device
int Narrays; //number of arrays
int *N; //dimension of each array
T** hostarrayptrs; //pointers to each array on the host - null on the device
T** devarrayptrs; //pointers to each array on the device
//the double pointer is a host pointer to device pointers on the host class
//for the device class, only the second set of arrays is in use
//the constructor and destructor set all pointers to NULL, they
// do *not* manage memory. This is done with curarray_new and curarray_delete
__device__ __host__ curarray();
__device__ __host__ ~curarray();
//host-only transfer operations
//NOTE(review): presumably push copies host->device and pull copies
//device->host; the meaning of the int return is not visible here - confirm
__host__ int push();
__host__ int pull();
//__device__ int dev_resizearray(int arraynum, int arraysize);
//host-only: change the length of array number arraynum to arraysize
__host__ int resizearray(int arraynum, int arraysize);
// I may want a way to resize arrays on the device without pushing/pulling all the array contents
};
//Free functions that manage curarray storage (see the note on the
//constructor and destructor above); definitions are in amscurarray_impl.cuh
template<typename T> int curarray_new(curarray<T>** ptr, int Narrays);
template<typename T> int curarray_delete(curarray<T>** ptr);
template<typename T> int curarray_device_new(curarray<T> *hostptr);
template<typename T> int curarray_device_delete(curarray<T> *hostptr);
template<typename T> int curarray_push(curarray<T> *hostptr);
template<typename T> int curarray_pull(curarray<T> *hostptr);
//template<typename T> int curarray_host_fillall(curarray<T> *hostptr, const T &val);
//template<typename T> int curarray_device_fillall(curarray<T> *hostptr, const T &val);
//template<typename T> __host__ int curarray_deletearray(curarray<T> *hostptr, int arrayindex);
//template<typename T> __device__ int curarray_dev_deletearray(curarray<T> *devptr, int arrayindex);
//template<typename T> __host__ int curarray_allocarray(curarray<T> *hostptr, int arrayindex, int size);
//template<typename T> __device__ int curarray_dev_allocarray(curarray<T> *devptr, int arrayindex, int size);
//host-side test routine
void test_amscurarray1();
};
#include <amsculib2/amscurarray_impl.cuh>
#endif

File diff suppressed because it is too large Load Diff

View File

@ -1,84 +1,85 @@
#ifndef __CUVECT2_HPP__
#define __CUVECT2_HPP__
//Double precision 2-vector and 2x2 matrix types plus free helper functions.
//All members and helpers marked __host__ __device__ are callable from both
//host and device code.
namespace amscuda
{
//2-component double precision vector
class cuvect2
{
public:
double x;
double y;
__host__ __device__ cuvect2();
__host__ __device__ ~cuvect2();
__host__ __device__ cuvect2(double _x, double _y);
//component access by index
__host__ __device__ double& operator[](const int I);
__host__ __device__ const double& operator[](const int I) const;
//binary operators: *this is the left operand; the parameter (named lhs)
//is the right operand
__host__ __device__ cuvect2 operator+(cuvect2 lhs);
__host__ __device__ cuvect2 operator-(cuvect2 lhs);
__host__ __device__ cuvect2 operator*(double lhs);
__host__ __device__ cuvect2 operator/(double lhs);
};
//2x2 double precision matrix stored in a flat 4-element array
class cumat2
{
public:
double dat[4];
__host__ __device__ cumat2();
__host__ __device__ ~cumat2();
//element access: flat index, or an (I,J) index pair
__host__ __device__ double& operator[](const int I);
__host__ __device__ double& operator()(const int I, const int J);
__host__ __device__ double& at(const int I, const int J);
//binary operators: *this is the left operand; the parameter (named lhs)
//is the right operand
__host__ __device__ cumat2 operator+(cumat2 lhs);
__host__ __device__ cumat2 operator-(cumat2 lhs);
__host__ __device__ cumat2 operator*(double lhs);
__host__ __device__ cumat2 operator/(double lhs);
__host__ __device__ cuvect2 operator*(cuvect2 lhs);
__host__ __device__ cumat2 operator*(cumat2 lhs);
//determinant, transpose and inverse; the last two return a new matrix
__host__ __device__ double det();
__host__ __device__ cumat2 transpose();
__host__ __device__ cumat2 inverse();
};
//free vector helpers taking their arguments by value
//NOTE(review): cuvect2_cross returns a scalar, presumably the z-component of
//the planar cross product - confirm
__host__ __device__ double cuvect2_dot(cuvect2 a, cuvect2 b);
__host__ __device__ double cuvect2_cross(cuvect2 a, cuvect2 b);
__host__ __device__ double cuvect2_norm(cuvect2 a);
__host__ __device__ cuvect2 cuvect2_normalize(cuvect2 a);
__host__ __device__ cuvect2 cuvect2_proj(cuvect2 a, cuvect2 b);
//2x2 matrix operations
//matrix order is assumed to be mat[I,J] = mat[I+3*J]
//NOTE(review): a stride of 3 cannot address a 4-element 2x2 matrix; this
//line was presumably copied from the 3x3 header and should read I+2*J - confirm
//transpose a 2x2 matrix in place
__host__ __device__ void mat2_transpose(double *mat2inout);
//copies src to dest
__host__ __device__ void mat2_copy(double *mat2_dest, const double *mat2_src);
//inverts mat2inout[4] in place
__host__ __device__ void mat2_inverse(double *mat2inout);
//rotation matrix from angle
__host__ __device__ void mat2_rot_from_angle(double angle, double *mat2);
//multiplies c = a*b
__host__ __device__ void mat2_mult(double *mat2a, double *mat2b, double *mat2c);
// ret = a*b
__host__ __device__ cuvect2 mat2_mult(double *mat2a, cuvect2 b);
//host-side test routine
void test_cuvect2_1();
}; //end namespace amscuda
#endif
#ifndef __CUVECT2_HPP__
#define __CUVECT2_HPP__
//Double precision 2-vector and 2x2 matrix types plus free helper functions.
//All members and helpers marked __host__ __device__ are callable from both
//host and device code.
namespace amscuda
{
//2-component double precision vector
class cuvect2
{
public:
double x;
double y;
__host__ __device__ cuvect2();
__host__ __device__ ~cuvect2();
__host__ __device__ cuvect2(double _x, double _y);
//component access by index
__host__ __device__ double& operator[](const int I);
__host__ __device__ const double& operator[](const int I) const;
//binary operators: *this is the left operand; the parameter (named lhs)
//is the right operand
__host__ __device__ cuvect2 operator+(cuvect2 lhs);
__host__ __device__ cuvect2 operator-(cuvect2 lhs);
__host__ __device__ cuvect2 operator*(double lhs);
__host__ __device__ cuvect2 operator/(double lhs);
};
//2x2 double precision matrix stored in a flat 4-element array
class cumat2
{
public:
double dat[4];
__host__ __device__ cumat2();
__host__ __device__ ~cumat2();
//element access: flat index, or an (I,J) index pair
__host__ __device__ double& operator[](const int I);
__host__ __device__ double& operator()(const int I, const int J);
__host__ __device__ double& at(const int I, const int J);
//binary operators: *this is the left operand; the parameter (named lhs)
//is the right operand
__host__ __device__ cumat2 operator+(cumat2 lhs);
__host__ __device__ cumat2 operator-(cumat2 lhs);
__host__ __device__ cumat2 operator*(double lhs);
__host__ __device__ cumat2 operator/(double lhs);
__host__ __device__ cuvect2 operator*(cuvect2 lhs);
__host__ __device__ cumat2 operator*(cumat2 lhs);
//determinant, transpose and inverse; the last two return a new matrix
__host__ __device__ double det();
__host__ __device__ cumat2 transpose();
__host__ __device__ cumat2 inverse();
};
//free vector helpers taking their arguments by value
//NOTE(review): cuvect2_cross returns a scalar, presumably the z-component of
//the planar cross product - confirm
__host__ __device__ double cuvect2_dot(cuvect2 a, cuvect2 b);
__host__ __device__ double cuvect2_cross(cuvect2 a, cuvect2 b);
__host__ __device__ double cuvect2_norm(cuvect2 a);
__host__ __device__ cuvect2 cuvect2_normalize(cuvect2 a);
__host__ __device__ cuvect2 cuvect2_proj(cuvect2 a, cuvect2 b);
//2x2 matrix operations
//matrix order is assumed to be mat[I,J] = mat[I+3*J]
//NOTE(review): a stride of 3 cannot address a 4-element 2x2 matrix; this
//line was presumably copied from the 3x3 header and should read I+2*J - confirm
//transpose a 2x2 matrix in place
__host__ __device__ void mat2_transpose(double *mat2inout);
//copies src to dest
__host__ __device__ void mat2_copy(double *mat2_dest, const double *mat2_src);
//inverts mat2inout[4] in place
__host__ __device__ void mat2_inverse(double *mat2inout);
//rotation matrix from angle
__host__ __device__ void mat2_rot_from_angle(double angle, double *mat2);
//multiplies c = a*b
__host__ __device__ void mat2_mult(double *mat2a, double *mat2b, double *mat2c);
// ret = a*b
__host__ __device__ cuvect2 mat2_mult(double *mat2a, cuvect2 b);
//host-side test routine
void test_cuvect2_1();
}; //end namespace amscuda
#endif

View File

@ -1,84 +1,85 @@
#ifndef __CUVECT2F_HPP__
#define __CUVECT2F_HPP__
//Single precision 2-vector and 2x2 matrix types plus free helper functions.
//All members and helpers marked __host__ __device__ are callable from both
//host and device code.
namespace amscuda
{
//2-component single precision vector
class cuvect2f
{
public:
float x;
float y;
__host__ __device__ cuvect2f();
__host__ __device__ ~cuvect2f();
__host__ __device__ cuvect2f(float _x, float _y);
//component access by index
__host__ __device__ float& operator[](const int I);
__host__ __device__ const float& operator[](const int I) const;
//binary operators: *this is the left operand; the parameter (named lhs)
//is the right operand
__host__ __device__ cuvect2f operator+(cuvect2f lhs);
__host__ __device__ cuvect2f operator-(cuvect2f lhs);
__host__ __device__ cuvect2f operator*(float lhs);
__host__ __device__ cuvect2f operator/(float lhs);
};
//2x2 single precision matrix stored in a flat 4-element array
class cumat2f
{
public:
float dat[4];
__host__ __device__ cumat2f();
__host__ __device__ ~cumat2f();
//element access: flat index, or an (I,J) index pair
__host__ __device__ float& operator[](const int I);
__host__ __device__ float& operator()(const int I, const int J);
__host__ __device__ float& at(const int I, const int J);
//binary operators: *this is the left operand; the parameter (named lhs)
//is the right operand
__host__ __device__ cumat2f operator+(cumat2f lhs);
__host__ __device__ cumat2f operator-(cumat2f lhs);
__host__ __device__ cumat2f operator*(float lhs);
__host__ __device__ cumat2f operator/(float lhs);
__host__ __device__ cuvect2f operator*(cuvect2f lhs);
__host__ __device__ cumat2f operator*(cumat2f lhs);
//determinant, transpose and inverse; the last two return a new matrix
__host__ __device__ float det();
__host__ __device__ cumat2f transpose();
__host__ __device__ cumat2f inverse();
};
//free vector helpers taking their arguments by value
//NOTE(review): cuvect2f_cross returns a scalar, presumably the z-component
//of the planar cross product - confirm
__host__ __device__ float cuvect2f_dot(cuvect2f a, cuvect2f b);
__host__ __device__ float cuvect2f_cross(cuvect2f a, cuvect2f b);
__host__ __device__ float cuvect2f_norm(cuvect2f a);
__host__ __device__ cuvect2f cuvect2f_normalize(cuvect2f a);
__host__ __device__ cuvect2f cuvect2f_proj(cuvect2f a, cuvect2f b);
//2x2 matrix operations
//matrix order is assumed to be mat[I,J] = mat[I+3*J]
//NOTE(review): a stride of 3 cannot address a 4-element 2x2 matrix; this
//line was presumably copied from the 3x3 header and should read I+2*J - confirm
//transpose a 2x2 matrix in place
__host__ __device__ void mat2f_transpose(float *mat2inout);
//copies src to dest
__host__ __device__ void mat2f_copy(float *mat2f_dest, const float *mat2f_src);
//inverts mat2inout[4] in place
__host__ __device__ void mat2f_inverse(float *mat2inout);
//rotation matrix from angle
__host__ __device__ void mat2f_rot_from_angle(float angle, float *mat2);
//multiplies c = a*b
__host__ __device__ void mat2f_mult(float *mat2a, float *mat2b, float *mat2c);
// ret = a*b
__host__ __device__ cuvect2f mat2f_mult(float *mat2a, cuvect2f b);
//host-side test routine
void test_cuvect2f_1();
};
#endif
#ifndef __CUVECT2F_HPP__
#define __CUVECT2F_HPP__
//Single precision 2-vector and 2x2 matrix types plus free helper functions.
//All members and helpers marked __host__ __device__ are callable from both
//host and device code.
namespace amscuda
{
//2-component single precision vector
class cuvect2f
{
public:
float x;
float y;
__host__ __device__ cuvect2f();
__host__ __device__ ~cuvect2f();
__host__ __device__ cuvect2f(float _x, float _y);
//component access by index
__host__ __device__ float& operator[](const int I);
__host__ __device__ const float& operator[](const int I) const;
//binary operators: *this is the left operand; the parameter (named lhs)
//is the right operand
__host__ __device__ cuvect2f operator+(cuvect2f lhs);
__host__ __device__ cuvect2f operator-(cuvect2f lhs);
__host__ __device__ cuvect2f operator*(float lhs);
__host__ __device__ cuvect2f operator/(float lhs);
//unary minus, declared as a non-member friend taking its operand by value
__host__ __device__ friend cuvect2f operator-(cuvect2f rhs);
};
//2x2 single precision matrix stored in a flat 4-element array
class cumat2f
{
public:
float dat[4];
__host__ __device__ cumat2f();
__host__ __device__ ~cumat2f();
//element access: flat index, or an (I,J) index pair
__host__ __device__ float& operator[](const int I);
__host__ __device__ float& operator()(const int I, const int J);
__host__ __device__ float& at(const int I, const int J);
//binary operators: *this is the left operand; the parameter (named lhs)
//is the right operand
__host__ __device__ cumat2f operator+(cumat2f lhs);
__host__ __device__ cumat2f operator-(cumat2f lhs);
__host__ __device__ cumat2f operator*(float lhs);
__host__ __device__ cumat2f operator/(float lhs);
__host__ __device__ cuvect2f operator*(cuvect2f lhs);
__host__ __device__ cumat2f operator*(cumat2f lhs);
//determinant, transpose and inverse; the last two return a new matrix
__host__ __device__ float det();
__host__ __device__ cumat2f transpose();
__host__ __device__ cumat2f inverse();
};
//free vector helpers taking their arguments by value
//NOTE(review): cuvect2f_cross returns a scalar, presumably the z-component
//of the planar cross product - confirm
__host__ __device__ float cuvect2f_dot(cuvect2f a, cuvect2f b);
__host__ __device__ float cuvect2f_cross(cuvect2f a, cuvect2f b);
__host__ __device__ float cuvect2f_norm(cuvect2f a);
__host__ __device__ cuvect2f cuvect2f_normalize(cuvect2f a);
__host__ __device__ cuvect2f cuvect2f_proj(cuvect2f a, cuvect2f b);
//2x2 matrix operations
//matrix order is assumed to be mat[I,J] = mat[I+3*J]
//NOTE(review): a stride of 3 cannot address a 4-element 2x2 matrix; this
//line was presumably copied from the 3x3 header and should read I+2*J - confirm
//transpose a 2x2 matrix in place
__host__ __device__ void mat2f_transpose(float *mat2inout);
//copies src to dest
__host__ __device__ void mat2f_copy(float *mat2f_dest, const float *mat2f_src);
//inverts mat2inout[4] in place
__host__ __device__ void mat2f_inverse(float *mat2inout);
//rotation matrix from angle
__host__ __device__ void mat2f_rot_from_angle(float angle, float *mat2);
//multiplies c = a*b
__host__ __device__ void mat2f_mult(float *mat2a, float *mat2b, float *mat2c);
// ret = a*b
__host__ __device__ cuvect2f mat2f_mult(float *mat2a, cuvect2f b);
//host-side test routine
void test_cuvect2f_1();
};
#endif

View File

@ -1,86 +1,86 @@
#ifndef __CUVECT3_HPP__
#define __CUVECT3_HPP__
//Double precision 3-vector and 3x3 matrix types plus free helper functions.
//All members and helpers marked __host__ __device__ are callable from both
//host and device code.
namespace amscuda
{
//3-component double precision vector
class cuvect3
{
public:
double x;
double y;
double z;
__host__ __device__ cuvect3();
__host__ __device__ ~cuvect3();
__host__ __device__ cuvect3(double _x, double _y, double _z);
//component access by index
__host__ __device__ double& operator[](const int I);
__host__ __device__ const double& operator[](const int I) const;
//binary operators: *this is the left operand; the parameter (named lhs)
//is the right operand
__host__ __device__ cuvect3 operator+(cuvect3 lhs);
__host__ __device__ cuvect3 operator-(cuvect3 lhs);
__host__ __device__ cuvect3 operator*(double lhs);
__host__ __device__ cuvect3 operator/(double lhs);
};
//3x3 double precision matrix stored in a flat 9-element array
class cumat3
{
public:
double dat[9];
__host__ __device__ cumat3();
__host__ __device__ ~cumat3();
//element access: flat index, or an (I,J) index pair
__host__ __device__ double& operator[](const int I);
__host__ __device__ double& operator()(const int I, const int J);
__host__ __device__ double& at(const int I, const int J);
//binary operators: *this is the left operand; the parameter (named lhs)
//is the right operand
__host__ __device__ cumat3 operator+(cumat3 lhs);
__host__ __device__ cumat3 operator-(cumat3 lhs);
__host__ __device__ cumat3 operator*(double lhs);
__host__ __device__ cumat3 operator/(double lhs);
__host__ __device__ cuvect3 operator*(cuvect3 lhs);
__host__ __device__ cumat3 operator*(cumat3 lhs);
//determinant, transpose and inverse; the last two return a new matrix
__host__ __device__ double det();
__host__ __device__ cumat3 transpose();
__host__ __device__ cumat3 inverse();
};
//free vector helpers taking their arguments by value
__host__ __device__ double cuvect3_dot(cuvect3 a, cuvect3 b);
__host__ __device__ cuvect3 cuvect3_cross(cuvect3 a, cuvect3 b);
__host__ __device__ double cuvect3_norm(cuvect3 a);
__host__ __device__ cuvect3 cuvect3_normalize(cuvect3 a);
__host__ __device__ cuvect3 cuvect3_proj(cuvect3 a, cuvect3 b);
//3x3 matrix operations
//matrix order is assumed to be mat[I,J] = mat[I+3*J]
//transposes a 3x3 (9 element) matrix
__host__ __device__ void mat3_transpose(double *mat3inout);
//copies src to dest
__host__ __device__ void mat3_copy(double *mat3_dest, const double *mat3_src);
//returns determinant of 3x3 matrix
__host__ __device__ double mat3_det(double *mat3in);
//inverts a 3x3 (9 element) matrix
__host__ __device__ void mat3_inverse(double *mat3inout);
//matrix*vector product (returned) and matrix*matrix product (written to matout)
__host__ __device__ cuvect3 mat3_mult(double *mat3in, cuvect3 cvin);
__host__ __device__ void mat3_mult(double *matina, double *matinb, double *matout);
//Hodge dual conversions between a 3-vector and a 3x3 matrix
//NOTE(review): in the second overload vecout is passed by value, so the
//caller cannot observe anything written to it - confirm whether a reference
//or pointer was intended
__host__ __device__ void mat3_hodgedual(cuvect3 vecin, double *matout);
__host__ __device__ void mat3_hodgedual(double *matin, cuvect3 vecout);
//returns direction cosine rotation matrix from axis and angle
__host__ __device__ void mat3_rot_from_axisangle(cuvect3 axis, double angle, double *matout);
//host-only test routine
__host__ void test_cudavect_logic1();
}; //end namespace amscuda
#endif
#ifndef __CUVECT3_HPP__
#define __CUVECT3_HPP__
//Double precision 3-vector and 3x3 matrix types plus free helper functions.
//All members and helpers marked __host__ __device__ are callable from both
//host and device code.
namespace amscuda
{
//3-component double precision vector
class cuvect3
{
public:
double x;
double y;
double z;
__host__ __device__ cuvect3();
__host__ __device__ ~cuvect3();
__host__ __device__ cuvect3(double _x, double _y, double _z);
//component access by index
__host__ __device__ double& operator[](const int I);
__host__ __device__ const double& operator[](const int I) const;
//binary operators: *this is the left operand; the parameter (named lhs)
//is the right operand
__host__ __device__ cuvect3 operator+(cuvect3 lhs);
__host__ __device__ cuvect3 operator-(cuvect3 lhs);
__host__ __device__ cuvect3 operator*(double lhs);
__host__ __device__ cuvect3 operator/(double lhs);
};
//3x3 double precision matrix stored in a flat 9-element array
class cumat3
{
public:
double dat[9];
__host__ __device__ cumat3();
__host__ __device__ ~cumat3();
//element access: flat index, or an (I,J) index pair
__host__ __device__ double& operator[](const int I);
__host__ __device__ double& operator()(const int I, const int J);
__host__ __device__ double& at(const int I, const int J);
//binary operators: *this is the left operand; the parameter (named lhs)
//is the right operand
__host__ __device__ cumat3 operator+(cumat3 lhs);
__host__ __device__ cumat3 operator-(cumat3 lhs);
__host__ __device__ cumat3 operator*(double lhs);
__host__ __device__ cumat3 operator/(double lhs);
__host__ __device__ cuvect3 operator*(cuvect3 lhs);
__host__ __device__ cumat3 operator*(cumat3 lhs);
//determinant, transpose and inverse; the last two return a new matrix
__host__ __device__ double det();
__host__ __device__ cumat3 transpose();
__host__ __device__ cumat3 inverse();
};
//free vector helpers taking their arguments by value
__host__ __device__ double cuvect3_dot(cuvect3 a, cuvect3 b);
__host__ __device__ cuvect3 cuvect3_cross(cuvect3 a, cuvect3 b);
__host__ __device__ double cuvect3_norm(cuvect3 a);
__host__ __device__ cuvect3 cuvect3_normalize(cuvect3 a);
__host__ __device__ cuvect3 cuvect3_proj(cuvect3 a, cuvect3 b);
//3x3 matrix operations
//matrix order is assumed to be mat[I,J] = mat[I+3*J]
//transposes a 3x3 (9 element) matrix
__host__ __device__ void mat3_transpose(double *mat3inout);
//copies src to dest
__host__ __device__ void mat3_copy(double *mat3_dest, const double *mat3_src);
//returns determinant of 3x3 matrix
__host__ __device__ double mat3_det(double *mat3in);
//inverts a 3x3 (9 element) matrix
__host__ __device__ void mat3_inverse(double *mat3inout);
//matrix*vector product (returned) and matrix*matrix product (written to matout)
__host__ __device__ cuvect3 mat3_mult(double *mat3in, cuvect3 cvin);
__host__ __device__ void mat3_mult(double *matina, double *matinb, double *matout);
//Hodge dual conversions between a 3-vector and a 3x3 matrix
//NOTE(review): in the second overload vecout is passed by value, so the
//caller cannot observe anything written to it - confirm whether a reference
//or pointer was intended
__host__ __device__ void mat3_hodgedual(cuvect3 vecin, double *matout);
__host__ __device__ void mat3_hodgedual(double *matin, cuvect3 vecout);
//returns direction cosine rotation matrix from axis and angle
__host__ __device__ void mat3_rot_from_axisangle(cuvect3 axis, double angle, double *matout);
//host-only test routine
__host__ void test_cudavect_logic1();
}; //end namespace amscuda
#endif

View File

@ -1,86 +1,87 @@
#ifndef __CUVECT3F_HPP__
#define __CUVECT3F_HPP__
//Single precision 3-vector and 3x3 matrix types plus free helper functions.
//All members and helpers marked __host__ __device__ are callable from both
//host and device code.
namespace amscuda
{
//3-component single precision vector
class cuvect3f
{
public:
float x;
float y;
float z;
__host__ __device__ cuvect3f();
__host__ __device__ ~cuvect3f();
__host__ __device__ cuvect3f(float _x, float _y, float _z);
//component access by index
__host__ __device__ float& operator[](const int I);
__host__ __device__ const float& operator[](const int I) const;
//binary operators: *this is the left operand; the parameter (named lhs)
//is the right operand
__host__ __device__ cuvect3f operator+(cuvect3f lhs);
__host__ __device__ cuvect3f operator-(cuvect3f lhs);
__host__ __device__ cuvect3f operator*(float lhs);
__host__ __device__ cuvect3f operator/(float lhs);
};
//3x3 single precision matrix stored in a flat 9-element array
class cumat3f
{
public:
float dat[9];
__host__ __device__ cumat3f();
__host__ __device__ ~cumat3f();
//element access: flat index, or an (I,J) index pair
__host__ __device__ float& operator[](const int I);
__host__ __device__ float& operator()(const int I, const int J);
__host__ __device__ float& at(const int I, const int J);
//binary operators: *this is the left operand; the parameter (named lhs)
//is the right operand
__host__ __device__ cumat3f operator+(cumat3f lhs);
__host__ __device__ cumat3f operator-(cumat3f lhs);
__host__ __device__ cumat3f operator*(float lhs);
__host__ __device__ cumat3f operator/(float lhs);
__host__ __device__ cuvect3f operator*(cuvect3f lhs);
__host__ __device__ cumat3f operator*(cumat3f lhs);
//determinant, transpose and inverse; the last two return a new matrix
__host__ __device__ float det();
__host__ __device__ cumat3f transpose();
__host__ __device__ cumat3f inverse();
};
//free vector helpers taking their arguments by value
__host__ __device__ float cuvect3f_dot(cuvect3f a, cuvect3f b);
__host__ __device__ cuvect3f cuvect3f_cross(cuvect3f a, cuvect3f b);
__host__ __device__ float cuvect3f_norm(cuvect3f a);
__host__ __device__ cuvect3f cuvect3f_normalize(cuvect3f a);
__host__ __device__ cuvect3f cuvect3f_proj(cuvect3f a, cuvect3f b);
//3x3 matrix operations
//matrix order is assumed to be mat[I,J] = mat[I+3*J]
//transposes a 3x3 (9 element) matrix
__host__ __device__ void mat3f_transpose(float *mat3inout);
//copies src to dest
__host__ __device__ void mat3f_copy(float *mat3f_dest, const float *mat3f_src);
//returns determinant of 3x3 matrix
__host__ __device__ float mat3f_det(float *mat3in);
//inverts a 3x3 (9 element) matrix
__host__ __device__ void mat3f_inverse(float *mat3inout);
//matrix*vector product (returned) and matrix*matrix product (written to matout)
__host__ __device__ cuvect3f mat3f_mult(float *mat3in, cuvect3f cvin);
__host__ __device__ void mat3f_mult(float *matina, float *matinb, float *matout);
//Hodge dual conversions between a 3-vector and a 3x3 matrix
//NOTE(review): in the second overload vecout is passed by value, so the
//caller cannot observe anything written to it - confirm whether a reference
//or pointer was intended
__host__ __device__ void mat3f_hodgedual(cuvect3f vecin, float *matout);
__host__ __device__ void mat3f_hodgedual(float *matin, cuvect3f vecout);
//returns direction cosine rotation matrix from axis and angle
__host__ __device__ void mat3f_rot_from_axisangle(cuvect3f axis, float angle, float *matout);
//host-only test routine
__host__ void test_cudavectf_logic1();
};
#endif
#ifndef __CUVECT3F_HPP__
#define __CUVECT3F_HPP__
//Single precision 3-vector and 3x3 matrix types plus free helper functions.
//All members and helpers marked __host__ __device__ are callable from both
//host and device code.
namespace amscuda
{
//3-component single precision vector
class cuvect3f
{
public:
float x;
float y;
float z;
__host__ __device__ cuvect3f();
__host__ __device__ ~cuvect3f();
__host__ __device__ cuvect3f(float _x, float _y, float _z);
//component access by index
__host__ __device__ float& operator[](const int I);
__host__ __device__ const float& operator[](const int I) const;
//binary operators: *this is the left operand; the parameter (named lhs)
//is the right operand
__host__ __device__ cuvect3f operator+(cuvect3f lhs);
__host__ __device__ cuvect3f operator-(cuvect3f lhs);
__host__ __device__ cuvect3f operator*(float lhs);
__host__ __device__ cuvect3f operator/(float lhs);
//unary minus, declared as a non-member friend taking its operand by value
__host__ __device__ friend cuvect3f operator-(cuvect3f rhs);
};
//3x3 single precision matrix stored in a flat 9-element array
class cumat3f
{
public:
float dat[9];
__host__ __device__ cumat3f();
__host__ __device__ ~cumat3f();
//element access: flat index, or an (I,J) index pair
__host__ __device__ float& operator[](const int I);
__host__ __device__ float& operator()(const int I, const int J);
__host__ __device__ float& at(const int I, const int J);
//binary operators: *this is the left operand; the parameter (named lhs)
//is the right operand
__host__ __device__ cumat3f operator+(cumat3f lhs);
__host__ __device__ cumat3f operator-(cumat3f lhs);
__host__ __device__ cumat3f operator*(float lhs);
__host__ __device__ cumat3f operator/(float lhs);
__host__ __device__ cuvect3f operator*(cuvect3f lhs);
__host__ __device__ cumat3f operator*(cumat3f lhs);
//determinant, transpose and inverse; the last two return a new matrix
__host__ __device__ float det();
__host__ __device__ cumat3f transpose();
__host__ __device__ cumat3f inverse();
};
//free vector helpers taking their arguments by value
__host__ __device__ float cuvect3f_dot(cuvect3f a, cuvect3f b);
__host__ __device__ cuvect3f cuvect3f_cross(cuvect3f a, cuvect3f b);
__host__ __device__ float cuvect3f_norm(cuvect3f a);
__host__ __device__ cuvect3f cuvect3f_normalize(cuvect3f a);
__host__ __device__ cuvect3f cuvect3f_proj(cuvect3f a, cuvect3f b);
//3x3 matrix operations
//matrix order is assumed to be mat[I,J] = mat[I+3*J]
//transposes a 3x3 (9 element) matrix
__host__ __device__ void mat3f_transpose(float *mat3inout);
//copies src to dest
__host__ __device__ void mat3f_copy(float *mat3f_dest, const float *mat3f_src);
//returns determinant of 3x3 matrix
__host__ __device__ float mat3f_det(float *mat3in);
//inverts a 3x3 (9 element) matrix
__host__ __device__ void mat3f_inverse(float *mat3inout);
//matrix*vector product (returned) and matrix*matrix product (written to matout)
__host__ __device__ cuvect3f mat3f_mult(float *mat3in, cuvect3f cvin);
__host__ __device__ void mat3f_mult(float *matina, float *matinb, float *matout);
//Hodge dual conversions between a 3-vector and a 3x3 matrix
//NOTE(review): in the second overload vecout is passed by value, so the
//caller cannot observe anything written to it - confirm whether a reference
//or pointer was intended
__host__ __device__ void mat3f_hodgedual(cuvect3f vecin, float *matout);
__host__ __device__ void mat3f_hodgedual(float *matin, cuvect3f vecout);
//returns direction cosine rotation matrix from axis and angle
__host__ __device__ void mat3f_rot_from_axisangle(cuvect3f axis, float angle, float *matout);
//host-only test routine
__host__ void test_cudavectf_logic1();
};
#endif

View File

@ -1,59 +1,59 @@
#ifndef __CUVECT4_HPP__
#define __CUVECT4_HPP__
namespace amscuda
{
class cuvect4
{
public:
double x;
double y;
double z;
double w;
__host__ __device__ cuvect4();
__host__ __device__ ~cuvect4();
__host__ __device__ cuvect4(double _x, double _y, double _z, double _w);
__host__ __device__ double& operator[](const int I);
__host__ __device__ const double& operator[](const int I) const;
__host__ __device__ cuvect4 operator+(cuvect4 lhs);
__host__ __device__ cuvect4 operator-(cuvect4 lhs);
__host__ __device__ cuvect4 operator*(double lhs);
__host__ __device__ cuvect4 operator/(double lhs);
};
//4x4 matrix of doubles stored in a flat 16-element array.
//Only declarations appear in this header; every member function is declared __host__ __device__.
class cumat4
{
public:
//raw element storage
//NOTE(review): element ordering is not stated here -- presumably dat[I+4*J]; confirm in the definition
double dat[16];
//default constructor -- NOTE(review): initial contents (zero, identity, or unset) are not visible here
__host__ __device__ cumat4();
__host__ __device__ ~cumat4();
//single-index element access -- presumably a direct index into dat; confirm
__host__ __device__ double& operator[](const int I);
//two-index element access (I, J); operator() and at() share the same signature
//NOTE(review): any difference between them (e.g. bounds checking) is not visible here
//no const overloads are declared, so elements of a const cumat4 cannot be read through these accessors
__host__ __device__ double& operator()(const int I, const int J);
__host__ __device__ double& at(const int I, const int J);
//arithmetic with *this as the left operand; results are returned by value
//the parameter is named lhs, but as a member operator it receives the right-hand operand
__host__ __device__ cumat4 operator+(cumat4 lhs);
__host__ __device__ cumat4 operator-(cumat4 lhs);
//scaling / division by a scalar on the right
__host__ __device__ cumat4 operator*(double lhs);
__host__ __device__ cumat4 operator/(double lhs);
//matrix * vector, yielding a cuvect4
__host__ __device__ cuvect4 operator*(cuvect4 lhs);
//matrix * matrix, yielding a cumat4
__host__ __device__ cumat4 operator*(cumat4 lhs);
//determinant, returned as a double
__host__ __device__ double det();
//transpose and inverse are returned by value
//NOTE(review): these are not const-qualified; whether *this is also modified, and how a
//singular matrix is handled by inverse(), is not visible from this header
__host__ __device__ cumat4 transpose();
__host__ __device__ cumat4 inverse();
};
//free-function helpers for cuvect4; declarations only, callable from host and device
//all vector operands are taken by value and results are returned by value
//dot product of a and b
__host__ __device__ double cuvect4_dot(cuvect4 a, cuvect4 b);
//norm of a -- presumably the Euclidean length; confirm in the definition
__host__ __device__ double cuvect4_norm(cuvect4 a);
//normalized copy of a -- NOTE(review): zero-length handling is not visible from this header
__host__ __device__ cuvect4 cuvect4_normalize(cuvect4 a);
//projection involving a and b -- NOTE(review): which operand is projected onto which is not
//visible from the declaration; confirm in the definition before relying on argument order
__host__ __device__ cuvect4 cuvect4_proj(cuvect4 a, cuvect4 b);
}; //end namespace amscuda
#endif
#ifndef __CUVECT4_HPP__
#define __CUVECT4_HPP__
namespace amscuda
{
//4-component vector of doubles with public components x, y, z, w.
//Only declarations appear in this header; every member function is declared __host__ __device__.
class cuvect4
{
public:
double x;
double y;
double z;
double w;
//default constructor -- NOTE(review): initial component values are set in the definition, not visible here
__host__ __device__ cuvect4();
__host__ __device__ ~cuvect4();
//constructs from four scalars -- presumably x=_x, y=_y, z=_z, w=_w; confirm in the definition
__host__ __device__ cuvect4(double _x, double _y, double _z, double _w);
//component access by integer index, mutable and read-only overloads
//NOTE(review): index-to-component mapping and out-of-range behavior are not visible here
__host__ __device__ double& operator[](const int I);
__host__ __device__ const double& operator[](const int I) const;
//arithmetic with *this as the left operand; results are returned by value
//the parameter is named lhs, but as a member operator it receives the right-hand operand
//none of these are const-qualified, so they cannot be invoked on a const cuvect4
__host__ __device__ cuvect4 operator+(cuvect4 lhs);
__host__ __device__ cuvect4 operator-(cuvect4 lhs);
//scaling / division by a scalar on the right (vector * s, vector / s)
__host__ __device__ cuvect4 operator*(double lhs);
__host__ __device__ cuvect4 operator/(double lhs);
};
//4x4 matrix of doubles stored in a flat 16-element array.
//Only declarations appear in this header; every member function is declared __host__ __device__.
class cumat4
{
public:
//raw element storage
//NOTE(review): element ordering is not stated here -- presumably dat[I+4*J]; confirm in the definition
double dat[16];
//default constructor -- NOTE(review): initial contents (zero, identity, or unset) are not visible here
__host__ __device__ cumat4();
__host__ __device__ ~cumat4();
//single-index element access -- presumably a direct index into dat; confirm
__host__ __device__ double& operator[](const int I);
//two-index element access (I, J); operator() and at() share the same signature
//NOTE(review): any difference between them (e.g. bounds checking) is not visible here
//no const overloads are declared, so elements of a const cumat4 cannot be read through these accessors
__host__ __device__ double& operator()(const int I, const int J);
__host__ __device__ double& at(const int I, const int J);
//arithmetic with *this as the left operand; results are returned by value
//the parameter is named lhs, but as a member operator it receives the right-hand operand
__host__ __device__ cumat4 operator+(cumat4 lhs);
__host__ __device__ cumat4 operator-(cumat4 lhs);
//scaling / division by a scalar on the right
__host__ __device__ cumat4 operator*(double lhs);
__host__ __device__ cumat4 operator/(double lhs);
//matrix * vector, yielding a cuvect4
__host__ __device__ cuvect4 operator*(cuvect4 lhs);
//matrix * matrix, yielding a cumat4
__host__ __device__ cumat4 operator*(cumat4 lhs);
//determinant, returned as a double
__host__ __device__ double det();
//transpose and inverse are returned by value
//NOTE(review): these are not const-qualified; whether *this is also modified, and how a
//singular matrix is handled by inverse(), is not visible from this header
__host__ __device__ cumat4 transpose();
__host__ __device__ cumat4 inverse();
};
//free-function helpers for cuvect4; declarations only, callable from host and device
//all vector operands are taken by value and results are returned by value
//dot product of a and b
__host__ __device__ double cuvect4_dot(cuvect4 a, cuvect4 b);
//norm of a -- presumably the Euclidean length; confirm in the definition
__host__ __device__ double cuvect4_norm(cuvect4 a);
//normalized copy of a -- NOTE(review): zero-length handling is not visible from this header
__host__ __device__ cuvect4 cuvect4_normalize(cuvect4 a);
//projection involving a and b -- NOTE(review): which operand is projected onto which is not
//visible from the declaration; confirm in the definition before relying on argument order
__host__ __device__ cuvect4 cuvect4_proj(cuvect4 a, cuvect4 b);
}; //end namespace amscuda
#endif

View File

@ -1,60 +1,61 @@
#ifndef __CUVECT4F_HPP__
#define __CUVECT4F_HPP__
namespace amscuda
{
//4-component vector of floats with public components x, y, z, w.
//Only declarations appear in this header; every member function is declared __host__ __device__.
class cuvect4f
{
public:
float x;
float y;
float z;
float w;
//default constructor -- NOTE(review): initial component values are set in the definition, not visible here
__host__ __device__ cuvect4f();
__host__ __device__ ~cuvect4f();
//constructs from four scalars -- presumably x=_x, y=_y, z=_z, w=_w; confirm in the definition
__host__ __device__ cuvect4f(float _x, float _y, float _z, float _w);
//component access by integer index, mutable and read-only overloads
//NOTE(review): index-to-component mapping and out-of-range behavior are not visible here
__host__ __device__ float& operator[](const int I);
__host__ __device__ const float& operator[](const int I) const;
//arithmetic with *this as the left operand; results are returned by value
//the parameter is named lhs, but as a member operator it receives the right-hand operand
//none of these are const-qualified, so they cannot be invoked on a const cuvect4f
__host__ __device__ cuvect4f operator+(cuvect4f lhs);
__host__ __device__ cuvect4f operator-(cuvect4f lhs);
//scaling / division by a scalar on the right (vector * s, vector / s)
__host__ __device__ cuvect4f operator*(float lhs);
__host__ __device__ cuvect4f operator/(float lhs);
};
//4x4 matrix of floats stored in a flat 16-element array.
//Only declarations appear in this header; every member function is declared __host__ __device__.
class cumat4f
{
public:
//raw element storage
//NOTE(review): element ordering is not stated here -- presumably dat[I+4*J]; confirm in the definition
float dat[16];
//default constructor -- NOTE(review): initial contents (zero, identity, or unset) are not visible here
__host__ __device__ cumat4f();
__host__ __device__ ~cumat4f();
//single-index element access -- presumably a direct index into dat; confirm
__host__ __device__ float& operator[](const int I);
//two-index element access (I, J); operator() and at() share the same signature
//NOTE(review): any difference between them (e.g. bounds checking) is not visible here
//no const overloads are declared, so elements of a const cumat4f cannot be read through these accessors
__host__ __device__ float& operator()(const int I, const int J);
__host__ __device__ float& at(const int I, const int J);
//arithmetic with *this as the left operand; results are returned by value
//the parameter is named lhs, but as a member operator it receives the right-hand operand
__host__ __device__ cumat4f operator+(cumat4f lhs);
__host__ __device__ cumat4f operator-(cumat4f lhs);
//scaling / division by a scalar on the right
__host__ __device__ cumat4f operator*(float lhs);
__host__ __device__ cumat4f operator/(float lhs);
//matrix * vector, yielding a cuvect4f
__host__ __device__ cuvect4f operator*(cuvect4f lhs);
//matrix * matrix, yielding a cumat4f
__host__ __device__ cumat4f operator*(cumat4f lhs);
//determinant, returned as a float
__host__ __device__ float det();
//transpose and inverse are returned by value
//NOTE(review): these are not const-qualified; whether *this is also modified, and how a
//singular matrix is handled by inverse(), is not visible from this header
__host__ __device__ cumat4f transpose();
__host__ __device__ cumat4f inverse();
};
//free-function helpers for cuvect4f; declarations only, callable from host and device
//all vector operands are taken by value and results are returned by value
//dot product of a and b
__host__ __device__ float cuvect4f_dot(cuvect4f a, cuvect4f b);
//norm of a -- presumably the Euclidean length; confirm in the definition
__host__ __device__ float cuvect4f_norm(cuvect4f a);
//normalized copy of a -- NOTE(review): zero-length handling is not visible from this header
__host__ __device__ cuvect4f cuvect4f_normalize(cuvect4f a);
//projection involving a and b -- NOTE(review): which operand is projected onto which is not
//visible from the declaration; confirm in the definition before relying on argument order
__host__ __device__ cuvect4f cuvect4f_proj(cuvect4f a, cuvect4f b);
};
#endif
#ifndef __CUVECT4F_HPP__
#define __CUVECT4F_HPP__
namespace amscuda
{
//4-component vector of floats with public components x, y, z, w.
//Only declarations appear in this header; every member function is declared __host__ __device__.
class cuvect4f
{
public:
float x;
float y;
float z;
float w;
//default constructor -- NOTE(review): initial component values are set in the definition, not visible here
__host__ __device__ cuvect4f();
__host__ __device__ ~cuvect4f();
//constructs from four scalars -- presumably x=_x, y=_y, z=_z, w=_w; confirm in the definition
__host__ __device__ cuvect4f(float _x, float _y, float _z, float _w);
//component access by integer index, mutable and read-only overloads
//NOTE(review): index-to-component mapping and out-of-range behavior are not visible here
__host__ __device__ float& operator[](const int I);
__host__ __device__ const float& operator[](const int I) const;
//arithmetic with *this as the left operand; results are returned by value
//the parameter is named lhs, but as a member operator it receives the right-hand operand
//none of these are const-qualified, so they cannot be invoked on a const cuvect4f
__host__ __device__ cuvect4f operator+(cuvect4f lhs);
__host__ __device__ cuvect4f operator-(cuvect4f lhs);
//scaling / division by a scalar on the right (vector * s, vector / s)
__host__ __device__ cuvect4f operator*(float lhs);
__host__ __device__ cuvect4f operator/(float lhs);
//unary minus (negation) of rhs, declared as a non-member friend rather than a member
//it takes its operand by value, so unlike the member operators it also accepts a const cuvect4f
__host__ __device__ friend cuvect4f operator-(cuvect4f rhs);
};
//4x4 matrix of floats stored in a flat 16-element array.
//Only declarations appear in this header; every member function is declared __host__ __device__.
class cumat4f
{
public:
//raw element storage
//NOTE(review): element ordering is not stated here -- presumably dat[I+4*J]; confirm in the definition
float dat[16];
//default constructor -- NOTE(review): initial contents (zero, identity, or unset) are not visible here
__host__ __device__ cumat4f();
__host__ __device__ ~cumat4f();
//single-index element access -- presumably a direct index into dat; confirm
__host__ __device__ float& operator[](const int I);
//two-index element access (I, J); operator() and at() share the same signature
//NOTE(review): any difference between them (e.g. bounds checking) is not visible here
//no const overloads are declared, so elements of a const cumat4f cannot be read through these accessors
__host__ __device__ float& operator()(const int I, const int J);
__host__ __device__ float& at(const int I, const int J);
//arithmetic with *this as the left operand; results are returned by value
//the parameter is named lhs, but as a member operator it receives the right-hand operand
__host__ __device__ cumat4f operator+(cumat4f lhs);
__host__ __device__ cumat4f operator-(cumat4f lhs);
//scaling / division by a scalar on the right
__host__ __device__ cumat4f operator*(float lhs);
__host__ __device__ cumat4f operator/(float lhs);
//matrix * vector, yielding a cuvect4f
__host__ __device__ cuvect4f operator*(cuvect4f lhs);
//matrix * matrix, yielding a cumat4f
__host__ __device__ cumat4f operator*(cumat4f lhs);
//determinant, returned as a float
__host__ __device__ float det();
//transpose and inverse are returned by value
//NOTE(review): these are not const-qualified; whether *this is also modified, and how a
//singular matrix is handled by inverse(), is not visible from this header
__host__ __device__ cumat4f transpose();
__host__ __device__ cumat4f inverse();
};
//free-function helpers for cuvect4f; declarations only, callable from host and device
//all vector operands are taken by value and results are returned by value
//dot product of a and b
__host__ __device__ float cuvect4f_dot(cuvect4f a, cuvect4f b);
//norm of a -- presumably the Euclidean length; confirm in the definition
__host__ __device__ float cuvect4f_norm(cuvect4f a);
//normalized copy of a -- NOTE(review): zero-length handling is not visible from this header
__host__ __device__ cuvect4f cuvect4f_normalize(cuvect4f a);
//projection involving a and b -- NOTE(review): which operand is projected onto which is not
//visible from the declaration; confirm in the definition before relying on argument order
__host__ __device__ cuvect4f cuvect4f_proj(cuvect4f a, cuvect4f b);
};
#endif