#ifndef AMSCUARRAY_DOPS_IMPL_HPP
#define AMSCUARRAY_DOPS_IMPL_HPP

#include <cstdio> //printf
#include <cmath>  //sqrt

//Device-buffer operations (sum, min/max, fill, elementwise arithmetic).
//The cuarray<T> struct is assumed to be defined by the including header.
namespace amscuda
{

//Kernel: grid-stride partial sum. Each thread accumulates every Is-th element
//starting at its global index I0, then writes its partial sum to rets[I0].
template<typename T> __global__ void dbuff_sum_kf(T *devbuffer, int N, T *rets)
{
	int I0 = threadIdx.x + blockIdx.x*blockDim.x;
	int Is = blockDim.x*gridDim.x;
	int I;

	T ret = (T) 0;
	for(I=I0;I<N;I=I+Is)
	{
		ret = ret + devbuffer[I];
	}
	rets[I0] = ret;
}

//Forward declaration: devcuarray_sum calls dbuff_sum, which is defined below.
template<typename T> T dbuff_sum(T *dbuff, int N);

//Sums a cuarray<T> that lives entirely in device memory. The struct is copied
//to the host to read its data pointer and length; the local copy's pointer is
//then nulled so that it does not own (and later free) the device buffer.
template<typename T> T devcuarray_sum(cuarray<T> *devptr)
{
	T ret = T();

	cuarray<T> ldptr;

	cudaMemcpy(&ldptr,devptr,sizeof(cuarray<T>),cudaMemcpyDeviceToHost);

	ret = dbuff_sum(ldptr.data,ldptr.length);

	ldptr.data = NULL;
	ldptr.length = 0;

	return ret;
}

//Sums the first N elements of a device buffer. Launches a partial-sum kernel,
//then finishes the reduction of the per-thread results on the host.
template<typename T> T dbuff_sum(T *dbuff, int N)
{
	int I;
	T ret = T();
	cudaError_t err = cudaSuccess;

	int nblocks;
	int nthreads;

	if(dbuff==NULL || N<=0)
	{
		return ret;
	}

	//Pick a modest launch shape scaled to N (never more than N total threads).
	if(N>100)
	{
		nblocks = 10;
		nthreads = (int)sqrt((float)(N/nblocks));
		if(nthreads<=0) nthreads = 1;
		if(nthreads>512) nthreads = 512;
	}
	else
	{
		nblocks = 1;
		nthreads = 1;
	}

	T *rets = NULL;
	T *devrets = NULL;

	rets = new T[nblocks*nthreads];
	cudaMalloc(&devrets,sizeof(T)*nblocks*nthreads);

	dbuff_sum_kf<<<nblocks,nthreads>>>(dbuff,N,devrets);
	cudaDeviceSynchronize();
	err = cudaGetLastError();
	if(err!=cudaSuccess)
	{
		printf("amscuda::dbuff_sum error: %s\n",cudaGetErrorString(err));
	}

	cudaMemcpy(rets,devrets,sizeof(T)*nblocks*nthreads,cudaMemcpyDeviceToHost);

	//Finish the reduction over the per-thread partial sums.
	ret = (T)0;
	for(I=0;I<nblocks*nthreads;I++)
	{
		ret = ret + rets[I];
	}

	cudaFree(devrets); devrets = NULL;
	delete[] rets; rets = NULL;

	return ret;
}
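
//Usage sketch (illustrative values; assumes a working CUDA context):
//	float *dbuf = NULL;
//	cudaMalloc(&dbuf, sizeof(float)*1000);
//	dbuff_setall(dbuf, 1000, 2.0f, 10, 32); //fill with 2.0f (defined below)
//	float total = dbuff_sum(dbuf, 1000);    //expect 2000.0f
//	cudaFree(dbuf);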

//Kernel: grid-stride partial min/max. Each thread scans every Is-th element
//starting at I0 and records its running extrema in maxs[I0] and mins[I0].
//The launch shape chosen in dbuff_minmax uses at most N threads, so each
//thread sees at least one element and these slots are always initialized.
template<typename T> __global__ void dbuff_minmax_kf(T *devbuffer, int N, T *maxs, T *mins)
{
	int I0 = threadIdx.x + blockIdx.x*blockDim.x;
	int Is = blockDim.x*gridDim.x;
	int I;

	for(I=I0;I<N;I=I+Is)
	{
		if(I==I0)
		{
			//The first element a thread sees initializes its extrema.
			maxs[I0] = devbuffer[I];
			mins[I0] = devbuffer[I];
		}
		else
		{
			if(devbuffer[I]>maxs[I0])
			{
				maxs[I0] = devbuffer[I];
			}
			if(devbuffer[I]<mins[I0])
			{
				mins[I0] = devbuffer[I];
			}
		}
	}

	return;
}

//Finds the minimum and maximum of the first N elements of a device buffer.
//Launches a partial min/max kernel, then finishes the reduction on the host.
//Either output pointer may be NULL if that result is not wanted.
template<typename T> void dbuff_minmax(T *devbuffer, int N, T *min, T *max)
{
	cudaError_t err = cudaSuccess;
	int nblocks;
	int nthreads;
	int I;

	T *maxs = NULL;
	T *dev_maxs = NULL;
	T *mins = NULL;
	T *dev_mins = NULL;

	T localmax = T(0);
	T localmin = T(0);

	if(devbuffer==NULL || N<=0)
	{
		if(min!=NULL) *min = T(0);
		if(max!=NULL) *max = T(0);
		return;
	}

	//Pick a launch shape with at most N total threads, so every thread
	//owns at least one element (see dbuff_minmax_kf).
	if(N>25)
	{
		nblocks = 25;
		nthreads = (int)sqrt((float)(N/nblocks));
		if(nthreads<1) nthreads = 1;
		if(nthreads>512) nthreads = 512;
	}
	else
	{
		nblocks = 1;
		nthreads = 1;
	}

	maxs = new T[nblocks*nthreads];
	mins = new T[nblocks*nthreads];
	cudaMalloc(&dev_maxs,sizeof(T)*nblocks*nthreads);
	cudaMalloc(&dev_mins,sizeof(T)*nblocks*nthreads);

	dbuff_minmax_kf<<<nblocks,nthreads>>>(devbuffer,N,dev_maxs,dev_mins);
	cudaDeviceSynchronize();
	err = cudaGetLastError();
	if(err!=cudaSuccess)
	{
		printf("amscuda::dbuff_minmax error: %s\n",cudaGetErrorString(err));
	}

	cudaMemcpy(maxs,dev_maxs,sizeof(T)*nblocks*nthreads,cudaMemcpyDeviceToHost);
	cudaMemcpy(mins,dev_mins,sizeof(T)*nblocks*nthreads,cudaMemcpyDeviceToHost);

	//Finish the reduction over the per-thread extrema.
	for(I=0;I<nblocks*nthreads;I++)
	{
		if(I==0)
		{
			localmax = maxs[0];
			localmin = mins[0];
		}
		else
		{
			if(maxs[I]>localmax) localmax = maxs[I];
			if(mins[I]<localmin) localmin = mins[I];
		}
	}

	if(max!=NULL) *max = localmax;
	if(min!=NULL) *min = localmin;

	cudaFree(dev_maxs); dev_maxs = NULL;
	cudaFree(dev_mins); dev_mins = NULL;
	delete[] maxs; maxs = NULL;
	delete[] mins; mins = NULL;

	return;
}
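
//Usage sketch (illustrative values):
//	float lo = 0.0f, hi = 0.0f;
//	dbuff_minmax(dbuf, 1000, &lo, &hi); //lo/hi receive the extrema of dbuf[0..999]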

//Kernel: grid-stride fill. Each thread writes setto to every Is-th element.
template<typename T> __global__ void dbuff_setall_kf(T *devbuffer, int N, T setto)
{
	int I0 = threadIdx.x + blockIdx.x*blockDim.x;
	int Is = blockDim.x*gridDim.x;
	int I;

	for(I=I0;I<N;I=I+Is)
	{
		devbuffer[I] = setto;
	}
	return;
}

//Sets the first N elements of a device buffer to setto, using the caller's
//launch shape (nblocks blocks of nthreads threads).
template<typename T> void dbuff_setall(T *devbuffer, int N, T setto, int nblocks, int nthreads)
{
	cudaError_t err = cudaSuccess;

	if(devbuffer==NULL || N<=0)
	{
		return;
	}

	dbuff_setall_kf<<<nblocks,nthreads>>>(devbuffer,N,setto);
	cudaDeviceSynchronize();
	err = cudaGetLastError();
	if(err!=cudaSuccess)
	{
		printf("amscuda::dbuff_setall error: %s\n",cudaGetErrorString(err));
	}

	return;
}
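
//Usage sketch (illustrative values):
//	dbuff_setall(dbuf, 1000, 0.0f, 10, 32); //zero dbuf[0..999] with 10x32 threads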

//Kernel: elementwise binary op over two device buffers via a grid-stride loop.
//The operation fpnt is a functor (a struct with a __device__ operator()) rather
//than a function pointer, since a host function pointer is not callable on the device.
template<typename T1, typename T2, typename T3, typename F> __global__ void dbuff_vectorbinop_kf1(T1 *dbuf_a, T2 *dbuf_b, T3 *dbuf_out, int N, F fpnt)
{
	int I0 = threadIdx.x + blockIdx.x*blockDim.x;
	int Is = blockDim.x*gridDim.x;
	int I;

	T1 a;
	T2 b;
	T3 c;

	for(I=I0;I<N;I=I+Is)
	{
		a = dbuf_a[I];
		b = dbuf_b[I];
		c = fpnt(a,b);
		dbuf_out[I] = c;
	}

	return;
}

//Kernel: elementwise op between a device buffer and a constant scalar parameter
//via a grid-stride loop. The operation fpnt is a functor, as in dbuff_vectorbinop_kf1.
template<typename T1, typename T2, typename T3, typename F> __global__ void dbuff_vectorbinop_kf2(T1 *dbuf_a, T2 par_b, T3 *dbuf_out, int N, F fpnt)
{
	int I0 = threadIdx.x + blockIdx.x*blockDim.x;
	int Is = blockDim.x*gridDim.x;
	int I;

	T1 a;
	T2 b;
	T3 c;

	for(I=I0;I<N;I=I+Is)
	{
		a = dbuf_a[I];
		b = par_b;
		c = fpnt(a,b);
		dbuf_out[I] = c;
	}

	return;
}

//Elementwise device-buffer vector binary operation:
//takes two input arrays (a, b) --> one output array.
//fpnt must be a functor with a __device__ T3 operator()(T1,T2).
template<typename T1, typename T2, typename T3, typename F> void dbuff_vectorbinop(T1 *dbuf_a, T2 *dbuf_b, T3 *dbuf_out, int N, F fpnt, int nblocks, int nthreads)
{
	cudaError_t err = cudaSuccess;

	if(dbuf_a==NULL || dbuf_b==NULL || dbuf_out==NULL || N<=0)
	{
		return;
	}

	dbuff_vectorbinop_kf1<<<nblocks,nthreads>>>(dbuf_a,dbuf_b,dbuf_out,N,fpnt);
	cudaDeviceSynchronize();
	err = cudaGetLastError();
	if(err!=cudaSuccess)
	{
		printf("amscuda::dbuff_vectorbinop error: %s\n",cudaGetErrorString(err));
	}

	return;
}

//Elementwise device-buffer vector two-parameter operation:
//takes one input array and a constant parameter b ---> one output array.
//fpnt must be a functor with a __device__ T3 operator()(T1,T2).
template<typename T1, typename T2, typename T3, typename F> void dbuff_vectorbinop(T1 *dbuf_a, T2 par_b, T3 *dbuf_out, int N, F fpnt, int nblocks, int nthreads)
{
	cudaError_t err = cudaSuccess;

	if(dbuf_a==NULL || dbuf_out==NULL || N<=0)
	{
		return;
	}

	dbuff_vectorbinop_kf2<<<nblocks,nthreads>>>(dbuf_a,par_b,dbuf_out,N,fpnt);
	cudaDeviceSynchronize();
	err = cudaGetLastError();
	if(err!=cudaSuccess)
	{
		printf("amscuda::dbuff_vectorbinop error: %s\n",cudaGetErrorString(err));
	}

	return;
}
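
//Custom ops follow the same pattern as the arithmetic functors below
//(hypothetical example, for illustration only):
//	template<typename T> struct axpb_fn
//	{
//		__device__ T operator()(T a, T b) const { return 2*a + b; }
//	};
//	dbuff_vectorbinop(dbuf_a, dbuf_b, dbuf_out, n, axpb_fn<float>(), 10, 32);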

//Addition functor; operator() is __device__ so the kernels can call it.
template<typename T> struct dbuff_add_fn
{
	__device__ T operator()(T a, T b) const { return a+b; }
};

//Elementwise out = a + b.
template<typename T> void dbuff_add(T *dbuff_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads)
{
	dbuff_vectorbinop(dbuff_a,dbuff_b,dbuff_out,N,dbuff_add_fn<T>(),nblocks,nthreads);
	return;
}

//Elementwise out = a + par_b.
template<typename T> void dbuff_add(T *dbuff_a, T par_b, T *dbuff_out, int N, int nblocks, int nthreads)
{
	dbuff_vectorbinop(dbuff_a,par_b,dbuff_out,N,dbuff_add_fn<T>(),nblocks,nthreads);
	return;
}

//Subtraction functor.
template<typename T> struct dbuff_sub_fn
{
	__device__ T operator()(T a, T b) const { return a-b; }
};

//Elementwise out = a - b.
template<typename T> void dbuff_sub(T *dbuff_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads)
{
	dbuff_vectorbinop(dbuff_a,dbuff_b,dbuff_out,N,dbuff_sub_fn<T>(),nblocks,nthreads);
	return;
}

//Elementwise out = a - par_b.
template<typename T> void dbuff_sub(T *dbuff_a, T par_b, T *dbuff_out, int N, int nblocks, int nthreads)
{
	dbuff_vectorbinop(dbuff_a,par_b,dbuff_out,N,dbuff_sub_fn<T>(),nblocks,nthreads);
	return;
}

//Multiplication functor.
template<typename T> struct dbuff_mult_fn
{
	__device__ T operator()(T a, T b) const { return a*b; }
};

//Elementwise out = a * b.
template<typename T> void dbuff_mult(T *dbuff_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads)
{
	dbuff_vectorbinop(dbuff_a,dbuff_b,dbuff_out,N,dbuff_mult_fn<T>(),nblocks,nthreads);
	return;
}

//Elementwise out = a * par_b.
template<typename T> void dbuff_mult(T *dbuff_a, T par_b, T *dbuff_out, int N, int nblocks, int nthreads)
{
	dbuff_vectorbinop(dbuff_a,par_b,dbuff_out,N,dbuff_mult_fn<T>(),nblocks,nthreads);
	return;
}

//Division functor.
template<typename T> struct dbuff_div_fn
{
	__device__ T operator()(T a, T b) const { return a/b; }
};

//Elementwise out = a / b.
template<typename T> void dbuff_div(T *dbuff_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads)
{
	dbuff_vectorbinop(dbuff_a,dbuff_b,dbuff_out,N,dbuff_div_fn<T>(),nblocks,nthreads);
	return;
}

//Elementwise out = a / par_b.
template<typename T> void dbuff_div(T *dbuff_a, T par_b, T *dbuff_out, int N, int nblocks, int nthreads)
{
	dbuff_vectorbinop(dbuff_a,par_b,dbuff_out,N,dbuff_div_fn<T>(),nblocks,nthreads);
	return;
}

//Left-division functor: returns b/a, so that scalar/buffer division can reuse
//the buffer-plus-parameter kernel with the arguments swapped.
template<typename T> struct dbuff_ldiv_fn
{
	__device__ T operator()(T a, T b) const { return b/a; }
};

//Elementwise out = par_a / b.
template<typename T> void dbuff_div(T par_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads)
{
	dbuff_vectorbinop(dbuff_b,par_a,dbuff_out,N,dbuff_ldiv_fn<T>(),nblocks,nthreads);
	return;
}
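
//Usage sketch (illustrative values): elementwise arithmetic on device buffers.
//	dbuff_add(dbuf_a, dbuf_b, dbuf_out, n, 10, 32); //out[i] = a[i] + b[i]
//	dbuff_mult(dbuf_a, 0.5f, dbuf_out, n, 10, 32);  //out[i] = a[i] * 0.5
//	dbuff_div(1.0f, dbuf_b, dbuf_out, n, 10, 32);   //out[i] = 1.0 / b[i]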

} //end namespace amscuda

#endif