// amsculib2/include/amsculib2/amscuarray_dops_impl.hpp
#ifndef __AMSCUARRAY_DOPS_IMPL_HPP__
#define __AMSCUARRAY_DOPS_IMPL_HPP__

#include <cstdio>
#include <cmath>
#include <cuda_runtime.h>

namespace amscuda
{
// Kernel: each thread accumulates a grid-strided partial sum of devbuffer
// into rets[thread id]; the host reduces the per-thread partials afterward.
// Threads that own no elements write zero, so every slot of rets is defined.
template<typename T> __global__ void dbuff_sum_kf(T *devbuffer, int N, T *rets)
{
    int I0 = threadIdx.x + blockIdx.x*blockDim.x;
    int Is = blockDim.x*gridDim.x;
    int I;
    T ret = (T) 0;
    for(I=I0;I<N;I=I+Is)
    {
        ret = ret + devbuffer[I];
    }
    rets[I0] = ret;
}
// Host-side sum of a device buffer: launch the partial-sum kernel, then
// reduce the per-thread partials on the host.
template<typename T> T dbuff_sum(T *dbuff, int N)
{
    int I;
    T ret = T();
    cudaError_t err = cudaSuccess;
    int nblocks;
    int nthreads;
    if(dbuff==NULL || N<=0)
    {
        return ret;
    }
    // Heuristic launch shape: a few blocks, with the thread count scaled to
    // the problem size and clamped to [1,512].
    if(N>100)
    {
        nblocks = 10;
        nthreads = (int)sqrt((float)(N/nblocks));
        if(nthreads<=0) nthreads = 1;
        if(nthreads>512) nthreads = 512;
    }
    else
    {
        nblocks = 1;
        nthreads = 1;
    }
    T *rets = NULL;
    T *devrets = NULL;
    rets = new T[nblocks*nthreads];
    cudaMalloc(&devrets,sizeof(T)*nblocks*nthreads);
    dbuff_sum_kf<<<nblocks,nthreads>>>(dbuff,N,devrets);
    cudaDeviceSynchronize();
    err = cudaGetLastError();
    if(err!=cudaSuccess)
    {
        printf("amscu::dbuff_sum error: %s\n",cudaGetErrorString(err));
    }
    cudaMemcpy(rets,devrets,sizeof(T)*nblocks*nthreads,cudaMemcpyDeviceToHost);
    // Reduce the per-thread partial sums on the host.
    ret = (T)0;
    for(I=0;I<nblocks*nthreads;I++)
    {
        ret = ret + rets[I];
    }
    cudaFree(devrets); devrets = NULL;
    delete[] rets;
    return ret;
}

// Sum a cuarray<T> whose struct itself resides in device memory: copy the
// struct header down to the host, then reduce its data buffer with dbuff_sum.
template<typename T> T devcuarray_sum(cuarray<T> *devptr)
{
    T ret = T();
    cuarray<T> ldptr;
    cudaError_t err = cudaMemcpy(&ldptr,devptr,sizeof(cuarray<T>),cudaMemcpyDeviceToHost);
    if(err!=cudaSuccess)
    {
        printf("amscu::devcuarray_sum error: %s\n",cudaGetErrorString(err));
        return ret;
    }
    ret = dbuff_sum(ldptr.data,ldptr.length);
    // Detach the borrowed device pointer so ldptr's destructor does not free it.
    ldptr.data = NULL;
    ldptr.length = 0;
    return ret;
}
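
// Usage sketch (assumption: d_x is a device buffer of n floats, already
// allocated with cudaMalloc and filled by the caller; not part of this header):
//
//     float total = amscuda::dbuff_sum(d_x, n);  // partial sums on device, final reduce on host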
// Kernel: each thread computes a grid-strided partial max/min of devbuffer in
// maxs[thread id]/mins[thread id]. The host launch keeps the total thread
// count at or below N, so every slot is written at least once (at I==I0).
template<typename T> __global__ void dbuff_minmax_kf(T *devbuffer, int N, T *maxs, T *mins)
{
    int I0 = threadIdx.x + blockIdx.x*blockDim.x;
    int Is = blockDim.x*gridDim.x;
    int I;
    for(I=I0;I<N;I=I+Is)
    {
        if(I==I0)
        {
            // First element seen by this thread initializes its partials.
            maxs[I0] = devbuffer[I];
            mins[I0] = devbuffer[I];
        }
        else
        {
            if(devbuffer[I]>maxs[I0])
            {
                maxs[I0] = devbuffer[I];
            }
            if(devbuffer[I]<mins[I0])
            {
                mins[I0] = devbuffer[I];
            }
        }
    }
    return;
}
// Host-side min/max of a device buffer via the partial min/max kernel.
// On a NULL buffer or empty range, both outputs are set to T(0).
template<typename T> void dbuff_minmax(T *devbuffer, int N, T *min, T *max)
{
    cudaError_t err = cudaSuccess;
    int nblocks;
    int nthreads;
    int I;
    T *maxs = NULL;
    T *dev_maxs = NULL;
    T *mins = NULL;
    T *dev_mins = NULL;
    T localmax = T(0);
    T localmin = T(0);
    if(devbuffer==NULL || N<=0)
    {
        if(min!=NULL) *min = T(0);
        if(max!=NULL) *max = T(0);
        return;
    }
    // Heuristic launch shape; keeps nblocks*nthreads <= N so every kernel
    // thread owns at least one element and initializes its output slots.
    if(N>25)
    {
        nblocks = 25;
        nthreads = (int)sqrt((float)(N/nblocks));
        if(nthreads<1) nthreads = 1;
        if(nthreads>512) nthreads = 512;
    }
    else
    {
        nblocks = 1;
        nthreads = 1;
    }
    maxs = new T[nblocks*nthreads];
    mins = new T[nblocks*nthreads];
    cudaMalloc(&dev_maxs,sizeof(T)*nblocks*nthreads);
    cudaMalloc(&dev_mins,sizeof(T)*nblocks*nthreads);
    dbuff_minmax_kf<<<nblocks,nthreads>>>(devbuffer,N,dev_maxs,dev_mins);
    cudaDeviceSynchronize();
    err = cudaGetLastError();
    if(err!=cudaSuccess)
    {
        printf("amscu::dbuff_minmax error: %s\n",cudaGetErrorString(err));
    }
    cudaMemcpy(maxs,dev_maxs,sizeof(T)*nblocks*nthreads,cudaMemcpyDeviceToHost);
    cudaMemcpy(mins,dev_mins,sizeof(T)*nblocks*nthreads,cudaMemcpyDeviceToHost);
    // Reduce the per-thread partials on the host.
    for(I=0;I<nblocks*nthreads;I++)
    {
        if(I==0)
        {
            localmax = maxs[0];
            localmin = mins[0];
        }
        else
        {
            if(maxs[I]>localmax) localmax = maxs[I];
            if(mins[I]<localmin) localmin = mins[I];
        }
    }
    if(max!=NULL) *max = localmax;
    if(min!=NULL) *min = localmin;
    cudaFree(dev_maxs); dev_maxs = NULL;
    cudaFree(dev_mins); dev_mins = NULL;
    delete[] maxs; maxs = NULL;
    delete[] mins; mins = NULL;
    return;
}
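
// Usage sketch (assumption: d_x/n as above; mn and mx are host floats):
//
//     float mn, mx;
//     amscuda::dbuff_minmax(d_x, n, &mn, &mx);  // note argument order: min first, then max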
// Kernel: grid-stride loop writing the constant setto into every element.
template<typename T> __global__ void dbuff_setall_kf(T *devbuffer, int N, T setto)
{
    int I0 = threadIdx.x + blockIdx.x*blockDim.x;
    int Is = blockDim.x*gridDim.x;
    int I;
    for(I=I0;I<N;I=I+Is)
    {
        devbuffer[I] = setto;
    }
    return;
}
// Host wrapper: fill a device buffer with a constant, using a caller-chosen
// launch shape.
template<typename T> void dbuff_setall(T *devbuffer, int N, T setto, int nblocks, int nthreads)
{
    cudaError_t err = cudaSuccess;
    if(devbuffer==NULL || N<=0)
    {
        return;
    }
    dbuff_setall_kf<<<nblocks,nthreads>>>(devbuffer,N,setto);
    cudaDeviceSynchronize();
    err = cudaGetLastError();
    if(err!=cudaSuccess)
    {
        printf("amscu::dbuff_setall error: %s\n",cudaGetErrorString(err));
    }
    return;
}
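
// Usage sketch (assumption: 10 blocks of 128 threads is an example shape;
// any grid works, since the stride loop covers the whole buffer):
//
//     amscuda::dbuff_setall(d_x, n, 0.0f, 10, 128);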
// Kernel: elementwise dbuf_out[I] = fpnt(dbuf_a[I], dbuf_b[I]) over a
// grid-stride loop. The operation is passed as a functor object whose
// operator() is device-callable (a host function pointer cannot be
// dereferenced inside a kernel).
template<typename T1, typename T2, typename T3, typename F> __global__ void dbuff_vectorbinop_kf1(T1 *dbuf_a, T2 *dbuf_b, T3 *dbuf_out, int N, F fpnt)
{
    int I0 = threadIdx.x + blockIdx.x*blockDim.x;
    int Is = blockDim.x*gridDim.x;
    int I;
    T1 a;
    T2 b;
    T3 c;
    for(I=I0;I<N;I=I+Is)
    {
        a = dbuf_a[I];
        b = dbuf_b[I];
        c = fpnt(a,b);
        dbuf_out[I] = c;
    }
    return;
}

// Kernel: elementwise dbuf_out[I] = fpnt(dbuf_a[I], par_b) with a constant
// second operand, over a grid-stride loop.
template<typename T1, typename T2, typename T3, typename F> __global__ void dbuff_vectorbinop_kf2(T1 *dbuf_a, T2 par_b, T3 *dbuf_out, int N, F fpnt)
{
    int I0 = threadIdx.x + blockIdx.x*blockDim.x;
    int Is = blockDim.x*gridDim.x;
    int I;
    T1 a;
    T2 b;
    T3 c;
    for(I=I0;I<N;I=I+Is)
    {
        a = dbuf_a[I];
        b = par_b;
        c = fpnt(a,b);
        dbuf_out[I] = c;
    }
    return;
}
// Elementwise device-buffer vector binary operation:
// takes two input device arrays (a, b) --> one output device array.
template<typename T1, typename T2, typename T3, typename F> void dbuff_vectorbinop(T1 *dbuf_a, T2 *dbuf_b, T3 *dbuf_out, int N, F fpnt, int nblocks, int nthreads)
{
    cudaError_t err = cudaSuccess;
    if(dbuf_a == NULL || dbuf_b == NULL || dbuf_out == NULL || N<=0)
    {
        return;
    }
    dbuff_vectorbinop_kf1<<<nblocks,nthreads>>>(dbuf_a,dbuf_b,dbuf_out,N,fpnt);
    cudaDeviceSynchronize();
    err = cudaGetLastError();
    if(err!=cudaSuccess)
    {
        printf("amscu::dbuff_vectorbinop error: %s\n",cudaGetErrorString(err));
    }
    return;
}
// Elementwise device-buffer vector two-parameter operation:
// takes one input device array and a constant parameter (a, b) --> one output
// device array.
template<typename T1, typename T2, typename T3, typename F> void dbuff_vectorbinop(T1 *dbuf_a, T2 par_b, T3 *dbuf_out, int N, F fpnt, int nblocks, int nthreads)
{
    cudaError_t err = cudaSuccess;
    if(dbuf_a == NULL || dbuf_out == NULL || N<=0)
    {
        return;
    }
    dbuff_vectorbinop_kf2<<<nblocks,nthreads>>>(dbuf_a,par_b,dbuf_out,N,fpnt);
    cudaDeviceSynchronize();
    err = cudaGetLastError();
    if(err!=cudaSuccess)
    {
        printf("amscu::dbuff_vectorbinop error: %s\n",cudaGetErrorString(err));
    }
    return;
}
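
// Usage sketch for a custom elementwise operation (hypothetical functor
// "scale_add_fn", not part of this header; any struct with a device-callable
// operator()(T1,T2) works the same way):
//
//     struct scale_add_fn
//     {
//         __device__ float operator()(float a, float b) const { return 2.0f*a + b; }
//     };
//     amscuda::dbuff_vectorbinop(d_a, d_b, d_out, n, scale_add_fn(), 10, 128);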
// Addition functor; operator() is device-callable so the kernels can apply it.
template<typename T> struct dbuff_add_fn
{
    __device__ T operator()(T a, T b) const { return a+b; }
};
template<typename T> void dbuff_add(T *dbuff_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads)
{
    dbuff_vectorbinop(dbuff_a,dbuff_b,dbuff_out,N,dbuff_add_fn<T>(),nblocks,nthreads);
    return;
}
template<typename T> void dbuff_add(T *dbuff_a, T par_b, T *dbuff_out, int N, int nblocks, int nthreads)
{
    dbuff_vectorbinop(dbuff_a,par_b,dbuff_out,N,dbuff_add_fn<T>(),nblocks,nthreads);
    return;
}
// Subtraction functor.
template<typename T> struct dbuff_sub_fn
{
    __device__ T operator()(T a, T b) const { return a-b; }
};
template<typename T> void dbuff_sub(T *dbuff_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads)
{
    dbuff_vectorbinop(dbuff_a,dbuff_b,dbuff_out,N,dbuff_sub_fn<T>(),nblocks,nthreads);
    return;
}
template<typename T> void dbuff_sub(T *dbuff_a, T par_b, T *dbuff_out, int N, int nblocks, int nthreads)
{
    dbuff_vectorbinop(dbuff_a,par_b,dbuff_out,N,dbuff_sub_fn<T>(),nblocks,nthreads);
    return;
}
// Multiplication functor.
template<typename T> struct dbuff_mult_fn
{
    __device__ T operator()(T a, T b) const { return a*b; }
};
template<typename T> void dbuff_mult(T *dbuff_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads)
{
    dbuff_vectorbinop(dbuff_a,dbuff_b,dbuff_out,N,dbuff_mult_fn<T>(),nblocks,nthreads);
    return;
}
template<typename T> void dbuff_mult(T *dbuff_a, T par_b, T *dbuff_out, int N, int nblocks, int nthreads)
{
    dbuff_vectorbinop(dbuff_a,par_b,dbuff_out,N,dbuff_mult_fn<T>(),nblocks,nthreads);
    return;
}
// Division functor.
template<typename T> struct dbuff_div_fn
{
    __device__ T operator()(T a, T b) const { return a/b; }
};
template<typename T> void dbuff_div(T *dbuff_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads)
{
    dbuff_vectorbinop(dbuff_a,dbuff_b,dbuff_out,N,dbuff_div_fn<T>(),nblocks,nthreads);
    return;
}
template<typename T> void dbuff_div(T *dbuff_a, T par_b, T *dbuff_out, int N, int nblocks, int nthreads)
{
    dbuff_vectorbinop(dbuff_a,par_b,dbuff_out,N,dbuff_div_fn<T>(),nblocks,nthreads);
    return;
}
// Left-division functor: returns b/a, used for scalar-by-vector division.
template<typename T> struct dbuff_ldiv_fn
{
    __device__ T operator()(T a, T b) const { return b/a; }
};
// Scalar-by-vector division: dbuff_out[I] = par_a / dbuff_b[I].
template<typename T> void dbuff_div(T par_a, T *dbuff_b, T *dbuff_out, int N, int nblocks, int nthreads)
{
    dbuff_vectorbinop(dbuff_b,par_a,dbuff_out,N,dbuff_ldiv_fn<T>(),nblocks,nthreads);
    return;
}
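
// Usage sketch for the arithmetic wrappers (assumption: d_a, d_b, d_out are
// device buffers of n floats; 10 blocks of 128 threads as an example shape):
//
//     amscuda::dbuff_add(d_a, d_b, d_out, n, 10, 128);   // d_out[I] = d_a[I] + d_b[I]
//     amscuda::dbuff_mult(d_a, 2.0f, d_out, n, 10, 128); // d_out[I] = d_a[I] * 2
//     amscuda::dbuff_div(1.0f, d_b, d_out, n, 10, 128);  // d_out[I] = 1 / d_b[I]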
} // namespace amscuda

#endif // __AMSCUARRAY_DOPS_IMPL_HPP__