#ifndef __AMSCURARRAY_IMPL_HPP__ #define __AMSCURARRAY_IMPL_HPP__ namespace amscuda { template curarray::curarray() { device = -1; devptr = NULL; Narrays = 0; N = NULL; hostarrayptrs = NULL; devarrayptrs = NULL; } template curarray::~curarray() { device = -1; devptr = NULL; Narrays = 0; N = NULL; hostarrayptrs = NULL; devarrayptrs = NULL; } template int curarray_new(curarray** ptr, int Narrays) { int ret = 0; int device; curarray *lhptr = *ptr; cudaGetDevice(&device); if(lhptr!=NULL) { curarray_delete(ptr); } *ptr = new(std::nothrow) curarray(); lhptr = *ptr; int I; if(Narrays<0) Narrays=0; lhptr->Narrays = Narrays; lhptr->device = device; lhptr->N = new(std::nothrow) int[Narrays]; lhptr->hostarrayptrs = new(std::nothrow) T*[Narrays]; lhptr->devarrayptrs = new(std::nothrow) T*[Narrays]; for(I=0;IN[I] = 0; lhptr->hostarrayptrs[I] = NULL; lhptr->devarrayptrs[I] = NULL; } curarray_device_new(lhptr); return ret; } template int curarray_delete(curarray** ptr) { int ret = 0; curarray *lptr = NULL; int olddev; int I; if(*ptr==NULL) { return 0; } lptr = *ptr; cudaGetDevice(&olddev); cudaSetDevice(lptr->device); if(lptr->devptr!=NULL) { curarray_device_delete(lptr); } lptr->device = -1; for(I=0;INarrays;I++) { if(lptr->hostarrayptrs!=NULL) { if(lptr->hostarrayptrs[I]!=NULL) { delete[] lptr->hostarrayptrs[I]; lptr->hostarrayptrs[I] = NULL; } } if(lptr->devarrayptrs!=NULL) { if(lptr->devarrayptrs[I]!=NULL) { //erasing device memory should have been handled in curarray_device_delete lptr->devarrayptrs[I] = NULL; } } lptr->N[I] = 0; } if(lptr->N != NULL) {delete[] lptr->N; lptr->N = NULL;} if(lptr->hostarrayptrs!=NULL) {delete[] lptr->hostarrayptrs; lptr->hostarrayptrs=NULL;} if(lptr->devarrayptrs!=NULL) {delete[] lptr->devarrayptrs; lptr->devarrayptrs=NULL;} if(*ptr!=NULL) {delete *ptr; *ptr = NULL;} cudaSetDevice(olddev); return ret; } template int curarray_device_new(curarray *hostptr) { int ret = 0; curarray ldevdata; if(hostptr==NULL) return -1; if(hostptr->devptr!=NULL) { curarray_device_delete(hostptr); } int I; cudaGetDevice(&(hostptr->device)); ldevdata.device = hostptr->device; ldevdata.Narrays = hostptr->Narrays; int Narrays = hostptr->Narrays; cudaMalloc(&(ldevdata.N),sizeof(int)*Narrays); cudaMemcpy(ldevdata.N,hostptr->N,sizeof(int)*Narrays,cudaMemcpyHostToDevice); ldevdata.hostarrayptrs = NULL; for(I=0;IN[I]>0) { if(hostptr->devarrayptrs[I]!=NULL) { cudaFree(hostptr->devarrayptrs[I]); hostptr->devarrayptrs[I] = NULL; } cudaMalloc(&(hostptr->devarrayptrs[I]),sizeof(T)*hostptr->N[I]); cudaMemcpy(hostptr->devarrayptrs[I],hostptr->hostarrayptrs[I],sizeof(T)*hostptr->N[I],cudaMemcpyHostToDevice); } else { if(hostptr->devarrayptrs[I]!=NULL) { cudaFree(hostptr->devarrayptrs[I]); hostptr->devarrayptrs[I] = NULL; } } } cudaMalloc(&(ldevdata.devarrayptrs),sizeof(T*)*Narrays); cudaMemcpy(ldevdata.devarrayptrs,hostptr->devarrayptrs,sizeof(T*)*Narrays,cudaMemcpyHostToDevice); cudaMalloc(&(hostptr->devptr),sizeof(curarray)); cudaMemcpy(hostptr->devptr,&ldevdata,sizeof(curarray),cudaMemcpyHostToDevice); ret = 1; return ret; } template int curarray_device_delete(curarray *hostptr) { int ret = 0; curarray ldevdata; int olddev; if(hostptr->devptr==NULL) { return 0; } cudaGetDevice(&olddev); cudaSetDevice(hostptr->device); cudaMemcpy(&ldevdata,hostptr->devptr,sizeof(curarray),cudaMemcpyDeviceToHost); int I; for(I=0;INarrays;I++) { if(hostptr->devarrayptrs[I]!=NULL) { cudaFree(hostptr->devarrayptrs[I]); hostptr->devarrayptrs[I] = NULL; } } cudaFree(ldevdata.devarrayptrs); cudaFree(ldevdata.N); cudaFree(hostptr->devptr); hostptr->devptr = NULL; hostptr->device = -1; cudaSetDevice(olddev); ret = 1; return ret; } template int curarray_push(curarray *hostptr) { int ret = 0; int olddev; curarray ldevdata; T** ldevarrayptrs = NULL; int *devN = NULL; if(hostptr==NULL) return -1; cudaGetDevice(&olddev); cudaSetDevice(hostptr->device); int Narrays = hostptr->Narrays; cudaMemcpy(&ldevdata,hostptr->devptr,sizeof(curarray),cudaMemcpyDeviceToHost); ldevarrayptrs = new(std::nothrow) T*[Narrays]; devN = new(std::nothrow) int[Narrays]; cudaMemcpy(ldevarrayptrs,ldevdata.devarrayptrs,sizeof(T*)*Narrays,cudaMemcpyDeviceToHost); cudaMemcpy(devN,ldevdata.N,sizeof(int)*Narrays,cudaMemcpyDeviceToHost); int I; for(I=0;IN[I]!=devN[I]) || (hostptr->devarrayptrs[I] != ldevarrayptrs[I]) ) { cudaFree(ldevarrayptrs[I]); ldevarrayptrs[I] = NULL; hostptr->devarrayptrs[I] = NULL; if(hostptr->N[I]>0) { cudaMalloc(&(hostptr->devarrayptrs[I]),sizeof(T)*hostptr->N[I]); ldevarrayptrs[I] = hostptr->devarrayptrs[I]; devN[I] = hostptr->N[I]; } else { devN[I] = 0; } } if(hostptr->N[I]>0) { //copy host data to device cudaMemcpy(hostptr->devarrayptrs[I],hostptr->hostarrayptrs[I],sizeof(T)*hostptr->N[I],cudaMemcpyHostToDevice); } } //for each array //rectify and copy device data structure to device ldevdata.device = hostptr->device; ldevdata.devptr = NULL; ldevdata.Narrays = hostptr->Narrays; //later - logic for dealing with when this is not true ldevdata.hostarrayptrs = NULL; cudaMemcpy(ldevdata.N,hostptr->N,sizeof(int)*Narrays,cudaMemcpyHostToDevice); cudaMemcpy(ldevdata.devarrayptrs,hostptr->devarrayptrs,sizeof(T*)*Narrays,cudaMemcpyHostToDevice); cudaMemcpy(hostptr->devptr,&ldevdata,sizeof(curarray),cudaMemcpyHostToDevice); cuda_errortrap("curarray_push cuda error:"); cudaSetDevice(olddev); delete[] ldevarrayptrs; delete[] devN; return ret; } template int curarray_pull(curarray *hostptr) { int ret = 0; int olddev; curarray ldevdata; T** ldevarrayptrs = NULL; int *devN = NULL; if(hostptr==NULL) return -1; cudaGetDevice(&olddev); cudaSetDevice(hostptr->device); cuda_errortrap("dbg1"); int Narrays = hostptr->Narrays; cudaMemcpy(&ldevdata,hostptr->devptr,sizeof(curarray),cudaMemcpyDeviceToHost); ldevarrayptrs = new(std::nothrow) T*[Narrays]; devN = new(std::nothrow) int[Narrays]; cuda_errortrap("dbg2"); cudaMemcpy(ldevarrayptrs,ldevdata.devarrayptrs,sizeof(T*)*Narrays,cudaMemcpyDeviceToHost); cudaMemcpy(devN,ldevdata.N,sizeof(int)*Narrays,cudaMemcpyDeviceToHost); cuda_errortrap("dbg3"); char dbgjnk[50]; int I; for(I=0;Idevarrayptrs[I] != ldevarrayptrs[I]) { hostptr->devarrayptrs[I] = ldevarrayptrs[I]; } if(hostptr->N[I]!=devN[I]) { if(hostptr->hostarrayptrs[I]!=NULL) { delete[] hostptr->hostarrayptrs[I]; hostptr->hostarrayptrs[I] = NULL; } if(devN[I]>0) { hostptr->hostarrayptrs[I] = new(std::nothrow) T[devN[I]]; hostptr->N[I] = devN[I]; } else { hostptr->N[I] = 0; } } if(hostptr->hostarrayptrs[I]!=NULL && hostptr->devarrayptrs[I]!=NULL) { cudaMemcpy(hostptr->hostarrayptrs[I],hostptr->devarrayptrs[I],sizeof(T)*hostptr->N[I],cudaMemcpyDeviceToHost); sprintf(dbgjnk,"%d dbg %d",I,hostptr->N[I]); cuda_errortrap(dbgjnk); } } //for each array //for the pull operation, I don't think any update of the device data structure is necessary cudaSetDevice(olddev); delete[] ldevarrayptrs; delete[] devN; return ret; } template __host__ int curarray::push() { return curarray_push(this); } template __host__ int curarray::pull() { return curarray_pull(this); } /* https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#memory-allocation-and-lifetime%5B/url%5D cudaMalloc() and cudaFree() have distinct semantics between the host and device environments. When invoked from the host, cudaMalloc() allocates a new region from unused device memory. When invoked from the device runtime these functions map to device-side malloc() and free(). This implies that within the device environment the total allocatable memory is limited to the device malloc() heap size, which may be smaller than the available unused device memory. Also, it is an error to invoke cudaFree() from the host program on a pointer which was allocated by cudaMalloc() on the device or vice-versa. So, basically this entire function is not going to work. I'll be unable to resize within a kernel. */ /* template __device__ int curarray::dev_resizearray(int arraynum, int arraysize) { int ret = 0; T* newptr = NULL; int I; T def; if(arraynum>=0 && arraynum __host__ int curarray::resizearray(int arraynum, int arraysize) { int ret = 0; T* newptr = NULL; int I; T def; if(arraynum>=0 && arraynum