amsculib3/old/9apr26_prerefactor/include/amsculib2/amscurarray_impl.cuh

#ifndef __AMSCURARRAY_IMPL_HPP__
#define __AMSCURARRAY_IMPL_HPP__

namespace amscuda
{

template<typename T> curarray<T>::curarray()
{
    device = -1;
    devptr = NULL;
    Narrays = 0;
    N = NULL;
    hostarrayptrs = NULL;
    devarrayptrs = NULL;

}

template<typename T> curarray<T>::~curarray()
{
    device = -1;
    devptr = NULL;
    Narrays = 0;
    N = NULL;
    hostarrayptrs = NULL;
    devarrayptrs = NULL;

}

template<typename T> int curarray_new(curarray<T>** ptr, int Narrays)
{
    int ret = 0;
    int device;
    curarray<T> *lhptr = *ptr;

    cudaGetDevice(&device);

    if(lhptr!=NULL)
    {
        curarray_delete(ptr);
    }

    *ptr = new(std::nothrow) curarray<T>();
    lhptr = *ptr;

    int I;

    if(Narrays<0) Narrays=0;

    lhptr->Narrays = Narrays;
    lhptr->device = device;
    lhptr->N = new(std::nothrow) int[Narrays];
    lhptr->hostarrayptrs = new(std::nothrow) T*[Narrays];
    lhptr->devarrayptrs = new(std::nothrow) T*[Narrays];

    for(I=0;I<Narrays;I++)
    {
        lhptr->N[I] = 0;
        lhptr->hostarrayptrs[I] = NULL;
        lhptr->devarrayptrs[I] = NULL;
    }

    curarray_device_new(lhptr);

    return ret;
}

template<typename T> int curarray_delete(curarray<T>** ptr)
{
    int ret = 0;
    curarray<T> *lptr = NULL;
    int olddev;

    int I;

    if(*ptr==NULL)
    {
        return 0;
    }
    lptr = *ptr;

    cudaGetDevice(&olddev);
    cudaSetDevice(lptr->device);


    if(lptr->devptr!=NULL)
    {
        curarray_device_delete(lptr);
    }

    lptr->device = -1;

    for(I=0;I<lptr->Narrays;I++)
    {
        if(lptr->hostarrayptrs!=NULL)
        {
            if(lptr->hostarrayptrs[I]!=NULL)
            {
                delete[] lptr->hostarrayptrs[I];
                lptr->hostarrayptrs[I] = NULL;
            }
        }

        if(lptr->devarrayptrs!=NULL)
        {
            if(lptr->devarrayptrs[I]!=NULL)
            {
                //erasing device memory should have been handled in curarray_device_delete
                lptr->devarrayptrs[I] = NULL;
            }
        }

        lptr->N[I] = 0;
    }

    if(lptr->N != NULL) {delete[] lptr->N; lptr->N = NULL;}
    if(lptr->hostarrayptrs!=NULL) {delete[] lptr->hostarrayptrs; lptr->hostarrayptrs=NULL;}
    if(lptr->devarrayptrs!=NULL) {delete[] lptr->devarrayptrs; lptr->devarrayptrs=NULL;}

    if(*ptr!=NULL) {delete *ptr; *ptr = NULL;}

    cudaSetDevice(olddev);

    return ret;
}

template<typename T> int curarray_device_new(curarray<T> *hostptr)
{
    int ret = 0;
    curarray<T> ldevdata;

    if(hostptr==NULL) return -1;
    if(hostptr->devptr!=NULL)
    {
        curarray_device_delete(hostptr);
    }

    int I;

    cudaGetDevice(&(hostptr->device));

    ldevdata.device = hostptr->device;
    ldevdata.Narrays = hostptr->Narrays;
    int Narrays = hostptr->Narrays;

    cudaMalloc(&(ldevdata.N),sizeof(int)*Narrays);
    cudaMemcpy(ldevdata.N,hostptr->N,sizeof(int)*Narrays,cudaMemcpyHostToDevice);

    ldevdata.hostarrayptrs = NULL;

    for(I=0;I<Narrays;I++)
    {
        if(hostptr->N[I]>0)
        {
            if(hostptr->devarrayptrs[I]!=NULL)
            {
                cudaFree(hostptr->devarrayptrs[I]);
                hostptr->devarrayptrs[I] = NULL;
            }

            cudaMalloc(&(hostptr->devarrayptrs[I]),sizeof(T)*hostptr->N[I]);
            cudaMemcpy(hostptr->devarrayptrs[I],hostptr->hostarrayptrs[I],sizeof(T)*hostptr->N[I],cudaMemcpyHostToDevice);
        }
        else
        {
            if(hostptr->devarrayptrs[I]!=NULL)
            {
                cudaFree(hostptr->devarrayptrs[I]);
                hostptr->devarrayptrs[I] = NULL;
            }
        }
    }

    cudaMalloc(&(ldevdata.devarrayptrs),sizeof(T*)*Narrays);
    cudaMemcpy(ldevdata.devarrayptrs,hostptr->devarrayptrs,sizeof(T*)*Narrays,cudaMemcpyHostToDevice);

    cudaMalloc(&(hostptr->devptr),sizeof(curarray<T>));
    cudaMemcpy(hostptr->devptr,&ldevdata,sizeof(curarray<T>),cudaMemcpyHostToDevice);

    ret = 1;

    return ret;
}

template<typename T> int curarray_device_delete(curarray<T> *hostptr)
{
    int ret = 0;

    curarray<T> ldevdata;
    int olddev;

    if(hostptr->devptr==NULL)
    {
        return 0;
    }

    cudaGetDevice(&olddev);
    cudaSetDevice(hostptr->device);

    cudaMemcpy(&ldevdata,hostptr->devptr,sizeof(curarray<T>),cudaMemcpyDeviceToHost);

    int I;
    for(I=0;I<hostptr->Narrays;I++)
    {
        if(hostptr->devarrayptrs[I]!=NULL)
        {
            cudaFree(hostptr->devarrayptrs[I]);
            hostptr->devarrayptrs[I] = NULL;
        }
    }

    cudaFree(ldevdata.devarrayptrs);
    cudaFree(ldevdata.N);

    cudaFree(hostptr->devptr);
    hostptr->devptr = NULL;
    hostptr->device = -1;

    cudaSetDevice(olddev);

    ret = 1;
    return ret;
}

template<typename T> int curarray_push(curarray<T> *hostptr)
{
    int ret = 0;

    int olddev;

    curarray<T> ldevdata;
    T** ldevarrayptrs = NULL;
    int *devN = NULL;

    if(hostptr==NULL) return -1;

    cudaGetDevice(&olddev);
    cudaSetDevice(hostptr->device);

    int Narrays = hostptr->Narrays;

    cudaMemcpy(&ldevdata,hostptr->devptr,sizeof(curarray<T>),cudaMemcpyDeviceToHost);
    ldevarrayptrs = new(std::nothrow) T*[Narrays];
    devN = new(std::nothrow) int[Narrays];

    cudaMemcpy(ldevarrayptrs,ldevdata.devarrayptrs,sizeof(T*)*Narrays,cudaMemcpyDeviceToHost);
    cudaMemcpy(devN,ldevdata.N,sizeof(int)*Narrays,cudaMemcpyDeviceToHost);


    int I;

    for(I=0;I<Narrays;I++)
    {
        //check to see that host size is the same as device size, and that
        //the host device pointer is the same as the device device pointer
        if( (hostptr->N[I]!=devN[I]) ||
            (hostptr->devarrayptrs[I] != ldevarrayptrs[I])
        )
        {
            cudaFree(ldevarrayptrs[I]);
            ldevarrayptrs[I] = NULL;
            hostptr->devarrayptrs[I] = NULL;

            if(hostptr->N[I]>0)
            {
                cudaMalloc(&(hostptr->devarrayptrs[I]),sizeof(T)*hostptr->N[I]);
                ldevarrayptrs[I] = hostptr->devarrayptrs[I];
                devN[I] = hostptr->N[I];
            }
            else
            {
                devN[I] = 0;
            }
        }
        if(hostptr->N[I]>0)
        {
            //copy host data to device
            cudaMemcpy(hostptr->devarrayptrs[I],hostptr->hostarrayptrs[I],sizeof(T)*hostptr->N[I],cudaMemcpyHostToDevice);
        }
    } //for each array

    //rectify and copy device data structure to device
    ldevdata.device = hostptr->device;
    ldevdata.devptr = NULL;
    ldevdata.Narrays = hostptr->Narrays; //later - logic for dealing with when this is not true
    ldevdata.hostarrayptrs = NULL;


    cudaMemcpy(ldevdata.N,hostptr->N,sizeof(int)*Narrays,cudaMemcpyHostToDevice);
    cudaMemcpy(ldevdata.devarrayptrs,hostptr->devarrayptrs,sizeof(T*)*Narrays,cudaMemcpyHostToDevice);

    cudaMemcpy(hostptr->devptr,&ldevdata,sizeof(curarray<T>),cudaMemcpyHostToDevice);

    cuda_errortrap("curarray_push cuda error:");

    cudaSetDevice(olddev);

    delete[] ldevarrayptrs;
    delete[] devN;

    return ret;
}

template<typename T> int curarray_pull(curarray<T> *hostptr)
{
    int ret = 0;

    int olddev;

    curarray<T> ldevdata;
    T** ldevarrayptrs = NULL;
    int *devN = NULL;

    if(hostptr==NULL) return -1;

    cudaGetDevice(&olddev);
    cudaSetDevice(hostptr->device);

    cuda_errortrap("dbg1");

    int Narrays = hostptr->Narrays;

    cudaMemcpy(&ldevdata,hostptr->devptr,sizeof(curarray<T>),cudaMemcpyDeviceToHost);
    ldevarrayptrs = new(std::nothrow) T*[Narrays];
    devN = new(std::nothrow) int[Narrays];

    cuda_errortrap("dbg2");

    cudaMemcpy(ldevarrayptrs,ldevdata.devarrayptrs,sizeof(T*)*Narrays,cudaMemcpyDeviceToHost);
    cudaMemcpy(devN,ldevdata.N,sizeof(int)*Narrays,cudaMemcpyDeviceToHost);

    cuda_errortrap("dbg3");
    char dbgjnk[50];

    int I;
    for(I=0;I<Narrays;I++)
    {
        //check to see that host size is the same as device size, and that
        //the host device pointer is the same as the device device pointer
        if(hostptr->devarrayptrs[I] != ldevarrayptrs[I])
        {
            hostptr->devarrayptrs[I] = ldevarrayptrs[I];
        }

        if(hostptr->N[I]!=devN[I])
        {
            if(hostptr->hostarrayptrs[I]!=NULL)
            {
                delete[] hostptr->hostarrayptrs[I];
                hostptr->hostarrayptrs[I] = NULL;
            }

            if(devN[I]>0)
            {
                hostptr->hostarrayptrs[I] = new(std::nothrow) T[devN[I]];
                hostptr->N[I] = devN[I];
            }
            else
            {
                hostptr->N[I] = 0;
            }
        }

        if(hostptr->hostarrayptrs[I]!=NULL && hostptr->devarrayptrs[I]!=NULL)
        {
            cudaMemcpy(hostptr->hostarrayptrs[I],hostptr->devarrayptrs[I],sizeof(T)*hostptr->N[I],cudaMemcpyDeviceToHost);
            sprintf(dbgjnk,"%d dbg %d",I,hostptr->N[I]);
            cuda_errortrap(dbgjnk);
        }
    } //for each array

    //for the pull operation, I don't think any update of the device data structure is necessary

    cudaSetDevice(olddev);

    delete[] ldevarrayptrs;
    delete[] devN;

    return ret;
}

template<typename T> __host__ int curarray<T>::push()
{
    return curarray_push(this);
}
template<typename T> __host__ int curarray<T>::pull()
{
    return curarray_pull(this);
}


/*
https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#memory-allocation-and-lifetime%5B/url%5D

cudaMalloc() and cudaFree() have distinct semantics between the host and
device environments. When invoked from the host, cudaMalloc() allocates a
new region from unused device memory. When invoked from the device runtime
these functions map to device-side malloc() and free(). This implies that
within the device environment the total allocatable memory is limited to the
device malloc() heap size, which may be smaller than the available unused
device memory. Also, it is an error to invoke cudaFree() from the host
program on a pointer which was allocated by cudaMalloc() on the device
or vice-versa.

So, basically this entire function is not going to work. I'll be unable to resize within
a kernel.
*/

/*

template<typename T> __device__ int curarray<T>::dev_resizearray(int arraynum, int arraysize)
{
    int ret = 0;
    T* newptr = NULL;
    int I;
    T def;

    if(arraynum>=0 && arraynum<Narrays)
    {
        if(N[arraynum]!=arraysize)
        {
            if(arraysize<=0)
            {
                if(devarrayptrs[arraynum]!=NULL) cudaFree(devarrayptrs[arraynum]);
                devarrayptrs[arraynum] = NULL;
                N[arraynum] = 0;
                ret = 1;
                return ret;
            }

            cudaMalloc(&newptr,arraysize*sizeof(T));
            if(newptr!=NULL)
            {
                //do I want to assume there is a copy operator? (operator=)
                //for now, yes - write a more restrictive class later if I don't want it
                if(devarrayptrs[arraynum]!=NULL)
                {
                    for(I=0;I<N[arraynum]&&I<arraysize;I++)
                    {
                        newptr[I] = devarrayptrs[arraynum][I];
                    }
                }
                for(I=N[arraynum];I<arraysize;I++)
                {
                    newptr[I] = def;
                }

                if(devarrayptrs[arraynum]!=NULL) cudaFree(devarrayptrs[arraynum]);
                devarrayptrs[arraynum] = newptr;
                N[arraynum] = arraysize;
                ret = 1;
            }
            else
            {
                ret = -1;
            }
        }
        else
        {
            ret = 1;
        }
    }

    return ret;
}

*/

template<typename T> __host__ int curarray<T>::resizearray(int arraynum, int arraysize)
{
    int ret = 0;
    T* newptr = NULL;
    int I;
    T def;

    if(arraynum>=0 && arraynum<Narrays)
    {
        if(N[arraynum]!=arraysize)
        {
            if(arraysize<=0)
            {
                delete[] hostarrayptrs[arraynum];
                hostarrayptrs[arraynum] = NULL;
                N[arraynum] = 0;
                ret = 1;
                return ret;
            }

            newptr = new(std::nothrow) T[arraysize];
            if(newptr!=NULL)
            {
                //do I want to assume there is a copy operator? (operator=)
                //for now, yes - write a more restrictive class later if I don't want it

                if(hostarrayptrs[arraynum]!=NULL)
                {
                    for(I=0;I<N[arraynum]&&I<arraysize;I++)
                    {
                        newptr[I] = hostarrayptrs[arraynum][I];
                    }
                }
                for(I=N[arraynum];I<arraysize;I++)
                {
                    newptr[I] = def;
                }

                //cudaFree(hostarrayptrs[arraynum]);
                delete[] hostarrayptrs[arraynum];
                hostarrayptrs[arraynum] = newptr;
                N[arraynum] = arraysize;
                ret = 1;
            }
            else
            {
                ret = -1;
            }
        }
        else
        {
            ret = 1;
        }
    }

    return ret;
}

};

#endif