File: testpycuda.py

package info (click to toggle)
pyhst2 2020c-1
links: PTS, VCS
area: contrib
in suites: bullseye
size: 12,532 kB
sloc: ansic: 11,807; python: 9,663; cpp: 6,786; makefile: 147; sh: 31
file content (369 lines) | stat: -rw-r--r-- 9,470 bytes
parent folder | download | duplicates (2)
import pycuda.driver as drv
import numpy as np
import math
import time
from pycuda.compiler import SourceModule
import h5py

def iceil(x):
    return int(math.ceil(x))


def create_2D_array(np_array):
    h,w = np_array.shape
    descr = drv.ArrayDescriptor()
    descr.width = w
    descr.height = h

    descr.format = drv.dtype_to_array_format(np_array.dtype)
    descr.num_channels = 1
    descr.flags = 0
    device_array = drv.Array(descr)
        
    return device_array

def cu_mem_to_array2D(device_array, cu_mem, h,w):
    #  h, w = np_array.shape
    copy = drv.Memcpy2D()
    print(" COPYING ", cu_mem, " SU ", device_array ) 
    copy.set_src_device(cu_mem)
    copy.set_dst_array(device_array)
    copy.width_in_bytes = copy.src_pitch = w*4
    copy.src_height = copy.height = h
    copy(aligned=False)
    print(" OK " )
    
def host_to_array2D(device_array, npmem):
    #  h, w = np_array.shape
    copy = drv.Memcpy2D()
    copy.set_src_host(npmem)
    copy.set_dst_array(device_array)
    
    print(npmem.dtype)
    print(npmem.shape)
    
    copy.width_in_bytes = copy.src_pitch = npmem.shape[1]*4
    copy.src_height = copy.height = npmem.shape[0]
    copy(aligned=False)




def cu_mem_to_array(device_array, cu_mem, d,h,w):
    # d, h, w = np_array.shape
    copy = drv.Memcpy3D()
    copy.set_src_device(cu_mem)
    copy.set_dst_array(device_array)
    copy.width_in_bytes = copy.src_pitch = w*4
    copy.src_height = copy.height = h
    copy.depth = d
    copy()



def write( cu_mem, filename,datasetname, mode,stp):
        if not isinstance(cu_mem, (np.ndarray, np.generic) ):
            if cu_mem in stp.cu_vols+[stp.cu_reserved_vol]:

                vol_data_recv = np.zeros([stp.NY,stp.NX]    ,"f")
                drv.memcpy_dtoh(  vol_data_recv    ,  cu_mem )

                h5py.File(filename,mode)[datasetname] =vol_data_recv

            elif cu_mem in  stp.cu_sinos+[stp.cu_reserved_sino]:
                vol_data_recv = np.zeros([stp.NA,stp.NX]    ,"f")
                drv.memcpy_dtoh(  vol_data_recv    ,  cu_mem )
                h5py.File(filename,mode)[datasetname] =vol_data_recv
        else:

            h5py.File(filename,mode)[datasetname] =cu_mem



def setup_calculation(  Nvols=2, Nsinos = 2,NX=2048, NA=1500) :


    """            -----------------
    CODICE CUDA
    -----------------
    """
    kernels_space = SourceModule("""
    texture<float, cudaTextureType2D, cudaReadModeElementType> tex_sino;
    texture<float, cudaTextureType2D, cudaReadModeElementType> tex_vol ;
    
    
    #define CUDART_PI_F 3.141592654f 
    
    

    
    __global__ void forwproject( float * sino ,int NX, int NY, int NA, float da,  float  ax_pos)
    {
     const int tix = threadIdx.x;
     const int tia = threadIdx.y;
    
     const int bidx = blockIdx.x;
     const int bida = blockIdx.y;

     int ix = (bidx* blockDim.x + tix) ; 
     int ia  = bida* blockDim.y + tia ; 

     int ip =  ix + NX*( ia ) ; 

     float centerX = (NX-1.0f)*0.5f;
     float centerY = (NY-1.0f)*0.5f;


     if( ix<NX && ia<NA ) {

       float angle = ia*da ; 
       float cosa = cosf(angle);
       float sina = sinf(angle);
	
       if(fabs(cosa)>fabs(sina)) {
	  
	 float sum = 0.0;
	 for(int vy = 0; vy<NY; vy++) {
	   float fx;
	   fx = centerX + ( (ix-ax_pos)  + (vy-centerY)*sina) /cosa ;
           float b = tex2D(tex_vol,  fx  + 0.5f  , vy + 0.5f    ) ;
	   sum += (b)/fabs(cosa); 
         }
	 sino[ip] = sum;
       } else {
	 float sum = 0.0;
	 for(int vx = 0; vx<NX; vx++) {
	   float fy;
	   fy = centerY + ( -(ix-ax_pos)  + (vx-centerX)*cosa) /sina ;
	   float b = tex2D (tex_vol,  vx+0.5f    , fy +0.5f );
           sum += (b)/fabs(sina);
         }
	 sino[ip] = sum;
       }
     }
    }
    __global__ void backproject( float * vol ,int NX, int NY, int NA, float da,float  ax_pos)
    {
       const int tix = threadIdx.x;
       const int tiy = threadIdx.y;

       const int bidx = blockIdx.x;
       const int bidy = blockIdx.y;


       int ix4 = (bidx* blockDim.x + tix)*4 ; 
       int iy2 = (bidy* blockDim.y + tiy)*2 ; 

       float centerX = (NX-1.0f)*0.5f;
       float centerY = (NY-1.0f)*0.5f;


       int ix;
       int iy;
       float sum[8] ; 
       int k;
       for(k=0; k<8; k++)  sum[k]=0.0;
       

       for(int ia=0; ia<NA; ia++) {
	 float angle = ia*da ; 
	 float cosa = cosf(angle);
	 float sina = sinf(angle);
         k=0;
	 for(ix=ix4; ix<ix4+4; ix++) {
	   for(iy=iy2; iy<iy2+2; iy++) {

	     
	     if( ix<NX && iy<NY ) {	       
	       float fx = ax_pos + cosa* (  ix-centerX   ) - sina*(iy-centerY) ;
	       float b =  tex2D(tex_sino, fx+0.5f    , ia+0.5f  );
	       sum[k] += b; 
	     }	 
             k++;
	   }
	 }
       }
       k=0;
       for(ix=ix4; ix<ix4+4; ix++) {
          for(iy=iy2; iy<iy2+2; iy++) {
             if( ix<NX && iy<NY ) {
                int ip =  ix + NX*( iy ) ; 
                vol[ip] = sum[k];
             }
             k++;
           }
       }
      }
    """ 
                                 ###  , arch="sm_70"
                                 )

    class Kspace:
        tex_vol            =kernels_space.get_texref('tex_vol' )
        tex_sino           =kernels_space.get_texref('tex_sino')
        backproject = kernels_space.get_function("backproject")
        forwproject = kernels_space.get_function("forwproject")
    



    
    class setup_return_object:
        pass

    stp = setup_return_object()
    
    NY = NX

    stp.size_of_dtype = 4

    """              --------------------
              PREPARATION TEXTURE
              --------------------
    """
    tex_vol            =Kspace.tex_vol
    tex_sino           =Kspace.tex_sino

    for tex in tex_vol,tex_sino:
        # tex.filterMode     = drv.filter_mode.LINEAR

        # tex.set_filter_mode(drv.filter_mode.POINT)
        tex.set_filter_mode(drv.filter_mode.LINEAR)

        tex.set_address_mode(0, drv.address_mode.BORDER)  
        tex.set_address_mode(1, drv.address_mode.BORDER)
        tex.normalized = False

    stp.tex_vol  = tex_vol
    stp.tex_sino = tex_sino

        
    """
                ------------------------
                creation array  entree/sortie   cuda
                ------------------------
    """
    
    np_tmp     = np.zeros( [NY,NX], "f" )
    print( " alloco ",      NY,NX ,  NY*NX*4,  np_tmp.nbytes , "  x ", Nvols )
    cu_vols = [ drv.mem_alloc( np_tmp.nbytes ) for i in range(Nvols) ]
    stp.cu_reserved_vol = drv.mem_alloc( np_tmp.nbytes )
    
    stp.vol_array = create_2D_array(np_tmp)
    stp.tex_vol.set_array(stp.vol_array)

    ### SINOS    
    np_tmp     = np.zeros( [NA,NX], "f" )
    print( " alloco ",      NA,NX ,  NA*NX*4,  np_tmp.nbytes , "  x ", Nsinos )

    cu_sinos = [  drv.mem_alloc( np_tmp.nbytes ) for i in range(Nsinos) ]
    stp.cu_reserved_sino = drv.mem_alloc( np_tmp.nbytes )


    
    stp.sino_array = create_2D_array(np_tmp)
    stp.tex_sino.set_array(stp.sino_array)


    stp.cu_vols = cu_vols
    stp.cu_sinos = cu_sinos
    stp.NA = NA
    stp.NY = NY
    stp.NX = NX
    
    stp.cu_NA = np.int32(NA)
    stp.cu_NY = np.int32(NY)
    stp.cu_NX = np.int32(NX)

    stp.cufunc_backproject = Kspace.backproject
    stp.cufunc_forwproject = Kspace.forwproject


    
    stp.cu_sinos = stp.cu_sinos

    return stp
         



if __name__ == "__main__":
    gpuid = 0
    ax_pos = (2048-1)/2.0
    NX = 2048
    NA = 1500
    
    NX = 1024
    NA = 1024
    
    da = 180.0/NA *np.pi/180.0

    Ncall = 1024
    
    drv.init()
    cudadevice = drv.Device(gpuid)
    attrs=cudadevice.get_attributes()
    print("\n===Attributes for device %d"%gpuid)
    for (key,value) in attrs.items():
        print("%s:%s"%(str(key),str(value)))
    cudacontext = cudadevice.make_context()
    
    

    Nvols = 2
    Nsinos = 1
    stp  = setup_calculation( NX = NX, NA=NA,     Nvols = Nvols, Nsinos=Nsinos)

    i_init_vol = 0
    i_final_vol = 1
    i_data_sino = 0
    
    vol_ini = np.zeros([ NX,NX  ],"f")
    vol_ini[ int(NX*0.9):int(NX*0.9)+4   ,  NX//2:NX//2+4    ]=1.0


    host_to_array2D(stp.vol_array   ,  vol_ini )
    stp.tex_vol.set_array(stp.vol_array)




    bsx=16
    bsy=16    
    block=(bsx,bsy,1)
    grid=( iceil( float(NX/float((bsx)))), iceil( float(NA)/float((bsy))  ), 1)

    

    
    stp.cufunc_forwproject(stp.cu_sinos[i_data_sino] ,   np.int32(NX),    np.int32(NX),   np.int32(NA),  np.float32(da),  np.float32(ax_pos), block=block, grid=grid ) 


    cu_mem_to_array2D(stp.sino_array ,  stp.cu_sinos[i_data_sino] , NA, NX )
    stp.tex_sino.set_array(stp.sino_array)


    time0 = time.time()

    bsx=16
    bsy=16    
    block=(bsx,bsy,1)
    grid=( iceil( float(NX/float((bsx*4)))), iceil( float(NX)/float((bsy*2))  ), 1)

    
    vol_data_recv = np.zeros([1],"f")
    
    for i in range( Ncall):
        print ( i)
        stp.cufunc_backproject( stp.cu_vols[i_final_vol] ,    np.int32(NX),    np.int32(NX),   np.int32(NA),  np.float32(da),  np.float32(ax_pos),
        block=block, grid=grid)
        drv.memcpy_dtoh(  vol_data_recv    ,  stp.cu_vols[i_final_vol]  ) # just une lecture de un float pour forcer la synchronisation avec la sortie du kernel

    time1 = time.time()
    print(" NSECONDS = ", time1-time0 ,"  pour Ncall =  ",  Ncall)
    
    write( stp.cu_vols[i_final_vol]  , "final.h5","vol", "w",stp)
    write( stp.cu_sinos[i_data_sino]  , "final.h5","init_sino", "r+",stp)

        
    cudacontext.pop()