/*
 * File: wavelets.cu
 *
 * package info (click to toggle)
 * pyhst2 2020c-1
 * links: PTS, VCS
 * area: contrib
 * in suites: bullseye
 * size: 12,532 kB
 * sloc: ansic: 11,807; python: 9,663; cpp: 6,786; makefile: 147; sh: 31
 * file content (452 lines) | stat: -rw-r--r-- 16,457 bytes
 * parent folder | download | duplicates (2)
 */
#include<string.h>
#include <stdio.h>
#include <stdlib.h>
#include<math.h>
#include<math_constants.h>
#include <cuda.h>
#include <cublas.h>
#include <cuComplex.h>
#include<time.h>



/// ****************************************************************************
/// ******************** Code from other files  ********************************
/// ****************************************************************************


#define FROMCU
extern "C" {
#include<CCspace.h>
}
// CUDACHECK: synchronize with the device, then abort the process if the CUDA
// runtime reports a pending error.  The message (error string, file, line) is
// written to stdout and the process exits with status 1.
// Uses cudaDeviceSynchronize(); the formerly used cudaThreadSynchronize() is
// deprecated in the CUDA runtime.
// Intended to be used as a statement: `CUDACHECK;`
#  define CUDACHECK \
{ cudaDeviceSynchronize(); \
  cudaError_t last = cudaGetLastError();\
  if(last!=cudaSuccess) {\
  printf("ERRORX: %s  %s  %i \n", cudaGetErrorString( last),    __FILE__, __LINE__    );    \
  exit(1);\
  }\
}

// Not referenced by the code visible in this file.
#define WKSIZE 256
// Batch count passed to cufftPlan1d in wavelets_driver; also the multiplier
// used there when sizing the real/complex FFT work buffers.
#define fftbunch 128
// Thread-block edge length used by the kernel launch wrappers in this file
// (32 threads per block dimension).
#define blsize_cufft 32

// CUDA_SAFE_CALL_NO_SYNC(call): evaluate a CUDA runtime call once; if it does
// not return cudaSuccess, print file, line and the CUDA error string to stderr
// and exit with EXIT_FAILURE.  No device synchronization is performed.
#  define CUDA_SAFE_CALL_NO_SYNC( call) {                                    \
  cudaError err = call;                                                    \
  if( cudaSuccess != err) {                                                \
  fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n",        \
  __FILE__, __LINE__, cudaGetErrorString( err) );              \
  exit(EXIT_FAILURE);                                                  \
  } }

// CUDA_SAFE_CALL(call): alias of CUDA_SAFE_CALL_NO_SYNC.  Note that the
// expansion already ends with ';', so it is a statement on its own and should
// not be used as the unbraced body of an if/else.
#  define CUDA_SAFE_CALL( call)     CUDA_SAFE_CALL_NO_SYNC(call);                                            \


#include <cufft.h>
// CUDA_SAFE_FFT(call): evaluate a cuFFT call once; if the result is not
// CUFFT_SUCCESS, print file, line and the numeric cufftResult code to stderr
// and exit with EXIT_FAILURE.
#define CUDA_SAFE_FFT(call){                                                   \
  cufftResult err = call;                                                    \
  if( CUFFT_SUCCESS != err) {                                                \
  fprintf(stderr, "Cuda error in file '%s' in line %i : %d.\n",          \
  __FILE__, __LINE__, err );                                     \
  exit(EXIT_FAILURE);                                                    \
  } }


// Bundle of parameters handed (by value) to the projector/backprojector
// wrappers and to the algorithms in this file.
typedef struct ParamsForTomo {
  // Reconstruction context; this file reads num_bins, nprojs_span, num_x,
  // the W_* wavelet options, verbosity and the Lipschitz settings from it.
  Gpu_Context *ctxstruct;
  // Forwarded unchanged from wavelets_driver; not read in this file
  // (presumably consumed by the projector wrappers — TODO confirm).
  float DETECTOR_DUTY_RATIO;
  // Forwarded unchanged from wavelets_driver; not read in this file
  // (presumably consumed by the projector wrappers — TODO confirm).
  int DETECTOR_DUTY_OVERSAMPLING;
} ParamsForTomo ;


// Device scratch buffers defined in another translation unit.  wavelets_driver
// allocates them (sinogram-sized and slice-sized respectively) and frees them
// before returning.
extern float* global_sino_tmp;
extern float* global_slice_tmp;


// Used in this file to compute grid sizes as iDivUp(count, block_size);
// presumably a rounding-up integer division — defined elsewhere, confirm.
int iDivUp(int a, int b);

// Used by wavelets_driver to size the FFT buffers and plans from num_bins.
// Defined elsewhere; exact padding rule not visible here.
int nextpow2_cp_padded(int v);

// Defined elsewhere; not called in this file.
int nextpow2_cp(int v);

// Projector, defined elsewhere.  As used in this file: writes into d_sino the
// projection of the dimslice x dimslice image d_image.
void proj_wrapper(ParamsForTomo p4t, float* d_sino, float* d_image, int dimslice);

// Backprojector, defined elsewhere.  As used in this file: writes into d_image
// the backprojection of d_sino.  When d_sino_tmp is given, the callers here
// read a sinogram-sized result from it afterwards (exact content is decided by
// the implementation — TODO confirm).
void backproj_wrapper(ParamsForTomo p4t, float* d_sino, float* d_image, float *d_sino_tmp=NULL);

// Kernel defined elsewhere; not launched in this file.
__global__  void cp_kern_compute_discrete_ramp(int length, cufftReal* oArray);

// Defined elsewhere.  Called by wavelets_driver, which stores the returned
// device pointer as the filter coefficients of the context.
cufftComplex* cp_compute_discretized_ramp_filter(int length, cufftReal* d_r, cufftComplex* d_i, cufftHandle myplan);

// Kernel defined elsewhere; not launched in this file.
__global__ void cp_kern_fourier_filter(cufftComplex* inArray, cufftComplex* filter, int sizeX, int sizeY);

// Kernel defined elsewhere; launched by call_add_rings_to_sinogram with a 2D
// grid covering (num_bins, nprojs_span).
__global__  void add_rings_to_sinogram_kernel(float *sino, float* rings, float alpha_rings, int num_bins, int nprojs_span);


#include "pdwt/src/wt.h"



/// ****************************************************************************
/// ******************** Other kernels and calls *******************************
/// ****************************************************************************


/// Transpose of add_rings_to_sinogram_kernel.
///
/// Expects a 1D launch with at least num_bins threads in total; each thread
/// handles one detector bin. For its bin, the thread sums the sinogram values
/// of all nprojs_span rows (row stride = num_bins) and stores
/// alpha_rings * sum into rings[bin]. Threads beyond num_bins do nothing.
__global__  void reduce_sinogram_to_rings_kernel(float *sino, float* rings, float alpha_rings, int num_bins, int nprojs_span) {
  const int bin = blockIdx.x*blockDim.x + threadIdx.x;
  if (bin < num_bins) {
    float column_sum = 0;
    int proj = 0;
    while (proj < nprojs_span) {
      column_sum += sino[proj*num_bins + bin];
      ++proj;
    }
    rings[bin] = alpha_rings * column_sum;
  }
}


/// Launch add_rings_to_sinogram_kernel on a 2D grid of
/// blsize_cufft x blsize_cufft thread blocks sized from
/// (num_bins, nprojs_span) via iDivUp. The launch is not followed by any
/// synchronization or error check here.
void call_add_rings_to_sinogram(float* d_sino, float* d_rings, float alpha_rings, int num_bins, int nprojs_span) {
  const dim3 threads(blsize_cufft, blsize_cufft, 1);
  const dim3 blocks(iDivUp(num_bins, blsize_cufft), iDivUp(nprojs_span, blsize_cufft), 1);
  add_rings_to_sinogram_kernel<<<blocks, threads>>>(d_sino, d_rings, alpha_rings, num_bins, nprojs_span);
}

/// Launch reduce_sinogram_to_rings_kernel on a 1D grid: blocks of
/// blsize_cufft threads, block count obtained from iDivUp(num_bins, blsize_cufft).
/// The launch is not followed by any synchronization or error check here.
void call_reduce_sinogram_to_rings(float* d_sino, float* d_rings, float alpha_rings, int num_bins, int nprojs_span) {
  const dim3 threads(blsize_cufft, 1, 1);
  const dim3 blocks(iDivUp(num_bins, blsize_cufft), 1, 1);
  reduce_sinogram_to_rings_kernel<<<blocks, threads>>>(d_sino, d_rings, alpha_rings, num_bins, nprojs_span);
}


/// In-place soft thresholding of a 1D array.
///
/// Expects a 1D launch with at least num_bins threads in total. Each thread
/// replaces rings[i] by sign(rings[i]) * max(|rings[i]| - beta, 0).
/// Threads beyond num_bins do nothing.
__global__  void soft_threshold_1D_kernel(float* rings, int num_bins, float beta) {
  const int bin = blockIdx.x*blockDim.x + threadIdx.x;
  if (bin < num_bins) {
    const float value = rings[bin];
    const float shrunk = max(fabsf(value)-beta, 0.0f);
    rings[bin] = copysignf(shrunk, value);
  }
}

/// Launch soft_threshold_1D_kernel on a 1D grid: blocks of blsize_cufft
/// threads, block count obtained from iDivUp(num_bins, blsize_cufft).
/// The launch is not followed by any synchronization or error check here.
void call_soft_threshold_1D(float* d_rings, int num_bins, float beta) {
  const dim3 threads(blsize_cufft, 1, 1);
  const dim3 blocks(iDivUp(num_bins, blsize_cufft), 1, 1);
  soft_threshold_1D_kernel<<<blocks, threads>>>(d_rings, num_bins, beta);
}






/// ****************************************************************************
/// ******************** Algorithms ********************************************
/// ****************************************************************************

// Compile-time switch: when non-zero, the Lipschitz and FISTA loops print a
// progress line every 10 iterations and wavelets_driver prints its banner.
#define W_VERBOSE 1
// Not referenced by the code visible in this file.
#define W_DEBUG 1


/// Calculate Lipschitz constant for analysis formulation.
///
/// Starts from the backprojection of d_data (stored in d_result), then for
/// niter rounds: projects d_result into d_sino, backprojects d_sino into
/// d_result, takes the 2-norm of d_result and rescales d_result by 1/norm.
///
/// d_sino and d_result are overwritten. Returns the norm measured in the last
/// round (0 when niter <= 0).
float w_calculate_Lipschitz(ParamsForTomo p4t, float* d_data, float* d_sino, float* d_result, int dimslice, int niter) {
  const int numels = dimslice*dimslice;
  float norm = 0.0f;

  // Starting vector
  backproj_wrapper(p4t, d_data, d_result);

  int it = 0;
  while (it < niter) {
    proj_wrapper(p4t, d_sino, d_result, dimslice);
    backproj_wrapper(p4t, d_sino, d_result);
    norm = cublasSnrm2(numels, d_result, 1);
    cublasSscal(numels, 1.0f/norm, d_result, 1);
    if (W_VERBOSE && (it % 10) == 0) printf("Lipschitz (%d) %f\n", it, norm);
    ++it;
  }
  return norm;
}




/// Calculate Lipschitz constant for analysis formulation with rings variables.
///
/// Iterates on the pair (x, r) = (d_result, d_rings) with the block operator
/// written out in the loop comment below. After every application the joint
/// norm L = sqrt(||x||^2 + ||r||^2) is measured and BOTH components are
/// rescaled by 1/L, so the pair stays a unit vector.
///
/// @param p4t          projector parameters (num_bins / nprojs_span are read from its context)
/// @param d_data       device sinogram used to build the starting vector
/// @param d_sino       device sinogram-sized work buffer (overwritten)
/// @param d_result     device slice buffer, dimslice*dimslice floats (overwritten)
/// @param d_rings      device buffer of num_bins floats (overwritten)
/// @param dimslice     edge length of the square slice
/// @param niter        number of iterations
/// @param alpha_rings  ring scaling factor forwarded to the ring kernels
/// @return joint norm measured in the last iteration (0 when niter <= 0)
float w_calculate_Lipschitz_rings(ParamsForTomo p4t, float* d_data, float* d_sino, float* d_result, float* d_rings, int dimslice, int niter, float alpha_rings) {
  int num_bins = p4t.ctxstruct->num_bins;
  int num_projs = p4t.ctxstruct->nprojs_span;
  float *d_sino_tmp = NULL;
  // Checked allocation: a failure here would otherwise only show up later as
  // an unrelated-looking error inside the projector calls.
  CUDA_SAFE_CALL(cudaMalloc(&d_sino_tmp, num_bins*num_projs*sizeof(float)));

  // Initial x, r
  backproj_wrapper(p4t, d_data, d_result, d_sino_tmp);
  call_reduce_sinogram_to_rings(d_sino_tmp , d_rings, alpha_rings, num_bins, num_projs);
  float L = 0.0f, L_x = 0, L_r = 0;
  for (int k = 0; k < niter; k++) {
    // F(x, r) = ||P x + U r - d||_2^2
    //
    // x_n     ( P^T P   P^T U )  x_{n-1}
    //      =  (               )
    // r_n     ( U^T P   U^T U )  r_{n-1}
    proj_wrapper(p4t, d_sino, d_result, dimslice);
    call_add_rings_to_sinogram(d_sino, d_rings, alpha_rings, num_bins, num_projs);
    backproj_wrapper(p4t, d_sino, d_result, d_sino_tmp);
    call_reduce_sinogram_to_rings(d_sino_tmp, d_rings, alpha_rings, num_bins, num_projs);
    L_x = cublasSnrm2(dimslice*dimslice, d_result, 1);
    L_r = cublasSnrm2(num_bins, d_rings, 1);
    L = sqrtf(L_x*L_x + L_r*L_r);
    // Normalize the whole iterate (x AND r). Rescaling only x would change the
    // relative weight of the two components at every step, so the sequence of
    // L values would no longer estimate the largest eigenvalue of the operator.
    const float inv_L = 1.0f/L;
    cublasSscal(dimslice*dimslice, inv_L, d_result, 1);
    cublasSscal(num_bins, inv_L, d_rings, 1);
    if (W_VERBOSE && (k % 10) == 0) printf("Lipschitz (%d) %f\n", k, L);
  }
  CUDA_SAFE_CALL(cudaFree(d_sino_tmp));

  return L;
}


/// FISTA iterations with wavelet soft thresholding and optional ring variables.
///
/// Minimizes (as reported by the energy printout) a data-fidelity term plus
/// beta * L1(wavelet coefficients), with an additional 1D "rings" unknown when
/// rings_height > 1e-5.
///
/// @param p4t           projector parameters; sizes and W_* options are read from its context
/// @param d_data        device sinogram, num_bins*num_projs floats (not modified)
/// @param d_result      device slice, num_x*num_x floats; used as scratch and receives the result
/// @param niter         number of FISTA iterations
/// @param beta          weight of the wavelet L1 term (threshold is beta/Lipschitz)
/// @param beta_rings    threshold applied to the rings variable
/// @param rings_height  ring correction is enabled only when this is > 0.00001
/// @param alpha_rings   ring scaling factor forwarded to the ring kernels
/// @return always 0; allocation failures terminate the process
///
/// Side effects: prints progress to stdout when W_VERBOSE is set; when
/// verbosity > 2 writes "energy_wavelets.dat" (niter raw floats) in the
/// current directory.
int wavelets_fista(
    ParamsForTomo p4t,
    float* d_data,
    float* d_result,
    int niter,
    float beta,
    float beta_rings,
    float rings_height,
    float alpha_rings)
{
  // Retrieve the parameters
  int num_bins = p4t.ctxstruct->num_bins;
  int num_projs = p4t.ctxstruct->nprojs_span;
  int dimslice = p4t.ctxstruct->num_x;
  int numels_slice = dimslice*dimslice;
  float a = p4t.ctxstruct->W_FISTA_PARAM;  //Fista "a" parameter : y = x + k/(k+a) * (x - x_old)
  int nlevels = p4t.ctxstruct->W_LEVELS;
  int do_cycle_spin = p4t.ctxstruct->W_CYCLE_SPIN;
  int do_swt = p4t.ctxstruct->W_SWT;
  char* wname = p4t.ctxstruct->W_WNAME;
  int verbosity = p4t.ctxstruct->verbosity;
  int W_NORMALIZE = p4t.ctxstruct->W_NORMALIZE;
  int W_DTA = p4t.ctxstruct->W_THRESHOLD_APPCOEFFS;
  char DO_RING_CORR = (rings_height > 0.00001 ? 1 : 0);

  float* d_rings = NULL, *d_rings_old = NULL, *d_grad_rings = NULL, *d_rings2 = NULL;
  if (DO_RING_CORR) {
    CUDA_SAFE_CALL(cudaMalloc(&d_rings, num_bins*sizeof(float)));
    CUDA_SAFE_CALL(cudaMalloc(&d_rings2, num_bins*sizeof(float)));
    CUDA_SAFE_CALL(cudaMalloc(&d_rings_old, num_bins*sizeof(float)));
    CUDA_SAFE_CALL(cudaMalloc(&d_grad_rings, num_bins*sizeof(float)));
    // d_rings2 is copied into d_rings_old during the first iteration, before
    // anything has been written to it: give it a defined (zero) content so no
    // uninitialized device memory enters the momentum update.
    CUDA_SAFE_CALL(cudaMemset(d_rings2, 0, num_bins*sizeof(float)));
  }

  if (verbosity > 2) printf("levels : %d \t cycle spin : %d \t a : %f \t dimslice : %d \n", nlevels, do_cycle_spin, a, dimslice);


  float* d_sino, *d_sino_tmp; // temp array
  CUDA_SAFE_CALL(cudaMalloc(&d_sino, num_bins*num_projs*sizeof(float)));
  CUDA_SAFE_CALL(cudaMalloc(&d_sino_tmp, num_bins*num_projs*sizeof(float)));
  CUDA_SAFE_CALL(cudaMemcpy(d_sino, d_data, num_bins*num_projs*sizeof(float), cudaMemcpyDeviceToDevice));

  float Lip = 0;
  // Calculate the Lipschitz constant : largest eigenvalue of PT*P (for analysis formulation)
  if (!DO_RING_CORR) Lip = w_calculate_Lipschitz(p4t, d_data, d_sino, d_result, dimslice, p4t.ctxstruct->LIPSCHITZ_ITERATIONS);
  // Calculate the Lipschitz constant with rings: largest singular value of [P, U]
  else Lip = w_calculate_Lipschitz_rings(p4t, d_data, d_sino, d_result, d_rings, dimslice, p4t.ctxstruct->LIPSCHITZ_ITERATIONS, alpha_rings);
  // NOTE(review): with ring correction, d_rings is not reset after the
  // Lipschitz estimation, so the first iteration starts from whatever that
  // routine left in it — confirm this is the intended initial value.

  Lip *= p4t.ctxstruct->LIPSCHITZFACTOR;
  if (verbosity > 9) printf("Lipschitz = %f\n", Lip);
  CUDACHECK;
  // First, backproject the sinogram to get the initial image
  backproj_wrapper(p4t, d_data, d_result);
  // Build the wavelet structure from this image
  //Wavelets(img, Nr, Nc, wname, levels, memisonhost=1, do_separable=1, do_cycle_spinning=0, do_swt=0);
  Wavelets W(d_result, dimslice, dimslice, wname, nlevels, 0, 1, do_cycle_spin, do_swt);
  if (verbosity > 3) W.print_informations();

  // --
  float* x, *y, *grad_y, *x_old;
  y = W.d_image;
  // One slot per iteration. Entries 0..niter-2 are filled in the loop; the
  // dump at the end writes niter floats, so the buffer must hold niter of them
  // (the last one stays 0 from calloc). At least one element is requested so
  // that niter <= 0 does not turn into a huge unsigned size.
  size_t n_energy = (niter > 0 ? (size_t) niter : 1);
  float* energy_vector = (float*) calloc(n_energy, sizeof(float));
  if (energy_vector == NULL) {
    fprintf(stderr, "wavelets_fista: could not allocate the energy history (%d iterations)\n", niter);
    exit(EXIT_FAILURE);
  }

  CUDA_SAFE_CALL(cudaMalloc(&x, numels_slice*sizeof(float)));
  CUDA_SAFE_CALL(cudaMemset(x, 0, numels_slice*sizeof(float)));
  CUDA_SAFE_CALL(cudaMalloc(&x_old, numels_slice*sizeof(float)));
  CUDA_SAFE_CALL(cudaMalloc(&grad_y, numels_slice*sizeof(float)));
  CUDACHECK;
  float fid = 0, l1 = 0, l1_rings = 0;
  float alpha = 1.0f;
  float t = 1.0f, t_old;

  for (int k = 0; k < niter; k++) {
    // grad_y = PT(P y - d)
    proj_wrapper(p4t, d_sino, y, dimslice);
    cublasSaxpy(num_bins*num_projs, -1.0, d_data, 1, d_sino, 1);
    if (DO_RING_CORR) call_add_rings_to_sinogram(d_sino, d_rings, alpha_rings, num_bins, num_projs);
    backproj_wrapper(p4t, d_sino, grad_y, d_sino_tmp);
    if (DO_RING_CORR) call_reduce_sinogram_to_rings(d_sino_tmp, d_grad_rings, alpha_rings, num_bins, num_projs);

    // y = y - (1/L)*grad_y
    cublasSaxpy(numels_slice, -1.0/Lip, grad_y, 1, y, 1);
    if (DO_RING_CORR) cublasSaxpy(num_bins, -1.0/Lip, d_grad_rings, 1, d_rings, 1);

    // Calculate energy (of previous iteration actually)
    fid = cublasSnrm2(num_bins*num_projs, d_sino, 1);
    fid *= fid*0.5;
    l1 = W.norm1();
    if (DO_RING_CORR) l1_rings = cublasSasum(num_bins, d_rings, 1);

    // x_old = x
    cudaMemcpy(x_old, x, numels_slice*sizeof(float), cudaMemcpyDeviceToDevice);
    if (DO_RING_CORR) cudaMemcpy(d_rings_old, d_rings2, num_bins*sizeof(float), cudaMemcpyDeviceToDevice);

    // Apply proximal operator
    W.forward();
    W.soft_threshold(beta/Lip, W_DTA, W_NORMALIZE);
    W.inverse();
    if (DO_RING_CORR) {
      // NOTE(review): the image threshold is beta/Lip while the rings
      // threshold is beta_rings (not divided by Lip) — confirm intended.
      call_soft_threshold_1D(d_rings, num_bins, beta_rings);
      cudaMemcpy(d_rings2, d_rings, num_bins*sizeof(float), cudaMemcpyDeviceToDevice);
    }

    // y = x + (k-1)/(k+a)*(x - x_old)  for k=1, ...
    cudaMemcpy(x, y, numels_slice*sizeof(float), cudaMemcpyDeviceToDevice); // y is bound to W.image
    t_old = t;
    t = 0.5*(1.0 + sqrt(1 + 4*t*t));
    alpha = (t_old - 1.0)/t;
    cublasSscal(numels_slice, 1.0 + alpha, y, 1);
    cublasSaxpy(numels_slice, -alpha, x_old, 1, y, 1);
    if (DO_RING_CORR) {
      // The rings use the k/(k+1+a) momentum weight, the image uses (t_old-1)/t.
      cublasSscal(num_bins, 1.0 + k/(k+1.0+a), d_rings, 1);
      cublasSaxpy(num_bins, -k/(k+1.0+a), d_rings_old, 1, d_rings, 1);
    }

    // Display energy
    if (k > 0) energy_vector[k-1] = fid + beta*l1 + beta_rings*l1_rings;
//    if (k > 50 && energy_vector[k-1] > energy_vector[k-2]) t = 1.0; // TODO: test
    if (W_VERBOSE && (k % 10) == 0) printf("It %d \t Energy %e \t Fidelity %e \t L1 %e \t Rings %e\n", k, fid + beta*l1, fid, l1, l1_rings);

  }

  // The result is the last proximal point x (not the extrapolated y).
  // Checked copy: it also reports any error left pending by the loop above.
  CUDA_SAFE_CALL(cudaMemcpy(d_result, x, numels_slice*sizeof(float), cudaMemcpyDeviceToDevice));
  if (verbosity > 2) {
    FILE* flid = fopen("energy_wavelets.dat", "wb");
    if (flid == NULL) {
      // Diagnostic output only: do not abort the reconstruction for it.
      fprintf(stderr, "wavelets_fista: cannot open energy_wavelets.dat for writing\n");
    }
    else {
      if (niter > 0) fwrite(energy_vector, sizeof(float), niter, flid);
      fclose(flid);
    }
  }
  cudaFree(x);
  cudaFree(x_old);
  cudaFree(grad_y);
  cudaFree(d_sino);
  cudaFree(d_sino_tmp);
  if (DO_RING_CORR) {
    cudaFree(d_rings);
    cudaFree(d_rings2);
    cudaFree(d_rings_old);
    cudaFree(d_grad_rings);
  }
  free(energy_vector);

  return 0;
}








// C-linkage prototype of the host entry point defined further down in this
// file, so that it can be called from C code without C++ name mangling.
extern "C" {
  int wavelets_driver(
      Gpu_Context* self,
      float* data,
      float* SLICE,
      float DETECTOR_DUTY_RATIO,
      int DETECTOR_DUTY_OVERSAMPLING,
      float beta,
      float beta_rings,
      float rings_height,
      float alpha_rings);
}




/// Copy a device array of complex values to the host and split it into planes.
///
/// @param d_array  device pointer to numels float2 (x = real, y = imaginary)
/// @param numels   number of complex elements
/// @return newly calloc'ed host buffer of 2*numels floats: the numels real
///         parts first, followed by the numels imaginary parts. The caller
///         owns it and must free() it. Terminates the process on allocation
///         or copy failure.
float* w_get_device_complex(float2* d_array, int numels) {
  float2* h_array = (float2*) calloc(numels,sizeof(float2));
  float* res = (float*) calloc(2*numels, sizeof(float));
  if (numels > 0 && (h_array == NULL || res == NULL)) {
    fprintf(stderr, "w_get_device_complex: host allocation failed for %d elements\n", numels);
    exit(EXIT_FAILURE);
  }
  // Each element is a float2: the byte count must use sizeof(float2),
  // otherwise only the first half of the array is transferred.
  CUDA_SAFE_CALL(cudaMemcpy(h_array,d_array, numels*sizeof(float2), cudaMemcpyDeviceToHost));
  for (int i=0; i<numels; i++) {
    res[i] = h_array[i].x;
    res[i+numels] = h_array[i].y;
  }
  free(h_array);
  return res;
}
/// Copy a device array of complex values to the host and return their moduli.
///
/// @param d_array  device pointer to numels float2 (x = real, y = imaginary)
/// @param numels   number of complex elements
/// @return newly calloc'ed host buffer of numels floats holding
///         sqrt(x*x + y*y) of each element. The caller owns it and must
///         free() it. Terminates the process on allocation or copy failure.
float* w_get_device_complex_abs(float2* d_array, int numels) {
  float2* h_array = (float2*) calloc(numels,sizeof(float2));
  float* res = (float*) calloc(numels, sizeof(float));
  if (numels > 0 && (h_array == NULL || res == NULL)) {
    fprintf(stderr, "w_get_device_complex_abs: host allocation failed for %d elements\n", numels);
    exit(EXIT_FAILURE);
  }
  // Each element is a float2: the byte count must use sizeof(float2),
  // otherwise only the first half of the array is transferred.
  CUDA_SAFE_CALL(cudaMemcpy(h_array,d_array, numels*sizeof(float2), cudaMemcpyDeviceToHost));
  for (int i=0; i<numels; i++) {
    res[i] = sqrtf(h_array[i].x*h_array[i].x + h_array[i].y*h_array[i].y);
  }
  free(h_array);
  return res;
}








/// Host entry point of the wavelets reconstruction.
///
/// Makes the CUDA context stored in `self` current, prepares the ramp-filter
/// FFT buffers/plans, uploads the sinogram, runs wavelets_fista and downloads
/// the reconstructed slice.
///
/// @param self   reconstruction context (sizes, options, GPU context, FFT handles)
/// @param data   host sinogram, num_bins*nprojs_span floats
/// @param SLICE  host output buffer, num_x*num_x floats
/// @param DETECTOR_DUTY_RATIO, DETECTOR_DUTY_OVERSAMPLING  forwarded to the projectors via ParamsForTomo
/// @param beta, beta_rings, rings_height, alpha_rings     forwarded to wavelets_fista
/// @return always 0; CUDA/cuFFT failures terminate the process
int wavelets_driver(Gpu_Context* self, float* data, float* SLICE, float DETECTOR_DUTY_RATIO, int DETECTOR_DUTY_OVERSAMPLING, float beta, float beta_rings, float rings_height, float alpha_rings) {

  if (W_VERBOSE) {
    puts("------------------------------------------------------------------------------");
    puts("------------------ Entering Wavelets driver ----------------------------------");
    puts("------------------------------------------------------------------------------");
  }
  cuCtxSetCurrent ( *((CUcontext *) self->gpuctx  ))  ;

  //Import parameters from self
  int num_bins = self->num_bins;
  int num_projs = self->nprojs_span;
  int dimslice = self->num_x ;
  ParamsForTomo p4t  =  (ParamsForTomo)  { (Gpu_Context*) self, DETECTOR_DUTY_RATIO, DETECTOR_DUTY_OVERSAMPLING } ;


  //Prepare cuFFT plan for FBP
  CUDA_SAFE_CALL(cudaMalloc(&self->precond_params_dl.d_r_sino_error, fftbunch*nextpow2_cp_padded(num_bins)*sizeof(cufftReal)));
  CUDA_SAFE_CALL(cudaMalloc(&self->precond_params_dl.d_i_sino_error, fftbunch*nextpow2_cp_padded(num_bins)*sizeof(cufftComplex)));
  // NOTE(review): this flag is process-wide while the plan handles live in
  // `self`. A later call with a different context, or with a different
  // num_bins, reuses/skips plan creation — confirm callers never do that.
  static int plans_are_computed = 0;
  if(!plans_are_computed) {
      plans_are_computed = 1;
      CUDA_SAFE_FFT(cufftPlan1d((cufftHandle *) &self->precond_params_dl.planRamp_forward, nextpow2_cp_padded(num_bins),CUFFT_R2C,fftbunch));
      CUDA_SAFE_FFT(cufftPlan1d((cufftHandle *) &self->precond_params_dl.planRamp_backward,nextpow2_cp_padded(num_bins),CUFFT_C2R,fftbunch));
  }
  cufftComplex* d_i_discrete_ramp = cp_compute_discretized_ramp_filter(nextpow2_cp_padded(num_bins), self->precond_params_dl.d_r_sino_error, self->precond_params_dl.d_i_sino_error, self->precond_params_dl.planRamp_forward);
  // NOTE(review): filter_coeffs is reassigned on every call and never freed in
  // this function; whether the helper allocates a new buffer or returns one of
  // its arguments is not visible here — check ownership.
  self->precond_params_dl.filter_coeffs = d_i_discrete_ramp; //size : nextpow2(num_bins)/2+1


  // Allocate memory
  float* d_result;
  CUDA_SAFE_CALL(cudaMalloc(&d_result, dimslice*dimslice*sizeof(float)));
  float* d_data;
  CUDA_SAFE_CALL(cudaMalloc(&d_data, num_bins*num_projs*sizeof(float)));
  CUDA_SAFE_CALL(cudaMemcpy(d_data, data,  num_bins*num_projs*sizeof(float), cudaMemcpyHostToDevice  ));
  CUDA_SAFE_CALL(cudaMalloc(&global_sino_tmp, num_bins*num_projs*sizeof(float)));
  CUDA_SAFE_CALL(cudaMalloc(&global_slice_tmp, dimslice*dimslice*sizeof(float)));


  // Run
  wavelets_fista(p4t, d_data, d_result, self->ITERATIVE_CORRECTIONS, beta, beta_rings, rings_height, alpha_rings);
  // Checked download: a failed copy would otherwise hand an unmodified SLICE
  // buffer back to the caller.
  CUDA_SAFE_CALL(cudaMemcpy(SLICE, d_result, dimslice*dimslice*sizeof(float), cudaMemcpyDeviceToHost));

  // Free memory
  cudaFree(d_result);
  cudaFree(d_data);
  cudaFree(self->precond_params_dl.d_r_sino_error);
  cudaFree(self->precond_params_dl.d_i_sino_error);
  cudaFree(global_sino_tmp);
  cudaFree(global_slice_tmp);
  CUDACHECK;

  return 0;
}