File: devxlib_memcpy_d2d.jf90

package info (click to toggle)
devicexlib 0.8.6-2
links: PTS, VCS
area: main
in suites: forky, sid, trixie
size: 10,364 kB
sloc: f90: 77,678; sh: 3,701; fortran: 773; makefile: 268; python: 246; ansic: 69; awk: 36
file content (186 lines) | stat: -rw-r--r-- 8,352 bytes
!
! Copyright (C) 2022, MaX CoE
! Distributed under the MIT License 
! (license terms are at http://opensource.org/licenses/MIT).
!
!--
!
!
! Utility functions to perform sync and async device-device memcpy
! using CUDA-Fortran, OpenACC or OpenMP Offload
!
!==================================================================
!==================================================================
! *DO NOT EDIT*: automatically generated from device_memcpy_d2d.jf90
!==================================================================
!==================================================================
!
#include<devxlib_macros.h>
#include<devxlib_defs.h>
!
!=======================================
!
! Note about dimensions:
! The lower bound of the assumed shape array passed to the subroutine is 1
! lbound and range instead refer to the indexing in the parent caller.
!
submodule (devxlib_memcpy) devxlib_memcpy_d2d

   implicit none

   contains
{%- for t in types %}
{%- for p in kinds[t] %}
{%- for d in range(1,dimensions+1) %}
      module subroutine {{p.name}}_devxlib_memcpy_d2d_{{t[0]|lower}}{{d}}d(array_out, array_in, &
                                                  {% for dd in range(d) -%}
                                                  {{ "range%s"|format(dd+1) }}, {{ "lbound%s"|format(dd+1) }}{% if not loop.last %}, &
                                                  {% endif %}{% endfor %} )
         implicit none
         !
         {{t}}({{p.val}}) DEV_ATTR, intent(inout) :: array_out({% for dd in range(d) %}:{% if not loop.last %}, {%- endif %}{% endfor %})
         {{t}}({{p.val}}) DEV_ATTR, intent(in)    :: array_in({% for dd in range(d) %}:{% if not loop.last %}, {%- endif %}{% endfor %})
         integer, optional, intent(in) :: {% for dd in range(d) %} {{ "range%s(2)"|format(dd+1) }}{% if not loop.last %}, {%- endif %}{% endfor %}
         integer, optional, intent(in) :: {% for dd in range(d) %} {{ "lbound%s"|format(dd+1) }}{% if not loop.last %}, {%- endif %}{% endfor %}
         !
{%- for dd in range(d) %}
         integer :: i{{dd+1}}, d{{dd+1}}s, d{{dd+1}}e
         integer :: lbound{{dd+1}}_, range{{dd+1}}_(2)
{%- endfor %}
         !
{%- for dd in range(d) %}
         lbound{{dd+1}}_=1
         if (present(lbound{{dd+1}})) lbound{{dd+1}}_=lbound{{dd+1}} 
         range{{dd+1}}_=(/1,size(array_out, {{dd+1}})/)
         if (present(range{{dd+1}})) range{{dd+1}}_=range{{dd+1}} 
         !
         d{{dd+1}}s = range{{dd+1}}_(1) -lbound{{dd+1}}_ +1
         d{{dd+1}}e = range{{dd+1}}_(2) -lbound{{dd+1}}_ +1
         !
{%- endfor %}
         !DEV_CUF kernel do({{d}})
         !DEV_ACC data present(array_out, array_in)
         !DEV_ACC parallel loop collapse({{d}})
         !DEV_OMPGPU target map(present,alloc:array_out, array_in)
         !DEV_OMPGPU teams loop collapse({{d}})
         !DEV_OMP  parallel do
{%- for dd in range(d,0,-1) %}
         do i{{dd}} = d{{dd}}s, d{{dd}}e
{%- endfor %}
            array_out( {%- for dd in range(d) %}i{{dd+1}}{% if not loop.last %}, {%- endif %} {%- endfor %} ) = array_in( {%- for dd in range(d) %}i{{dd+1}}{% if not loop.last %}, {%- endif %} {%- endfor %} )
{%- for dd in range(d) %}
         enddo
{%- endfor %}
         !DEV_ACC end data
         !DEV_OMPGPU end target
         !
      end subroutine {{p.name}}_devxlib_memcpy_d2d_{{t[0]|lower}}{{d}}d
!
{%- endfor %}
{%- endfor %}
{%- endfor %}

{%- for t in types %}
{%- for p in kinds[t] %}
{%- for d in range(1,dimensions+1) %}
      module subroutine {{p.name}}_devxlib_memcpy_d2d_async_{{t[0]|lower}}{{d}}d(array_out, array_in, async_id, &
                                                  {% for dd in range(d) -%}
                                                  {{ "range%s"|format(dd+1) }}, {{ "lbound%s"|format(dd+1) }}{% if not loop.last %}, &
                                                  {% endif %}{% endfor %} )
         implicit none
         !
         {{t}}({{p.val}}) DEV_ATTR, intent(inout) :: array_out({% for dd in range(d) %}:{% if not loop.last %}, {%- endif %}{% endfor %})
         {{t}}({{p.val}}) DEV_ATTR, intent(in)    :: array_in({% for dd in range(d) %}:{% if not loop.last %}, {%- endif %}{% endfor %})
         integer(kind=dxl_async_kind), intent(in) :: async_id
         integer, optional, intent(in) :: {% for dd in range(d) %} {{ "range%s(2)"|format(dd+1) }}{% if not loop.last %}, {%- endif %}{% endfor %}
         integer, optional, intent(in) :: {% for dd in range(d) %} {{ "lbound%s"|format(dd+1) }}{% if not loop.last %}, {%- endif %}{% endfor %}
         !
{%- for dd in range(d) %}
         integer :: i{{dd+1}}, d{{dd+1}}s, d{{dd+1}}e
         integer :: lbound{{dd+1}}_, range{{dd+1}}_(2)
{%- endfor %}
         !
{%- for dd in range(d) %}
         lbound{{dd+1}}_=1
         if (present(lbound{{dd+1}})) lbound{{dd+1}}_=lbound{{dd+1}} 
         range{{dd+1}}_=(/1,size(array_out, {{dd+1}})/)
         if (present(range{{dd+1}})) range{{dd+1}}_=range{{dd+1}} 
         !
         d{{dd+1}}s = range{{dd+1}}_(1) -lbound{{dd+1}}_ +1
         d{{dd+1}}e = range{{dd+1}}_(2) -lbound{{dd+1}}_ +1
{%- endfor %}
         !
         !DEV_CUF kernel do({{d}}) <<<*,*,0,stream=async_id>>>
         !DEV_ACC data present(array_out, array_in)
         !DEV_ACC parallel loop collapse({{d}}) async(async_id)
         !DEV_OMPGPU target map(present,alloc:array_out, array_in)
         !DEV_OMPGPU teams loop collapse({{d}})
{%- for dd in range(d,0,-1) %}
         do i{{dd}} = d{{dd}}s, d{{dd}}e
{%- endfor %}
            array_out( {%- for dd in range(d) %}i{{dd+1}}{% if not loop.last %}, {%- endif %} {%- endfor %} ) = array_in( {%- for dd in range(d) %}i{{dd+1}}{% if not loop.last %}, {%- endif %} {%- endfor %} )
{%- for dd in range(d) %}
         enddo
{%- endfor %}
         !DEV_ACC end data
         !DEV_OMPGPU end target
         !
      end subroutine {{p.name}}_devxlib_memcpy_d2d_async_{{t[0]|lower}}{{d}}d
!
{%- endfor %}
{%- endfor %}
{%- endfor %}

{%- for t in types %}
{%- for p in kinds[t] %}
{%- for d in range(1,dimensions+1) %}
      module subroutine {{p.name}}_devxlib_memcpy_d2d_{{t[0]|lower}}{{d}}d_p(&
#if defined __DXL_OPENMP_GPU
                                            array_out, array_in, device_id_out, device_id_in)
#else
                                            array_out, array_in)
#endif
         implicit none
         !
         {{t}}({{p.val}}), pointer, contiguous, intent(inout) DEV_ATTR :: array_out({% for dd in range(d) %}:{% if not loop.last %}, {%- endif %}{% endfor %})
         {{t}}({{p.val}}), pointer, contiguous, intent(in)    DEV_ATTR :: array_in({% for dd in range(d) %}:{% if not loop.last %}, {%- endif %}{% endfor %})
#if defined __DXL_OPENMP_GPU
         integer, optional, intent(in) :: device_id_out, device_id_in
         integer                       :: device_id_out_local, device_id_in_local
#endif
         !
#if defined __DXL_CUDAF || defined __DXL_OPENMP_GPU
         integer :: ierr = 0
#endif
         !
#if defined __DXL_CUDAF
         ierr = cudaMemcpy_f(c_loc(array_out), c_loc(array_in), int(storage_size(array_out,kind=int64) * &
                             size(array_out,kind=int64) / 8_int64,c_size_t), cudaMemcpyDeviceToDevice)
#elif defined __DXL_OPENACC
         call acc_memcpy_device_f(c_loc(array_out), c_loc(array_in), int(storage_size(array_out,kind=int64) * &
                                  size(array_out,kind=int64) / 8_int64, c_size_t))
#elif defined __DXL_OPENMP_GPU
         if (present(device_id_out)) then
            device_id_out_local = device_id_out
         else
            device_id_out_local = omp_get_default_device()
         endif
         if (present(device_id_in)) then
            device_id_in_local = device_id_in
         else
            device_id_in_local = omp_get_default_device()
         endif
         ierr = int( omp_target_memcpy(c_loc(array_out), c_loc(array_in), &
                     int(storage_size(array_out,kind=int64) *size(array_out,kind=int64) / 8_int64, c_size_t), &
                     int(0,c_size_t), int(0,c_size_t), device_id_out_local, device_id_in_local), kind=int32)
#else
         ! host2host fall-back
         array_out=array_in
#endif
         !
      end subroutine {{p.name}}_devxlib_memcpy_d2d_{{t[0]|lower}}{{d}}d_p
!
{%- endfor %}
{%- endfor %}
{%- endfor %}
endsubmodule devxlib_memcpy_d2d