1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186
|
!
! Copyright (C) 2022, MaX CoE
! Distributed under the MIT License
! (license terms are at http://opensource.org/licenses/MIT).
!
!--
!
!
! Utility functions to perform sync and async device-device memcpy
! using CUDA-Fortran, OpenACC or OpenMP Offload
!
!==================================================================
!==================================================================
! *DO NOT EDIT*: automatically generated from device_memcpy_d2d.jf90
!==================================================================
!==================================================================
!
#include<devxlib_macros.h>
#include<devxlib_defs.h>
!
!=======================================
!
! Note about dimensions:
! The lower bound of the assumed shape array passed to the subroutine is 1
! lbound and range instead refer to the indexing in the parent caller.
!
submodule (devxlib_memcpy) devxlib_memcpy_d2d
implicit none
contains
{%- for t in types %}
{%- for p in kinds[t] %}
{%- for d in range(1,dimensions+1) %}
module subroutine {{p.name}}_devxlib_memcpy_d2d_{{t[0]|lower}}{{d}}d(array_out, array_in, &
{% for dd in range(d) -%}
{{ "range%s"|format(dd+1) }}, {{ "lbound%s"|format(dd+1) }}{% if not loop.last %}, &
{% endif %}{% endfor %} )
implicit none
!
{{t}}({{p.val}}) DEV_ATTR, intent(inout) :: array_out({% for dd in range(d) %}:{% if not loop.last %}, {%- endif %}{% endfor %})
{{t}}({{p.val}}) DEV_ATTR, intent(in) :: array_in({% for dd in range(d) %}:{% if not loop.last %}, {%- endif %}{% endfor %})
integer, optional, intent(in) :: {% for dd in range(d) %} {{ "range%s(2)"|format(dd+1) }}{% if not loop.last %}, {%- endif %}{% endfor %}
integer, optional, intent(in) :: {% for dd in range(d) %} {{ "lbound%s"|format(dd+1) }}{% if not loop.last %}, {%- endif %}{% endfor %}
!
{%- for dd in range(d) %}
integer :: i{{dd+1}}, d{{dd+1}}s, d{{dd+1}}e
integer :: lbound{{dd+1}}_, range{{dd+1}}_(2)
{%- endfor %}
!
{%- for dd in range(d) %}
lbound{{dd+1}}_=1
if (present(lbound{{dd+1}})) lbound{{dd+1}}_=lbound{{dd+1}}
range{{dd+1}}_=(/1,size(array_out, {{dd+1}})/)
if (present(range{{dd+1}})) range{{dd+1}}_=range{{dd+1}}
!
d{{dd+1}}s = range{{dd+1}}_(1) -lbound{{dd+1}}_ +1
d{{dd+1}}e = range{{dd+1}}_(2) -lbound{{dd+1}}_ +1
!
{%- endfor %}
!DEV_CUF kernel do({{d}})
!DEV_ACC data present(array_out, array_in)
!DEV_ACC parallel loop collapse({{d}})
!DEV_OMPGPU target map(present,alloc:array_out, array_in)
!DEV_OMPGPU teams loop collapse({{d}})
!DEV_OMP parallel do
{%- for dd in range(d,0,-1) %}
do i{{dd}} = d{{dd}}s, d{{dd}}e
{%- endfor %}
array_out( {%- for dd in range(d) %}i{{dd+1}}{% if not loop.last %}, {%- endif %} {%- endfor %} ) = array_in( {%- for dd in range(d) %}i{{dd+1}}{% if not loop.last %}, {%- endif %} {%- endfor %} )
{%- for dd in range(d) %}
enddo
{%- endfor %}
!DEV_ACC end data
!DEV_OMPGPU end target
!
end subroutine {{p.name}}_devxlib_memcpy_d2d_{{t[0]|lower}}{{d}}d
!
{%- endfor %}
{%- endfor %}
{%- endfor %}
{%- for t in types %}
{%- for p in kinds[t] %}
{%- for d in range(1,dimensions+1) %}
module subroutine {{p.name}}_devxlib_memcpy_d2d_async_{{t[0]|lower}}{{d}}d(array_out, array_in, async_id, &
{% for dd in range(d) -%}
{{ "range%s"|format(dd+1) }}, {{ "lbound%s"|format(dd+1) }}{% if not loop.last %}, &
{% endif %}{% endfor %} )
implicit none
!
{{t}}({{p.val}}) DEV_ATTR, intent(inout) :: array_out({% for dd in range(d) %}:{% if not loop.last %}, {%- endif %}{% endfor %})
{{t}}({{p.val}}) DEV_ATTR, intent(in) :: array_in({% for dd in range(d) %}:{% if not loop.last %}, {%- endif %}{% endfor %})
integer(kind=dxl_async_kind), intent(in) :: async_id
integer, optional, intent(in) :: {% for dd in range(d) %} {{ "range%s(2)"|format(dd+1) }}{% if not loop.last %}, {%- endif %}{% endfor %}
integer, optional, intent(in) :: {% for dd in range(d) %} {{ "lbound%s"|format(dd+1) }}{% if not loop.last %}, {%- endif %}{% endfor %}
!
{%- for dd in range(d) %}
integer :: i{{dd+1}}, d{{dd+1}}s, d{{dd+1}}e
integer :: lbound{{dd+1}}_, range{{dd+1}}_(2)
{%- endfor %}
!
{%- for dd in range(d) %}
lbound{{dd+1}}_=1
if (present(lbound{{dd+1}})) lbound{{dd+1}}_=lbound{{dd+1}}
range{{dd+1}}_=(/1,size(array_out, {{dd+1}})/)
if (present(range{{dd+1}})) range{{dd+1}}_=range{{dd+1}}
!
d{{dd+1}}s = range{{dd+1}}_(1) -lbound{{dd+1}}_ +1
d{{dd+1}}e = range{{dd+1}}_(2) -lbound{{dd+1}}_ +1
{%- endfor %}
!
!DEV_CUF kernel do({{d}}) <<<*,*,0,stream=async_id>>>
!DEV_ACC data present(array_out, array_in)
!DEV_ACC parallel loop collapse({{d}}) async(async_id)
!DEV_OMPGPU target map(present,alloc:array_out, array_in)
!DEV_OMPGPU teams loop collapse({{d}})
{%- for dd in range(d,0,-1) %}
do i{{dd}} = d{{dd}}s, d{{dd}}e
{%- endfor %}
array_out( {%- for dd in range(d) %}i{{dd+1}}{% if not loop.last %}, {%- endif %} {%- endfor %} ) = array_in( {%- for dd in range(d) %}i{{dd+1}}{% if not loop.last %}, {%- endif %} {%- endfor %} )
{%- for dd in range(d) %}
enddo
{%- endfor %}
!DEV_ACC end data
!DEV_OMPGPU end target
!
end subroutine {{p.name}}_devxlib_memcpy_d2d_async_{{t[0]|lower}}{{d}}d
!
{%- endfor %}
{%- endfor %}
{%- endfor %}
{%- for t in types %}
{%- for p in kinds[t] %}
{%- for d in range(1,dimensions+1) %}
module subroutine {{p.name}}_devxlib_memcpy_d2d_{{t[0]|lower}}{{d}}d_p(&
#if defined __DXL_OPENMP_GPU
array_out, array_in, device_id_out, device_id_in)
#else
array_out, array_in)
#endif
implicit none
!
{{t}}({{p.val}}), pointer, contiguous, intent(inout) DEV_ATTR :: array_out({% for dd in range(d) %}:{% if not loop.last %}, {%- endif %}{% endfor %})
{{t}}({{p.val}}), pointer, contiguous, intent(in) DEV_ATTR :: array_in({% for dd in range(d) %}:{% if not loop.last %}, {%- endif %}{% endfor %})
#if defined __DXL_OPENMP_GPU
integer, optional, intent(in) :: device_id_out, device_id_in
integer :: device_id_out_local, device_id_in_local
#endif
!
#if defined __DXL_CUDAF || defined __DXL_OPENMP_GPU
integer :: ierr = 0
#endif
!
#if defined __DXL_CUDAF
ierr = cudaMemcpy_f(c_loc(array_out), c_loc(array_in), int(storage_size(array_out,kind=int64) * &
size(array_out,kind=int64) / 8_int64,c_size_t), cudaMemcpyDeviceToDevice)
#elif defined __DXL_OPENACC
call acc_memcpy_device_f(c_loc(array_out), c_loc(array_in), int(storage_size(array_out,kind=int64) * &
size(array_out,kind=int64) / 8_int64, c_size_t))
#elif defined __DXL_OPENMP_GPU
if (present(device_id_out)) then
device_id_out_local = device_id_out
else
device_id_out_local = omp_get_default_device()
endif
if (present(device_id_in)) then
device_id_in_local = device_id_in
else
device_id_in_local = omp_get_default_device()
endif
ierr = int( omp_target_memcpy(c_loc(array_out), c_loc(array_in), &
int(storage_size(array_out,kind=int64) *size(array_out,kind=int64) / 8_int64, c_size_t), &
int(0,c_size_t), int(0,c_size_t), device_id_out_local, device_id_in_local), kind=int32)
#else
! host2host fall-back
array_out=array_in
#endif
!
end subroutine {{p.name}}_devxlib_memcpy_d2d_{{t[0]|lower}}{{d}}d_p
!
{%- endfor %}
{%- endfor %}
{%- endfor %}
endsubmodule devxlib_memcpy_d2d
|