1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198
|
!
! Copyright (C) 2022, MaX CoE
! Distributed under the MIT License
! (license terms are at http://opensource.org/licenses/MIT).
!
!--
!
!
! Utility functions to perform host-device memcpy
! using CUDA-Fortran, OpenACC or OpenMP Offload
!
!==================================================================
!==================================================================
! *DO NOT EDIT*: automatically generated from device_memcpy_h2d.jf90
!==================================================================
!==================================================================
!
#include<devxlib_macros.h>
#include<devxlib_defs.h>
!
!=======================================
!
! Note about dimensions:
! The lower bound of the assumed shape array passed to the subroutine is 1
! lbound and range instead refer to the indexing in the parent caller.
!
submodule (devxlib_memcpy) devxlib_memcpy_h2d
implicit none
contains
{%- for t in types %}
{%- for p in kinds[t] %}
{%- for d in range(1,dimensions+1) %}
module subroutine {{p.name}}_devxlib_memcpy_h2d_{{t[0]|lower}}{{d}}d(array_out, array_in, &
{% for dd in range(d) -%}
{{ "range%s"|format(dd+1) }}, {{ "lbound%s"|format(dd+1) }} {% if not loop.last %}, &
{% endif %}{% endfor %})
implicit none
!
{{t}}({{p.val}}) DEV_ATTR, intent(inout) :: array_out({% for dd in range(d) %}:{% if not loop.last %}, {%- endif %}{% endfor %})
{{t}}({{p.val}}), intent(in) :: array_in({% for dd in range(d) %}:{% if not loop.last %}, {%- endif %}{% endfor %})
integer, optional, intent(in) :: {% for dd in range(d) %} {{ "range%s(2)"|format(dd+1) }}{% if not loop.last %}, {%- endif %}{% endfor %}
integer, optional, intent(in) :: {% for dd in range(d) %} {{ "lbound%s"|format(dd+1) }}{% if not loop.last %}, {%- endif %}{% endfor %}
integer :: ierr = 0
{%- for dd in range(d) %}
integer :: d{{dd+1}}_start, d{{dd+1}}_end, d{{dd+1}}_size, d{{dd+1}}_ld
integer :: lbound{{dd+1}}_, range{{dd+1}}_(2)
{%- endfor %}
!
{%- for dd in range(d) %}
lbound{{dd+1}}_=1
if (present(lbound{{dd+1}})) lbound{{dd+1}}_=lbound{{dd+1}}
range{{dd+1}}_=(/1,size(array_out, {{dd+1}})/)
if (present(range{{dd+1}})) range{{dd+1}}_=range{{dd+1}}
!
d{{dd+1}}_start = range{{dd+1}}_(1) -lbound{{dd+1}}_ +1
d{{dd+1}}_end = range{{dd+1}}_(2) -lbound{{dd+1}}_ +1
d{{dd+1}}_size = range{{dd+1}}_(2) -range{{dd+1}}_(1) + 1
d{{dd+1}}_ld = size(array_out, {{dd+1}})
{%- endfor %}
!
#if defined __DXL_CUDAF
{%- if d==1 %}
ierr = cudaMemcpy( array_out(d1_start), array_in(d1_start), d1_size, cudaMemcpyHostToDevice )
{%- elif d==2 %}
ierr = cudaMemcpy2D( array_out(d1_start, d2_start) , d1_ld, array_in(d1_start, d2_start), d1_ld, d1_size, d2_size )
{%- elif d>=3 %}
array_out({% for dd in range(d)%}d{{dd+1}}_start:d{{dd+1}}_end{% if not loop.last %},{%- endif %}{%endfor%}) = &
array_in({% for dd in range(d)%}d{{dd+1}}_start:d{{dd+1}}_end{% if not loop.last %},{%- endif %}{%endfor%})
{%- endif %}
if ( ierr /= 0) call devxlib_error("{{p.name}}_devxlib_memcpy_h2d_{{t[0]|lower}}{{d}}d",cudaGetErrorString(ierr),ierr)
#elif defined __DXL_OPENACC || defined __DXL_OPENMP_GPU
array_out({% for dd in range(d)%}d{{dd+1}}_start:d{{dd+1}}_end{% if not loop.last %},{%- endif %}{%endfor%}) = &
array_in({% for dd in range(d)%}d{{dd+1}}_start:d{{dd+1}}_end{% if not loop.last %},{%- endif %}{%endfor%})
!DEV_ACC update device(array_out)
!DEV_OMPGPU target update to(array_out)
!
#else
array_out({% for dd in range(d)%}d{{dd+1}}_start:d{{dd+1}}_end{% if not loop.last %},{%- endif %}{%endfor%}) = &
array_in({% for dd in range(d)%}d{{dd+1}}_start:d{{dd+1}}_end{% if not loop.last %},{%- endif %}{%endfor%})
! call devxlib_error("{{p.name}}_devxlib_memcpy_h2d_{{t[0]|lower}}{{d}}d","unexpected error",10)
#endif
!
end subroutine {{p.name}}_devxlib_memcpy_h2d_{{t[0]|lower}}{{d}}d
!
{%- endfor %}
{%- endfor %}
{%- endfor %}
{%- for t in types %}
{%- for p in kinds[t] %}
{%- for d in range(1,dimensions+1) %}
module subroutine {{p.name}}_devxlib_memcpy_h2d_async_{{t[0]|lower}}{{d}}d(array_out, array_in, async_id, &
{% for dd in range(d) -%}
{{ "range%s"|format(dd+1) }}, {{ "lbound%s"|format(dd+1) }} {% if not loop.last %}, &
{% endif %}{% endfor %} )
implicit none
!
{{t}}({{p.val}}), target, intent(inout) DEV_ATTR :: array_out({% for dd in range(d) %}:{% if not loop.last %}, {%- endif %}{% endfor %})
{{t}}({{p.val}}), target, intent(in) :: array_in({% for dd in range(d) %}:{% if not loop.last %}, {%- endif %}{% endfor %})
integer(kind=dxl_async_kind), intent(in) :: async_id
integer, optional, intent(in) :: {% for dd in range(d) %} {{ "range%s(2)"|format(dd+1) }}{% if not loop.last %}, {%- endif %}{% endfor %}
integer, optional, intent(in) :: {% for dd in range(d) %} {{ "lbound%s"|format(dd+1) }}{% if not loop.last %}, {%- endif %}{% endfor %}
type(c_ptr) :: ptr1,ptr2
integer :: ierr = 0
{%- for dd in range(d) %}
integer :: d{{dd+1}}_start, d{{dd+1}}_end, d{{dd+1}}_size, d{{dd+1}}_ld
integer :: lbound{{dd+1}}_, range{{dd+1}}_(2)
{%- endfor %}
!
{%- for dd in range(d) %}
lbound{{dd+1}}_=1
if (present(lbound{{dd+1}})) lbound{{dd+1}}_=lbound{{dd+1}}
range{{dd+1}}_=(/1,size(array_out, {{dd+1}})/)
if (present(range{{dd+1}})) range{{dd+1}}_=range{{dd+1}}
!
d{{dd+1}}_start = range{{dd+1}}_(1) -lbound{{dd+1}}_ +1
d{{dd+1}}_end = range{{dd+1}}_(2) -lbound{{dd+1}}_ +1
d{{dd+1}}_size = range{{dd+1}}_(2) -range{{dd+1}}_(1) + 1
d{{dd+1}}_ld = size(array_out, {{dd+1}})
{%- endfor %}
!
#if defined __DXL_CUDAF
ierr = CudaMemcpyAsync(array_out, array_in, &
count={% for dd in range(d)%}d{{dd+1}}_size{% if not loop.last %}*{%- endif %}{%endfor%},&
kdir=cudaMemcpyDeviceToHost,stream=async_id)
if ( ierr /= 0) call devxlib_error("{{p.name}}_memcpy_h2d_async_{{t[0]|lower}}{{d}}d",cudaGetErrorString(ierr),ierr)
#elif defined __DXL_OPENACC || defined __DXL_OPENMP_GPU
ptr1=c_loc(array_out({% for dd in range(d)%}d{{dd+1}}_start{% if not loop.last %},{%- endif %}{%endfor%}))
ptr2=c_loc( array_in({% for dd in range(d)%}d{{dd+1}}_start{% if not loop.last %},{%- endif %}{%endfor%}))
!
if ( .not. c_associated( ptr1, ptr2 )) then
array_out({% for dd in range(d)%}d{{dd+1}}_start:d{{dd+1}}_end{% if not loop.last %},{%- endif %}{%endfor%}) = &
array_in({% for dd in range(d)%}d{{dd+1}}_start:d{{dd+1}}_end{% if not loop.last %},{%- endif %}{%endfor%})
endif
!DEV_ACC update device(array_out) async(async_id)
!DEV_OMPGPU target update to(array_out)
#else
array_out({% for dd in range(d)%}d{{dd+1}}_start:d{{dd+1}}_end{% if not loop.last %},{%- endif %}{%endfor%}) = &
array_in({% for dd in range(d)%}d{{dd+1}}_start:d{{dd+1}}_end{% if not loop.last %},{%- endif %}{%endfor%})
! call devxlib_error("{{p.name}}_devxlib_memcpy_h2d_async_{{t[0]|lower}}{{d}}d","unexpected error",12)
#endif
!
end subroutine {{p.name}}_devxlib_memcpy_h2d_async_{{t[0]|lower}}{{d}}d
!
{%- endfor %}
{%- endfor %}
{%- endfor %}
{%- for t in types %}
{%- for p in kinds[t] %}
{%- for d in range(1,dimensions+1) %}
module subroutine {{p.name}}_devxlib_memcpy_h2d_{{t[0]|lower}}{{d}}d_p(&
#if defined __DXL_OPENMP_GPU
array_out, array_in, device_id)
#else
array_out, array_in)
#endif
implicit none
!
{{t}}({{p.val}}), pointer, contiguous, intent(inout) DEV_ATTR :: array_out({% for dd in range(d) %}:{% if not loop.last %}, {%- endif %}{% endfor %})
{{t}}({{p.val}}), target, intent(in) :: array_in({% for dd in range(d) %}:{% if not loop.last %}, {%- endif %}{% endfor %})
#if defined __DXL_OPENMP_GPU
integer, optional, intent(in) :: device_id
#endif
!
#if defined __DXL_CUDAF || defined __DXL_OPENMP_GPU
integer :: ierr = 0
#endif
!
#if defined __DXL_CUDAF
ierr = cudaMemcpy_f(c_loc(array_out), c_loc(array_in), int(storage_size(array_out,kind=int64) * &
size(array_out,kind=int64) / 8_int64, c_size_t), cudaMemcpyHostToDevice)
#elif defined __DXL_OPENACC
call acc_memcpy_to_device_f(c_loc(array_out), c_loc(array_in), int(storage_size(array_out,kind=int64) * &
size(array_out,kind=int64) / 8_int64, c_size_t))
#elif defined __DXL_OPENMP_GPU
if (present(device_id)) then
ierr = int(omp_target_memcpy(c_loc(array_out), c_loc(array_in), int(storage_size(array_out,kind=int64) * &
size(array_out,kind=int64) / 8_int64, c_size_t), int(0,c_size_t), int(0,c_size_t), device_id, omp_get_initial_device()), kind=int32)
else
ierr = int(omp_target_memcpy(c_loc(array_out), c_loc(array_in), int(storage_size(array_out,kind=int64) * &
size(array_out,kind=int64) / 8_int64, c_size_t), int(0,c_size_t), int(0,c_size_t), omp_get_default_device(), omp_get_initial_device()), kind=int32)
endif
#else
! host2host fall-back
array_out=array_in
#endif
!
end subroutine {{p.name}}_devxlib_memcpy_h2d_{{t[0]|lower}}{{d}}d_p
!
{%- endfor %}
{%- endfor %}
{%- endfor %}
endsubmodule devxlib_memcpy_h2d
|