1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132
|
/*
* Copyright (C) 2020-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
R"===(
#define ALIGNED4(ptr) __builtin_assume(((size_t)ptr&0b11) == 0)
__kernel void CopyBufferToBufferBytes(
const __global uchar* pSrc,
__global uchar* pDst,
offset_t srcOffsetInBytes,
offset_t dstOffsetInBytes,
offset_t bytesToRead )
{
ALIGNED4(pSrc);
ALIGNED4(pDst);
pSrc += ( srcOffsetInBytes + get_global_id(0) );
pDst += ( dstOffsetInBytes + get_global_id(0) );
pDst[ 0 ] = pSrc[ 0 ];
}
__kernel void CopyBufferToBufferLeftLeftover(
const __global uchar* pSrc,
__global uchar* pDst,
offset_t srcOffsetInBytes,
offset_t dstOffsetInBytes)
{
ALIGNED4(pSrc);
ALIGNED4(pDst);
idx_t gid = get_global_id(0);
pDst[ gid + dstOffsetInBytes ] = pSrc[ gid + srcOffsetInBytes ];
}
__kernel void CopyBufferToBufferMiddle(
const __global uint* pSrc,
__global uint* pDst,
offset_t srcOffsetInBytes,
offset_t dstOffsetInBytes)
{
ALIGNED4(pSrc);
ALIGNED4(pDst);
idx_t gid = get_global_id(0);
pDst += dstOffsetInBytes >> 2;
pSrc += srcOffsetInBytes >> 2;
uint4 loaded = vload4(gid, pSrc);
vstore4(loaded, gid, pDst);
}
__kernel void CopyBufferToBufferMiddleMisaligned(
__global const uint* pSrc,
__global uint* pDst,
offset_t srcOffsetInBytes,
offset_t dstOffsetInBytes,
uint misalignmentInBits)
{
ALIGNED4(pSrc);
ALIGNED4(pDst);
idx_t gid = get_global_id(0);
pDst += dstOffsetInBytes >> 2;
pSrc += srcOffsetInBytes >> 2;
const uint4 src0 = vload4(gid, pSrc);
const uint4 src1 = vload4((gid + 1), pSrc);
uint4 result;
result.x = (src0.x >> misalignmentInBits) | (src0.y << (32 - misalignmentInBits));
result.y = (src0.y >> misalignmentInBits) | (src0.z << (32 - misalignmentInBits));
result.z = (src0.z >> misalignmentInBits) | (src0.w << (32 - misalignmentInBits));
result.w = (src0.w >> misalignmentInBits) | (src1.x << (32 - misalignmentInBits));
vstore4(result, gid, pDst);
}
__kernel void CopyBufferToBufferRightLeftover(
const __global uchar* pSrc,
__global uchar* pDst,
offset_t srcOffsetInBytes,
offset_t dstOffsetInBytes)
{
ALIGNED4(pSrc);
ALIGNED4(pDst);
idx_t gid = get_global_id(0);
pDst[ gid + dstOffsetInBytes ] = pSrc[ gid + srcOffsetInBytes ];
}
__kernel void copyBufferToBufferBytesSingle(__global uchar *dst, const __global uchar *src) {
ALIGNED4(dst);
ALIGNED4(src);
idx_t gid = get_global_id(0);
dst[gid] = (uchar)(src[gid]);
}
__kernel void CopyBufferToBufferSideRegion(
__global uchar* pDst,
const __global uchar* pSrc,
idx_t len,
offset_t dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
offset_t srcSshOffset // Offset needed in case ptr has been adjusted for SSH alignment
)
{
ALIGNED4(pSrc);
ALIGNED4(pDst);
idx_t gid = get_global_id(0);
__global uchar* pDstWithOffset = (__global uchar*)((__global uchar*)pDst + dstSshOffset);
__global uchar* pSrcWithOffset = (__global uchar*)((__global uchar*)pSrc + srcSshOffset);
if (gid < len) {
pDstWithOffset[ gid ] = pSrcWithOffset[ gid ];
}
}
__kernel void CopyBufferToBufferMiddleRegion(
__global uint* pDst,
const __global uint* pSrc,
idx_t elems,
offset_t dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
offset_t srcSshOffset // Offset needed in case ptr has been adjusted for SSH alignment
)
{
ALIGNED4(pSrc);
ALIGNED4(pDst);
idx_t gid = get_global_id(0);
__global uint* pDstWithOffset = (__global uint*)((__global uchar*)pDst + dstSshOffset);
__global uint* pSrcWithOffset = (__global uint*)((__global uchar*)pSrc + srcSshOffset);
if (gid < elems) {
uint4 loaded = vload4(gid, pSrcWithOffset);
vstore4(loaded, gid, pDstWithOffset);
}
}
)==="
|