File: cm_mem_os_sse4_impl.cpp

package info (click to toggle)
intel-media-driver 21.1.1%2Bdfsg1-1
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 108,772 kB
  • sloc: cpp: 1,078,942; ansic: 164,231; asm: 45,336; python: 554; sh: 177; makefile: 13
file content (129 lines) | stat: -rw-r--r-- 4,765 bytes parent folder | download | duplicates (8)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
/*
* Copyright (c) 2020, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
//!
//! \file      cm_mem_os_sse4_impl.cpp
//! \brief     Contains the SSE4.1 implementation of the CM fast memory copy from write-combined (WC) memory
//!

#include "cm_mem_os_sse4_impl.h"

#if defined(__SSE4_1__)

#include "cm_mem.h"
#include <mmintrin.h>

void CmFastMemCopyFromWC_SSE4( void* dst, const void* src, const size_t bytes )
{
    // Cache pointers to memory
    uint8_t *tempDst = (uint8_t*)dst;
    uint8_t *tempSrc = (uint8_t*)src;

    size_t count = bytes;

    if( count >= CM_CPU_FASTCOPY_THRESHOLD )
    {
        //Streaming Load must be 16-byte aligned but should
        //be 64-byte aligned for optimal performance
        const size_t doubleHexWordAlignBytes =
            GetAlignmentOffset( tempSrc, sizeof(DHWORD) );

        // Copy portion of the source memory that is not aligned
        if( doubleHexWordAlignBytes )
        {
            CmSafeMemCopy( tempDst, tempSrc, doubleHexWordAlignBytes );

            tempDst += doubleHexWordAlignBytes;
            tempSrc += doubleHexWordAlignBytes;
            count -= doubleHexWordAlignBytes;
        }

        CM_ASSERT( IsAligned( tempSrc, sizeof(DHWORD) ) == true );

        // Get the number of bytes to be copied (rounded down to nearets DHWORD)
        const size_t doubleHexWordsToCopy = count / sizeof(DHWORD);

        if( doubleHexWordsToCopy )
        {
            // Determine if the destination address is aligned
            const bool isDstDoubleQuadWordAligned =
                IsAligned( tempDst, sizeof(DQWORD) );

            __m128i* mmSrc = (__m128i*)(tempSrc);
            __m128i* mmDst = reinterpret_cast<__m128i*>(tempDst);
            __m128i  xmm0, xmm1, xmm2, xmm3;

            if( isDstDoubleQuadWordAligned )
            {
                for( size_t i=0; i<doubleHexWordsToCopy; i++ )
                {
                    // Sync the WC memory data before issuing the MOVNTDQA instruction.
                    _mm_mfence();
                    xmm0 = _mm_stream_load_si128(mmSrc);
                    xmm1 = _mm_stream_load_si128(mmSrc + 1);
                    xmm2 = _mm_stream_load_si128(mmSrc + 2);
                    xmm3 = _mm_stream_load_si128(mmSrc + 3);
                    mmSrc += 4;

                    _mm_store_si128(mmDst, xmm0);
                    _mm_store_si128(mmDst + 1, xmm1);
                    _mm_store_si128(mmDst + 2, xmm2);
                    _mm_store_si128(mmDst + 3, xmm3);
                    mmDst += 4;

                    tempDst += sizeof(DHWORD);
                    tempSrc += sizeof(DHWORD);
                    count -= sizeof(DHWORD);
                }
            }
            else
            {
                for( size_t i=0; i<doubleHexWordsToCopy; i++ )
                {
                    // Sync the WC memory data before issuing the MOVNTDQA instruction.
                    _mm_mfence();
                    xmm0 = _mm_stream_load_si128(mmSrc);
                    xmm1 = _mm_stream_load_si128(mmSrc + 1);
                    xmm2 = _mm_stream_load_si128(mmSrc + 2);
                    xmm3 = _mm_stream_load_si128(mmSrc + 3);
                    mmSrc += 4;

                    _mm_storeu_si128(mmDst, xmm0);
                    _mm_storeu_si128(mmDst + 1, xmm1);
                    _mm_storeu_si128(mmDst + 2, xmm2);
                    _mm_storeu_si128(mmDst + 3, xmm3);
                    mmDst += 4;
                    tempDst += sizeof(DHWORD);
                    tempSrc += sizeof(DHWORD);
                    count -= sizeof(DHWORD);
                }
            }
        }
    }

    // Copy remaining uint8_t(s)
    if( count )
    {
        CmSafeMemCopy( tempDst, tempSrc, count );
    }
}

#endif // __SSE4_1__