/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright 2023 Linus Torvalds <torvalds@linux-foundation.org>
*/
#include <linux/export.h>
#include <linux/linkage.h>
#include <asm/asm.h>
/*
* copy_user_nocache - Uncached memory copy with exception handling
*
* This copies from user space into kernel space, but the kernel
* space accesses can take a machine check exception, so they too
* need exception handling.
*
* Note: only 32-bit and 64-bit stores have non-temporal versions,
* and we only use aligned versions. Any unaligned parts at the
* start or end of the copy will be done using normal cached stores.
*
* Input:
* rdi destination
* rsi source
* edx count
*
* Output:
* rax uncopied bytes or 0 if successful.
*/
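/*
* A rough C-level view of this routine (an illustrative sketch only;
* the authoritative prototype lives in the uaccess headers):
*
*   unsigned long __copy_user_nocache(void *dst,
*                                     const void __user *src,
*                                     unsigned size);
*
* It returns the number of bytes that could not be copied, i.e. 0 on
* complete success, matching the register description above.
*/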
SYM_FUNC_START(__copy_user_nocache)
/* If the destination is not 8-byte aligned, we'll have to align it */
testb $7,%dil
jne .Lalign
.Lis_aligned:
cmp $64,%edx
jb .Lquadwords
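/*
* Main unrolled loop: each iteration moves 64 bytes as two rounds of
* four 8-byte loads from user space followed by four non-temporal
* 8-byte stores to the (by now 8-byte aligned) kernel destination.
*/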
.p2align 4,0x90
.Lunrolled:
10: movq (%rsi),%r8
11: movq 8(%rsi),%r9
12: movq 16(%rsi),%r10
13: movq 24(%rsi),%r11
20: movnti %r8,(%rdi)
21: movnti %r9,8(%rdi)
22: movnti %r10,16(%rdi)
23: movnti %r11,24(%rdi)
30: movq 32(%rsi),%r8
31: movq 40(%rsi),%r9
32: movq 48(%rsi),%r10
33: movq 56(%rsi),%r11
40: movnti %r8,32(%rdi)
41: movnti %r9,40(%rdi)
42: movnti %r10,48(%rdi)
43: movnti %r11,56(%rdi)
addq $64,%rsi
addq $64,%rdi
sub $64,%edx
cmp $64,%edx
jae .Lunrolled
/*
* The first set of user mode loads has been done
* without any stores, so if one of them faults we
* can simply retry via the non-unrolled loop.
*/
_ASM_EXTABLE_UA(10b, .Lquadwords)
_ASM_EXTABLE_UA(11b, .Lquadwords)
_ASM_EXTABLE_UA(12b, .Lquadwords)
_ASM_EXTABLE_UA(13b, .Lquadwords)
/*
* The second set of user mode loads has been done
* after 32 bytes were already stored to the
* destination, so we need to take that into
* account before falling back to the non-unrolled loop.
*/
_ASM_EXTABLE_UA(30b, .Lfixup32)
_ASM_EXTABLE_UA(31b, .Lfixup32)
_ASM_EXTABLE_UA(32b, .Lfixup32)
_ASM_EXTABLE_UA(33b, .Lfixup32)
/*
* An exception on a write means that we're
* done, but we need to update the count
* depending on where in the unrolled loop
* we were.
*/
_ASM_EXTABLE_UA(20b, .Ldone0)
_ASM_EXTABLE_UA(21b, .Ldone8)
_ASM_EXTABLE_UA(22b, .Ldone16)
_ASM_EXTABLE_UA(23b, .Ldone24)
_ASM_EXTABLE_UA(40b, .Ldone32)
_ASM_EXTABLE_UA(41b, .Ldone40)
_ASM_EXTABLE_UA(42b, .Ldone48)
_ASM_EXTABLE_UA(43b, .Ldone56)
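/*
* Tail loop: copy one aligned quadword at a time (normal load from
* user space, non-temporal store to the kernel side) while at least
* 8 bytes remain.
*/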
.Lquadwords:
cmp $8,%edx
jb .Llong
50: movq (%rsi),%rax
51: movnti %rax,(%rdi)
addq $8,%rsi
addq $8,%rdi
sub $8,%edx
jmp .Lquadwords
/*
* If we fail on the last full quadword, we will
* not try to do any byte-wise cached accesses.
* We will try to do one more 4-byte uncached
* one, though.
*/
_ASM_EXTABLE_UA(50b, .Llast4)
_ASM_EXTABLE_UA(51b, .Ldone0)
.Llong:
test $4,%dl
je .Lword
60: movl (%rsi),%eax
61: movnti %eax,(%rdi)
addq $4,%rsi
addq $4,%rdi
sub $4,%edx
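/*
* All non-temporal stores on this path have been issued by now; the
* remaining 2- and 1-byte tail pieces use regular cached stores, so
* fence the weakly-ordered stores here before finishing up.
*/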
.Lword:
sfence
test $2,%dl
je .Lbyte
70: movw (%rsi),%ax
71: movw %ax,(%rdi)
addq $2,%rsi
addq $2,%rdi
sub $2,%edx
.Lbyte:
test $1,%dl
je .Ldone
80: movb (%rsi),%al
81: movb %al,(%rdi)
dec %edx
.Ldone:
mov %edx,%eax
RET
/*
* If we fail on the last few tail bytes, we won't
* bother with any fixups. It's dead, Jim. Note
* that there's no need for an 'sfence' for any
* of this, since the exception will have been
* serializing.
*/
_ASM_EXTABLE_UA(60b, .Ldone)
_ASM_EXTABLE_UA(61b, .Ldone)
_ASM_EXTABLE_UA(70b, .Ldone)
_ASM_EXTABLE_UA(71b, .Ldone)
_ASM_EXTABLE_UA(80b, .Ldone)
_ASM_EXTABLE_UA(81b, .Ldone)
/*
* This is the "head needs aligning" case when
* the destination isn't 8-byte aligned. The
* 4-byte case can be done uncached, but any
* smaller alignment is done with regular stores.
*/
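/*
* Worked example (assuming the count is large enough at each step):
* a destination ending in ...011b gets one cached byte store (now
* ...100b), skips the 2-byte step, then one 4-byte non-temporal store
* brings it to 8-byte alignment before rejoining .Lis_aligned.
*/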
.Lalign:
test $1,%dil
je .Lalign_word
test %edx,%edx
je .Ldone
90: movb (%rsi),%al
91: movb %al,(%rdi)
inc %rsi
inc %rdi
dec %edx
.Lalign_word:
test $2,%dil
je .Lalign_long
cmp $2,%edx
jb .Lbyte
92: movw (%rsi),%ax
93: movw %ax,(%rdi)
addq $2,%rsi
addq $2,%rdi
sub $2,%edx
.Lalign_long:
test $4,%dil
je .Lis_aligned
cmp $4,%edx
jb .Lword
94: movl (%rsi),%eax
95: movnti %eax,(%rdi)
addq $4,%rsi
addq $4,%rdi
sub $4,%edx
jmp .Lis_aligned
/*
* If we fail on the initial alignment accesses,
* we're all done. Again, no point in trying to
* do byte-by-byte probing if the 4-byte load
* fails - we're not doing any uncached accesses
* any more.
*/
_ASM_EXTABLE_UA(90b, .Ldone)
_ASM_EXTABLE_UA(91b, .Ldone)
_ASM_EXTABLE_UA(92b, .Ldone)
_ASM_EXTABLE_UA(93b, .Ldone)
_ASM_EXTABLE_UA(94b, .Ldone)
_ASM_EXTABLE_UA(95b, .Ldone)
/*
* Exception table fixups for write faults in the middle of the copy
*/
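/*
* Each label below peels off another 8 bytes that had already been
* stored before the faulting write. Example: a fault at label 22
* (the store of bytes 16-23 of the current chunk) is routed to
* .Ldone16, which subtracts the 16 bytes already written from the
* outstanding count before returning it.
*/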
.Ldone56: sub $8,%edx
.Ldone48: sub $8,%edx
.Ldone40: sub $8,%edx
.Ldone32: sub $8,%edx
.Ldone24: sub $8,%edx
.Ldone16: sub $8,%edx
.Ldone8: sub $8,%edx
.Ldone0:
mov %edx,%eax
RET
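/*
* A load fault in the second half of the unrolled loop: the first
* 32 bytes of this chunk were already stored, so account for them
* and let the non-unrolled quadword loop retry the rest.
*/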
.Lfixup32:
addq $32,%rsi
addq $32,%rdi
sub $32,%edx
jmp .Lquadwords
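/*
* The 8-byte user load in the quadword loop faulted: make one last
* 4-byte uncached attempt, fence it, and return whatever is still
* uncopied.
*/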
.Llast4:
52: movl (%rsi),%eax
53: movnti %eax,(%rdi)
sfence
sub $4,%edx
mov %edx,%eax
RET
_ASM_EXTABLE_UA(52b, .Ldone0)
_ASM_EXTABLE_UA(53b, .Ldone0)
SYM_FUNC_END(__copy_user_nocache)
EXPORT_SYMBOL(__copy_user_nocache)