File: mem_zero_detect_avx2.asm

package info (click to toggle)
libisal 2.31.1-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 5,776 kB
  • sloc: asm: 44,577; ansic: 42,149; sh: 915; makefile: 622; pascal: 345
file content (172 lines) | stat: -rw-r--r-- 4,382 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2018 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%include "reg_sizes.asm"

; ELF64 build: register map for the System V AMD64 calling convention.
;   arg0..arg5 - integer argument registers, in argument order
;   tmp/tmpb   - r11 and its low byte (scratch)
;   tmp3       - alias of arg4 (r8)
;   return     - rax, the integer return register
; func(x) emits the entry label followed by `endbranch`; `endbranch` is not
; defined in this file (presumably a macro from reg_sizes.asm -- confirm).
; FUNC_SAVE and FUNC_RESTORE are defined empty for this format.
%ifidn __OUTPUT_FORMAT__, elf64
 %define arg0  rdi
 %define arg1  rsi
 %define arg2  rdx
 %define arg3  rcx
 %define arg4  r8
 %define arg5  r9
 %define tmp   r11
 %define tmpb  r11b
 %define tmp3  arg4
 %define return rax
 %define func(x) x: endbranch
 %define FUNC_SAVE
 %define FUNC_RESTORE
%endif

; Win64 build: register map for the Microsoft x64 calling convention.
;   arg0..arg3 - the four integer argument registers, in argument order
;                (arg4/arg5 are not defined for this format)
;   tmp/tmpb   - r11 and its low byte (scratch)
;   tmp3       - r10 (scratch)
;   return     - rax, the integer return register
; func(x) opens the function with `proc_frame` and FUNC_SAVE closes the
; prolog with `end_prolog`; neither is defined in this file (presumably
; they come from reg_sizes.asm or the assembler's win64 support -- confirm).
; FUNC_SAVE saves no registers, so FUNC_RESTORE has an empty body.
%ifidn __OUTPUT_FORMAT__, win64
 %define arg0  rcx
 %define arg1  rdx
 %define arg2  r8
 %define arg3  r9
 %define tmp   r11
 %define tmpb  r11b
 %define tmp3  r10
 %define return rax
 %define func(x) proc_frame x
 %macro FUNC_SAVE 0
	end_prolog
 %endmacro
 %macro FUNC_RESTORE 0
 %endmacro
%endif

; Symbolic names used by the routine body.
;   src  - first argument register
;   len  - second argument register
;   tmp0 - third argument register, reused as scratch
;   tmp1 - fourth argument register, reused as scratch
%define src arg0
%define	len arg1
%define tmp0 arg2
%define tmp1 arg3

; Workaround for YASM
; The smartalign package and ALIGNMODE directive are only requested when
; HAVE_AS_KNOWS_AVX512 is defined (presumably set by the build system for
; assemblers that accept these NASM directives -- confirm).
%ifdef HAVE_AS_KNOWS_AVX512
%use smartalign
ALIGNMODE P6
%endif

; Make memory operands RIP-relative unless stated otherwise.
default rel

; Assemble 64-bit code into the .text section.
[bits 64]
section .text
align 32	; maximize mu-ops cache usage
;-----------------------------------------------------------------------
; mem_zero_detect_avx2 - test whether a memory region contains only zeros
;
; In:    src (arg0) = start of the region
;        len (arg1) = region size in bytes (0 is allowed)
; Out:   eax = 0 if all len bytes are zero (or len == 0), 1 otherwise
; Clobb: rax, src, len, tmp0, tmp1, ymm0-ymm2, flags
;
; Strategy
;   len >= 128 : OR four 32-byte vectors per 128-byte block and compare the
;                result against zero. The first 128 bytes are checked, then
;                src is advanced by len % 128 so the remaining len / 128
;                blocks end exactly at src + len (blocks may overlap the
;                head, which is harmless for a zero test).
;   len <  128 : 16-byte chunks using two 64-bit loads, then a byte loop
;                for the final len % 16 bytes. No vector registers are used.
;-----------------------------------------------------------------------
mk_global  mem_zero_detect_avx2, function
func(mem_zero_detect_avx2)
	FUNC_SAVE
	cmp	len, 127
	jbe	.mem_z_small_block

	; len >= 128: check the first 128 bytes
	vpxor	xmm2, xmm2, xmm2	; ymm2 = 0, used as comparison reference
	vmovdqu	ymm0, [src]
	vpor	ymm0, ymm0, [src+32]
	vmovdqu	ymm1, [src+64]
	vpor	ymm1, ymm1, [src+96]
	vpor	ymm0, ymm0, ymm1
	vpcmpeqb ymm0, ymm2, ymm0	; 0xFF in every byte lane that is zero
	vpmovmskb DWORD(tmp0), ymm0
	not	DWORD(tmp0)		; tmp0 != 0 iff some byte lane is non-zero
	mov	DWORD(tmp1), DWORD(len)
	and	DWORD(tmp1), 127
	add	src, tmp1		; skip len % 128; those bytes lie inside the head block
	shr	len, 7			; len = number of 128-byte blocks to scan (>= 1 here)
	test	DWORD(tmp0), DWORD(tmp0)
	jnz	.return			; non-zero byte found in the head block

align 16
.mem_z_loop:
	; Invariant: len = blocks remaining (> 0), src = start of next block
	vmovdqu	ymm0, [src]
	vpor	ymm0, ymm0, [src+32]
	vmovdqu	ymm1, [src+64]
	vpor	ymm1, ymm1, [src+96]
	add	src, 128
	xor	DWORD(tmp1), DWORD(tmp1)
	sub	len, 1
	setz	BYTE(tmp1)		; tmp1 = 1 when this was the last block
	vpor	ymm0, ymm0, ymm1
	vpcmpeqb ymm0, ymm2, ymm0
	vpmovmskb DWORD(tmp0), ymm0
	not	DWORD(tmp0)		; tmp0 != 0 iff some byte lane is non-zero
	; Continue only if (not last block) AND (block is all zero).
	; Both operands are zero-extended 32-bit values, so the 64-bit sum is
	; zero only when both are zero. A 32-bit add would wrap to zero for
	; tmp1 == 1, tmp0 == 0xFFFFFFFF and run past the end of the buffer.
	add	tmp1, tmp0
	jz	.mem_z_loop

.return:
	xor	eax, eax
	test	tmp0, tmp0
	setnz	al			; eax = (tmp0 != 0)
	vzeroupper			; leave upper YMM halves clean for SSE code in the caller
	FUNC_RESTORE
	ret


align 16
.mem_z_small_block:
	; len < 128
	xor	DWORD(tmp0), DWORD(tmp0)
	movzx	DWORD(tmp1), BYTE(len)	; remember len (fits in one byte) for the tail
	cmp	DWORD(len), 16
	jb	.mem_z_small_check_zero
	; 16 <= len < 128
	shr	DWORD(len), 4		; len = number of 16-byte chunks (1..7)
	xor	eax, eax ; alignment
.mem_z_small_block_loop:
	xor	eax, eax
	mov	tmp0, [src]
	or	tmp0, [src+8]		; tmp0 != 0 iff the 16-byte chunk has a non-zero byte
	sub	DWORD(len), 1
	setz	al			; rax = 1 when this was the last chunk
	add	src, 16
	; Continue only if (not last chunk) AND (chunk is all zero).
	; OR is used instead of ADD: tmp0 is a full 64-bit value, so
	; 1 + 0xFFFFFFFFFFFFFFFF would wrap to zero and overrun the buffer.
	or	rax, tmp0
	jz	.mem_z_small_block_loop

	test	tmp0, tmp0
	jnz	.return_small
	movzx	DWORD(len), BYTE(tmp1)	; restore original len for the tail check

.mem_z_small_check_zero:
	xor	DWORD(tmp0), DWORD(tmp0)
	and	DWORD(len), 15		; len = trailing bytes not covered by 16-byte chunks
	jz	.return_small
.mem_z_small_byte_loop:
	movzx	eax, byte [src]
	add	src, 1
	or	DWORD(tmp0), eax	; accumulate any non-zero byte
	sub	DWORD(len), 1
	jnz	.mem_z_small_byte_loop
.return_small:
	xor	eax, eax
	test	tmp0, tmp0
	setnz	al			; eax = (tmp0 != 0)
	FUNC_RESTORE
	ret

endproc_frame