File: memcmp16_arm64.S

/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64
 */

#ifndef ART_RUNTIME_ARCH_ARM64_MEMCMP16_ARM64_S_
#define ART_RUNTIME_ARCH_ARM64_MEMCMP16_ARM64_S_

#include "asm_support_arm64.S"

/* Parameters and result.  */
#define src1        x0
#define src2        x1
#define limit       x2
#define result      x0

/* Internal variables.  */
#define data1       x3
#define data1w      w3
#define data2       x4
#define data2w      w4
#define has_nul     x5
#define diff        x6
#define endloop     x7
#define tmp1        x8
#define tmp2        x9
#define tmp3        x10
#define limit_wd    x12
#define mask        x13

// WARNING: If you change this code to use x14 and x15, you must also change
//          art_quick_string_compareto, which relies on these temps being unused.
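
// For reference, a sketch of the contract this routine implements (the
// authoritative prototype lives in ART's runtime/arch/memcmp16.h):
//   int32_t __memcmp16(const uint16_t* src1, const uint16_t* src2, size_t count);
// It returns 0 when the first `count` half-words of the two buffers are equal,
// and otherwise the signed difference of the first mismatching pair.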

ENTRY __memcmp16
  cbz     limit, .Lret0
  lsl     limit, limit, #1  /* Half-words to bytes.  */
  eor     tmp1, src1, src2
  tst     tmp1, #7
  b.ne    .Lmisaligned8
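  /* Reaching here, src1 and src2 agree in their low 3 address bits, so they
     are mutually aligned (though possibly off an 8-byte boundary).  */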
  ands    tmp1, src1, #7
  b.ne    .Lmutual_align
  add     limit_wd, limit, #7
  lsr     limit_wd, limit_wd, #3
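  /* limit_wd = ceil(limit / 8): the number of 8-byte words to compare; the
     final partial word, if any, is masked off after the loop.  */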
  /* Start of performance-critical section  -- one 64B cache line.  */
.Lloop_aligned:
  ldr     data1, [src1], #8
  ldr     data2, [src2], #8
.Lstart_realigned:
  subs    limit_wd, limit_wd, #1
  eor     diff, data1, data2  /* Non-zero if differences found.  */
  csinv   endloop, diff, xzr, ne  /* Last Dword or differences.  */
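  /* csinv: endloop = (limit_wd != 0) ? diff : ~xzr, so an exhausted word
     count forces loop exit even when no difference has been seen.  */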
  cbz     endloop, .Lloop_aligned
  /* End of performance-critical section  -- one 64B cache line.  */

  /* Not reached the limit, must have found a diff.  */
  cbnz    limit_wd, .Lnot_limit

  /* Limit % 8 == 0 => all bytes significant.  */
  ands    limit, limit, #7
  b.eq    .Lnot_limit

  lsl     limit, limit, #3  /* Bytes -> bits.  */
  mov     mask, #~0
  lsl     mask, mask, limit
  bic     data1, data1, mask
  bic     data2, data2, mask
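  /* mask has ones at and above bit 8*(significant bytes), so the BICs clear
     the bytes past the caller's limit; on little-endian those are the
     higher-addressed elements sitting in the upper bits.  */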

.Lnot_limit:

  // Byte-reverse diff. An exact bit-reverse is not needed: we only have to
  // locate the differing half-word, not the exact bit.
  rev     diff, diff
  // On little-endian, the first differing element sat in the lowest differing
  // byte, which REV moved to the top, so CLZ yields a bit index inside that byte.
  clz     diff, diff
  // Clear the low four bits to round the index down to a half-word boundary
  // (BFI from XZR stands in for the BIC-with-immediate that A64 lacks).
  bfi     diff, xzr, #0, #4
  // Create a 16b mask
  mov     mask, #0xFFFF
  // Shift to the right half-word.
  lsr     data1, data1, diff
  lsr     data2, data2, diff
  // Mask the lowest half-word.
  and     data1, data1, mask
  and     data2, data2, mask
  // Compute the difference; both operands are zero-extended 16-bit values, so
  // w0 receives the signed difference of the mismatching half-words.
  sub     result, data1, data2
  ret

.Lmutual_align:
  /* Sources are mutually aligned, but are not currently at an
     alignment boundary.  Round down the addresses and then mask off
     the bytes that precede the start point.  */
  bic     src1, src1, #7
  bic     src2, src2, #7
  add     limit, limit, tmp1  /* Adjust the limit for the extra.  */
  lsl     tmp1, tmp1, #3    /* Bytes beyond alignment -> bits.  */
  ldr     data1, [src1], #8
  neg     tmp1, tmp1    /* Negate: the LSR below shifts by tmp1 & 63, i.e. 64 - bits-beyond-alignment.  */
  ldr     data2, [src2], #8
  mov     tmp2, #~0
  /* Little-endian.  Early bytes are at LSB.  */
  lsr     tmp2, tmp2, tmp1  /* Shift (tmp1 & 63).  */
  add     limit_wd, limit, #7
  orr     data1, data1, tmp2
  orr     data2, data2, tmp2
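  /* Forcing the pre-start bytes of both words to all-ones makes them compare
     equal, so the first loop iteration ignores bytes before the true start.  */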
  lsr     limit_wd, limit_wd, #3
  b       .Lstart_realigned

.Lret0:
  mov     result, #0
  ret

  .p2align 6
.Lmisaligned8:
  sub     limit, limit, #1
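  /* Pre-decrement so the counter goes negative, clearing C, exactly when the
     final half-word is consumed; the CCMP below uses that to force loop exit.  */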
1:
  /* Perhaps we can do better than this.  */
  ldrh    data1w, [src1], #2
  ldrh    data2w, [src2], #2
  subs    limit, limit, #2
  ccmp    data1w, data2w, #0, cs  /* NZCV = 0b0000.  */
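  /* While C is set (count not exhausted), CCMP compares the data; otherwise it
     sets NZCV to 0, leaving Z clear so the B.EQ below falls through.  */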
  b.eq    1b
  sub     result, data1, data2
  ret
END __memcmp16

#endif  // ART_RUNTIME_ARCH_ARM64_MEMCMP16_ARM64_S_