1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551
|
; Copyright (c) 2007-2008 CSIRO
; Copyright (c) 2007-2009 Xiph.Org Foundation
; Copyright (c) 2013 Parrot
; Written by Aurélien Zanelli
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
;
; - Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
;
; - Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in the
; documentation and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
AREA |.text|, CODE, READONLY
GET celt/arm/armopts.s
IF OPUS_ARM_MAY_HAVE_EDSP
EXPORT celt_pitch_xcorr_edsp
ENDIF
IF OPUS_ARM_MAY_HAVE_NEON
EXPORT celt_pitch_xcorr_neon
ENDIF
IF OPUS_ARM_MAY_HAVE_NEON
; Compute sum[k]=sum(x[j]*y[j+k],j=0...len-1), k=0...3
xcorr_kernel_neon PROC
xcorr_kernel_neon_start
; input:
; r3 = int len
; r4 = opus_val16 *x
; r5 = opus_val16 *y
; q0 = opus_val32 sum[4]
; output:
; q0 = opus_val32 sum[4]
; preserved: r0-r3, r6-r11, d2, q4-q7, q9-q15
; internal usage:
; r12 = int j
; d3 = y_3|y_2|y_1|y_0
; q2 = y_B|y_A|y_9|y_8|y_7|y_6|y_5|y_4
; q3 = x_7|x_6|x_5|x_4|x_3|x_2|x_1|x_0
; q8 = scratch
;
; Load y[0...3]
; This requires len>0 to always be valid (which we assert in the C code).
VLD1.16 {d5}, [r5]!
SUBS r12, r3, #8
BLE xcorr_kernel_neon_process4
; Process 8 samples at a time.
; This loop loads one y value more than we actually need. Therefore we have to
; stop as soon as there are 8 or fewer samples left (instead of 7), to avoid
; reading past the end of the array.
xcorr_kernel_neon_process8
; This loop has 19 total instructions (10 cycles to issue, minimum), with
; - 2 cycles of ARM insrtuctions,
; - 10 cycles of load/store/byte permute instructions, and
; - 9 cycles of data processing instructions.
; On a Cortex A8, we dual-issue the maximum amount (9 cycles) between the
; latter two categories, meaning the whole loop should run in 10 cycles per
; iteration, barring cache misses.
;
; Load x[0...7]
VLD1.16 {d6, d7}, [r4]!
; Unlike VMOV, VAND is a data processing instruction (and doesn't get
; assembled to VMOV, like VORR would), so it dual-issues with the prior VLD1.
VAND d3, d5, d5
SUBS r12, r12, #8
; Load y[4...11]
VLD1.16 {d4, d5}, [r5]!
VMLAL.S16 q0, d3, d6[0]
VEXT.16 d16, d3, d4, #1
VMLAL.S16 q0, d4, d7[0]
VEXT.16 d17, d4, d5, #1
VMLAL.S16 q0, d16, d6[1]
VEXT.16 d16, d3, d4, #2
VMLAL.S16 q0, d17, d7[1]
VEXT.16 d17, d4, d5, #2
VMLAL.S16 q0, d16, d6[2]
VEXT.16 d16, d3, d4, #3
VMLAL.S16 q0, d17, d7[2]
VEXT.16 d17, d4, d5, #3
VMLAL.S16 q0, d16, d6[3]
VMLAL.S16 q0, d17, d7[3]
BGT xcorr_kernel_neon_process8
; Process 4 samples here if we have > 4 left (still reading one extra y value).
xcorr_kernel_neon_process4
ADDS r12, r12, #4
BLE xcorr_kernel_neon_process2
; Load x[0...3]
VLD1.16 d6, [r4]!
; Use VAND since it's a data processing instruction again.
VAND d4, d5, d5
SUB r12, r12, #4
; Load y[4...7]
VLD1.16 d5, [r5]!
VMLAL.S16 q0, d4, d6[0]
VEXT.16 d16, d4, d5, #1
VMLAL.S16 q0, d16, d6[1]
VEXT.16 d16, d4, d5, #2
VMLAL.S16 q0, d16, d6[2]
VEXT.16 d16, d4, d5, #3
VMLAL.S16 q0, d16, d6[3]
; Process 2 samples here if we have > 2 left (still reading one extra y value).
xcorr_kernel_neon_process2
ADDS r12, r12, #2
BLE xcorr_kernel_neon_process1
; Load x[0...1]
VLD2.16 {d6[],d7[]}, [r4]!
; Use VAND since it's a data processing instruction again.
VAND d4, d5, d5
SUB r12, r12, #2
; Load y[4...5]
VLD1.32 {d5[]}, [r5]!
VMLAL.S16 q0, d4, d6
VEXT.16 d16, d4, d5, #1
; Replace bottom copy of {y5,y4} in d5 with {y3,y2} from d4, using VSRI
; instead of VEXT, since it's a data-processing instruction.
VSRI.64 d5, d4, #32
VMLAL.S16 q0, d16, d7
; Process 1 sample using the extra y value we loaded above.
xcorr_kernel_neon_process1
; Load next *x
VLD1.16 {d6[]}, [r4]!
ADDS r12, r12, #1
; y[0...3] are left in d5 from prior iteration(s) (if any)
VMLAL.S16 q0, d5, d6
MOVLE pc, lr
; Now process 1 last sample, not reading ahead.
; Load last *y
VLD1.16 {d4[]}, [r5]!
VSRI.64 d4, d5, #16
; Load last *x
VLD1.16 {d6[]}, [r4]!
VMLAL.S16 q0, d4, d6
MOV pc, lr
ENDP
; opus_val32 celt_pitch_xcorr_neon(opus_val16 *_x, opus_val16 *_y,
; opus_val32 *xcorr, int len, int max_pitch, int arch)
celt_pitch_xcorr_neon PROC
; input:
; r0 = opus_val16 *_x
; r1 = opus_val16 *_y
; r2 = opus_val32 *xcorr
; r3 = int len
; output:
; r0 = int maxcorr
; internal usage:
; r4 = opus_val16 *x (for xcorr_kernel_neon())
; r5 = opus_val16 *y (for xcorr_kernel_neon())
; r6 = int max_pitch
; r12 = int j
; q15 = int maxcorr[4] (q15 is not used by xcorr_kernel_neon())
; ignored:
; int arch
STMFD sp!, {r4-r6, lr}
LDR r6, [sp, #16]
VMOV.S32 q15, #1
; if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done
SUBS r6, r6, #4
BLT celt_pitch_xcorr_neon_process4_done
celt_pitch_xcorr_neon_process4
; xcorr_kernel_neon parameters:
; r3 = len, r4 = _x, r5 = _y, q0 = {0, 0, 0, 0}
MOV r4, r0
MOV r5, r1
VEOR q0, q0, q0
; xcorr_kernel_neon only modifies r4, r5, r12, and q0...q3.
; So we don't save/restore any other registers.
BL xcorr_kernel_neon_start
SUBS r6, r6, #4
VST1.32 {q0}, [r2]!
; _y += 4
ADD r1, r1, #8
VMAX.S32 q15, q15, q0
; if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done
BGE celt_pitch_xcorr_neon_process4
; We have less than 4 sums left to compute.
celt_pitch_xcorr_neon_process4_done
ADDS r6, r6, #4
; Reduce maxcorr to a single value
VMAX.S32 d30, d30, d31
VPMAX.S32 d30, d30, d30
; if (max_pitch <= 0) goto celt_pitch_xcorr_neon_done
BLE celt_pitch_xcorr_neon_done
; Now compute each remaining sum one at a time.
celt_pitch_xcorr_neon_process_remaining
MOV r4, r0
MOV r5, r1
VMOV.I32 q0, #0
SUBS r12, r3, #8
BLT celt_pitch_xcorr_neon_process_remaining4
; Sum terms 8 at a time.
celt_pitch_xcorr_neon_process_remaining_loop8
; Load x[0...7]
VLD1.16 {q1}, [r4]!
; Load y[0...7]
VLD1.16 {q2}, [r5]!
SUBS r12, r12, #8
VMLAL.S16 q0, d4, d2
VMLAL.S16 q0, d5, d3
BGE celt_pitch_xcorr_neon_process_remaining_loop8
; Sum terms 4 at a time.
celt_pitch_xcorr_neon_process_remaining4
ADDS r12, r12, #4
BLT celt_pitch_xcorr_neon_process_remaining4_done
; Load x[0...3]
VLD1.16 {d2}, [r4]!
; Load y[0...3]
VLD1.16 {d3}, [r5]!
SUB r12, r12, #4
VMLAL.S16 q0, d3, d2
celt_pitch_xcorr_neon_process_remaining4_done
; Reduce the sum to a single value.
VADD.S32 d0, d0, d1
VPADDL.S32 d0, d0
ADDS r12, r12, #4
BLE celt_pitch_xcorr_neon_process_remaining_loop_done
; Sum terms 1 at a time.
celt_pitch_xcorr_neon_process_remaining_loop1
VLD1.16 {d2[]}, [r4]!
VLD1.16 {d3[]}, [r5]!
SUBS r12, r12, #1
VMLAL.S16 q0, d2, d3
BGT celt_pitch_xcorr_neon_process_remaining_loop1
celt_pitch_xcorr_neon_process_remaining_loop_done
VST1.32 {d0[0]}, [r2]!
VMAX.S32 d30, d30, d0
SUBS r6, r6, #1
; _y++
ADD r1, r1, #2
; if (--max_pitch > 0) goto celt_pitch_xcorr_neon_process_remaining
BGT celt_pitch_xcorr_neon_process_remaining
celt_pitch_xcorr_neon_done
VMOV.32 r0, d30[0]
LDMFD sp!, {r4-r6, pc}
ENDP
ENDIF
IF OPUS_ARM_MAY_HAVE_EDSP
; This will get used on ARMv7 devices without NEON, so it has been optimized
; to take advantage of dual-issuing where possible.
xcorr_kernel_edsp PROC
xcorr_kernel_edsp_start
; input:
; r3 = int len
; r4 = opus_val16 *_x (must be 32-bit aligned)
; r5 = opus_val16 *_y (must be 32-bit aligned)
; r6...r9 = opus_val32 sum[4]
; output:
; r6...r9 = opus_val32 sum[4]
; preserved: r0-r5
; internal usage
; r2 = int j
; r12,r14 = opus_val16 x[4]
; r10,r11 = opus_val16 y[4]
STMFD sp!, {r2,r4,r5,lr}
LDR r10, [r5], #4 ; Load y[0...1]
SUBS r2, r3, #4 ; j = len-4
LDR r11, [r5], #4 ; Load y[2...3]
BLE xcorr_kernel_edsp_process4_done
LDR r12, [r4], #4 ; Load x[0...1]
; Stall
xcorr_kernel_edsp_process4
; The multiplies must issue from pipeline 0, and can't dual-issue with each
; other. Every other instruction here dual-issues with a multiply, and is
; thus "free". There should be no stalls in the body of the loop.
SMLABB r6, r12, r10, r6 ; sum[0] = MAC16_16(sum[0],x_0,y_0)
LDR r14, [r4], #4 ; Load x[2...3]
SMLABT r7, r12, r10, r7 ; sum[1] = MAC16_16(sum[1],x_0,y_1)
SUBS r2, r2, #4 ; j-=4
SMLABB r8, r12, r11, r8 ; sum[2] = MAC16_16(sum[2],x_0,y_2)
SMLABT r9, r12, r11, r9 ; sum[3] = MAC16_16(sum[3],x_0,y_3)
SMLATT r6, r12, r10, r6 ; sum[0] = MAC16_16(sum[0],x_1,y_1)
LDR r10, [r5], #4 ; Load y[4...5]
SMLATB r7, r12, r11, r7 ; sum[1] = MAC16_16(sum[1],x_1,y_2)
SMLATT r8, r12, r11, r8 ; sum[2] = MAC16_16(sum[2],x_1,y_3)
SMLATB r9, r12, r10, r9 ; sum[3] = MAC16_16(sum[3],x_1,y_4)
LDRGT r12, [r4], #4 ; Load x[0...1]
SMLABB r6, r14, r11, r6 ; sum[0] = MAC16_16(sum[0],x_2,y_2)
SMLABT r7, r14, r11, r7 ; sum[1] = MAC16_16(sum[1],x_2,y_3)
SMLABB r8, r14, r10, r8 ; sum[2] = MAC16_16(sum[2],x_2,y_4)
SMLABT r9, r14, r10, r9 ; sum[3] = MAC16_16(sum[3],x_2,y_5)
SMLATT r6, r14, r11, r6 ; sum[0] = MAC16_16(sum[0],x_3,y_3)
LDR r11, [r5], #4 ; Load y[6...7]
SMLATB r7, r14, r10, r7 ; sum[1] = MAC16_16(sum[1],x_3,y_4)
SMLATT r8, r14, r10, r8 ; sum[2] = MAC16_16(sum[2],x_3,y_5)
SMLATB r9, r14, r11, r9 ; sum[3] = MAC16_16(sum[3],x_3,y_6)
BGT xcorr_kernel_edsp_process4
xcorr_kernel_edsp_process4_done
ADDS r2, r2, #4
BLE xcorr_kernel_edsp_done
LDRH r12, [r4], #2 ; r12 = *x++
SUBS r2, r2, #1 ; j--
; Stall
SMLABB r6, r12, r10, r6 ; sum[0] = MAC16_16(sum[0],x,y_0)
LDRHGT r14, [r4], #2 ; r14 = *x++
SMLABT r7, r12, r10, r7 ; sum[1] = MAC16_16(sum[1],x,y_1)
SMLABB r8, r12, r11, r8 ; sum[2] = MAC16_16(sum[2],x,y_2)
SMLABT r9, r12, r11, r9 ; sum[3] = MAC16_16(sum[3],x,y_3)
BLE xcorr_kernel_edsp_done
SMLABT r6, r14, r10, r6 ; sum[0] = MAC16_16(sum[0],x,y_1)
SUBS r2, r2, #1 ; j--
SMLABB r7, r14, r11, r7 ; sum[1] = MAC16_16(sum[1],x,y_2)
LDRH r10, [r5], #2 ; r10 = y_4 = *y++
SMLABT r8, r14, r11, r8 ; sum[2] = MAC16_16(sum[2],x,y_3)
LDRHGT r12, [r4], #2 ; r12 = *x++
SMLABB r9, r14, r10, r9 ; sum[3] = MAC16_16(sum[3],x,y_4)
BLE xcorr_kernel_edsp_done
SMLABB r6, r12, r11, r6 ; sum[0] = MAC16_16(sum[0],tmp,y_2)
CMP r2, #1 ; j--
SMLABT r7, r12, r11, r7 ; sum[1] = MAC16_16(sum[1],tmp,y_3)
LDRH r2, [r5], #2 ; r2 = y_5 = *y++
SMLABB r8, r12, r10, r8 ; sum[2] = MAC16_16(sum[2],tmp,y_4)
LDRHGT r14, [r4] ; r14 = *x
SMLABB r9, r12, r2, r9 ; sum[3] = MAC16_16(sum[3],tmp,y_5)
BLE xcorr_kernel_edsp_done
SMLABT r6, r14, r11, r6 ; sum[0] = MAC16_16(sum[0],tmp,y_3)
LDRH r11, [r5] ; r11 = y_6 = *y
SMLABB r7, r14, r10, r7 ; sum[1] = MAC16_16(sum[1],tmp,y_4)
SMLABB r8, r14, r2, r8 ; sum[2] = MAC16_16(sum[2],tmp,y_5)
SMLABB r9, r14, r11, r9 ; sum[3] = MAC16_16(sum[3],tmp,y_6)
xcorr_kernel_edsp_done
LDMFD sp!, {r2,r4,r5,pc}
ENDP
celt_pitch_xcorr_edsp PROC
; input:
; r0 = opus_val16 *_x (must be 32-bit aligned)
; r1 = opus_val16 *_y (only needs to be 16-bit aligned)
; r2 = opus_val32 *xcorr
; r3 = int len
; output:
; r0 = maxcorr
; internal usage
; r4 = opus_val16 *x
; r5 = opus_val16 *y
; r6 = opus_val32 sum0
; r7 = opus_val32 sum1
; r8 = opus_val32 sum2
; r9 = opus_val32 sum3
; r1 = int max_pitch
; r12 = int j
; ignored:
; int arch
STMFD sp!, {r4-r11, lr}
MOV r5, r1
LDR r1, [sp, #36]
MOV r4, r0
TST r5, #3
; maxcorr = 1
MOV r0, #1
BEQ celt_pitch_xcorr_edsp_process1u_done
; Compute one sum at the start to make y 32-bit aligned.
SUBS r12, r3, #4
; r14 = sum = 0
MOV r14, #0
LDRH r8, [r5], #2
BLE celt_pitch_xcorr_edsp_process1u_loop4_done
LDR r6, [r4], #4
MOV r8, r8, LSL #16
celt_pitch_xcorr_edsp_process1u_loop4
LDR r9, [r5], #4
SMLABT r14, r6, r8, r14 ; sum = MAC16_16(sum, x_0, y_0)
LDR r7, [r4], #4
SMLATB r14, r6, r9, r14 ; sum = MAC16_16(sum, x_1, y_1)
LDR r8, [r5], #4
SMLABT r14, r7, r9, r14 ; sum = MAC16_16(sum, x_2, y_2)
SUBS r12, r12, #4 ; j-=4
SMLATB r14, r7, r8, r14 ; sum = MAC16_16(sum, x_3, y_3)
LDRGT r6, [r4], #4
BGT celt_pitch_xcorr_edsp_process1u_loop4
MOV r8, r8, LSR #16
celt_pitch_xcorr_edsp_process1u_loop4_done
ADDS r12, r12, #4
celt_pitch_xcorr_edsp_process1u_loop1
LDRHGE r6, [r4], #2
; Stall
SMLABBGE r14, r6, r8, r14 ; sum = MAC16_16(sum, *x, *y)
SUBSGE r12, r12, #1
LDRHGT r8, [r5], #2
BGT celt_pitch_xcorr_edsp_process1u_loop1
; Restore _x
SUB r4, r4, r3, LSL #1
; Restore and advance _y
SUB r5, r5, r3, LSL #1
; maxcorr = max(maxcorr, sum)
CMP r0, r14
ADD r5, r5, #2
MOVLT r0, r14
SUBS r1, r1, #1
; xcorr[i] = sum
STR r14, [r2], #4
BLE celt_pitch_xcorr_edsp_done
celt_pitch_xcorr_edsp_process1u_done
; if (max_pitch < 4) goto celt_pitch_xcorr_edsp_process2
SUBS r1, r1, #4
BLT celt_pitch_xcorr_edsp_process2
celt_pitch_xcorr_edsp_process4
; xcorr_kernel_edsp parameters:
; r3 = len, r4 = _x, r5 = _y, r6...r9 = sum[4] = {0, 0, 0, 0}
MOV r6, #0
MOV r7, #0
MOV r8, #0
MOV r9, #0
BL xcorr_kernel_edsp_start ; xcorr_kernel_edsp(_x, _y+i, xcorr+i, len)
; maxcorr = max(maxcorr, sum0, sum1, sum2, sum3)
CMP r0, r6
; _y+=4
ADD r5, r5, #8
MOVLT r0, r6
CMP r0, r7
MOVLT r0, r7
CMP r0, r8
MOVLT r0, r8
CMP r0, r9
MOVLT r0, r9
STMIA r2!, {r6-r9}
SUBS r1, r1, #4
BGE celt_pitch_xcorr_edsp_process4
celt_pitch_xcorr_edsp_process2
ADDS r1, r1, #2
BLT celt_pitch_xcorr_edsp_process1a
SUBS r12, r3, #4
; {r10, r11} = {sum0, sum1} = {0, 0}
MOV r10, #0
MOV r11, #0
LDR r8, [r5], #4
BLE celt_pitch_xcorr_edsp_process2_loop_done
LDR r6, [r4], #4
LDR r9, [r5], #4
celt_pitch_xcorr_edsp_process2_loop4
SMLABB r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_0, y_0)
LDR r7, [r4], #4
SMLABT r11, r6, r8, r11 ; sum1 = MAC16_16(sum1, x_0, y_1)
SUBS r12, r12, #4 ; j-=4
SMLATT r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_1, y_1)
LDR r8, [r5], #4
SMLATB r11, r6, r9, r11 ; sum1 = MAC16_16(sum1, x_1, y_2)
LDRGT r6, [r4], #4
SMLABB r10, r7, r9, r10 ; sum0 = MAC16_16(sum0, x_2, y_2)
SMLABT r11, r7, r9, r11 ; sum1 = MAC16_16(sum1, x_2, y_3)
SMLATT r10, r7, r9, r10 ; sum0 = MAC16_16(sum0, x_3, y_3)
LDRGT r9, [r5], #4
SMLATB r11, r7, r8, r11 ; sum1 = MAC16_16(sum1, x_3, y_4)
BGT celt_pitch_xcorr_edsp_process2_loop4
celt_pitch_xcorr_edsp_process2_loop_done
ADDS r12, r12, #2
BLE celt_pitch_xcorr_edsp_process2_1
LDR r6, [r4], #4
; Stall
SMLABB r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_0, y_0)
LDR r9, [r5], #4
SMLABT r11, r6, r8, r11 ; sum1 = MAC16_16(sum1, x_0, y_1)
SUB r12, r12, #2
SMLATT r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_1, y_1)
MOV r8, r9
SMLATB r11, r6, r9, r11 ; sum1 = MAC16_16(sum1, x_1, y_2)
celt_pitch_xcorr_edsp_process2_1
LDRH r6, [r4], #2
ADDS r12, r12, #1
; Stall
SMLABB r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_0, y_0)
LDRHGT r7, [r4], #2
SMLABT r11, r6, r8, r11 ; sum1 = MAC16_16(sum1, x_0, y_1)
BLE celt_pitch_xcorr_edsp_process2_done
LDRH r9, [r5], #2
SMLABT r10, r7, r8, r10 ; sum0 = MAC16_16(sum0, x_0, y_1)
SMLABB r11, r7, r9, r11 ; sum1 = MAC16_16(sum1, x_0, y_2)
celt_pitch_xcorr_edsp_process2_done
; Restore _x
SUB r4, r4, r3, LSL #1
; Restore and advance _y
SUB r5, r5, r3, LSL #1
; maxcorr = max(maxcorr, sum0)
CMP r0, r10
ADD r5, r5, #2
MOVLT r0, r10
SUB r1, r1, #2
; maxcorr = max(maxcorr, sum1)
CMP r0, r11
; xcorr[i] = sum
STR r10, [r2], #4
MOVLT r0, r11
STR r11, [r2], #4
celt_pitch_xcorr_edsp_process1a
ADDS r1, r1, #1
BLT celt_pitch_xcorr_edsp_done
SUBS r12, r3, #4
; r14 = sum = 0
MOV r14, #0
BLT celt_pitch_xcorr_edsp_process1a_loop_done
LDR r6, [r4], #4
LDR r8, [r5], #4
LDR r7, [r4], #4
LDR r9, [r5], #4
celt_pitch_xcorr_edsp_process1a_loop4
SMLABB r14, r6, r8, r14 ; sum = MAC16_16(sum, x_0, y_0)
SUBS r12, r12, #4 ; j-=4
SMLATT r14, r6, r8, r14 ; sum = MAC16_16(sum, x_1, y_1)
LDRGE r6, [r4], #4
SMLABB r14, r7, r9, r14 ; sum = MAC16_16(sum, x_2, y_2)
LDRGE r8, [r5], #4
SMLATT r14, r7, r9, r14 ; sum = MAC16_16(sum, x_3, y_3)
LDRGE r7, [r4], #4
LDRGE r9, [r5], #4
BGE celt_pitch_xcorr_edsp_process1a_loop4
celt_pitch_xcorr_edsp_process1a_loop_done
ADDS r12, r12, #2
LDRGE r6, [r4], #4
LDRGE r8, [r5], #4
; Stall
SMLABBGE r14, r6, r8, r14 ; sum = MAC16_16(sum, x_0, y_0)
SUBGE r12, r12, #2
SMLATTGE r14, r6, r8, r14 ; sum = MAC16_16(sum, x_1, y_1)
ADDS r12, r12, #1
LDRHGE r6, [r4], #2
LDRHGE r8, [r5], #2
; Stall
SMLABBGE r14, r6, r8, r14 ; sum = MAC16_16(sum, *x, *y)
; maxcorr = max(maxcorr, sum)
CMP r0, r14
; xcorr[i] = sum
STR r14, [r2], #4
MOVLT r0, r14
celt_pitch_xcorr_edsp_done
LDMFD sp!, {r4-r11, pc}
ENDP
ENDIF
END
|