| 12
 3
 4
 5
 6
 7
 8
 9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
 100
 101
 102
 103
 104
 105
 106
 107
 108
 109
 110
 111
 112
 113
 114
 115
 116
 117
 118
 119
 120
 121
 122
 123
 124
 
 | //===----------------------Hexagon builtin routine ------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// An optimized version of a memcpy which is equivalent to the following loop:
//
//   volatile unsigned *dest;
//   unsigned *src;
//
//   for (i = 0; i < num_words; ++i)
//     *dest++ = *src++;
//
// The corresponding C prototype for this function would be
// void hexagon_memcpy_forward_vp4cp4n2(volatile unsigned *dest,
//                                      const unsigned *src,
//                                      unsigned num_words);
//
// *** Both dest and src must be aligned to 32-bit boundaries. ***
// The code does not perform any runtime checks for this, and will fail
// in bad ways if this requirement is not met.
//
// The "forward" in the name refers to the fact that the function copies
// the words going forward in memory.  It is incorrect to use this function
// for cases where the original code copied words in any other order.
//
// *** This function is only for use by the compiler. ***
// The only intended use is for the LLVM compiler to generate calls to
// this function when a mem-copy loop, like the one above, is detected.
  .text
// hexagon_memcpy_forward_vp4cp4n2: copy num_words 32-bit words from src to
// dest, walking both pointers upward in memory.
//
// Inputs:
//   r0: dest
//   r1: src
//   r2: num_words
//
// The copy is done in three phases, each preceded by an l2fetch on src:
//   1. Prolog (.Lprolog):  one word per iteration until src reaches the next
//                          4096-byte boundary or num_words runs out.
//   2. Main (.Lout/.Lpage): one outer iteration per whole 4096-byte page;
//                          the inner loop does a 64-bit load and two 32-bit
//                          stores, 512 times.
//   3. Epilog (.Lepilog):  one word per iteration for the remaining
//                          (fewer than 1024) words.
//
// Registers written: r0-r5 and p0, plus the hardware-loop state set up by
// loop0/loop1.  On exit r0 and r1 point just past the last word written/read.
// Inputs:
//   r0: dest
//   r1: src
//   r2: num_words
  .globl  hexagon_memcpy_forward_vp4cp4n2
  .balign 32
  .type   hexagon_memcpy_forward_vp4cp4n2,@function
hexagon_memcpy_forward_vp4cp4n2:
    // Compute r3 to be the number of words remaining in the current page.
    // At the same time, compute r4 to be the number of 32-byte blocks
    // remaining in the page (for prefetch).
  {
    // r3 = 4096 - src; only its low 12 bits are used below.
    // r5 = num_words / 8 = number of whole 32-byte blocks in the entire copy.
    r3 = sub(##4096, r1)
    r5 = lsr(r2, #3)
  }
  {
    // The low 12 bits of r3 hold the byte distance from src to the next
    // 4096-byte boundary (0 if the address in r1 was already page-aligned).
    // Bits 2..11 are that distance in words; bits 5..11 are that distance
    // in whole 32-byte blocks.
    // Both instructions read the r3 produced by the previous packet:
    // registers written inside a packet are not visible to the other
    // instructions of the same packet unless the .new form is used.
    r3 = extractu(r3, #10, #2)
    r4 = extractu(r3, #7, #5)
  }
  {
    // Clamp both counts to what the copy actually needs:
    // r3 = words to copy in the prolog, r4 = 32-byte blocks to prefetch.
    r3 = minu(r2, r3)
    r4 = minu(r5, r4)
  }
  {
    // Merge the block count into the l2fetch control word.
    // NOTE(review): under the Hexagon l2fetch operand layout, 0x202000
    // appears to encode stride = 32 and width = 32 bytes with the block
    // count in the low byte -- confirm against the architecture manual.
    // If the prolog has nothing to copy (r3 == 0), skip it entirely; in
    // that case the prolog prefetch and the r2 adjustment are not needed.
    r4 = or(r4, ##2105344)      // 2105344 = 0x202000
    p0 = cmp.eq(r3, #0)
    if (p0.new) jump:nt .Lskipprolog
  }
    // Prefetch the part of the source that the prolog will read.
    l2fetch(r1, r4)
  {
    // Hardware loop: run the .Lprolog packet r3 times.
    loop0(.Lprolog, r3)
    r2 = sub(r2, r3)            // r2 = number of words left after the prolog.
  }
  .falign
.Lprolog:
  {
    // Copy one word: load from src, store the freshly loaded value (r4.new)
    // to dest in the same packet, and advance both pointers by 4 bytes.
    r4 = memw(r1++#4)
    memw(r0++#4) = r4.new
  } :endloop0
.Lskipprolog:
    // From here on, either src (r1) is 4096-byte aligned or r2 == 0.
  {
    // Let r3 = number of whole pages left (page = 1024 words).
    // With no whole page left, go straight to the epilog.
    r3 = lsr(r2, #10)
    if (cmp.eq(r3.new, #0)) jump:nt .Lskipmain
  }
  {
    // Outer hardware loop: one iteration per whole page.  loop1 consumes
    // the page count from r3 before r3 is reloaded with the prefetch word
    // in this same packet.
    // NOTE(review): the low byte 0x80 of 0x202080 is 128, i.e. 128 blocks
    // of 32 bytes = one 4096-byte page, assuming the l2fetch layout
    // described for 0x202000 -- confirm.
    loop1(.Lout, r3)
    r2 = extractu(r2, #10, #0)  // r2 = r2 & 1023
    r3 = ##2105472              // r3 = 0x202080 (prefetch info)
  }
    // Iterate over pages.
  .falign
.Lout:
    // Prefetch each individual page.
    l2fetch(r1, r3)
    // Inner hardware loop: 512 iterations of 8 bytes = 4096 bytes.
    loop0(.Lpage, #512)
  .falign
.Lpage:
    // Load two words at once.  The 64-bit load is usable here because src
    // is page-aligned when the main loop runs.
    r5:4 = memd(r1++#8)
  {
    // Store the pair as two separate 32-bit words at dest and dest+4, then
    // advance dest by 8.  Both stores use the r0 value from before the
    // post-increment.  Word stores are used presumably because dest is only
    // required to be 32-bit aligned.
    // This packet closes both the inner (loop0) and the outer (loop1) loop.
    memw(r0++#8) = r4
    memw(r0+#4) = r5
  } :endloop0:endloop1
.Lskipmain:
  {
    // r2 = words still to copy (fewer than 1024 when the main loop ran).
    // Prepare the epilog prefetch word, and return immediately through r31
    // if nothing is left.
    r3 = ##2105344              // r3 = 0x202000 (prefetch info)
    r4 = lsr(r2, #3)            // r4 = number of 32-byte blocks remaining.
    p0 = cmp.eq(r2, #0)
    if (p0.new) jumpr:nt r31
  }
  {
    // Merge the block count into the prefetch word and set up the epilog
    // hardware loop for r2 iterations.
    r3 = or(r3, r4)
    loop0(.Lepilog, r2)
  }
    // Prefetch the tail of the source.
    l2fetch(r1, r3)
  .falign
.Lepilog:
  {
    // Copy one word per iteration, exactly as in the prolog.
    r4 = memw(r1++#4)
    memw(r0++#4) = r4.new
  } :endloop0
    // Return to the caller.
    jumpr r31
.size hexagon_memcpy_forward_vp4cp4n2, . - hexagon_memcpy_forward_vp4cp4n2
 |