1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360
|
dnl Intel Pentium MMX mpn_mul_1 -- mpn by limb multiplication.
dnl Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
dnl
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or
dnl modify it under the terms of the GNU Lesser General Public License as
dnl published by the Free Software Foundation; either version 3 of the
dnl License, or (at your option) any later version.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful,
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
dnl Lesser General Public License for more details.
dnl
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
C P5: 12.0 for 32-bit multiplier
C 7.0 for 16-bit multiplier
C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
C mp_limb_t multiplier);
C
C When the multiplier is 16 bits some special case MMX code is used. Small
C multipliers might arise reasonably often from mpz_mul_ui etc. If the size
C is odd there's roughly a 5 cycle penalty, so times for say size==7 and
C size==8 end up being quite close. If src isn't aligned to an 8 byte
C boundary then one limb is processed separately with roughly a 5 cycle
C penalty, so in that case it's say size==8 and size==9 which are close.
C
C Alternatives:
C
C MMX is not believed to be of any use for 32-bit multipliers, since for
C instance the current method would just have to be more or less duplicated
C for the high and low halves of the multiplier, and would probably
C therefore run at about 14 cycles, which is slower than the plain integer
C at 12.
C
C Adding the high and low MMX products using integer code seems best. An
C attempt at using paddd and carry bit propagation with pcmpgtd didn't give
C any joy. Perhaps something could be done keeping the values signed and
C thereby avoiding adjustments to make pcmpgtd into an unsigned compare, or
C perhaps not.
C
C Future:
C
C An mpn_mul_1c entrypoint would need a double carry out of the low result
C limb in the 16-bit code, unless it could be assumed the carry fits in 16
C bits, possibly as carry<multiplier, this being true of a big calculation
C done piece by piece. But let's worry about that if/when mul_1c is
C actually used.
defframe(PARAM_MULTIPLIER,16)
defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)
TEXT
ALIGN(8)
PROLOGUE(mpn_mul_1)
deflit(`FRAME',0)
movl PARAM_SIZE, %ecx
movl PARAM_SRC, %edx
cmpl $1, %ecx
jne L(two_or_more)
C one limb only
movl PARAM_MULTIPLIER, %eax
movl PARAM_DST, %ecx
mull (%edx)
movl %eax, (%ecx)
movl %edx, %eax
ret
L(two_or_more):
C eax size
C ebx
C ecx carry
C edx
C esi src
C edi
C ebp
pushl %esi FRAME_pushl()
pushl %edi FRAME_pushl()
movl %edx, %esi C src
movl PARAM_DST, %edi
movl PARAM_MULTIPLIER, %eax
pushl %ebx FRAME_pushl()
leal (%esi,%ecx,4), %esi C src end
leal (%edi,%ecx,4), %edi C dst end
negl %ecx C -size
pushl %ebp FRAME_pushl()
cmpl $65536, %eax
jb L(small)
L(big):
xorl %ebx, %ebx C carry limb
sarl %ecx C -size/2
jnc L(top) C with carry flag clear
C size was odd, process one limb separately
mull 4(%esi,%ecx,8) C m * src[0]
movl %eax, 4(%edi,%ecx,8)
incl %ecx
orl %edx, %ebx C carry limb, and clear carry flag
L(top):
C eax
C ebx carry
C ecx counter, negative
C edx
C esi src end
C edi dst end
C ebp (scratch carry)
adcl $0, %ebx
movl (%esi,%ecx,8), %eax
mull PARAM_MULTIPLIER
movl %edx, %ebp
addl %eax, %ebx
adcl $0, %ebp
movl 4(%esi,%ecx,8), %eax
mull PARAM_MULTIPLIER
movl %ebx, (%edi,%ecx,8)
addl %ebp, %eax
movl %eax, 4(%edi,%ecx,8)
incl %ecx
movl %edx, %ebx
jnz L(top)
adcl $0, %ebx
popl %ebp
movl %ebx, %eax
popl %ebx
popl %edi
popl %esi
ret
L(small):
C Special case for 16-bit multiplier.
C
C eax multiplier
C ebx
C ecx -size
C edx src
C esi src end
C edi dst end
C ebp multiplier
C size<3 not supported here. At size==3 we're already a couple of
C cycles faster, so there's no threshold as such, just use the MMX
C as soon as possible.
cmpl $-3, %ecx
ja L(big)
movd %eax, %mm7 C m
pxor %mm6, %mm6 C initial carry word
punpcklwd %mm7, %mm7 C m replicated 2 times
addl $2, %ecx C -size+2
punpckldq %mm7, %mm7 C m replicated 4 times
andl $4, %edx C test alignment, clear carry flag
movq %mm7, %mm0 C m
jz L(small_entry)
C Source is unaligned, process one limb separately.
C
C Plain integer code is used here, since it's smaller and is about
C the same 13 cycles as an mmx block would be.
C
C An "addl $1,%ecx" doesn't clear the carry flag when size==3, hence
C the use of separate incl and orl.
mull -8(%esi,%ecx,4) C m * src[0]
movl %eax, -8(%edi,%ecx,4) C dst[0]
incl %ecx C one limb processed
movd %edx, %mm6 C initial carry
orl %eax, %eax C clear carry flag
jmp L(small_entry)
C The scheduling here is quite tricky, since so many instructions have
C pairing restrictions. In particular the js won't pair with a movd, and
C can't be paired with an adc since it wants flags from the inc, so
C instructions are rotated to the top of the loop to find somewhere useful
C for it.
C
C Trouble has been taken to avoid overlapping successive loop iterations,
C since that would greatly increase the size of the startup and finishup
C code. Actually there's probably not much advantage to be had from
C overlapping anyway, since the difficulties are mostly with pairing, not
C with latencies as such.
C
C In the comments x represents the src data and m the multiplier (16
C bits, but replicated 4 times).
C
C The m signs calculated in %mm3 are a loop invariant and could be held in
C say %mm5, but that would save only one instruction and hence be no faster.
L(small_top):
C eax l.low, then l.high
C ebx (h.low)
C ecx counter, -size+2 to 0 or 1
C edx (h.high)
C esi &src[size]
C edi &dst[size]
C ebp
C
C %mm0 (high products)
C %mm1 (low products)
C %mm2 (adjust for m using x signs)
C %mm3 (adjust for x using m signs)
C %mm4
C %mm5
C %mm6 h.low, then carry
C %mm7 m replicated 4 times
movd %mm6, %ebx C h.low
psrlq $32, %mm1 C l.high
movd %mm0, %edx C h.high
movq %mm0, %mm6 C new c
adcl %eax, %ebx
incl %ecx
movd %mm1, %eax C l.high
movq %mm7, %mm0
adcl %eax, %edx
movl %ebx, -16(%edi,%ecx,4)
movl %edx, -12(%edi,%ecx,4)
psrlq $32, %mm6 C c
L(small_entry):
pmulhw -8(%esi,%ecx,4), %mm0 C h = (x*m).high
movq %mm7, %mm1
pmullw -8(%esi,%ecx,4), %mm1 C l = (x*m).low
movq %mm7, %mm3
movq -8(%esi,%ecx,4), %mm2 C x
psraw $15, %mm3 C m signs
pand -8(%esi,%ecx,4), %mm3 C x selected by m signs
psraw $15, %mm2 C x signs
paddw %mm3, %mm0 C add x to h if m neg
pand %mm7, %mm2 C m selected by x signs
paddw %mm2, %mm0 C add m to h if x neg
incl %ecx
movd %mm1, %eax C l.low
punpcklwd %mm0, %mm6 C c + h.low << 16
psrlq $16, %mm0 C h.high
js L(small_top)
movd %mm6, %ebx C h.low
psrlq $32, %mm1 C l.high
adcl %eax, %ebx
popl %ebp FRAME_popl()
movd %mm0, %edx C h.high
psrlq $32, %mm0 C l.high
movd %mm1, %eax C l.high
adcl %eax, %edx
movl %ebx, -12(%edi,%ecx,4)
movd %mm0, %eax C c
adcl $0, %eax
movl %edx, -8(%edi,%ecx,4)
orl %ecx, %ecx
jnz L(small_done) C final %ecx==1 means even, ==0 odd
C Size odd, one extra limb to process.
C Plain integer code is used here, since it's smaller and is about
C the same speed as another mmx block would be.
movl %eax, %ecx
movl PARAM_MULTIPLIER, %eax
mull -4(%esi)
addl %ecx, %eax
adcl $0, %edx
movl %eax, -4(%edi)
movl %edx, %eax
L(small_done):
popl %ebx
popl %edi
popl %esi
emms
ret
EPILOGUE()
|