1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255
|
/*
* mulsf3.s
*
* Floating point multiply, single precision: r0r1 *= r2r3
*
* The contents of this file are subject to the Mozilla Public License
* Version 1.0 (the "License"); you may not use this file except in
* compliance with the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS"
* basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
* License for the specific language governing rights and limitations
* under the License.
*
* The Original Code is Librcx floating point code, released May 27, 1999.
*
* The Initial Developer of the Original Code is Kekoa Proudfoot.
* Portions created by Kekoa Proudfoot are Copyright (C) 1999
* Kekoa Proudfoot. All Rights Reserved.
*
* Contributor(s): Kekoa Proudfoot <kekoa@graphics.stanford.edu>
*/
; possible optimizations:
; - use push/pop to save exponent and sign on stack
; - might it have been faster/simpler to inline a 24 iteration multiply?
; - this only requires 3b + 3b + 4b + 1b in regs to implement?
.section .text
;;
;; function: mulsf3
;; purpose: single precision floating point multiply
;; input: float in r0r1 and float at sp+2
;; output: float in r0r1
;; calls: ___startsf (prologue), mul_24_8 (three times), ___joinsf (pack)
;; exit: every path leaves through ___finishsf (epilogue)
;; special cases decided here, before any arithmetic:
;; - first operand NaN: first operand returned unchanged
;; - else second operand NaN: second operand returned unchanged
;; - infinity times zero, in either order: NaN (7fffffff) returned
;; all other inputs take the exponent add / mantissa multiply path below
;;
.global ___mulsf3
___mulsf3:
; Invoke the prologue to expand input operands
jsr ___startsf
; At this point, registers/stack contain the following:
; r0r1 - first operand
; r2h - second operand flags
; r2l - second operand sign
; r3h - first operand flags
; r3l - first operand sign
; r4 - first operand exponent
; r5r6 - first operand mantissa
; sp+0 - second operand exponent
; sp+2 - second operand flags (same as r2h)
; sp+3 - second operand sign (same as r2l)
; sp+4 - second operand mantissa
; sp+16 - second operand
; Note on flag bits: 0=zero, 1=inf, 2=nan
; Is the first operand a NaN?
; If yes, return the first operand (the value already in r0r1)
bld #2,r3h ; carry = nan flag of first operand
; Hack! return_jmp is the "bra return" that closes the second-operand NaN
; case below; this conditional branch lands on that instruction instead of
; targeting return directly.
; NOTE(review): presumably done to keep the conditional branch target close;
; confirm before restructuring.
bcs return_jmp ; carry set indicates true
; Is the second operand a NaN?
bld #2,r2h ; carry = nan flag of second operand
bcc endif_0 ; carry clear indicates false
; Return the second operand (which we need to load off stack)
mov.w @(16,r7),r0 ; set return value to second operand (sp+16)
mov.w @(18,r7),r1 ; second word of second operand (sp+18)
; Hack! (shared by both NaN cases, see above)
return_jmp:
bra return
endif_0:
; Is the first operand infinity and the second operand zero?
; Or is the second operand infinity and the first operand zero?
; Each test is: carry = inf flag of one operand AND zero flag of the other
bld #1,r3h ; load inf flag of first operand to carry
band #0,r2h ; and carry with zero flag of second operand
bcs if_1 ; carry set indicates true
bld #1,r2h ; load inf flag of second operand to carry
band #0,r3h ; and carry with zero flag of first operand
bcc endif_1 ; carry clear indicates false
if_1:
; Zero multiplied by infinity, so return NaN
mov.w #0x7fff,r0 ; set return value to NaN (7fffffff)
mov.w #0xffff,r1
bra return
endif_1:
; At this point, we no longer need the following:
; r0r1 - first operand
; r2h - second operand flags
; r3h - first operand flags
; Compute new exponent
; The bias is removed with a 16-bit add of -127 done one byte at a time:
; low byte first, then the high byte with the carry from the low byte
mov.w @r7,r0 ; load second operand exponent (sp+0) to temp
add.w r0,r4 ; add temp to exponent of first operand (r4)
add.b #-127,r4l ; subtract bias
addx.b #-1,r4h ; finish subtraction
; Compute new sign
xor.b r2l,r3l ; xor second sign (r2l) into first sign (r3l)
; At this point r2l is also free (and therefore all of r2 is free)
; We now want to multiply the two mantissas
; We need r3 and r4 free to do this
; (r2r3 will hold a copy of the first mantissa; mul_24_8 takes its
; single-byte operand in r4l and overwrites r4h)
; Save result exponent and sign to stack
; These reuse the slots that held the second operand exponent and sign
mov.w r4,@r7 ; sp+0 is result exponent
mov.b r3l,@(3,r7) ; sp+3 is result sign
; Given two 1.23 mantissas, compute a 2.30 product with a sticky lsb
; Use three 24 by 8 multiplies to multiply two 24-bit mantissas
; Numbering the bytes of the full product from 0 (least significant):
; first multiply (lower byte) lands in bytes 0-3
; second multiply (middle byte) lands in bytes 1-4
; third multiply (upper byte) lands in bytes 2-5
; Bytes 2-5 become the result in r5r6; bytes 0-1 only feed the sticky bit
; Save first mantissa to r2r3
mov.w r5,r2 ; copy r5r6 to r2r3
mov.w r6,r3 ; note multiply done using r5r6
; Perform first multiply (first mantissa * lower byte of second mantissa)
mov.b @(7,r7),r4l ; load lower byte of second mantissa to r4l
bsr mul_24_8 ; multiply r5l r6h r6l by r4l
; Save lower four result bytes in r0r1
mov.w r5,r0 ; copy r5r6 to r0r1
mov.w r6,r1 ; r0h r0l r1h r1l now hold bytes 3 2 1 0
; Perform second multiply (first mantissa * middle byte of second mantissa)
mov.w r2,r5 ; copy first mantissa back to r5r6 from r2r3
mov.w r3,r6
mov.b @(6,r7),r4l ; load middle byte of second mantissa to r4l
bsr mul_24_8 ; multiply r5l r6h r6l by r4l
; Combine new result into old and propagate carry into fifth result byte
; The new product sits one byte higher, so its lowest byte meets byte 1
add.b r6l,r1h ; add corresponding bytes
addx.b r6h,r0l
addx.b r5l,r0h
addx.b #0,r5h ; propagate carry
; Combine lower two result bytes into a single sticky byte
or.b r1l,r1h ; compute new sticky byte
; Save fifth result byte
; r1l is free now that byte 0 has been folded into the sticky byte
mov.b r5h,r1l ; store fifth result byte
; Perform final multiply (first mantissa * upper byte of second mantissa)
mov.w r2,r5 ; copy first mantissa back to r5r6 from r2r3
mov.w r3,r6
mov.b @(5,r7),r4l ; load upper byte of second mantissa to r4l
bsr mul_24_8 ; multiply r5l r6h r6l by r4l
; Combine old result into new and propagate carry into upper result byte
; The new product sits two bytes higher, so its low word meets bytes 3 2
add.w r0,r6 ; add corresponding bytes
addx.b r1l,r5l
addx.b #0,r5h ; propagate carry
; 2.30 result now in r5r6
; Set sticky bit if sticky byte or result lsb non-zero
add.b #0xff,r1h ; set carry if sticky byte non-zero
bor #0,r6l ; or carry with lsb to get sticky bit
bst #0,r6l ; store sticky bit in lsb
; Sticky shift the result right one place to get a 2.29 value
; The bit shifted out of the lsb is or'ed back in so it is not lost
shlr.b r5h ; shift mantissa right 1 place
rotxr.b r5l
rotxr.b r6h
rotxr.b r6l ; places old sticky bit in carry
bor #0,r6l ; or lsb with old sticky bit to get new bit
bst #0,r6l ; store new sticky bit
; Restore result exponent and sign from stack
mov.w @r7,r4 ; sp+0 is result exponent
mov.b @(3,r7),r3l ; sp+3 is result sign
; Pack the result
; r3l, r4 and r5r6 were set up above as sign, exponent and mantissa
jsr ___joinsf
return:
; Invoke the epilogue to cleanup and return
; Reached by fall-through and by both special-case branches above
jmp ___finishsf
;;
;; mul_24_8 - unsigned multiply of a three-byte value by a one-byte value
;;
;; in:       r5l r6h r6l  three-byte operand, most significant byte first
;;           r4l          single-byte operand
;; out:      r5 r6        four-byte product (r5 = upper word, r6 = lower word)
;; scratch:  r4 is overwritten; whatever was in r5h on entry is discarded
;;
mul_24_8:
    ; The middle operand byte lives in r6h, which the r6 multiply below
    ; overwrites, so park it in r4h first.
    mov.b r6h,r4h

    ; Three byte-by-byte multiplies; each one replaces the 16-bit destination
    ; with (its own low byte) * (source byte).
    mulxu.b r4l,r6 ; r6 = lower byte * r4l  -> lower word of the product
    mulxu.b r4l,r5 ; r5 = upper byte * r4l  -> upper word of the product
    mulxu.b r4h,r4 ; r4 = middle byte * r4l -> straddles both words;
                   ; kept last because it destroys r4l

    ; Fold the straddling middle word into r5 r6, one byte position up from
    ; the bottom, carrying all the way into the top byte.
    add.b r4l,r6h  ; low byte of middle word joins high byte of lower word
    addx.b r4h,r5l ; high byte of middle word joins low byte of upper word
    addx.b #0,r5h  ; leftover carry goes into the top byte
    rts
|