File: mulsf3.s

package info (click to toggle)
brickos 0.9.0-1
  • links: PTS
  • area: main
  • in suites: sarge
  • size: 1,700 kB
  • ctags: 1,727
  • sloc: ansic: 9,139; cpp: 860; makefile: 717; asm: 693; sh: 123; perl: 61
file content (255 lines) | stat: -rw-r--r-- 8,032 bytes parent folder | download | duplicates (7)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
/*
 *  mulsf3.s
 *
 *  Floating point multiply, single precision: r0r1 *= second operand
 *  (the second operand is passed on the stack, not in r2r3)
 *
 *  The contents of this file are subject to the Mozilla Public License
 *  Version 1.0 (the "License"); you may not use this file except in
 *  compliance with the License. You may obtain a copy of the License at
 *  http://www.mozilla.org/MPL/
 *
 *  Software distributed under the License is distributed on an "AS IS"
 *  basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
 *  License for the specific language governing rights and limitations
 *  under the License.
 *
 *  The Original Code is Librcx floating point code, released May 27, 1999.
 *
 *  The Initial Developer of the Original Code is Kekoa Proudfoot.
 *  Portions created by Kekoa Proudfoot are Copyright (C) 1999
 *  Kekoa Proudfoot. All Rights Reserved.
 *
 *  Contributor(s): Kekoa Proudfoot <kekoa@graphics.stanford.edu>
 */

; possible optimizations:
;  - use push/pop to save exponent and sign on stack
;  - might it have been faster/simpler to inline a 24 iteration multiply?
;       - this only requires 3b + 3b + 4b + 1b in regs to implement?

    .section .text

;;
;; function: mulsf3
;; purpose: single-precision floating point multiply
;; input: float in r0r1 and float at sp+2
;; output: float in r0r1
;; helpers: ___startsf (prologue, expands operands), ___joinsf (packs result),
;;          ___finishsf (epilogue), mul_24_8 (24 by 8 bit multiply, below)
;;
;; special cases handled here:
;;    - first operand NaN:  the first operand is returned unchanged
;;    - second operand NaN: the second operand is returned unchanged
;;    - inf * 0 or 0 * inf: NaN (7fffffff) is returned
;; every other input takes the general multiply path; how remaining zero or
;; infinity operands come out depends on ___startsf / ___joinsf, which are
;; not part of this file
;;

    .global ___mulsf3

___mulsf3:

    ; Invoke the prologue to expand input operands

    jsr  ___startsf

    ; At this point, registers/stack contain the following:
    ;    r0r1  - first operand
    ;    r2h   - second operand flags
    ;    r2l   - second operand sign
    ;    r3h   - first operand flags
    ;    r3l   - first operand sign
    ;    r4    - first operand exponent
    ;    r5r6  - first operand mantissa
    ;    sp+0  - second operand exponent
    ;    sp+2  - second operand flags (same as r2h)
    ;    sp+3  - second operand sign (same as r2l)
    ;    sp+4  - second operand mantissa
    ;    sp+16 - second operand

    ; Note on flag bits: 0=zero, 1=inf, 2=nan

    ; Is the first operand a NaN?
    ; If yes, return the first operand (the value already in r0r1)

    bld     #2,r3h              ; if nan flag of first operand set

    ; Hack! Branch to return_jmp, a label placed on the "bra return" inside
    ; the second-operand-NaN block below, rather than to return directly
    ; (presumably return is out of range for a conditional branch - confirm)
    bcs     return_jmp          ; carry set indicates true

    ; Is the second operand a NaN?

    bld     #2,r2h              ; if nan flag of second operand set
    bcc     endif_0             ; carry clear indicates false

        ; Return the second operand (which we need to load off stack)

        mov.w   @(16,r7),r0     ; set return value to second operand (sp+16)
        mov.w   @(18,r7),r1

        ; Hack! Shared landing point: the first-operand-NaN case above
        ; enters here with r0r1 untouched and goes straight to return
        return_jmp:

        bra     return

    endif_0:

    ; Is the first operand infinity and the second operand zero?
    ; Or is the second operand infinity and the first operand zero?
    ; Each test loads one flag bit into carry and ands the other flag bit
    ; into it, so carry ends up set only when both flags are set

    bld     #1,r3h              ; load inf flag of first operand to carry
    band    #0,r2h              ; and carry with zero flag of second operand
    bcs     if_1                ; carry set indicates true

    bld     #1,r2h              ; load inf flag of second operand to carry
    band    #0,r3h              ; and carry with zero flag of first operand
    bcc     endif_1             ; carry clear indicates false

        if_1:

        ; Zero multiplied by infinity, so return NaN

        mov.w   #0x7fff,r0      ; set return value to NaN (7fffffff)
        mov.w   #0xffff,r1
        bra     return

    endif_1:

    ; At this point, we no longer need the following:
    ;    r0r1 - first operand
    ;    r2h  - second operand flags
    ;    r3h  - first operand flags

    ; Compute new exponent: first exponent + second exponent - 127
    ; The byte add / addx pair below is a 16-bit add of -127 (0xff81) to r4

    mov.w   @r7,r0              ; load second operand exponent (sp+0) to temp
    add.w   r0,r4               ; add temp to exponent of first operand (r4)
    add.b   #-127,r4l           ; subtract bias
    addx.b  #-1,r4h             ; finish subtraction

    ; Compute new sign

    xor.b   r2l,r3l             ; xor second sign (r2l) into first sign (r3l)

    ; At this point r2l is also free (and therefore all of r2 is free)
    ; We now want to multiply the two mantissas
    ; We need r3 and r4 free to do this

    ; Save result exponent and sign to stack
    ; These overwrite the second operand's exponent and sign slots, which
    ; have already been consumed above

    mov.w   r4,@r7              ; sp+0 is result exponent
    mov.b   r3l,@(3,r7)         ; sp+3 is result sign

    ; Given two 1.23 mantissas, compute a 2.30 product with a sticky lsb
    ; Use three 24 by 8 multiplies to multiply two 24-bit mantissas
    ; The second mantissa is read one byte at a time from sp+7 (lower),
    ; sp+6 (middle) and sp+5 (upper); sp+4 is not read here

    ; Save first mantissa to r2r3

    mov.w   r5,r2               ; copy r5r6 to r2r3
    mov.w   r6,r3               ; note multiply done using r5r6

    ; Perform first multiply (first mantissa * lower byte of second mantissa)

    mov.b   @(7,r7),r4l         ; load lower byte of second mantissa to r4l

    bsr     mul_24_8            ; multiply r5l r6h r6l by r4l

    ; Save lower four result bytes in r0r1

    mov.w   r5,r0               ; copy r5r6 to r0r1
    mov.w   r6,r1

    ; Perform second multiply (first mantissa * middle byte of second mantissa)

    mov.w   r2,r5               ; copy first mantissa back to r5r6 from r2r3
    mov.w   r3,r6

    mov.b   @(6,r7),r4l         ; load middle byte of second mantissa to r4l

    bsr     mul_24_8            ; multiply r5l r6h r6l by r4l

    ; Combine new result into old and propagate carry into fifth result byte
    ; The new product is weighted one byte higher than the old one, so its
    ; bytes r6l r6h r5l line up with r1h r0l r0h; r1l has no counterpart
    ; Running sum is then the five bytes r5h r0h r0l r1h r1l (high to low)

    add.b   r6l,r1h             ; add corresponding bytes
    addx.b  r6h,r0l
    addx.b  r5l,r0h
    addx.b  #0,r5h              ; propagate carry

    ; Combine lower two result bytes into a single sticky byte
    ; Only whether these bytes are non-zero matters from here on

    or.b    r1l,r1h             ; compute new sticky byte

    ; Save fifth result byte (r1l was freed by the or above)

    mov.b   r5h,r1l             ; store fifth result byte

    ; Perform final multiply (first mantissa * upper byte of second mantissa)

    mov.w   r2,r5               ; copy first mantissa back to r5r6 from r2r3
    mov.w   r3,r6

    mov.b   @(5,r7),r4l         ; load upper byte of second mantissa to r4l

    bsr     mul_24_8            ; multiply r5l r6h r6l by r4l

    ; Combine old result into new and propagate carry into upper result byte
    ; Old bytes r1l r0h r0l line up with new bytes r5l r6h r6l

    add.w   r0,r6               ; add corresponding bytes
    addx.b  r1l,r5l
    addx.b  #0,r5h              ; propagate carry

    ; 2.30 result now in r5r6

    ; Set sticky bit if sticky byte or result lsb non-zero
    ; Adding 0xff to a byte carries out exactly when the byte is non-zero

    add.b   #0xff,r1h           ; set carry if sticky byte non-zero
    bor     #0,r6l              ; or carry with lsb to get sticky bit
    bst     #0,r6l              ; store sticky bit in lsb

    ; Sticky shift the result right one place to get a 2.29 value
    ; The bit shifted out is or'ed back into the lsb so it is not lost

    shlr.b  r5h                 ; shift mantissa right 1 place
    rotxr.b r5l
    rotxr.b r6h
    rotxr.b r6l                 ; places old sticky bit in carry

    bor     #0,r6l              ; or lsb with old sticky bit to get new bit
    bst     #0,r6l              ; store new sticky bit

    ; Restore result exponent and sign from stack

    mov.w   @r7,r4              ; sp+0 is result exponent
    mov.b   @(3,r7),r3l         ; sp+3 is result sign

    ; Pack the result (sign in r3l, exponent in r4, mantissa in r5r6)

    jsr  ___joinsf

return:

    ; Invoke the epilogue to cleanup and return
    ; All exits (NaN operand, inf * 0, normal result) arrive here with the
    ; return value in r0r1, assuming ___joinsf leaves its result there - confirm

    jmp  ___finishsf



;;
;; function: mul_24_8
;; purpose: unsigned 24 by 8 bit multiply built from three 8 by 8 multiplies
;; input: 24-bit multiplicand in r5l r6h r6l (high to low), multiplier in r4l
;; output: 32-bit product in r5 r6 (r5h is the most significant byte)
;; registers: overwrites all of r4; the incoming value of r5h is ignored
;;

mul_24_8:

    ; Park the middle multiplicand byte in r4h, because the low-byte
    ; multiply below replaces r6h with the upper half of its product

    mov.b   r6h,r4h

    ; Form the three 16-bit partial products
    ; The multiply into r4 has to be last: it overwrites the multiplier (r4l)

    mulxu.b r4l,r6              ; r6 = low byte    * multiplier
    mulxu.b r4l,r5              ; r5 = high byte   * multiplier
    mulxu.b r4h,r4              ; r4 = middle byte * multiplier

    ; Sum the partial products: product = (r5 << 16) + (r4 << 8) + r6
    ; r4 straddles the two result words, so add it one byte at a time and
    ; let the carry ripple up to the top byte

    add.b   r4l,r6h             ; middle low byte into bits 8..15
    addx.b  r4h,r5l             ; middle high byte into bits 16..23, plus carry
    addx.b  #0,r5h              ; remaining carry into bits 24..31

    rts