File: mul_hilo.cl

package info (click to toggle)
intel-graphics-compiler 1.0.12504.6-1%2Bdeb12u1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 83,912 kB
  • sloc: cpp: 910,147; lisp: 202,655; ansic: 15,197; python: 4,025; yacc: 2,241; lex: 1,570; pascal: 244; sh: 104; makefile: 25
file content (169 lines) | stat: -rw-r--r-- 6,337 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
/*========================== begin_copyright_notice ============================

Copyright (C) 2017-2021 Intel Corporation

SPDX-License-Identifier: MIT

============================= end_copyright_notice ===========================*/

#ifndef __MUL_HILO_CL__
#define __MUL_HILO_CL__


/*****************************************************************************\

Function:
    __intc_umul_hilo

Description:
    This function multiplies 2 unsigned longs (64 bits) operands and returns
    the 128 bit result in two parts (high - returned by function / low -
    return through pointer to lo parameter).

Input:
    ulong    sourceA    - operand
    ulong    sourceB    - operand
    ulong*   destLow    - return the low bits of the multiplication
Output:
    return the high bits of the multiplication.

\*****************************************************************************/
ulong ___intc_umul_hilo(ulong sourceA,
                        ulong sourceB,
                        __private ulong *destLow)
{
    ulong lowA, lowB;
    ulong highA, highB;

    // Split up the values
    lowA = sourceA & 0xffffffff;
    highA = sourceA >> 32;
    lowB = sourceB & 0xffffffff;
    highB = sourceB >> 32;

    // Note that, with this split, our multiplication becomes:
    //     ( a * b )
    // = ( ( aHI << 32 + aLO ) * ( bHI << 32 + bLO ) ) >> 64
    // = ( ( aHI << 32 * bHI << 32 ) + ( aHI << 32 * bLO ) + ( aLO * bHI << 32 ) + ( aLO * bLO ) ) >> 64
    // = ( ( aHI * bHI << 64 ) + ( aHI * bLO << 32 ) + ( aLO * bHI << 32 ) + ( aLO * bLO ) ) >> 64
    // = ( aHI * bHI ) + ( aHI * bLO >> 32 ) + ( aLO * bHI >> 32 ) + ( aLO * bLO >> 64 )

    // Now, since each value is 32 bits, the max size of any multiplication is:
    // ( 2 ^ 32 - 1 ) * ( 2 ^ 32 - 1 ) = 2^64 - 4^32 + 1 = 2^64 - 2^33 + 1,
    // which fits within 64 bits. Which means we can do each component within a
    // 64-bit integer as necessary (each component above marked as AB1 - AB4)
    ulong aHibHi = highA * highB;
    ulong aHibLo = highA * lowB;
    ulong aLobHi = lowA * highB;
    ulong aLobLo = lowA * lowB;

    // Assemble terms.
    //  We note that in certain cases, sums of products cannot overflow:
    //
    //      The maximum product of two N-bit unsigned numbers is
    //
    //          (2**N-1)^2 = 2**2N - 2**(N+1) + 1
    //
    //      We note that we can add the maximum N-bit number to the 2N-bit product
    //      twice without overflow:
    //
    //          (2**N-1)^2 + 2*(2**N-1) = 2**2N - 2**(N+1) + 1 + 2**(N+1) - 2 = 2**2N - 1
    //
    //  If we breakdown the product of two numbers a,b into high and low halves of
    //  partial products as follows:
    //
    //                                              a.hi                a.lo
    // x                                            b.hi                b.lo
    //===============================================================================
    //  (b.hi*a.hi).hi      (b.hi*a.hi).lo
    //                      (b.lo*a.hi).hi      (b.lo*a.hi).lo
    //                      (b.hi*a.lo).hi      (b.hi*a.lo).lo
    // +                                        (b.lo*a.lo).hi      (b.lo*a.lo).lo
    //===============================================================================
    //
    // The (b.lo*a.lo).lo term cannot cause a carry, so we can ignore them
    // for now.  We also know from above, that we can add (b.lo*a.lo).hi
    // and (b.hi*a.lo).lo to the 2N bit term [(b.lo*a.hi).hi + (b.lo*a.hi).lo]
    // without overflow.  That takes care of all of the terms
    // on the right half that might carry.  Do that now.
    //
    ulong aLobLoHi = aLobLo >> 32;
    ulong aLobHiLo = aLobHi & 0xFFFFFFFFUL;
    aHibLo += aLobLoHi + aLobHiLo;

    // That leaves us with these terms:
    //
    //                                              a.hi                a.lo
    // x                                            b.hi                b.lo
    //===============================================================================
    //  (b.hi*a.hi).hi      (b.hi*a.hi).lo
    //                      (b.hi*a.lo).hi
    //                    [ (b.lo*a.hi).hi + (b.lo*a.hi).lo + other ]
    // +                                                                (b.lo*a.lo).lo
    //===============================================================================

    // All of the overflow potential from the right half has now been accumulated
    // into the [ (b.lo*a.hi).hi + (b.lo*a.hi).lo ] 2N bit term.
    // We can safely separate into high and low parts. Per our rule above, we know we
    // can accumulate the high part of that and (b.hi*a.lo).hi
    // into the 2N bit term (b.lo*a.hi) without carry.  The low part can be pieced
    // together with (b.lo*a.lo).lo, to give the final low result

    *destLow = (aHibLo << 32) | ( aLobLo & 0xFFFFFFFFUL );
    return ( aHibHi + (aHibLo >> 32 ) + (aLobHi >> 32) );       // Cant overflow
}


/*****************************************************************************\

Function:
    __intc_mul_hilo

Description:
    This function multiplies 2 longs (64 bits) operands and returns
    the 128 bit result in two parts (high - returned by function / low -
    return through pointer to lo parameter).

Input:
    long    sourceA    - operand
    long    sourceB    - operand
    ulong*  destLow   - return the low bits of the multiplication
Output:
    return the high bits of the multiplication.

\*****************************************************************************/
long ___intc_mul_hilo(long sourceA,
                      long sourceB,
                      __private ulong *destLow)
{
    // Find sign of result
    long aSign = sourceA >> 63;
    long bSign = sourceB >> 63;
    long resultSign = aSign ^ bSign;

    // take absolute values of the argument
    sourceA = (sourceA ^ aSign) - aSign;
    sourceB = (sourceB ^ bSign) - bSign;

    ulong hi;
    hi = ___intc_umul_hilo( (ulong) sourceA,
                            (ulong) sourceB,
                            destLow );

    // Fix the sign
    if( resultSign )
    {
        *destLow ^= resultSign;
        hi  ^= resultSign;
        *destLow -= resultSign;

        //carry if necessary
        if( 0 == *destLow )
            hi -= resultSign;
    }

    return (long) hi;
}


#endif // __MUL_HILO_CL__