File: cl_DS_recipsqrt.cc

package info (click to toggle)
cln 1.3.4-4
  • links: PTS
  • area: main
  • in suites: buster
  • size: 10,724 kB
  • sloc: cpp: 80,930; sh: 11,982; ansic: 3,278; makefile: 1,313
file content (167 lines) | stat: -rw-r--r-- 6,915 bytes parent folder | download | duplicates (7)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
// cl_UDS_recipsqrt().

// General includes.
#include "base/cl_sysdep.h"

// Specification.
#include "base/digitseq/cl_DS.h"


// Implementation.

#include "base/cl_low.h"
#include "cln/exception.h"

namespace cln {

// Compute the reciprocal square root of a digit sequence.
// Input: UDS a_MSDptr/a_len/.. of length a_len,
//        with 1/4 <= a < 1.
//        [i.e. 1/4*beta^a_len <= a < beta^a_len]
// Output: UDS b_MSDptr/b_len+2/.. of length b_len+1 (b_len>1), plus 1 more bit
//         in the last limb) such that
//         1 <= b <= 2  [i.e. beta^b_len <= b <= 2*beta^b_len]
//         and  | 1/sqrt(a) - b | < 1/2*beta^(-b_len).
// If a_len > b_len, only the most significant b_len+1 limbs of a are used.
  extern void cl_UDS_recipsqrt (const uintD* a_MSDptr, uintC a_len,
                                uintD* b_MSDptr, uintC b_len);
// Method:
// Using Newton iteration for computation of x^-1/2.
// The Newton iteration for f(y) = x-1/y^2 reads:
//   y --> y - (x-1/y^2)/(2/y^3) = y + y*(1-x*y^2)/2 =: g(y).
// We have  T^3-3*T+2 = (T-1)^2*(T+2), hence
//   1/sqrt(x) - g(y) = 1/(2*sqrt(x)) * (sqrt(x)*y-1)^2 * (sqrt(x)*y+2).
// Hence g(y) <= 1/sqrt(x).
// If we choose 0 < y_0 <= 1/sqrt(x), then set y_(n+1) := g(y_n), we will
// always have 0 < y_n <= 1/sqrt(x).
// Since
//   1/sqrt(x) - g(y) = sqrt(x)*(sqrt(x)*y+2)/2 * (1/sqrt(x) - y)^2,
// which is >= 0 and < 3/2 * (1/sqrt(x) - y)^2, we have a quadratically
// convergent iteration.
// For n = 1,2,...,b_len we compute approximations y with 1 <= yn <= 2
// and  | 1/sqrt(x) - yn | < 1/2*beta^(-n).
// Step n=1:
//   Compute the isqrt of the leading two digits of x, yields one digit.
//   Compute its reciprocal, then do one iteration as below (n=0 -> m=1).
// Step n -> m with n < m <= 2*n:
//   Write x = xm + xr with 0 <= xr < beta^-(m+1).
//   Set ym' = yn + (yn*(1-xm*yn*yn))/2, round down to a multiple ym
//   of beta^-(m+1).
//   (Actually, compute yn*yn, round up to a multiple of beta^-(m+1),   [1]
//    multiply with xm,        round up to a multiple of beta^-(m+1),   [2]
//    subtract from 1,         no rounding needed,                      [2]
//    multiply with yn,        round down to a multiple of beta^-(m+1), [5]
//    divide by 2,             round down to a multiple of beta^-(m+1), [3]
//    add to yn,               no rounding needed.  [Max rounding error: ^])
//   The exact value ym' (no rounding) would satisfy
//     0 <= 1/sqrt(xm) - ym' < 3/2 * (1/sqrt(xm) - yn)^2
//                           < 3/8 * beta^(-2*n)          by hypothesis,
//                           <= 3/8 * beta^-m.
//   The rounding errors all go into the same direction, so
//     0 <= ym' - ym < 3 * beta^-(m+1) < 1/4 * beta^-m.
//   Combine both inequalities:
//     0 <= 1/sqrt(xm) - ym < 1/2 * beta^-m.
//   Neglecting xr can introduce a small error in the opposite direction:
//     0 <= 1/sqrt(xm) - 1/sqrt(x) = (sqrt(x) - sqrt(xm))/(sqrt(x)*sqrt(xm))
//        = xr / (sqrt(x)*sqrt(xm)*(sqrt(x)+sqrt(xm)))
//        <= 4*xr < 4*beta^-(m+1) < 1/2*beta^-m.
//   Combine both inequalities:
//     | 1/sqrt(x) - ym | < 1/2 * beta^-m.
//   (Actually, choosing the opposite rounding direction wouldn't hurt either.)
// Choice of n:
//   So that the computation is minimal, e.g. in the case b_len=10:
//   1 -> 2 -> 3 -> 5 -> 10 and not 1 -> 2 -> 4 -> 8 -> 10.
  void cl_UDS_recipsqrt (const uintD* a_MSDptr, uintC a_len,
                         uintD* b_MSDptr, uintC b_len)
    {
	var uintC y_len = b_len+2;
	var uintC x_len = (a_len <= b_len ? a_len : b_len+1);
	var const uintD* const x_MSDptr = a_MSDptr;
	var uintD* y_MSDptr;
	var uintD* y2_MSDptr;
	var uintD* y3_MSDptr;
	var uintD* y4_MSDptr;
	CL_ALLOCA_STACK;
	num_stack_alloc(y_len,y_MSDptr=,);
	num_stack_alloc(2*y_len,y2_MSDptr=,);
	num_stack_alloc(2*y_len,y3_MSDptr=,);
	num_stack_alloc(2*y_len,y4_MSDptr=,);
	// Step n = 1.
	{ var uintD x1 = mspref(x_MSDptr,0);
	  var uintD x2 = (a_len > 1 ? mspref(x_MSDptr,1) : 0);
	  var uintD y0;
	  var uintD y1;
	  var bool sqrtp;
	  isqrtD(x1,x2, y1=,sqrtp=);
	  // 2^31 <= y1 < 2^32.
	  y0 = 1;
	  if (!sqrtp) // want to compute 1/sqrt(x) rounded down
		if (++y1 == 0)
			goto step1_done; // 1/1.0000 = 1.0000
	  // Set y0|y1 := 2^(2*intDsize)/y1
	  //            = 2^intDsize + (2^(2*intDsize)-2^intDsize*y1)/y1.
	  if ((uintD)(-y1) >= y1) {
		y0 = 2; y1 = 0;
	  } else {
		#if HAVE_DD
		divuD(highlowDD_0((uintD)(-y1)),y1, y1=,);
		#else
		divuD((uintD)(-y1),0,y1, y1=,);
		#endif
	  }
	step1_done:
	  mspref(y_MSDptr,0) = y0;
	  mspref(y_MSDptr,1) = y1;
	}
	// Other steps.
	var int k;
	integerlengthC(b_len-1,k=);
	// 2^(k-1) < b_len <= 2^k, so we need k steps, plus one
	// one more step at the beginning (because step 1 was not complete).
	var uintC n = 0;
	for (; k>=0; k--)
	  { var uintC m = ((b_len-1)>>k)+1; // = ceiling(b_len/2^k)
	    // Compute ym := yn + (yn*(1-xm*yn*yn))/2, rounded.
	    // Storage: at y_MSDptr: (1 + n+1) limbs, yn.
	    //          at y2_MSDptr: (2 + 2*n+2) limbs, yn^2.
	    //          at y3_MSDptr: (1 + m+1) limbs, xm*yn*yn, 1-xm*yn*yn.
	    //          at y4_MSDptr: (2-n + m+n+2) limbs, yn*(1-xm*yn*yn).
	    clear_loop_msp(y_MSDptr mspop (n+2),m-n);
	    cl_UDS_mul_square(y_MSDptr mspop (n+2),n+2,
	                      y2_MSDptr mspop 2*(n+2));
	    var uintC xm_len = (m < x_len ? m+1 : x_len);
	    var uintC y2_len = m+2; // = (m+1 <= 2*n+2 ? m+2 : 2*n+3);
	    cl_UDS_mul(x_MSDptr mspop xm_len,xm_len,
	               y2_MSDptr mspop (y2_len+1),y2_len,
	               y3_MSDptr mspop (xm_len+y2_len));
	    if (mspref(y3_MSDptr,0)==0)
	      // xm*yn*yn < 1
	      { neg_loop_lsp(y3_MSDptr mspop (m+2),m+2);
	        mspref(y3_MSDptr,0) += 1;
	        if (test_loop_msp(y3_MSDptr,n)) throw runtime_exception(); // check 0 <= y3 < beta^-(n-1)
	        cl_UDS_mul(y_MSDptr mspop (n+2),n+2,
	                   y3_MSDptr mspop (m+2),m+2-n,
	                   y4_MSDptr mspop (m+4));
	        shift1right_loop_msp(y4_MSDptr,m+3-n,0);
	        if (addto_loop_lsp(y4_MSDptr mspop (m+3-n),y_MSDptr mspop (m+2),m+3-n))
	          if ((n<1) || inc_loop_lsp(y_MSDptr mspop (n-1),n-1)) throw runtime_exception();
	      }
	      else
	      // xm*yn*yn >= 1 (this can happen since xm >= xn)
	      { mspref(y3_MSDptr,0) -= 1;
	        if (test_loop_msp(y3_MSDptr,n)) throw runtime_exception(); // check 0 >= y3 > -beta^-(n-1)
	        cl_UDS_mul(y_MSDptr mspop (n+2),n+2,
	                   y3_MSDptr mspop (m+2),m+2-n,
	                   y4_MSDptr mspop (m+4));
	        shift1right_loop_msp(y4_MSDptr,m+3-n,0);
	        if (subfrom_loop_lsp(y4_MSDptr mspop (m+3-n),y_MSDptr mspop (m+2),m+3-n))
	          if ((n<1) || dec_loop_lsp(y_MSDptr mspop (n-1),n-1)) throw runtime_exception();
	      }
	    n = m;
	    // n = ceiling(b_len/2^k) limbs of y have now been computed.
	  }
	copy_loop_msp(y_MSDptr,b_MSDptr,b_len+2);
}
// Bit complexity (N := b_len): O(M(N)).

}  // namespace cln