File: autogen.py

package info (click to toggle)
gmp-ecm 6.4.4-2
  • links: PTS
  • area: main
  • in suites: jessie, jessie-kfreebsd
  • size: 5,652 kB
  • ctags: 4,757
  • sloc: asm: 38,813; ansic: 27,394; sh: 11,468; xml: 885; python: 840; makefile: 308
file content (310 lines) | stat: -rwxr-xr-x 8,938 bytes parent folder | download | duplicates (7)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
#!/usr/bin/python

import re
import sys


def offaddr(addr, offset):
	if offset == 0:
		return "("+addr+")"
	else:
		return str(offset)+"("+addr+")"

# Generate asm for addmul1_k
# src and dst are pointers (stored in regs) + offsets
# multiplier is in a register
# rax, rbx, rcx, rdx are free for use.



def addmul1_k(src, off_src, dst, off_dst, mult, k):
	init = "### addmul1: src[0] is " + offaddr(src, off_src) + "\n"
	init = init + "###          dst[0] is " + offaddr(dst, off_dst) + "\n"
	init = init + "###          mult is " + mult + "\n"
	init = init + "###          k is " + str(k) + "\n"
	init = init + "###          kills %rax, %rbx, %rcx, %rdx\n"
	init = init + "###   dst[0,k[ += mult*src[0,k[  plus carry put in rcx or rbx\n"
	init = init + "	movq	" + offaddr(src, off_src) + ", %rax\n"
	init = init + "	mulq	" + mult + "\n"
	init = init + "	movq	%rax, %rbx\n"
	init = init + "	movq	%rdx, %rcx\n"

	block = """
	movq	__xii__, %rax
	mulq	__mult__
	addq	__cylo__, __zi__
	adcq	%rax, __cyhi__
	movq	%rdx, __cylo__
	adcq	$0, __cylo__
"""
	
	code = init
	
	cylo = "%rbx"
	cyhi = "%rcx"
	for i in range(0,k-1):
		blocki = re.sub('__cylo__', cylo, block)
		blocki = re.sub('__cyhi__', cyhi, blocki)
		blocki = re.sub('__xii__', offaddr(src, off_src+(i+1)*8), blocki)
		blocki = re.sub('__zi__', offaddr(dst, off_dst+i*8), blocki)
		blocki = re.sub('__mult__', mult, blocki)
		code = code + blocki
		tmp = cylo
		cylo = cyhi
		cyhi = tmp
	
	final = "	addq	" + cylo + ", " + offaddr(dst, off_dst+8*(k-1)) + "\n"
	final = final + "	adcq	$0, " + cyhi + "\n"
	final = final + "### carry limb is in " + cyhi + "\n"

	code = code + final
	return code, cyhi


######## TODO: improve this code!!!!

def mul1_k(src, off_src, dst, off_dst, mult, k):
	init = "### mul1: src[0] is " + offaddr(src, off_src) + "\n"
	init = init + "###          dst[0] is " + offaddr(dst, off_dst) + "\n"
	init = init + "###          mult is " + mult + "\n"
	init = init + "###          k is " + str(k) + "\n"
	init = init + "###          kills %rax, %rbx, %rcx, %rdx\n"
	init = init + "###   dst[0,k[ = mult*src[0,k[  plus carry put in rcx or rbx\n"
	init = init + "	movq	" + offaddr(src, off_src) + ", %rax\n"
	init = init + "	mulq	" + mult + "\n"
	init = init + "	movq	%rax, %rbx\n"
	init = init + "	movq	%rdx, %rcx\n"

	block = """
	movq	__xii__, %rax
	mulq	__mult__
	movq	__cylo__, __zi__
	addq	%rax, __cyhi__
	movq	%rdx, __cylo__
	adcq	$0, __cylo__
"""
	
	code = init
	
	cylo = "%rbx"
	cyhi = "%rcx"
	for i in range(0,k-1):
		blocki = re.sub('__cylo__', cylo, block)
		blocki = re.sub('__cyhi__', cyhi, blocki)
		blocki = re.sub('__xii__', offaddr(src, off_src+(i+1)*8), blocki)
		blocki = re.sub('__zi__', offaddr(dst, off_dst+i*8), blocki)
		blocki = re.sub('__mult__', mult, blocki)
		code = code + blocki
		tmp = cylo
		cylo = cyhi
		cyhi = tmp
	
	final = "	movq	" + cylo + ", " + offaddr(dst, off_dst+8*(k-1)) + "\n"
	final = final + "### carry limb is in " + cyhi + "\n"

	code = code + final
	return code


def mulredc_k_rolled(k):
	header = """# mp_limb_t mulredc__k(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y,
#                 const mp_limb_t *m, mp_limb_t inv_m);
#

include(`config.m4')
	TEXT
	GLOBL GSYM_PREFIX`'mulredc__k
	TYPE(GSYM_PREFIX`'mulredc__k,`function')

GSYM_PREFIX`'mulredc__k:
"""
	init = re.sub("__k", str(k), header)
  
	init = init + """	movq	%rdx, %r11
	movq	%rcx, %r10
	pushq	%rbx
	pushq	%rbp
"""
	init = init + "	subq	$" + str(8*(2*k+1)) + ", %rsp\n"
	init = init + """#      %r8  : inv_m
#     %r10 : m
#     %r11 : y
#     %rsi : x
#     %rdi : z
#     %rsp : tmp
# Free registers
#     %rax, %rbx, %rcx, %rdx, %r9

### set tmp[0..2k+1[ to 0
"""
	for i in range(0,2*k+1):
		init = init + "	movq	$0, " + offaddr("%rsp", 8*i) + "\n"
	
	code = init
	
	middle_code = "###########################################\n"
	middle_code = middle_code + "	movq	$" + str(k) + ", %rbp\n"
	middle_code = middle_code + """
	.align 64
Loop:
	## compute u and store in %r9
	movq	(%rsi), %rax
	mulq	(%r11)
	addq	(%rsp), %rax
	mulq	%r8
	movq    %rax, %r9
"""
	codeaddmul, carry = addmul1_k("%r10", 0, "%rsp", 0, "%r9", k)
	middle_code = middle_code + codeaddmul
	middle_code = middle_code + "	addq	" + carry + ", " + offaddr("%rsp", 8*k) + "\n"
	middle_code = middle_code + "	adcq	$0, " + offaddr("%rsp", 8*(k+1)) + "\n"
	middle_code = middle_code + "	movq	(%rsi), %r9\n"
	codeaddmul, carry = addmul1_k("%r11", 0, "%rsp", 0, "%r9", k)
	middle_code = middle_code + codeaddmul
	middle_code = middle_code + "   addq    " + carry + ", " + offaddr("%rsp", 8*k) + "\n"
	middle_code = middle_code + "   adcq    $0, " + offaddr("%rsp", 8*(k+1)) + "\n\n"
	middle_code = middle_code + """
	addq	$8, %rsi
	addq	$8, %rsp
	decq	%rbp
	jnz	Loop
"""
	code = code + middle_code

	final = "###########################################\n"
	final = final + "### Copy result in z\n"
	for i in range(0,k):
		final = final + "	movq	" + offaddr("%rsp", 8*i) + ", %rax\n"
		final = final + "	movq	%rax, " + offaddr("%rdi", 8*i) + "\n"
	final = final + "	movq	" + offaddr("%rsp", 8*k) + ", %rax	# carry\n"
	final = final + "	addq    $" + str(8*(k+1)) + ", %rsp\n"
	final = final + "	popq	%rbp\n"
	final = final + "	popq	%rbx\n"
	final = final + "	ret\n"

	code = code + final
	
	return code



def mulredc_k(k):
	header = """# mp_limb_t mulredc__k(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y,
#                 const mp_limb_t *m, mp_limb_t inv_m);
#

include(`config.m4')
	TEXT
	GLOBL GSYM_PREFIX`'mulredc__k
	TYPE(GSYM_PREFIX`'mulredc__k,`function')

GSYM_PREFIX`'mulredc__k:
"""
	init = re.sub("__k", str(k), header)
  
	init = init + """	movq	%rdx, %r11
	movq	%rcx, %r10
	pushq	%rbx
"""
	init = init + "	subq	$" + str(8*(2*k+1)) + ", %rsp\n"
	init = init + """#      %r8  : inv_m
#     %r10 : m
#     %r11 : y
#     %rsi : x
#     %rdi : z
#     %rsp : tmp
# Free registers
#     %rax, %rbx, %rcx, %rdx, %r9

### set tmp[0..2k+1[ to 0
"""
	for i in range(0,2*k+1):
		init = init + "	movq	$0, " + offaddr("%rsp", 8*i) + "\n"
	
	code = init
	
	for i in range(0,k):
		blocki = "###########################################\n"
		blocki = blocki + "### Step " + str(i) + "\n"
		blocki = blocki + "### Compute u and store in %r9\n"
		blocki = blocki + "	movq	" + offaddr("%rsi", 8*i) + ", %rax\n"
		blocki = blocki + "	mulq	(%r11)\n"
		blocki = blocki + "	addq	" + offaddr("%rsp", 8*i) + ", %rax\n"
		blocki = blocki + "	mulq	%r8\n"
		blocki = blocki + "	movq	%rax, %r9\n"
		blocki = blocki + "### tmp[i,i+k] += x[i]*y + u*m\n"
		codeaddmul, carry = addmul1_k("%r10", 0, "%rsp", 8*i, "%r9", k)
		blocki = blocki + codeaddmul
		blocki = blocki + "	addq	" + carry + ", " + offaddr("%rsp", 8*(k+i)) + "\n"
		blocki = blocki + "	adcq	$0, " + offaddr("%rsp", 8*(k+i+1)) + "\n"
		blocki = blocki + "	movq	" + offaddr("%rsi", 8*i) + ", %r9\n"
		codeaddmul, carry = addmul1_k("%r11", 0, "%rsp", 8*i, "%r9", k)
		blocki = blocki + codeaddmul
		blocki = blocki + "	addq	" + carry + ", " + offaddr("%rsp", 8*(k+i)) + "\n"
		blocki = blocki + "	adcq	$0, " + offaddr("%rsp", 8*(k+i+1)) + "\n"
		code = code + blocki
	
	final = "###########################################\n"
	final = final + "### Copy result in z\n"
	for i in range(0,k):
		final = final + "	movq	" + offaddr("%rsp", 8*(k+i)) + ", %rax\n"
		final = final + "	movq	%rax, " + offaddr("%rdi", 8*i) + "\n"
	final = final + "	movq	" + offaddr("%rsp", 16*k) + ", %rax	# carry\n"
	final = final + "	addq    $" + str(8*(2*k+1)) + ", %rsp\n"
	final = final + "	popq	%rbx\n"
	final = final + "	ret\n"

	code = code + final
	
	return code

	
##print addmul1_k("%rsi", 0, "%dsi", 0, "%r9", 3)

k = int(sys.argv[1])
if k == 1:
	print """#
#  mp_limb_t mulredc1(mp_limb_t * z, const mp_limb_t x, const mp_limb_t y,
#                 const mp_limb_t m, mp_limb_t inv_m)
#
#  Compute z := x*y mod m, in Montgomery representation, where x, y < m
#  and m is n limb wide.  inv_m is the less significant limb of the
#  inverse of m modulo 2^(n*GMP_LIMB_BITS)
#
#  The result might be unreduced (larger than m) but becomes reduced
#  after subtracting m. The calling function should take care of that.
#
#  We use a temporary space for unreduced product on the stack.
#  Therefore, this can not be used for large integers (anyway, the
#  algorithm is quadratic).
#
#  WARNING: z is only n limbs but since it might be unreduced, there
#  could be a carry that does not fit in z. This carry is returned.


include(`config.m4')
	TEXT
	GLOBL GSYM_PREFIX`'mulredc1
	TYPE(GSYM_PREFIX`'mulredc1,`function')

GSYM_PREFIX`'mulredc1:
#     %r8  : inv_m
#     %rcx : m
#     %rdx : y
#     %rsi : x
#     %rdi : z
	movq	%rdx, %rax
	mulq	%rsi
	movq	%rdx, %r10
	movq	%rax, %r9       # store xy in [r9:r10]
	mulq	%r8             # compute u
	mulq	%rcx          # compute u*m
	addq	%r9, %rax       # rax is 0, now (carry is important)
	adcq	%r10, %rdx
	movq	%rdx, (%rdi)
	adcq	$0, %rax
	ret
"""
else:
	print mulredc_k_rolled(k)