File: mulredc1_2.asm

package info (click to toggle)
gmp-ecm 7.0.4%2Bds-5
  • links: PTS, VCS
  • area: main
  • in suites: bullseye, buster
  • size: 4,728 kB
  • sloc: asm: 36,431; ansic: 34,057; xml: 885; python: 799; sh: 698; makefile: 348
file content (125 lines) | stat: -rw-r--r-- 3,436 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# mp_limb_t mulredc1_2(mp_limb_t * z, const mp_limb_t x, const mp_limb_t * y,
#                 const mp_limb_t *m, mp_limb_t inv_m);
#
# Linux:   z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8
#          Needs %rbx, %rsp, %rbp, %r12-%r15 restored
# Windows: z: %rcx, x: %rdx, y: %r8,  m: %r9, inv_m: 40(%rsp) on entry (0x28)
#          Needs %rbx, %rbp, %rdi, %rsi, %r12...%r15 restored



include(`config.m4')

# Per-ABI location of the two arguments that the code reads through a macro.
# Y_PARAM is the register that holds y on entry. INVM_PARAM is where inv_m
# is found at the time it is read: a register under the Linux ABI, a stack
# slot under the Windows ABI. The Windows offset of 72 accounts for the
# four 8-byte pushes the function performs before reading inv_m
# (40 on entry + 32).
ifdef(`WINDOWS64_ABI',
`define(`Y_PARAM', `%r8')dnl
define(`INVM_PARAM',`72(%rsp)')dnl'
,
`define(`Y_PARAM', `%rdx')dnl
define(`INVM_PARAM',`%r8')dnl'
)dnl
# The section, export and symbol-type macros below presumably come from
# config.m4 - confirm.
	TEXT
.p2align 6 # Opteron L1 code cache line is 64 bytes long
	GLOBL GSYM_PREFIX`'mulredc1_2
	TYPE(GSYM_PREFIX`'mulredc1_`'2,`function')

# Implements multiplication and REDC for one input number of 2 words
# and a multiplier of one word
ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI')

# Register allocation. y, m, z, x and u are set up once and then stay
# constant; the two accumulator words and the carry are updated as the
# products are added up. These values stay in registers and are
# referenced as
# YP = y, MP = m, ZP = z,
# X = x, U = u, T0 = tmp[j], T1 = tmp[j+1], CY = carry
# CYl and CYb name the 32-bit and 8-bit views of the CY register.
# T1 and X map to %rbx and %r14, which both ABIs require to be preserved,
# so the function saves them on entry and restores them before returning.

define(`T0', `%rsi')dnl
define(`T1', `%rbx')dnl
define(`CY', `%rcx')dnl
define(`CYl', `%ecx')dnl
define(`CYb', `%cl')dnl
define(`X', `%r14')dnl		# register that holds x value
define(`U', `%r11')dnl
define(`YP', `%r9')dnl		# register that points to the y array
define(`MP', `%r10')dnl		# register that points to the m array
define(`ZP', `%rdi')dnl		# register that holds z

`#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `X' = X, `U' = U
`#'                `YP' = YP, `MP' = MP

GSYM_PREFIX`'mulredc1_2:
# Computes the product x * y for the two-word y, adds u * m for the
# two-word m with u = (x * y[0] * inv_m) mod 2^64, and drops the low word
# of the sum. The remaining two words are stored to z[0] and z[1]; the
# carry out of the top word (0 or 1) is returned in %rax.
# NOTE(review): the low word only cancels if inv_m == -1/m[0] mod 2^64;
# presumably the caller guarantees this - confirm.
# Straight-line code: no loops and no calls. The only stack use is the
# saved registers.


#########################################################################
# i = 0 pass
#########################################################################

`#' register values at loop entry: YP = y, MP = m

# We need to compute u

	movq	(Y_PARAM), %rax		# rax = y[0] (time critical, do first)
	pushq	%rbx			# save callee-saved register used as T1
	pushq	%r14			# save callee-saved register used as X
ifdef(`WINDOWS64_ABI',
`	pushq	%rsi			# rsi and rdi are callee-saved in the Windows ABI
	pushq	%rdi
	movq	%r9, MP			# store m in MP
	movq    Y_PARAM, YP		# store y in YP
	movq	%rcx, ZP		# store z in ZP before rcx is reused as CY
	movq	%rdx, X'
,
`	movq	Y_PARAM, YP		# copy y out of rdx before mulq overwrites it
	movq	%rcx, MP		# store m in MP before rcx is reused as CY
	movq    %rsi, X		# store x in X before rsi is reused as T0
	# ZP is same as passed in'
)

	xorl	CYl, CYl		# CY = 0; the 32-bit write clears all 64 bits

	mulq	X			# rdx:rax = y[0] * x

	movq 	%rax, T0		# Move low word of product to T0
	movq	%rdx, T1		# Move high word of product to T1

	imulq	INVM_PARAM, %rax	# %rax = ((x[i]*y[0]+tmp[0])*invm)%2^64
	movq	%rax, U			# this is the new u value

	mulq	(MP)			# multiply u*m[0]
	addq	%rax, T0		# Now %T0 = 0, need not be stored
	movq	8(YP), %rax		# Fetch y[1] (mov leaves the carry flag intact)
	adcq	%rdx, T1		# Add high word with carry to T1
	setc	CYb			# CY = carry out of T1 (0 or 1)
	# CY:T1:T0 <= 2*(2^64-1)^2 = 2*2^128 - 4*2^64 + 2, hence
	# CY:T1 <= 2*2^64 - 4

# Swap the macro names T0 and T1 instead of moving data: the register that
# held the high word becomes the new low word, and the register that held
# the discarded low word is free to become the new high word.
define(`TT', defn(`T0'))dnl
define(`T0', defn(`T1'))dnl
define(`T1', defn(`TT'))dnl
undefine(`TT')dnl
`#' Now `T0' = T0, `T1' = T1


# Pass for j = 1. This is the last word, so no new data is fetched from y[j+1].

	movq	CY, T1		# T1 = CY <= 1
	
	mulq	X		# y[j] * x[i]
	addq	%rax, T0	# Add low word to T0
	movq	8(MP), %rax	# Fetch m[j] into %rax (carry flag stays intact)
	adcq	%rdx, T1 	# Add high word with carry to T1
	mulq    U		# m[j]*u
	addq	%rax, T0	# Add low word to T0
	movq	T0, 0(ZP)	# Store T0 in z[j-1] (carry flag stays intact)
	adcq	%rdx, T1	# Add high word with carry to T1
	movq	T1, 8(ZP)	# Store T1 in z[j] (carry flag stays intact)
	setc	CYb		# %CY <= 1, carry out of the adcq above

	movq	CY, %rax	# use carry as return value
# Restore the saved registers in reverse order of the pushes.
ifdef(`WINDOWS64_ABI',
`	popq	%rdi
	popq	%rsi
') dnl
	popq	%r14
	popq	%rbx
	ret