Barretenberg
The ZK-SNARK library at the core of Aztec
Loading...
Searching...
No Matches
asm_macros.hpp
Go to the documentation of this file.
1// === AUDIT STATUS ===
2// internal: { status: not started, auditors: [], date: YYYY-MM-DD }
3// external_1: { status: not started, auditors: [], date: YYYY-MM-DD }
4// external_2: { status: not started, auditors: [], date: YYYY-MM-DD }
5// =====================
6
7#pragma once
8// clang-format off
9
10/*
11 * CLEAR_FLAGS(empty_reg): emit `xorq empty_reg, empty_reg`.
 *
 * XOR-ing a register with itself zeroes that register and clears the carry (CF)
 * and overflow (OF) flags; ZF/PF/SF are set from the zero result, so this is not
 * literally "all flags". The adcxq/adoxq-based macros in this file consume CF/OF
 * on their very first instruction, so they need both flags cleared beforehand.
 *
 * empty_reg: string naming a scratch register whose contents may be destroyed.
12 **/
13#define CLEAR_FLAGS(empty_reg) \
14 "xorq " empty_reg ", " empty_reg " \n\t"
15
/**
 * LOAD_FIELD_ELEMENT(a, lolo, lohi, hilo, hihi): load four consecutive 64-bit
 * limbs from memory into four destination operands.
 *
 * a:    string naming the register that holds the base address.
 * lolo: destination for the limb at byte offset 0.
 * lohi: destination for the limb at byte offset 8.
 * hilo: destination for the limb at byte offset 16.
 * hihi: destination for the limb at byte offset 24.
 *
 * Only movq is emitted, so flags are left untouched.
 **/
20#define LOAD_FIELD_ELEMENT(a, lolo, lohi, hilo, hihi) \
21 "movq 0(" a "), " lolo " \n\t" \
22 "movq 8(" a "), " lohi " \n\t" \
23 "movq 16(" a "), " hilo " \n\t" \
24 "movq 24(" a "), " hihi " \n\t"
25
/**
 * STORE_FIELD_ELEMENT(r, lolo, lohi, hilo, hihi): store four 64-bit limbs to
 * consecutive memory locations; the mirror image of LOAD_FIELD_ELEMENT.
 *
 * r:    string naming the register that holds the destination base address.
 * lolo: source written to byte offset 0.
 * lohi: source written to byte offset 8.
 * hilo: source written to byte offset 16.
 * hihi: source written to byte offset 24.
 *
 * Only movq is emitted, so flags are left untouched.
 **/
31#define STORE_FIELD_ELEMENT(r, lolo, lohi, hilo, hihi) \
32 "movq " lolo ", 0(" r ") \n\t" \
33 "movq " lohi ", 8(" r ") \n\t" \
34 "movq " hilo ", 16(" r ") \n\t" \
35 "movq " hihi ", 24(" r ") \n\t"
36
37#if !defined(__ADX__) || defined(DISABLE_ADX)
/**
 * ADD(b) -- non-ADX variant (compiled when __ADX__ is undefined or DISABLE_ADX
 * is defined).
 *
 * Adds the four 64-bit limbs stored at byte offsets 0, 8, 16, 24 of the address
 * in `b` into the accumulator held in %r12 (lowest limb), %r13, %r14, %r15.
 * The first limb uses addq, so the incoming carry flag is ignored; the carry out
 * of the top limb is left in CF. No reduction is performed.
 *
 * b: string naming the register that holds the address of the addend.
 **/
42#define ADD(b) \
43 "addq 0(" b "), %%r12 \n\t" \
44 "adcq 8(" b "), %%r13 \n\t" \
45 "adcq 16(" b "), %%r14 \n\t" \
46 "adcq 24(" b "), %%r15 \n\t"
47
/**
 * SUB(b) -- non-ADX variant.
 *
 * Subtracts the four 64-bit limbs stored at byte offsets 0, 8, 16, 24 of the
 * address in `b` from the accumulator held in %r12 (lowest limb) .. %r15, using
 * a subq/sbbq borrow chain. The borrow out of the top limb is left in CF; no
 * correction (adding the modulus back) is performed here.
 *
 * b: string naming the register that holds the address of the subtrahend.
 **/
52#define SUB(b) \
53 "subq 0(" b "), %%r12 \n\t" \
54 "sbbq 8(" b "), %%r13 \n\t" \
55 "sbbq 16(" b "), %%r14 \n\t" \
56 "sbbq 24(" b "), %%r15 \n\t"
57
58
/**
 * ADD_REDUCE(b, modulus_0, modulus_1, modulus_2, modulus_3) -- non-ADX variant.
 *
 * 1. Adds the four limbs at the address in `b` into %r12..%r15 (addq/adcq chain).
 * 2. Saves a copy of that sum in %r8..%r11.
 * 3. Adds the operands modulus_0..modulus_3 to %r12..%r15 (addq/adcq chain).
 * 4. If step 3 produced NO carry out of the top limb, restores the saved sum
 *    from %r8..%r11 (cmovncq); otherwise the result of step 3 is kept.
 *
 * NOTE(review): despite the parameter names, the add-then-keep-on-carry logic
 * only acts as a modular reduction if the operands hold the limbs of
 * (2^256 - modulus), as REDUCE_FIELD_ELEMENT's `neg_modulus_*` parameters
 * suggest -- confirm against the call sites.
 *
 * b:            string naming the register that holds the address of the addend.
 * modulus_0..3: operand strings added limb-wise in step 3.
 * Writes:       %r8-%r11 (scratch), %r12-%r15 (result), flags.
 **/
63#define ADD_REDUCE(b, modulus_0, modulus_1, modulus_2, modulus_3) \
64 "addq 0(" b "), %%r12 \n\t" \
65 "adcq 8(" b "), %%r13 \n\t" \
66 "adcq 16(" b "), %%r14 \n\t" \
67 "adcq 24(" b "), %%r15 \n\t" \
68 "movq %%r12, %%r8 \n\t" \
69 "movq %%r13, %%r9 \n\t" \
70 "movq %%r14, %%r10 \n\t" \
71 "movq %%r15, %%r11 \n\t" \
72 "addq " modulus_0 ", %%r12 \n\t" \
73 "adcq " modulus_1 ", %%r13 \n\t" \
74 "adcq " modulus_2 ", %%r14 \n\t" \
75 "adcq " modulus_3 ", %%r15 \n\t" \
76 "cmovncq %%r8, %%r12 \n\t" \
77 "cmovncq %%r9, %%r13 \n\t" \
78 "cmovncq %%r10, %%r14 \n\t" \
79 "cmovncq %%r11, %%r15 \n\t"
80
81
82
/**
 * REDUCE_FIELD_ELEMENT(neg_modulus_0 .. neg_modulus_3) -- non-ADX variant.
 *
 * Conditionally subtracts the modulus from the value r held in %r12 (lowest
 * limb) .. %r15. A copy of r is saved in %r8..%r11, the neg_modulus_* operands
 * are added to r with an addq/adcq chain, and the copy is restored (cmovncq)
 * when that addition did not carry out of the top limb.
 *
 * neg_modulus_0..3: operand strings; presumably the limbs of (2^256 - modulus),
 *                   so that the addition carries exactly when r >= modulus --
 *                   confirm against the call sites.
 * Writes:           %r8-%r11 (scratch), %r12-%r15 (result), flags.
 **/
87#define REDUCE_FIELD_ELEMENT(neg_modulus_0, neg_modulus_1, neg_modulus_2, neg_modulus_3) \
88 /* Duplicate `r` */ \
89 "movq %%r12, %%r8 \n\t" \
90 "movq %%r13, %%r9 \n\t" \
91 "movq %%r14, %%r10 \n\t" \
92 "movq %%r15, %%r11 \n\t" \
93 "addq " neg_modulus_0 ", %%r12 \n\t" /* r'[0] -= modulus.data[0] */ \
94 "adcq " neg_modulus_1 ", %%r13 \n\t" /* r'[1] -= modulus.data[1] */ \
95 "adcq " neg_modulus_2 ", %%r14 \n\t" /* r'[2] -= modulus.data[2] */ \
96 "adcq " neg_modulus_3 ", %%r15 \n\t" /* r'[3] -= modulus.data[3] */ \
97 \
98 /* if r does not need to be reduced, the addition above does not carry (CF = 0) */ \
99 /* restore r' = r when the carry flag is clear */ \
100 "cmovncq %%r8, %%r12 \n\t" \
101 "cmovncq %%r9, %%r13 \n\t" \
102 "cmovncq %%r10, %%r14 \n\t" \
103 "cmovncq %%r11, %%r15 \n\t"
104
/**
 * SQR(a) -- non-ADX variant: squaring with interleaved Montgomery-style reduction
 * of the 4-limb value stored at the address in `a`.
 *
 * Phase 1 (product): forms the six off-diagonal products a[i]*a[j] (i < j),
 *   doubles them, then adds the four squares a[i]*a[i]. The 512-bit result is
 *   held as r[0..7] = (%r8, %r9, %r10, %r11, %r12, %r13, %r14, %r15).
 * Phase 2 (reduction): four rounds, one per low limb r[0]..r[3]. Each round
 *   computes k = low64(r[i] * r_inv) and adds k * modulus, shifted by i limbs,
 *   into r. Presumably r_inv = -modulus^{-1} mod 2^64 so that each round zeroes
 *   r[i] -- confirm against the caller's operand setup.
 *
 * Result: r[4..7] in (%r12, %r13, %r14, %r15), lowest limb in %r12. No final
 *         conditional subtraction of the modulus is performed here.
 *
 * a:       string naming the register that holds the operand's address.
 * Expects: named asm operands %[r_inv], %[modulus_0] .. %[modulus_3].
 * Writes:  %rdx, %rdi, %rcx, %r8-%r15 and flags.
 * Uses addq/adcq carry chains; mulxq (BMI2) is still required.
 **/
109#define SQR(a) \
110 "movq 0(" a "), %%rdx \n\t" /* load a[0] into %rdx */ \
111 \
112 "xorq %%r8, %%r8 \n\t" /* clear flags */ \
113 /* compute a[0] *a[1], a[0]*a[2], a[0]*a[3], a[1]*a[2], a[1]*a[3], a[2]*a[3] */ \
114 "mulxq 8(" a "), %%r9, %%r10 \n\t" /* (r[1], r[2]) <- a[0] * a[1] */ \
115 "mulxq 16(" a "), %%r8, %%r15 \n\t" /* (t[1], t[2]) <- a[0] * a[2] */ \
116 "mulxq 24(" a "), %%r11, %%r12 \n\t" /* (r[3], r[4]) <- a[0] * a[3] */ \
117 \
118 \
119 /* accumulate products into result registers */ \
120 "addq %%r8, %%r10 \n\t" /* r[2] += t[1] */ \
121 "adcq %%r15, %%r11 \n\t" /* r[3] += t[2] */ \
122 "movq 8(" a "), %%rdx \n\t" /* load a[1] into %rdx */ \
123 "mulxq 16(" a "), %%r8, %%r15 \n\t" /* (t[5], t[6]) <- a[1] * a[2] */ \
124 "mulxq 24(" a "), %%rdi, %%rcx \n\t" /* (t[3], t[4]) <- a[1] * a[3] */ \
125 "movq 24(" a "), %%rdx \n\t" /* load a[3] into %rdx */ \
126 "mulxq 16(" a "), %%r13, %%r14 \n\t" /* (r[5], r[6]) <- a[3] * a[2] */ \
127 "adcq %%rdi, %%r12 \n\t" /* r[4] += t[3] */ \
128 "adcq %%rcx, %%r13 \n\t" /* r[5] += t[4] + flag_c */ \
129 "adcq $0, %%r14 \n\t" /* r[6] += flag_c */ \
130 "addq %%r8, %%r11 \n\t" /* r[3] += t[5] */ \
131 "adcq %%r15, %%r12 \n\t" /* r[4] += t[6] */ \
132 "adcq $0, %%r13 \n\t" /* r[5] += flag_c */ \
133 \
134 /* double result registers */ \
135 "addq %%r9, %%r9 \n\t" /* r[1] = 2r[1] */ \
136 "adcq %%r10, %%r10 \n\t" /* r[2] = 2r[2] */ \
137 "adcq %%r11, %%r11 \n\t" /* r[3] = 2r[3] */ \
138 "adcq %%r12, %%r12 \n\t" /* r[4] = 2r[4] */ \
139 "adcq %%r13, %%r13 \n\t" /* r[5] = 2r[5] */ \
140 "adcq %%r14, %%r14 \n\t" /* r[6] = 2r[6] */ \
141 \
142 /* compute a[3]*a[3], a[2]*a[2], a[1]*a[1], a[0]*a[0] */ \
143 "movq 0(" a "), %%rdx \n\t" /* load a[0] into %rdx */ \
144 "mulxq %%rdx, %%r8, %%rcx \n\t" /* (r[0], t[4]) <- a[0] * a[0] */ \
145 "movq 16(" a "), %%rdx \n\t" /* load a[2] into %rdx */ \
146 "mulxq %%rdx, %%rdx, %%rdi \n\t" /* (t[7], t[8]) <- a[2] * a[2] */ \
147 /* add squares into result registers */ \
148 "addq %%rdx, %%r12 \n\t" /* r[4] += t[7] */ \
149 "adcq %%rdi, %%r13 \n\t" /* r[5] += t[8] */ \
150 "adcq $0, %%r14 \n\t" /* r[6] += flag_c */ \
151 "addq %%rcx, %%r9 \n\t" /* r[1] += t[4] */ \
152 "movq 24(" a "), %%rdx \n\t" /* load a[3] into %rdx (movq/mulxq leave CF intact for the adcq below) */ \
153 "mulxq %%rdx, %%rcx, %%r15 \n\t" /* (t[5], r[7]) <- a[3] * a[3] */ \
154 "movq 8(" a "), %%rdx \n\t" /* load a[1] into %rdx */ \
155 "mulxq %%rdx, %%rdi, %%rdx \n\t" /* (t[3], t[6]) <- a[1] * a[1] */ \
156 "adcq %%rdi, %%r10 \n\t" /* r[2] += t[3] */ \
157 "adcq %%rdx, %%r11 \n\t" /* r[3] += t[6] */ \
158 "adcq $0, %%r12 \n\t" /* r[4] += flag_c */ \
159 "addq %%rcx, %%r14 \n\t" /* r[6] += t[5] */ \
160 "adcq $0, %%r15 \n\t" /* r[7] += flag_c */ \
161 \
162 /* perform modular reduction: r[0] */ \
163 "movq %%r8, %%rdx \n\t" /* move r8 into %rdx */ \
164 "mulxq %[r_inv], %%rdx, %%rdi \n\t" /* (%rdx, _) <- k = r[0] * r_inv */ \
165 "mulxq %[modulus_0], %%rdi, %%rcx \n\t" /* (t[0], t[1]) <- (modulus[0] * k) */ \
166 "addq %%rdi, %%r8 \n\t" /* r[0] += t[0] (%r8 now free) */ \
167 "adcq %%rcx, %%r9 \n\t" /* r[1] += t[1] + flag_c */ \
168 "mulxq %[modulus_1], %%rdi, %%rcx \n\t" /* (t[2], t[3]) <- (modulus[1] * k) */ \
169 "adcq %%rcx, %%r10 \n\t" /* r[2] += t[3] + flag_c */ \
170 "adcq $0, %%r11 \n\t" /* r[3] += flag_c */ \
171 /* Partial fix "adcq $0, %%r12 \n\t"*/ /* NOTE(review): disabled carry from r[3] into r[4]; the next addq resets CF, so such a carry would be dropped - confirm it cannot occur */ \
172 "addq %%rdi, %%r9 \n\t" /* r[1] += t[2] */ \
173 "mulxq %[modulus_2], %%rdi, %%rcx \n\t" /* (t[0], t[1]) <- (modulus[2] * k) */ \
174 "mulxq %[modulus_3], %%r8, %%rdx \n\t" /* (t[2], t[3]) <- (modulus[3] * k) */ \
175 "adcq %%rdi, %%r10 \n\t" /* r[2] += t[0] + flag_c */ \
176 "adcq %%rcx, %%r11 \n\t" /* r[3] += t[1] + flag_c */ \
177 "adcq %%rdx, %%r12 \n\t" /* r[4] += t[3] + flag_c */ \
178 "adcq $0, %%r13 \n\t" /* r[5] += flag_c */ \
179 "addq %%r8, %%r11 \n\t" /* r[3] += t[2] */ \
180 "adcq $0, %%r12 \n\t" /* r[4] += flag_c */ \
181 \
182 /* perform modular reduction: r[1] */ \
183 "movq %%r9, %%rdx \n\t" /* move r9 into %rdx */ \
184 "mulxq %[r_inv], %%rdx, %%rdi \n\t" /* (%rdx, _) <- k = r[1] * r_inv */ \
185 "mulxq %[modulus_0], %%rdi, %%rcx \n\t" /* (t[0], t[1]) <- (modulus[0] * k) */ \
186 "addq %%rdi, %%r9 \n\t" /* r[1] += t[0] (%r9 now free) */ \
187 "adcq %%rcx, %%r10 \n\t" /* r[2] += t[1] + flag_c */ \
188 "mulxq %[modulus_1], %%rdi, %%rcx \n\t" /* (t[2], t[3]) <- (modulus[1] * k) */ \
189 "adcq %%rcx, %%r11 \n\t" /* r[3] += t[3] + flag_c */ \
190 "adcq $0, %%r12 \n\t" /* r[4] += flag_c */ \
191 "addq %%rdi, %%r10 \n\t" /* r[2] += t[2] */ \
192 "mulxq %[modulus_2], %%rdi, %%rcx \n\t" /* (t[0], t[1]) <- (modulus[2] * k) */ \
193 "mulxq %[modulus_3], %%r8, %%r9 \n\t" /* (t[2], t[3]) <- (modulus[3] * k) */ \
194 "adcq %%rdi, %%r11 \n\t" /* r[3] += t[0] + flag_c */ \
195 "adcq %%rcx, %%r12 \n\t" /* r[4] += t[1] + flag_c */ \
196 "adcq %%r9, %%r13 \n\t" /* r[5] += t[3] + flag_c */ \
197 "adcq $0, %%r14 \n\t" /* r[6] += flag_c */ \
198 "addq %%r8, %%r12 \n\t" /* r[4] += t[2] */ \
199 "adcq $0, %%r13 \n\t" /* r[5] += flag_c */ \
200 \
201 /* perform modular reduction: r[2] */ \
202 "movq %%r10, %%rdx \n\t" /* move r10 into %rdx */ \
203 "mulxq %[r_inv], %%rdx, %%rdi \n\t" /* (%rdx, _) <- k = r[2] * r_inv */ \
204 "mulxq %[modulus_0], %%rdi, %%rcx \n\t" /* (t[0], t[1]) <- (modulus[0] * k) */ \
205 "addq %%rdi, %%r10 \n\t" /* r[2] += t[0] (%r10 now free) */ \
206 "adcq %%rcx, %%r11 \n\t" /* r[3] += t[1] + flag_c */ \
207 "mulxq %[modulus_1], %%rdi, %%rcx \n\t" /* (t[2], t[3]) <- (modulus[1] * k) */ \
208 "mulxq %[modulus_2], %%r8, %%r9 \n\t" /* (t[0], t[1]) <- (modulus[2] * k) */ \
209 "mulxq %[modulus_3], %%r10, %%rdx \n\t" /* (t[2], t[3]) <- (modulus[3] * k) */ \
210 "adcq %%rcx, %%r12 \n\t" /* r[4] += t[3] + flag_c */ \
211 "adcq %%r9, %%r13 \n\t" /* r[5] += t[1] + flag_c */ \
212 "adcq %%rdx, %%r14 \n\t" /* r[6] += t[3] + flag_c */ \
213 "adcq $0, %%r15 \n\t" /* r[7] += flag_c */ \
214 "addq %%rdi, %%r11 \n\t" /* r[3] += t[2] */ \
215 "adcq %%r8, %%r12 \n\t" /* r[4] += t[0] + flag_c */ \
216 "adcq %%r10, %%r13 \n\t" /* r[5] += t[2] + flag_c */ \
217 "adcq $0, %%r14 \n\t" /* r[6] += flag_c */ \
218 \
219 /* perform modular reduction: r[3] */ \
220 "movq %%r11, %%rdx \n\t" /* move r11 into %rdx */ \
221 "mulxq %[r_inv], %%rdx, %%rdi \n\t" /* (%rdx, _) <- k = r[3] * r_inv */ \
222 "mulxq %[modulus_0], %%rdi, %%rcx \n\t" /* (t[0], t[1]) <- (modulus[0] * k) */ \
223 "mulxq %[modulus_1], %%r8, %%r9 \n\t" /* (t[2], t[3]) <- (modulus[1] * k) */ \
224 "addq %%rdi, %%r11 \n\t" /* r[3] += t[0] (%r11 now free) */ \
225 "adcq %%r8, %%r12 \n\t" /* r[4] += t[2] */ \
226 "adcq %%r9, %%r13 \n\t" /* r[5] += t[3] + flag_c */ \
227 "mulxq %[modulus_2], %%r8, %%r9 \n\t" /* (t[0], t[1]) <- (modulus[2] * k) */ \
228 "mulxq %[modulus_3], %%r10, %%r11 \n\t" /* (t[2], t[3]) <- (modulus[3] * k) */ \
229 "adcq %%r9, %%r14 \n\t" /* r[6] += t[1] + flag_c */ \
230 "adcq %%r11, %%r15 \n\t" /* r[7] += t[3] + flag_c */ \
231 "addq %%rcx, %%r12 \n\t" /* r[4] += t[1] */ \
232 "adcq %%r8, %%r13 \n\t" /* r[5] += t[0] + flag_c */ \
233 "adcq %%r10, %%r14 \n\t" /* r[6] += t[2] + flag_c */ \
234 "adcq $0, %%r15 \n\t" /* r[7] += flag_c */
235
236
/**
 * MUL(a1, a2, a3, a4, b) -- non-ADX variant: multiplication with interleaved
 * Montgomery-style reduction.
 *
 * For i = 0..3 the macro (1) loads limb a[i] into %rdx, (2) accumulates
 * a[i] * b into the running result r, then (3) runs one reduction round:
 * k = low64(r[i] * r_inv), r += k * modulus shifted by i limbs. Presumably
 * r_inv = -modulus^{-1} mod 2^64 so each round zeroes r[i] -- confirm against
 * the caller's operand setup.
 *
 * Register roles: r[0]=%r13, r[1]=%r14, r[2]=%r15, r[3]=%r10, r[4]=%r12; the
 * freed low-limb registers are reused as r[5]=%r13, r[6]=%r14, r[7]=%r15.
 * Result: r[4..7] in (%r12, %r13, %r14, %r15), lowest limb in %r12. No final
 *         conditional subtraction of the modulus is performed here.
 *
 * a1..a4:  operand strings for limbs a[0]..a[3]; each is the source of a movq.
 * b:       string naming the register that holds the address of operand b
 *          (limbs at byte offsets 0, 8, 16, 24).
 * Expects: named asm operands %[r_inv], %[modulus_0] .. %[modulus_3].
 * Writes:  %rdx, %rdi, %r8-%r15 and flags.
 * Uses addq/adcq carry chains; mulxq (BMI2) is still required.
 **/
241#define MUL(a1, a2, a3, a4, b) \
242 "movq " a1 ", %%rdx \n\t" /* load a[0] into %rdx */ \
243 "xorq %%r8, %%r8 \n\t" /* zero %r8 (overwritten by the next mulxq); also clears CF and OF */ \
244 /* front-load mul ops, can parallelize 4 of these but latency is 4 cycles */ \
245 "mulxq 8(" b "), %%r8, %%r9 \n\t" /* (t[0], t[1]) <- a[0] * b[1] */ \
246 "mulxq 24(" b "), %%rdi, %%r12 \n\t" /* (t[2], r[4]) <- a[0] * b[3] (overwrite a[0]) */ \
247 "mulxq 0(" b "), %%r13, %%r14 \n\t" /* (r[0], r[1]) <- a[0] * b[0] */ \
248 "mulxq 16(" b "), %%r15, %%r10 \n\t" /* (r[2] , r[3]) <- a[0] * b[2] */ \
249 /* zero flags */ \
250 \
251 /* start computing modular reduction */ \
252 "movq %%r13, %%rdx \n\t" /* move r[0] into %rdx */ \
253 "mulxq %[r_inv], %%rdx, %%r11 \n\t" /* (%rdx, _) <- k = r[0] * r_inv */ \
254 \
255 /* start first addition chain */ \
256 "addq %%r8, %%r14 \n\t" /* r[1] += t[0] */ \
257 "adcq %%r9, %%r15 \n\t" /* r[2] += t[1] + flag_c */ \
258 "adcq %%rdi, %%r10 \n\t" /* r[3] += t[2] + flag_c */ \
259 "adcq $0, %%r12 \n\t" /* r[4] += flag_c */ \
260 \
261 /* reduce by r[0] * k */ \
262 "mulxq %[modulus_0], %%r8, %%r9 \n\t" /* (t[0], t[1]) <- (modulus.data[0] * k) */ \
263 "mulxq %[modulus_1], %%rdi, %%r11 \n\t" /* (t[2], t[3]) <- (modulus.data[1] * k) */ \
264 "addq %%r8, %%r13 \n\t" /* r[0] += t[0] (%r13 now free) */ \
265 "adcq %%rdi, %%r14 \n\t" /* r[1] += t[2] + flag_c */ \
266 "adcq %%r11, %%r15 \n\t" /* r[2] += t[3] + flag_c */ \
267 "adcq $0, %%r10 \n\t" /* r[3] += flag_c */ \
268 "adcq $0, %%r12 \n\t" /* r[4] += flag_c */ \
269 "addq %%r9, %%r14 \n\t" /* r[1] += t[1] */ \
270 "mulxq %[modulus_2], %%r8, %%r9 \n\t" /* (t[0], t[1]) <- (modulus.data[2] * k) */ \
271 "mulxq %[modulus_3], %%rdi, %%r11 \n\t" /* (t[2], t[3]) <- (modulus.data[3] * k) */ \
272 "adcq %%r8, %%r15 \n\t" /* r[2] += t[0] + flag_c */ \
273 "adcq %%rdi, %%r10 \n\t" /* r[3] += t[2] + flag_c */ \
274 "adcq %%r11, %%r12 \n\t" /* r[4] += t[3] + flag_c */ \
275 "addq %%r9, %%r10 \n\t" /* r[3] += t[1] */ \
276 "adcq $0, %%r12 \n\t" /* r[4] += flag_c */ \
277 \
278 /* modulus = 254 bits, so max(t[3]) = 62 bits */ \
279 /* b also 254 bits, so (a[0] * b[3]) = 62 bits */ \
280 /* i.e. carry flag here is always 0 if b is in mont form, no need to update r[5] */ \
281 /* (which is very convenient because we're out of registers!) */ \
282 /* N.B. the value of r[4] now has a max of 63 bits and can accept another 62 bit value before overflowing */ \
283 \
284 /* a[1] * b */ \
285 "movq " a2 ", %%rdx \n\t" /* load a[1] into %rdx */ \
286 "mulxq 0(" b "), %%r8, %%r9 \n\t" /* (t[0], t[1]) <- (a[1] * b[0]) */ \
287 "mulxq 8(" b "), %%rdi, %%r11 \n\t" /* (t[4], t[5]) <- (a[1] * b[1]) */ \
288 "addq %%r8, %%r14 \n\t" /* r[1] += t[0] */ \
289 "adcq %%rdi, %%r15 \n\t" /* r[2] += t[4] + flag_c */ \
290 "adcq %%r11, %%r10 \n\t" /* r[3] += t[5] + flag_c */ \
291 "adcq $0, %%r12 \n\t" /* r[4] += flag_c */ \
292 "addq %%r9, %%r15 \n\t" /* r[2] += t[1] */ \
293 \
294 "mulxq 16(" b "), %%r8, %%r9 \n\t" /* (t[2], t[3]) <- (a[1] * b[2]) */ \
295 "mulxq 24(" b "), %%rdi, %%r13 \n\t" /* (t[6], r[5]) <- (a[1] * b[3]) */ \
296 "adcq %%r8, %%r10 \n\t" /* r[3] += t[2] + flag_c */ \
297 "adcq %%rdi, %%r12 \n\t" /* r[4] += t[6] + flag_c */ \
298 "adcq $0, %%r13 \n\t" /* r[5] += flag_c */ \
299 "addq %%r9, %%r12 \n\t" /* r[4] += t[3] */ \
300 "adcq $0, %%r13 \n\t" /* r[5] += flag_c */ \
301 \
302 /* reduce by r[1] * k */ \
303 "movq %%r14, %%rdx \n\t" /* move r[1] into %rdx */ \
304 "mulxq %[r_inv], %%rdx, %%r8 \n\t" /* (%rdx, _) <- k = r[1] * r_inv */ \
305 "mulxq %[modulus_0], %%r8, %%r9 \n\t" /* (t[0], t[1]) <- (modulus.data[0] * k) */ \
306 "mulxq %[modulus_1], %%rdi, %%r11 \n\t" /* (t[2], t[3]) <- (modulus.data[1] * k) */ \
307 "addq %%r8, %%r14 \n\t" /* r[1] += t[0] (%r14 now free) */ \
308 "adcq %%rdi, %%r15 \n\t" /* r[2] += t[2] + flag_c */ \
309 "adcq %%r11, %%r10 \n\t" /* r[3] += t[3] + flag_c */ \
310 "adcq $0, %%r12 \n\t" /* r[4] += flag_c */ \
311 "adcq $0, %%r13 \n\t" /* r[5] += flag_c */ \
312 "addq %%r9, %%r15 \n\t" /* r[2] += t[1] */ \
313 "mulxq %[modulus_2], %%r8, %%r9 \n\t" /* (t[0], t[1]) <- (modulus.data[2] * k) */ \
314 "mulxq %[modulus_3], %%rdi, %%r11 \n\t" /* (t[2], t[3]) <- (modulus.data[3] * k) */ \
315 "adcq %%r8, %%r10 \n\t" /* r[3] += t[0] + flag_c */ \
316 "adcq %%r9, %%r12 \n\t" /* r[4] += t[1] + flag_c */ \
317 "adcq %%r11, %%r13 \n\t" /* r[5] += t[3] + flag_c */ \
318 "addq %%rdi, %%r12 \n\t" /* r[4] += t[2] */ \
319 "adcq $0, %%r13 \n\t" /* r[5] += flag_c */ \
320 \
321 /* a[2] * b */ \
322 "movq " a3 ", %%rdx \n\t" /* load a[2] into %rdx */ \
323 "mulxq 0(" b "), %%r8, %%r9 \n\t" /* (t[0], t[1]) <- (a[2] * b[0]) */ \
324 "mulxq 8(" b "), %%rdi, %%r11 \n\t" /* (t[2], t[3]) <- (a[2] * b[1]) */ \
325 "addq %%r8, %%r15 \n\t" /* r[2] += t[0] */ \
326 "adcq %%r9, %%r10 \n\t" /* r[3] += t[1] + flag_c */ \
327 "adcq %%r11, %%r12 \n\t" /* r[4] += t[3] + flag_c */ \
328 "adcq $0, %%r13 \n\t" /* r[5] += flag_c */ \
329 "addq %%rdi, %%r10 \n\t" /* r[3] += t[2] */ \
330 "mulxq 16(" b "), %%r8, %%r9 \n\t" /* (t[0], t[1]) <- (a[2] * b[2]) */ \
331 "mulxq 24(" b "), %%rdi, %%r14 \n\t" /* (t[2], r[6]) <- (a[2] * b[3]) */ \
332 "adcq %%r8, %%r12 \n\t" /* r[4] += t[0] + flag_c */ \
333 "adcq %%r9, %%r13 \n\t" /* r[5] += t[1] + flag_c */ \
334 "adcq $0, %%r14 \n\t" /* r[6] += flag_c */ \
335 "addq %%rdi, %%r13 \n\t" /* r[5] += t[2] */ \
336 "adcq $0, %%r14 \n\t" /* r[6] += flag_c */ \
337 \
338 /* reduce by r[2] * k */ \
339 "movq %%r15, %%rdx \n\t" /* move r[2] into %rdx */ \
340 "mulxq %[r_inv], %%rdx, %%r8 \n\t" /* (%rdx, _) <- k = r[2] * r_inv */ \
341 "mulxq %[modulus_0], %%r8, %%r9 \n\t" /* (t[0], t[1]) <- (modulus.data[0] * k) */ \
342 "mulxq %[modulus_1], %%rdi, %%r11 \n\t" /* (t[2], t[3]) <- (modulus.data[1] * k) */ \
343 "addq %%r8, %%r15 \n\t" /* r[2] += t[0] (%r15 now free) */ \
344 "adcq %%r9, %%r10 \n\t" /* r[3] += t[1] + flag_c */ \
345 "adcq %%r11, %%r12 \n\t" /* r[4] += t[3] + flag_c */ \
346 "adcq $0, %%r13 \n\t" /* r[5] += flag_c */ \
347 "adcq $0, %%r14 \n\t" /* r[6] += flag_c */ \
348 "addq %%rdi, %%r10 \n\t" /* r[3] += t[2] */ \
349 "mulxq %[modulus_2], %%r8, %%r9 \n\t" /* (t[0], t[1]) <- (modulus.data[2] * k) */ \
350 "mulxq %[modulus_3], %%rdi, %%r11 \n\t" /* (t[2], t[3]) <- (modulus.data[3] * k) */ \
351 "adcq %%r8, %%r12 \n\t" /* r[4] += t[0] + flag_c */ \
352 "adcq %%r9, %%r13 \n\t" /* r[5] += t[1] + flag_c */ \
353 "adcq %%r11, %%r14 \n\t" /* r[6] += t[3] + flag_c */ \
354 "addq %%rdi, %%r13 \n\t" /* r[5] += t[2] */ \
355 "adcq $0, %%r14 \n\t" /* r[6] += flag_c */ \
356 \
357 /* a[3] * b */ \
358 "movq " a4 ", %%rdx \n\t" /* load a[3] into %rdx */ \
359 "mulxq 0(" b "), %%r8, %%r9 \n\t" /* (t[0], t[1]) <- (a[3] * b[0]) */ \
360 "mulxq 8(" b "), %%rdi, %%r11 \n\t" /* (t[2], t[3]) <- (a[3] * b[1]) */ \
361 "addq %%r8, %%r10 \n\t" /* r[3] += t[0] */ \
362 "adcq %%r9, %%r12 \n\t" /* r[4] += t[1] + flag_c */ \
363 "adcq %%r11, %%r13 \n\t" /* r[5] += t[3] + flag_c */ \
364 "adcq $0, %%r14 \n\t" /* r[6] += flag_c */ \
365 "addq %%rdi, %%r12 \n\t" /* r[4] += t[2] */ \
366 \
367 "mulxq 16(" b "), %%r8, %%r9 \n\t" /* (t[4], t[5]) <- (a[3] * b[2]) */ \
368 "mulxq 24(" b "), %%rdi, %%r15 \n\t" /* (t[6], r[7]) <- (a[3] * b[3]) */ \
369 "adcq %%r8, %%r13 \n\t" /* r[5] += t[4] + flag_c */ \
370 "adcq %%r9, %%r14 \n\t" /* r[6] += t[5] + flag_c */ \
371 "adcq $0, %%r15 \n\t" /* r[7] += flag_c */ \
372 "addq %%rdi, %%r14 \n\t" /* r[6] += t[6] */ \
373 "adcq $0, %%r15 \n\t" /* r[7] += flag_c */ \
374 \
375 /* reduce by r[3] * k */ \
376 "movq %%r10, %%rdx \n\t" /* move r[3] into %rdx */ \
377 "mulxq %[r_inv], %%rdx, %%r8 \n\t" /* (%rdx, _) <- k = r[3] * r_inv */ \
378 "mulxq %[modulus_0], %%r8, %%r9 \n\t" /* (t[0], t[1]) <- (modulus.data[0] * k) */ \
379 "mulxq %[modulus_1], %%rdi, %%r11 \n\t" /* (t[2], t[3]) <- (modulus.data[1] * k) */ \
380 "addq %%r8, %%r10 \n\t" /* r[3] += t[0] (%r10 now free) */ \
381 "adcq %%r9, %%r12 \n\t" /* r[4] += t[1] + flag_c */ \
382 "adcq %%r11, %%r13 \n\t" /* r[5] += t[3] + flag_c */ \
383 "adcq $0, %%r14 \n\t" /* r[6] += flag_c */ \
384 "adcq $0, %%r15 \n\t" /* r[7] += flag_c */ \
385 "addq %%rdi, %%r12 \n\t" /* r[4] += t[2] */ \
386 \
387 "mulxq %[modulus_2], %%r8, %%r9 \n\t" /* (t[4], t[5]) <- (modulus.data[2] * k) */ \
388 "mulxq %[modulus_3], %%rdi, %%rdx \n\t" /* (t[6], t[7]) <- (modulus.data[3] * k) */ \
389 "adcq %%r8, %%r13 \n\t" /* r[5] += t[4] + flag_c */ \
390 "adcq %%r9, %%r14 \n\t" /* r[6] += t[5] + flag_c */ \
391 "adcq %%rdx, %%r15 \n\t" /* r[7] += t[7] + flag_c */ \
392 "addq %%rdi, %%r14 \n\t" /* r[6] += t[6] */ \
393 "adcq $0, %%r15 \n\t" /* r[7] += flag_c */
394
395
/**
 * MUL_256(a, b, r): truncated multiplication. Computes the low 256 bits of the
 * product of two 4-limb values and stores them to memory; no modular reduction.
 *
 * Only partial products that land in limbs 0..3 are accumulated:
 *   r[0] = lo(a0*b0)
 *   r[1] = hi(a0*b0) + lo(a0*b1) + lo(a1*b0)
 *   r[2] = hi(a0*b1) + lo(a0*b2) + hi(a1*b0) + lo(a1*b1) + lo(a2*b0) + carries
 *   r[3] = hi(a0*b2) + lo(a0*b3) + hi(a1*b1) + lo(a1*b2) + hi(a2*b0)
 *          + lo(a2*b1) + lo(a3*b0) + carries
 * Carries OUT of r[3] belong to limb 4 and are intentionally discarded, so an
 * addition into r[3] may only consume CF when CF was produced by limb 2.
 *
 * Register roles: r[0]=%r13, r[1]=%r14, r[2]=%r15, r[3]=%rax.
 *
 * a, b:   strings naming the registers that hold the operands' addresses
 *         (limbs at byte offsets 0, 8, 16, 24).
 * r:      string naming the register that holds the destination address.
 * Writes: %rax, %rdx, %rsi, %rdi, %r8, %r9, %r10, %r12-%r15, flags, and 32 bytes
 *         of memory at `r`. Requires mulxq (BMI2).
 **/
400#define MUL_256(a, b, r) \
401 "movq 0(" a "), %%rdx \n\t" /* load a[0] into %rdx */ \
402 \
403 /* front-load mul ops, can parallelize 4 of these but latency is 4 cycles */ \
404 "mulxq 8(" b "), %%r8, %%r9 \n\t" /* (t[0], t[1]) <- a[0] * b[1] */ \
405 "mulxq 24(" b "), %%rdi, %%r12 \n\t" /* (t[2], r[4]) <- a[0] * b[3]; only the low half t[2] is used */ \
406 "mulxq 0(" b "), %%r13, %%r14 \n\t" /* (r[0], r[1]) <- a[0] * b[0] */ \
407 "mulxq 16(" b "), %%r15, %%rax \n\t" /* (r[2] , r[3]) <- a[0] * b[2] */ \
408 /* zero flags */ \
409 "xorq %%r10, %%r10 \n\t" /* clear r10 register, we use this when we need 0 */ \
410 \
411 \
412 /* start first addition chain */ \
413 "addq %%r8, %%r14 \n\t" /* r[1] += t[0] */ \
414 "adcq %%r9, %%r15 \n\t" /* r[2] += t[1] + flag_c */ \
415 "adcq %%r10, %%rax \n\t" /* r[3] += flag_c */ \
416 "addq %%rdi, %%rax \n\t" /* r[3] += t[2] (carry out of r[3] is discarded) */ \
417 \
418 /* a[1] * b */ \
419 "movq 8(" a "), %%rdx \n\t" /* load a[1] into %rdx */ \
420 "mulxq 0(" b "), %%r8, %%r9 \n\t" /* (t[0], t[1]) <- (a[1] * b[0]) */ \
421 "mulxq 8(" b "), %%rdi, %%rsi \n\t" /* (t[4], t[5]) <- (a[1] * b[1]) */ \
422 "addq %%r8, %%r14 \n\t" /* r[1] += t[0] */ \
423 "adcq %%r9, %%r15 \n\t" /* r[2] += t[1] + flag_c */ \
424 "adcq %%rsi, %%rax \n\t" /* r[3] += t[5] + flag_c */ \
425 "addq %%rdi, %%r15 \n\t" /* r[2] += t[4] */ \
426 \
427 "mulxq 16(" b "), %%r8, %%r9 \n\t" /* (t[2], t[3]) <- (a[1] * b[2]); only t[2] is used */ \
428 "adcq %%r8, %%rax \n\t" /* r[3] += t[2] + flag_c (carry from r[2]) */ \
429 \
430 /* a[2] * b */ \
431 "movq 16(" a "), %%rdx \n\t" /* load a[2] into %rdx */ \
432 "mulxq 0(" b "), %%r8, %%r9 \n\t" /* (t[0], t[1]) <- (a[2] * b[0]) */ \
433 "mulxq 8(" b "), %%rdi, %%rsi \n\t" /* (t[2], t[3]) <- (a[2] * b[1]); only t[2] is used */ \
434 "addq %%r8, %%r15 \n\t" /* r[2] += t[0] */ \
435 "adcq %%r9, %%rax \n\t" /* r[3] += t[1] + flag_c (carry from r[2]) */ \
436 "addq %%rdi, %%rax \n\t" /* r[3] += t[2] (carry out of r[3] is discarded) */ \
437 \
438 \
439 /* a[3] * b */ \
440 "movq 24(" a "), %%rdx \n\t" /* load a[3] into %rdx */ \
441 "mulxq 0(" b "), %%r8, %%r9 \n\t" /* (t[0], t[1]) <- (a[3] * b[0]); only t[0] is used */ \
442 "addq %%r8, %%rax \n\t" /* r[3] += t[0]; plain add: CF here is the carry OUT of r[3] and must not be re-added */ \
443 "movq %%r13, 0(" r ") \n\t" \
444 "movq %%r14, 8(" r ") \n\t" \
445 "movq %%r15, 16(" r ") \n\t" \
446 "movq %%rax, 24(" r ") \n\t"
447
448
449#else // __ADX__ is defined and DISABLE_ADX is not: ADX (adcxq/adoxq) variants follow (original note: 6047895us)
/**
 * ADD(b) -- ADX variant.
 *
 * Adds the four 64-bit limbs stored at byte offsets 0, 8, 16, 24 of the address
 * in `b` into the accumulator held in %r12 (lowest limb) .. %r15 using adcxq,
 * which reads and writes only the carry flag (CF).
 *
 * Unlike the non-ADX ADD, the first limb also consumes CF, so CF must be clear
 * on entry (presumably via CLEAR_FLAGS -- confirm at the call sites). The carry
 * out of the top limb is left in CF. No reduction is performed.
 *
 * b: string naming the register that holds the address of the addend.
 **/
454#define ADD(b) \
455 "adcxq 0(" b "), %%r12 \n\t" \
456 "adcxq 8(" b "), %%r13 \n\t" \
457 "adcxq 16(" b "), %%r14 \n\t" \
458 "adcxq 24(" b "), %%r15 \n\t"
459
/**
 * SUB(b) -- ADX build; the instruction sequence is the same subq/sbbq borrow
 * chain as the non-ADX variant (ADX provides no subtract-with-borrow form).
 *
 * Subtracts the four 64-bit limbs stored at byte offsets 0, 8, 16, 24 of the
 * address in `b` from the accumulator held in %r12 (lowest limb) .. %r15.
 * The borrow out of the top limb is left in CF; no correction is performed.
 *
 * b: string naming the register that holds the address of the subtrahend.
 **/
464#define SUB(b) \
465 "subq 0(" b "), %%r12 \n\t" \
466 "sbbq 8(" b "), %%r13 \n\t" \
467 "sbbq 16(" b "), %%r14 \n\t" \
468 "sbbq 24(" b "), %%r15 \n\t"
469
/**
 * ADD_REDUCE(b, modulus_0, modulus_1, modulus_2, modulus_3) -- ADX variant.
 *
 * Runs two independent carry chains, interleaved limb by limb:
 *   - adcxq (carry flag, CF):    r[i] += b[i]            -> the plain sum
 *   - adoxq (overflow flag, OF): r[i] += modulus_i       -> the adjusted sum
 * Between the two additions of each limb, the plain sum is copied to
 * %r8..%r11. Finally, if the OF chain produced no carry out of the top limb
 * (OF = 0), the plain sum is restored from %r8..%r11 (cmovnoq); otherwise the
 * adjusted sum is kept.
 *
 * Both chains consume their flag on the first limb, so CF and OF must both be
 * clear on entry (presumably via CLEAR_FLAGS -- confirm at the call sites).
 *
 * NOTE(review): despite the parameter names, this only acts as a modular
 * reduction if the operands hold the limbs of (2^256 - modulus), as
 * REDUCE_FIELD_ELEMENT's `neg_modulus_*` parameters suggest -- confirm.
 *
 * b:            string naming the register that holds the address of the addend.
 * modulus_0..3: operand strings added limb-wise on the OF chain.
 * Writes:       %r8-%r11 (scratch), %r12-%r15 (result), CF and OF.
 **/
474#define ADD_REDUCE(b, modulus_0, modulus_1, modulus_2, modulus_3) \
475 "adcxq 0(" b "), %%r12 \n\t" \
476 "movq %%r12, %%r8 \n\t" \
477 "adoxq " modulus_0 ", %%r12 \n\t" \
478 "adcxq 8(" b "), %%r13 \n\t" \
479 "movq %%r13, %%r9 \n\t" \
480 "adoxq " modulus_1 ", %%r13 \n\t" \
481 "adcxq 16(" b "), %%r14 \n\t" \
482 "movq %%r14, %%r10 \n\t" \
483 "adoxq " modulus_2 ", %%r14 \n\t" \
484 "adcxq 24(" b "), %%r15 \n\t" \
485 "movq %%r15, %%r11 \n\t" \
486 "adoxq " modulus_3 ", %%r15 \n\t" \
487 "cmovnoq %%r8, %%r12 \n\t" \
488 "cmovnoq %%r9, %%r13 \n\t" \
489 "cmovnoq %%r10, %%r14 \n\t" \
490 "cmovnoq %%r11, %%r15 \n\t"
491
492
/**
 * REDUCE_FIELD_ELEMENT(neg_modulus_0 .. neg_modulus_3) -- ADX variant.
 *
 * Conditionally subtracts the modulus from the value r held in %r12 (lowest
 * limb) .. %r15. A copy of r is saved in %r8..%r11, the neg_modulus_* operands
 * are added to r with an adoxq chain (carry kept in the overflow flag, OF), and
 * the copy is restored (cmovnoq) when that chain did not carry out of the top
 * limb.
 *
 * The first adoxq consumes OF, so OF must be clear on entry (presumably
 * guaranteed by the preceding instructions at the call site -- confirm).
 *
 * neg_modulus_0..3: operand strings; presumably the limbs of (2^256 - modulus),
 *                   so that the addition carries exactly when r >= modulus --
 *                   confirm against the call sites.
 * Writes:           %r8-%r11 (scratch), %r12-%r15 (result), OF.
 **/
497#define REDUCE_FIELD_ELEMENT(neg_modulus_0, neg_modulus_1, neg_modulus_2, neg_modulus_3) \
498 /* Duplicate `r` */ \
499 "movq %%r12, %%r8 \n\t" \
500 "movq %%r13, %%r9 \n\t" \
501 "movq %%r14, %%r10 \n\t" \
502 "movq %%r15, %%r11 \n\t" \
503 /* Add the negative representation of 'modulus' into `r`. We do this instead */ \
504 /* of subtracting, because we can use `adoxq`. */ \
505 /* This opcode only has a dependence on the overflow */ \
506 /* flag (sub/sbb changes both carry and overflow flags). */ \
507 /* We can process an `adcxq` and `adoxq` opcode simultaneously. */ \
508 "adoxq " neg_modulus_0 ", %%r12 \n\t" /* r'[0] -= modulus.data[0] */ \
509 "adoxq " neg_modulus_1 ", %%r13 \n\t" /* r'[1] -= modulus.data[1] */ \
510 "adoxq " neg_modulus_2 ", %%r14 \n\t" /* r'[2] -= modulus.data[2] */ \
511 "adoxq " neg_modulus_3 ", %%r15 \n\t" /* r'[3] -= modulus.data[3] */ \
512 \
513 /* if r does not need to be reduced, the adoxq chain does not carry (OF = 0) */ \
514 /* restore r' = r when the overflow flag is clear */ \
515 "cmovnoq %%r8, %%r12 \n\t" \
516 "cmovnoq %%r9, %%r13 \n\t" \
517 "cmovnoq %%r10, %%r14 \n\t" \
518 "cmovnoq %%r11, %%r15 \n\t"
519
520
/**
 * SQR(a) -- ADX variant: squaring with interleaved Montgomery-style reduction of
 * the 4-limb value stored at the address in `a`.
 *
 * Same structure as the non-ADX SQR, but additions are split across two
 * independent carry chains: adcxq carries through CF ("flag_c" below) and adoxq
 * carries through OF ("flag_o" below). The two chains are interleaved, and a
 * pending carry in either flag is deliberately left live across mulxq/movq
 * (which do not touch flags) and across section boundaries, so instruction
 * order within this macro must not be changed.
 *
 * Phase 1 (product): off-diagonal products a[i]*a[j] (i < j), doubled, plus the
 *   squares a[i]*a[i]; result r[0..7] = (%r8, %r9, ..., %r15).
 * Phase 2 (reduction): four rounds; round i computes k = low64(r[i] * r_inv)
 *   and adds k * modulus, shifted by i limbs, into r. Presumably
 *   r_inv = -modulus^{-1} mod 2^64 so each round zeroes r[i] -- confirm.
 *
 * Result: r[4..7] in (%r12, %r13, %r14, %r15), lowest limb in %r12. No final
 *         conditional subtraction of the modulus is performed here.
 *
 * a:       string naming the register that holds the operand's address.
 * Expects: named asm operands %[r_inv], %[modulus_0] .. %[modulus_3], and
 *          %[zero_reference] (used as a zero addend to absorb a pending carry).
 * Writes:  %rdx, %rdi, %rcx, %r8-%r15 and flags.
 **/
525#define SQR(a) \
526 "movq 0(" a "), %%rdx \n\t" /* load a[0] into %rdx */ \
527 \
528 "xorq %%r8, %%r8 \n\t" /* clear flags */ \
529 /* compute a[0] *a[1], a[0]*a[2], a[0]*a[3], a[1]*a[2], a[1]*a[3], a[2]*a[3] */ \
530 "mulxq 8(" a "), %%r9, %%r10 \n\t" /* (r[1], r[2]) <- a[0] * a[1] */ \
531 "mulxq 16(" a "), %%r8, %%r15 \n\t" /* (t[1], t[2]) <- a[0] * a[2] */ \
532 "mulxq 24(" a "), %%r11, %%r12 \n\t" /* (r[3], r[4]) <- a[0] * a[3] */ \
533 \
534 \
535 /* accumulate products into result registers */ \
536 "adoxq %%r8, %%r10 \n\t" /* r[2] += t[1] */ \
537 "adcxq %%r15, %%r11 \n\t" /* r[3] += t[2] */ \
538 "movq 8(" a "), %%rdx \n\t" /* load a[1] into %rdx */ \
539 "mulxq 16(" a "), %%r8, %%r15 \n\t" /* (t[5], t[6]) <- a[1] * a[2] */ \
540 "mulxq 24(" a "), %%rdi, %%rcx \n\t" /* (t[3], t[4]) <- a[1] * a[3] */ \
541 "movq 24(" a "), %%rdx \n\t" /* load a[3] into %rdx */ \
542 "mulxq 16(" a "), %%r13, %%r14 \n\t" /* (r[5], r[6]) <- a[3] * a[2] */ \
543 "adoxq %%r8, %%r11 \n\t" /* r[3] += t[5] */ \
544 "adcxq %%rdi, %%r12 \n\t" /* r[4] += t[3] */ \
545 "adoxq %%r15, %%r12 \n\t" /* r[4] += t[6] */ \
546 "adcxq %%rcx, %%r13 \n\t" /* r[5] += t[4] + flag_c */ \
547 "adoxq %[zero_reference], %%r13 \n\t" /* r[5] += flag_o */ \
548 "adcxq %[zero_reference], %%r14 \n\t" /* r[6] += flag_c */ \
549 "adoxq %[zero_reference], %%r14 \n\t" /* r[6] += flag_o */ \
550 \
551 /* double result registers */ \
552 "adoxq %%r9, %%r9 \n\t" /* r[1] = 2r[1] */ \
553 "adcxq %%r12, %%r12 \n\t" /* r[4] = 2r[4] */ \
554 "adoxq %%r10, %%r10 \n\t" /* r[2] = 2r[2] */ \
555 "adcxq %%r13, %%r13 \n\t" /* r[5] = 2r[5] */ \
556 "adoxq %%r11, %%r11 \n\t" /* r[3] = 2r[3] */ \
557 "adcxq %%r14, %%r14 \n\t" /* r[6] = 2r[6] */ \
558 \
559 /* compute a[3]*a[3], a[2]*a[2], a[1]*a[1], a[0]*a[0] */ \
560 "movq 0(" a "), %%rdx \n\t" /* load a[0] into %rdx */ \
561 "mulxq %%rdx, %%r8, %%rcx \n\t" /* (r[0], t[4]) <- a[0] * a[0] */ \
562 "movq 16(" a "), %%rdx \n\t" /* load a[2] into %rdx */ \
563 "mulxq %%rdx, %%rdx, %%rdi \n\t" /* (t[7], t[8]) <- a[2] * a[2] */ \
564 /* add squares into result registers */ \
565 "adcxq %%rcx, %%r9 \n\t" /* r[1] += t[4] */ \
566 "adoxq %%rdx, %%r12 \n\t" /* r[4] += t[7] */ \
567 "adoxq %%rdi, %%r13 \n\t" /* r[5] += t[8] */ \
568 "movq 24(" a "), %%rdx \n\t" /* load a[3] into %rdx */ \
569 "mulxq %%rdx, %%rcx, %%r15 \n\t" /* (t[5], r[7]) <- a[3] * a[3] */ \
570 "movq 8(" a "), %%rdx \n\t" /* load a[1] into %rdx */ \
571 "mulxq %%rdx, %%rdi, %%rdx \n\t" /* (t[3], t[6]) <- a[1] * a[1] */ \
572 "adcxq %%rdi, %%r10 \n\t" /* r[2] += t[3] */ \
573 "adcxq %%rdx, %%r11 \n\t" /* r[3] += t[6] */ \
574 "adoxq %%rcx, %%r14 \n\t" /* r[6] += t[5] */ \
575 "adoxq %[zero_reference], %%r15 \n\t" /* r[7] += flag_o */ \
576 \
577 /* perform modular reduction: r[0] */ \
578 "movq %%r8, %%rdx \n\t" /* move r8 into %rdx */ \
579 "mulxq %[r_inv], %%rdx, %%rdi \n\t" /* (%rdx, _) <- k = r[0] * r_inv */ \
580 "mulxq %[modulus_0], %%rdi, %%rcx \n\t" /* (t[0], t[1]) <- (modulus[0] * k) */ \
581 "adoxq %%rdi, %%r8 \n\t" /* r[0] += t[0] (%r8 now free) */ \
582 "mulxq %[modulus_3], %%r8, %%rdi \n\t" /* (t[2], t[3]) <- (modulus[3] * k) */ \
583 "adcxq %%rdi, %%r12 \n\t" /* r[4] += t[3] + flag_c */ \
584 "adoxq %%rcx, %%r9 \n\t" /* r[1] += t[1] + flag_o */ \
585 "adcxq %[zero_reference], %%r13 \n\t" /* r[5] += flag_c */ \
586 "adcxq %[zero_reference], %%r14 \n\t" /* r[6] += flag_c */ \
587 "mulxq %[modulus_1], %%rdi, %%rcx \n\t" /* (t[2], t[3]) <- (modulus[1] * k) */ \
588 "adcxq %[zero_reference], %%r15 \n\t" /* r[7] += flag_c */ \
589 "adoxq %%rcx, %%r10 \n\t" /* r[2] += t[3] + flag_o */ \
590 "adcxq %%rdi, %%r9 \n\t" /* r[1] += t[2] */ \
591 "adoxq %%r8, %%r11 \n\t" /* r[3] += t[2] + flag_o */ \
592 "mulxq %[modulus_2], %%rdi, %%rcx \n\t" /* (t[0], t[1]) <- (modulus[2] * k) */ \
593 "adcxq %%rdi, %%r10 \n\t" /* r[2] += t[0] + flag_c */ \
594 "adcxq %%rcx, %%r11 \n\t" /* r[3] += t[1] + flag_c */ \
595 \
596 /* perform modular reduction: r[1] */ \
597 "movq %%r9, %%rdx \n\t" /* move r9 into %rdx */ \
598 "mulxq %[r_inv], %%rdx, %%rdi \n\t" /* (%rdx, _) <- k = r[1] * r_inv */ \
599 "mulxq %[modulus_2], %%rdi, %%rcx \n\t" /* (t[0], t[1]) <- (modulus[2] * k) */ \
600 "adoxq %%rcx, %%r12 \n\t" /* r[4] += t[1] + flag_o */ \
601 "mulxq %[modulus_3], %%r8, %%rcx \n\t" /* (t[2], t[3]) <- (modulus[3] * k) */ \
602 "adcxq %%r8, %%r12 \n\t" /* r[4] += t[2] + flag_c */ \
603 "adoxq %%rcx, %%r13 \n\t" /* r[5] += t[3] + flag_o */ \
604 "adcxq %[zero_reference], %%r13 \n\t" /* r[5] += flag_c */ \
605 "adoxq %[zero_reference], %%r14 \n\t" /* r[6] += flag_o */ \
606 "adcxq %[zero_reference], %%r14 \n\t" /* r[6] += flag_c */ \
607 "adoxq %[zero_reference], %%r15 \n\t" /* r[7] += flag_o */ \
608 "adcxq %[zero_reference], %%r15 \n\t" /* r[7] += flag_c */ \
609 "mulxq %[modulus_0], %%r8, %%rcx \n\t" /* (t[0], t[1]) <- (modulus[0] * k) */ \
610 "adcxq %%r8, %%r9 \n\t" /* r[1] += t[0] (%r9 now free) */ \
611 "adoxq %%rcx, %%r10 \n\t" /* r[2] += t[1] + flag_o */ \
612 "mulxq %[modulus_1], %%r8, %%rcx \n\t" /* (t[2], t[3]) <- (modulus[1] * k) */ \
613 "adcxq %%r8, %%r10 \n\t" /* r[2] += t[2] */ \
614 "adoxq %%rcx, %%r11 \n\t" /* r[3] += t[3] + flag_o */ \
615 "adcxq %%rdi, %%r11 \n\t" /* r[3] += t[0] + flag_c */ \
616 \
617 /* perform modular reduction: r[2] */ \
618 "movq %%r10, %%rdx \n\t" /* move r10 into %rdx */ \
619 "mulxq %[r_inv], %%rdx, %%rdi \n\t" /* (%rdx, _) <- k = r[2] * r_inv */ \
620 "mulxq %[modulus_1], %%rdi, %%rcx \n\t" /* (t[2], t[3]) <- (modulus[1] * k) */ \
621 "mulxq %[modulus_2], %%r8, %%r9 \n\t" /* (t[0], t[1]) <- (modulus[2] * k) */ \
622 "adoxq %%rcx, %%r12 \n\t" /* r[4] += t[3] + flag_o */ \
623 "adcxq %%r8, %%r12 \n\t" /* r[4] += t[0] + flag_c */ \
624 "adoxq %%r9, %%r13 \n\t" /* r[5] += t[1] + flag_o */ \
625 "mulxq %[modulus_3], %%r8, %%r9 \n\t" /* (t[2], t[3]) <- (modulus[3] * k) */ \
626 "adcxq %%r8, %%r13 \n\t" /* r[5] += t[2] + flag_c */ \
627 "adoxq %%r9, %%r14 \n\t" /* r[6] += t[3] + flag_o */ \
628 "adcxq %[zero_reference], %%r14 \n\t" /* r[6] += flag_c */ \
629 "adoxq %[zero_reference], %%r15 \n\t" /* r[7] += flag_o */ \
630 "adcxq %[zero_reference], %%r15 \n\t" /* r[7] += flag_c */ \
631 "mulxq %[modulus_0], %%r8, %%r9 \n\t" /* (t[0], t[1]) <- (modulus[0] * k) */ \
632 "adcxq %%r8, %%r10 \n\t" /* r[2] += t[0] (%r10 now free) */ \
633 "adoxq %%r9, %%r11 \n\t" /* r[3] += t[1] + flag_o */ \
634 "adcxq %%rdi, %%r11 \n\t" /* r[3] += t[2] */ \
635 "adoxq %[zero_reference], %%r12 \n\t" /* r[4] += flag_o */ \
636 "adoxq %[zero_reference], %%r13 \n\t" /* r[5] += flag_o */ \
637 \
638 /* perform modular reduction: r[3] */ \
639 "movq %%r11, %%rdx \n\t" /* move r11 into %rdx */ \
640 "mulxq %[r_inv], %%rdx, %%rdi \n\t" /* (%rdx, _) <- k = r[3] * r_inv */ \
641 "mulxq %[modulus_0], %%rdi, %%rcx \n\t" /* (t[0], t[1]) <- (modulus[0] * k) */ \
642 "mulxq %[modulus_1], %%r8, %%r9 \n\t" /* (t[2], t[3]) <- (modulus[1] * k) */ \
643 "adoxq %%rdi, %%r11 \n\t" /* r[3] += t[0] (%r11 now free) */ \
644 "adcxq %%r8, %%r12 \n\t" /* r[4] += t[2] */ \
645 "adoxq %%rcx, %%r12 \n\t" /* r[4] += t[1] + flag_o */ \
646 "adcxq %%r9, %%r13 \n\t" /* r[5] += t[3] + flag_c */ \
647 "mulxq %[modulus_2], %%r8, %%r9 \n\t" /* (t[0], t[1]) <- (modulus[2] * k) */ \
648 "mulxq %[modulus_3], %%r10, %%r11 \n\t" /* (t[2], t[3]) <- (modulus[3] * k) */ \
649 "adoxq %%r8, %%r13 \n\t" /* r[5] += t[0] + flag_o */ \
650 "adcxq %%r10, %%r14 \n\t" /* r[6] += t[2] + flag_c */ \
651 "adoxq %%r9, %%r14 \n\t" /* r[6] += t[1] + flag_o */ \
652 "adcxq %%r11, %%r15 \n\t" /* r[7] += t[3] + flag_c */ \
653 "adoxq %[zero_reference], %%r15 \n\t" /* r[7] += flag_o */
654
/**
 * MUL(a1, a2, a3, a4, b): multiply the four-limb value a by the four-limb value b, interleaving one
 * reduction step after each row of partial products (ADX/BMI2 variant: mulx + adcx/adox).
 *
 * Parameters (all are string fragments spliced into the asm template):
 *   a1..a4  operands holding limbs a[0]..a[3] of a; each is used as the source of a `movq ..., %rdx`.
 *   b       register holding the address of b; its limbs are read at 0(b), 8(b), 16(b), 24(b).
 *
 * Named asm operands the enclosing asm statement must provide:
 *   %[r_inv]                      multiplied with the lowest live limb to form k in every reduction step
 *                                 (presumably -modulus^-1 mod 2^64, so that adding k * modulus zeroes
 *                                 that limb -- confirm at the definition site).
 *   %[modulus_0]..%[modulus_3]    limbs of the modulus.
 *   %[zero_reference]             added wherever only a pending carry must be absorbed (presumably an
 *                                 operand holding 0 -- confirm at the call site).
 *
 * Notation used in the line comments:
 *   r[i]    limb i of the running 8-limb accumulator. Register assignment:
 *           r[0]=%r13, r[1]=%r14, r[2]=%r15, r[3]=%r10, r[4]=%r12, and, once the low limbs have been
 *           consumed by a reduction step, r[5]=%r13, r[6]=%r14, r[7]=%r15.
 *   lo/hi   low / high 64 bits of a 64x64 product. In AT&T syntax `mulxq src, X, Y` writes lo to X, hi to Y.
 *   k       low 64 bits of (limb being reduced) * r_inv.
 *   CF, OF  the two independent carry chains: adcx reads/writes only CF, adox reads/writes only OF.
 *
 * Output: r[4]..r[7] are left in %r12, %r13, %r14, %r15. No final comparison against, or subtraction of,
 * the modulus is performed inside this macro.
 *
 * Clobbers: %rdx, %rdi, %r8-%r15 and the flags.
 */
659#define MUL(a1, a2, a3, a4, b) \
660 "movq " a1 ", %%rdx \n\t" /* %rdx <- a[0]; mulx takes one multiplicand implicitly from %rdx */ \
661 "xorq %%r8, %%r8 \n\t" /* clears CF and OF for the adcx/adox chains below; %r8 itself is overwritten by the second mulx */ \
662 /* front-load mul ops, can parallelize 4 of these but latency is 4 cycles */ \
663 "mulxq 0(" b "), %%r13, %%r14 \n\t" /* (r[0], r[1]) <- (lo, hi) of a[0] * b[0] */ \
664 "mulxq 8(" b "), %%r8, %%r9 \n\t" /* (%r8, %r9) <- (lo, hi) of a[0] * b[1] */ \
665 "mulxq 16(" b "), %%r15, %%r10 \n\t" /* (r[2], r[3]) <- (lo, hi) of a[0] * b[2] */ \
666 "mulxq 24(" b "), %%rdi, %%r12 \n\t" /* (%rdi, r[4]) <- (lo, hi) of a[0] * b[3] */ \
667 /* CF and OF are still zero here: they were cleared by the xorq above and mulx/movq do not modify flags */ \
668 \
669 /* start computing modular reduction */ \
670 "movq %%r13, %%rdx \n\t" /* %rdx <- r[0] */ \
671 "mulxq %[r_inv], %%rdx, %%r11 \n\t" /* %rdx <- k = lo(r[0] * r_inv); the high half lands in %r11 and is discarded */ \
672 \
673 /* start first addition chain */ \
674 "adcxq %%r8, %%r14 \n\t" /* r[1] += lo(a[0]*b[1]) + CF */ \
675 "adoxq %%rdi, %%r10 \n\t" /* r[3] += lo(a[0]*b[3]) + OF */ \
676 "adcxq %%r9, %%r15 \n\t" /* r[2] += hi(a[0]*b[1]) + CF */ \
677 \
678 /* reduce r[0]: add k * modulus, k derived from r[0] */ \
679 "mulxq %[modulus_3], %%rdi, %%r11 \n\t" /* (%rdi, %r11) <- (lo, hi) of k * modulus[3] */ \
680 "mulxq %[modulus_0], %%r8, %%r9 \n\t" /* (%r8, %r9) <- (lo, hi) of k * modulus[0] */ \
681 "adcxq %%rdi, %%r10 \n\t" /* r[3] += lo(k*modulus[3]) + CF */ \
682 "adoxq %%r11, %%r12 \n\t" /* r[4] += hi(k*modulus[3]) + OF */ \
683 "adcxq %[zero_reference], %%r12 \n\t" /* r[4] += CF (absorb the pending carry out of r[3]) */ \
684 "adoxq %%r8, %%r13 \n\t" /* r[0] += lo(k*modulus[0]) + OF (%r13 now free) */ \
685 "adcxq %%r9, %%r14 \n\t" /* r[1] += hi(k*modulus[0]) + CF */ \
686 "mulxq %[modulus_1], %%rdi, %%r11 \n\t" /* (%rdi, %r11) <- (lo, hi) of k * modulus[1] */ \
687 "mulxq %[modulus_2], %%r8, %%r9 \n\t" /* (%r8, %r9) <- (lo, hi) of k * modulus[2] */ \
688 "adoxq %%rdi, %%r14 \n\t" /* r[1] += lo(k*modulus[1]) + OF */ \
689 "adcxq %%r11, %%r15 \n\t" /* r[2] += hi(k*modulus[1]) + CF */ \
690 "adoxq %%r8, %%r15 \n\t" /* r[2] += lo(k*modulus[2]) + OF */ \
691 "adcxq %%r9, %%r10 \n\t" /* r[3] += hi(k*modulus[2]) + CF */ \
692 \
693 /* modulus = 254 bits, so max(t[3]) = 62 bits */ \
694 /* b also 254 bits, so (a[0] * b[3]) = 62 bits */ \
695 /* i.e. carry flag here is always 0 if b is in mont form, no need to update r[5] */ \
696 /* (which is very convenient because we're out of registers!) */ \
697 /* N.B. the value of r[4] now has a max of 63 bits and can accept another 62 bit value before overflowing */ \
698 \
699 /* a[1] * b */ \
700 "movq " a2 ", %%rdx \n\t" /* %rdx <- a[1] */ \
701 "mulxq 16(" b "), %%r8, %%r9 \n\t" /* (%r8, %r9) <- (lo, hi) of a[1] * b[2] */ \
702 "mulxq 24(" b "), %%rdi, %%r13 \n\t" /* (%rdi, r[5]) <- (lo, hi) of a[1] * b[3]; %r13 is reused as r[5] */ \
703 "adoxq %%r8, %%r10 \n\t" /* r[3] += lo(a[1]*b[2]) + OF */ \
704 "adcxq %%rdi, %%r12 \n\t" /* r[4] += lo(a[1]*b[3]) + CF */ \
705 "adoxq %%r9, %%r12 \n\t" /* r[4] += hi(a[1]*b[2]) + OF */ \
706 "adcxq %[zero_reference], %%r13 \n\t" /* r[5] += CF */ \
707 "adoxq %[zero_reference], %%r13 \n\t" /* r[5] += OF */ \
708 "mulxq 0(" b "), %%r8, %%r9 \n\t" /* (%r8, %r9) <- (lo, hi) of a[1] * b[0] */ \
709 "mulxq 8(" b "), %%rdi, %%r11 \n\t" /* (%rdi, %r11) <- (lo, hi) of a[1] * b[1] */ \
710 "adcxq %%r8, %%r14 \n\t" /* r[1] += lo(a[1]*b[0]) + CF */ \
711 "adoxq %%r9, %%r15 \n\t" /* r[2] += hi(a[1]*b[0]) + OF */ \
712 "adcxq %%rdi, %%r15 \n\t" /* r[2] += lo(a[1]*b[1]) + CF */ \
713 "adoxq %%r11, %%r10 \n\t" /* r[3] += hi(a[1]*b[1]) + OF */ \
714 \
715 /* reduce r[1]: add k * modulus, k derived from r[1] */ \
716 "movq %%r14, %%rdx \n\t" /* %rdx <- r[1] */ \
717 "mulxq %[r_inv], %%rdx, %%r8 \n\t" /* %rdx <- k = lo(r[1] * r_inv); the high half in %r8 is discarded */ \
718 "mulxq %[modulus_2], %%r8, %%r9 \n\t" /* (%r8, %r9) <- (lo, hi) of k * modulus[2] */ \
719 "mulxq %[modulus_3], %%rdi, %%r11 \n\t" /* (%rdi, %r11) <- (lo, hi) of k * modulus[3] */ \
720 "adcxq %%r8, %%r10 \n\t" /* r[3] += lo(k*modulus[2]) + CF */ \
721 "adoxq %%r9, %%r12 \n\t" /* r[4] += hi(k*modulus[2]) + OF */ \
722 "adcxq %%rdi, %%r12 \n\t" /* r[4] += lo(k*modulus[3]) + CF */ \
723 "adoxq %%r11, %%r13 \n\t" /* r[5] += hi(k*modulus[3]) + OF */ \
724 "adcxq %[zero_reference], %%r13 \n\t" /* r[5] += CF */ \
725 "mulxq %[modulus_0], %%r8, %%r9 \n\t" /* (%r8, %r9) <- (lo, hi) of k * modulus[0] */ \
726 "mulxq %[modulus_1], %%rdi, %%r11 \n\t" /* (%rdi, %r11) <- (lo, hi) of k * modulus[1] */ \
727 "adoxq %%r8, %%r14 \n\t" /* r[1] += lo(k*modulus[0]) + OF (%r14 now free) */ \
728 "adcxq %%rdi, %%r15 \n\t" /* r[2] += lo(k*modulus[1]) + CF */ \
729 "adoxq %%r9, %%r15 \n\t" /* r[2] += hi(k*modulus[0]) + OF */ \
730 "adcxq %%r11, %%r10 \n\t" /* r[3] += hi(k*modulus[1]) + CF */ \
731 \
732 /* a[2] * b */ \
733 "movq " a3 ", %%rdx \n\t" /* %rdx <- a[2] */ \
734 "mulxq 8(" b "), %%rdi, %%r11 \n\t" /* (%rdi, %r11) <- (lo, hi) of a[2] * b[1] */ \
735 "mulxq 16(" b "), %%r8, %%r9 \n\t" /* (%r8, %r9) <- (lo, hi) of a[2] * b[2] */ \
736 "adoxq %%rdi, %%r10 \n\t" /* r[3] += lo(a[2]*b[1]) + OF */ \
737 "adcxq %%r11, %%r12 \n\t" /* r[4] += hi(a[2]*b[1]) + CF */ \
738 "adoxq %%r8, %%r12 \n\t" /* r[4] += lo(a[2]*b[2]) + OF */ \
739 "adcxq %%r9, %%r13 \n\t" /* r[5] += hi(a[2]*b[2]) + CF */ \
740 "mulxq 24(" b "), %%rdi, %%r14 \n\t" /* (%rdi, r[6]) <- (lo, hi) of a[2] * b[3]; %r14 is reused as r[6] */ \
741 "mulxq 0(" b "), %%r8, %%r9 \n\t" /* (%r8, %r9) <- (lo, hi) of a[2] * b[0] */ \
742 "adoxq %%rdi, %%r13 \n\t" /* r[5] += lo(a[2]*b[3]) + OF */ \
743 "adcxq %[zero_reference], %%r14 \n\t" /* r[6] += CF */ \
744 "adoxq %[zero_reference], %%r14 \n\t" /* r[6] += OF */ \
745 "adcxq %%r8, %%r15 \n\t" /* r[2] += lo(a[2]*b[0]) + CF */ \
746 "adoxq %%r9, %%r10 \n\t" /* r[3] += hi(a[2]*b[0]) + OF */ \
747 \
748 /* reduce r[2]: add k * modulus, k derived from r[2] */ \
749 "movq %%r15, %%rdx \n\t" /* %rdx <- r[2] */ \
750 "mulxq %[r_inv], %%rdx, %%r8 \n\t" /* %rdx <- k = lo(r[2] * r_inv); the high half in %r8 is discarded */ \
751 "mulxq %[modulus_1], %%rdi, %%r11 \n\t" /* (%rdi, %r11) <- (lo, hi) of k * modulus[1] */ \
752 "mulxq %[modulus_2], %%r8, %%r9 \n\t" /* (%r8, %r9) <- (lo, hi) of k * modulus[2] */ \
753 "adcxq %%rdi, %%r10 \n\t" /* r[3] += lo(k*modulus[1]) + CF */ \
754 "adoxq %%r11, %%r12 \n\t" /* r[4] += hi(k*modulus[1]) + OF */ \
755 "adcxq %%r8, %%r12 \n\t" /* r[4] += lo(k*modulus[2]) + CF */ \
756 "adoxq %%r9, %%r13 \n\t" /* r[5] += hi(k*modulus[2]) + OF */ \
757 "mulxq %[modulus_3], %%rdi, %%r11 \n\t" /* (%rdi, %r11) <- (lo, hi) of k * modulus[3] */ \
758 "mulxq %[modulus_0], %%r8, %%r9 \n\t" /* (%r8, %r9) <- (lo, hi) of k * modulus[0] */ \
759 "adcxq %%rdi, %%r13 \n\t" /* r[5] += lo(k*modulus[3]) + CF */ \
760 "adoxq %%r11, %%r14 \n\t" /* r[6] += hi(k*modulus[3]) + OF */ \
761 "adcxq %[zero_reference], %%r14 \n\t" /* r[6] += CF */ \
762 "adoxq %%r8, %%r15 \n\t" /* r[2] += lo(k*modulus[0]) + OF (%r15 now free) */ \
763 "adcxq %%r9, %%r10 \n\t" /* r[3] += hi(k*modulus[0]) + CF */ \
764 \
765 /* a[3] * b */ \
766 "movq " a4 ", %%rdx \n\t" /* %rdx <- a[3] */ \
767 "mulxq 0(" b "), %%r8, %%r9 \n\t" /* (%r8, %r9) <- (lo, hi) of a[3] * b[0] */ \
768 "mulxq 8(" b "), %%rdi, %%r11 \n\t" /* (%rdi, %r11) <- (lo, hi) of a[3] * b[1] */ \
769 "adoxq %%r8, %%r10 \n\t" /* r[3] += lo(a[3]*b[0]) + OF */ \
770 "adcxq %%r9, %%r12 \n\t" /* r[4] += hi(a[3]*b[0]) + CF */ \
771 "adoxq %%rdi, %%r12 \n\t" /* r[4] += lo(a[3]*b[1]) + OF */ \
772 "adcxq %%r11, %%r13 \n\t" /* r[5] += hi(a[3]*b[1]) + CF */ \
773 \
774 "mulxq 16(" b "), %%r8, %%r9 \n\t" /* (%r8, %r9) <- (lo, hi) of a[3] * b[2] */ \
775 "mulxq 24(" b "), %%rdi, %%r15 \n\t" /* (%rdi, r[7]) <- (lo, hi) of a[3] * b[3]; %r15 is reused as r[7] */ \
776 "adoxq %%r8, %%r13 \n\t" /* r[5] += lo(a[3]*b[2]) + OF */ \
777 "adcxq %%r9, %%r14 \n\t" /* r[6] += hi(a[3]*b[2]) + CF */ \
778 "adoxq %%rdi, %%r14 \n\t" /* r[6] += lo(a[3]*b[3]) + OF */ \
779 "adcxq %[zero_reference], %%r15 \n\t" /* r[7] += CF */ \
780 "adoxq %[zero_reference], %%r15 \n\t" /* r[7] += OF */ \
781 \
782 /* reduce r[3]: add k * modulus, k derived from r[3] */ \
783 "movq %%r10, %%rdx \n\t" /* %rdx <- r[3] */ \
784 "mulxq %[r_inv], %%rdx, %%r8 \n\t" /* %rdx <- k = lo(r[3] * r_inv); the high half in %r8 is discarded */ \
785 "mulxq %[modulus_0], %%r8, %%r9 \n\t" /* (%r8, %r9) <- (lo, hi) of k * modulus[0] */ \
786 "mulxq %[modulus_1], %%rdi, %%r11 \n\t" /* (%rdi, %r11) <- (lo, hi) of k * modulus[1] */ \
787 "adoxq %%r8, %%r10 \n\t" /* r[3] += lo(k*modulus[0]) + OF (%r10 now free) */ \
788 "adcxq %%r9, %%r12 \n\t" /* r[4] += hi(k*modulus[0]) + CF */ \
789 "adoxq %%rdi, %%r12 \n\t" /* r[4] += lo(k*modulus[1]) + OF */ \
790 "adcxq %%r11, %%r13 \n\t" /* r[5] += hi(k*modulus[1]) + CF */ \
791 \
792 "mulxq %[modulus_2], %%r8, %%r9 \n\t" /* (%r8, %r9) <- (lo, hi) of k * modulus[2] */ \
793 "mulxq %[modulus_3], %%rdi, %%rdx \n\t" /* (%rdi, %rdx) <- (lo, hi) of k * modulus[3]; k is no longer needed so %rdx is overwritten */ \
794 "adoxq %%r8, %%r13 \n\t" /* r[5] += lo(k*modulus[2]) + OF */ \
795 "adcxq %%r9, %%r14 \n\t" /* r[6] += hi(k*modulus[2]) + CF */ \
796 "adoxq %%rdi, %%r14 \n\t" /* r[6] += lo(k*modulus[3]) + OF */ \
797 "adcxq %%rdx, %%r15 \n\t" /* r[7] += hi(k*modulus[3]) + CF */ \
798 "adoxq %[zero_reference], %%r15 \n\t" /* r[7] += OF; result limbs are now in %r12, %r13, %r14, %r15 */
799
/**
 * MUL_FOO(a1, a2, a3, a4, b): alias of MUL.
 *
 * The previous definition of this macro was an instruction-for-instruction copy of MUL (same mulx /
 * adcx / adox sequence, same registers, same named operands). It now forwards to MUL so that the two
 * cannot drift apart; the expanded asm template is the same instruction stream as before.
 *
 * Parameters (string fragments spliced into the asm template, forwarded unchanged to MUL):
 *   a1..a4  operands holding limbs a[0]..a[3] of a; each is loaded into %rdx with movq.
 *   b       register holding the address of b; its limbs are read at 0(b), 8(b), 16(b), 24(b).
 *
 * Named asm operands required from the enclosing asm statement (as for MUL):
 *   %[r_inv], %[modulus_0], %[modulus_1], %[modulus_2], %[modulus_3], %[zero_reference]
 *
 * Output: the upper four accumulator limbs are left in %r12, %r13, %r14, %r15; no comparison against,
 * or subtraction of, the modulus is performed by the macro.
 *
 * Clobbers: %rdx, %rdi, %r8-%r15 and the flags.
 **/
#define MUL_FOO(a1, a2, a3, a4, b) MUL(a1, a2, a3, a4, b)
944
/**
 * MUL_256(a, b, r): compute the low 256 bits of the product a * b and store them at r.
 *
 * Parameters (string fragments spliced into the asm template):
 *   a, b  registers holding the addresses of the two inputs; four 64-bit limbs each, least significant
 *         limb at offset 0.
 *   r     register holding the address the four result limbs are written to. All loads from a and b
 *         are issued before the first store, so r may alias a or b.
 *
 * Method: truncated schoolbook multiplication, one row per limb of a. Only partial products that reach
 * limbs 0..3 are accumulated; in AT&T syntax `mulxq src, X, Y` writes the low half to X and the high
 * half to Y, and a high half that would land in limb 4 or above is computed by mulx but never used.
 * adcx carries through CF only and adox through OF only, giving two independent carry chains per row.
 *
 * Limb 3 is the top limb of the result, so a carry out of it must be dropped. Because adcx/adox always
 * consume the incoming flag, CF and OF are explicitly cleared before every row; otherwise a carry out
 * of limb 3 left over from the previous row would be added into limb 1 or limb 2 of the next one.
 *
 * Accumulator registers: r[0]=%r13, r[1]=%r14, r[2]=%r15, r[3]=%rax.
 * Clobbers: %rax, %rdx, %rdi, %rsi, %r8, %r9, %r10, %r12, %r13, %r14, %r15, flags, memory at r.
 **/
#define MUL_256(a, b, r)                                                                                          \
        /* row 0: a[0] * b */                                                                                     \
        "movq 0(" a "), %%rdx                     \n\t" /* %rdx <- a[0] (implicit mulx multiplicand) */          \
        "mulxq 8(" b "), %%r8, %%r9               \n\t" /* (%r8, %r9)   <- (lo, hi) of a[0] * b[1] */            \
        "mulxq 24(" b "), %%rdi, %%r12            \n\t" /* %rdi <- lo(a[0] * b[3]); hi in %r12 is unused */       \
        "mulxq 0(" b "), %%r13, %%r14             \n\t" /* (r[0], r[1]) <- (lo, hi) of a[0] * b[0] */            \
        "mulxq 16(" b "), %%r15, %%rax            \n\t" /* (r[2], r[3]) <- (lo, hi) of a[0] * b[2] */            \
        "xorq %%r10, %%r10                        \n\t" /* %r10 <- 0 and CF = OF = 0 */                          \
        "adcxq %%r8, %%r14                        \n\t" /* r[1] += lo(a[0]*b[1]) + CF */                         \
        "adoxq %%rdi, %%rax                       \n\t" /* r[3] += lo(a[0]*b[3]) + OF */                         \
        "adcxq %%r9, %%r15                        \n\t" /* r[2] += hi(a[0]*b[1]) + CF */                         \
        "adcxq %%r10, %%rax                       \n\t" /* r[3] += CF */                                         \
                                                                                                                  \
        /* row 1: a[1] * b */                                                                                     \
        "movq 8(" a "), %%rdx                     \n\t" /* %rdx <- a[1] */                                       \
        "xorq %%r10, %%r10                        \n\t" /* drop carries out of r[3]: CF = OF = 0 */              \
        "mulxq 0(" b "), %%r8, %%r9               \n\t" /* (%r8, %r9)   <- (lo, hi) of a[1] * b[0] */            \
        "mulxq 8(" b "), %%rdi, %%rsi             \n\t" /* (%rdi, %rsi) <- (lo, hi) of a[1] * b[1] */            \
        "adcxq %%r8, %%r14                        \n\t" /* r[1] += lo(a[1]*b[0]) + CF */                         \
        "adoxq %%r9, %%r15                        \n\t" /* r[2] += hi(a[1]*b[0]) + OF */                         \
        "adcxq %%rdi, %%r15                       \n\t" /* r[2] += lo(a[1]*b[1]) + CF */                         \
        "adoxq %%rsi, %%rax                       \n\t" /* r[3] += hi(a[1]*b[1]) + OF */                         \
        "mulxq 16(" b "), %%r8, %%r9              \n\t" /* %r8 <- lo(a[1] * b[2]); hi in %r9 is unused */         \
        "adcxq %%r8, %%rax                        \n\t" /* r[3] += lo(a[1]*b[2]) + CF */                         \
                                                                                                                  \
        /* row 2: a[2] * b */                                                                                     \
        "movq 16(" a "), %%rdx                    \n\t" /* %rdx <- a[2] */                                       \
        "xorq %%r10, %%r10                        \n\t" /* drop carries out of r[3]: CF = OF = 0 */              \
        "mulxq 0(" b "), %%r8, %%r9               \n\t" /* (%r8, %r9) <- (lo, hi) of a[2] * b[0] */              \
        "mulxq 8(" b "), %%rdi, %%rsi             \n\t" /* %rdi <- lo(a[2] * b[1]); hi in %rsi is unused */       \
        "adcxq %%r8, %%r15                        \n\t" /* r[2] += lo(a[2]*b[0]) + CF */                         \
        "adoxq %%r9, %%rax                        \n\t" /* r[3] += hi(a[2]*b[0]) + OF */                         \
        "adcxq %%rdi, %%rax                       \n\t" /* r[3] += lo(a[2]*b[1]) + CF */                         \
                                                                                                                  \
        /* row 3: a[3] * b -- only lo(a[3] * b[0]) reaches the low 256 bits */                                    \
        "movq 24(" a "), %%rdx                    \n\t" /* %rdx <- a[3] */                                       \
        "mulxq 0(" b "), %%r8, %%r9               \n\t" /* %r8 <- lo(a[3] * b[0]); hi in %r9 is unused */         \
        "addq %%r8, %%rax                         \n\t" /* r[3] += lo(a[3]*b[0]); plain add ignores stale flags */ \
                                                                                                                  \
        /* write the four result limbs */                                                                         \
        "movq %%r13, 0(" r ")                     \n\t"                                                           \
        "movq %%r14, 8(" r ")                     \n\t"                                                           \
        "movq %%r15, 16(" r ")                    \n\t"                                                           \
        "movq %%rax, 24(" r ")                    \n\t"
996#endif