Barretenberg
The ZK-SNARK library at the core of Aztec
field_impl_generic.hpp
// === AUDIT STATUS ===
// internal: { status: not started, auditors: [], date: YYYY-MM-DD }
// external_1: { status: not started, auditors: [], date: YYYY-MM-DD }
// external_2: { status: not started, auditors: [], date: YYYY-MM-DD }
// =====================

#pragma once

#include <array>
#include <cstdint>

#include "./field_impl.hpp"

namespace bb {

// NOLINTBEGIN(readability-implicit-bool-conversion)
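
// Note: this file contains the generic (no inline assembly) implementations of the core field arithmetic
// primitives. Each routine comes in two variants: a fast path built on the compiler's 128-bit integer type,
// and a fallback for targets without one (notably wasm) that works on 32-bit or 29-bit limbs instead.
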
template <class T> constexpr std::pair<uint64_t, uint64_t> field<T>::mul_wide(uint64_t a, uint64_t b) noexcept
{
#if defined(__SIZEOF_INT128__) && !defined(__wasm__)
    const uint128_t res = (static_cast<uint128_t>(a) * static_cast<uint128_t>(b));
    return { static_cast<uint64_t>(res), static_cast<uint64_t>(res >> 64) };
#else
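    // Fallback for targets without a 128-bit integer type (e.g. wasm): the operands are expected to already
    // fit in 32 bits, so the full product fits in 64 bits and is split into 32-bit low/high halves.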
    const uint64_t product = a * b;
    return { product & 0xffffffffULL, product >> 32 };
#endif
}

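// Multiply-accumulate: computes a + (b * c) + carry_in, returning the low word and writing the high word to
// carry_out (the split is at 64 bits on the 128-bit path and at 32 bits on the fallback path).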
template <class T>
constexpr uint64_t field<T>::mac(
    const uint64_t a, const uint64_t b, const uint64_t c, const uint64_t carry_in, uint64_t& carry_out) noexcept
{
#if defined(__SIZEOF_INT128__) && !defined(__wasm__)
    const uint128_t res = static_cast<uint128_t>(a) + (static_cast<uint128_t>(b) * static_cast<uint128_t>(c)) +
                          static_cast<uint128_t>(carry_in);
    carry_out = static_cast<uint64_t>(res >> 64);
    return static_cast<uint64_t>(res);
#else
    const uint64_t product = b * c + a + carry_in;
    carry_out = product >> 32;
    return product & 0xffffffffULL;
#endif
}

template <class T>
constexpr void field<T>::mac(const uint64_t a,
                             const uint64_t b,
                             const uint64_t c,
                             const uint64_t carry_in,
                             uint64_t& out,
                             uint64_t& carry_out) noexcept
{
#if defined(__SIZEOF_INT128__) && !defined(__wasm__)
    const uint128_t res = static_cast<uint128_t>(a) + (static_cast<uint128_t>(b) * static_cast<uint128_t>(c)) +
                          static_cast<uint128_t>(carry_in);
    out = static_cast<uint64_t>(res);
    carry_out = static_cast<uint64_t>(res >> 64);
#else
    const uint64_t product = b * c + a + carry_in;
    carry_out = product >> 32;
    out = product & 0xffffffffULL;
#endif
}

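// mac_mini: multiply-accumulate without a carry-in word. Computes a + (b * c), returning the low word and
// writing the high word to carry_out.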
template <class T>
constexpr uint64_t field<T>::mac_mini(const uint64_t a,
                                      const uint64_t b,
                                      const uint64_t c,
                                      uint64_t& carry_out) noexcept
{
#if defined(__SIZEOF_INT128__) && !defined(__wasm__)
    const uint128_t res = static_cast<uint128_t>(a) + (static_cast<uint128_t>(b) * static_cast<uint128_t>(c));
    carry_out = static_cast<uint64_t>(res >> 64);
    return static_cast<uint64_t>(res);
#else
    const uint64_t product = b * c + a;
    carry_out = product >> 32;
    return product & 0xffffffffULL;
#endif
}

template <class T>
constexpr void field<T>::mac_mini(
    const uint64_t a, const uint64_t b, const uint64_t c, uint64_t& out, uint64_t& carry_out) noexcept
{
#if defined(__SIZEOF_INT128__) && !defined(__wasm__)
    const uint128_t res = static_cast<uint128_t>(a) + (static_cast<uint128_t>(b) * static_cast<uint128_t>(c));
    out = static_cast<uint64_t>(res);
    carry_out = static_cast<uint64_t>(res >> 64);
#else
    const uint64_t result = b * c + a;
    carry_out = result >> 32;
    out = result & 0xffffffffULL;
#endif
}

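// Multiply-accumulate that returns only the high word of a + (b * c). Callers use this for the first step of a
// Montgomery reduction round, where k = t0 * r_inv is chosen so that the low word of t0 + k * modulus.data[0]
// is guaranteed to be zero and can be discarded.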
template <class T>
constexpr uint64_t field<T>::mac_discard_lo(const uint64_t a, const uint64_t b, const uint64_t c) noexcept
{
#if defined(__SIZEOF_INT128__) && !defined(__wasm__)
    const uint128_t res = static_cast<uint128_t>(a) + (static_cast<uint128_t>(b) * static_cast<uint128_t>(c));
    return static_cast<uint64_t>(res >> 64);
#else
    return (b * c + a) >> 32;
#endif
}

template <class T>
constexpr uint64_t field<T>::addc(const uint64_t a,
                                  const uint64_t b,
                                  const uint64_t carry_in,
                                  uint64_t& carry_out) noexcept
{
#if defined(__SIZEOF_INT128__) && !defined(__wasm__)
    uint128_t res = static_cast<uint128_t>(a) + static_cast<uint128_t>(b) + static_cast<uint128_t>(carry_in);
    carry_out = static_cast<uint64_t>(res >> 64);
    return static_cast<uint64_t>(res);
#else
    uint64_t r = a + b;
    const uint64_t carry_temp = r < a;
    r += carry_in;
    carry_out = carry_temp + (r < carry_in);
    return r;
#endif
}

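// Subtract-with-borrow: computes a - b - (borrow_in >> 63), i.e. the incoming borrow is carried in the top bit.
// borrow_out is an all-ones mask on underflow and zero otherwise, which lets callers use it directly as a
// selection mask (e.g. modulus.data[i] & borrow).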
template <class T>
constexpr uint64_t field<T>::sbb(const uint64_t a,
                                 const uint64_t b,
                                 const uint64_t borrow_in,
                                 uint64_t& borrow_out) noexcept
{
#if defined(__SIZEOF_INT128__) && !defined(__wasm__)
    uint128_t res = static_cast<uint128_t>(a) - (static_cast<uint128_t>(b) + static_cast<uint128_t>(borrow_in >> 63));
    borrow_out = static_cast<uint64_t>(res >> 64);
    return static_cast<uint64_t>(res);
#else
    uint64_t t_1 = a - (borrow_in >> 63ULL);
    uint64_t borrow_temp_1 = t_1 > a;
    uint64_t t_2 = t_1 - b;
    uint64_t borrow_temp_2 = t_2 > t_1;
    borrow_out = 0ULL - (borrow_temp_1 | borrow_temp_2);
    return t_2;
#endif
}

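// Helper for Montgomery squaring: accumulates a + 2 * (b * c) plus the incoming carries, returning the low word
// and propagating a double-width carry. The doubling handles the symmetric off-diagonal terms b_i * b_j + b_j * b_i.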
template <class T>
constexpr uint64_t field<T>::square_accumulate(const uint64_t a,
                                               const uint64_t b,
                                               const uint64_t c,
                                               const uint64_t carry_in_lo,
                                               const uint64_t carry_in_hi,
                                               uint64_t& carry_lo,
                                               uint64_t& carry_hi) noexcept
{
#if defined(__SIZEOF_INT128__) && !defined(__wasm__)
    const uint128_t product = static_cast<uint128_t>(b) * static_cast<uint128_t>(c);
    const auto r0 = static_cast<uint64_t>(product);
    const auto r1 = static_cast<uint64_t>(product >> 64);
    uint64_t out = r0 + r0;
    carry_lo = (out < r0);
    out += a;
    carry_lo += (out < a);
    out += carry_in_lo;
    carry_lo += (out < carry_in_lo);
    carry_lo += r1;
    carry_hi = (carry_lo < r1);
    carry_lo += r1;
    carry_hi += (carry_lo < r1);
    carry_lo += carry_in_hi;
    carry_hi += (carry_lo < carry_in_hi);
    return out;
#else
    const auto product = b * c;
    const auto t0 = product + a + carry_in_lo;
    const auto t1 = product + t0;
    carry_hi = t1 < product;
    const auto t2 = t1 + (carry_in_hi << 32);
    carry_hi += t2 < t1;
    carry_lo = t2 >> 32;
    return t2 & 0xffffffffULL;
#endif
}

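// Conditionally subtract the modulus to bring a field element into canonical range. For smaller moduli this is
// branch-free: adding not_modulus (the two's complement of p, i.e. 2^256 - p) overflows exactly when the value
// is >= p, and the resulting carry drives a constant-time select between the two candidates.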
template <class T> constexpr field<T> field<T>::reduce() const noexcept
{
    if constexpr (modulus.data[3] >= 0x4000000000000000ULL) {
        uint256_t val{ data[0], data[1], data[2], data[3] };
        if (val >= modulus) {
            val -= modulus;
        }
        return { val.data[0], val.data[1], val.data[2], val.data[3] };
    }
    uint64_t t0 = data[0] + not_modulus.data[0];
    uint64_t c = t0 < data[0];
    auto t1 = addc(data[1], not_modulus.data[1], c, c);
    auto t2 = addc(data[2], not_modulus.data[2], c, c);
    auto t3 = addc(data[3], not_modulus.data[3], c, c);
    const uint64_t selection_mask = 0ULL - c; // 0xffff... if we have overflowed.
    const uint64_t selection_mask_inverse = ~selection_mask;
    // if we overflow, we want to swap
    return {
        (data[0] & selection_mask_inverse) | (t0 & selection_mask),
        (data[1] & selection_mask_inverse) | (t1 & selection_mask),
        (data[2] & selection_mask_inverse) | (t2 & selection_mask),
        (data[3] & selection_mask_inverse) | (t3 & selection_mask),
    };
}

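// Addition with two strategies. For moduli close to 2^256, the sum is reduced by explicit conditional
// subtractions of p (possibly twice, since the raw sum can reach 2^257 - 2). For smaller moduli, the element is
// kept in the coarse range [0, 2p): overflow past 2p is detected by adding twice_not_modulus (the two's
// complement of 2p) and the carry selects between the raw and reduced sums.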
template <class T> constexpr field<T> field<T>::add(const field& other) const noexcept
{
    if constexpr (modulus.data[3] >= 0x4000000000000000ULL) {
        uint64_t r0 = data[0] + other.data[0];
        uint64_t c = r0 < data[0];
        auto r1 = addc(data[1], other.data[1], c, c);
        auto r2 = addc(data[2], other.data[2], c, c);
        auto r3 = addc(data[3], other.data[3], c, c);
        if (c) {
            uint64_t b = 0;
            r0 = sbb(r0, modulus.data[0], b, b);
            r1 = sbb(r1, modulus.data[1], b, b);
            r2 = sbb(r2, modulus.data[2], b, b);
            r3 = sbb(r3, modulus.data[3], b, b);
            // Since both values are in [0, 2**256), the result is in [0, 2**257 - 2], so subtracting one p might
            // not be enough. The subtraction must borrow (wrap below zero) to cancel the overflow carry, and
            // that might require subtracting an additional p.
            if (!b) {
                b = 0;
                r0 = sbb(r0, modulus.data[0], b, b);
                r1 = sbb(r1, modulus.data[1], b, b);
                r2 = sbb(r2, modulus.data[2], b, b);
                r3 = sbb(r3, modulus.data[3], b, b);
            }
        }
        return { r0, r1, r2, r3 };
    } else {
        uint64_t r0 = data[0] + other.data[0];
        uint64_t c = r0 < data[0];
        auto r1 = addc(data[1], other.data[1], c, c);
        auto r2 = addc(data[2], other.data[2], c, c);
        uint64_t r3 = data[3] + other.data[3] + c;

        uint64_t t0 = r0 + twice_not_modulus.data[0];
        c = t0 < twice_not_modulus.data[0];
        uint64_t t1 = addc(r1, twice_not_modulus.data[1], c, c);
        uint64_t t2 = addc(r2, twice_not_modulus.data[2], c, c);
        uint64_t t3 = addc(r3, twice_not_modulus.data[3], c, c);
        const uint64_t selection_mask = 0ULL - c;
        const uint64_t selection_mask_inverse = ~selection_mask;

        return {
            (r0 & selection_mask_inverse) | (t0 & selection_mask),
            (r1 & selection_mask_inverse) | (t1 & selection_mask),
            (r2 & selection_mask_inverse) | (t2 & selection_mask),
            (r3 & selection_mask_inverse) | (t3 & selection_mask),
        };
    }
}

template <class T> constexpr field<T> field<T>::subtract(const field& other) const noexcept
{
    uint64_t borrow = 0;
    uint64_t r0 = sbb(data[0], other.data[0], borrow, borrow);
    uint64_t r1 = sbb(data[1], other.data[1], borrow, borrow);
    uint64_t r2 = sbb(data[2], other.data[2], borrow, borrow);
    uint64_t r3 = sbb(data[3], other.data[3], borrow, borrow);

    r0 += (modulus.data[0] & borrow);
    uint64_t carry = r0 < (modulus.data[0] & borrow);
    r1 = addc(r1, modulus.data[1] & borrow, carry, carry);
    r2 = addc(r2, modulus.data[2] & borrow, carry, carry);
    r3 = addc(r3, (modulus.data[3] & borrow), carry, carry);
    // The value being subtracted is in [0, 2**256): if we compute, say, 0 - 2**255 and then add p, the value
    // stays negative. When adding p, we need to check that the addition carried past 2**256; if it did not, the
    // value is still negative and we should add p again.
    if (!carry) {
        r0 += (modulus.data[0] & borrow);
        uint64_t carry = r0 < (modulus.data[0] & borrow);
        r1 = addc(r1, modulus.data[1] & borrow, carry, carry);
        r2 = addc(r2, modulus.data[2] & borrow, carry, carry);
        r3 = addc(r3, (modulus.data[3] & borrow), carry, carry);
    }
    return { r0, r1, r2, r3 };
}

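// Subtraction for the coarse representation: operands may be in [0, 2p), so on borrow we add back 2p (rather
// than p) to return to the same range. For moduli close to 2^256 the coarse range is unavailable, so we fall
// back to subtract().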
template <class T> constexpr field<T> field<T>::subtract_coarse(const field& other) const noexcept
{
    if constexpr (modulus.data[3] >= 0x4000000000000000ULL) {
        return subtract(other);
    }
    uint64_t borrow = 0;
    uint64_t r0 = sbb(data[0], other.data[0], borrow, borrow);
    uint64_t r1 = sbb(data[1], other.data[1], borrow, borrow);
    uint64_t r2 = sbb(data[2], other.data[2], borrow, borrow);
    uint64_t r3 = sbb(data[3], other.data[3], borrow, borrow);

    r0 += (twice_modulus.data[0] & borrow);
    uint64_t carry = r0 < (twice_modulus.data[0] & borrow);
    r1 = addc(r1, twice_modulus.data[1] & borrow, carry, carry);
    r2 = addc(r2, twice_modulus.data[2] & borrow, carry, carry);
    r3 += (twice_modulus.data[3] & borrow) + carry;

    return { r0, r1, r2, r3 };
}

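// Montgomery multiplication for moduli whose top limb is at least 2^62 (i.e. p >= 2^254), where the interleaved
// fast path in montgomery_mul cannot keep intermediate sums in range. Each loop iteration multiplies one limb of
// *this into the accumulator and then performs one limb-wise Montgomery reduction round.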
template <class T> constexpr field<T> field<T>::montgomery_mul_big(const field& other) const noexcept
{
#if defined(__SIZEOF_INT128__) && !defined(__wasm__)
    uint64_t c = 0;
    uint64_t t0 = 0;
    uint64_t t1 = 0;
    uint64_t t2 = 0;
    uint64_t t3 = 0;
    uint64_t t4 = 0;
    uint64_t t5 = 0;
    uint64_t k = 0;
    for (const auto& element : data) {
        c = 0;
        mac(t0, element, other.data[0], c, t0, c);
        mac(t1, element, other.data[1], c, t1, c);
        mac(t2, element, other.data[2], c, t2, c);
        mac(t3, element, other.data[3], c, t3, c);
        t4 = addc(t4, c, 0, t5);

        c = 0;
        k = t0 * T::r_inv;
        c = mac_discard_lo(t0, k, modulus.data[0]);
        mac(t1, k, modulus.data[1], c, t0, c);
        mac(t2, k, modulus.data[2], c, t1, c);
        mac(t3, k, modulus.data[3], c, t2, c);
        t3 = addc(c, t4, 0, c);
        t4 = t5 + c;
    }
    uint64_t borrow = 0;
    uint64_t r0 = sbb(t0, modulus.data[0], borrow, borrow);
    uint64_t r1 = sbb(t1, modulus.data[1], borrow, borrow);
    uint64_t r2 = sbb(t2, modulus.data[2], borrow, borrow);
    uint64_t r3 = sbb(t3, modulus.data[3], borrow, borrow);
    borrow = borrow ^ (0ULL - t4);
    r0 += (modulus.data[0] & borrow);
    uint64_t carry = r0 < (modulus.data[0] & borrow);
    r1 = addc(r1, modulus.data[1] & borrow, carry, carry);
    r2 = addc(r2, modulus.data[2] & borrow, carry, carry);
    r3 += (modulus.data[3] & borrow) + carry;
    return { r0, r1, r2, r3 };
#else

    // Convert 4 64-bit limbs to 9 29-bit limbs
    auto left = wasm_convert(data);
    auto right = wasm_convert(other.data);
    constexpr uint64_t mask = 0x1fffffff;
    uint64_t temp_0 = 0;
    uint64_t temp_1 = 0;
    uint64_t temp_2 = 0;
    uint64_t temp_3 = 0;
    uint64_t temp_4 = 0;
    uint64_t temp_5 = 0;
    uint64_t temp_6 = 0;
    uint64_t temp_7 = 0;
    uint64_t temp_8 = 0;
    uint64_t temp_9 = 0;
    uint64_t temp_10 = 0;
    uint64_t temp_11 = 0;
    uint64_t temp_12 = 0;
    uint64_t temp_13 = 0;
    uint64_t temp_14 = 0;
    uint64_t temp_15 = 0;
    uint64_t temp_16 = 0;
    uint64_t temp_17 = 0;

    // Multiply-add the 0th limb of the left argument by all 9 limbs of the right argument
    wasm_madd(left[0], right, temp_0, temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8);
    // Immediately reduce
    wasm_reduce(temp_0, temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8);
    // Continue for the other limbs
    wasm_madd(left[1], right, temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9);
    wasm_reduce(temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9);
    wasm_madd(left[2], right, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10);
    wasm_reduce(temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10);
    wasm_madd(left[3], right, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11);
    wasm_reduce(temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11);
    wasm_madd(left[4], right, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12);
    wasm_reduce(temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12);
    wasm_madd(left[5], right, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13);
    wasm_reduce(temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13);
    wasm_madd(left[6], right, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14);
    wasm_reduce(temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14);
    wasm_madd(left[7], right, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14, temp_15);
    wasm_reduce(temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14, temp_15);
    wasm_madd(left[8], right, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14, temp_15, temp_16);
    wasm_reduce(temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14, temp_15, temp_16);

    // After all multiplications and additions, convert relaxed form to strict (all limbs are 29 bits)
    temp_10 += temp_9 >> WASM_LIMB_BITS;
    temp_9 &= mask;
    temp_11 += temp_10 >> WASM_LIMB_BITS;
    temp_10 &= mask;
    temp_12 += temp_11 >> WASM_LIMB_BITS;
    temp_11 &= mask;
    temp_13 += temp_12 >> WASM_LIMB_BITS;
    temp_12 &= mask;
    temp_14 += temp_13 >> WASM_LIMB_BITS;
    temp_13 &= mask;
    temp_15 += temp_14 >> WASM_LIMB_BITS;
    temp_14 &= mask;
    temp_16 += temp_15 >> WASM_LIMB_BITS;
    temp_15 &= mask;
    temp_17 += temp_16 >> WASM_LIMB_BITS;
    temp_16 &= mask;

    uint64_t r_temp_0;
    uint64_t r_temp_1;
    uint64_t r_temp_2;
    uint64_t r_temp_3;
    uint64_t r_temp_4;
    uint64_t r_temp_5;
    uint64_t r_temp_6;
    uint64_t r_temp_7;
    uint64_t r_temp_8;
    // Subtract modulus from result
    r_temp_0 = temp_9 - wasm_modulus[0];
    r_temp_1 = temp_10 - wasm_modulus[1] - ((r_temp_0) >> 63);
    r_temp_2 = temp_11 - wasm_modulus[2] - ((r_temp_1) >> 63);
    r_temp_3 = temp_12 - wasm_modulus[3] - ((r_temp_2) >> 63);
    r_temp_4 = temp_13 - wasm_modulus[4] - ((r_temp_3) >> 63);
    r_temp_5 = temp_14 - wasm_modulus[5] - ((r_temp_4) >> 63);
    r_temp_6 = temp_15 - wasm_modulus[6] - ((r_temp_5) >> 63);
    r_temp_7 = temp_16 - wasm_modulus[7] - ((r_temp_6) >> 63);
    r_temp_8 = temp_17 - wasm_modulus[8] - ((r_temp_7) >> 63);

    // Depending on whether the subtraction underflowed, choose original value or the result of subtraction
    uint64_t new_mask = 0 - (r_temp_8 >> 63);
    uint64_t inverse_mask = (~new_mask) & mask;
    temp_9 = (temp_9 & new_mask) | (r_temp_0 & inverse_mask);
    temp_10 = (temp_10 & new_mask) | (r_temp_1 & inverse_mask);
    temp_11 = (temp_11 & new_mask) | (r_temp_2 & inverse_mask);
    temp_12 = (temp_12 & new_mask) | (r_temp_3 & inverse_mask);
    temp_13 = (temp_13 & new_mask) | (r_temp_4 & inverse_mask);
    temp_14 = (temp_14 & new_mask) | (r_temp_5 & inverse_mask);
    temp_15 = (temp_15 & new_mask) | (r_temp_6 & inverse_mask);
    temp_16 = (temp_16 & new_mask) | (r_temp_7 & inverse_mask);
    temp_17 = (temp_17 & new_mask) | (r_temp_8 & inverse_mask);

    // Convert back to 4 64-bit limbs
    return { (temp_9 << 0) | (temp_10 << 29) | (temp_11 << 58),
             (temp_11 >> 6) | (temp_12 << 23) | (temp_13 << 52),
             (temp_13 >> 12) | (temp_14 << 17) | (temp_15 << 46),
             (temp_15 >> 18) | (temp_16 << 11) | (temp_17 << 40) };

#endif
}

#if defined(__wasm__) || !defined(__SIZEOF_INT128__)

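// Multiply one 29-bit limb of the left operand by all nine limbs of the right operand, accumulating into the
// result variables. Products of 29-bit limbs fit comfortably in 64 bits, so several additions can be absorbed
// before any carry propagation is needed (the "relaxed" form used throughout the wasm path).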
template <class T>
constexpr void field<T>::wasm_madd(uint64_t& left_limb,
                                   const std::array<uint64_t, WASM_NUM_LIMBS>& right_limbs,
                                   uint64_t& result_0,
                                   uint64_t& result_1,
                                   uint64_t& result_2,
                                   uint64_t& result_3,
                                   uint64_t& result_4,
                                   uint64_t& result_5,
                                   uint64_t& result_6,
                                   uint64_t& result_7,
                                   uint64_t& result_8)
{
    result_0 += left_limb * right_limbs[0];
    result_1 += left_limb * right_limbs[1];
    result_2 += left_limb * right_limbs[2];
    result_3 += left_limb * right_limbs[3];
    result_4 += left_limb * right_limbs[4];
    result_5 += left_limb * right_limbs[5];
    result_6 += left_limb * right_limbs[6];
    result_7 += left_limb * right_limbs[7];
    result_8 += left_limb * right_limbs[8];
}

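// One round of 29-bit Montgomery reduction: k = (result_0 * r_inv) & mask is chosen so that result_0 + k * p
// vanishes modulo 2^29, and the surviving high bits of result_0 are folded into result_1. After this call,
// result_0 is zero modulo 2^29.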
template <class T>
constexpr void field<T>::wasm_reduce(uint64_t& result_0,
                                     uint64_t& result_1,
                                     uint64_t& result_2,
                                     uint64_t& result_3,
                                     uint64_t& result_4,
                                     uint64_t& result_5,
                                     uint64_t& result_6,
                                     uint64_t& result_7,
                                     uint64_t& result_8)
{
    constexpr uint64_t mask = 0x1fffffff;
    constexpr uint64_t r_inv = T::r_inv & mask;
    uint64_t k = (result_0 * r_inv) & mask;
    result_0 += k * wasm_modulus[0];
    result_1 += k * wasm_modulus[1] + (result_0 >> WASM_LIMB_BITS);
    result_2 += k * wasm_modulus[2];
    result_3 += k * wasm_modulus[3];
    result_4 += k * wasm_modulus[4];
    result_5 += k * wasm_modulus[5];
    result_6 += k * wasm_modulus[6];
    result_7 += k * wasm_modulus[7];
    result_8 += k * wasm_modulus[8];
}

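// Variant of the reduction round (credited to Yuval in this codebase) that folds the masked low limb into the
// nine limbs above it in a single multiply-add sweep over the precomputed wasm_r_inv table, producing a
// ten-limb output and deferring all carry handling to the caller.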
template <class T>
constexpr void field<T>::wasm_reduce_yuval(uint64_t& result_0,
                                           uint64_t& result_1,
                                           uint64_t& result_2,
                                           uint64_t& result_3,
                                           uint64_t& result_4,
                                           uint64_t& result_5,
                                           uint64_t& result_6,
                                           uint64_t& result_7,
                                           uint64_t& result_8,
                                           uint64_t& result_9)
{
    constexpr uint64_t mask = 0x1fffffff;
    const uint64_t result_0_masked = result_0 & mask;
    result_1 += result_0_masked * wasm_r_inv[0] + (result_0 >> WASM_LIMB_BITS);
    result_2 += result_0_masked * wasm_r_inv[1];
    result_3 += result_0_masked * wasm_r_inv[2];
    result_4 += result_0_masked * wasm_r_inv[3];
    result_5 += result_0_masked * wasm_r_inv[4];
    result_6 += result_0_masked * wasm_r_inv[5];
    result_7 += result_0_masked * wasm_r_inv[6];
    result_8 += result_0_masked * wasm_r_inv[7];
    result_9 += result_0_masked * wasm_r_inv[8];
}
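
// Repack 4 x 64-bit limbs (256 bits) into 9 x 29-bit limbs (261 bits of capacity). Limb i holds bits
// [29*i, 29*i + 28] of the 256-bit value; limbs that straddle a 64-bit boundary are stitched together from two
// words, e.g. limb 2 takes the top 6 bits of data[0] and the low 23 bits of data[1].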
template <class T> constexpr std::array<uint64_t, WASM_NUM_LIMBS> field<T>::wasm_convert(const uint64_t* data)
{
    return { data[0] & 0x1fffffff,
             (data[0] >> WASM_LIMB_BITS) & 0x1fffffff,
             ((data[0] >> 58) & 0x3f) | ((data[1] & 0x7fffff) << 6),
             (data[1] >> 23) & 0x1fffffff,
             ((data[1] >> 52) & 0xfff) | ((data[2] & 0x1ffff) << 12),
             (data[2] >> 17) & 0x1fffffff,
             ((data[2] >> 46) & 0x3ffff) | ((data[3] & 0x7ff) << 18),
             (data[3] >> 11) & 0x1fffffff,
             (data[3] >> 40) & 0x1fffffff };
}
#endif
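
// Montgomery multiplication. Moduli with p >= 2^254 are routed to montgomery_mul_big; otherwise the fast path
// below interleaves each limb product of *this with a Montgomery reduction round.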
template <class T> constexpr field<T> field<T>::montgomery_mul(const field& other) const noexcept
{
    if constexpr (modulus.data[3] >= 0x4000000000000000ULL) {
        return montgomery_mul_big(other);
    }
#if defined(__SIZEOF_INT128__) && !defined(__wasm__)
    auto [t0, c] = mul_wide(data[0], other.data[0]);
    uint64_t k = t0 * T::r_inv;
    uint64_t a = mac_discard_lo(t0, k, modulus.data[0]);

    uint64_t t1 = mac_mini(a, data[0], other.data[1], a);
    mac(t1, k, modulus.data[1], c, t0, c);
    uint64_t t2 = mac_mini(a, data[0], other.data[2], a);
    mac(t2, k, modulus.data[2], c, t1, c);
    uint64_t t3 = mac_mini(a, data[0], other.data[3], a);
    mac(t3, k, modulus.data[3], c, t2, c);
    t3 = c + a;

    mac_mini(t0, data[1], other.data[0], t0, a);
    k = t0 * T::r_inv;
    c = mac_discard_lo(t0, k, modulus.data[0]);
    mac(t1, data[1], other.data[1], a, t1, a);
    mac(t1, k, modulus.data[1], c, t0, c);
    mac(t2, data[1], other.data[2], a, t2, a);
    mac(t2, k, modulus.data[2], c, t1, c);
    mac(t3, data[1], other.data[3], a, t3, a);
    mac(t3, k, modulus.data[3], c, t2, c);
    t3 = c + a;

    mac_mini(t0, data[2], other.data[0], t0, a);
    k = t0 * T::r_inv;
    c = mac_discard_lo(t0, k, modulus.data[0]);
    mac(t1, data[2], other.data[1], a, t1, a);
    mac(t1, k, modulus.data[1], c, t0, c);
    mac(t2, data[2], other.data[2], a, t2, a);
    mac(t2, k, modulus.data[2], c, t1, c);
    mac(t3, data[2], other.data[3], a, t3, a);
    mac(t3, k, modulus.data[3], c, t2, c);
    t3 = c + a;

    mac_mini(t0, data[3], other.data[0], t0, a);
    k = t0 * T::r_inv;
    c = mac_discard_lo(t0, k, modulus.data[0]);
    mac(t1, data[3], other.data[1], a, t1, a);
    mac(t1, k, modulus.data[1], c, t0, c);
    mac(t2, data[3], other.data[2], a, t2, a);
    mac(t2, k, modulus.data[2], c, t1, c);
    mac(t3, data[3], other.data[3], a, t3, a);
    mac(t3, k, modulus.data[3], c, t2, c);
    t3 = c + a;
    return { t0, t1, t2, t3 };
#else
    // Convert 4 64-bit limbs to 9 29-bit ones
    auto left = wasm_convert(data);
    auto right = wasm_convert(other.data);
    constexpr uint64_t mask = 0x1fffffff;
    uint64_t temp_0 = 0;
    uint64_t temp_1 = 0;
    uint64_t temp_2 = 0;
    uint64_t temp_3 = 0;
    uint64_t temp_4 = 0;
    uint64_t temp_5 = 0;
    uint64_t temp_6 = 0;
    uint64_t temp_7 = 0;
    uint64_t temp_8 = 0;
    uint64_t temp_9 = 0;
    uint64_t temp_10 = 0;
    uint64_t temp_11 = 0;
    uint64_t temp_12 = 0;
    uint64_t temp_13 = 0;
    uint64_t temp_14 = 0;
    uint64_t temp_15 = 0;
    uint64_t temp_16 = 0;

    // Perform a series of multiplications and reductions: we multiply each limb of the left argument by the
    // whole right argument, then run the reduction rounds
    wasm_madd(left[0], right, temp_0, temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8);
    wasm_madd(left[1], right, temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9);
    wasm_madd(left[2], right, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10);
    wasm_madd(left[3], right, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11);
    wasm_madd(left[4], right, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12);
    wasm_madd(left[5], right, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13);
    wasm_madd(left[6], right, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14);
    wasm_madd(left[7], right, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14, temp_15);
    wasm_madd(left[8], right, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14, temp_15, temp_16);

    wasm_reduce_yuval(temp_0, temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9);
    wasm_reduce_yuval(temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10);
    wasm_reduce_yuval(temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11);
    wasm_reduce_yuval(temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12);
    wasm_reduce_yuval(temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13);
    wasm_reduce_yuval(temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14);
    wasm_reduce_yuval(temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14, temp_15);
    wasm_reduce_yuval(temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14, temp_15, temp_16);

    // In case some unforeseen edge case is encountered in the wasm multiplications, we can quickly restore the
    // previous functionality: comment out all the "wasm_reduce_yuval" calls and uncomment the following:

    // wasm_reduce(temp_0, temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8);
    // wasm_reduce(temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9);
    // wasm_reduce(temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10);
    // wasm_reduce(temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11);
    // wasm_reduce(temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12);
    // wasm_reduce(temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13);
    // wasm_reduce(temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14);
    // wasm_reduce(temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14, temp_15);

    // The first 8 limbs are reduced using Yuval's method; the last one is reduced using the regular method.
    // The reason is that Yuval's method produces a 10-limb representation of the reduced limb, which is then
    // added to the higher limbs. If we did this for the last limb we reduce, we would get a 10-limb
    // representation instead of a 9-limb one, and we would have to reduce it again in some other way.
    wasm_reduce(temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14, temp_15, temp_16);

    // Convert result to unrelaxed form (all limbs are 29 bits)
    temp_10 += temp_9 >> WASM_LIMB_BITS;
    temp_9 &= mask;
    temp_11 += temp_10 >> WASM_LIMB_BITS;
    temp_10 &= mask;
    temp_12 += temp_11 >> WASM_LIMB_BITS;
    temp_11 &= mask;
    temp_13 += temp_12 >> WASM_LIMB_BITS;
    temp_12 &= mask;
    temp_14 += temp_13 >> WASM_LIMB_BITS;
    temp_13 &= mask;
    temp_15 += temp_14 >> WASM_LIMB_BITS;
    temp_14 &= mask;
    temp_16 += temp_15 >> WASM_LIMB_BITS;
    temp_15 &= mask;

    // Convert back to 4 64-bit limbs form
    return { (temp_9 << 0) | (temp_10 << 29) | (temp_11 << 58),
             (temp_11 >> 6) | (temp_12 << 23) | (temp_13 << 52),
             (temp_13 >> 12) | (temp_14 << 17) | (temp_15 << 46),
             (temp_15 >> 18) | (temp_16 << 11) };
#endif
}

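// Montgomery squaring: exploits the symmetry of the product terms by computing each off-diagonal product
// data[i] * data[j] once and doubling it (see square_accumulate), which saves multiplications relative to
// calling montgomery_mul with identical operands.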
template <class T> constexpr field<T> field<T>::montgomery_square() const noexcept
{
    if constexpr (modulus.data[3] >= 0x4000000000000000ULL) {
        return montgomery_mul_big(*this);
    }
#if defined(__SIZEOF_INT128__) && !defined(__wasm__)
    uint64_t carry_hi = 0;

    auto [t0, carry_lo] = mul_wide(data[0], data[0]);
    uint64_t t1 = square_accumulate(0, data[1], data[0], carry_lo, carry_hi, carry_lo, carry_hi);
    uint64_t t2 = square_accumulate(0, data[2], data[0], carry_lo, carry_hi, carry_lo, carry_hi);
    uint64_t t3 = square_accumulate(0, data[3], data[0], carry_lo, carry_hi, carry_lo, carry_hi);

    uint64_t round_carry = carry_lo;
    uint64_t k = t0 * T::r_inv;
    carry_lo = mac_discard_lo(t0, k, modulus.data[0]);
    mac(t1, k, modulus.data[1], carry_lo, t0, carry_lo);
    mac(t2, k, modulus.data[2], carry_lo, t1, carry_lo);
    mac(t3, k, modulus.data[3], carry_lo, t2, carry_lo);
    t3 = carry_lo + round_carry;

    t1 = mac_mini(t1, data[1], data[1], carry_lo);
    carry_hi = 0;
    t2 = square_accumulate(t2, data[2], data[1], carry_lo, carry_hi, carry_lo, carry_hi);
    t3 = square_accumulate(t3, data[3], data[1], carry_lo, carry_hi, carry_lo, carry_hi);
    round_carry = carry_lo;
    k = t0 * T::r_inv;
    carry_lo = mac_discard_lo(t0, k, modulus.data[0]);
    mac(t1, k, modulus.data[1], carry_lo, t0, carry_lo);
    mac(t2, k, modulus.data[2], carry_lo, t1, carry_lo);
    mac(t3, k, modulus.data[3], carry_lo, t2, carry_lo);
    t3 = carry_lo + round_carry;

    t2 = mac_mini(t2, data[2], data[2], carry_lo);
    carry_hi = 0;
    t3 = square_accumulate(t3, data[3], data[2], carry_lo, carry_hi, carry_lo, carry_hi);
    round_carry = carry_lo;
    k = t0 * T::r_inv;
    carry_lo = mac_discard_lo(t0, k, modulus.data[0]);
    mac(t1, k, modulus.data[1], carry_lo, t0, carry_lo);
    mac(t2, k, modulus.data[2], carry_lo, t1, carry_lo);
    mac(t3, k, modulus.data[3], carry_lo, t2, carry_lo);
    t3 = carry_lo + round_carry;

    t3 = mac_mini(t3, data[3], data[3], carry_lo);
    k = t0 * T::r_inv;
    round_carry = carry_lo;
    carry_lo = mac_discard_lo(t0, k, modulus.data[0]);
    mac(t1, k, modulus.data[1], carry_lo, t0, carry_lo);
    mac(t2, k, modulus.data[2], carry_lo, t1, carry_lo);
    mac(t3, k, modulus.data[3], carry_lo, t2, carry_lo);
    t3 = carry_lo + round_carry;
    return { t0, t1, t2, t3 };
#else
    // Convert from 4 64-bit limbs to 9 29-bit ones
    auto left = wasm_convert(data);
    constexpr uint64_t mask = 0x1fffffff;
    uint64_t temp_0 = 0;
    uint64_t temp_1 = 0;
    uint64_t temp_2 = 0;
    uint64_t temp_3 = 0;
    uint64_t temp_4 = 0;
    uint64_t temp_5 = 0;
    uint64_t temp_6 = 0;
    uint64_t temp_7 = 0;
    uint64_t temp_8 = 0;
    uint64_t temp_9 = 0;
    uint64_t temp_10 = 0;
    uint64_t temp_11 = 0;
    uint64_t temp_12 = 0;
    uint64_t temp_13 = 0;
    uint64_t temp_14 = 0;
    uint64_t temp_15 = 0;
    uint64_t temp_16 = 0;
    uint64_t acc;
    // Perform the multiplications, accumulating the products for each output limb k = i + j so that the
    // symmetric cross terms can be doubled with a single shift
    temp_0 += left[0] * left[0];
    acc = 0;
    acc += left[0] * left[1];
    temp_1 += (acc << 1);
    acc = 0;
    acc += left[0] * left[2];
    temp_2 += left[1] * left[1];
    temp_2 += (acc << 1);
    acc = 0;
    acc += left[0] * left[3];
    acc += left[1] * left[2];
    temp_3 += (acc << 1);
    acc = 0;
    acc += left[0] * left[4];
    acc += left[1] * left[3];
    temp_4 += left[2] * left[2];
    temp_4 += (acc << 1);
    acc = 0;
    acc += left[0] * left[5];
    acc += left[1] * left[4];
    acc += left[2] * left[3];
    temp_5 += (acc << 1);
    acc = 0;
    acc += left[0] * left[6];
    acc += left[1] * left[5];
    acc += left[2] * left[4];
    temp_6 += left[3] * left[3];
    temp_6 += (acc << 1);
    acc = 0;
    acc += left[0] * left[7];
    acc += left[1] * left[6];
    acc += left[2] * left[5];
    acc += left[3] * left[4];
    temp_7 += (acc << 1);
    acc = 0;
    acc += left[0] * left[8];
    acc += left[1] * left[7];
    acc += left[2] * left[6];
    acc += left[3] * left[5];
    temp_8 += left[4] * left[4];
    temp_8 += (acc << 1);
    acc = 0;
    acc += left[1] * left[8];
    acc += left[2] * left[7];
    acc += left[3] * left[6];
    acc += left[4] * left[5];
    temp_9 += (acc << 1);
    acc = 0;
    acc += left[2] * left[8];
    acc += left[3] * left[7];
    acc += left[4] * left[6];
    temp_10 += left[5] * left[5];
    temp_10 += (acc << 1);
    acc = 0;
    acc += left[3] * left[8];
    acc += left[4] * left[7];
    acc += left[5] * left[6];
    temp_11 += (acc << 1);
    acc = 0;
    acc += left[4] * left[8];
    acc += left[5] * left[7];
    temp_12 += left[6] * left[6];
    temp_12 += (acc << 1);
    acc = 0;
    acc += left[5] * left[8];
    acc += left[6] * left[7];
    temp_13 += (acc << 1);
    acc = 0;
    acc += left[6] * left[8];
    temp_14 += left[7] * left[7];
    temp_14 += (acc << 1);
    acc = 0;
    acc += left[7] * left[8];
    temp_15 += (acc << 1);
    temp_16 += left[8] * left[8];

    // Perform reductions

    wasm_reduce_yuval(temp_0, temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9);
    wasm_reduce_yuval(temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10);
    wasm_reduce_yuval(temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11);
    wasm_reduce_yuval(temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12);
    wasm_reduce_yuval(temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13);
    wasm_reduce_yuval(temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14);
    wasm_reduce_yuval(temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14, temp_15);
    wasm_reduce_yuval(temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14, temp_15, temp_16);

    // In case some unforeseen edge case is encountered in the wasm multiplications, we can quickly restore the
    // previous functionality: comment out all the "wasm_reduce_yuval" calls and uncomment the following:

    // wasm_reduce(temp_0, temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8);
    // wasm_reduce(temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9);
    // wasm_reduce(temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10);
    // wasm_reduce(temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11);
    // wasm_reduce(temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12);
    // wasm_reduce(temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13);
    // wasm_reduce(temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14);
    // wasm_reduce(temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14, temp_15);

    // The first 8 limbs are reduced using Yuval's method; the last one is reduced using the regular method.
    // The reason is that Yuval's method produces a 10-limb representation of the reduced limb, which is then
    // added to the higher limbs. If we did this for the last limb we reduce, we would get a 10-limb
    // representation instead of a 9-limb one, and we would have to reduce it again in some other way.
    wasm_reduce(temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14, temp_15, temp_16);

    // Convert to unrelaxed 29-bit form
    temp_10 += temp_9 >> WASM_LIMB_BITS;
    temp_9 &= mask;
    temp_11 += temp_10 >> WASM_LIMB_BITS;
    temp_10 &= mask;
    temp_12 += temp_11 >> WASM_LIMB_BITS;
    temp_11 &= mask;
    temp_13 += temp_12 >> WASM_LIMB_BITS;
    temp_12 &= mask;
    temp_14 += temp_13 >> WASM_LIMB_BITS;
    temp_13 &= mask;
    temp_15 += temp_14 >> WASM_LIMB_BITS;
    temp_14 &= mask;
    temp_16 += temp_15 >> WASM_LIMB_BITS;
    temp_15 &= mask;
    // Convert to 4 64-bit form
    return { (temp_9 << 0) | (temp_10 << 29) | (temp_11 << 58),
             (temp_11 >> 6) | (temp_12 << 23) | (temp_13 << 52),
             (temp_13 >> 12) | (temp_14 << 17) | (temp_15 << 46),
             (temp_15 >> 18) | (temp_16 << 11) };
#endif
}

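// Full 512-bit product of two 256-bit values, without Montgomery reduction. The result is returned as eight
// 64-bit limbs in a wide_array.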
template <class T> constexpr struct field<T>::wide_array field<T>::mul_512(const field& other) const noexcept
{
#if defined(__SIZEOF_INT128__) && !defined(__wasm__)
    uint64_t carry_2 = 0;
    auto [r0, carry] = mul_wide(data[0], other.data[0]);
    uint64_t r1 = mac_mini(carry, data[0], other.data[1], carry);
    uint64_t r2 = mac_mini(carry, data[0], other.data[2], carry);
    uint64_t r3 = mac_mini(carry, data[0], other.data[3], carry_2);

    r1 = mac_mini(r1, data[1], other.data[0], carry);
    r2 = mac(r2, data[1], other.data[1], carry, carry);
    r3 = mac(r3, data[1], other.data[2], carry, carry);
    uint64_t r4 = mac(carry_2, data[1], other.data[3], carry, carry_2);

    r2 = mac_mini(r2, data[2], other.data[0], carry);
    r3 = mac(r3, data[2], other.data[1], carry, carry);
    r4 = mac(r4, data[2], other.data[2], carry, carry);
    uint64_t r5 = mac(carry_2, data[2], other.data[3], carry, carry_2);

    r3 = mac_mini(r3, data[3], other.data[0], carry);
    r4 = mac(r4, data[3], other.data[1], carry, carry);
    r5 = mac(r5, data[3], other.data[2], carry, carry);
    uint64_t r6 = mac(carry_2, data[3], other.data[3], carry, carry_2);

    return { r0, r1, r2, r3, r4, r5, r6, carry_2 };
#else
    // Convert from 4 64-bit limbs to 9 29-bit limbs
    auto left = wasm_convert(data);
    auto right = wasm_convert(other.data);
    constexpr uint64_t mask = 0x1fffffff;
    uint64_t temp_0 = 0;
    uint64_t temp_1 = 0;
    uint64_t temp_2 = 0;
    uint64_t temp_3 = 0;
    uint64_t temp_4 = 0;
    uint64_t temp_5 = 0;
    uint64_t temp_6 = 0;
    uint64_t temp_7 = 0;
    uint64_t temp_8 = 0;
    uint64_t temp_9 = 0;
    uint64_t temp_10 = 0;
    uint64_t temp_11 = 0;
    uint64_t temp_12 = 0;
    uint64_t temp_13 = 0;
    uint64_t temp_14 = 0;
    uint64_t temp_15 = 0;
    uint64_t temp_16 = 0;

    // Multiply-add all limbs
    wasm_madd(left[0], right, temp_0, temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8);
    wasm_madd(left[1], right, temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9);
    wasm_madd(left[2], right, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10);
    wasm_madd(left[3], right, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11);
    wasm_madd(left[4], right, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12);
    wasm_madd(left[5], right, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13);
    wasm_madd(left[6], right, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14);
    wasm_madd(left[7], right, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14, temp_15);
    wasm_madd(left[8], right, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14, temp_15, temp_16);

    // Convert to unrelaxed 29-bit form
    temp_1 += temp_0 >> WASM_LIMB_BITS;
    temp_0 &= mask;
    temp_2 += temp_1 >> WASM_LIMB_BITS;
    temp_1 &= mask;
    temp_3 += temp_2 >> WASM_LIMB_BITS;
    temp_2 &= mask;
    temp_4 += temp_3 >> WASM_LIMB_BITS;
    temp_3 &= mask;
    temp_5 += temp_4 >> WASM_LIMB_BITS;
    temp_4 &= mask;
    temp_6 += temp_5 >> WASM_LIMB_BITS;
    temp_5 &= mask;
    temp_7 += temp_6 >> WASM_LIMB_BITS;
    temp_6 &= mask;
    temp_8 += temp_7 >> WASM_LIMB_BITS;
    temp_7 &= mask;
    temp_9 += temp_8 >> WASM_LIMB_BITS;
    temp_8 &= mask;
    temp_10 += temp_9 >> WASM_LIMB_BITS;
    temp_9 &= mask;
    temp_11 += temp_10 >> WASM_LIMB_BITS;
    temp_10 &= mask;
    temp_12 += temp_11 >> WASM_LIMB_BITS;
    temp_11 &= mask;
    temp_13 += temp_12 >> WASM_LIMB_BITS;
    temp_12 &= mask;
    temp_14 += temp_13 >> WASM_LIMB_BITS;
    temp_13 &= mask;
    temp_15 += temp_14 >> WASM_LIMB_BITS;
    temp_14 &= mask;
    temp_16 += temp_15 >> WASM_LIMB_BITS;
    temp_15 &= mask;

    // Convert to 8 64-bit limbs
    return { (temp_0 << 0) | (temp_1 << 29) | (temp_2 << 58),
             (temp_2 >> 6) | (temp_3 << 23) | (temp_4 << 52),
             (temp_4 >> 12) | (temp_5 << 17) | (temp_6 << 46),
             (temp_6 >> 18) | (temp_7 << 11) | (temp_8 << 40),
             (temp_8 >> 24) | (temp_9 << 5) | (temp_10 << 34) | (temp_11 << 63),
             (temp_11 >> 1) | (temp_12 << 28) | (temp_13 << 57),
             (temp_13 >> 7) | (temp_14 << 22) | (temp_15 << 51),
             (temp_15 >> 13) | (temp_16 << 16) };
#endif
}

// NOLINTEND(readability-implicit-bool-conversion)
} // namespace bb