Barretenberg
The ZK-SNARK library at the core of Aztec
polynomial_arithmetic.cpp
1// === AUDIT STATUS ===
2// internal: { status: not started, auditors: [], date: YYYY-MM-DD }
3// external_1: { status: not started, auditors: [], date: YYYY-MM-DD }
4// external_2: { status: not started, auditors: [], date: YYYY-MM-DD }
5// =====================
6
14#include <math.h>
15#include <memory.h>
16#include <memory>
17
18namespace bb::polynomial_arithmetic {
19
20namespace {
21
22template <typename Fr> std::shared_ptr<Fr[]> get_scratch_space(const size_t num_elements)
23{
24 // WASM needs to release slab so it can be reused elsewhere.
25 // But for native code it's more performant to hold onto it.
26#ifdef __wasm__
27 return std::static_pointer_cast<Fr[]>(get_mem_slab(num_elements * sizeof(Fr)));
28#else
29 static std::shared_ptr<Fr[]> working_memory = nullptr;
30 static size_t current_size = 0;
31 if (num_elements > current_size) {
32 working_memory = std::static_pointer_cast<Fr[]>(get_mem_slab(num_elements * sizeof(Fr)));
33 current_size = num_elements;
34 }
35 return working_memory;
36#endif
37}
38
39} // namespace
40
41inline uint32_t reverse_bits(uint32_t x, uint32_t bit_length)
42{
43 x = (((x & 0xaaaaaaaa) >> 1) | ((x & 0x55555555) << 1));
44 x = (((x & 0xcccccccc) >> 2) | ((x & 0x33333333) << 2));
45 x = (((x & 0xf0f0f0f0) >> 4) | ((x & 0x0f0f0f0f) << 4));
46 x = (((x & 0xff00ff00) >> 8) | ((x & 0x00ff00ff) << 8));
47 return (((x >> 16) | (x << 16))) >> (32 - bit_length);
48}
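// Example (for intuition): reverse_bits(0b0001, 4) == 0b1000, i.e. bit i moves to bit
// (bit_length - 1 - i). The pairwise/nibble/byte swaps above reverse all 32 bits, and the
// final shift by (32 - bit_length) keeps only the low `bit_length` bits of the result.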
49
50inline bool is_power_of_two(uint64_t x)
51{
52 return x && !(x & (x - 1));
53}
54
55template <typename Fr>
56void copy_polynomial(const Fr* src, Fr* dest, size_t num_src_coefficients, size_t num_target_coefficients)
57{
58 // TODO: fiddle around with avx asm to see if we can speed up
59 memcpy((void*)dest, (void*)src, num_src_coefficients * sizeof(Fr));
60
61 if (num_target_coefficients > num_src_coefficients) {
62 // fill out the polynomial coefficients with zeroes
63 memset((void*)(dest + num_src_coefficients), 0, (num_target_coefficients - num_src_coefficients) * sizeof(Fr));
64 }
65}
66
67template <typename Fr>
68void scale_by_generator(Fr* coeffs,
69 Fr* target,
70 const EvaluationDomain<Fr>& domain,
71 const Fr& generator_start,
72 const Fr& generator_shift,
73 const size_t generator_size)
74{
75 parallel_for(domain.num_threads, [&](size_t j) {
76 Fr thread_shift = generator_shift.pow(static_cast<uint64_t>(j * (generator_size / domain.num_threads)));
77 Fr work_generator = generator_start * thread_shift;
78 const size_t offset = j * (generator_size / domain.num_threads);
79 const size_t end = offset + (generator_size / domain.num_threads);
80 for (size_t i = offset; i < end; ++i) {
81 target[i] = coeffs[i] * work_generator;
82 work_generator *= generator_shift;
83 }
84 });
85}
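// For intuition: the loop above computes target[i] = coeffs[i] * generator_start * generator_shift^i.
// With generator_start = 1 and generator_shift = g this maps the coefficients of p(X) to those of
// p(g*X); each thread seeds its running power with g^(j * chunk_size) so the chunks stitch together
// into the same result as a serial pass.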
96template <typename Fr>
97 requires SupportsFFT<Fr>
98void compute_multiplicative_subgroup(const size_t log2_subgroup_size,
99 const EvaluationDomain<Fr>& src_domain,
100 Fr* subgroup_roots)
101{
102 size_t subgroup_size = 1UL << log2_subgroup_size;
103 // Step 1: get a primitive root of unity of order 2^{log2_subgroup_size}
104 Fr subgroup_root = Fr::get_root_of_unity(log2_subgroup_size);
105
106 // Step 2: compute the cofactor term g^n
107 Fr accumulator = src_domain.generator;
108 for (size_t i = 0; i < src_domain.log2_size; ++i) {
109 accumulator.self_sqr();
110 }
111
112 // Step 3: fill array with subgroup_size values of (g.X)^n, scaled by the cofactor
113 subgroup_roots[0] = accumulator;
114 for (size_t i = 1; i < subgroup_size; ++i) {
115 subgroup_roots[i] = subgroup_roots[i - 1] * subgroup_root;
116 }
117}
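// For intuition: after the squaring loop, accumulator = g^n with n = 2^src_domain.log2_size, so the
// array holds subgroup_roots[i] = g^n * w_s^i, where w_s is the primitive 2^{log2_subgroup_size}'th
// root of unity fetched in Step 1.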
118
119template <typename Fr>
120 requires SupportsFFT<Fr>
121void fft_inner_parallel(std::vector<Fr*> coeffs,
122 const EvaluationDomain<Fr>& domain,
123 const Fr&,
124 const std::vector<Fr*>& root_table)
125{
126 auto scratch_space_ptr = get_scratch_space<Fr>(domain.size);
127 auto scratch_space = scratch_space_ptr.get();
128
129 const size_t num_polys = coeffs.size();
130 ASSERT(is_power_of_two(num_polys));
131 const size_t poly_size = domain.size / num_polys;
132 ASSERT(is_power_of_two(poly_size));
133 const size_t poly_mask = poly_size - 1;
134 const size_t log2_poly_size = (size_t)numeric::get_msb(poly_size);
135
136 parallel_for(domain.num_threads, [&](size_t j) {
137 Fr temp_1;
138 Fr temp_2;
139 for (size_t i = (j * domain.thread_size); i < ((j + 1) * domain.thread_size); i += 2) {
140 uint32_t next_index_1 = (uint32_t)reverse_bits((uint32_t)i + 2, (uint32_t)domain.log2_size);
141 uint32_t next_index_2 = (uint32_t)reverse_bits((uint32_t)i + 3, (uint32_t)domain.log2_size);
142 __builtin_prefetch(&coeffs[next_index_1]);
143 __builtin_prefetch(&coeffs[next_index_2]);
144
145 uint32_t swap_index_1 = (uint32_t)reverse_bits((uint32_t)i, (uint32_t)domain.log2_size);
146 uint32_t swap_index_2 = (uint32_t)reverse_bits((uint32_t)i + 1, (uint32_t)domain.log2_size);
147
148 size_t poly_idx_1 = swap_index_1 >> log2_poly_size;
149 size_t elem_idx_1 = swap_index_1 & poly_mask;
150 size_t poly_idx_2 = swap_index_2 >> log2_poly_size;
151 size_t elem_idx_2 = swap_index_2 & poly_mask;
152
153 Fr::__copy(coeffs[poly_idx_1][elem_idx_1], temp_1);
154 Fr::__copy(coeffs[poly_idx_2][elem_idx_2], temp_2);
155 scratch_space[i + 1] = temp_1 - temp_2;
156 scratch_space[i] = temp_1 + temp_2;
157 }
158 });
159
160 // hard-coded exception for when the domain size is tiny - we won't execute the next loop, so we need to
161 // manually reduce + copy
162 if (domain.size <= 2) {
163 coeffs[0][0] = scratch_space[0];
164 coeffs[0][1] = scratch_space[1];
165 }
166
167 // outer FFT loop
168 for (size_t m = 2; m < (domain.size); m <<= 1) {
169 parallel_for(domain.num_threads, [&](size_t j) {
170 Fr temp;
171
172 // Ok! So, what's going on here? This is the inner loop of the FFT algorithm, and we want to break it
173 // out into multiple independent threads. For `num_threads`, each thread will evaluate `domain.size /
174 // num_threads` of the polynomial. The actual iteration length will be half of this, because we leverage
175 // the fact that \omega^{j + n/2} = -\omega^{j} (where \omega is an n'th root of unity)
176
177 // Here, `start` and `end` are used as our iterator limits, so that we can use our iterator `i` to
178 // directly access the roots of unity lookup table
179 const size_t start = j * (domain.thread_size >> 1);
180 const size_t end = (j + 1) * (domain.thread_size >> 1);
181
182 // For all but the last round of our FFT, the roots of unity that we need will be a subset of our
183 // lookup table. e.g. for a size 2^n FFT, the 2^n'th roots of unity form a multiplicative subgroup of order 2^n.
184 // The 1st round will use the roots from the multiplicative subgroup of order 2: the 2nd roots of
185 // unity. The 2nd round will use the roots from the multiplicative subgroup of order 4: the 4th
186 // roots of unity.
187 // i.e. each successive FFT round will double the set of roots that we need to index.
188 // We have already laid out the `root_table` container so that each FFT round's roots are linearly
189 // ordered in memory. For all FFT rounds, the number of elements we're iterating over is greater than
190 // the size of our lookup table. We need to access this table in a cyclic fashion - i.e. for a subgroup
191 // of size x, the first x iterations will index the subgroup elements in order, then for the next x
192 // iterations, we loop back to the start.
193
194 // We could implement the algorithm with 2 nested loops (where the inner loop iterates over the
195 // root table), but we want to flatten this out - for the first few rounds, the inner loop would be
196 // tiny and we'd pay for quite a few unnecessary branch checks. For each iteration of our flattened
197 // loop, indexed by `i`, the element of the root table we need to access will be `i % (current round
198 // subgroup size)`. Given that each round's subgroup size is `m`, which is a power of 2, we can index the
199 // root table with a very cheap `i & (m - 1)`, which is why we have this odd `block_mask` variable.
200 const size_t block_mask = m - 1;
201
202 // The next problem to tackle is that we now need to efficiently index the polynomial element in
203 // `scratch_space` in our flattened loop. If we used nested loops, the outer loop (e.g. `y`) iterates
204 // from 0 to 'domain size', in steps of 2 * m, with the inner loop (e.g. `z`) iterating from 0 to m. We
205 // have our inner loop indexer with `i & (m - 1)`. We need to add to this our outer loop indexer, which
206 // is equivalent to taking our indexer `i`, masking out the bits used in the 'inner loop', and doubling
207 // the result. i.e. polynomial indexer = (i & (m - 1)) + ((i & ~(m - 1)) << 1). To simplify this, we
208 // cache index_mask = ~block_mask, meaning that our indexer is just `((i & index_mask) << 1) + (i &
209 // block_mask)`.
210 const size_t index_mask = ~block_mask;
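            // Worked example (for intuition): with m = 4 (block_mask = 3) and flattened index i = 6,
            //   j1 = i & block_mask        = 2   -> round_roots[2]
            //   k1 = (i & index_mask) << 1 = 8   -> butterfly on scratch_space[10] and scratch_space[14],
            // matching the nested-loop form with outer index y = 8 and inner index z = 2 acting on
            // elements (y + z) and (y + z + m).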
211
212 // `round_roots` fetches the pointer to this round's lookup table. We use `numeric::get_msb(m) - 1` as
213 // our indexer, because we don't store the precomputed root values for the 1st round (because they're
214 // all 1).
215 const Fr* round_roots = root_table[static_cast<size_t>(numeric::get_msb(m)) - 1];
216
217 // Finally, we want to treat the final round differently from the others,
218 // so that we can reduce out of our 'coarse' reduction and store the output in `coeffs` instead of
219 // `scratch_space`
220 if (m != (domain.size >> 1)) {
221 for (size_t i = start; i < end; ++i) {
222 size_t k1 = (i & index_mask) << 1;
223 size_t j1 = i & block_mask;
224 temp = round_roots[j1] * scratch_space[k1 + j1 + m];
225 scratch_space[k1 + j1 + m] = scratch_space[k1 + j1] - temp;
226 scratch_space[k1 + j1] += temp;
227 }
228 } else {
229 for (size_t i = start; i < end; ++i) {
230 size_t k1 = (i & index_mask) << 1;
231 size_t j1 = i & block_mask;
232
233 size_t poly_idx_1 = (k1 + j1) >> log2_poly_size;
234 size_t elem_idx_1 = (k1 + j1) & poly_mask;
235 size_t poly_idx_2 = (k1 + j1 + m) >> log2_poly_size;
236 size_t elem_idx_2 = (k1 + j1 + m) & poly_mask;
237
238 temp = round_roots[j1] * scratch_space[k1 + j1 + m];
239 coeffs[poly_idx_2][elem_idx_2] = scratch_space[k1 + j1] - temp;
240 coeffs[poly_idx_1][elem_idx_1] = scratch_space[k1 + j1] + temp;
241 }
242 }
243 });
244 }
245}
246
247template <typename Fr>
248 requires SupportsFFT<Fr>
249void fft_inner_parallel(
250 Fr* coeffs, Fr* target, const EvaluationDomain<Fr>& domain, const Fr&, const std::vector<Fr*>& root_table)
251{
252 parallel_for(domain.num_threads, [&](size_t j) {
253 Fr temp_1;
254 Fr temp_2;
255 for (size_t i = (j * domain.thread_size); i < ((j + 1) * domain.thread_size); i += 2) {
256 uint32_t next_index_1 = (uint32_t)reverse_bits((uint32_t)i + 2, (uint32_t)domain.log2_size);
257 uint32_t next_index_2 = (uint32_t)reverse_bits((uint32_t)i + 3, (uint32_t)domain.log2_size);
258 __builtin_prefetch(&coeffs[next_index_1]);
259 __builtin_prefetch(&coeffs[next_index_2]);
260
261 uint32_t swap_index_1 = (uint32_t)reverse_bits((uint32_t)i, (uint32_t)domain.log2_size);
262 uint32_t swap_index_2 = (uint32_t)reverse_bits((uint32_t)i + 1, (uint32_t)domain.log2_size);
263
264 Fr::__copy(coeffs[swap_index_1], temp_1);
265 Fr::__copy(coeffs[swap_index_2], temp_2);
266 target[i + 1] = temp_1 - temp_2;
267 target[i] = temp_1 + temp_2;
268 }
269 });
270
271 // hard-coded exception for when the domain size is tiny - we won't execute the next loop, so we need to
272 // manually reduce + copy
273 if (domain.size <= 2) {
274 coeffs[0] = target[0];
275 coeffs[1] = target[1];
276 }
277
278 // outer FFT loop
279 for (size_t m = 2; m < (domain.size); m <<= 1) {
280 parallel_for(domain.num_threads, [&](size_t j) {
281 Fr temp;
282
283 // Ok! So, what's going on here? This is the inner loop of the FFT algorithm, and we want to break it
284 // out into multiple independent threads. For `num_threads`, each thread will evaluate `domain.size /
285 // num_threads` of the polynomial. The actual iteration length will be half of this, because we leverage
286 // the fact that \omega^{j + n/2} = -\omega^{j} (where \omega is an n'th root of unity)
287
288 // Here, `start` and `end` are used as our iterator limits, so that we can use our iterator `i` to
289 // directly access the roots of unity lookup table
290 const size_t start = j * (domain.thread_size >> 1);
291 const size_t end = (j + 1) * (domain.thread_size >> 1);
292
293 // For all but the last round of our FFT, the roots of unity that we need will be a subset of our
294 // lookup table. e.g. for a size 2^n FFT, the 2^n'th roots of unity form a multiplicative subgroup of order 2^n.
295 // The 1st round will use the roots from the multiplicative subgroup of order 2: the 2nd roots of
296 // unity. The 2nd round will use the roots from the multiplicative subgroup of order 4: the 4th
297 // roots of unity.
298 // i.e. each successive FFT round will double the set of roots that we need to index.
299 // We have already laid out the `root_table` container so that each FFT round's roots are linearly
300 // ordered in memory. For all FFT rounds, the number of elements we're iterating over is greater than
301 // the size of our lookup table. We need to access this table in a cyclic fashion - i.e. for a subgroup
302 // of size x, the first x iterations will index the subgroup elements in order, then for the next x
303 // iterations, we loop back to the start.
304
305 // We could implement the algorithm with 2 nested loops (where the inner loop iterates over the
306 // root table), but we want to flatten this out - for the first few rounds, the inner loop would be
307 // tiny and we'd pay for quite a few unnecessary branch checks. For each iteration of our flattened
308 // loop, indexed by `i`, the element of the root table we need to access will be `i % (current round
309 // subgroup size)`. Given that each round's subgroup size is `m`, which is a power of 2, we can index the
310 // root table with a very cheap `i & (m - 1)`, which is why we have this odd `block_mask` variable.
311 const size_t block_mask = m - 1;
312
313 // The next problem to tackle is that we now need to efficiently index the polynomial element in
314 // `target` in our flattened loop. If we used nested loops, the outer loop (e.g. `y`) iterates
315 // from 0 to 'domain size', in steps of 2 * m, with the inner loop (e.g. `z`) iterating from 0 to m. We
316 // have our inner loop indexer with `i & (m - 1)`. We need to add to this our outer loop indexer, which
317 // is equivalent to taking our indexer `i`, masking out the bits used in the 'inner loop', and doubling
318 // the result. i.e. polynomial indexer = (i & (m - 1)) + ((i & ~(m - 1)) << 1). To simplify this, we
319 // cache index_mask = ~block_mask, meaning that our indexer is just `((i & index_mask) << 1) + (i &
320 // block_mask)`.
321 const size_t index_mask = ~block_mask;
322
323 // `round_roots` fetches the pointer to this round's lookup table. We use `numeric::get_msb(m) - 1` as
324 // our indexer, because we don't store the precomputed root values for the 1st round (because they're
325 // all 1).
326 const Fr* round_roots = root_table[static_cast<size_t>(numeric::get_msb(m)) - 1];
327
328 // Unlike the multi-polynomial overload above, this overload writes every round's output
329 // directly into `target`, so the final round does not need to be special-cased here -
330 // a single butterfly loop covers all rounds.
331 for (size_t i = start; i < end; ++i) {
332 size_t k1 = (i & index_mask) << 1;
333 size_t j1 = i & block_mask;
334 temp = round_roots[j1] * target[k1 + j1 + m];
335 target[k1 + j1 + m] = target[k1 + j1] - temp;
336 target[k1 + j1] += temp;
337 }
338 });
339 }
340}
341
342template <typename Fr>
343 requires SupportsFFT<Fr>
344void fft(Fr* coeffs, const EvaluationDomain<Fr>& domain)
345{
346 fft_inner_parallel({ coeffs }, domain, domain.root, domain.get_round_roots());
347}
348
349template <typename Fr>
350 requires SupportsFFT<Fr>
351void fft(Fr* coeffs, Fr* target, const EvaluationDomain<Fr>& domain)
352{
353 fft_inner_parallel(coeffs, target, domain, domain.root, domain.get_round_roots());
354}
355
356template <typename Fr>
357 requires SupportsFFT<Fr>
358void fft(std::vector<Fr*> coeffs, const EvaluationDomain<Fr>& domain)
359{
360 fft_inner_parallel<Fr>(coeffs, domain, domain.root, domain.get_round_roots());
361}
362
363template <typename Fr>
364 requires SupportsFFT<Fr>
365void ifft(Fr* coeffs, const EvaluationDomain<Fr>& domain)
366{
367 fft_inner_parallel({ coeffs }, domain, domain.root_inverse, domain.get_inverse_round_roots());
368 ITERATE_OVER_DOMAIN_START(domain);
369 coeffs[i] *= domain.domain_inverse;
370 ITERATE_OVER_DOMAIN_END;
371}
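// Usage sketch (illustrative, not part of this file): a forward transform followed by the inverse
// recovers the original coefficients, since ifft applies the inverse roots and then scales by 1/n.
//
//   std::vector<fr> poly(domain.size);   // coefficients of p(X)
//   fft(poly.data(), domain);            // poly now holds the evaluations { p(w^i) }
//   ifft(poly.data(), domain);           // poly holds the original coefficients again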
372
373template <typename Fr>
374 requires SupportsFFT<Fr>
375void ifft(Fr* coeffs, Fr* target, const EvaluationDomain<Fr>& domain)
376{
377 fft_inner_parallel(coeffs, target, domain, domain.root_inverse, domain.get_inverse_round_roots());
378 ITERATE_OVER_DOMAIN_START(domain);
379 target[i] *= domain.domain_inverse;
380 ITERATE_OVER_DOMAIN_END;
381}
382
383template <typename Fr>
384 requires SupportsFFT<Fr>
385void ifft(std::vector<Fr*> coeffs, const EvaluationDomain<Fr>& domain)
386{
387 fft_inner_parallel(coeffs, domain, domain.root_inverse, domain.get_inverse_round_roots());
388
389 const size_t num_polys = coeffs.size();
390 ASSERT(is_power_of_two(num_polys));
391 const size_t poly_size = domain.size / num_polys;
392 ASSERT(is_power_of_two(poly_size));
393 const size_t poly_mask = poly_size - 1;
394 const size_t log2_poly_size = (size_t)numeric::get_msb(poly_size);
395
396 ITERATE_OVER_DOMAIN_START(domain);
397 coeffs[i >> log2_poly_size][i & poly_mask] *= domain.domain_inverse;
398 ITERATE_OVER_DOMAIN_END;
399}
400
401template <typename Fr>
402 requires SupportsFFT<Fr>
403void coset_fft(Fr* coeffs, const EvaluationDomain<Fr>& domain)
404{
405 scale_by_generator(coeffs, coeffs, domain, Fr::one(), domain.generator, domain.generator_size);
406 fft(coeffs, domain);
407}
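// For intuition: pre-scaling by powers of the coset generator means the subsequent FFT returns
// evaluations over the coset g*<w> rather than over <w> itself:
//   output[i] = sum_k coeffs[k] * (g * w^i)^k = p(g * w^i)   (up to the transform's output ordering),
// which is useful precisely where the vanishing polynomial X^n - 1 of <w> is non-zero.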
408
409template <typename Fr>
410 requires SupportsFFT<Fr>
411void coset_fft(Fr* coeffs, Fr* target, const EvaluationDomain<Fr>& domain)
412{
413 scale_by_generator(coeffs, target, domain, Fr::one(), domain.generator, domain.generator_size);
414 fft(coeffs, target, domain);
415}
416
417template <typename Fr>
418 requires SupportsFFT<Fr>
419void coset_fft(std::vector<Fr*> coeffs, const EvaluationDomain<Fr>& domain)
420{
421 const size_t num_polys = coeffs.size();
422 ASSERT(is_power_of_two(num_polys));
423 const size_t poly_size = domain.size / num_polys;
424 const Fr generator_pow_n = domain.generator.pow(poly_size);
425 Fr generator_start = 1;
426
427 for (size_t i = 0; i < num_polys; i++) {
428 scale_by_generator(coeffs[i], coeffs[i], domain, generator_start, domain.generator, poly_size);
429 generator_start *= generator_pow_n;
430 }
431 fft(coeffs, domain);
432}
433
434template <typename Fr>
435 requires SupportsFFT<Fr>
436void coset_fft(Fr* coeffs,
437 const EvaluationDomain<Fr>& domain,
438 const EvaluationDomain<Fr>&,
439 const size_t domain_extension)
440{
441 const size_t log2_domain_extension = static_cast<size_t>(numeric::get_msb(domain_extension));
442 Fr primitive_root = Fr::get_root_of_unity(domain.log2_size + log2_domain_extension);
443
444 // Fr work_root = domain.generator.sqr();
445 // work_root = domain.generator.sqr();
446 auto scratch_space_ptr = get_scratch_space<Fr>(domain.size * domain_extension);
447 auto scratch_space = scratch_space_ptr.get();
448
449 // Fr* temp_memory = static_cast<Fr*>(aligned_alloc(64, sizeof(Fr) * domain.size *
450 // domain_extension));
451
452 std::vector<Fr> coset_generators(domain_extension);
453 coset_generators[0] = domain.generator;
454 for (size_t i = 1; i < domain_extension; ++i) {
455 coset_generators[i] = coset_generators[i - 1] * primitive_root;
456 }
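    // iterate i = domain_extension - 1, ..., 1, 0; once i wraps past 0, the condition `i < domain_extension`
    // fails and the loop exits. Iterating backwards means the in-place write for i = 0 happens last, so the
    // source coefficients are still intact for the earlier (out-of-place) iterations.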
457 for (size_t i = domain_extension - 1; i < domain_extension; --i) {
458 scale_by_generator(coeffs, coeffs + (i * domain.size), domain, Fr::one(), coset_generators[i], domain.size);
459 }
460
461 for (size_t i = 0; i < domain_extension; ++i) {
462 fft_inner_parallel(coeffs + (i * domain.size),
463 scratch_space + (i * domain.size),
464 domain,
465 domain.root,
466 domain.get_round_roots());
467 }
468
469 if (domain_extension == 4) {
470 parallel_for(domain.num_threads, [&](size_t j) {
471 const size_t start = j * domain.thread_size;
472 const size_t end = (j + 1) * domain.thread_size;
473 for (size_t i = start; i < end; ++i) {
474 Fr::__copy(scratch_space[i], coeffs[(i << 2UL)]);
475 Fr::__copy(scratch_space[i + (1UL << domain.log2_size)], coeffs[(i << 2UL) + 1UL]);
476 Fr::__copy(scratch_space[i + (2UL << domain.log2_size)], coeffs[(i << 2UL) + 2UL]);
477 Fr::__copy(scratch_space[i + (3UL << domain.log2_size)], coeffs[(i << 2UL) + 3UL]);
478 }
479 });
480
481 for (size_t i = 0; i < domain.size; ++i) {
482 for (size_t j = 0; j < domain_extension; ++j) {
483 Fr::__copy(scratch_space[i + (j << domain.log2_size)], coeffs[(i << log2_domain_extension) + j]);
484 }
485 }
486 } else {
487 for (size_t i = 0; i < domain.size; ++i) {
488 for (size_t j = 0; j < domain_extension; ++j) {
489 Fr::__copy(scratch_space[i + (j << domain.log2_size)], coeffs[(i << log2_domain_extension) + j]);
490 }
491 }
492 }
493}
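// For intuition: assuming the usual compatible choice of roots (domain.root == primitive_root^domain_extension),
// the net effect of the scaling, per-coset FFTs and interleaving copies above is a single coset FFT over the
// extended domain: on exit, coeffs[(i << log2_domain_extension) + j] = p(g * primitive_root^(i * domain_extension + j)),
// i.e. coeffs[k] holds the evaluation of p at g * primitive_root^k.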
494
495template <typename Fr>
496 requires SupportsFFT<Fr>
497void coset_fft_with_constant(Fr* coeffs, const EvaluationDomain<Fr>& domain, const Fr& constant)
498{
499 Fr start = constant;
500 scale_by_generator(coeffs, coeffs, domain, start, domain.generator, domain.generator_size);
501 fft(coeffs, domain);
502}
503
504template <typename Fr>
505 requires SupportsFFT<Fr>
506void coset_fft_with_generator_shift(Fr* coeffs, const EvaluationDomain<Fr>& domain, const Fr& constant)
507{
508 scale_by_generator(coeffs, coeffs, domain, Fr::one(), domain.generator * constant, domain.generator_size);
509 fft(coeffs, domain);
510}
511
512template <typename Fr>
513 requires SupportsFFT<Fr>
514void ifft_with_constant(Fr* coeffs, const EvaluationDomain<Fr>& domain, const Fr& value)
515{
516 fft_inner_parallel({ coeffs }, domain, domain.root_inverse, domain.get_inverse_round_roots());
517 Fr T0 = domain.domain_inverse * value;
518 ITERATE_OVER_DOMAIN_START(domain);
519 coeffs[i] *= T0;
520 ITERATE_OVER_DOMAIN_END;
521}
522
523template <typename Fr>
524 requires SupportsFFT<Fr>
525void coset_ifft(Fr* coeffs, const EvaluationDomain<Fr>& domain)
526{
527 ifft(coeffs, domain);
528 scale_by_generator(coeffs, coeffs, domain, Fr::one(), domain.generator_inverse, domain.size);
529}
530
531template <typename Fr>
532 requires SupportsFFT<Fr>
533void coset_ifft(std::vector<Fr*> coeffs, const EvaluationDomain<Fr>& domain)
534{
535 ifft(coeffs, domain);
536
537 const size_t num_polys = coeffs.size();
538 ASSERT(is_power_of_two(num_polys));
539 const size_t poly_size = domain.size / num_polys;
540 const Fr generator_inv_pow_n = domain.generator_inverse.pow(poly_size);
541 Fr generator_start = 1;
542
543 for (size_t i = 0; i < num_polys; i++) {
544 scale_by_generator(coeffs[i], coeffs[i], domain, generator_start, domain.generator_inverse, poly_size);
545 generator_start *= generator_inv_pow_n;
546 }
547}
548
549template <typename Fr> Fr evaluate(const Fr* coeffs, const Fr& z, const size_t n)
550{
551 size_t num_threads = get_num_cpus_pow2();
552 size_t range_per_thread = n / num_threads;
553 size_t leftovers = n - (range_per_thread * num_threads);
554 Fr* evaluations = new Fr[num_threads];
555 parallel_for(num_threads, [&](size_t j) {
556 Fr z_acc = z.pow(static_cast<uint64_t>(j * range_per_thread));
557 size_t offset = j * range_per_thread;
558 evaluations[j] = Fr::zero();
559 size_t end = (j == num_threads - 1) ? offset + range_per_thread + leftovers : offset + range_per_thread;
560 for (size_t i = offset; i < end; ++i) {
561 Fr work_var = z_acc * coeffs[i];
562 evaluations[j] += work_var;
563 z_acc *= z;
564 }
565 });
566
567 Fr r = Fr::zero();
568 for (size_t j = 0; j < num_threads; ++j) {
569 r += evaluations[j];
570 }
571 delete[] evaluations;
572 return r;
573}
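// Usage sketch (illustrative; `poly` and `challenge` are placeholder names):
//   fr result = evaluate(poly.data(), challenge, poly.size());   // result = sum_i poly[i] * challenge^i
// Each thread seeds its accumulator with z^(j * range_per_thread), so the per-thread partial sums can
// simply be added together at the end.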
574
575template <typename Fr> Fr evaluate(const std::vector<Fr*> coeffs, const Fr& z, const size_t large_n)
576{
577 const size_t num_polys = coeffs.size();
578 const size_t poly_size = large_n / num_polys;
579 ASSERT(is_power_of_two(poly_size));
580 const size_t log2_poly_size = (size_t)numeric::get_msb(poly_size);
581 size_t num_threads = get_num_cpus_pow2();
582 size_t range_per_thread = large_n / num_threads;
583 size_t leftovers = large_n - (range_per_thread * num_threads);
584 Fr* evaluations = new Fr[num_threads];
585 parallel_for(num_threads, [&](size_t j) {
586 Fr z_acc = z.pow(static_cast<uint64_t>(j * range_per_thread));
587 size_t offset = j * range_per_thread;
588 evaluations[j] = Fr::zero();
589 size_t end = (j == num_threads - 1) ? offset + range_per_thread + leftovers : offset + range_per_thread;
590 for (size_t i = offset; i < end; ++i) {
591 Fr work_var = z_acc * coeffs[i >> log2_poly_size][i & (poly_size - 1)];
592 evaluations[j] += work_var;
593 z_acc *= z;
594 }
595 });
596
597 Fr r = Fr::zero();
598 for (size_t j = 0; j < num_threads; ++j) {
599 r += evaluations[j];
600 }
601 delete[] evaluations;
602 return r;
603}
604
605template <typename Fr>
606 requires SupportsFFT<Fr>
607Fr compute_kate_opening_coefficients(const Fr* src, Fr* dest, const Fr& z, const size_t n)
608{
609 // if `src` represents F(X), we want to compute W(X)
610 // where W(X) = (F(X) - F(z)) / (X - z)
611 // i.e. divide by the degree-1 polynomial [-z, 1]
612
613 // We assume that the commitment is well-formed and that there is no remainder term.
614 // Under these conditions we can perform this polynomial division in linear time with good constants
615 Fr f = evaluate(src, z, n);
616
617 // compute (1 / -z)
618 Fr divisor = -z.invert();
619
620 // we're about to shove these coefficients into a pippenger multi-exponentiation routine, where we need
621 // to convert out of montgomery form. So, we can use lazy reduction techniques here without triggering overflows
622 dest[0] = src[0] - f;
623 dest[0] *= divisor;
624 for (size_t i = 1; i < n; ++i) {
625 dest[i] = src[i] - dest[i - 1];
626 dest[i] *= divisor;
627 }
628
629 return f;
630}
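// Worked example (for intuition): take F(X) = 2 + 3X + X^2, z = 1, so f = F(1) = 6 and divisor = -1/z = -1.
//   dest[0] = (src[0] - f) * divisor       = (2 - 6) * (-1) = 4
//   dest[1] = (src[1] - dest[0]) * divisor = (3 - 4) * (-1) = 1
//   dest[2] = (src[2] - dest[1]) * divisor = (1 - 1) * (-1) = 0
// i.e. W(X) = 4 + X, and indeed F(X) - 6 = (X + 4)(X - 1).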
631
632// Computes r = \sum_{i=0}^{num_coeffs-1} (L_{i+1}(ʓ).f_i)
633//
634// Start with
635// L_1(ʓ) = (ʓ^n - 1) / (n.(ʓ - 1))
636//
637// and, for i > 1,
638//
639// L_i(ʓ) = L_1(ʓ.ω^{1-i}) = (ʓ^n - 1) / (n.(ʓ.ω^{1-i} - 1))
640//
641//
642fr compute_barycentric_evaluation(const fr* coeffs,
643 const size_t num_coeffs,
644 const fr& z,
645 const EvaluationDomain<fr>& domain)
646{
647 fr* denominators = static_cast<fr*>(aligned_alloc(64, sizeof(fr) * num_coeffs));
648
649 fr numerator = z;
650 for (size_t i = 0; i < domain.log2_size; ++i) {
651 numerator.self_sqr();
652 }
653 numerator -= fr::one();
654 numerator *= domain.domain_inverse; // (ʓ^n - 1) / n
655
656 denominators[0] = z - fr::one();
657 fr work_root = domain.root_inverse; // ω^{-1}
658 for (size_t i = 1; i < num_coeffs; ++i) {
659 denominators[i] =
660 work_root * z; // denominators[i] will correspond to L_{i+1} (since our 'commented maths' notation indexes
661 // L_i from 1). So ʓ.ω^{-i} = ʓ.ω^{1-(i+1)} is correct for L_{i+1}.
662 denominators[i] -= fr::one(); // ʓ.ω^{-i} - 1
663 work_root *= domain.root_inverse;
664 }
665
666 fr::batch_invert(denominators, num_coeffs);
667
668 fr result = fr::zero();
669
670 for (size_t i = 0; i < num_coeffs; ++i) {
671 fr temp = coeffs[i] * denominators[i]; // f_i * 1/(ʓ.ω^{-i} - 1)
672 result = result + temp;
673 }
674
675 result = result *
676 numerator; // \sum_{i=0}^{num_coeffs-1} f_i * [ʓ^n - 1]/[n.(ʓ.ω^{-i} - 1)]
677 // = \sum_{i=0}^{num_coeffs-1} f_i * L_{i+1}
678 // (with our somewhat messy 'commented maths' convention that L_1 corresponds to the 0th coeff).
679
680 aligned_free(denominators);
681
682 return result;
683}
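// Worked example (for intuition): for n = 2 the domain is {1, -1} with ω = -1, so
//   L_1(ʓ) = (ʓ^2 - 1) / (2(ʓ - 1)) = (ʓ + 1) / 2   and   L_2(ʓ) = L_1(-ʓ) = (1 - ʓ) / 2,
// which satisfy L_1(1) = L_2(-1) = 1 and L_1(-1) = L_2(1) = 0, so the returned value is
// f_0 * L_1(ʓ) + f_1 * L_2(ʓ): the evaluation at ʓ of the polynomial interpolating (1, f_0), (-1, f_1).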
684
685// This function computes the sum of all scalars in a given array.
686template <typename Fr> Fr compute_sum(const Fr* src, const size_t n)
687{
688 Fr result = 0;
689 for (size_t i = 0; i < n; ++i) {
690 result += src[i];
691 }
692 return result;
693}
694
695// This function computes the polynomial (x - a)(x - b)(x - c)... given n distinct roots (a, b, c, ...).
696template <typename Fr> void compute_linear_polynomial_product(const Fr* roots, Fr* dest, const size_t n)
697{
698
699 auto scratch_space_ptr = get_scratch_space<Fr>(n);
700 auto scratch_space = scratch_space_ptr.get();
701 memcpy((void*)scratch_space, (void*)roots, n * sizeof(Fr));
702
703 dest[n] = 1;
704 dest[n - 1] = -compute_sum(scratch_space, n);
705
706 Fr temp;
707 Fr constant = 1;
708 for (size_t i = 0; i < n - 1; ++i) {
709 temp = 0;
710 for (size_t j = 0; j < n - 1 - i; ++j) {
711 scratch_space[j] = roots[j] * compute_sum(&scratch_space[j + 1], n - 1 - i - j);
712 temp += scratch_space[j];
713 }
714 dest[n - 2 - i] = temp * constant;
715 constant *= Fr::neg_one();
716 }
717}
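// Worked example (for intuition): with roots = {1, 2, 3} and n = 3 the routine above produces
//   (X - 1)(X - 2)(X - 3) = X^3 - 6X^2 + 11X - 6,
// i.e. dest = { -6, 11, -6, 1 }. Note that dest must have room for n + 1 coefficients.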
718
719template <typename Fr> Fr compute_linear_polynomial_product_evaluation(const Fr* roots, const Fr z, const size_t n)
720{
721 Fr result = 1;
722 for (size_t i = 0; i < n; ++i) {
723 result *= (z - roots[i]);
724 }
725 return result;
726}
727
728template <typename Fr> void compute_interpolation(const Fr* src, Fr* dest, const Fr* evaluation_points, const size_t n)
729{
730 std::vector<Fr> local_roots;
731 Fr local_polynomial[n];
732 Fr denominator = 1;
733 Fr multiplicand;
734 Fr temp_dest[n];
735
736 if (n == 1) {
737 dest[0] = src[0];
738 return;
739 }
740
742 // Initialize temp_dest
742 for (size_t i = 0; i < n; ++i) {
743 temp_dest[i] = 0;
744 }
745
746 for (size_t i = 0; i < n; ++i) {
747
748 // fill in local roots
749 denominator = 1;
750 for (size_t j = 0; j < n; ++j) {
751 if (j == i) {
752 continue;
753 }
754 local_roots.push_back(evaluation_points[j]);
755 denominator *= (evaluation_points[i] - evaluation_points[j]);
756 }
757
758 // bring local roots to coefficient form
759 compute_linear_polynomial_product(&local_roots[0], local_polynomial, n - 1);
760
761 // store the resulting coefficients
762 multiplicand = src[i] / denominator;
763 for (size_t j = 0; j < n; ++j) {
764 temp_dest[j] += multiplicand * local_polynomial[j];
765 }
766
767 // clear up local roots
768 local_roots.clear();
769 }
770
771 memcpy((void*)dest, (void*)temp_dest, n * sizeof(Fr));
772}
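// Worked example (for intuition): with evaluation_points = {0, 1} and src = {1, 3}, the two Lagrange
// terms are 1 * (X - 1)/(0 - 1) and 3 * (X - 0)/(1 - 0), which sum to 1 + 2X, so dest = { 1, 2 }.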
773
774template <typename Fr>
775void compute_efficient_interpolation(const Fr* src, Fr* dest, const Fr* evaluation_points, const size_t n)
776{
777 /*
778 We use the Lagrange technique to compute the polynomial interpolation.
779 Given: (x_i, y_i) for i ∈ {0, 1, ..., n-1} =: [n]
780 Compute the function F(X) such that F(x_i) = y_i for all i ∈ [n].
781
782 F(X) = y_0 * (X - x_1)(X - x_2)...(X - x_{n-1}) / ((x_0 - x_1)(x_0 - x_2)...(x_0 - x_{n-1}))
783 + y_1 * (X - x_0)(X - x_2)...(X - x_{n-1}) / ((x_1 - x_0)(x_1 - x_2)...(x_1 - x_{n-1})) + ...
784
785 We write this as:
786
787 F(X) = N(X) * ∑_i y_i / ((X - x_i) * d_i)
788
789 where:
790 N(X) = ∏_{i ∈ [n]} (X - x_i),   d_i = ∏_{j != i} (x_i - x_j)
791 For division of N(X) by (X - xi), we use the same trick that was used in compute_opening_polynomial()
792 function in the Kate commitment scheme.
793 We denote
794 q_{x_i} = N(X)/(X-x_i) * y_i * (d_i)^{-1} = q_{x_i,0}*1 + ... + q_{x_i,n-1} * X^{n-1} for i=0,..., n-1.
795
796 The computation of F(X) is split into two cases:
797
798 - if 0 is not in the interpolation domain, then the numerator polynomial N(X) has a non-zero constant term
799 that is used to initialize the division algorithm mentioned above; the monomial coefficients q_{x_i, j} of
800 q_{x_i} are accumulated into dest[j] for j=0,..., n-1
801
802 - if 0 is in the domain at index i_0, the constant term of N(X) is 0 and the division algorithm computing
803 q_{x_i} for i != i_0 is initialized with the constant term of N(X)/X. Note that its coefficients are given
804 by numerator_polynomial[j] for j=1,...,n. The monomial coefficients of q_{x_i} are then accumulated in
805 dest[j] for j=1,..., n-1, whereas the coefficients of
806 q_{0} = N(X)/X * f(0) * (d_{i_0})^{-1}
807 are added to dest[j] for j=0,..., n-1. Note that these coefficients do not require performing the division
808 algorithm used in the Kate commitment scheme, as the coefficients of N(X)/X are given by numerator_polynomial[j]
809 for j=1,...,n.
810 */
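    // Worked example (for intuition): with evaluation_points = {1, 2} and src = {3, 5}:
    //   N(X) = (X - 1)(X - 2) = 2 - 3X + X^2,   d_0 = (1 - 2) = -1,   d_1 = (2 - 1) = 1,
    // so F(X) = 3 * (X - 2)/(-1) + 5 * (X - 1)/1 = 1 + 2X, i.e. dest = { 1, 2 }. Zero is not an
    // interpolation point here, so the first branch below applies.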
811 Fr numerator_polynomial[n + 1];
812 polynomial_arithmetic::compute_linear_polynomial_product(evaluation_points, numerator_polynomial, n);
813 // First half contains roots, second half contains denominators (to be inverted)
814 Fr roots_and_denominators[2 * n];
815 Fr temp_src[n];
816 for (size_t i = 0; i < n; ++i) {
817 roots_and_denominators[i] = -evaluation_points[i];
818 temp_src[i] = src[i];
819 dest[i] = 0;
820 // compute constant denominators
821 roots_and_denominators[n + i] = 1;
822 for (size_t j = 0; j < n; ++j) {
823 if (j == i) {
824 continue;
825 }
826 roots_and_denominators[n + i] *= (evaluation_points[i] - evaluation_points[j]);
827 }
828 }
829 // at this point roots_and_denominators is populated as follows
830 // (-x_0, \ldots, -x_{n-1}, d_0, \ldots, d_{n-1})
831 Fr::batch_invert(roots_and_denominators, 2 * n);
832
833 Fr z, multiplier;
834 Fr temp_dest[n];
835 size_t idx_zero = 0;
836 bool interpolation_domain_contains_zero = false;
837 // if the constant term of the numerator polynomial N(X) is 0, then the interpolation domain contains 0
838 // we find the index i_0, such that x_{i_0} = 0
839 if (numerator_polynomial[0] == Fr(0)) {
840 for (size_t i = 0; i < n; ++i) {
841 if (evaluation_points[i] == Fr(0)) {
842 idx_zero = i;
843 interpolation_domain_contains_zero = true;
844 break;
845 }
846 }
847 };
848
849 if (!interpolation_domain_contains_zero) {
850 for (size_t i = 0; i < n; ++i) {
851 // set z = - 1/x_i for x_i <> 0
852 z = roots_and_denominators[i];
853 // temp_src[i] is y_i, it gets multiplied by 1/d_i
854 multiplier = temp_src[i] * roots_and_denominators[n + i];
855 temp_dest[0] = multiplier * numerator_polynomial[0];
856 temp_dest[0] *= z;
857 dest[0] += temp_dest[0];
858 for (size_t j = 1; j < n; ++j) {
859 temp_dest[j] = multiplier * numerator_polynomial[j] - temp_dest[j - 1];
860 temp_dest[j] *= z;
861 dest[j] += temp_dest[j];
862 }
863 }
864 } else {
865 for (size_t i = 0; i < n; ++i) {
866 if (i == idx_zero) {
867 // the contribution from the term corresponding to i_0 is computed separately
868 continue;
869 }
870 // get the next inverted root
871 z = roots_and_denominators[i];
872 // compute f(x_i) * d_{x_i}^{-1}
873 multiplier = temp_src[i] * roots_and_denominators[n + i];
874 // get x_i^{-1} * f(x_i) * d_{x_i}^{-1} into the "free" term
875 temp_dest[1] = multiplier * numerator_polynomial[1];
876 temp_dest[1] *= z;
877 // correct the first coefficient as it is now accumulating free terms from
878 // f(x_i) d_i^{-1} prod_(X-x_i, x_i != 0) (X-x_i) * 1/(X-x_i)
879 dest[1] += temp_dest[1];
880 // compute the quotient N(X)/(X-x_i) f(x_i)/d_{x_i} and its contribution to the target coefficients
881 for (size_t j = 2; j < n; ++j) {
882 temp_dest[j] = multiplier * numerator_polynomial[j] - temp_dest[j - 1];
883 temp_dest[j] *= z;
884 dest[j] += temp_dest[j];
885 };
886 }
887 // correct the target coefficients by the contribution from q_{0} = N(X)/X * d_{i_0}^{-1} * f(0)
888 for (size_t i = 0; i < n; ++i) {
889 dest[i] += temp_src[idx_zero] * roots_and_denominators[n + idx_zero] * numerator_polynomial[i + 1];
890 }
891 }
892}
893
894template fr evaluate<fr>(const fr*, const fr&, const size_t);
895template fr evaluate<fr>(const std::vector<fr*>, const fr&, const size_t);
896template void copy_polynomial<fr>(const fr*, fr*, size_t, size_t);
897template void fft_inner_parallel<fr>(std::vector<fr*>, const EvaluationDomain<fr>&, const fr&, const std::vector<fr*>&);
898template void fft<fr>(fr*, const EvaluationDomain<fr>&);
899template void fft<fr>(fr*, fr*, const EvaluationDomain<fr>&);
900template void fft<fr>(std::vector<fr*>, const EvaluationDomain<fr>&);
901template void coset_fft<fr>(fr*, const EvaluationDomain<fr>&);
902template void coset_fft<fr>(fr*, fr*, const EvaluationDomain<fr>&);
903template void coset_fft<fr>(std::vector<fr*>, const EvaluationDomain<fr>&);
904template void coset_fft<fr>(fr*, const EvaluationDomain<fr>&, const EvaluationDomain<fr>&, const size_t);
905template void coset_fft_with_constant<fr>(fr*, const EvaluationDomain<fr>&, const fr&);
906template void coset_fft_with_generator_shift<fr>(fr*, const EvaluationDomain<fr>&, const fr&);
907template void ifft<fr>(fr*, const EvaluationDomain<fr>&);
908template void ifft<fr>(fr*, fr*, const EvaluationDomain<fr>&);
909template void ifft<fr>(std::vector<fr*>, const EvaluationDomain<fr>&);
910template void ifft_with_constant<fr>(fr*, const EvaluationDomain<fr>&, const fr&);
911template void coset_ifft<fr>(fr*, const EvaluationDomain<fr>&);
912template void coset_ifft<fr>(std::vector<fr*>, const EvaluationDomain<fr>&);
913template fr compute_kate_opening_coefficients<fr>(const fr*, fr*, const fr&, const size_t);
914template fr compute_sum<fr>(const fr*, const size_t);
915template void compute_linear_polynomial_product<fr>(const fr*, fr*, const size_t);
916template void compute_interpolation<fr>(const fr*, fr*, const fr*, const size_t);
917template void compute_efficient_interpolation<fr>(const fr*, fr*, const fr*, const size_t);
918
919template grumpkin::fr evaluate<grumpkin::fr>(const grumpkin::fr*, const grumpkin::fr&, const size_t);
920template grumpkin::fr evaluate<grumpkin::fr>(const std::vector<grumpkin::fr*>, const grumpkin::fr&, const size_t);
921template void copy_polynomial<grumpkin::fr>(const grumpkin::fr*, grumpkin::fr*, size_t, size_t);
922template grumpkin::fr compute_sum<grumpkin::fr>(const grumpkin::fr*, const size_t);
923template void compute_linear_polynomial_product<grumpkin::fr>(const grumpkin::fr*, grumpkin::fr*, const size_t);
924template void compute_interpolation<grumpkin::fr>(const grumpkin::fr*,
925 grumpkin::fr*,
926 const grumpkin::fr*,
927 const size_t);
928template void compute_efficient_interpolation<grumpkin::fr>(const grumpkin::fr*,
929 grumpkin::fr*,
930 const grumpkin::fr*,
931 const size_t);
932
933} // namespace bb::polynomial_arithmetic
#define ASSERT(expression,...)
Definition assert.hpp:49
const std::vector< FF * > & get_round_roots() const
const std::vector< FF * > & get_inverse_round_roots() const
#define ITERATE_OVER_DOMAIN_START(domain)
#define ITERATE_OVER_DOMAIN_END
constexpr bool is_power_of_two(uint64_t x)
Definition pow.hpp:35
void ifft(Fr *coeffs, const EvaluationDomain< Fr > &domain)
Fr compute_linear_polynomial_product_evaluation(const Fr *roots, const Fr z, const size_t n)
uint32_t reverse_bits(uint32_t x, uint32_t bit_length)
void fft_inner_parallel(std::vector< Fr * > coeffs, const EvaluationDomain< Fr > &domain, const Fr &, const std::vector< Fr * > &root_table)
void coset_fft(Fr *coeffs, const EvaluationDomain< Fr > &domain)
void compute_linear_polynomial_product(const Fr *roots, Fr *dest, const size_t n)
void compute_multiplicative_subgroup(const size_t log2_subgroup_size, const EvaluationDomain< Fr > &src_domain, Fr *subgroup_roots)
void coset_fft_with_constant(Fr *coeffs, const EvaluationDomain< Fr > &domain, const Fr &constant)
Fr evaluate(const Fr *coeffs, const Fr &z, const size_t n)
void fft(Fr *coeffs, const EvaluationDomain< Fr > &domain)
fr compute_barycentric_evaluation(const fr *coeffs, const size_t num_coeffs, const fr &z, const EvaluationDomain< fr > &domain)
void compute_interpolation(const Fr *src, Fr *dest, const Fr *evaluation_points, const size_t n)
Fr compute_kate_opening_coefficients(const Fr *src, Fr *dest, const Fr &z, const size_t n)
void coset_ifft(Fr *coeffs, const EvaluationDomain< Fr > &domain)
void ifft_with_constant(Fr *coeffs, const EvaluationDomain< Fr > &domain, const Fr &value)
void copy_polynomial(const Fr *src, Fr *dest, size_t num_src_coefficients, size_t num_target_coefficients)
void compute_efficient_interpolation(const Fr *src, Fr *dest, const Fr *evaluation_points, const size_t n)
void coset_fft_with_generator_shift(Fr *coeffs, const EvaluationDomain< Fr > &domain, const Fr &constant)
void scale_by_generator(Fr *coeffs, Fr *target, const EvaluationDomain< Fr > &domain, const Fr &generator_start, const Fr &generator_shift, const size_t generator_size)
Fr compute_sum(const Fr *src, const size_t n)
size_t get_num_cpus_pow2()
Definition thread.hpp:18
std::shared_ptr< void > get_mem_slab(size_t size)
void parallel_for(size_t num_iterations, const std::function< void(size_t)> &func)
Definition thread.cpp:72
Curve::ScalarField Fr
static constexpr field get_root_of_unity(size_t subgroup_size) noexcept
static constexpr field neg_one()
static constexpr field one()
BB_INLINE constexpr field pow(const uint256_t &exponent) const noexcept
BB_INLINE constexpr void self_sqr() &noexcept
constexpr field invert() const noexcept
static BB_INLINE void __copy(const field &a, field &r) noexcept
static void batch_invert(std::span< field > coeffs) noexcept
static constexpr field zero()