// NOTE(review): fragment of a parallel FFT whose input polynomial is stored as
// `coeffs.size()` equal-length chunks addressed as coeffs[chunk][elem].
// The enclosing function signature and several interior lines are elided in
// this extraction (embedded numerals are original source-line artifacts);
// verify against the full file before editing logic.
124 const std::vector<Fr*>& root_table)
126 auto scratch_space_ptr = get_scratch_space<Fr>(domain.
size);
127 auto scratch_space = scratch_space_ptr.get();
129 const size_t num_polys = coeffs.size();
// Each chunk holds domain.size / num_polys coefficients; the mask/shift
// arithmetic below assumes poly_size is a power of two.
131 const size_t poly_size = domain.
size / num_polys;
133 const size_t poly_mask = poly_size - 1;
134 const size_t log2_poly_size = (size_t)numeric::get_msb(poly_size);
// First butterfly round fused with the bit-reversal permutation: each pair of
// inputs is read in bit-reversed order and written as (sum, difference) into
// contiguous scratch space.
139 for (size_t i = (j * domain.thread_size); i < ((j + 1) * domain.thread_size); i += 2) {
140 uint32_t next_index_1 = (uint32_t)reverse_bits((uint32_t)i + 2, (uint32_t)domain.log2_size);
141 uint32_t next_index_2 = (uint32_t)reverse_bits((uint32_t)i + 3, (uint32_t)domain.log2_size);
// Prefetch the next iteration's operands — the bit-reversed access pattern
// defeats the hardware prefetcher.
142 __builtin_prefetch(&coeffs[next_index_1]);
143 __builtin_prefetch(&coeffs[next_index_2]);
145 uint32_t swap_index_1 = (uint32_t)reverse_bits((uint32_t)i, (uint32_t)domain.log2_size);
146 uint32_t swap_index_2 = (uint32_t)reverse_bits((uint32_t)i + 1, (uint32_t)domain.log2_size);
// Map the flat bit-reversed index onto (chunk, element-within-chunk).
148 size_t poly_idx_1 = swap_index_1 >> log2_poly_size;
149 size_t elem_idx_1 = swap_index_1 & poly_mask;
150 size_t poly_idx_2 = swap_index_2 >> log2_poly_size;
151 size_t elem_idx_2 = swap_index_2 & poly_mask;
153 Fr::__copy(coeffs[poly_idx_1][elem_idx_1], temp_1);
154 Fr::__copy(coeffs[poly_idx_2][elem_idx_2], temp_2);
155 scratch_space[i + 1] = temp_1 - temp_2;
156 scratch_space[i] = temp_1 + temp_2;
// Degenerate domain (size <= 2): the single butterfly above already completed
// the transform, so copy straight back into the chunked layout.
162 if (domain.size <= 2) {
163 coeffs[0][0] = scratch_space[0];
164 coeffs[0][1] = scratch_space[1];
// Remaining rounds: butterfly block size m doubles each round.
168 for (
size_t m = 2; m < (domain.size); m <<= 1) {
169 parallel_for(domain.num_threads, [&](
size_t j) {
// Each thread processes thread_size / 2 butterflies per round.
179 const size_t start = j * (domain.thread_size >> 1);
180 const size_t end = (j + 1) * (domain.thread_size >> 1);
// block_mask selects the twiddle index within a block; index_mask recovers
// the block's base offset (k1 below).
200 const size_t block_mask = m - 1;
210 const size_t index_mask = ~block_mask;
// Twiddle factors for this round, indexed by position within the block.
215 const Fr* round_roots = root_table[static_cast<size_t>(numeric::get_msb(m)) - 1];
// Every round except the last stays in scratch space ...
220 if (m != (domain.size >> 1)) {
221 for (size_t i = start; i < end; ++i) {
222 size_t k1 = (i & index_mask) << 1;
223 size_t j1 = i & block_mask;
224 temp = round_roots[j1] * scratch_space[k1 + j1 + m];
225 scratch_space[k1 + j1 + m] = scratch_space[k1 + j1] - temp;
226 scratch_space[k1 + j1] += temp;
// ... the final round writes its butterfly outputs back into the chunked
// coeffs layout, translating flat indices via shift/mask as above.
229 for (size_t i = start; i < end; ++i) {
230 size_t k1 = (i & index_mask) << 1;
231 size_t j1 = i & block_mask;
233 size_t poly_idx_1 = (k1 + j1) >> log2_poly_size;
234 size_t elem_idx_1 = (k1 + j1) & poly_mask;
235 size_t poly_idx_2 = (k1 + j1 + m) >> log2_poly_size;
236 size_t elem_idx_2 = (k1 + j1 + m) & poly_mask;
238 temp = round_roots[j1] * scratch_space[k1 + j1 + m];
239 coeffs[poly_idx_2][elem_idx_2] = scratch_space[k1 + j1] - temp;
240 coeffs[poly_idx_1][elem_idx_1] = scratch_space[k1 + j1] + temp;
// NOTE(review): fragment of a parallel FFT over a flat coefficient array,
// writing intermediate results into a separate `target` buffer. The enclosing
// signature and several interior lines are elided in this extraction
// (embedded numerals are original source-line artifacts).
255 for (size_t i = (j * domain.thread_size); i < ((j + 1) * domain.thread_size); i += 2) {
256 uint32_t next_index_1 = (uint32_t)reverse_bits((uint32_t)i + 2, (uint32_t)domain.log2_size);
257 uint32_t next_index_2 = (uint32_t)reverse_bits((uint32_t)i + 3, (uint32_t)domain.log2_size);
// Prefetch the next iteration's bit-reversed operands to hide cache misses.
258 __builtin_prefetch(&coeffs[next_index_1]);
259 __builtin_prefetch(&coeffs[next_index_2]);
261 uint32_t swap_index_1 = (uint32_t)reverse_bits((uint32_t)i, (uint32_t)domain.log2_size);
262 uint32_t swap_index_2 = (uint32_t)reverse_bits((uint32_t)i + 1, (uint32_t)domain.log2_size);
// First radix-2 round fused with the bit-reversal permutation:
// target[i] / target[i+1] receive the (sum, difference) butterfly outputs.
264 Fr::__copy(coeffs[swap_index_1], temp_1);
265 Fr::__copy(coeffs[swap_index_2], temp_2);
266 target[i + 1] = temp_1 - temp_2;
267 target[i] = temp_1 + temp_2;
// Degenerate domain (size <= 2): transform complete after one butterfly.
273 if (domain.size <= 2) {
274 coeffs[0] = target[0];
275 coeffs[1] = target[1];
// Remaining rounds: butterfly block size m doubles each round.
279 for (
size_t m = 2; m < (domain.size); m <<= 1) {
280 parallel_for(domain.num_threads, [&](
size_t j) {
290 const size_t start = j * (domain.thread_size >> 1);
291 const size_t end = (j + 1) * (domain.thread_size >> 1);
// block_mask selects the twiddle index within a block; index_mask recovers
// the block base offset.
311 const size_t block_mask = m - 1;
321 const size_t index_mask = ~block_mask;
// Twiddle factors for this round, indexed by position within the block.
326 const Fr* round_roots = root_table[static_cast<size_t>(numeric::get_msb(m)) - 1];
// In-place radix-2 butterfly on `target`.
331 for (size_t i = start; i < end; ++i) {
332 size_t k1 = (i & index_mask) << 1;
333 size_t j1 = i & block_mask;
334 temp = round_roots[j1] * target[k1 + j1 + m];
335 target[k1 + j1 + m] = target[k1 + j1] - temp;
336 target[k1 + j1] += temp;
// NOTE(review): fragment of a coset-FFT that extends the evaluation domain by
// a factor of `domain_extension` (treated as a power of two — see the msb/
// shift arithmetic). Enclosing signature and several interior lines are
// elided in this extraction; embedded numerals are source-line artifacts.
439 const size_t domain_extension)
441 const size_t log2_domain_extension =
static_cast<size_t>(numeric::get_msb(domain_extension));
446 auto scratch_space_ptr = get_scratch_space<Fr>(domain.
size * domain_extension);
447 auto scratch_space = scratch_space_ptr.get();
// Coset generators g^0 .. g^(k-1); coset_generators[0] is presumably
// initialised on an elided line — TODO confirm against the full file.
452 std::vector<Fr> coset_generators(domain_extension);
454 for (
size_t i = 1; i < domain_extension; ++i) {
455 coset_generators[i] = coset_generators[i - 1] * primitive_root;
// Reverse-order loop over cosets: relies on size_t wraparound (i underflows
// past 0 to a huge value) as the termination condition.
457 for (
size_t i = domain_extension - 1; i < domain_extension; --i) {
// Per-coset transform over each domain.size-sized slice of scratch space
// (the call itself is elided in this fragment).
461 for (
size_t i = 0; i < domain_extension; ++i) {
463 scratch_space + (i * domain.
size),
// Interleave the k transformed slices back into coeffs so that result index
// i*k + j takes element i of slice j. The k == 4 case is hand-unrolled.
469 if (domain_extension == 4) {
471 const size_t start = j * domain.thread_size;
472 const size_t end = (j + 1) * domain.thread_size;
473 for (size_t i = start; i < end; ++i) {
474 Fr::__copy(scratch_space[i], coeffs[(i << 2UL)]);
475 Fr::__copy(scratch_space[i + (1UL << domain.log2_size)], coeffs[(i << 2UL) + 1UL]);
476 Fr::__copy(scratch_space[i + (2UL << domain.log2_size)], coeffs[(i << 2UL) + 2UL]);
477 Fr::__copy(scratch_space[i + (3UL << domain.log2_size)], coeffs[(i << 2UL) + 3UL]);
// Generic interleave for other extension factors.
481 for (
size_t i = 0; i < domain.
size; ++i) {
482 for (
size_t j = 0; j < domain_extension; ++j) {
483 Fr::__copy(scratch_space[i + (j << domain.
log2_size)], coeffs[(i << log2_domain_extension) + j]);
// NOTE(review): this loop repeats the interleave above; which branch each
// copy belongs to is not visible in this fragment — verify in the full file.
487 for (
size_t i = 0; i < domain.size; ++i) {
488 for (
size_t j = 0; j < domain_extension; ++j) {
489 Fr::__copy(scratch_space[i + (j << domain.log2_size)], coeffs[(i << log2_domain_extension) + j]);
// NOTE(review): fragment of `evaluate` — evaluates a polynomial, stored as
// `coeffs.size()` equal-sized chunks of `large_n` total coefficients, at
// point z. The embedded source-line numbers jump (584→586, 592→598), so the
// `num_threads` definition, the parallel dispatch wrapping the per-thread
// body, the z_acc update inside the loop, the reduction body, and the final
// return are all elided here — confirm against the full file.
575template <
typename Fr>
Fr evaluate(
const std::vector<Fr*> coeffs,
const Fr& z,
const size_t large_n)
577 const size_t num_polys = coeffs.size();
// Each chunk holds large_n / num_polys coefficients; the shift/mask indexing
// below assumes poly_size is a power of two.
578 const size_t poly_size = large_n / num_polys;
580 const size_t log2_poly_size = (size_t)numeric::get_msb(poly_size);
// Split [0, large_n) into num_threads contiguous ranges; the last thread
// absorbs the remainder.
582 size_t range_per_thread = large_n / num_threads;
583 size_t leftovers = large_n - (range_per_thread * num_threads);
// NOTE(review): raw new[] paired with the delete[] below; a std::vector
// would be RAII-safe, but code is left byte-identical in this view.
584 Fr* evaluations =
new Fr[num_threads];
// Per-thread body: start the power accumulator at z^(range start) and sum
// coeff_i * z^i over the thread's index range.
586 Fr z_acc = z.
pow(
static_cast<uint64_t
>(j * range_per_thread));
587 size_t offset = j * range_per_thread;
589 size_t end = (j == num_threads - 1) ?
offset + range_per_thread + leftovers :
offset + range_per_thread;
590 for (
size_t i =
offset; i < end; ++i) {
// Map flat index i onto (chunk, element-within-chunk).
591 Fr work_var = z_acc * coeffs[i >> log2_poly_size][i & (poly_size - 1)];
592 evaluations[j] += work_var;
// Reduce the per-thread partial sums (reduction body elided here).
598 for (
size_t j = 0; j < num_threads; ++j) {
601 delete[] evaluations;
// NOTE(review): fragment of a Lagrange-interpolation routine (builds the
// polynomial through points (evaluation_points[i], src[i]) into dest). The
// enclosing signature and many interior lines are elided in this extraction —
// declarations of temp_src/temp_dest/dest/z/multiplier/idx_zero and several
// loop bodies/closings are not visible; embedded numerals are source-line
// artifacts. Verify any logic change against the full file.
// NOTE(review): `Fr name[n + 1]` with runtime n is a VLA — a compiler
// extension, not standard C++.
811 Fr numerator_polynomial[n + 1];
// numerator_polynomial(X) = prod_i (X - evaluation_points[i]).
812 polynomial_arithmetic::compute_linear_polynomial_product(evaluation_points, numerator_polynomial, n);
// Layout: slots [0, n) hold the negated roots -x_i; slots [n, 2n) hold the
// Lagrange denominators prod_{j != i} (x_i - x_j).
814 Fr roots_and_denominators[2 * n];
816 for (
size_t i = 0; i < n; ++i) {
817 roots_and_denominators[i] = -evaluation_points[i];
818 temp_src[i] = src[i];
821 roots_and_denominators[n + i] = 1;
822 for (
size_t j = 0; j < n; ++j) {
// NOTE(review): source lines 823-825 are elided here — the i == j skip for
// this product presumably lives there; confirm.
826 roots_and_denominators[n + i] *= (evaluation_points[i] - evaluation_points[j]);
// If some x_i == 0 the numerator's constant term vanishes and the generic
// per-root division below would touch a zero root, so detect it up front.
836 bool interpolation_domain_contains_zero =
false;
839 if (numerator_polynomial[0] ==
Fr(0)) {
840 for (
size_t i = 0; i < n; ++i) {
841 if (evaluation_points[i] ==
Fr(0)) {
843 interpolation_domain_contains_zero =
true;
// Generic path: for each i, accumulate y_i / denom_i times the quotient of
// the numerator by (X - x_i), built coefficient-by-coefficient below.
// (Source lines 853, 856, 860 are elided — the full recurrence is longer.)
849 if (!interpolation_domain_contains_zero) {
850 for (
size_t i = 0; i < n; ++i) {
852 z = roots_and_denominators[i];
854 multiplier = temp_src[i] * roots_and_denominators[n + i];
855 temp_dest[0] = multiplier * numerator_polynomial[0];
857 dest[0] += temp_dest[0];
858 for (
size_t j = 1; j < n; ++j) {
859 temp_dest[j] = multiplier * numerator_polynomial[j] - temp_dest[j - 1];
861 dest[j] += temp_dest[j];
// Special path when an evaluation point is zero: the recurrence starts at
// index 1 (the zero point's own contribution is added in the loop at the
// bottom). Source lines 866-870 are elided — likely the i == idx_zero skip.
865 for (
size_t i = 0; i < n; ++i) {
871 z = roots_and_denominators[i];
873 multiplier = temp_src[i] * roots_and_denominators[n + i];
875 temp_dest[1] = multiplier * numerator_polynomial[1];
879 dest[1] += temp_dest[1];
881 for (
size_t j = 2; j < n; ++j) {
882 temp_dest[j] = multiplier * numerator_polynomial[j] - temp_dest[j - 1];
884 dest[j] += temp_dest[j];
// Contribution of the zero evaluation point itself: numerator divided by X
// (coefficients shifted down by one), scaled by y / denominator at idx_zero.
888 for (
size_t i = 0; i < n; ++i) {
889 dest[i] += temp_src[idx_zero] * roots_and_denominators[n + idx_zero] * numerator_polynomial[i + 1];