Barretenberg
The ZK-SNARK library at the core of Aztec
scalar_multiplication.cpp
1// === AUDIT STATUS ===
2// internal: { status: not started, auditors: [], date: YYYY-MM-DD }
3// external_1: { status: not started, auditors: [], date: YYYY-MM-DD }
4// external_2: { status: not started, auditors: [], date: YYYY-MM-DD }
5// =====================
9
10#include "./process_buckets.hpp"
18
21
23
33template <typename Curve>
34typename Curve::Element small_mul(std::span<const typename Curve::ScalarField>& scalars,
35                                  std::span<const typename Curve::AffineElement>& points,
36                                  std::span<const uint32_t> scalar_indices,
37                                  size_t range) noexcept
38{
39 typename Curve::Element r = Curve::Group::point_at_infinity;
40 for (size_t i = 0; i < range; ++i) {
41 typename Curve::Element f = points[scalar_indices[i]];
42 r += f * scalars[scalar_indices[i]].to_montgomery_form();
43 }
44 return r;
45}
46
54template <typename Curve>
55void MSM<Curve>::transform_scalar_and_get_nonzero_scalar_indices(std::span<typename Curve::ScalarField> scalars,
56                                                                 std::vector<uint32_t>& consolidated_indices) noexcept
57{
58 const size_t num_cpus = get_num_cpus();
59
60 const size_t scalars_per_thread = numeric::ceil_div(scalars.size(), num_cpus);
61 std::vector<std::vector<uint32_t>> thread_indices(num_cpus);
62 parallel_for(num_cpus, [&](size_t thread_idx) {
63 bool empty_thread = (thread_idx * scalars_per_thread >= scalars.size());
64 bool last_thread = ((thread_idx + 1) * scalars_per_thread) >= scalars.size();
65 const size_t start = thread_idx * scalars_per_thread;
66 const size_t end = last_thread ? scalars.size() : (thread_idx + 1) * scalars_per_thread;
67 if (!empty_thread) {
68 BB_ASSERT_GT(end, start);
69 std::vector<uint32_t>& thread_scalar_indices = thread_indices[thread_idx];
70 thread_scalar_indices.reserve(end - start);
71 for (size_t i = start; i < end; ++i) {
72 BB_ASSERT_LT(i, scalars.size());
73 auto& scalar = scalars[i];
74 scalar.self_from_montgomery_form();
75
76 bool is_zero =
77 (scalar.data[0] == 0) && (scalar.data[1] == 0) && (scalar.data[2] == 0) && (scalar.data[3] == 0);
78 if (!is_zero) {
79 thread_scalar_indices.push_back(static_cast<uint32_t>(i));
80 }
81 }
82 }
83 });
84
85 size_t num_entries = 0;
86 for (size_t i = 0; i < num_cpus; ++i) {
87 BB_ASSERT_LT(i, thread_indices.size());
88 num_entries += thread_indices[i].size();
89 }
90 consolidated_indices.resize(num_entries);
91
92 parallel_for(num_cpus, [&](size_t thread_idx) {
93 size_t offset = 0;
94 for (size_t i = 0; i < thread_idx; ++i) {
95 BB_ASSERT_LT(i, thread_indices.size());
96 offset += thread_indices[i].size();
97 }
98 for (size_t i = offset; i < offset + thread_indices[thread_idx].size(); ++i) {
99 BB_ASSERT_LT(i, scalars.size());
100 consolidated_indices[i] = thread_indices[thread_idx][i - offset];
101 }
102 });
103}
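
// [Editor's note - illustrative sketch, not part of scalar_multiplication.cpp]
// The function above uses a two-pass pattern: each thread gathers nonzero-scalar indices into a private
// vector, then the per-thread vectors are copied into one contiguous array at prefix offsets. Below is a
// minimal, standalone version of the consolidation step (sequential here; each copy is independent, so the
// real code runs them via parallel_for). `consolidate` is a hypothetical helper, not a Barretenberg API,
// and assumes `thread_indices` is non-empty.

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<uint32_t> consolidate(const std::vector<std::vector<uint32_t>>& thread_indices)
{
    // exclusive prefix sum of the per-thread sizes gives each thread's write offset
    std::vector<size_t> offsets(thread_indices.size(), 0);
    for (size_t t = 1; t < thread_indices.size(); ++t) {
        offsets[t] = offsets[t - 1] + thread_indices[t - 1].size();
    }
    std::vector<uint32_t> out(offsets.back() + thread_indices.back().size());
    for (size_t t = 0; t < thread_indices.size(); ++t) {
        std::copy(thread_indices[t].begin(),
                  thread_indices[t].end(),
                  out.begin() + static_cast<std::ptrdiff_t>(offsets[t]));
    }
    return out;
}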
104
116template <typename Curve>
117std::vector<typename MSM<Curve>::ThreadWorkUnits> MSM<Curve>::get_work_units(
118    std::vector<std::span<ScalarField>>& scalars, std::vector<std::vector<uint32_t>>& msm_scalar_indices) noexcept
119{
120
121 const size_t num_msms = scalars.size();
122 msm_scalar_indices.resize(num_msms);
123 for (size_t i = 0; i < num_msms; ++i) {
124 BB_ASSERT_LT(i, scalars.size());
125 transform_scalar_and_get_nonzero_scalar_indices(scalars[i], msm_scalar_indices[i]);
126 }
127
128 size_t total_work = 0;
129 for (const auto& indices : msm_scalar_indices) {
130 total_work += indices.size();
131 }
132
133 const size_t num_threads = get_num_cpus();
134 std::vector<ThreadWorkUnits> work_units(num_threads);
135
136 const size_t work_per_thread = numeric::ceil_div(total_work, num_threads);
137 size_t work_of_last_thread = total_work - (work_per_thread * (num_threads - 1));
138
139 // [(MSMs + T - 1) / T] * [T - 1] > MSMs
140 // T = 192
141 // ([M + 191] / 192) * 191 > M
142 // only use a single work unit if we don't have enough work for every thread
143 if (num_threads > total_work) {
144 for (size_t i = 0; i < num_msms; ++i) {
145 work_units[0].push_back(MSMWorkUnit{
146 .batch_msm_index = i,
147 .start_index = 0,
148 .size = msm_scalar_indices[i].size(),
149 });
150 }
151 return work_units;
152 }
153
154 size_t thread_accumulated_work = 0;
155 size_t current_thread_idx = 0;
156 for (size_t i = 0; i < num_msms; ++i) {
157 BB_ASSERT_LT(i, msm_scalar_indices.size());
158 size_t msm_work = msm_scalar_indices[i].size();
159 size_t msm_size = msm_work;
160 while (msm_work > 0) {
161 const size_t total_thread_work =
162 (current_thread_idx == num_threads - 1) ? work_of_last_thread : work_per_thread;
163 const size_t available_thread_work = total_thread_work - thread_accumulated_work;
164
165 if (available_thread_work >= msm_work) {
166 BB_ASSERT_LT(current_thread_idx, work_units.size());
167 work_units[current_thread_idx].push_back(MSMWorkUnit{
168 .batch_msm_index = i,
169 .start_index = msm_size - msm_work,
170 .size = msm_work,
171 });
172 thread_accumulated_work += msm_work;
173 msm_work = 0;
174 } else {
175 BB_ASSERT_LT(current_thread_idx, work_units.size());
176 work_units[current_thread_idx].push_back(MSMWorkUnit{
177 .batch_msm_index = i,
178 .start_index = msm_size - msm_work,
179 .size = available_thread_work,
180 });
181 msm_work -= available_thread_work;
182 current_thread_idx++;
183 thread_accumulated_work = 0;
184 }
185 }
186 }
187 return work_units;
188}
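
// [Editor's note - worked example, not part of scalar_multiplication.cpp]
// Two MSMs with 700 and 300 nonzero scalars on 4 threads give total_work = 1000 and
// work_per_thread = ceil(1000 / 4) = 250. The loop above then emits:
//   thread 0: { msm 0, start 0,   size 250 }
//   thread 1: { msm 0, start 250, size 250 }
//   thread 2: { msm 0, start 500, size 200 }, { msm 1, start 0, size 50 }
//   thread 3: { msm 1, start 50,  size 250 }
// i.e. every thread is assigned exactly 250 scalar/point additions, with MSMs split across threads as needed.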
189
200template <typename Curve>
201uint32_t MSM<Curve>::get_scalar_slice(const typename Curve::ScalarField& scalar,
202 size_t round,
203 size_t slice_size) noexcept
204{
205 size_t hi_bit = NUM_BITS_IN_FIELD - (round * slice_size);
206 // todo remove
207 bool last_slice = hi_bit < slice_size;
208 size_t target_slice_size = last_slice ? hi_bit : slice_size;
209 size_t lo_bit = last_slice ? 0 : hi_bit - slice_size;
210 size_t start_limb = lo_bit / 64;
211 size_t end_limb = hi_bit / 64;
212 size_t lo_slice_offset = lo_bit & 63;
213 size_t lo_slice_bits = std::min(target_slice_size, 64 - lo_slice_offset);
214 size_t hi_slice_bits = target_slice_size - lo_slice_bits;
215 size_t lo_slice = (scalar.data[start_limb] >> lo_slice_offset) & ((static_cast<size_t>(1) << lo_slice_bits) - 1);
216 size_t hi_slice = (scalar.data[end_limb] & ((static_cast<size_t>(1) << hi_slice_bits) - 1));
217
218 uint32_t lo = static_cast<uint32_t>(lo_slice);
219 uint32_t hi = static_cast<uint32_t>(hi_slice);
220
221 uint32_t result = lo + (hi << lo_slice_bits);
222 return result;
223}
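
// [Editor's note - illustrative sketch, not part of scalar_multiplication.cpp]
// The slice extraction above, isolated for a generic little-endian 4-limb integer. Assumes
// 0 < bits <= 32 and that the requested window does not run past the top limb, so at most two
// limbs are touched. `extract_window` is a hypothetical helper, not a Barretenberg API.

#include <algorithm>
#include <array>
#include <cstddef>
#include <cstdint>

static inline uint32_t extract_window(const std::array<uint64_t, 4>& limbs, size_t lo_bit, size_t bits)
{
    const size_t lo_limb = lo_bit / 64;
    const size_t lo_offset = lo_bit & 63;
    const size_t lo_bits = std::min(bits, 64 - lo_offset);          // bits taken from the low limb
    const size_t hi_bits = bits - lo_bits;                          // bits spilling into the next limb
    const uint64_t lo = (limbs[lo_limb] >> lo_offset) & ((uint64_t(1) << lo_bits) - 1);
    const uint64_t hi = (hi_bits != 0) ? (limbs[lo_limb + 1] & ((uint64_t(1) << hi_bits) - 1)) : 0;
    return static_cast<uint32_t>(lo | (hi << lo_bits));
}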
224
232template <typename Curve> size_t MSM<Curve>::get_optimal_log_num_buckets(const size_t num_points) noexcept
233{
234 // We do 2 group operations per bucket, and they are full 3D Jacobian adds which are ~2x more than an affine add
235 constexpr size_t COST_OF_BUCKET_OP_RELATIVE_TO_POINT = 5;
236 size_t cached_cost = static_cast<size_t>(-1);
237 size_t target_bit_slice = 0;
238 for (size_t bit_slice = 1; bit_slice < 20; ++bit_slice) {
239 const size_t num_rounds = numeric::ceil_div(NUM_BITS_IN_FIELD, bit_slice);
240 const size_t num_buckets = 1 << bit_slice;
241 const size_t addition_cost = num_rounds * num_points;
242 const size_t bucket_cost = num_rounds * num_buckets * COST_OF_BUCKET_OP_RELATIVE_TO_POINT;
243 const size_t total_cost = addition_cost + bucket_cost;
244 if (total_cost < cached_cost) {
245 cached_cost = total_cost;
246 target_bit_slice = bit_slice;
247 }
248 }
249 return target_bit_slice;
250}
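
// [Editor's note - worked example, not part of scalar_multiplication.cpp]
// Assuming NUM_BITS_IN_FIELD = 254 (BN254/Grumpkin), for num_points = 2^16 the cost model above gives:
//   bit_slice 10: 26 rounds * (65536 + 5 * 1024) = 1,837,056
//   bit_slice 11: 24 rounds * (65536 + 5 * 2048) = 1,818,624   <-- minimum
//   bit_slice 12: 22 rounds * (65536 + 5 * 4096) = 1,892,352
// so a 2^16-point MSM uses 11-bit slices, i.e. 2048 buckets per round.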
251
261template <typename Curve> bool MSM<Curve>::use_affine_trick(const size_t num_points, const size_t num_buckets) noexcept
262{
263 if (num_points < 128) {
264 return false;
265 }
266
267 // Affine trick requires log(N) modular inversions per Pippenger round.
268 // It saves NUM_POINTS * COST_SAVING_OF_AFFINE_TRICK_PER_GROUP_OPERATION field muls
269 // It also saves NUM_BUCKETS * EXTRA_COST_OF_JACOBIAN_GROUP_OPERATION_IF_Z2_IS_NOT_1 field muls
270 // due to all our buckets having Z=1 if we use the affine trick
271
272 // COST_OF_INVERSION cost:
273 // Requires NUM_BITS_IN_FIELD squarings
274 // We use 4-bit windows = ((NUM_BITS_IN_FIELD + 3) / 4) multiplications
275 // Computing 4-bit window table requires 14 muls
276 constexpr size_t COST_OF_INVERSION = NUM_BITS_IN_FIELD + ((NUM_BITS_IN_FIELD + 3) / 4) + 14;
277 constexpr size_t COST_SAVING_OF_AFFINE_TRICK_PER_GROUP_OPERATION = 5;
278 constexpr size_t EXTRA_COST_OF_JACOBIAN_GROUP_OPERATION_IF_Z2_IS_NOT_1 = 5;
279
280 double num_points_f = static_cast<double>(num_points);
281 double log2_num_points_f = log2(num_points_f);
282
283 size_t group_op_cost_saving_per_round = (num_points * COST_SAVING_OF_AFFINE_TRICK_PER_GROUP_OPERATION) +
284 (num_buckets * EXTRA_COST_OF_JACOBIAN_GROUP_OPERATION_IF_Z2_IS_NOT_1);
285 double inversion_cost_per_round = log2_num_points_f * static_cast<double>(COST_OF_INVERSION);
286
287 return static_cast<double>(group_op_cost_saving_per_round) > inversion_cost_per_round;
288}
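
// [Editor's note - worked example, not part of scalar_multiplication.cpp]
// Assuming NUM_BITS_IN_FIELD = 254, COST_OF_INVERSION = 254 + 64 + 14 = 332. For num_points = 4096
// and num_buckets = 2048:
//   saving per round    = 4096 * 5 + 2048 * 5 = 30,720 field muls
//   inversion per round = log2(4096) * 332    = 3,984 field muls
// so the affine trick wins comfortably; it only loses out for very small point counts.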
289
327template <typename Curve>
328void MSM<Curve>::add_affine_points(typename Curve::AffineElement* points,
329                                   const size_t num_points,
330                                   typename Curve::BaseField* scratch_space) noexcept
331{
332 using Fq = typename Curve::BaseField;
333 Fq batch_inversion_accumulator = Fq::one();
334
335 for (size_t i = 0; i < num_points; i += 2) {
336 scratch_space[i >> 1] = points[i].x + points[i + 1].x; // x2 + x1
337 points[i + 1].x -= points[i].x; // x2 - x1
338 points[i + 1].y -= points[i].y; // y2 - y1
339 points[i + 1].y *= batch_inversion_accumulator; // (y2 - y1)*accumulator_old
340 batch_inversion_accumulator *= (points[i + 1].x);
341 }
342 if (batch_inversion_accumulator == 0) {
343 // prefer abort to throw for code that might emit from multiple threads
344 throw_or_abort("attempted to invert zero in add_affine_points");
345 } else {
346 batch_inversion_accumulator = batch_inversion_accumulator.invert();
347 }
348
349 // Iterate backwards through the points, computing pairwise affine additions; the addition results are stored
350 // in the latter half of the array. (The unsigned loop index wraps past zero, which ends the loop below.)
351 for (size_t i = (num_points)-2; i < num_points; i -= 2) {
352 points[i + 1].y *= batch_inversion_accumulator; // update accumulator
353 batch_inversion_accumulator *= points[i + 1].x;
354 points[i + 1].x = points[i + 1].y.sqr();
355 points[(i + num_points) >> 1].x = points[i + 1].x - (scratch_space[i >> 1]); // x3 = lambda_squared - x2
356 // - x1
357 // Memory bandwidth is a bit of a bottleneck here.
358 // There's probably a more elegant way of structuring our data so we don't need to do all of this
359 // prefetching
360 if (i >= 2) {
361 __builtin_prefetch(points + i - 2);
362 __builtin_prefetch(points + i - 1);
363 __builtin_prefetch(points + ((i + num_points - 2) >> 1));
364 __builtin_prefetch(scratch_space + ((i - 2) >> 1));
365 }
366 points[i].x -= points[(i + num_points) >> 1].x;
367 points[i].x *= points[i + 1].y;
368 points[(i + num_points) >> 1].y = points[i].x - points[i].y;
369 }
370}
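
// [Editor's note - illustrative sketch, not part of scalar_multiplication.cpp]
// add_affine_points relies on the Montgomery batch-inversion trick: one field inversion plus roughly
// 3(n-1) multiplications replaces n inversions. Below is a standalone version of the trick, assuming a
// field type `Fq` exposing one(), invert(), operator* and operator*= (as Curve::BaseField above does)
// and that no input element is zero. `batch_invert` is a hypothetical helper, not the Barretenberg API.

#include <cstddef>
#include <vector>

template <typename Fq>
void batch_invert(Fq* elements, size_t n)
{
    std::vector<Fq> prefix(n);
    Fq acc = Fq::one();
    for (size_t i = 0; i < n; ++i) { // forward pass: prefix[i] = elements[0] * ... * elements[i-1]
        prefix[i] = acc;
        acc *= elements[i];
    }
    acc = acc.invert();              // the only inversion: 1 / (elements[0] * ... * elements[n-1])
    for (size_t i = n; i-- > 0;) {   // backward pass: peel off one inverse at a time
        Fq inverted = acc * prefix[i];
        acc *= elements[i];
        elements[i] = inverted;
    }
}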
371
379template <typename Curve>
380typename Curve::Element MSM<Curve>::small_pippenger_low_memory_with_transformed_scalars(MSMData& msm_data) noexcept
381{
382 std::span<const uint32_t>& nonzero_scalar_indices = msm_data.scalar_indices;
383 const size_t size = nonzero_scalar_indices.size();
384 const size_t bits_per_slice = get_optimal_log_num_buckets(size);
385 const size_t num_buckets = 1 << bits_per_slice;
386 JacobianBucketAccumulators bucket_data = JacobianBucketAccumulators(num_buckets);
387 Element round_output = Curve::Group::point_at_infinity;
388
389 const size_t num_rounds = numeric::ceil_div(NUM_BITS_IN_FIELD, bits_per_slice);
390
391 for (size_t i = 0; i < num_rounds; ++i) {
392 round_output = evaluate_small_pippenger_round(msm_data, i, bucket_data, round_output, bits_per_slice);
393 }
394 return round_output;
395}
396
404template <typename Curve>
405typename Curve::Element MSM<Curve>::pippenger_low_memory_with_transformed_scalars(MSMData& msm_data) noexcept
406{
407 const size_t msm_size = msm_data.scalar_indices.size();
408 const size_t bits_per_slice = get_optimal_log_num_buckets(msm_size);
409 const size_t num_buckets = 1 << bits_per_slice;
410
411 if (!use_affine_trick(msm_size, num_buckets)) {
412 return small_pippenger_low_memory_with_transformed_scalars(msm_data);
413 }
414 // TODO(https://github.com/AztecProtocol/barretenberg/issues/1452): Consider allowing this memory to persist rather
415 // than allocating/deallocating on every execution.
416 AffineAdditionData affine_data = AffineAdditionData();
417 BucketAccumulators bucket_data = BucketAccumulators(num_buckets);
418
419 Element round_output = Curve::Group::point_at_infinity;
420
421 const size_t num_rounds = numeric::ceil_div(NUM_BITS_IN_FIELD, bits_per_slice);
422 for (size_t i = 0; i < num_rounds; ++i) {
423 round_output = evaluate_pippenger_round(msm_data, i, affine_data, bucket_data, round_output, bits_per_slice);
424 }
425
426 return (round_output);
427}
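
// [Editor's note - not part of scalar_multiplication.cpp]
// The rounds above are combined Horner-style. Writing S_r for the bucket-accumulated sum of round r
// (most-significant slice first) and b = bits_per_slice, the loop computes
//   result = ( ... ((S_0 * 2^b + S_1) * 2^b + S_2) ... ) * 2^b + S_{R-1},
// where the final multiplication doubles only (NUM_BITS_IN_FIELD mod b) times when b does not divide
// NUM_BITS_IN_FIELD, so each slice ends up weighted by 2 raised to its low bit position.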
428
440template <typename Curve>
441typename Curve::Element MSM<Curve>::evaluate_small_pippenger_round(MSMData& msm_data,
442                                                                   const size_t round_index,
443                                                                   JacobianBucketAccumulators& bucket_data,
444                                                                   typename Curve::Element previous_round_output,
445                                                                   const size_t bits_per_slice) noexcept
446{
447 std::span<const uint32_t>& nonzero_scalar_indices = msm_data.scalar_indices;
448 std::span<const ScalarField>& scalars = msm_data.scalars;
449 std::span<const AffineElement>& points = msm_data.points;
450
451 const size_t size = nonzero_scalar_indices.size();
452 for (size_t i = 0; i < size; ++i) {
453 BB_ASSERT_LT(nonzero_scalar_indices[i], scalars.size());
454 uint32_t bucket_index = get_scalar_slice(scalars[nonzero_scalar_indices[i]], round_index, bits_per_slice);
455 BB_ASSERT_LT(bucket_index, static_cast<uint32_t>(1 << bits_per_slice));
456 if (bucket_index > 0) {
457 // do this check because we do not reset bucket_data.buckets after each round
458 // (i.e. not necessarily at infinity)
459 if (bucket_data.bucket_exists.get(bucket_index)) {
460 bucket_data.buckets[bucket_index] += points[nonzero_scalar_indices[i]];
461 } else {
462 bucket_data.buckets[bucket_index] = points[nonzero_scalar_indices[i]];
463 bucket_data.bucket_exists.set(bucket_index, true);
464 }
465 }
466 }
467 Element round_output;
468 round_output.self_set_infinity();
469 round_output = accumulate_buckets(bucket_data);
470 bucket_data.bucket_exists.clear();
471 Element result = previous_round_output;
472 const size_t num_rounds = numeric::ceil_div(NUM_BITS_IN_FIELD, bits_per_slice);
473 size_t num_doublings = ((round_index == num_rounds - 1) && (NUM_BITS_IN_FIELD % bits_per_slice != 0))
474 ? NUM_BITS_IN_FIELD % bits_per_slice
475 : bits_per_slice;
476 for (size_t i = 0; i < num_doublings; ++i) {
477 result.self_dbl();
478 }
479
480 result += round_output;
481 return result;
482}
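
// [Editor's note - illustrative sketch, not part of scalar_multiplication.cpp]
// A textbook version of the bucket reduction that accumulate_buckets (defined outside this listing)
// performs conceptually: sum over j of j * buckets[j], computed with two group additions per bucket via a
// running sum. `Element::infinity()` and the function name are assumptions made for illustration only.

#include <cstddef>
#include <vector>

template <typename Element>
Element reduce_buckets_textbook(const std::vector<Element>& buckets)
{
    Element running_sum = Element::infinity(); // sum of buckets[j .. num_buckets-1]
    Element result = Element::infinity();      // accumulates the running sum once per index
    for (size_t j = buckets.size(); j-- > 1;) {
        running_sum += buckets[j];
        result += running_sum;                 // after the loop, buckets[j] has been added j times in total
    }
    return result;
}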
483
496template <typename Curve>
497typename Curve::Element MSM<Curve>::evaluate_pippenger_round(MSMData& msm_data,
498                                                             const size_t round_index,
499                                                             AffineAdditionData& affine_data,
500                                                             BucketAccumulators& bucket_data,
501                                                             typename Curve::Element previous_round_output,
502                                                             const size_t bits_per_slice) noexcept
503{
504 std::span<const uint32_t>& scalar_indices = msm_data.scalar_indices; // indices of nonzero scalars
505 std::span<const ScalarField>& scalars = msm_data.scalars;
506 std::span<const AffineElement>& points = msm_data.points;
507 std::span<uint64_t>& round_schedule = msm_data.point_schedule;
508 const size_t size = scalar_indices.size();
509
510 // Construct a "round schedule". Each entry describes:
511 // 1. low 32 bits: which bucket index do we add the point into? (bucket index = slice value)
512 // 2. high 32 bits: which point index do we source the point from?
513 for (size_t i = 0; i < size; ++i) {
514 BB_ASSERT_LT(scalar_indices[i], scalars.size());
515 round_schedule[i] = get_scalar_slice(scalars[scalar_indices[i]], round_index, bits_per_slice);
516 round_schedule[i] += (static_cast<uint64_t>(scalar_indices[i]) << 32ULL);
517 }
518 // Sort our point schedules based on their bucket values. This reduces memory traffic in the next step of the algorithm
519 const size_t num_zero_entries = scalar_multiplication::process_buckets_count_zero_entries(
520 &round_schedule[0], size, static_cast<uint32_t>(bits_per_slice));
521 BB_ASSERT_LTE(num_zero_entries, size);
522 const size_t round_size = size - num_zero_entries;
523
524 Element round_output;
525 round_output.self_set_infinity();
526
527 if (round_size > 0) {
528 std::span<uint64_t> point_schedule(&round_schedule[num_zero_entries], round_size);
529 // Iterate through our point schedule and add points into corresponding buckets
530 consume_point_schedule(point_schedule, points, affine_data, bucket_data, 0, 0);
531 round_output = accumulate_buckets(bucket_data);
532 bucket_data.bucket_exists.clear();
533 }
534
535 Element result = previous_round_output;
536 const size_t num_rounds = numeric::ceil_div(NUM_BITS_IN_FIELD, bits_per_slice);
537 size_t num_doublings = ((round_index == num_rounds - 1) && (NUM_BITS_IN_FIELD % bits_per_slice != 0))
538 ? NUM_BITS_IN_FIELD % bits_per_slice
539 : bits_per_slice;
540 for (size_t i = 0; i < num_doublings; ++i) {
541 result.self_dbl();
542 }
543
544 result += round_output;
545 return result;
546}
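
// [Editor's note - illustrative sketch, not part of scalar_multiplication.cpp]
// The 64-bit point-schedule encoding used above, written out as hypothetical helpers:
// bucket index in the low 32 bits, point index in the high 32 bits.

#include <cstdint>

static inline uint64_t make_schedule_entry(uint32_t point_index, uint32_t bucket_index)
{
    return (static_cast<uint64_t>(point_index) << 32) | bucket_index;
}
static inline uint32_t schedule_bucket(uint64_t entry) { return static_cast<uint32_t>(entry & 0xFFFFFFFFULL); }
static inline uint32_t schedule_point(uint64_t entry) { return static_cast<uint32_t>(entry >> 32); }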
547
560template <typename Curve>
561void MSM<Curve>::consume_point_schedule(std::span<const uint64_t> point_schedule,
562                                        std::span<const AffineElement> points,
563                                        AffineAdditionData& affine_data,
564                                        BucketAccumulators& bucket_data,
565                                        size_t num_input_points_processed,
566                                        size_t num_queued_affine_points) noexcept
567{
568
569 size_t point_it = num_input_points_processed;
570 size_t affine_input_it = num_queued_affine_points;
571 // N.B. points and point_schedule MAY HAVE DIFFERENT SIZES
572 // We source the number of actual points we work on from the point schedule
573 size_t num_points = point_schedule.size();
574 auto& bucket_accumulator_exists = bucket_data.bucket_exists;
575 auto& affine_addition_scratch_space = affine_data.points_to_add;
576 auto& bucket_accumulators = bucket_data.buckets;
577 auto& affine_addition_output_bucket_destinations = affine_data.addition_result_bucket_destinations;
578 auto& scalar_scratch_space = affine_data.scalar_scratch_space;
579 auto& output_point_schedule = affine_data.addition_result_bucket_destinations;
580 AffineElement null_location{};
581 // We do memory prefetching; `prefetch_max` ensures we do not prefetch past the end of our containers
582 size_t prefetch_max = (num_points - 32);
583 if (num_points < 32) {
584 prefetch_max = 0;
585 }
586 size_t end = num_points - 1;
587 if (num_points == 0) {
588 end = 0;
589 }
590
591 // Step 1: Fill up `affine_addition_scratch_space` with up to AffineAdditionData::BATCH_SIZE/2 independent additions
592 while (((affine_input_it + 1) < AffineAdditionData::BATCH_SIZE) && (point_it < end)) {
593
594 // we prefetchin'
595 if ((point_it < prefetch_max) && ((point_it & 0x0f) == 0)) {
596 for (size_t i = 16; i < 32; ++i) {
597 __builtin_prefetch(&points[(point_schedule[point_it + i] >> 32ULL)]);
598 }
599 }
600
601 // We do some branchless programming here to minimize instruction pipeline flushes
602 // TODO(@zac-williamson, cc @ludamad) check these ternary operators are not branching!
603 // We are iterating through our points and can come across the following scenarios:
604 // 1: The next 2 points in `point_schedule` belong to the *same* bucket
605 // (happy path - can put both points into affine_addition_scratch_space)
606 // 2: The next 2 points have different bucket destinations AND point_schedule[point_it].bucket contains a point
607 // (happyish path - we can put points[lhs_schedule] and buckets[lhs_bucket] into
608 // affine_addition_scratch_space)
609 // 3: The next 2 points have different bucket destinations AND point_schedule[point_it].bucket is empty
610 // We cache points[lhs_schedule] into buckets[lhs_bucket]
611 // We iterate `point_it` by 2 (case 1), or by 1 (case 2 or 3). The number of points we add into
612 // `affine_addition_scratch_space` is 2 (case 1 or 2) or 0 (case 3).
613 uint64_t lhs_schedule = point_schedule[point_it];
614 uint64_t rhs_schedule = point_schedule[point_it + 1];
615 size_t lhs_bucket = static_cast<size_t>(lhs_schedule) & 0xFFFFFFFF;
616 size_t rhs_bucket = static_cast<size_t>(rhs_schedule) & 0xFFFFFFFF;
617 size_t lhs_point = static_cast<size_t>(lhs_schedule >> 32);
618 size_t rhs_point = static_cast<size_t>(rhs_schedule >> 32);
619
620 bool has_bucket_accumulator = bucket_accumulator_exists.get(lhs_bucket);
621 bool buckets_match = lhs_bucket == rhs_bucket;
622 bool do_affine_add = buckets_match || has_bucket_accumulator;
623
624 const AffineElement* lhs_source = &points[lhs_point];
625 const AffineElement* rhs_source = buckets_match ? &points[rhs_point] : &bucket_accumulators[lhs_bucket];
626
627 // either two points are set to be added (point to point or point into bucket accumulator), or lhs is stored in
628 // the bucket and rhs is temporarily ignored
629 AffineElement* lhs_destination =
630 do_affine_add ? &affine_addition_scratch_space[affine_input_it] : &bucket_accumulators[lhs_bucket];
631 AffineElement* rhs_destination =
632 do_affine_add ? &affine_addition_scratch_space[affine_input_it + 1] : &null_location;
633
634 // if performing an affine add, set the destination bucket corresponding to the addition result
635 uint64_t& source_bucket_destination = affine_addition_output_bucket_destinations[affine_input_it >> 1];
636 source_bucket_destination = do_affine_add ? lhs_bucket : source_bucket_destination;
637
638 // unconditional writes. No if statements here.
639 *lhs_destination = *lhs_source;
640 *rhs_destination = *rhs_source;
641
642 // indicate whether bucket_accumulators[lhs_bucket] will contain a point after this iteration
643 bucket_accumulator_exists.set(
644 lhs_bucket,
645 (has_bucket_accumulator && buckets_match) || /* bucket has an accumulator and it's not being used in the current add */
646 !do_affine_add); /* lhs point is cached into the bucket */
647
648 affine_input_it += do_affine_add ? 2 : 0;
649 point_it += (do_affine_add && buckets_match) ? 2 : 1;
650 }
651 // We have to handle the last point as an edge case so that we don't overflow the bounds of `point_schedule`. If the
652 // bucket accumulator exists, we add the point to it; otherwise the point simply becomes the bucket accumulator.
653 if (point_it == num_points - 1) {
654 uint64_t lhs_schedule = point_schedule[point_it];
655 size_t lhs_bucket = static_cast<size_t>(lhs_schedule) & 0xFFFFFFFF;
656 size_t lhs_point = static_cast<size_t>(lhs_schedule >> 32);
657 bool has_bucket_accumulator = bucket_accumulator_exists.get(lhs_bucket);
658
659 if (has_bucket_accumulator) { // point is added to its bucket accumulator
660 affine_addition_scratch_space[affine_input_it] = points[lhs_point];
661 affine_addition_scratch_space[affine_input_it + 1] = bucket_accumulators[lhs_bucket];
662 bucket_accumulator_exists.set(lhs_bucket, false);
663 affine_addition_output_bucket_destinations[affine_input_it >> 1] = lhs_bucket;
664 affine_input_it += 2;
665 point_it += 1;
666 } else { // otherwise, cache the point into the bucket
667 BB_ASSERT_LT(lhs_point, points.size());
668 bucket_accumulators[lhs_bucket] = points[lhs_point];
669 bucket_accumulator_exists.set(lhs_bucket, true);
670 point_it += 1;
671 }
672 }
673
674 // Now that we have populated `affine_addition_scratch_space`,
675 // compute `num_affine_points_to_add` independent additions using the Affine trick
676 size_t num_affine_points_to_add = affine_input_it;
677 if (num_affine_points_to_add >= 2) {
678 add_affine_points(&affine_addition_scratch_space[0], num_affine_points_to_add, &scalar_scratch_space[0]);
679 }
680 // `add_affine_points` stores the result in the top-half of the used scratch space
681 G1* affine_output = &affine_addition_scratch_space[0] + (num_affine_points_to_add / 2);
682
683 // Process the addition outputs.
684 // We either feed the addition outputs back into affine_addition_scratch_space for further additions,
685 // or, if a bucket has no more pending additions, we store the addition output in its bucket accumulator.
686 size_t new_scratch_space_it = 0;
687 size_t affine_output_it = 0;
688 size_t num_affine_output_points = num_affine_points_to_add / 2;
689 // This algorithm is equivalent to the one we used to populate `affine_addition_scratch_space` from the point
690 // schedule, however here we source points from a different location (the addition results)
691 while ((affine_output_it < (num_affine_output_points - 1)) && (num_affine_output_points > 0)) {
692 size_t lhs_bucket = static_cast<size_t>(affine_addition_output_bucket_destinations[affine_output_it]);
693 size_t rhs_bucket = static_cast<size_t>(affine_addition_output_bucket_destinations[affine_output_it + 1]);
694 BB_ASSERT_LT(lhs_bucket, bucket_accumulator_exists.size());
695
696 bool has_bucket_accumulator = bucket_accumulator_exists.get(lhs_bucket);
697 bool buckets_match = (lhs_bucket == rhs_bucket);
698 bool do_affine_add = buckets_match || has_bucket_accumulator;
699
700 const AffineElement* lhs_source = &affine_output[affine_output_it];
701 const AffineElement* rhs_source =
702 buckets_match ? &affine_output[affine_output_it + 1] : &bucket_accumulators[lhs_bucket];
703
704 AffineElement* lhs_destination =
705 do_affine_add ? &affine_addition_scratch_space[new_scratch_space_it] : &bucket_accumulators[lhs_bucket];
706 AffineElement* rhs_destination =
707 do_affine_add ? &affine_addition_scratch_space[new_scratch_space_it + 1] : &null_location;
708
709 uint64_t& source_bucket_destination = output_point_schedule[new_scratch_space_it >> 1];
710 source_bucket_destination = do_affine_add ? lhs_bucket : source_bucket_destination;
711
712 *lhs_destination = *lhs_source;
713 *rhs_destination = *rhs_source;
714
715 bucket_accumulator_exists.set(lhs_bucket, (has_bucket_accumulator && buckets_match) || !do_affine_add);
716 new_scratch_space_it += do_affine_add ? 2 : 0;
717 affine_output_it += (do_affine_add && buckets_match) ? 2 : 1;
718 }
719 // perform final iteration as edge case so we don't overflow `affine_addition_output_bucket_destinations`
720 if (affine_output_it == (num_affine_output_points - 1)) {
721
722 size_t lhs_bucket = static_cast<size_t>(affine_addition_output_bucket_destinations[affine_output_it]);
723
724 bool has_bucket_accumulator = bucket_accumulator_exists.get(lhs_bucket);
725 if (has_bucket_accumulator) {
726 BB_ASSERT_LT(new_scratch_space_it + 1, affine_addition_scratch_space.size());
727 BB_ASSERT_LT(lhs_bucket, bucket_accumulators.size());
728 BB_ASSERT_LT(new_scratch_space_it >> 1, output_point_schedule.size());
729 affine_addition_scratch_space[new_scratch_space_it] = affine_output[affine_output_it];
730 affine_addition_scratch_space[new_scratch_space_it + 1] = bucket_accumulators[lhs_bucket];
731 bucket_accumulator_exists.set(lhs_bucket, false);
732 output_point_schedule[new_scratch_space_it >> 1] = lhs_bucket;
733 new_scratch_space_it += 2;
734 affine_output_it += 1;
735 } else {
736 bucket_accumulators[lhs_bucket] = affine_output[affine_output_it];
737 bucket_accumulator_exists.set(lhs_bucket, true);
738 affine_output_it += 1;
739 }
740 }
741
742 // If we have not finished iterating over the point schedule,
743 // OR we have affine additions to perform in the scratch space, continue
744 if (point_it < num_points || new_scratch_space_it != 0) {
745 consume_point_schedule(point_schedule, points, affine_data, bucket_data, point_it, new_scratch_space_it);
746 }
747}
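
// [Editor's note - illustrative sketch, not part of scalar_multiplication.cpp]
// The branchless pattern used throughout consume_point_schedule, reduced to its essence: both sides of
// each conditional are materialized as pointers and the write happens unconditionally, with unwanted
// writes redirected into a local sink. `conditional_store` is a hypothetical helper for illustration.

template <typename T>
inline void conditional_store(bool condition, T* wanted_destination, const T& value)
{
    T discard{};                                                  // sink for the "do nothing" case
    T* destination = condition ? wanted_destination : &discard;   // pointer select, typically a cmov
    *destination = value;                                         // unconditional write
}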
748
762template <typename Curve>
763std::vector<typename Curve::AffineElement> MSM<Curve>::batch_multi_scalar_mul(
764    std::vector<std::span<const AffineElement>>& points,
765    std::vector<std::span<ScalarField>>& scalars,
766 bool handle_edge_cases) noexcept
767{
768 BB_ASSERT_EQ(points.size(), scalars.size());
769 const size_t num_msms = points.size();
770
771 std::vector<std::vector<uint32_t>> msm_scalar_indices;
772 std::vector<ThreadWorkUnits> thread_work_units = get_work_units(scalars, msm_scalar_indices);
773 const size_t num_cpus = get_num_cpus();
774 std::vector<std::vector<std::pair<Element, size_t>>> thread_msm_results(num_cpus);
775 BB_ASSERT_EQ(thread_work_units.size(), num_cpus);
776
777 // Once we have our work units, each thread can independently evaluate its assigned msms
778 parallel_for(num_cpus, [&](size_t thread_idx) {
779 if (!thread_work_units[thread_idx].empty()) {
780 const std::vector<MSMWorkUnit>& msms = thread_work_units[thread_idx];
781 std::vector<std::pair<Element, size_t>>& msm_results = thread_msm_results[thread_idx];
782 for (const MSMWorkUnit& msm : msms) {
783 std::span<const ScalarField> work_scalars = scalars[msm.batch_msm_index];
784 std::span<const AffineElement> work_points = points[msm.batch_msm_index];
785 std::span<const uint32_t> work_indices =
786 std::span<const uint32_t>{ &msm_scalar_indices[msm.batch_msm_index][msm.start_index], msm.size };
787 std::vector<uint64_t> point_schedule(msm.size);
788 MSMData msm_data(work_scalars, work_points, work_indices, std::span<uint64_t>(point_schedule));
789 Element msm_result = Curve::Group::point_at_infinity;
790 constexpr size_t SINGLE_MUL_THRESHOLD = 16;
791 if (msm.size < SINGLE_MUL_THRESHOLD) {
792 msm_result = small_mul<Curve>(work_scalars, work_points, msm_data.scalar_indices, msm.size);
793 } else {
794 // The non-affine method implicitly handles Weierstrass addition edge cases (e.g. adding a point to itself).
795 // Note: not as fast! Use the unsafe version if you know all input base points are linearly independent.
796 if (handle_edge_cases) {
797 msm_result = small_pippenger_low_memory_with_transformed_scalars(msm_data);
798 } else {
799 msm_result = pippenger_low_memory_with_transformed_scalars(msm_data);
800 }
801 }
802 msm_results.push_back(std::make_pair(msm_result, msm.batch_msm_index));
803 }
804 }
805 });
806
807 // Accumulate results. This part needs to be single-threaded, but the amount of work done here should be small.
808 // TODO(@zac-williamson) check this? E.g. if we are doing a 2^16 MSM with 256 threads this single-threaded part
809 // will be painful.
810 std::vector<Element> results(num_msms);
811 for (Element& ele : results) {
812 ele.self_set_infinity();
813 }
814 for (const auto& single_thread_msm_results : thread_msm_results) {
815 for (const std::pair<Element, size_t>& result : single_thread_msm_results) {
816 results[result.second] += result.first;
817 }
818 }
819 Element::batch_normalize(&results[0], num_msms);
820
821 std::vector<AffineElement> affine_results;
822 for (const auto& ele : results) {
823 affine_results.emplace_back(AffineElement(ele.x, ele.y));
824 }
825
826 // Convert our scalars back into Montgomery form so that, from the caller's perspective, they are unchanged
827 for (auto& msm_scalars : scalars) {
828 parallel_for_range(msm_scalars.size(), [&](size_t start, size_t end) {
829 for (size_t i = start; i < end; ++i) {
830 msm_scalars[i].self_to_montgomery_form();
831 }
832 });
833 }
834 return affine_results;
835}
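
// [Editor's note - illustrative usage sketch, not part of scalar_multiplication.cpp]
// Hypothetical call site, assuming caller-owned vectors points_a/scalars_a and points_b/scalars_b of
// matching sizes over curve::BN254. The scalar spans are mutable because the implementation converts
// the scalars out of (and back into) Montgomery form in place.

using Curve = bb::curve::BN254;
std::vector<std::span<const Curve::AffineElement>> point_sets{ points_a, points_b };
std::vector<std::span<Curve::ScalarField>> scalar_sets{ scalars_a, scalars_b };
std::vector<Curve::AffineElement> results =
    bb::scalar_multiplication::MSM<Curve>::batch_multi_scalar_mul(point_sets, scalar_sets,
                                                                  /*handle_edge_cases=*/true);
// results[0] == sum_i scalars_a[i] * points_a[i]; results[1] likewise for the second pair.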
836
845template <typename Curve>
846typename Curve::AffineElement MSM<Curve>::msm(std::span<const typename Curve::AffineElement> points,
847                                              PolynomialSpan<const typename Curve::ScalarField> _scalars,
848                                              bool handle_edge_cases) noexcept
849{
850 if (_scalars.size() == 0) {
851 return Curve::Group::affine_point_at_infinity;
852 }
853 BB_ASSERT_GTE(points.size(), _scalars.start_index + _scalars.size());
854
855 // Unfortunately we need to remove const on this data type to prevent duplicating _scalars (which is typically
856 // large). We need to convert `_scalars` out of Montgomery form for the MSM, and we convert the scalars back
857 // into Montgomery form at the end of the algorithm. NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
858 // TODO(https://github.com/AztecProtocol/barretenberg/issues/1449): handle const correctness.
859 ScalarField* scalars = const_cast<ScalarField*>(&_scalars[_scalars.start_index]);
860
861 std::vector<std::span<const AffineElement>> pp{ points.subspan(_scalars.start_index) };
862 std::vector<std::span<ScalarField>> ss{ std::span<ScalarField>(scalars, _scalars.size()) };
863 AffineElement result = batch_multi_scalar_mul(pp, ss, handle_edge_cases)[0];
864 return result;
865}
866
867template <typename Curve>
868typename Curve::Element pippenger(PolynomialSpan<const typename Curve::ScalarField> scalars,
869                                  std::span<const typename Curve::AffineElement> points,
870                                  [[maybe_unused]] bool handle_edge_cases) noexcept
871{
872 return MSM<Curve>::msm(points, scalars, handle_edge_cases);
873}
874
875template <typename Curve>
876typename Curve::Element pippenger_unsafe(PolynomialSpan<const typename Curve::ScalarField> scalars,
877                                         std::span<const typename Curve::AffineElement> points) noexcept
878{
879    return pippenger<Curve>(scalars, points, /*handle_edge_cases=*/false);
880}
881
882template curve::Grumpkin::Element pippenger<curve::Grumpkin>(PolynomialSpan<const curve::Grumpkin::ScalarField> scalars,
883    std::span<const curve::Grumpkin::AffineElement> points,
884    bool handle_edge_cases = true) noexcept;
885
886template curve::Grumpkin::Element pippenger_unsafe<curve::Grumpkin>(
887 PolynomialSpan<const curve::Grumpkin::ScalarField> scalars, std::span<const curve::Grumpkin::AffineElement> points);
888
889template curve::BN254::Element pippenger<curve::BN254>(PolynomialSpan<const curve::BN254::ScalarField> scalars,
890 std::span<const curve::BN254::AffineElement> points,
891 bool handle_edge_cases = true);
892
893template curve::BN254::Element pippenger_unsafe<curve::BN254>(PolynomialSpan<const curve::BN254::ScalarField> scalars,
894 std::span<const curve::BN254::AffineElement> points);
895
896} // namespace bb::scalar_multiplication
897
898template class bb::scalar_multiplication::MSM<bb::curve::Grumpkin>;
899template class bb::scalar_multiplication::MSM<bb::curve::BN254>;
#define BB_ASSERT_GTE(left, right,...)
Definition assert.hpp:101
#define BB_ASSERT_GT(left, right,...)
Definition assert.hpp:87
#define BB_ASSERT_EQ(actual, expected,...)
Definition assert.hpp:59
#define BB_ASSERT_LTE(left, right,...)
Definition assert.hpp:129
#define BB_ASSERT_LT(left, right,...)
Definition assert.hpp:115
typename Group::element Element
Definition grumpkin.hpp:55
typename Group::affine_element AffineElement
Definition grumpkin.hpp:56
static Element evaluate_small_pippenger_round(MSMData &msm_data, const size_t round_index, JacobianBucketAccumulators &bucket_data, Element previous_round_output, const size_t bits_per_slice) noexcept
Evaluate a single Pippenger round when we do not use the Affine trick.
static Element small_pippenger_low_memory_with_transformed_scalars(MSMData &msm_data) noexcept
Top-level Pippenger algorithm where number of points is small and we are not using the Affine trick.
static Element pippenger_low_memory_with_transformed_scalars(MSMData &msm_data) noexcept
Top-level Pippenger algorithm.
static uint32_t get_scalar_slice(const ScalarField &scalar, size_t round, size_t normal_slice_size) noexcept
Given a scalar that is NOT in Montgomery form, extract a slice_size-bit chunk.
static bool use_affine_trick(const size_t num_points, const size_t num_buckets) noexcept
Given a number of points and an optimal bucket size, should we use the affine trick?
static void add_affine_points(AffineElement *points, const size_t num_points, typename Curve::BaseField *scratch_space) noexcept
static size_t get_optimal_log_num_buckets(const size_t num_points) noexcept
For a given number of points, compute the optimal Pippenger bucket size.
static std::vector< AffineElement > batch_multi_scalar_mul(std::vector< std::span< const AffineElement > > &points, std::vector< std::span< ScalarField > > &scalars, bool handle_edge_cases=true) noexcept
Compute multiple multi-scalar multiplications.
static void consume_point_schedule(std::span< const uint64_t > point_schedule, std::span< const AffineElement > points, AffineAdditionData &affine_data, BucketAccumulators &bucket_data, size_t num_input_points_processed, size_t num_queued_affine_points) noexcept
Given a list of points and target buckets to add into, perform required group operations.
static std::vector< ThreadWorkUnits > get_work_units(std::vector< std::span< ScalarField > > &scalars, std::vector< std::vector< uint32_t > > &msm_scalar_indices) noexcept
Split a multiple multi-scalar-multiplication into equal units of work that can be processed by thread...
static Element evaluate_pippenger_round(MSMData &msm_data, const size_t round_index, AffineAdditionData &affine_data, BucketAccumulators &bucket_data, Element previous_round_output, const size_t bits_per_slice) noexcept
Evaluate a single Pippenger round where we use the affine trick.
static void transform_scalar_and_get_nonzero_scalar_indices(std::span< typename Curve::ScalarField > scalars, std::vector< uint32_t > &consolidated_indices) noexcept
Convert scalar out of Montgomery form. Populate consolidated_indices with nonzero scalar indices.
typename Curve::ScalarField ScalarField
typename Curve::AffineElement AffineElement
ssize_t offset
Definition engine.cpp:36
constexpr T ceil_div(const T &numerator, const T &denominator)
Computes the ceiling of the division of two integral types.
Definition general.hpp:23
Curve::Element pippenger(PolynomialSpan< const typename Curve::ScalarField > scalars, std::span< const typename Curve::AffineElement > points, bool handle_edge_cases) noexcept
Curve::Element small_mul(std::span< const typename Curve::ScalarField > &scalars, std::span< const typename Curve::AffineElement > &points, std::span< const uint32_t > scalar_indices, size_t range) noexcept
Fallback method for very small numbers of input points.
size_t process_buckets_count_zero_entries(uint64_t *wnaf_entries, const size_t num_entries, const uint32_t num_bits) noexcept
Curve::Element pippenger_unsafe(PolynomialSpan< const typename Curve::ScalarField > scalars, std::span< const typename Curve::AffineElement > points) noexcept
Entry point for Barretenberg command-line interface.
size_t get_num_cpus()
Definition thread.hpp:12
void parallel_for(size_t num_iterations, const std::function< void(size_t)> &func)
Definition thread.cpp:72
void parallel_for_range(size_t num_points, const std::function< void(size_t, size_t)> &func, size_t no_multhreading_if_less_or_equal)
Split a loop into several loops running in parallel.
Definition thread.cpp:102
STL namespace.
constexpr decltype(auto) get(::tuplet::tuple< T... > &&t) noexcept
Definition tuple.hpp:13
Curve::Element Element
static constexpr field one()
constexpr field invert() const noexcept
BB_INLINE constexpr field sqr() const noexcept
Temp data structure, one created per thread!
Temp data structure, one created per thread!
MSMWorkUnit describes an MSM that may be part of a larger MSM.
void throw_or_abort(std::string const &err)