    std::vector<uint32_t>& consolidated_indices) noexcept
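        // Per-thread pass: each thread takes its slice of `scalars`, converts every scalar out of
        // Montgomery form in place, and records the indices of the nonzero ones. Zero scalars contribute
        // nothing to the MSM, so they are excluded from all subsequent work.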
        bool empty_thread = (thread_idx * scalars_per_thread >= scalars.size());
        bool last_thread = ((thread_idx + 1) * scalars_per_thread) >= scalars.size();
        const size_t start = thread_idx * scalars_per_thread;
        const size_t end = last_thread ? scalars.size() : (thread_idx + 1) * scalars_per_thread;
        if (!empty_thread) {
            std::vector<uint32_t>& thread_scalar_indices = thread_indices[thread_idx];
            thread_scalar_indices.reserve(end - start);
            for (size_t i = start; i < end; ++i) {
                auto& scalar = scalars[i];
                scalar.self_from_montgomery_form();
                bool is_zero =
                    (scalar.data[0] == 0) && (scalar.data[1] == 0) && (scalar.data[2] == 0) && (scalar.data[3] == 0);
                if (!is_zero) {
                    thread_scalar_indices.push_back(static_cast<uint32_t>(i));
                }
            }
        }
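    // Consolidation: total the per-thread counts, size the output vector once, then have each thread
    // copy its local index list into `consolidated_indices` at its exclusive-prefix offset.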
    size_t num_entries = 0;
    for (size_t i = 0; i < num_cpus; ++i) {
        num_entries += thread_indices[i].size();
    }
    consolidated_indices.resize(num_entries);

    size_t offset = 0;
    for (size_t i = 0; i < thread_idx; ++i) {
        offset += thread_indices[i].size();
    }
    for (size_t i = offset; i < offset + thread_indices[thread_idx].size(); ++i) {
        consolidated_indices[i] = thread_indices[thread_idx][i - offset];
    }
    std::vector<std::span<ScalarField>>& scalars, std::vector<std::vector<uint32_t>>& msm_scalar_indices) noexcept
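    // Work distribution: treat every nonzero scalar across all MSMs as one unit of work and split the
    // total as evenly as possible across threads. A large MSM may be split across several threads, and a
    // thread may receive pieces of several MSMs; each piece is an MSMWorkUnit (batch_msm_index,
    // start_index, size).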
    const size_t num_msms = scalars.size();
    msm_scalar_indices.resize(num_msms);
    for (size_t i = 0; i < num_msms; ++i) {
        transform_scalar_and_get_nonzero_scalar_indices(scalars[i], msm_scalar_indices[i]);
    }

    size_t total_work = 0;
    for (const auto& indices : msm_scalar_indices) {
        total_work += indices.size();
    }

    size_t work_per_thread = total_work / num_threads;
    size_t work_of_last_thread = total_work - (work_per_thread * (num_threads - 1));
    // Degenerate case: more threads than units of work, so no MSM needs to be split across threads.
    if (num_threads > total_work) {
        for (size_t i = 0; i < num_msms; ++i) {
            work_units[i].push_back(
                MSMWorkUnit{ .batch_msm_index = i, .start_index = 0, .size = msm_scalar_indices[i].size() });
        }
    }
    size_t thread_accumulated_work = 0;
    size_t current_thread_idx = 0;
    for (size_t i = 0; i < num_msms; ++i) {
        size_t msm_work = msm_scalar_indices[i].size();
        size_t msm_size = msm_work;
        while (msm_work > 0) {
            const size_t total_thread_work =
                (current_thread_idx == num_threads - 1) ? work_of_last_thread : work_per_thread;
            const size_t available_thread_work = total_thread_work - thread_accumulated_work;

            if (available_thread_work >= msm_work) {
                // The remainder of this MSM fits into the current thread.
                work_units[current_thread_idx].push_back(MSMWorkUnit{
                    .batch_msm_index = i,
                    .start_index = msm_size - msm_work,
                    .size = msm_work,
                });
                thread_accumulated_work += msm_work;
                msm_work = 0;
            } else {
                // Fill the current thread with as much of this MSM as fits, then move on to the next thread.
                work_units[current_thread_idx].push_back(MSMWorkUnit{
                    .batch_msm_index = i,
                    .start_index = msm_size - msm_work,
                    .size = available_thread_work,
                });
                msm_work -= available_thread_work;
                current_thread_idx++;
                thread_accumulated_work = 0;
            }
        }
    }
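// Batched affine addition of adjacent point pairs. For each pair (points[i], points[i + 1]) the slope
// lambda = (y2 - y1) / (x2 - x1) is needed, but instead of paying one field inversion per pair, the code
// uses Montgomery's batch-inversion trick: a forward pass accumulates the running product of the
// denominators (x2 - x1), a single inversion is performed, and a backward pass unwinds the product to
// recover each individual inverse. The pairwise sums are written into the second half of the `points`
// array; `scratch_space` caches x1 + x2 for each pair.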
                       const size_t num_points,

    Fq batch_inversion_accumulator = Fq::one();

    // Forward pass: compute (x2 - x1) and (y2 - y1) for each pair and fold the denominators into the
    // running accumulator product.
    for (size_t i = 0; i < num_points; i += 2) {
        scratch_space[i >> 1] = points[i].x + points[i + 1].x; // x1 + x2
        points[i + 1].x -= points[i].x;                        // x2 - x1
        points[i + 1].y -= points[i].y;                        // y2 - y1
        points[i + 1].y *= batch_inversion_accumulator;        // (y2 - y1) * accumulator_old
        batch_inversion_accumulator *= (points[i + 1].x);
    }

    if (batch_inversion_accumulator == 0) {
        throw_or_abort("attempted to invert zero in add_affine_points");
    }
    batch_inversion_accumulator = batch_inversion_accumulator.invert();
    // Backward pass: unwind the accumulator to obtain each pair's 1/(x2 - x1), then complete the affine
    // addition formulas. The loop relies on size_t wrap-around to terminate after i == 0.
    for (size_t i = (num_points)-2; i < num_points; i -= 2) {
        points[i + 1].y *= batch_inversion_accumulator;   // lambda = (y2 - y1) / (x2 - x1)
        batch_inversion_accumulator *= points[i + 1].x;
        points[i + 1].x = points[i + 1].y.sqr();          // lambda^2
        points[(i + num_points) >> 1].x = points[i + 1].x - (scratch_space[i >> 1]); // x3 = lambda^2 - (x1 + x2)

        // Prefetch the next pair and the next output slot to hide memory latency.
        __builtin_prefetch(points + i - 2);
        __builtin_prefetch(points + i - 1);
        __builtin_prefetch(points + ((i + num_points - 2) >> 1));
        __builtin_prefetch(scratch_space + ((i - 2) >> 1));

        points[i].x -= points[(i + num_points) >> 1].x;   // x1 - x3
        points[i].x *= points[i + 1].y;                   // lambda * (x1 - x3)
        points[(i + num_points) >> 1].y = points[i].x - points[i].y; // y3 = lambda * (x1 - x3) - y1
    }
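// One Pippenger round (edge-case-tolerant path): slice out this round's window of bits from every
// nonzero scalar, add each point directly into the bucket selected by its slice (a bitfield tracks which
// buckets are occupied), then collapse the buckets into a single round output and fold it into the
// running result after the appropriate number of doublings.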
                                             const size_t round_index,
                                             const size_t bits_per_slice) noexcept
    std::span<const uint32_t>& nonzero_scalar_indices = msm_data.scalar_indices;
    std::span<const ScalarField>& scalars = msm_data.scalars;
    std::span<const AffineElement>& points = msm_data.points;

    const size_t size = nonzero_scalar_indices.size();
    for (size_t i = 0; i < size; ++i) {
        BB_ASSERT_LT(nonzero_scalar_indices[i], scalars.size());
        uint32_t bucket_index = get_scalar_slice(scalars[nonzero_scalar_indices[i]], round_index, bits_per_slice);
        BB_ASSERT_LT(bucket_index, static_cast<uint32_t>(1 << bits_per_slice));
        // Bucket 0 would be scaled by zero, so those points can be skipped entirely.
        if (bucket_index > 0) {
            if (bucket_data.bucket_exists.get(bucket_index)) {
                bucket_data.buckets[bucket_index] += points[nonzero_scalar_indices[i]];
            } else {
                bucket_data.buckets[bucket_index] = points[nonzero_scalar_indices[i]];
                bucket_data.bucket_exists.set(bucket_index, true);
            }
        }
    }

    Element round_output;
    round_output.self_set_infinity();
    round_output = accumulate_buckets(bucket_data);
    bucket_data.bucket_exists.clear();

    Element result = previous_round_output;
    // The final round may cover a smaller window if the field bit-length is not a multiple of the slice width.
    size_t num_doublings = ((round_index == num_rounds - 1) && (NUM_BITS_IN_FIELD % bits_per_slice != 0))
                               ? NUM_BITS_IN_FIELD % bits_per_slice
                               : bits_per_slice;
    for (size_t i = 0; i < num_doublings; ++i) {
        result = result.dbl();
    }
    result += round_output;
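// One Pippenger round (point-schedule path): instead of adding into buckets one point at a time, build a
// 64-bit schedule entry per nonzero scalar (bucket index in the low 32 bits, point index in the high 32
// bits), group the schedule by bucket with the zero-bucket entries pushed to the front, and hand the
// nonzero portion to consume_point_schedule, which performs the bucket accumulation via batched affine
// additions.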
                                             const size_t round_index,
                                             const size_t bits_per_slice) noexcept
    std::span<const uint32_t>& scalar_indices = msm_data.scalar_indices;
    std::span<const ScalarField>& scalars = msm_data.scalars;
    std::span<const AffineElement>& points = msm_data.points;
    std::span<uint64_t>& round_schedule = msm_data.point_schedule;
    const size_t size = scalar_indices.size();

    // Low 32 bits: this round's bucket index. High 32 bits: index of the scalar/point.
    for (size_t i = 0; i < size; ++i) {
        round_schedule[i] = get_scalar_slice(scalars[scalar_indices[i]], round_index, bits_per_slice);
        round_schedule[i] += (static_cast<uint64_t>(scalar_indices[i]) << 32ULL);
    }
    // Group the schedule by bucket and count the bucket-0 entries (the callee's name is assumed here).
    const size_t num_zero_entries = process_buckets_count_zero_entries(
        &round_schedule[0], size, static_cast<uint32_t>(bits_per_slice));
    const size_t round_size = size - num_zero_entries;

    Element round_output;
    round_output.self_set_infinity();

    if (round_size > 0) {
        // Skip the zero-bucket entries at the front of the grouped schedule.
        std::span<uint64_t> point_schedule(&round_schedule[num_zero_entries], round_size);
        consume_point_schedule(point_schedule, points, affine_data, bucket_data, 0, 0);
        round_output = accumulate_buckets(bucket_data);
        bucket_data.bucket_exists.clear();
    }

    Element result = previous_round_output;
    size_t num_doublings = ((round_index == num_rounds - 1) && (NUM_BITS_IN_FIELD % bits_per_slice != 0))
                               ? NUM_BITS_IN_FIELD % bits_per_slice
                               : bits_per_slice;
    for (size_t i = 0; i < num_doublings; ++i) {
        result = result.dbl();
    }
    result += round_output;
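// Core bucket-accumulation routine. The point schedule is grouped by bucket, so points destined for the
// same bucket are adjacent. Each step inspects a pair of adjacent schedule entries: if they share a
// bucket, or the bucket already holds an accumulator, the two operands are queued into scratch space for
// one large batched affine addition; otherwise the point simply becomes the bucket's accumulator. Once
// the scratch space is full (or the schedule is exhausted), all queued pairs are added with a single
// batch inversion, and the outputs, still grouped by bucket, are fed back through the same pairing
// logic. The function recurses until the schedule is consumed and no queued additions remain.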
                                       size_t num_input_points_processed,
                                       size_t num_queued_affine_points) noexcept
    size_t point_it = num_input_points_processed;
    size_t affine_input_it = num_queued_affine_points;

    size_t num_points = point_schedule.size();
    auto& bucket_accumulator_exists = bucket_data.bucket_exists;
    auto& affine_addition_scratch_space = affine_data.points_to_add;
    auto& bucket_accumulators = bucket_data.buckets;
    auto& affine_addition_output_bucket_destinations = affine_data.addition_result_bucket_destinations;
    auto& scalar_scratch_space = affine_data.scalar_scratch_space;
    auto& output_point_schedule = affine_data.addition_result_bucket_destinations;

    // Guard the prefetch window and the loop bound against unsigned underflow on small inputs.
    size_t prefetch_max = (num_points - 32);
    if (num_points < 32) {
        prefetch_max = 0;
    }
    size_t end = num_points - 1;
    if (num_points == 0) {
        end = 0;
    }

    // Phase 1: walk the schedule and pair up points that belong in the same bucket.
    while (((affine_input_it + 1) < AffineAdditionData::BATCH_SIZE) && (point_it < end)) {
        // Periodically prefetch upcoming points to hide memory latency.
        if ((point_it < prefetch_max) && ((point_it & 0x0f) == 0)) {
            for (size_t i = 16; i < 32; ++i) {
                __builtin_prefetch(&points[(point_schedule[point_it + i] >> 32ULL)]);
            }
        }
        uint64_t lhs_schedule = point_schedule[point_it];
        uint64_t rhs_schedule = point_schedule[point_it + 1];
        size_t lhs_bucket = static_cast<size_t>(lhs_schedule) & 0xFFFFFFFF;
        size_t rhs_bucket = static_cast<size_t>(rhs_schedule) & 0xFFFFFFFF;
        size_t lhs_point = static_cast<size_t>(lhs_schedule >> 32);
        size_t rhs_point = static_cast<size_t>(rhs_schedule >> 32);

        bool has_bucket_accumulator = bucket_accumulator_exists.get(lhs_bucket);
        bool buckets_match = lhs_bucket == rhs_bucket;
        bool do_affine_add = buckets_match || has_bucket_accumulator;

        // If the pair shares a bucket, add the two points to each other; otherwise, if the bucket already
        // has an accumulator, add the lhs point into it. Either way the operands are queued for the
        // batched affine addition. If neither holds, the lhs point simply becomes the bucket accumulator.
        const AffineElement* lhs_source = &points[lhs_point];
        const AffineElement* rhs_source = buckets_match ? &points[rhs_point] : &bucket_accumulators[lhs_bucket];

        AffineElement* lhs_destination =
            do_affine_add ? &affine_addition_scratch_space[affine_input_it] : &bucket_accumulators[lhs_bucket];
        AffineElement* rhs_destination =
            do_affine_add ? &affine_addition_scratch_space[affine_input_it + 1] : &null_location;

        uint64_t& source_bucket_destination = affine_addition_output_bucket_destinations[affine_input_it >> 1];
        source_bucket_destination = do_affine_add ? lhs_bucket : source_bucket_destination;

        *lhs_destination = *lhs_source;
        *rhs_destination = *rhs_source;

        // The bucket keeps (or gains) an accumulator only if it is not being consumed by this addition.
        bucket_accumulator_exists.set(lhs_bucket, (has_bucket_accumulator && buckets_match) || !do_affine_add);

        affine_input_it += do_affine_add ? 2 : 0;
        point_it += (do_affine_add && buckets_match) ? 2 : 1;
    }
    // Handle the final schedule entry (it has no partner to pair with).
    if (point_it == num_points - 1) {
        uint64_t lhs_schedule = point_schedule[point_it];
        size_t lhs_bucket = static_cast<size_t>(lhs_schedule) & 0xFFFFFFFF;
        size_t lhs_point = static_cast<size_t>(lhs_schedule >> 32);
        bool has_bucket_accumulator = bucket_accumulator_exists.get(lhs_bucket);

        if (has_bucket_accumulator) {
            // Queue (point, existing accumulator) for the batched addition.
            affine_addition_scratch_space[affine_input_it] = points[lhs_point];
            affine_addition_scratch_space[affine_input_it + 1] = bucket_accumulators[lhs_bucket];
            bucket_accumulator_exists.set(lhs_bucket, false);
            affine_addition_output_bucket_destinations[affine_input_it >> 1] = lhs_bucket;
            affine_input_it += 2;
        } else {
            // No accumulator yet: the point becomes the bucket accumulator.
            bucket_accumulators[lhs_bucket] = points[lhs_point];
            bucket_accumulator_exists.set(lhs_bucket, true);
        }
        point_it += 1;
    }
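    // Phase 2: all queued (lhs, rhs) pairs sit side by side in the scratch space. Add them with a single
    // batched inversion; add_affine_points writes the pairwise sums into the upper half of the region it
    // is given, in the same order as the bucket destinations recorded above.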
    size_t num_affine_points_to_add = affine_input_it;
    if (num_affine_points_to_add >= 2) {
        add_affine_points(&affine_addition_scratch_space[0], num_affine_points_to_add, &scalar_scratch_space[0]);
    }
    G1* affine_output = &affine_addition_scratch_space[0] + (num_affine_points_to_add / 2);

    // Phase 3: pair up the addition outputs (still grouped by bucket) exactly as in phase 1, re-filling
    // the scratch space for the next batch.
    size_t new_scratch_space_it = 0;
    size_t affine_output_it = 0;
    size_t num_affine_output_points = num_affine_points_to_add / 2;
    while ((affine_output_it < (num_affine_output_points - 1)) && (num_affine_output_points > 0)) {
        size_t lhs_bucket = static_cast<size_t>(affine_addition_output_bucket_destinations[affine_output_it]);
        size_t rhs_bucket = static_cast<size_t>(affine_addition_output_bucket_destinations[affine_output_it + 1]);
        BB_ASSERT_LT(lhs_bucket, bucket_accumulator_exists.size());

        bool has_bucket_accumulator = bucket_accumulator_exists.get(lhs_bucket);
        bool buckets_match = (lhs_bucket == rhs_bucket);
        bool do_affine_add = buckets_match || has_bucket_accumulator;

        const AffineElement* lhs_source = &affine_output[affine_output_it];
        const AffineElement* rhs_source =
            buckets_match ? &affine_output[affine_output_it + 1] : &bucket_accumulators[lhs_bucket];

        AffineElement* lhs_destination =
            do_affine_add ? &affine_addition_scratch_space[new_scratch_space_it] : &bucket_accumulators[lhs_bucket];
        AffineElement* rhs_destination =
            do_affine_add ? &affine_addition_scratch_space[new_scratch_space_it + 1] : &null_location;

        uint64_t& source_bucket_destination = output_point_schedule[new_scratch_space_it >> 1];
        source_bucket_destination = do_affine_add ? lhs_bucket : source_bucket_destination;

        *lhs_destination = *lhs_source;
        *rhs_destination = *rhs_source;

        bucket_accumulator_exists.set(lhs_bucket, (has_bucket_accumulator && buckets_match) || !do_affine_add);
        new_scratch_space_it += do_affine_add ? 2 : 0;
        affine_output_it += (do_affine_add && buckets_match) ? 2 : 1;
    }
    // Handle the final, unpaired addition output (if any).
    if (affine_output_it == (num_affine_output_points - 1)) {
        size_t lhs_bucket = static_cast<size_t>(affine_addition_output_bucket_destinations[affine_output_it]);
        bool has_bucket_accumulator = bucket_accumulator_exists.get(lhs_bucket);
        if (has_bucket_accumulator) {
            BB_ASSERT_LT(new_scratch_space_it + 1, affine_addition_scratch_space.size());
            BB_ASSERT_LT(new_scratch_space_it >> 1, output_point_schedule.size());
            affine_addition_scratch_space[new_scratch_space_it] = affine_output[affine_output_it];
            affine_addition_scratch_space[new_scratch_space_it + 1] = bucket_accumulators[lhs_bucket];
            bucket_accumulator_exists.set(lhs_bucket, false);
            output_point_schedule[new_scratch_space_it >> 1] = lhs_bucket;
            new_scratch_space_it += 2;
            affine_output_it += 1;
        } else {
            bucket_accumulators[lhs_bucket] = affine_output[affine_output_it];
            bucket_accumulator_exists.set(lhs_bucket, true);
            affine_output_it += 1;
        }
    }
    // If schedule entries remain, or new pairs were queued, recurse to process them.
    if (point_it < num_points || new_scratch_space_it != 0) {
        consume_point_schedule(point_schedule, points, affine_data, bucket_data, point_it, new_scratch_space_it);
    }
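// Top-level batch MSM driver. Each worker thread walks its assigned work units, evaluates each one with
// either a naive small multiplication or one of the Pippenger variants, and tags every partial result
// with the index of the MSM it belongs to so the per-thread partial results can be summed per MSM below.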
    bool handle_edge_cases) noexcept
    const size_t num_msms = points.size();

        if (!thread_work_units[thread_idx].empty()) {
            for (const MSMWorkUnit& msm : thread_work_units[thread_idx]) {
                std::span<const ScalarField> work_scalars = scalars[msm.batch_msm_index];
                std::span<const AffineElement> work_points = points[msm.batch_msm_index];
                std::span<const uint32_t> work_indices =
                    std::span<const uint32_t>{ &msm_scalar_indices[msm.batch_msm_index][msm.start_index], msm.size };
                std::vector<uint64_t> point_schedule(msm.size);
                MSMData msm_data(work_scalars, work_points, work_indices, std::span<uint64_t>(point_schedule));
                Element msm_result = Curve::Group::point_at_infinity;
                constexpr size_t SINGLE_MUL_THRESHOLD = 16;
                if (msm.size < SINGLE_MUL_THRESHOLD) {
                    // Tiny MSMs: naive double-and-add is cheaper than setting up Pippenger.
                    msm_result = small_mul<Curve>(work_scalars, work_points, msm_data.scalar_indices, msm.size);
                } else {
                    if (handle_edge_cases) {
                        msm_result = small_pippenger_low_memory_with_transformed_scalars(msm_data);
                    } else {
                        msm_result = pippenger_low_memory_with_transformed_scalars(msm_data);
                    }
                }
                msm_results.push_back(std::make_pair(msm_result, msm.batch_msm_index));
            }
        }
    // Sum the per-thread partial results into one projective result per MSM.
    std::vector<Element> results(num_msms);
    for (Element& ele : results) {
        ele.self_set_infinity();
    }
    for (const auto& single_thread_msm_results : thread_msm_results) {
        for (const auto& result : single_thread_msm_results) {
            results[result.second] += result.first;
        }
    }
    Element::batch_normalize(&results[0], num_msms);

    std::vector<AffineElement> affine_results;
    for (const auto& ele : results) {
        // After batch_normalize the z-coordinate is 1, so (x, y) is the affine representation.
        affine_results.emplace_back(ele.x, ele.y);
    }

    // The input scalars were taken out of Montgomery form at the start; restore them before returning.
    for (auto& msm_scalars : scalars) {
        // Each worker thread restores its slice [start, end) of msm_scalars.
        for (size_t i = start; i < end; ++i) {
            msm_scalars[i].self_to_montgomery_form();
        }
    }
    return affine_results;