19 const std::span<G1>& points,
const std::vector<size_t>& sequence_counts)
24 std::span<Fq> scratch_space(scratch_space_vector);
27 auto [addition_sequences_, sequence_tags] = construct_thread_data(points, sequence_counts, scratch_space);
28 auto& addition_sequences = addition_sequences_;
30 const size_t num_threads = addition_sequences.size();
31 parallel_for(num_threads, [&](
size_t thread_idx) { batched_affine_add_in_place(addition_sequences[thread_idx]); });
35 size_t prev_tag = std::numeric_limits<size_t>::max();
36 for (
auto [sequences, tags] :
zip_view(addition_sequences, sequence_tags)) {
38 for (
size_t i = 0; i < sequences.sequence_counts.size(); ++i) {
39 if (tags[i] == prev_tag) {
40 reduced_points.back() = reduced_points.back() + sequences.points[i];
42 reduced_points.emplace_back(sequences.points[i]);
48 return reduced_points;
53 const std::span<G1>& points,
const std::vector<size_t>& sequence_counts,
const std::span<Fq>& scratch_space)
56 std::vector<size_t> sequence_endpoints;
57 size_t total_count = 0;
58 for (
const auto& count : sequence_counts) {
60 sequence_endpoints.emplace_back(total_count);
63 if (points.size() != total_count) {
64 throw_or_abort(
"Number of input points does not match sequence counts!");
68 const size_t MIN_POINTS_PER_THREAD = 1 << 14;
69 const size_t total_num_points = points.size();
70 const size_t optimal_threads = total_num_points / MIN_POINTS_PER_THREAD;
73 const size_t base_thread_size = total_num_points / num_threads;
74 const size_t leftover_size = total_num_points % num_threads;
75 std::vector<size_t> thread_sizes(num_threads, base_thread_size);
76 for (
size_t i = 0; i < leftover_size; ++i) {
83 std::vector<size_t> thread_endpoints;
84 size_t point_index = 0;
85 for (
auto size : thread_sizes) {
86 thread_points.push_back(points.subspan(point_index, size));
87 thread_scratch_space.push_back(scratch_space.subspan(point_index, size));
89 thread_endpoints.emplace_back(point_index);
95 std::vector<size_t> all_endpoints;
96 all_endpoints.reserve(thread_endpoints.size() + sequence_endpoints.size());
97 all_endpoints.insert(all_endpoints.end(), thread_endpoints.begin(), thread_endpoints.end());
98 all_endpoints.insert(all_endpoints.end(), sequence_endpoints.begin(), sequence_endpoints.end());
99 std::sort(all_endpoints.begin(), all_endpoints.end());
100 auto last = std::unique(all_endpoints.begin(), all_endpoints.end());
101 all_endpoints.erase(last, all_endpoints.end());
104 size_t prev_endpoint = 0;
105 size_t thread_idx = 0;
106 size_t sequence_idx = 0;
109 for (
auto& endpoint : all_endpoints) {
110 size_t chunk_size = endpoint - prev_endpoint;
111 thread_sequence_counts[thread_idx].emplace_back(chunk_size);
112 thread_sequence_tags[thread_idx].emplace_back(sequence_idx);
113 if (endpoint == thread_endpoints[thread_idx]) {
116 if (endpoint == sequence_endpoints[sequence_idx]) {
119 prev_endpoint = endpoint;
122 if (thread_sequence_counts.size() != thread_points.size()) {
128 for (
size_t i = 0; i < num_threads; ++i) {
129 addition_sequences.push_back(
130 AdditionSequences{ thread_sequence_counts[i], thread_points[i], thread_scratch_space[i] });
133 return { addition_sequences, thread_sequence_tags };
140 auto points = add_sequences.
points;
144 size_t total_num_pairs{ 0 };
145 for (
auto& count : sequence_counts) {
146 total_num_pairs += count >> 1;
151 std::span<Fq> denominators = add_sequences.
scratch_space.subspan(0, total_num_pairs);
152 std::span<Fq> differences = add_sequences.
scratch_space.subspan(total_num_pairs, 2 * total_num_pairs);
156 size_t point_idx = 0;
158 for (
auto& count : sequence_counts) {
159 const auto num_pairs = count >> 1;
160 for (
size_t j = 0; j < num_pairs; ++j) {
162 const auto& x1 = points[point_idx++].x;
163 const auto& x2 = points[point_idx++].x;
169 differences[pair_idx] = diff;
172 denominators[pair_idx++] = accumulator;
176 point_idx += (count & 0x01ULL);
180 Fq inverse = accumulator.invert();
183 for (
size_t i = 0; i < total_num_pairs; ++i) {
184 size_t idx = total_num_pairs - 1 - i;
185 denominators[idx] *= inverse;
186 inverse *= differences[idx];
195 const size_t num_points = add_sequences.
points.size();
196 if (num_points == 0 || num_points == 1) {
201 std::span<Fq> denominators = batch_compute_point_addition_slope_inverses(add_sequences);
203 auto points = add_sequences.
points;
207 size_t point_idx = 0;
208 size_t result_point_idx = 0;
210 bool more_additions =
false;
211 for (
auto& count : sequence_counts) {
212 const auto num_pairs = count >> 1;
213 const bool overflow =
static_cast<bool>(count & 0x01ULL);
215 for (
size_t j = 0; j < num_pairs; ++j) {
216 const auto& point_1 = points[point_idx++];
217 const auto& point_2 = points[point_idx++];
218 const auto& denominator = denominators[pair_idx++];
219 auto& result = points[result_point_idx++];
221 result = affine_add_with_denominator(point_1, point_2, denominator);
225 points[result_point_idx++] = points[point_idx++];
229 const uint32_t updated_sequence_count =
static_cast<uint32_t
>(num_pairs) +
static_cast<uint32_t
>(overflow);
230 count = updated_sequence_count;
233 more_additions = more_additions || updated_sequence_count > 1;
237 if (more_additions) {
238 const size_t updated_point_count = result_point_idx;
239 std::span<G1> updated_points(&points[0], updated_point_count);
240 return batched_affine_add_in_place(