Barretenberg
The ZK-SNARK library at the core of Aztec
Loading...
Searching...
No Matches
thread.cpp
Go to the documentation of this file.
1#include "thread.hpp"
2#include "log.hpp"
3
39namespace bb {
40// 64 core aws r5.
41// pippenger run: pippenger_bench/1048576
42// coset_fft run: coset_fft_bench_parallel/4194304
43// proof run: 2m gate ultraplonk. average of 5.
44
45// pippenger: 179ms
46// coset_fft: 54776us
47// proof: 11.33s
48void parallel_for_omp(size_t num_iterations, const std::function<void(size_t)>& func);
49
50// pippenger: 163ms
51// coset_fft: 59993us
52// proof: 11.11s
53void parallel_for_moody(size_t num_iterations, const std::function<void(size_t)>& func);
54
55// pippenger: 154ms
56// coset_fft: 92997us
57// proof: 10.84s
58void parallel_for_spawning(size_t num_iterations, const std::function<void(size_t)>& func);
59
60// pippenger: 178ms
61// coset_fft: 70207us
62// proof: 11.55s
63void parallel_for_queued(size_t num_iterations, const std::function<void(size_t)>& func);
64
65// pippenger: 152ms
66// coset_fft: 56658us
67// proof: 11.28s
68void parallel_for_atomic_pool(size_t num_iterations, const std::function<void(size_t)>& func);
69
70void parallel_for_mutex_pool(size_t num_iterations, const std::function<void(size_t)>& func);
71
72void parallel_for(size_t num_iterations, const std::function<void(size_t)>& func)
73{
74#ifdef NO_MULTITHREADING
75 for (size_t i = 0; i < num_iterations; ++i) {
76 func(i);
77 }
78#else
79#ifdef OMP_MULTITHREADING
80 parallel_for_omp(num_iterations, func);
81#else
82 // parallel_for_spawning(num_iterations, func);
83 // parallel_for_moody(num_iterations, func);
84 // parallel_for_atomic_pool(num_iterations, func);
85 parallel_for_mutex_pool(num_iterations, func);
86 // parallel_for_queued(num_iterations, func);
87#endif
88#endif
89}
90
102void parallel_for_range(size_t num_points,
103 const std::function<void(size_t, size_t)>& func,
104 size_t no_multhreading_if_less_or_equal)
105{
106 if (num_points <= no_multhreading_if_less_or_equal) {
107 func(0, num_points);
108 return;
109 }
110 // Get number of cpus we can split into
111 const size_t num_cpus = get_num_cpus();
112
113 // Compute the size of a single chunk
114 const size_t chunk_size = (num_points / num_cpus) + (num_points % num_cpus == 0 ? 0 : 1);
115 // Parallelize over chunks
116 parallel_for(num_cpus, [num_points, chunk_size, &func](size_t chunk_index) {
117 // If num_points is small, sometimes we need fewer CPUs
118 if (chunk_size * chunk_index > num_points) {
119 return;
120 }
121 // Compute the current chunk size (can differ in case it's the last chunk)
122 size_t current_chunk_size = std::min(num_points - (chunk_size * chunk_index), chunk_size);
123 if (current_chunk_size == 0) {
124 return;
125 }
126 size_t start = chunk_index * chunk_size;
127 size_t end = chunk_index * chunk_size + current_chunk_size;
128 func(start, end);
129 });
130};
131
132void parallel_for_heuristic(size_t num_points,
133 const std::function<void(size_t, size_t, size_t)>& func,
134 size_t heuristic_cost)
135{
136 // We take the maximum observed parallel_for cost (388 us) and round it up.
137 // The goals of these checks is to evade significantly (10x) increasing processing time for small workloads. So we
138 // can accept not triggering parallel_for if the workload would become faster by half a millisecond for medium
139 // workloads
140 constexpr size_t PARALLEL_FOR_COST = 400000;
141 // Get number of cpus we can split into
142 const size_t num_cpus = get_num_cpus();
143
144 // Compute the size of a single chunk
145 const size_t chunk_size = (num_points / num_cpus) + (num_points % num_cpus == 0 ? 0 : 1);
146
147 // Compute the cost of all operations done by other threads
148 const size_t offset_cost = (num_points - chunk_size) * heuristic_cost;
149
150 // If starting parallel for is longer than computing, just compute
151 if (offset_cost < PARALLEL_FOR_COST) {
152 func(0, num_points, 0);
153 return;
154 }
155 // Parallelize over chunks
156 parallel_for(num_cpus, [num_points, chunk_size, &func](size_t chunk_index) {
157 // If num_points is small, sometimes we need fewer CPUs
158 if (chunk_size * chunk_index > num_points) {
159 return;
160 }
161 // Compute the current chunk size (can differ in case it's the last chunk)
162 size_t current_chunk_size = std::min(num_points - (chunk_size * chunk_index), chunk_size);
163 if (current_chunk_size == 0) {
164 return;
165 }
166 size_t start = chunk_index * chunk_size;
167 size_t end = chunk_index * chunk_size + current_chunk_size;
168
169 func(start, end, chunk_index);
170 });
171};
172
173MultithreadData calculate_thread_data(size_t num_iterations, size_t min_iterations_per_thread)
174{
175 size_t num_threads = calculate_num_threads(num_iterations, min_iterations_per_thread);
176 const size_t thread_size = num_iterations / num_threads;
177
178 // Cumpute the index bounds for each thread
179 std::vector<size_t> start(num_threads);
180 std::vector<size_t> end(num_threads);
181 for (size_t thread_idx = 0; thread_idx < num_threads; ++thread_idx) {
182 start[thread_idx] = thread_idx * thread_size;
183 end[thread_idx] = (thread_idx == num_threads - 1) ? num_iterations : (thread_idx + 1) * thread_size;
184 }
185
186 return MultithreadData{ num_threads, start, end };
187}
188
199size_t calculate_num_threads(size_t num_iterations, size_t min_iterations_per_thread)
200{
201 size_t max_num_threads = get_num_cpus(); // number of available threads
202 size_t desired_num_threads = num_iterations / min_iterations_per_thread;
203 size_t num_threads = std::min(desired_num_threads, max_num_threads); // fewer than max if justified
204 num_threads = num_threads > 0 ? num_threads : 1; // ensure num_threads is at least 1
205 return num_threads;
206}
207
215size_t calculate_num_threads_pow2(size_t num_iterations, size_t min_iterations_per_thread)
216{
217 size_t max_num_threads = get_num_cpus_pow2(); // number of available threads (power of 2)
218 size_t desired_num_threads = num_iterations / min_iterations_per_thread;
219 desired_num_threads = static_cast<size_t>(1ULL << numeric::get_msb(desired_num_threads));
220 size_t num_threads = std::min(desired_num_threads, max_num_threads); // fewer than max if justified
221 num_threads = num_threads > 0 ? num_threads : 1; // ensure num_threads is at least 1
222 return num_threads;
223}
224} // namespace bb
constexpr T get_msb(const T in)
Definition get_msb.hpp:47
Entry point for Barretenberg command-line interface.
void parallel_for_mutex_pool(size_t num_iterations, const std::function< void(size_t)> &func)
MultithreadData calculate_thread_data(size_t num_iterations, size_t min_iterations_per_thread)
Calculates number of threads and index bounds for each thread.
Definition thread.cpp:173
void parallel_for_queued(size_t num_iterations, const std::function< void(size_t)> &func)
size_t get_num_cpus_pow2()
Definition thread.hpp:18
size_t get_num_cpus()
Definition thread.hpp:12
void parallel_for_moody(size_t num_iterations, const std::function< void(size_t)> &func)
size_t calculate_num_threads(size_t num_iterations, size_t min_iterations_per_thread)
calculates number of threads to create based on minimum iterations per thread
Definition thread.cpp:199
size_t calculate_num_threads_pow2(size_t num_iterations, size_t min_iterations_per_thread)
calculates number of threads to create based on minimum iterations per thread, guaranteed power of 2
Definition thread.cpp:215
void parallel_for_atomic_pool(size_t num_iterations, const std::function< void(size_t)> &func)
void parallel_for_heuristic(size_t num_points, const std::function< void(size_t, size_t, size_t)> &func, size_t heuristic_cost)
Split a loop into several loops running in parallel based on operations in 1 iteration.
Definition thread.cpp:132
void parallel_for_spawning(size_t num_iterations, const std::function< void(size_t)> &func)
void parallel_for(size_t num_iterations, const std::function< void(size_t)> &func)
Definition thread.cpp:72
void parallel_for_omp(size_t num_iterations, const std::function< void(size_t)> &func)
void parallel_for_range(size_t num_points, const std::function< void(size_t, size_t)> &func, size_t no_multhreading_if_less_or_equal)
Split a loop into several loops running in parallel.
Definition thread.cpp:102