File: BasicThreadPool.txt



/************************************************************************

MODULE: BasicThreadPool

SUMMARY:

A simple thread pool class BasicThreadPool, as well as some higher-level macros
which facilitate simple parallel for loops.


***************************************************************************/


// ********************** Simple parallel for loops **************************
// 
// We begin with a description of the higher-level macros for writing simple
// parallel for loops.  These facilities are activated only when NTL is
// configured with NTL_THREAD_BOOST=on (which implies NTL_THREADS=on).
// However, code that uses these facilities should still compile and run
// correctly even when NTL_THREAD_BOOST=off, or even when NTL_THREADS=off, so
// this is the simplest way to write parallel for loops across a range of
// compile-time and run-time environments.  Note that if NTL_THREADS=on, C++11
// features are required, but when NTL_THREADS=off, these features are not
// required, so the code should compile on older C++ compilers.
// 
// Here is a simple recipe for writing a parallel for loop.
// 
// At the start of program execution, your program should execute

   SetNumThreads(nt);

// You can choose nt to be any positive integer, but for best results, it
// should correspond to the number of available cores on your machine.
// [NOTE: if NTL_THREAD_BOOST=off, this function is still defined, but does
// nothing.]
// 
// Now consider the following routine:

   void mul(ZZ *x, const ZZ *a, const ZZ *b, long n) 
   {
      for (long i = 0; i < n; i++)
         mul(x[i], a[i], b[i]);
   }

// We can parallelize it as follows:

   void mul(ZZ *x, const ZZ *a, const ZZ *b, long n) 
   {
      NTL_EXEC_RANGE(n, first, last) 

         for (long i = first; i < last; i++)
            mul(x[i], a[i], b[i]);

      NTL_EXEC_RANGE_END
   }

// NTL_EXEC_RANGE and NTL_EXEC_RANGE_END are macros that just "do the right
// thing".  If there are nt threads available, the interval [0..n) will be
// partitioned into (up to)  nt subintervals, and a different thread will be
// used to process each subinterval. You still have to write the for loop
// yourself: the macro just declares and initializes variables "first" and
// "last" (or whatever you want to call them) of type long that represent the
// subinterval [first..last) to be processed by one thread.
// 
// Note that the current thread participates as one of the nt available
// threads, and that the current thread will wait for all participating threads
// to finish their task before proceeding.
// 
// Within the "body" of this construct, you can freely reference any variables
// that are visible at this point.  This is implemented using the C++ lambda
// feature (capturing all variables by reference).
// 
// This construct will still work even if threads are disabled, in which case
// it runs single-threaded with first=0 and last=n.
// 
// Note that the code within the EXEC_RANGE body could call other routines that
// themselves attempt to execute an EXEC_RANGE: if this happens, the latter
// EXEC_RANGE will detect this and run single-threaded.
// 
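// For example, here is a sketch (not part of NTL itself) in which the
// parallelized mul above is invoked from within another EXEC_RANGE body:
// the EXEC_RANGE inside mul detects that the pool is already active and
// processes its whole range on the calling thread.

   void mul_by_blocks(ZZ *x, const ZZ *a, const ZZ *b, long n) 
   {
      NTL_EXEC_RANGE(n, first, last) 

         // the EXEC_RANGE inside mul runs single-threaded here
         mul(x + first, a + first, b + first, last - first);

      NTL_EXEC_RANGE_END
   }
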
// You may wish to do other things within the EXEC_RANGE body than just execute
// a loop.  One thing you may want to do is to declare variables.  Another
// thing you may want to do is set up a local context for a ZZ_p modulus (or
// other type of modulus).  Here is an example of doing this:


   void mul(ZZ_p *x, const ZZ_p *a, const ZZ_p *b, long n) 
   {
      ZZ_pContext context;
      context.save();

      NTL_EXEC_RANGE(n, first, last) 
       
         context.restore();

         for (long i = first; i < last; i++)
            mul(x[i], a[i], b[i]);

      NTL_EXEC_RANGE_END
   }


// Another useful function is AvailableThreads(), which will return the number
// of available threads.  If threads or thread boosting is not enabled, this
// will return 1.  Even if thread boosting is enabled, this may return 1 if,
// for whatever reason, the thread pool is not available for use (for example,
// SetNumThreads was never called, or the thread pool is already active).
// 
// A lower-level set of tools is available, which allows you to simply run a
// specified number of threads.  Assuming nt <= AvailableThreads(), the code

   NTL_EXEC_INDEX(nt, index)

      ... code ...

   NTL_EXEC_INDEX_END

// will execute the body on nt different threads, each with a unique index in
// the range [0..nt).  A variable named "index" (or whatever name you specify)
// of type long will hold the given index.
// 
// This tool is useful if you need to manage memory a bit more carefully.  For
// example, the following code will compute an inner product using all
// available threads:

   ZZ InnerProd(const ZZ *a, const ZZ *b, long n) 
   {
      PartitionInfo pinfo(n);

      long cnt = pinfo.NumIntervals();

      Vec<ZZ> acc;
      acc.SetLength(cnt);

      NTL_EXEC_INDEX(cnt, index)

         long first, last;
         pinfo.interval(first, last, index);

         ZZ& sum = acc[index];
         sum = 0;
         for (long i = first; i < last; i++) 
            MulAddTo(sum, a[i], b[i]);

      NTL_EXEC_INDEX_END

      ZZ sum;
      sum = 0;
      for (long i = 0; i < cnt; i++)
         sum += acc[i];

      return sum;
   }

// This example also illustrates the class PartitionInfo, which is useful for
// partitioning a large interval into smaller intervals (it is used internally
// by EXEC_RANGE).  The constructor takes a single argument (in this example n)
// and computes a partition of [0..n) into nearly equally sized subintervals.
// The method NumIntervals() returns the number of subintervals, and the method
// interval(first, last, index) sets first and last according to the endpoints
// of the subinterval [first..last) with the given index.
// 
// So in this example, cnt threads will run, each accumulating a sum into a
// corresponding element of the vector acc, and afterwards, these elements are
// summed.
// 
// Note that if threads are not enabled or otherwise unavailable, the above
// code will compile and run correctly (just using one thread).
// 
// Finally, there is a "guarded" version of NTL_EXEC_RANGE called
// NTL_GEXEC_RANGE.  This allows one to dynamically "guard" against parallel
// execution. For example, on very small problems the runtime overhead of a
// parallel for loop may not be worthwhile, or in other situations parallel
// execution could cause incorrect behavior.  See below for details.


// ************************** Thread Pools ******************************
// 
// The above facilities are built on top of a more general thread pool class,
// which you may use for your own purposes.
//    
// You create a thread pool by constructing a BasicThreadPool object.  For
// example:

   long nthreads = 4;
   BasicThreadPool pool(nthreads);

// creates a thread pool of 4 threads.  These threads will exist until the
// destructor for pool is called.  
// 
// The simplest way to use a thread pool is as follows.  Suppose you have a
// task that consists of sz subtasks, indexed 0..sz-1.  Then you can write:

   pool.exec_range(sz, 
      [&](long first, long last) {
         for (long i = first; i < last; i++) {
            ... code to process subtask i ...
         }
      }
   );

// The second argument to exec_range is a C++11 "lambda".  The "[&]" indicates
// that all local variables in the calling context are captured by reference,
// so the lambda body can reference all visible local variables directly.
// C++11 provides other methods for capturing local variables.  The interval
// [0..sz) is partitioned into subintervals of the form [first..last), which
// are processed by the code in the supplied lambda.
// 
// A lower-level interface is also provided.  One can write:

   pool.exec_index(cnt,
      [&](long index) {
         ... code to be executed on the thread with the given index ...
      }
   );

// This will activate exactly cnt threads with indices 0..cnt-1, and execute
// the given code on each index.  The parameter cnt must not exceed nthreads,
// otherwise an error is raised.


// ====================================================================
// 
// NOTES:
// 
// When one activates a thread pool with nthreads threads, the *current* thread
// (the one activating the pool) will also participate in the computation.
// This means that the thread pool only contains nthreads-1 other threads.
// 
// If, during an activation, any thread throws an exception, it will be caught
// and rethrown in the activating thread when all the threads complete.  If
// more than one thread throws an exception, the first one that is caught is
// the one that is rethrown.
// 
// Methods are also provided for adding, deleting, and moving threads in and
// among thread pools.
// 
// If NTL_THREADS=off, the corresponding header file may be included, but the
// BasicThreadPool class is not defined.
//
// Unlike most classes in NTL, the BasicThreadPool is not relocatable and hence
// cannot be used in a Vec.  One should first wrap it in a pointer class, such
// as UniquePtr.
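// 
// For example, assuming NTL's UniquePtr::make and Vec::SetLength behave as
// documented in their own modules, a vector of pools might be set up as
// follows (a sketch, not part of NTL itself):

   Vec< UniquePtr<BasicThreadPool> > pools;
   pools.SetLength(2);
   pools[0].make(4);   // pool with 4 threads (current thread + 3 others)
   pools[1].make(2);   // pool with 2 threads (current thread + 1 other)
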



// class BasicThreadPool: provides basic functionality for thread pools

class BasicThreadPool {
private:

  BasicThreadPool(const BasicThreadPool&); // disabled
  void operator=(const BasicThreadPool&); // disabled

public:

  explicit
  BasicThreadPool(long nthreads);
  // creates a pool with nthreads threads, including the current thread
  // (so nthreads-1 other threads get created)

  template<class Fct>
  void exec_range(long sz, const Fct& fct); 
  // activate by range (see example usage above)

  template<class Fct>
  void exec_index(long cnt, const Fct& fct); 
  // activate by index (see example usage above)

  void add(long n = 1);
  // add n threads to the pool

  long NumThreads() const;
  // return number of threads (including current thread)

  void remove(long n = 1);
  // remove n threads from the pool
  
  void move(BasicThreadPool& other, long n = 1);
  // move n threads from other pool to this pool

  bool active() const;
  // indicates that an activation is in progress: invoking any of the methods
  // exec_index, exec_range, add, remove, move, or the destructor
  // while active will raise an error

  template<class Fct>
  static void relaxed_exec_range(BasicThreadPool *pool, long sz, const Fct& fct);
  // similar to pool->exec_range(sz, fct), but will still work even 
  // if !pool or pool->active(), using just the current thread

  template<class Fct>
  static void relaxed_exec_index(BasicThreadPool *pool, long cnt, const Fct& fct);
  // similar to pool->exec_index(cnt, fct), but will still work even 
  // if !pool or pool->active(), provided cnt <= 1, using just the current thread

};
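
// For example, a sketch (not part of NTL itself) of moving threads between
// two pools using the methods above:

   BasicThreadPool poolA(4);   // current thread + 3 other threads
   BasicThreadPool poolB(2);   // current thread + 1 other thread

   poolA.move(poolB);          // move one thread from poolB into poolA
   poolA.add();                // create one new thread in poolA
   poolA.remove(2);            // destroy two of poolA's threads
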




// THREAD BOOSTING FEATURES:

void SetNumThreads(long nt);
// convenience routine to set NTL's thread pool.
// If called more than once, the old thread pool is destroyed and
// replaced by a new one.
// If NTL_THREAD_BOOST=off, then this is still defined, but does nothing.

long AvailableThreads();
// Number of threads currently available to use in NTL's thread pool.  This is
// always at least 1 (for the current thread).  
// If NTL_THREAD_BOOST=off, then this is still defined, and always returns 1.

BasicThreadPool *GetThreadPool();
void ResetThreadPool(BasicThreadPool *pool = 0);
BasicThreadPool *ReleaseThreadPool();
// Routines to get and set NTL's thread pool.  The interfaces parallel NTL's
// UniquePtr class, and indeed, behind the scenes, NTL's thread pool is stored
// as a UniquePtr<BasicThreadPool>.
// These are only declared when NTL_THREAD_BOOST=on.  
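
// For example, when NTL_THREAD_BOOST=on, the mul routine shown earlier could
// also be written directly in terms of GetThreadPool and relaxed_exec_range
// (a sketch, not necessarily the macro's actual expansion):

   void mul(ZZ *x, const ZZ *a, const ZZ *b, long n) 
   {
      BasicThreadPool::relaxed_exec_range(GetThreadPool(), n,
         [&](long first, long last) {
            for (long i = first; i < last; i++)
               mul(x[i], a[i], b[i]);
         }
      );
   }
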


#define NTL_EXEC_RANGE(sz, first, last) ...
#define NTL_EXEC_RANGE_END ...
#define NTL_EXEC_INDEX(cnt, index) ...
#define NTL_EXEC_INDEX_END ...
// convenience macros to implement "parallel for loops" using NTL's thread
// pool.  See examples above for usage.  If NTL_THREAD_BOOST=off, then these
// are still defined, and code will run on a single thread.


#define NTL_GEXEC_RANGE(seq, sz, first, last) ...
#define NTL_GEXEC_RANGE_END ...
// "guarded" version of NTL_EXEC_RANGE: if seq evaluates to true, the code runs
// on a single thread.  This is useful in avoiding situations where the
// overhead of a parallel loop is too high.  If seq evaluates to the constant
// true, a good compiler will optimize code to run on a single thread, with no
// overhead.
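
// For example, a sketch (not part of NTL itself) that falls back to a single
// thread on small inputs (the threshold 32 is arbitrary):

   void mul(ZZ *x, const ZZ *a, const ZZ *b, long n) 
   {
      NTL_GEXEC_RANGE(n < 32, n, first, last) 

         for (long i = first; i < last; i++)
            mul(x[i], a[i], b[i]);

      NTL_GEXEC_RANGE_END
   }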

#define NTL_IMPORT(x) 
// To be used in conjunction with NTL_EXEC_RANGE and friends.  When
// NTL_THREAD_BOOST=on, this will copy the variable named x from the enclosing
// scope to a local copy.  This should only be used for types with cheap
// copies, such as scalars and pointers.  In some situations, this allows the
// compiler to optimize a bit more aggressively.  One or more of these may be
// placed right after an NTL_EXEC_RANGE.
// When NTL_THREAD_BOOST=off, this is still defined, and does nothing.
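
// For example, the mul routine shown earlier could import its pointer
// arguments (a sketch, not part of NTL itself):

   void mul(ZZ *x, const ZZ *a, const ZZ *b, long n) 
   {
      NTL_EXEC_RANGE(n, first, last) 
      NTL_IMPORT(x)
      NTL_IMPORT(a)
      NTL_IMPORT(b)

         for (long i = first; i < last; i++)
            mul(x[i], a[i], b[i]);

      NTL_EXEC_RANGE_END
   }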


// class PartitionInfo: A helper class to facilitate partitioning an interval
// into subintervals.  NOTE: this class is available, even when
// NTL_THREAD_BOOST=off.

class PartitionInfo {
public:

   explicit
   PartitionInfo(long sz, long nt = AvailableThreads()); 
   // partitions [0..sz) into at most nt subintervals.  sz may be 0 or
   // negative, in which case the number of subintervals is 0.

   long NumIntervals() const;
   // return the number of subintervals

   void interval(long& first, long& last, long i) const;
   // [first..last) is the ith interval, where i in [0..NumIntervals()).  No
   // range checking is performed.

};
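
// For example, a sketch (not part of NTL itself) that visits each
// subinterval of [0..100):

   PartitionInfo pinfo(100);

   for (long index = 0; index < pinfo.NumIntervals(); index++) {
      long first, last;
      pinfo.interval(first, last, index);
      // process the subinterval [first..last)
   }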