File: quantlibbenchmark.cpp

/* -*- mode: c++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */

/*
 Copyright (C) 2006, 2008, 2010, 2018, 2023 Klaus Spanderen
 Copyright (C) 2024 Jacques du Toit

 This file is part of QuantLib, a free-software/open-source library
 for financial quantitative analysts and developers - http://quantlib.org/

 QuantLib is free software: you can redistribute it and/or modify it
 under the terms of the QuantLib license.  You should have received a
 copy of the license along with this program; if not, please email
 <quantlib-dev@lists.sf.net>. The license is also available online at
 <https://www.quantlib.org/license.shtml>.

 This program is distributed in the hope that it will be useful, but WITHOUT
 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE.  See the license for more details.
*/


/*
 QuantLib Benchmark Suite

 Measures the performance of a preselected set of numerically intensive
 test cases. This benchmark supports multiprocessing, e.g.

 Single process benchmark for testing:
 ./quantlib-benchmark --size=1 --nProc=1

 Benchmark with 16 processes and the default size:
 ./quantlib-benchmark --nProc=16

 Benchmark with one worker process per hardware thread and the default size:
 ./quantlib-benchmark

 This benchmark is derived from quantlibtestsuite.cpp. Please see the
 copyrights therein.
*/

#include <ql/types.hpp>
#include <ql/version.hpp>

#ifdef QL_ENABLE_PARALLEL_UNIT_TEST_RUNNER
#if BOOST_VERSION >= 108800
#include <boost/process/v1/system.hpp>
#include <boost/process/v1/args.hpp>
namespace bp = boost::process::v1;
#else
#include <boost/process.hpp>
namespace bp = boost::process;
#endif
#include <boost/interprocess/ipc/message_queue.hpp>
#endif

#define BOOST_TEST_NO_MAIN
#define BOOST_TEST_ALTERNATIVE_INIT_API
#include <boost/test/included/unit_test.hpp>

#include <boost/algorithm/string.hpp>
#include <boost/numeric/conversion/cast.hpp>
#include <boost/test/unit_test_suite.hpp>
#include <boost/test/framework.hpp>

#include <iomanip>
#include <iostream>
#include <utility>
#include <vector>
#include <string>
#include <chrono>
#include <thread>



/* Use BOOST_MSVC instead of _MSC_VER since some other vendors (Metrowerks,
   for example) also #define _MSC_VER
*/
#if !defined(BOOST_ALL_NO_LIB) && defined(BOOST_MSVC)
#  include <ql/auto_link.hpp>
#endif

#include "utilities.hpp"




namespace {

    /**
     * A class representing an individual benchmark.  Each benchmark is one of the QuantLib
     * test-suite tests, run one or more times.  The Boost unit test framework poses a dilemma:
     *
     *    * if we don't use boost::unit_test::framework::run to run the test, then all the
     *       correctness checks are disabled and we can't validate that the test passed.
     *    * if we do use boost::unit_test::framework::run, then we incur a very large overhead,
     *       especially for short tests that are run many thousands of times.
     *
     * We deal with this by running each test exactly once using boost::unit_test::framework::run.
     * Failures are flagged by a boost::unit_test::test_observer and cause immediate tear-down
     * of the benchmark master process.  All subsequent runs of the test are done through a hack:
     * we copy the declarations of the BOOST_AUTO_TEST_CASE and friends macros in
     * boost/test/unit_test_suite.hpp to declare the symbols that Boost creates.  This allows us
     * to call these symbols directly, bypassing the Boost unit test framework completely.
     *
     * The overall benchmark is parallelised using Boost::IPC.  QuantLib is not thread safe, so
     * any kind of shared-memory parallelism is ruled out.  The benchmark creates a large (fixed)
     * amount of work, distributes it between all the workers, and measures how quickly the
     * workers can finish it all.  The overall metric is the number of tasks per second (#tasks/s)
     * that the system can process.  The tasks are pre-set (these are the tests from the
     * test-suite), and the --size argument to the benchmark controls how many times the entire
     * set of tasks is executed.  Once the machine is saturated with work the benchmark typically
     * exhibits perfect weak scaling: doubling --size will double the runtime and leave #tasks/s
     * unchanged.  Below saturation, #tasks/s typically increases as the machine is given more
     * work to do.
     *
     * The pre-set benchmark sizes are chosen to saturate even very large machines.
     */
    class Benchmark
    {
        public:
            template<class CALLABLE>
                Benchmark(
                        std::string name,               // the test name, as known by boost::unit_test::test_unit
                        CALLABLE &&body,                // the "body" of the test we want to run
                        double cost                     // how expensive (runtime) this test is relative to others
                        )
                : name_(std::move(name)),  cost_(cost),  testBody_(std::forward<CALLABLE>(body)) {}

            Benchmark(const Benchmark& copy) = default;
            Benchmark(Benchmark&& move) = default;
            Benchmark& operator=(const Benchmark &other) = default;
            Benchmark& operator=(Benchmark &&other) = default;
            ~Benchmark() = default;

            double getCost() const          { return cost_; }
            std::string getName() const     { return name_; }
            bool foundTestUnit() const      { return test_ != nullptr; }
            // Total runtime across multiple runs is manually accumulated into the class
            double& getTotalRuntime()       { return totalRuntime_; }
            const double& getTotalRuntime() const { return totalRuntime_; }
            void setTestUnit(const boost::unit_test::test_unit * unit) { test_ = unit; }


            // Run the underlying QuantLib test exactly once using the Boost test framework
            // This will check all results and will flag any errors that are found.  It is much
            // slower than running just the test body outside of the Boost framework
            double runValidation() const
            {
                double time = -1.0;
                try {
                    auto startTime = std::chrono::steady_clock::now();
                    boost::unit_test::framework::run(test_, false);
                    auto stopTime = std::chrono::steady_clock::now();
                    time = std::chrono::duration_cast<std::chrono::microseconds>(stopTime - startTime).count() * 1e-6;
                }
                catch(const std::exception &e) {
                    std::cerr << "error: caught exception in benchmark " << getName() << "\n"
                        << "message: " << e.what() << "\n" << std::endl;
                }
                catch(...) {
                    std::cerr << "error: caught unknown exception in benchmark " << getName() << std::endl;
                }
                return time;
            }

            // Directly run the body of the underlying QuantLib test (multiple times) without using the Boost
            // test framework. This eliminates all the boost overhead, but also disables all results checking.
            double runBenchmark() const
            {
                double time = -1.0;
                try {
                    auto startTime = std::chrono::steady_clock::now();
                    testBody_();
                    auto stopTime = std::chrono::steady_clock::now();
                    time = std::chrono::duration_cast<std::chrono::microseconds>(stopTime - startTime).count() * 1e-6;
                }
                catch(const std::exception &e) {
                    std::cerr << "Error: caught exception in benchmark " << getName() << "\n"
                        << "Message: " << e.what() << "\n" << std::endl;
                }
                catch(...) {
                    std::cerr << "Error: caught unknown exception in benchmark " << getName() << std::endl;
                }
                return time;
            }

        private:
            std::string name_;
            const boost::unit_test::test_unit * test_ = nullptr;
            double cost_;
            double totalRuntime_ = 0;
            std::function<void(void)> testBody_;
    };
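
    // A usage sketch (assumed from the code further down, not an additional API): the intended
    // life cycle of a Benchmark is roughly
    //
    //     Benchmark b("FunctionsTests/testFactorial", body, /*cost*/ 0.1);
    //     b.setTestUnit(unit);                       // located via TestUnitFinder below
    //     b.runValidation();                         // once, with full Boost result checking
    //     b.getTotalRuntime() += b.runBenchmark();   // many times, without the Boost overhead
    //
    // The name and cost shown here are taken from the QL_BENCHMARK_DECLARE list below.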


    /**
     * To determine programmatically whether a test has passed or not, the Boost unit test
     * framework requires us to register a test observer class.  The observer only reports the
     * pass/fail status of the most recent assertion, not even the name of the test that was run.
     * Hence we need some additional plumbing to ensure that a failure within a test is not
     * overridden by a later pass within the same test (for a test that makes multiple calls to
     * BOOST_CHECK or BOOST_FAIL).
     */
    struct BenchmarkResult : public boost::unit_test::test_observer
    {
        public:
            BenchmarkResult()  {
                boost::unit_test::framework::register_observer(*this);
            }
            ~BenchmarkResult() override {
                boost::unit_test::framework::deregister_observer(*this);
            }
            BenchmarkResult(const BenchmarkResult&) = delete;
            BenchmarkResult(BenchmarkResult&&) = delete;
            BenchmarkResult& operator=(const BenchmarkResult &) = delete;
            BenchmarkResult& operator=(BenchmarkResult &&) = delete;


            void assertion_result( boost::unit_test::assertion_result  ar ) override
            {
                passed_ = passed_ && (ar == boost::unit_test::AR_PASSED);
            }
            bool pass() const { return passed_; }
            void reset() { passed_ = true; }

        private:
            bool passed_ = true;
    };
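
    // Usage sketch, mirroring the code in main() below: one BenchmarkResult is constructed for
    // the lifetime of the process, and each validated run is wrapped as
    //
    //     bmResult.reset();
    //     bm[j].runValidation();
    //     if (!bmResult.pass()) { /* at least one assertion in the test failed */ }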


    /**
     * This class takes a list of Benchmarks and attempts to find the corresponding
     * test_units in the Boost test unit tree.
     * */
    class TestUnitFinder : public boost::unit_test::test_tree_visitor
    {
        private:
            TestUnitFinder(std::vector<Benchmark> & bm) : bm_(bm) {}

            // Utility method needed for initialising the Boost test framework
            static bool init_unit_test_suite() { return true; }

        public:
            bool visit(const boost::unit_test::test_unit & tu) override
            {
                const std::string& thisTest = tu.full_name();
                // Try to find this test in the bm array.  We know every test name will start
                // with "QuantLibTests/", which contains 14 characters
                for(auto &b : bm_ ) {
                    if( thisTest.find( b.getName(), 14) != std::string::npos ) {
                        // We have a match
                        b.setTestUnit( &tu );
                    }
                }
                // Continue visiting
                return true;
            }


            // Find the corresponding Boost test_unit for each Benchmark
            // If we can't find a test_unit, throw an exception
            static void findAllTests(char** argv, std::vector<Benchmark> &bm)
            {
                boost::unit_test::framework::init(TestUnitFinder::init_unit_test_suite, 1, argv);
                boost::unit_test_framework::framework::finalize_setup_phase();

                TestUnitFinder tuf(bm);
                boost::unit_test::traverse_test_tree(boost::unit_test_framework::framework::master_test_suite(), tuf, true);

                // Now check that we've found all test units
                for(const auto &b : bm)  {
                    if( !b.foundTestUnit() ) {
                        std::string msg = "Unable to find the Boost test unit for Benchmark '";
                        msg += b.getName();
                        msg += "'";
                        throw std::runtime_error(msg);
                    }
                }
            }

        private:
            std::vector<Benchmark> & bm_;
    };
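
    // For example, a Benchmark named "AmericanOptionTests/testFdAmericanGreeks" matches the
    // Boost test unit whose full name is "QuantLibTests/AmericanOptionTests/testFdAmericanGreeks";
    // the find() in visit() starts at offset 14 to skip the "QuantLibTests/" prefix.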


    // The container holding all the benchmarks we will run
    std::vector<Benchmark> bm;



    /**
     * A class to group and tidy up all the benchmark I/O and boilerplate routines
     */
    struct BenchmarkSupport
    {
        // Verbosity level and a logging macro to help debugging
        static int verbose;
#define LOG_MESSAGE(...)  if(BenchmarkSupport::verbose >= 3) { std::cout << __VA_ARGS__ << std::endl; }


        // The set of pre-defined benchmark sizes that we support
        static const std::vector< std::pair<std::string, unsigned int> > bmSizes;

        // Turn a command line '--size=<value>' string into a benchmark size
        static unsigned int parseBmSize(const std::string &size)
        {
            for(const auto & p : bmSizes) {
                if(p.first == size)
                    return p.second;
            }
            // OK - it's not a preset size, let's see if it's parsable as an integer
            try {
                unsigned int sz = std::stoul(size);
                return sz;
            }
            catch(const std::exception &e) {
                // Unable to convert to integer.  Abort
                std::cerr << "Error: INVALID BENCHMARK RUN\n";
                std::cerr << "Invalid custom benchmark size specified, unable to convert to an integer\n";
                std::cerr << "Exception generated: " << e.what() << "\n";
                exit(1);
            }
        }

        // Turn a benchmark size into a string for printing
        static std::string bmSizeAsString(unsigned int size)
        {
            for(const auto& p : bmSizes) {
                if(p.second == size)
                    return p.first;
            }
            // Not a preset size
            return "Custom (" + std::to_string(size) + ")";
        }


        static void printGreeting(const std::string &size, unsigned nProc)
        {
            std::cout << std::endl;
            std::cout << std::string(84,'-') << "\n";
            std::cout << "Benchmark Suite QuantLib "  QL_VERSION << "\n";
            std::cout << "\n";
            std::cout << "Benchmark size='" << size << "' on " << nProc << " processes\n";
            std::cout << std::string(84,'-') << "\n";
            std::cout << std::endl;
        }

        // If a test fails, notify the user and terminate the benchmark
        static void terminateBenchmark()
        {
            std::cerr << "\033[0m\nError: INVALID BENCHMARK RUN.\n"
                <<  "One or more tests failed, please see the log for details" << std::endl ;
            // Tear down the master process, which kills all child threads/processes
            exit(1);
        }


        static void printResults(
                unsigned nSize,                         // the size of the benchmark
                double masterLifetime,                  // lifetime of the master process
                std::vector<double> workerLifetimes     // lifetimes of all the worker processes
                )
        {
            std::cout     << "\033[0m\n";
            std::cout     << "Benchmark Size        = " << BenchmarkSupport::bmSizeAsString(nSize) << std::endl;
            std::cout     << "Number of processes   = " << workerLifetimes.size() << std::endl;
            std::cout     << "System Throughput     = " << (double(nSize) * bm.size() ) / masterLifetime << " tasks/s" << std::endl;
            std::cout     << "Benchmark Runtime     = " << masterLifetime<< "s" << std::endl;

            if(verbose >=1 )
            {
                const size_t nProc = workerLifetimes.size();
                std::cout << "Num. Worker Processes = " << nProc << std::endl;

                // Work out the tail effect.  We define "tail effect" as the ratio of the average
                // (geometric mean) tail lifetime to the lifetime of the master process.  The cutoff
                // for defining the "tail" is arbitrary.  A ratio of 1 means no tail effect (the tail
                // lifetime is the same as the lifetime of the master process); a ratio near 0 means
                // the tail finished significantly before the master process
                std::sort(workerLifetimes.begin(), workerLifetimes.end());
                const double thresh = 0.1;
                int tail = (int)std::ceil(thresh * nProc);
                double tailGeomean = 1.0;
                for(int i=0; i<tail; i++) {
                    tailGeomean *= workerLifetimes[i];
                }
                tailGeomean = std::pow(tailGeomean, 1.0/tail);
                const double tailEffect = tailGeomean / masterLifetime;
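
                // Worked example with hypothetical numbers: for 16 workers, tail = ceil(0.1*16) = 2,
                // so tailEffect is the geometric mean of the 2 shortest worker lifetimes divided
                // by the master lifetime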

                std::cout << "Tail Effect Ratio     = " << tailEffect << std::endl;
                std::cout << "                      =  Geomean( Shortest " << tail << " worker lifetimes )" << std::endl;
                std::cout << "                      --------------------------------------------------------" << std::endl;
                std::cout << "                                    Lifetime( Master process )" << std::endl;
                std::cout << std::endl;
            }

            std::cout << std::string(84,'-') << std::endl;

            if(verbose >= 2) {
                std::cout << "                       Total Runtime spent in each test " << std::endl;
                std::cout << std::string(84,'-') << std::endl;

                // Compute max test name length
                size_t len = 0;
                for (const auto & b : bm) { len = std::max(len, b.getName().length() ); }

                for (const auto& b: bm) {
                    std::cout << b.getName()
                        << std::string(len+2 - b.getName().length(),' ')
                        << ": " << b.getTotalRuntime()  << "s" << std::endl;
                }
                std::cout << std::string(84,'-') << std::endl;
            }
            std::cout << std::endl;
        }


#ifdef QL_ENABLE_PARALLEL_UNIT_TEST_RUNNER
        // The entry point for the std::threads that launch the worker processes and wait on them
        static int worker(const char * exe, const std::vector<std::string>& args) {
            return bp::system(exe, bp::args=args);
        }
#endif

        // A helper class to push benchmark objects into the benchmark container
        // before main() starts.  Every time the constructor is called, a test is added.
        struct AddBenchmark {
            template<class CALLABLE>
                AddBenchmark(std::vector<Benchmark> &bm, CALLABLE && test_body, const char* name, double cost) {
                    bm.push_back( Benchmark(name, std::forward<CALLABLE>(test_body), cost) );
                }
        };
    };
    int BenchmarkSupport::verbose = 0;
    const std::vector< std::pair<std::string, unsigned int> > BenchmarkSupport::bmSizes = {
            {"XXS",  60},
            {"XS",   120},
            {"S",    240},
            {"M",    480},
            {"L",    960}
        };
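
    // For example (values from the table above), '--size=S' runs each benchmark 240 times,
    // while a custom value such as '--size=1000' runs each benchmark 1000 times.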


    // The messages sent from workers to master across boost IPC queues
    struct IPCResultMsg
    {
        unsigned bmId;              // the benchmark that was run
        unsigned threadId;          // the ID of the worker who ran it
        double time;                // the runtime
    };

    // The messages sent from master to workers across boost IPC queues
    struct IPCInstructionMsg
    {
        unsigned j = 0;               // the benchmark to run
        bool validate = false;        // whether to run in validation mode or not
    };
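
    // Message-flow sketch (a summary of the master/worker loops in main() below, not an
    // additional protocol).  The master pushes IPCInstructionMsg values onto one queue and
    // each worker answers with IPCResultMsg values on another:
    //
    //     master: {j, validate=true}   once per benchmark   -> worker: {j, threadId, time}
    //     master: {startTimerId, _}    once per worker      -> no reply; worker starts its clock
    //     master: {j, validate=false}  nSize times per test -> worker: {j, threadId, time}
    //     master: {terminateId, _}     once per worker      -> worker: its lifetime, then exits
    //
    // A negative time in any reply marks a failed test and tears the whole benchmark down.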



}  // END anonymous namespace


// These are pulled from boost/test/unit_test_suite.hpp.  We declare the
// bodies of the tests so that we can run them more efficiently.
#define QL_BENCHMARK_DECLARE(test_fixture, test_name, num_iters, cost)   \
    namespace QuantLibTests {                                        \
        namespace test_fixture {                                         \
            struct test_name : public BOOST_AUTO_TEST_CASE_FIXTURE {     \
                void test_method();                                      \
            };                                                           \
        }}                                                               \
        \
        namespace {             \
            /* Declare unique global variable and push benchmark into bm */ \
            BenchmarkSupport::AddBenchmark test_fixture##_##test_name( \
                    bm, \
                    [] { QuantLibTests::test_fixture::test_name thetest; for(int i=0; i<num_iters; i++) thetest.test_method(); }, \
#test_fixture "/" #test_name, cost);                                             \
        }
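
// As an illustration of the macro above, the invocation
//     QL_BENCHMARK_DECLARE(FunctionsTests, testFactorial, 1000, 0.1)
// declares QuantLibTests::FunctionsTests::testFactorial::test_method() and registers a
// Benchmark named "FunctionsTests/testFactorial" whose body constructs the fixture and
// calls test_method() 1000 times.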


// Set of all tests we will run.  The integer is the number of times the test is run, and
// the value at the end is the runtime cost of each benchmark relative to the others.
// Exact values are not needed; we just need to know what is "expensive" and what is "cheap"
// in terms of runtime.

// Equity & FX
QL_BENCHMARK_DECLARE(AmericanOptionTests, testFdAmericanGreeks, 1, 0.5);
QL_BENCHMARK_DECLARE(AmericanOptionTests, testFdValues, 20, 3.0);
QL_BENCHMARK_DECLARE(AmericanOptionTests, testCallPutParity, 100, 1.0);
QL_BENCHMARK_DECLARE(AmericanOptionTests, testQdEngineStandardExample, 400, 0.5);
QL_BENCHMARK_DECLARE(EuropeanOptionTests, testImpliedVol, 1, 0.5);
QL_BENCHMARK_DECLARE(EuropeanOptionTests, testMcEngines, 1, 1.0);
QL_BENCHMARK_DECLARE(EuropeanOptionTests, testLocalVolatility, 3, 2.0);
QL_BENCHMARK_DECLARE(BatesModelTests, testDAXCalibration, 1, 0.5);
QL_BENCHMARK_DECLARE(BatesModelTests, testAnalyticVsMCPricing, 1, 1.0);
QL_BENCHMARK_DECLARE(BatesModelTests, testAnalyticAndMcVsJumpDiffusion, 5, 1.0);
QL_BENCHMARK_DECLARE(HestonModelTests, testDAXCalibration, 1, 0.5);
QL_BENCHMARK_DECLARE(HestonModelTests, testFdBarrierVsCached, 1, 3.0);
QL_BENCHMARK_DECLARE(HestonModelTests, testFdAmerican, 1, 1.0);
QL_BENCHMARK_DECLARE(HestonModelTests, testLocalVolFromHestonModel, 10, 1.0);
QL_BENCHMARK_DECLARE(FdHestonTests, testFdmHestonAmerican, 10, 1.0);
QL_BENCHMARK_DECLARE(FdHestonTests, testAmericanCallPutParity, 15, 1.5);
QL_BENCHMARK_DECLARE(FdHestonTests, testFdmHestonBarrierVsBlackScholes, 1, 2.0);
QL_BENCHMARK_DECLARE(HestonSLVModelTests, testMonteCarloCalibration, 1, 3.0);
QL_BENCHMARK_DECLARE(HestonSLVModelTests, testHestonFokkerPlanckFwdEquation, 1, 5.0);
QL_BENCHMARK_DECLARE(HestonSLVModelTests, testBarrierPricingViaHestonLocalVol, 1, 1.0);
QL_BENCHMARK_DECLARE(MCLongstaffSchwartzEngineTests, testAmericanOption, 1, 2.0);
QL_BENCHMARK_DECLARE(VarianceGammaTests, testVarianceGamma, 1, 0.1);
QL_BENCHMARK_DECLARE(ConvertibleBondTests, testBond, 100, 2.0);
QL_BENCHMARK_DECLARE(AndreasenHugeVolatilityInterplTests, testArbitrageFree, 1, 1.0);
QL_BENCHMARK_DECLARE(AndreasenHugeVolatilityInterplTests, testAndreasenHugeCallPut, 1, 1.0);
QL_BENCHMARK_DECLARE(AndreasenHugeVolatilityInterplTests, testAndreasenHugeCall, 1, 1.0);
QL_BENCHMARK_DECLARE(AndreasenHugeVolatilityInterplTests, testAndreasenHugePut, 1, 1.0);
QL_BENCHMARK_DECLARE(AndreasenHugeVolatilityInterplTests, testFlatVolCalibration, 1, 1.0);
QL_BENCHMARK_DECLARE(AndreasenHugeVolatilityInterplTests, testTimeDependentInterestRates, 1, 1.0);
QL_BENCHMARK_DECLARE(AndreasenHugeVolatilityInterplTests, testPiecewiseConstantInterpolation, 1, 1.0);
QL_BENCHMARK_DECLARE(AndreasenHugeVolatilityInterplTests, testLinearInterpolation, 1, 1.0);

// Interest Rates
QL_BENCHMARK_DECLARE(ShortRateModelTests, testSwaps, 30, 3.0);
QL_BENCHMARK_DECLARE(ShortRateModelTests, testCachedHullWhite2, 500, 1.0);
QL_BENCHMARK_DECLARE(ShortRateModelTests, testCachedHullWhiteFixedReversion, 1000, 1.0);
QL_BENCHMARK_DECLARE(MarketModelCmsTests, testMultiStepCmSwapsAndSwaptions, 1, 11.0);
QL_BENCHMARK_DECLARE(MarketModelSmmTests, testMultiStepCoterminalSwapsAndSwaptions, 1, 9.0);
QL_BENCHMARK_DECLARE(BermudanSwaptionTests, testCachedG2Values, 1, 2.0);
QL_BENCHMARK_DECLARE(BermudanSwaptionTests, testCachedValues, 100, 3.0);
QL_BENCHMARK_DECLARE(LiborMarketModelTests, testSwaptionPricing, 1, 1.0);
QL_BENCHMARK_DECLARE(LiborMarketModelTests, testCalibration, 1, 5.0);
QL_BENCHMARK_DECLARE(PiecewiseYieldCurveTests, testConvexMonotoneForwardConsistency, 10, 2.0);
QL_BENCHMARK_DECLARE(PiecewiseYieldCurveTests, testFlatForwardConsistency, 50, 3.0);
QL_BENCHMARK_DECLARE(PiecewiseYieldCurveTests, testGlobalBootstrap, 20, 2.0);
QL_BENCHMARK_DECLARE(OvernightIndexedSwapTests, testBootstrapWithArithmeticAverage, 10, 5.0);
QL_BENCHMARK_DECLARE(OvernightIndexedSwapTests, testBaseBootstrap, 10, 3.0);
QL_BENCHMARK_DECLARE(OvernightIndexedSwapTests, testBootstrapRegression, 10, 1.0);
QL_BENCHMARK_DECLARE(MarkovFunctionalTests, testCalibrationTwoInstrumentSets, 1, 3.0);
QL_BENCHMARK_DECLARE(MarkovFunctionalTests, testCalibrationOneInstrumentSet, 1, 4.0);
QL_BENCHMARK_DECLARE(MarkovFunctionalTests, testVanillaEngines, 1, 7.0);
QL_BENCHMARK_DECLARE(MarkovFunctionalTests, testBermudanSwaption, 3, 1.0);
QL_BENCHMARK_DECLARE(SwaptionVolatilityCubeTests, testSpreadedCube, 20, 1.0);
QL_BENCHMARK_DECLARE(SwaptionVolatilityCubeTests, testSabrNormalVolatility, 1, 1.0);
QL_BENCHMARK_DECLARE(SwaptionVolatilityCubeTests, testSabrVols, 30, 1.0);
QL_BENCHMARK_DECLARE(ZabrTests, testConsistency, 1, 10.0);
QL_BENCHMARK_DECLARE(CmsSpreadTests, testCouponPricing, 1, 1.0);
QL_BENCHMARK_DECLARE(CmsTests, testCmsSwap, 20, 2.0);
QL_BENCHMARK_DECLARE(CmsTests, testParity, 30, 2.0);
QL_BENCHMARK_DECLARE(InterestRateTests, testConversions, 10000, 0.1);

// Credit Derivatives
QL_BENCHMARK_DECLARE(NthToDefaultTests, testGauss, 2, 14.0);
QL_BENCHMARK_DECLARE(CreditDefaultSwapTests, testImpliedHazardRate, 1000, 1.0);
QL_BENCHMARK_DECLARE(CreditDefaultSwapTests, testCachedMarketValue, 1000, 0.1);
QL_BENCHMARK_DECLARE(CreditDefaultSwapTests, testIsdaEngine, 200, 2.0);
QL_BENCHMARK_DECLARE(SquareRootCLVModelTests, testSquareRootCLVMappingFunction, 20, 0.5);
QL_BENCHMARK_DECLARE(SquareRootCLVModelTests, testSquareRootCLVVanillaPricing, 200, 0.5);

// Energy
QL_BENCHMARK_DECLARE(SwingOptionTests, testExtOUJumpSwingOption, 1, 3.0);
QL_BENCHMARK_DECLARE(SwingOptionTests, testExtOUJumpVanillaEngine, 1, 3.0);
QL_BENCHMARK_DECLARE(SwingOptionTests, testFdBSSwingOption, 20, 1.0);
QL_BENCHMARK_DECLARE(VppTests, testVPPPricing, 1, 5.0);
QL_BENCHMARK_DECLARE(VppTests, testKlugeExtOUSpreadOption, 1, 1.0);

// Math
QL_BENCHMARK_DECLARE(RiskStatisticsTests, testResults, 4, 0.5);
QL_BENCHMARK_DECLARE(LowDiscrepancyTests, testMersenneTwisterDiscrepancy, 2, 0.5);
QL_BENCHMARK_DECLARE(LinearLeastSquaresRegressionTests, testMultiDimRegression, 20, 2.0);
QL_BENCHMARK_DECLARE(StatisticsTests, testIncrementalStatistics, 20, 0.5);
QL_BENCHMARK_DECLARE(FunctionsTests, testFactorial, 1000, 0.1);
QL_BENCHMARK_DECLARE(FunctionsTests, testGammaFunction, 1000, 0.5);
QL_BENCHMARK_DECLARE(FunctionsTests, testGammaValues, 100000, 0.5);
QL_BENCHMARK_DECLARE(FunctionsTests, testModifiedBesselFunctions, 10000, 0.5);
QL_BENCHMARK_DECLARE(FunctionsTests, testWeightedModifiedBesselFunctions, 20, 0.5);
QL_BENCHMARK_DECLARE(LowDiscrepancyTests, testHalton, 80, 1.0);
QL_BENCHMARK_DECLARE(GaussianQuadraturesTests, testNonCentralChiSquared, 4000, 0.5);
QL_BENCHMARK_DECLARE(GaussianQuadraturesTests, testNonCentralChiSquaredSumOfNodes, 8000, 0.5);
QL_BENCHMARK_DECLARE(GaussianQuadraturesTests, testMomentBasedGaussianPolynomial, 100000, 0.5);
QL_BENCHMARK_DECLARE(RoundingTests, testCeiling, 100000, 0.1);
QL_BENCHMARK_DECLARE(RoundingTests, testUp, 100000, 0.1);
QL_BENCHMARK_DECLARE(RoundingTests, testFloor, 100000, 0.1);
QL_BENCHMARK_DECLARE(RoundingTests, testDown, 100000, 0.1);
QL_BENCHMARK_DECLARE(RoundingTests, testClosest, 100000, 0.1);




int main(int argc, char* argv[] )  // NOLINT(bugprone-exception-escape)
{
    const std::string clientModeStr = "--client_mode=true";
    bool clientMode = false;

    // Default number of worker processes to use
#if defined(QL_ENABLE_PARALLEL_UNIT_TEST_RUNNER)
    unsigned nProc = std::thread::hardware_concurrency();
#else
    unsigned nProc = 1;
#endif

    // By default, run a tiny custom size as a quick check that everything works.
    std::string defaultSize = "3";
    std::string size = defaultSize;

    // A threadId is useful for debugging, but has no other purpose
    unsigned threadId = 0;




    ////  Argument handling  //////////////////////////
    for (int i=1; i<argc; ++i) {
        std::string arg = argv[i];
        std::vector<std::string> tok;
        boost::split(tok, arg, boost::is_any_of("="));

        if (tok[0] == "--nProc") {
            QL_REQUIRE(tok.size() == 2, "Must provide a number of worker processes");
            try {
                nProc = boost::numeric_cast<unsigned>(std::stoul(tok[1]));
            } catch(const std::exception &e) {
                std::cerr << "Invalid argument to 'nProc', not a positive integer" << std::endl;
                std::cerr << "Exception generated: " << e.what() << "\n";
                exit(1);
            }
        }
        else if (tok[0] == "--threadId") {
            QL_REQUIRE(tok.size() == 2, "Must provide a threadId");
            try {
                threadId = boost::numeric_cast<unsigned>(std::stoul(tok[1]));
            } catch(const std::exception &e) {
                std::cerr << "Invalid argument to 'threadId', not a positive integer. This is an internal error, please contact the developers" << std::endl;
                std::cerr << "Exception generated: " << e.what() << "\n";
                exit(1);
            }
        }
        else if (tok[0] == "--verbose") {
            QL_REQUIRE(tok.size() == 2, "Must provide a value for verbose");
            try {
                BenchmarkSupport::verbose = boost::numeric_cast<unsigned>(std::stoul(tok[1]));
            } catch(const std::exception &e) {
                std::cerr << "Invalid argument to 'verbose', not a positive integer" << std::endl;
                std::cerr << "Exception generated: " << e.what() << "\n";
                exit(1);
            }
            QL_REQUIRE(BenchmarkSupport::verbose>=0 && BenchmarkSupport::verbose <= 3, "Value for verbose must be 0, 1, 2 or 3");
        }
        else if (tok[0] == "--size") {
            QL_REQUIRE(tok.size() == 2,
                    "benchmark size is not given");
            size = tok[1];
        }
        else if (arg == "-h" || arg == "--help" || arg == "-?") {
            std::cout
                << "\n'quantlib-benchmark' is QuantLib " QL_VERSION " CPU performance benchmark\n"
                << "\n"
                << "You are strongly encouraged to run 'ulimit -n unlimited' before running this benchmark\n"
                << "on Linux systems.  It uses Boost::IPC for parallelism, and a large number of file descriptors\n"
                << "are needed to run this benchmark with a large number of worker processes.\n"
                << "\n"
                << "By default the benchmark uses a tiny size as a quick check that\n"
                << "everything works.  To benchmark large systems a size of 'S' or larger\n"
                << "should be used.\n"
                << "\n"
                << "Usage: ./quantlib-benchmark [OPTION] ...\n"
                << "\n"
                << "with the following options:"
                << "\n"
#ifdef QL_ENABLE_PARALLEL_UNIT_TEST_RUNNER
                << "--nProc[=NN]       \t parallel execution with NN worker processes.\n"
                << "                   \t Default value is nProc=" << nProc << "\n"
                << "\n"
#endif
                << "--size=<";
            for(const auto &p : BenchmarkSupport::bmSizes) {
                std::cout << p.first << "|";
            }
            std::cout  << "NN> \n"
                << "                   \t the size of the benchmark (how many times each \n"
                << "                   \t task is run), where 'NN' can be any positive integer.\n"
                << "                   \t Default vaue is size=" << defaultSize << "\n"
                << "\n"
                << "--verbose=<0|1|2|3>\t controls verbosity of output, default value is verbose=" << BenchmarkSupport::verbose << "\n"
                << "\n"
                << "-?, --help         \t display this help and exit"
                << std::endl;
            return 0;
        }
        else if (arg == clientModeStr)  {
            clientMode = true;
        }
        else {
            std::cout << "quantlib-benchmark: unrecognized option '" << arg << "'."
                << std::endl
                << "Try 'quantlib-benchmark --help' for more information."
                << std::endl;
            return 0;
        }
    }

    const unsigned int nSize = BenchmarkSupport::parseBmSize(size);
    std::vector<double> workerLifetimes;

    ////////  Finished argument processing, start benchmark code   //////////////////////////////////////////////

    try {

        // Ensure we find the Boost test_unit for each benchmark
        TestUnitFinder::findAllTests(argv, bm);

        // To alleviate tail effects, we sort the benchmarks so that the most expensive ones are first.
        // These will be the first to be dispatched to the OS scheduler
        std::sort(bm.begin(), bm.end(),
                [](const auto& a, const auto& b) { return a.getCost() > b.getCost(); });


        BenchmarkResult bmResult;
        if( !clientMode)
            BenchmarkSupport::printGreeting(size, nProc);



        // Sequential benchmark, useful for debugging
        if (nProc == 1 && !clientMode) {

            // First we run the validation to ensure that the
            // benchmark binary is computing the correct results
            for(auto & j : bm) {
                bmResult.reset();
                j.runValidation();
                if( !bmResult.pass() ) {
                    BenchmarkSupport::terminateBenchmark();
                }
            }

            // Now run the benchmark proper
            auto startTime = std::chrono::steady_clock::now();
            for (unsigned i=0; i < nSize; ++i) {
                for(unsigned int j=0; j<bm.size(); j++) {
                    double time = bm[j].runBenchmark();
                    bm[j].getTotalRuntime() += time;
                    LOG_MESSAGE("MASTER  :  completed benchmarkId=" << j << ", time=" << time);
                }
            }
            auto stopTime = std::chrono::steady_clock::now();
            double masterLifetime = std::chrono::duration_cast<std::chrono::microseconds>(stopTime - startTime).count() * 1e-6;
            workerLifetimes.push_back(masterLifetime);
            BenchmarkSupport::printResults(nSize, masterLifetime, workerLifetimes);
        }
        else {

#if defined(QL_ENABLE_PARALLEL_UNIT_TEST_RUNNER)

            using namespace boost::interprocess;

            message_queue::size_type recvd_size;
            unsigned int priority=0;
            // Sentinel benchmark IDs: -1 wraps around to the largest unsigned value, so these
            // can never collide with a real benchmark index
            const unsigned int terminateId=-1;
            const unsigned int startTimerId = terminateId - 1;
            const char* const testUnitIdQueueName = "test_unit_queue";
            const char* const testResultQueueName = "test_result_queue";

            if (!clientMode) {

                // Boost IPC message queue setup
                message_queue::remove(testUnitIdQueueName);
                message_queue::remove(testResultQueueName);
                struct queue_remove {
                    explicit queue_remove(const char* name) : name_(name) { }
                    ~queue_remove() { message_queue::remove(name_); }

                    private:
                    const char* const name_;
                } remover1(testUnitIdQueueName),remover2(testResultQueueName);

                message_queue mq(
                        open_or_create, testUnitIdQueueName,
                        nSize*bm.size()+nProc, sizeof(IPCInstructionMsg)
                        );
                message_queue rq(
                        open_or_create, testResultQueueName,
                        std::max(16u, nProc),
                        sizeof(IPCResultMsg)
                        );


                // Create the thread group and start each worker process, giving it a unique threadId (useful for debugging)
                std::vector<std::thread> threadGroup;
                {
                    std::string thread("--threadId="), verb("--verbose=");
                    verb += std::to_string(BenchmarkSupport::verbose);
                    std::vector<std::string> workerArgs = {clientModeStr, thread, verb};
                    for (unsigned i = 0; i < nProc; ++i) {
                        LOG_MESSAGE("MASTER    : creating worker threadId=" << i+1);
                        workerArgs[1] = thread + std::to_string(i+1);
                        threadGroup.emplace_back([&,workerArgs]() { BenchmarkSupport::worker(argv[0], workerArgs); });
                    }
                }

                IPCInstructionMsg msg;
                IPCResultMsg r;

                // Do a full validation run first to ensure the benchmark binary is computing
                // the correct values
                for (unsigned j=0; j < bm.size(); ++j) {
                    msg = {j, true};
                    // Will be non-blocking send since send buffer is big enough
                    LOG_MESSAGE("MASTER    : sending benchmarkId=" << msg.j << " with validation=" << msg.validate);
                    mq.send(&msg, sizeof(IPCInstructionMsg), 0);
                }
                // Receive all results from workers
                for (unsigned i=0; i < bm.size(); ++i) {
                    rq.receive(&r, sizeof(IPCResultMsg), recvd_size, priority);
                    LOG_MESSAGE("MASTER     : received result : threadId=" << r.threadId << ", benchmarkId=" << r.bmId
                            << ", time=" << r.time << " : " << bm.size()-1-i << " results pending");
                    if(r.time < 0) {
                        // A benchmark test has failed
                        BenchmarkSupport::terminateBenchmark();
                    }
                }

                // Start timer for the benchmark
                auto startTime = std::chrono::steady_clock::now();
                // Tell all workers to start their timers
                for(unsigned j=0; j<nProc; j++) {
                    msg = {startTimerId, false};
                    LOG_MESSAGE("MASTER    : sending worker=" << j << " command to restart timer");
                    mq.send(&msg, sizeof(IPCInstructionMsg), 0);
                }
                // Now do the benchmark run proper
                for (unsigned j=0; j < bm.size(); ++j) {
                    // Enqueue nSize copies of each task to even out the load across workers
                    for (unsigned i=0; i < nSize; ++i) {
                        msg = {j, false};
                        // Will be non-blocking send since send buffer is big enough
                        LOG_MESSAGE("MASTER    : sending benchmarkId=" << msg.j << " with validation=" << msg.validate);
                        mq.send(&msg, sizeof(IPCInstructionMsg), 0);
                    }
                }
                // Receive all results from workers
                for (unsigned i=0; i < nSize*bm.size(); ++i) {
                    rq.receive(&r, sizeof(IPCResultMsg), recvd_size, priority);
                    LOG_MESSAGE("MASTER     : received result : threadId=" << r.threadId << ", benchmarkId=" << r.bmId
                            << ", time=" << r.time << " : " << nSize*bm.size()-1-i << " results pending");
                    if(r.time < 0) {
                        // A benchmark test has failed - should be impossible here
                        BenchmarkSupport::terminateBenchmark();
                    }
                    bm[r.bmId].getTotalRuntime() += r.time;
                }


                // Send terminate signal to all workers
                for (unsigned i=0; i < nProc; ++i) {
                    LOG_MESSAGE("MASTER    : sending TERMINATE signal");
                    msg = {terminateId, false};
                    mq.send(&msg, sizeof(IPCInstructionMsg), 0);
                }
                // Receive worker lifetimes
                for (unsigned i=0; i < nProc; ++i) {
                    rq.receive(&r, sizeof(IPCResultMsg), recvd_size, priority);
                    LOG_MESSAGE("MASTER    : received worker lifetime : threadId=" << r.threadId << ", time=" << r.time << " : " << nProc-1-i << " lifetimes pending");
                    workerLifetimes.push_back(r.time);
                }


                // Synchronize with and exit all threads
                for (auto& thread: threadGroup) {
                    thread.join();
                }

                auto stopTime = std::chrono::steady_clock::now();
                double masterLifetime = std::chrono::duration_cast<std::chrono::microseconds>(stopTime - startTime).count() * 1e-6;
                BenchmarkSupport::printResults(nSize, masterLifetime, workerLifetimes);


            }
            else {
                // We are a worker process - open Boost IPC queues
                message_queue mq(open_only, testUnitIdQueueName);
                message_queue rq(open_only, testResultQueueName);

                // Record the start of this process's lifetime.  We keep track of lifetimes
                // in order to monitor tail effects
                std::chrono::time_point<std::chrono::steady_clock> startTime, stopTime;

                for(;;) {
                    IPCInstructionMsg id;
                    mq.receive(&id, sizeof(IPCInstructionMsg), recvd_size, priority);

                    if(id.j == startTimerId) {
                        // The benchmark run proper is starting - start the timer for this worker.
                        // Setting stopTime as well keeps the lifetime well defined even if this
                        // worker is never given a task
                        startTime = std::chrono::steady_clock::now();
                        stopTime = std::chrono::steady_clock::now();
                    }
                    else if(id.j == terminateId) {
                        // Worker process being told to terminate.  Report our lifetime.
                        // Lifetime is how long it took until we completed our final task
                        double workerLifetime = std::chrono::duration_cast<std::chrono::microseconds>(stopTime - startTime).count() * 1e-6;
                        IPCResultMsg r {terminateId, threadId, workerLifetime};
                        LOG_MESSAGE("WORKER-" << std::setw(3) << threadId << ": received TERMINATE signal, sending lifetime=" << r.time);
                        rq.send(&r, sizeof(IPCResultMsg), 0);
                        break;
                    }
                    else {
                        LOG_MESSAGE("WORKER-" << std::setw(3) << threadId << ": received benchmarkId=" << id.j << ", validation=" << id.validate << ".  Starting execution ...");
                        double time;
                        if( id.validate ) {
                            bmResult.reset();
                            time = bm[id.j].runValidation();
                            time = (bmResult.pass() ? time : -1.0);
                        }
                        else {
                            time = bm[id.j].runBenchmark();
                        }
                        IPCResultMsg r {id.j, threadId, time};
                        // We record the timestamp after each task is complete
                        // We use this to define worker lifetime
                        stopTime = std::chrono::steady_clock::now();
                        LOG_MESSAGE("WORKER-" << std::setw(3) << threadId << ": sending result benchmarkId=" << id.j << ", time=" << r.time);
                        rq.send(&r, sizeof(IPCResultMsg), 0);
                    }
                }
                LOG_MESSAGE("WORKER-" << std::setw(3) << threadId << ": exiting");
            }

#else
            std::cout << "Please compile QuantLib with option 'QL_ENABLE_PARALLEL_UNIT_TEST_RUNNER'"
                " to run the benchmarks in parallel" << std::endl;
#endif
        }

    } catch(const std::exception &e) {
        if( !clientMode )
            std::cerr << "MASTER process caught an exception:\n" << e.what() << std::endl;
        else
            std::cerr << "WORKER-" << std::setw(3) << threadId << " caught an exception:\n" << e.what() << std::endl;
    }

    return 0;
}