File: std_simd_type.h

package info (click to toggle)
vspline 1.1.7-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 1,708 kB
  • sloc: cpp: 15,905; ansic: 443; sh: 17; makefile: 2
file content (766 lines) | stat: -rw-r--r-- 25,197 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
/************************************************************************/
/*                                                                      */
/*    vspline - a set of generic tools for creation and evaluation      */
/*              of uniform b-splines                                    */
/*                                                                      */
/*            Copyright 2015 - 2023 by Kay F. Jahnke                    */
/*                                                                      */
/*    The git repository for this software is at                        */
/*                                                                      */
/*    https://bitbucket.org/kfj/vspline                                 */
/*                                                                      */
/*    Please direct questions, bug reports, and contributions to        */
/*                                                                      */
/*    kfjahnke+vspline@gmail.com                                        */
/*                                                                      */
/*    Permission is hereby granted, free of charge, to any person       */
/*    obtaining a copy of this software and associated documentation    */
/*    files (the "Software"), to deal in the Software without           */
/*    restriction, including without limitation the rights to use,      */
/*    copy, modify, merge, publish, distribute, sublicense, and/or      */
/*    sell copies of the Software, and to permit persons to whom the    */
/*    Software is furnished to do so, subject to the following          */
/*    conditions:                                                       */
/*                                                                      */
/*    The above copyright notice and this permission notice shall be    */
/*    included in all copies or substantial portions of the             */
/*    Software.                                                         */
/*                                                                      */
/*    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND    */
/*    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES   */
/*    OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND          */
/*    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT       */
/*    HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,      */
/*    WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING      */
/*    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR     */
/*    OTHER DEALINGS IN THE SOFTWARE.                                   */
/*                                                                      */
/************************************************************************/

/*! \file simd_type.h

    \brief SIMD type derived from std::simd

    To use this header, an implementation of std::simd has to be
    installed, and the -std=c++17 option is needed as well.
    It has been tried with clang++ and g++; you'll need a
    recent version.

*/

#ifndef VSPLINE_STD_SIMD_TYPE_H
#define VSPLINE_STD_SIMD_TYPE_H

#ifndef VSPLINE_VECTOR_NBYTES

#define VSPLINE_VECTOR_NBYTES 64

#endif

#include <iostream>
#include <experimental/simd>

namespace vspline
{

/// class template simd_type provides a fixed-size SIMD type.
/// This implementation of vspline::simd_type uses std::simd as
/// base class. This class is used as a stand-in for Vc::SimdArray
/// - it does not cover the whole interface, but a reasonably
/// large subset - my choice of SimdArray functionality is what
/// I need in lux/vspline.
/// Most of the 'loop variant' of simd_type has been ported to
/// use std::simd instead, making use of std::simd's
/// - constructors
/// - copy_from and copy_to
/// - masks and where expressions
/// - operator functions
/// - overloads for several mathematical functions
/// - min/max

template < typename _value_type ,
           std::size_t _vsize >
struct simd_type
: private std::experimental::simd
          < _value_type ,
            std::experimental::simd_abi::fixed_size < _vsize >
          >
{
  typedef std::experimental::simd_abi::fixed_size < _vsize > abi_t ;

  typedef std::size_t size_type ;
  typedef _value_type value_type ;
  static const size_type vsize = _vsize ;

  typedef std::experimental::simd < value_type , abi_t > base_t ;
  typedef std::experimental::simd < int , abi_t > index_type ;
  using typename base_t::mask_type ;

  // provide the size as a constexpr

  static constexpr size_type size()
  {
    return vsize ;
  }

  typedef mask_type MaskType ;
  typedef index_type IndexType ;

  // operator[] is mapped to std::simd element access

  using base_t::operator[] ;

  // assignment from a value_type. The assignment is coded as a loop,
  // but it should be obvious to the compiler's loop vectorizer that
  // the loop is a 'SIMD operation in disguise', so here we have the
  // first appearance of 'goading'.

  simd_type & operator= ( const value_type & rhs )
  {
    to_base() = base_t ( rhs ) ;
    return *this ;
  }

  // c'tor from value_type. We use the assignment operator for
  // initialization.

  simd_type ( const value_type & ini )
  : base_t ( ini )
  { }

  simd_type ( const base_t & ini )
  : base_t ( ini )
  { }

  // these two c'tors are left in default mode

  simd_type() = default ;
  simd_type ( const simd_type & ) = default ;

  base_t & to_base()
  {
    return ( * static_cast < base_t * > ( this ) ) ;
  }

  const base_t & to_base() const
  {
    return ( * static_cast < const base_t * const > ( this ) ) ;
  }

  // assignment from equally-sized container. Most containers use std::size_t
  // for the template argument defining the number of elements they hold,
  // but some (notably vigra::TinyVector) use int, which is probably a relic
  // from times when non-type template arguments were of a restricted type
  // set only. By providing a specialization for SIZE_TYPE int, we make
  // equally-sized vigra::TinyVectors permitted initializers.
  // the c'tor from an equally-sized container also uses the corresponding
  // operator= overload, so we use one macro for both.
  // we also need two different variants of vsize for g++; clang++ accepts
  // size_type vsize for both places where VSZ is used, but g++ requires
  // an integer.
  // Note that the rhs can use any elementary type which can be legally
  // assigned to value_type. This allows transport of information from
  // differently typed objects, but there are no further constraints on
  // the types involved, which may degrade precision. It's the user's
  // responsibility to make sure such assignments have the desired effect
  // and overload them if necessary.

  #define BUILD_FROM_CONTAINER(SIZE_TYPE,VSZ) \
    template < typename U , template < typename , SIZE_TYPE > class V > \
    simd_type & operator= ( const V < U , VSZ > & rhs ) \
    { \
      for ( size_type i = 0 ; i < vsize ; i++ ) \
        to_base() [ i ] = value_type ( rhs [ i ] ) ; \
      return *this ; \
    } \
    template < typename U , template < typename , SIZE_TYPE > class V > \
    simd_type ( const V < U , VSZ > & ini ) \
    { \
      *this = ini ; \
    }

  BUILD_FROM_CONTAINER(std::size_t,vsize)

  #undef BUILD_FROM_CONTAINER

  static const simd_type iota()
  {
    simd_type result ;
    for ( size_type i = 0 ; i < vsize ; i++ )
      result [ i ] = value_type ( i ) ;
    return result ;
  }

  // mimick Vc's IndexesFromZero. This function produces an index
  // vector filled with indexes starting with zero.

  static const index_type IndexesFromZero()
  {
    typedef typename index_type::value_type IT ;
    static const IT ceiling = std::numeric_limits < IT > :: max() ;
    assert ( ( vsize - 1 ) <= std::size_t ( ceiling ) ) ;

    index_type ix ;
    for ( size_type i = 0 ; i < vsize ; i++ )
      ix [ i ] = int ( i ) ;
    return ix ;
  }

  // variant which starts from a different starting point and optionally
  // uses steps other than one.

  static const index_type IndexesFrom ( std::size_t start ,
                                        std::size_t step = 1 )
  {
    typedef typename index_type::value_type IT ;
    static const IT ceiling = std::numeric_limits < IT > :: max() ;
    assert ( start + ( vsize - 1 ) * step <= std::size_t ( ceiling ) ) ;

    return ( IndexesFromZero() * int(step) ) + int(start) ;
  }

  // functions Zero and One produce simd_type objects filled with
  // 0, or 1, respectively

  static const simd_type Zero()
  {
    return simd_type ( value_type ( 0 ) ) ;
  }

  static const simd_type One()
  {
    return simd_type ( value_type ( 1 ) ) ;
  }

  // echo the vector to a std::ostream, read it from an istream

  friend std::ostream & operator<< ( std::ostream & osr ,
                                     simd_type it )
  {
    osr << "(" ;
    for ( size_type i = 0 ; i < vsize - 1 ; i++ )
      osr << it [ i ] << ", " ;
    osr << it [ vsize - 1 ] << ")" ;
    return osr ;
  }

  friend std::istream & operator>> ( std::istream & isr ,
                                     simd_type it )
  {
    for ( size_type i = 0 ; i < vsize ; i++ )
      isr >> it [ i ] ;
    return isr ;
  }

  // memory access functions, which load and store vector data.
  // We start out with functions transporting data from memory into
  // the simd_type. Some of these operations have corresponding
  // c'tors which use the member function to initialize to_base().

  // load delegates to std::simd::copy_from. TODO: consider
  // overalignment

  void load ( const value_type * const p_src )
  {
    to_base().copy_from ( p_src ,
                          std::experimental::element_aligned_tag() ) ;
  }

  // std::simd does not offer gather/scatter, but it offers a
  // c'tor taking a functor to set the elements. In theory, this
  // is a good idea, because the optimizer might realize that
  // the sum of invocations of the functor can be represented
  // by a gather/scatter operation, but how well this works is
  // a different matter and mileage varies.

#define GS_LAMBDA

#ifdef GS_LAMBDA

  template < typename index_type >
  void gather ( const value_type * const p_src ,
                const index_type & indexes )
  {
    // assign base_t object created by gen-type c'tor
    to_base() = base_t ( [&] ( const size_t & i )
                    { return p_src [ indexes[i] ] ; } ) ;
  }

#else

  template < typename index_type >
  void gather ( const value_type * const p_src ,
                const index_type & indexes )
  {
    for ( std::size_t i = 0 ; i < vsize ; i++ )
      (*this)[i] = p_src [ indexes [ i ] ] ;
  }

#endif

  // c'tor from pointer and indexes, uses gather

  template < typename index_type >
  simd_type ( const value_type * const p_src ,
              const index_type & indexes )
  {
    gather ( p_src , indexes ) ;
  }

  // store saves the content of the vector to memory

  void store ( value_type * const p_trg ) const
  {
    to_base().copy_to ( p_trg ,
                        std::experimental::element_aligned_tag() ) ;
  }

  // scatter is the reverse operation to gather

#ifdef GS_LAMBDA

  template < typename index_type >
  void scatter ( value_type * const p_trg ,
                 const index_type & indexes ) const
  {
    // gen-type c'tor is only used for side effects; let the compiler
    // figure out that the result is unused.

    base_t dummy ( [&] ( const size_t & i )
                   { return p_trg [ indexes[i] ] = to_base()[i] ; } ) ;
  }

#else

  template < typename index_type >
  void scatter ( value_type * const p_trg ,
                 const index_type & indexes ) const
  {
    for ( std::size_t i = 0 ; i < vsize ; i++ )
      p_trg [ indexes [ i ] ] = (*this)[i] ;
  }

#endif

  // 'regular' gather and scatter, accessing strided memory so that the
  // first address visited is p_src/p_trg, and successive addresses are
  // 'step' apart - in units of T. Might also be done with goading, the
  // loop should autovectorize.

  void rgather ( const value_type * const p_src ,
                 const std::size_t & step )
  {
    auto indexes = IndexesFrom ( 0 , step ) ;
    gather ( p_src , indexes ) ;
  }

  void rscatter ( value_type * p_trg ,
                  const std::size_t & step ) const
  {
    auto indexes = IndexesFrom ( 0 , step ) ;
    scatter ( p_trg , indexes ) ;
  }

  // apply functions from namespace std to each element in a vector,
  // or to each corresponding set of elements in a set of vectors
  // - going up to three for fma.
  // many standard functions autovectorize well. Note that the
  // autovectorization of standard functions often needs additional
  // compiler flags, like, e.g., -fno-math-errno for clang++, to
  // produce hardware SIMD instructions.

  #define BROADCAST_STD_FUNC(FUNC) \
    friend simd_type FUNC ( simd_type arg ) \
    { \
      return FUNC ( arg.to_base() ) ; \
    }

    // TODO: getting zero back for negative args, hence no BROADCAST_STD_FUNC
    // this happens with clang++ only, I opened an issue with VcDevel/std-simd:
    // https://github.com/VcDevel/std-simd/issues/31

//   BROADCAST_STD_FUNC(abs)

  friend simd_type abs ( simd_type arg )
  {
    arg ( arg < 0 ) = - arg ;
    return arg ;
  }

  BROADCAST_STD_FUNC(trunc)

  BROADCAST_STD_FUNC(round)
  BROADCAST_STD_FUNC(floor)
  BROADCAST_STD_FUNC(ceil)
  BROADCAST_STD_FUNC(log)
  BROADCAST_STD_FUNC(exp)
  BROADCAST_STD_FUNC(sqrt)

  // the support for autovectorization of trigonometric functions is
  // sketchy - e.g. the clang++ reference does not mention them as
  // functions which autovectorize. Vc offers hand-coded trigonometric
  // functions which might be worth while porting to std::simd, but so
  // far I haven't seen this happen. In my application, this results in
  // bad performance when these functions are used.

  BROADCAST_STD_FUNC(tan)
  BROADCAST_STD_FUNC(asin)
  BROADCAST_STD_FUNC(acos)
  BROADCAST_STD_FUNC(atan)

  // TODO: odd: with clang++, sin and cos don't perform as expected;
  // using a loop does the trick:

#ifdef __clang__

  friend simd_type cos ( simd_type arg )
  {
    simd_type result ;
    for ( std::size_t i = 0 ; i < size() ; i++ )
      result[i] = std::cos ( arg[i] ) ;
    return result ;
  }

  friend simd_type sin ( simd_type arg )
  {
    simd_type result ;
    for ( std::size_t i = 0 ; i < size() ; i++ )
      result[i] = std::sin ( arg[i] ) ;
    return result ;
  }

#else

  BROADCAST_STD_FUNC(sin)
  BROADCAST_STD_FUNC(cos)

#endif

  #undef BROADCAST_STD_FUNC

  #define BROADCAST_STD_FUNC2(FUNC) \
    friend simd_type FUNC ( simd_type arg1 , \
                            simd_type arg2 ) \
    { \
      return FUNC ( arg1.to_base() , arg2.to_base() ) ; \
    }

  // a short note on atan2: Vc provides a hand-written vectorized version
  // of atan2 which is especially fast and superior to autovectorized code.

  BROADCAST_STD_FUNC2(atan2)
  BROADCAST_STD_FUNC2(pow)

  #undef BROADCAST_STD_FUNC2

  #define BROADCAST_STD_FUNC3(FUNC) \
    friend simd_type FUNC ( simd_type arg1 , \
                            simd_type arg2 , \
                            simd_type arg3 ) \
    { \
      return FUNC ( arg1.to_base() , arg2.to_base() , arg3.to_base() ) ; \
    }

  BROADCAST_STD_FUNC3(fma)

  #undef BROADCAST_STD_FUNC3

  // macros used for the parameter 'CONSTRAINT' in the definitions
  // further down. Some operations are only allowed for integral types
  // or boolans. This might be enforced by enable_if, here we use a
  // static_assert with a clear error message.
  // TODO: might relax constraints by using 'std::is_convertible'

  #define INTEGRAL_ONLY \
    static_assert ( std::is_integral < value_type > :: value , \
                    "this operation is only allowed for integral types" ) ;

  #define BOOL_ONLY \
    static_assert ( std::is_same < value_type , bool > :: value , \
                    "this operation is only allowed for booleans" ) ;

  // augmented assignment operators. Some operators are only applicable
  // to specific data types, which is enforced by 'CONSTRAINT'.
  // One might consider widening the scope by making these operator
  // functions templates and accepting arbitrary indexable types.
  // Only value_type and simd_type itto_base() are taken as rhs arguments.

  #define OPEQ_FUNC(OPFUNC,OPEQ,CONSTRAINT) \
    simd_type & OPFUNC ( value_type rhs ) \
    { \
      CONSTRAINT \
      to_base() OPEQ rhs ; \
      return *this ; \
    } \
    simd_type & OPFUNC ( simd_type rhs ) \
    { \
      CONSTRAINT \
      to_base() OPEQ rhs.to_base() ; \
      return *this ; \
    }

  OPEQ_FUNC(operator+=,+=,)
  OPEQ_FUNC(operator-=,-=,)
  OPEQ_FUNC(operator*=,*=,)
  OPEQ_FUNC(operator/=,/=,)

  OPEQ_FUNC(operator%=,%=,INTEGRAL_ONLY)
  OPEQ_FUNC(operator&=,&=,INTEGRAL_ONLY)
  OPEQ_FUNC(operator|=,|=,INTEGRAL_ONLY)
  OPEQ_FUNC(operator^=,^=,INTEGRAL_ONLY)
  OPEQ_FUNC(operator<<=,<<=,INTEGRAL_ONLY)
  OPEQ_FUNC(operator>>=,>>=,INTEGRAL_ONLY)

  #undef OPEQ_FUNC

  // we use a simple scheme for type promotion: the promoted type
  // of two values should be the same as the type we would receive
  // when adding the two values. That's standard C semantics, but
  // it won't widen the result type to avoid overflow or increase
  // precision - such conversions have to be made by user code if
  // necessary.

  #define C_PROMOTE(A,B)  \
  typename std::conditional \
            < std::is_same < A , B > :: value , \
              A , \
              decltype (   std::declval < A > () \
                          + std::declval < B > () ) \
            > :: type

  // binary operators and left and right scalar operations with
  // value_type, unary operators -, ! and ~

  #define OP_FUNC(OPFUNC,OP,CONSTRAINT) \
  template < typename RHST , \
             typename = typename std::enable_if \
                       < std::is_fundamental < RHST > :: value \
                       > :: type \
           > \
  simd_type < C_PROMOTE ( value_type , RHST ) , size() > \
  OPFUNC ( simd_type < RHST , vsize > _rhs ) const \
  { \
    CONSTRAINT \
    simd_type < C_PROMOTE ( value_type , RHST ) , vsize > lhs ( *this ) ; \
    simd_type < C_PROMOTE ( value_type , RHST ) , vsize > rhs ( _rhs ) ; \
    return lhs.to_base() OP rhs.to_base() ; \
  } \
  template < typename RHST , \
             typename = typename std::enable_if \
                       < std::is_fundamental < RHST > :: value \
                       > :: type \
           > \
  simd_type < C_PROMOTE ( value_type , RHST ) , vsize > \
  OPFUNC ( RHST _rhs ) const \
  { \
    CONSTRAINT \
    simd_type < C_PROMOTE ( value_type , RHST ) , vsize > lhs ( *this ) ; \
    C_PROMOTE ( value_type , RHST ) rhs ( _rhs ) ; \
    return lhs.to_base() OP rhs ; \
  } \
  template < typename LHST , \
             typename = typename std::enable_if \
                       < std::is_fundamental < LHST > :: value \
                       > :: type \
           > \
  friend simd_type < C_PROMOTE ( LHST , value_type ) , vsize > \
  OPFUNC ( LHST _lhs , simd_type _rhs ) \
  { \
    CONSTRAINT \
    C_PROMOTE ( value_type , LHST ) lhs ( _lhs ) ; \
    simd_type < C_PROMOTE ( LHST , value_type ) , vsize > rhs ( _rhs ) ; \
    return lhs OP rhs.to_base() ; \
  }

  // binary operators and left and right scalar operations with
  // value_type, unary operators -, ! and ~

  // #define OP_FUNC(OPFUNC,OP,CONSTRAINT) \
  //   simd_type OPFUNC ( simd_type rhs ) const \
  //   { \
  //     CONSTRAINT \
  //     return this->to_base() OP rhs.to_base() ; \
  //   } \
  //   simd_type OPFUNC ( value_type rhs ) const \
  //   { \
  //     CONSTRAINT \
  //     return this->to_base() OP rhs ; \
  //   } \
  //   friend simd_type OPFUNC ( value_type lhs , \
  //                             simd_type rhs ) \
  //   { \
  //     CONSTRAINT \
  //     return lhs OP rhs.to_base() ; \
  //   }
  //
  OP_FUNC(operator+,+,)
  OP_FUNC(operator-,-,)
  OP_FUNC(operator*,*,)
  OP_FUNC(operator/,/,)

  OP_FUNC(operator%,%,INTEGRAL_ONLY)
  OP_FUNC(operator&,&,INTEGRAL_ONLY)
  OP_FUNC(operator|,|,INTEGRAL_ONLY)
  OP_FUNC(operator^,^,INTEGRAL_ONLY)
  OP_FUNC(operator<<,<<,INTEGRAL_ONLY)
  OP_FUNC(operator>>,>>,INTEGRAL_ONLY)

  OP_FUNC(operator&&,&&,BOOL_ONLY)
  OP_FUNC(operator||,||,BOOL_ONLY)

  #undef OP_FUNC

  #define OP_FUNC(OPFUNC,OP,CONSTRAINT) \
    simd_type OPFUNC() const \
    { \
      return OP this->to_base() ; \
    }

  OP_FUNC(operator-,-,)
  OP_FUNC(operator!,!,BOOL_ONLY)
  OP_FUNC(operator~,~,INTEGRAL_ONLY)

  #undef OP_FUNC

  // provide methods to produce a mask on comparing a vector
  // with another vector or a value_type.

  #define COMPARE_FUNC(OP,OPFUNC) \
  friend mask_type OPFUNC ( simd_type lhs , \
                            simd_type rhs ) \
  { \
    return lhs.to_base() OP rhs.to_base() ; \
  } \
  friend mask_type OPFUNC ( simd_type lhs , \
                            value_type rhs ) \
  { \
    return lhs.to_base() OP rhs ; \
  } \
  friend mask_type OPFUNC ( value_type lhs , \
                            simd_type rhs ) \
  { \
    return lhs OP rhs.to_base() ; \
  }

  COMPARE_FUNC(<,operator<) ;
  COMPARE_FUNC(<=,operator<=) ;
  COMPARE_FUNC(>,operator>) ;
  COMPARE_FUNC(>=,operator>=) ;
  COMPARE_FUNC(==,operator==) ;
  COMPARE_FUNC(!=,operator!=) ;

  #undef COMPARE_FUNC

  // note: std::simd's mask_type has associated functions any_of, all_of
  // and none_of, so we needn't define them for this backend

  // next we define a masked vector as an object holding two references:
  // one reference to a mask type, determining which of the vector's
  // elements will be 'open' to an effect, and one reference to a vector,
  // which will be affected by the operation.
  // The resulting object will only be viable as long as the referred-to
  // mask and vector are alive - it's meant as a construct to be processed
  // in the same scope, as the lhs of an assignment, typically using
  // notation introduced by Vc: a vector's operator() is overloaded to
  // to produce a masked_type when called with a mask_type object, and
  // the resulting masked_type object is then assigned to.
  // Note that this does not have any effect on those values in 'whither'
  // for which the mask is false. They remain unchanged.

  typedef std::experimental::where_expression < mask_type , base_t > we_t ;

  struct masked_type
  {
    mask_type whether ;   // if the mask is true at whether[i]
    simd_type & whither ; // whither[i] will be assigned to

    masked_type ( mask_type _whether ,
                  simd_type & _whither )
    : whether ( _whether ) ,
      whither ( _whither )
      { }

    // for the masked vector, we define the complete set of assignments:

    #define OPEQ_FUNC(OPFUNC,OPEQ,CONSTRAINT) \
      simd_type & OPFUNC ( value_type rhs ) \
      { \
        CONSTRAINT \
        we_t ( whether , whither.to_base() ) OPEQ rhs ; \
        return whither ; \
      } \
      simd_type & OPFUNC ( simd_type rhs ) \
      { \
        CONSTRAINT \
        we_t ( whether , whither.to_base() ) OPEQ rhs.to_base() ; \
        return whither ; \
      }

    OPEQ_FUNC(operator=,=,)
    OPEQ_FUNC(operator+=,+=,)
    OPEQ_FUNC(operator-=,-=,)
    OPEQ_FUNC(operator*=,*=,)
    OPEQ_FUNC(operator/=,/=,)
    OPEQ_FUNC(operator%=,%=,INTEGRAL_ONLY)
    OPEQ_FUNC(operator&=,&=,INTEGRAL_ONLY)
    OPEQ_FUNC(operator|=,|=,INTEGRAL_ONLY)
    OPEQ_FUNC(operator^=,^=,INTEGRAL_ONLY)
    OPEQ_FUNC(operator<<=,<<=,INTEGRAL_ONLY)
    OPEQ_FUNC(operator>>=,>>=,INTEGRAL_ONLY)

  #undef OPEQ_FUNC

  #undef INTEGRAL_ONLY
  #undef BOOL_ONLY
  
  } ;

  // mimicking Vc, we define operator() with a mask_type argument
  // to produce a masked_type object, which can be used later on to
  // masked-assign to the referred-to vector. With this definition
  // we can use the same syntax Vc uses, e.g. v1 ( v1 > v2 ) = v3
  // This helps write code which compiles with Vc and without,
  // because this idiom is 'very Vc'.

  masked_type operator() ( mask_type mask )
  {
    return masked_type ( mask , *this ) ;
  }

  // member functions at_least and at_most. These functions provide the
  // same functionality as max, or min, respectively. Given simd_type X
  // and some threshold Y, X.at_least ( Y ) == max ( X , Y )
  // Having the functionality as a member function makes it easy to
  // implement, e.g., min as: min ( X , Y ) { return X.at_most ( Y ) ; }

  #define CLAMP(FNAME,REL) \
    simd_type FNAME ( simd_type threshold ) const \
    { \
      return REL ( to_base() , threshold.to_base() ) ; \
    } \
    simd_type FNAME ( value_type threshold ) const \
    { \
      return REL ( to_base() , threshold ) ; \
    } \

  CLAMP(at_least,max)
  CLAMP(at_most,min)

  #undef CLAMP

  // sum of vector elements. Note that there is no type promotion; the
  // summation is done to value_type. Caller must make sure that overflow
  // is not a problem.

  value_type sum() const
  {
    value_type s ( 0 ) ;
    for ( std::size_t e = 0 ; e < vsize ; e++ )
      s += (*this) [ e ] ;
    return s ;
  }
} ;

} ;

#endif // #define VSPLINE_SIMD_TYPE_H