File: transform.h

/************************************************************************/
/*                                                                      */
/*    vspline - a set of generic tools for creation and evaluation      */
/*              of uniform b-splines                                    */
/*                                                                      */
/*            Copyright 2015 - 2023 by Kay F. Jahnke                    */
/*                                                                      */
/*    The git repository for this software is at                        */
/*                                                                      */
/*    https://bitbucket.org/kfj/vspline                                 */
/*                                                                      */
/*    Please direct questions, bug reports, and contributions to        */
/*                                                                      */
/*    kfjahnke+vspline@gmail.com                                        */
/*                                                                      */
/*    Permission is hereby granted, free of charge, to any person       */
/*    obtaining a copy of this software and associated documentation    */
/*    files (the "Software"), to deal in the Software without           */
/*    restriction, including without limitation the rights to use,      */
/*    copy, modify, merge, publish, distribute, sublicense, and/or      */
/*    sell copies of the Software, and to permit persons to whom the    */
/*    Software is furnished to do so, subject to the following          */
/*    conditions:                                                       */
/*                                                                      */
/*    The above copyright notice and this permission notice shall be    */
/*    included in all copies or substantial portions of the             */
/*    Software.                                                         */
/*                                                                      */
/*    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND    */
/*    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES   */
/*    OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND          */
/*    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT       */
/*    HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,      */
/*    WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING      */
/*    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR     */
/*    OTHER DEALINGS IN THE SOFTWARE.                                   */
/*                                                                      */
/************************************************************************/

/*! \file transform.h
   
    \brief set of generic remap, transform and apply functions
   
    My foremost reason to have efficient B-spline processing is the
    formulation of generic remap-like functions. remap() is a function
    which takes an array of real-valued nD coordinates and an interpolator
    over a source array. Now each of the real-valued coordinates is fed
    into the interpolator in turn, yielding a value, which is placed in
    the output array at the same place the coordinate occupies in the
    coordinate array. To put it concisely, if we have
   
    - c, the coordinate array (or 'warp' array, 'arguments' array)
    - a, the source array (containing 'original' or 'knot point' data)
    - i, the interpolator over a
    - j, a coordinate into both c and t
    - t, the target array (receiving the 'result' of the remap)
   
    remap defines the operation
   
    t[j] = i(c[j]) for all j
   
    Now we widen the concept of remapping to a 'transform' function.
    Instead of limiting the process to the use of an 'interpolator',
    we use an arbitrary unary functor transforming incoming values to
    outgoing values, where the type of the incoming and outgoing values
    is determined by the functor. If the functor actually is an
    interpolator, we have a 'true' remap transforming coordinates
    into values, but this is merely a special case. So here we have:
   
    - c, an array containing input values
    - f, a unary functor converting input to output values
    - j, a coordinate into c and t
    - t, the target array
   
    transform performs the operation
   
    t[j] = f(c[j]) for all j
   
    remaps/transforms to other-dimensional objects are supported.
    This makes it possible to, for example, remap from a volume to a
    2D image, using a 2D coordinate array containing 3D coordinates
    ('slicing' a volume)
   
    There is also a variant of this transform function in this file,
    which doesn't take an input array. Instead, for every target
    location, the location's discrete coordinates are passed to the
    unary_functor type object. This way, transformation-based remaps
    can be implemented easily: the user code just has to provide a
    suitable functor to yield values for coordinates. This functor
    receives the discrete incoming coordinates (into the target array)
    and takes it from there, eventually producing values of the target
    array's value_type.

    Here we have:
   
    - f, a unary functor converting discrete coordinates to output values
    - j, a discrete coordinate into t
    - t, the target array
   
    'index-based' transform performs the operation
   
    t[j] = f(j) for all j
   
    This file also has code to evaluate a b-spline at coordinates in a 
    grid, which can be used for scaling, and for separable geometric
    transformations.
   
    Finally there is a function to restore the original data from a
    b-spline to the precision possible with the given data type and
    degree of the spline. This is done with a separable convolution,
    using a unit-stepped sampling of the basis function as the
    convolution kernel along every axis.
   
    Let me reiterate the strategy used to perform the transforms and
    remaps in this file. The approach is functional: A 'processing chain'
    is set up and encoded as a functor providing two evaluation functions:
    one for 'single' data and one for vectorized data. This functor is
    applied to the data by 'wielding' code, which partitions the data
    into several jobs to be performed by individual worker threads,
    invokes the worker threads, and, once in the worker thread, feeds the
    data to the functor in turn, using hardware vectorization if possible.
    So while at the user code level a single call to some 'transform'
    or 'remap' routine is issued, passing in arrays of data and functors,
    all the 'wielding' is done automatically without any need for the user
    to even be aware of it, while still using highly efficient vector code
    with a thread pool, potentially speeding up the operation greatly as 
    compared to single-threaded, unvectorized operation, of course
    depending on the presence of several cores and vector units. On
    my system (Haswell 4-core i5 with AVX2) the speedup is about one
    order of magnitude. The only drawback is the production of a
    hardware-specific binary if vectorization is used. Both Vc use
    and multithreading can be easily activated/deactivated by #define
    switches, providing a clean fallback solution to produce code suitable
    for any target, even simple single-core machines with no vector units.
    Vectorization will be used if possible - either explicit vectorization
    (by defining USE_VC) or autovectorization by default.
    Defining VSPLINE_SINGLETHREAD will disable multithreading.
    The code accessing multithreading and/or Vc use is #ifdeffed, so if
    these features are disabled, their code 'disappears' and the relevant 
    headers are not included, nor do the corresponding libraries have to
    be present.
*/

// TODO: don't multithread or reduce number of jobs for small data sets

#ifndef VSPLINE_TRANSFORM_H
#define VSPLINE_TRANSFORM_H

#include "multithread.h" // vspline's multithreading code
#include "eval.h"        // evaluation of b-splines
#include "poles.h"
#include "convolve.h"

// The bulk of the implementation of vspline's two 'transform' functions
// is now in wielding.h:

#include "wielding.h"

namespace vspline {

/// implementation of two-array transform using wielding::coupled_wield.
///
/// 'array-based' transform takes two template arguments:
///
/// - 'unary_functor_type', which is a class satisfying the interface
///   laid down in unary_functor.h. Typically, this would be a type 
///   inheriting from vspline::unary_functor, but any type will do as
///   long as it provides the required typedefs and the relevant
///   eval() routines.
///
/// - the dimensionality of the input and output array
///
/// this overload of transform takes three parameters:
///
/// - a reference to a const unary_functor_type object providing the
///   functionality needed to generate values from arguments.
///
/// - a reference to a const MultiArrayView holding arguments to feed to
///   the unary functor object. It has to have the same shape as the target
///   array and contain data of the unary_functor's 'in_type'.
///
/// - a reference to a MultiArrayView to use as a target. This is where the
///   resulting data are put, so it has to contain data of unary_functor's
///   'out_type'. It has to have the same shape as the input array.
///
/// transform can be used without template arguments, they will be inferred
/// by ATD from the arguments.
///
/// trailing the argument list, all transform overloads have two further
/// parameters with default values, which are only used for special purposes:
///
/// - njobs gives the number of worker threads to be used when multithreading
///   the transform. The default results in using as many worker threads as
///   there are worker threads in the thread pool. This is a generous number,
///   and several times the number of physical cores, but this benefits
///   performance. Passing '1' here will execute the transform in 'this'
///   thread rather than invoking a single worker thread from the pool.
///
/// - p_cancel, when passed, has to point to a std::atomic<bool> which
///   is initially 'true' but can be (atomically) set to 'false' by calling
///   code. This will result in the transform stopping 'at the next convenient
///   point', usually after having processed a complete line or line segment
///   of data, resulting in an invalid result. While the result can not be
///   used, this is a graceful way out if the calling code finds that
///   continuing with the transform is futile - for instance because the
///   user asks for the whole program to terminate. The cancellation still
///   takes 'a while' to 'trickle down' to all worker threads.

template < typename unary_functor_type ,
           unsigned int dimension >
void transform ( const unary_functor_type & functor ,
                 const vigra::MultiArrayView
                     < dimension ,
                       typename unary_functor_type::in_type
                     > & input ,
                  vigra::MultiArrayView
                     < dimension ,
                       typename unary_functor_type::out_type
                     > & output ,
                  int njobs = vspline::default_njobs ,
                  vspline::atomic < bool > * p_cancel = 0
               )
{
  // check shape compatibility
  
  if ( output.shape() != input.shape() )
  {
    throw vspline::shape_mismatch
     ( "transform: the shapes of the input and output array do not match" ) ;
  }

  // wrap the vspline::unary_functor to be used with wielding code.
  // The wrapper is necessary because the code in wielding.h feeds
  // arguments as TinyVectors, even if the data are 'singular'.
  // The wrapper simply reinterpret_casts any TinyVectors of one
  // element to their corresponding value_type before calling the
  // functor.

  typedef wielding::vs_adapter < unary_functor_type > coupled_functor_type ;
  coupled_functor_type coupled_functor ( functor ) ;

  // we'll cast the pointers to the arrays to these types to be
  // compatible with the wrapped functor above.

  typedef typename coupled_functor_type::in_type src_type ;
  typedef typename coupled_functor_type::out_type trg_type ;

  typedef vigra::MultiArrayView < dimension , src_type > src_view_type ;
  typedef vigra::MultiArrayView < dimension , trg_type > trg_view_type ;

  wielding::coupled_wield < coupled_functor_type , dimension >
    ( coupled_functor ,
      (src_view_type*)(&input) ,
      (trg_view_type*)(&output) ,
      njobs ,
      p_cancel
    ) ;
}
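
// The following is a minimal usage sketch for the two-array overload
// of transform(). 'double_it' is a hypothetical functor introduced
// only for this illustration; it relies on the standard
// vspline::unary_functor argument type system and uses a templated
// eval() serving both the scalar and the vectorized signature. The
// calling code assumes the vspline and vigra headers are included.

/*

struct double_it
: public vspline::unary_functor < float , float >
{
  template < typename IN , typename OUT >
  void eval ( const IN & in , OUT & out ) const
  {
    out = in + in ; // works for single floats and SIMD vectors alike
  }
} ;

void call_transform()
{
  vigra::MultiArray < 2 , float > in ( vigra::Shape2 ( 640 , 480 ) ) ;
  vigra::MultiArray < 2 , float > out ( in.shape() ) ;

  // the template arguments are inferred by ATD from the arguments

  vspline::transform ( double_it() , in , out ) ;
}

*/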

/// reduce function processing by value. sink_functor_type is a functor
/// taking values (which are loaded from an array). The reduce function,
/// by delegation to wielding::value_reduce, performs the feeding part
/// by parcelling the data, the functor receives either single in_type
/// or vectorized data. The functor has to be a construct capable of
/// accumulating data with several threads in parallel. A typical
/// example would be a functor cumulating to some internal structure
/// and 'offloading' to a mutex-protected pooling target during its
/// destructor. The wielding code creates a copy of the functor which
/// is passed here for each worker thread and guarantees that this
/// copy's destructor is executed before the call to reduce returns,
/// so the user can be assured of having collected the partial results
/// of all worker threads in the pooling target. The functor needs a
/// copy constructor to communicate the pooling target to the copies
/// made for the individual threads, see the index-based reduce function
/// below this one for an illustration of the simple and elegant mechanism.
/// Note that sink_functor will normally be a type inheriting from
/// vspline::sink_functor, a close relative of vspline::unary_functor
/// and also defined in unary_functor.h. This inheritance does not
/// produce any functionality, just the standard vspline functor
/// argument type system, which this code relies on. Of course the
/// sink functor can be coded 'manually' as well - inheriting from
/// vspline::sink_functor is merely convenient.

template < typename sink_functor_type ,
           unsigned int dimension >
void reduce ( const sink_functor_type & functor ,
              const vigra::MultiArrayView
                  < dimension ,
                    typename sink_functor_type::in_type
                  > & input ,
              int njobs = vspline::default_njobs ,
              vspline::atomic < bool > * p_cancel = 0
             )
{
  // wrap the vspline::sink_functor to be used with wielding code.
  // The wrapper is necessary because the code in wielding.h feeds
  // all arguments as TinyVectors, even if the data are 'singular'.
  // The wrapper simply reinterpret_casts any TinyVectors of one
  // element to the corresponding value_type before calling the
  // functor. At the same time, this function reinterpret_casts
  // the incoming array to an array of TinyVectors, even if the
  // data in the array are fundamentals. This is because the
  // wielding code expects an array of TinyVectors in every case.

  typedef wielding::vs_sink_adapter
            < sink_functor_type > adapted_functor_type ;    
  adapted_functor_type adapted_functor ( functor ) ;
  
  // we'll cast the pointers to the array to this type to be
  // compatible with the wrapped functor above.

  typedef typename adapted_functor_type::in_type src_type ;
  
  typedef vigra::MultiArrayView < dimension , src_type > src_view_type ;
  
  // now delegate to the wielding code

  wielding::value_reduce < adapted_functor_type , dimension >
    ( adapted_functor ,
      (src_view_type*)(&input) ,
      njobs ,
      p_cancel
    ) ;
}

/// reduce function processing by coordinate. sink_functor_type is
/// a functor receiving integral coordinates, with no fixed meaning,
/// but usually pertaining to an array. The reduce function feeds
/// all coordinates encompassed by the shape, either in vectorized
/// or single form. Here's an example of a functor for this function:

/*

template < typename coordinate_type > struct count_items
: public vspline::sink_functor < coordinate_type >
{
  typedef vspline::sink_functor < coordinate_type > base_type ;
  using typename base_type::in_type ;
  using typename base_type::in_v ;
  using base_type::vsize ;
  
  // reference to a thread-safe 'pooling target'

  std::atomic < long > & collector ;
  long sum ;

  // this c'tor is used to create the initial count_items object and
  // fixes the reference to the 'pooling target'

  count_items ( std::atomic < long > & _collector )
  : collector ( _collector )
  { sum = 0 ; }

  // count_items needs a copy c'tor to propagate the reference to
  // the 'pooling target'. Note that copy assignment won't work.

  count_items ( const count_items & other )
  : collector ( other.collector )
  { sum = 0 ; }

  // finally, two operator() overloads, one for scalar input and
  // one for SIMD input.

  void operator() ( const in_type & crd )
  { sum ++ ; }

  void operator() ( const in_v & crd )
  { sum += vsize ; }

  ~count_items()
  { collector += sum ; }
} ;

*/

/// implementation of index-based transform using wielding::index_wield
///
/// this overload of transform() is very similar to the first one, but
/// instead of picking input from an array, it feeds the discrete coordinates
/// of the successive places data should be rendered to to the
/// unary_functor_type object.
///
/// This sounds complicated, but is really quite simple. Let's assume you have
/// a 2X3 output array to fill with data. When this array is passed to transform,
/// the functor will be called with every coordinate pair in turn, and the result
/// the functor produces is written to the output array. So for the example given,
/// with 'ev' being the functor, we have this set of operations:
///
/// output [ ( 0 , 0 ) ] = ev ( ( 0 , 0 ) ) ;
///
/// output [ ( 1 , 0 ) ] = ev ( ( 1 , 0 ) ) ;
///
/// output [ ( 2 , 0 ) ] = ev ( ( 2 , 0 ) ) ;
///
/// output [ ( 0 , 1 ) ] = ev ( ( 0 , 1 ) ) ;
///
/// output [ ( 1 , 1 ) ] = ev ( ( 1 , 1 ) ) ;
///
/// output [ ( 2 , 1 ) ] = ev ( ( 2 , 1 ) ) ;
///
/// this transform overload takes one template argument:
///
/// - 'unary_functor_type', which is a class satisfying the interface laid
///   down in unary_functor.h. This is an object which can provide values
///   given *discrete* coordinates, like class evaluator, but generalized
///   to allow for arbitrary ways of achieving its goal. The unary functor's
///   'in_type' determines the number of dimensions of the coordinates - since
///   they are coordinates into the target array, the functor's input type
///   has to have the same number of dimensions as the target. The functor's
///   'out_type' has to be the same as the data type of the target array, since
///   the target array stores the results of calling the functor.
///
/// this transform overload takes two parameters:
///
/// - a reference to a const unary_functor_type object providing the
///   functionality needed to generate values from discrete coordinates
///
/// - a reference to a MultiArrayView to use as a target. This is where the
///   resulting data are put.
///
/// Please note that vspline holds with vigra's coordinate handling convention,
/// which puts the fastest-changing index first. In a 2D image processing
/// context, this is the column index, or the x coordinate. C and C++, by
/// contrast, put this index last when using multidimensional array access code.
///
/// transform can be used without template arguments, they will be inferred
/// by ATD from the arguments.
///
/// See the remarks on the two trailing parameters given with the two-array
/// overload of transform.

template < class unary_functor_type >
void transform ( const unary_functor_type & functor ,
                 vigra::MultiArrayView
                        < unary_functor_type::dim_in ,
                          typename unary_functor_type::out_type
                        > & output ,
                 int njobs = vspline::default_njobs ,
                 vspline::atomic < bool > * p_cancel = 0 )
{
  enum { dimension = unary_functor_type::dim_in } ;
  
  // wrap the vspline::unary_functor to be used with wielding code

  typedef wielding::vs_adapter < unary_functor_type > index_functor_type ;    
  index_functor_type index_functor ( functor ) ;
  
  // we'll cast the pointer to the target array to this type to be
  // compatible with the wrapped functor above

  typedef typename index_functor_type::out_type trg_type ;
  typedef vigra::MultiArrayView < dimension , trg_type > trg_view_type ;
  
  // now delegate to the wielding code

  wielding::index_wield < index_functor_type , dimension >
    ( index_functor ,
      (trg_view_type*)(&output) ,
      njobs ,
      p_cancel ) ;
}
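
// a brief sketch of the index-based transform at work. The functor
// 'sum_of_indexes' is hypothetical and only for illustration; using
// an integral out_type keeps the scalar and the vectorized eval()
// identical, so a single templated eval() serves both signatures.

/*

struct sum_of_indexes
: public vspline::unary_functor < vigra::TinyVector < int , 2 > , int >
{
  template < typename IN , typename OUT >
  void eval ( const IN & crd , OUT & result ) const
  {
    result = crd[0] + crd[1] ;
  }
} ;

void call_index_transform()
{
  vigra::MultiArray < 2 , int > target ( vigra::Shape2 ( 3 , 2 ) ) ;

  // the functor is called with every discrete coordinate into
  // 'target', and the result is stored at that coordinate:

  vspline::transform ( sum_of_indexes() , target ) ;

  // now target [ ( x , y ) ] == x + y
}

*/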

/// while the function above fills an array by calling a functor with
/// the target coordinates, this function accumulates the results by
/// means of a sink_functor. The effect is as if an index-based transform
/// was executed first, filling an intermediate array, followed by a call
/// to reduce taking the intermediate array as input. The combination of
/// both operations 'cuts out' the intermediate. Since this function does
/// not have a target array, the shape has to be communicated explicitly.
/// The functor will be invoked for every possible coordinate within the
/// shape's scope. This is done by the function 'index_reduce' in wielding.h.
/// The type of the argument 'shape' is derived from the sink_functor's
/// 'dim_in' value, to make the code 'respect the singular': if the data
/// are 1D, reduce will accept a single integer as 'shape', whereas nD
/// data require a TinyVector of int.
/// For suitable sink_functors, see the remarks and example given above
/// in the comments for the array-based overload of 'reduce'.

template < typename sink_functor_type >
void reduce ( const sink_functor_type & functor ,
              typename std::conditional
                     < ( sink_functor_type::dim_in == 1 ) ,
                       int ,
                       vigra::TinyVector < int , sink_functor_type::dim_in >
                     > :: type shape ,
              int njobs = vspline::default_njobs ,
              vspline::atomic < bool > * p_cancel = 0
            )
{
  enum { dimension = sink_functor_type::dim_in } ;

  // adapt the functor to be compatible with the wielding code

  typedef wielding::vs_sink_adapter
            < sink_functor_type > adapted_functor_type ;

  adapted_functor_type adapted_functor ( functor ) ;

  // the wielding code expects all shape information as TinyVectors,
  // even if the data are 1D:

  typename sink_functor_type::in_nd_ele_type nd_shape ( shape ) ;

  wielding::index_reduce < adapted_functor_type , dimension >
    ( adapted_functor ,
      nd_shape ,
      njobs ,
      p_cancel
    ) ;
}
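
// a sketch of how the count_items functor given above might be used
// with this overload of reduce, assuming <atomic> and <cassert> are
// included. The shape argument is a TinyVector of int, since the
// functor's coordinates are 2D.

/*

void call_index_reduce()
{
  std::atomic < long > collector ( 0 ) ;

  count_items < vigra::TinyVector < int , 2 > > counter ( collector ) ;

  // feed every discrete coordinate in a 100X100 shape to the functor

  vspline::reduce ( counter , vigra::TinyVector < int , 2 > ( 100 , 100 ) ) ;

  // when reduce returns, the destructors of all per-thread copies
  // have run, so 'collector' holds the complete count:

  assert ( collector.load() == 100 * 100 ) ;
}

*/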

/// we code 'apply' as a special variant of 'transform' where the output
/// is also used as input, so the effect is to feed the unary functor
/// each 'output' value in turn, let it process it and store the result
/// back to the same location. While this looks like a rather roundabout
/// way of performing an apply, it has the advantage of using the same
/// type of functor (namely one with const input and writable output),
/// rather than a different functor type which modifies its argument
/// in-place. While, at this level, using such a functor looks like a
/// feasible idea, it would require specialized code 'further down the
/// line' when complex functors are built with vspline's functional
/// programming tools: the 'apply-capable' functors would need to read
/// the output values first and write them back after anyway, resulting
/// in the same sequence of loads and stores as we get with the current
/// 'fake apply' implementation.
/// From the direct delegation to the two-array overload of 'transform',
/// you can see that this is merely syntactic sugar.

template < typename unary_functor_type  , // functor to apply
           unsigned int dimension >       // input/output array's dimension
void apply ( const unary_functor_type & ev ,
             vigra::MultiArrayView
                    < dimension ,
                      typename unary_functor_type::out_type >
                    & output ,
             int njobs = vspline::default_njobs ,
             vspline::atomic < bool > * p_cancel = 0
           )
{
  // make sure the functor's input and output type are the same

  static_assert ( std::is_same < typename unary_functor_type::in_type ,
                                 typename unary_functor_type::out_type > :: value ,
                  "apply: functor's input and output type must be the same" ) ;

  // delegate to transform

  transform ( ev , output , output , njobs , p_cancel ) ;
}
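
// a usage sketch for apply(): scale an array's values in place.
// 'halve' is a hypothetical functor; note how its in_type and
// out_type are the same, as apply() requires.

/*

struct halve
: public vspline::unary_functor < float , float >
{
  template < typename IN , typename OUT >
  void eval ( const IN & in , OUT & out ) const
  {
    out = in * 0.5f ;
  }
} ;

void call_apply()
{
  vigra::MultiArray < 2 , float > data ( vigra::Shape2 ( 640 , 480 ) ) ;

  // each value in 'data' is replaced by half its value

  vspline::apply ( halve() , data ) ;
}

*/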

/// a type for a set of boundary condition codes, one per axis

template < unsigned int dimension >
using bcv_type = vigra::TinyVector < vspline::bc_code , dimension > ;

/// Implementation of 'classic' remap, which directly takes an array of
/// values and remaps it, internally creating a b-spline of given order
/// just for the purpose. This is used for one-shot remaps where the spline
/// isn't reused, and specific to b-splines, since the functor used is a
/// b-spline evaluator. The spline defaults to a cubic b-spline with
/// mirroring on the bounds.
///
/// So here we have the 'classic' remap, where the input array holds
/// coordinates and the functor used is actually an interpolator. Since
/// this is merely a special case of using transform(), we delegate to
/// transform() once we have the evaluator.
///
/// The template arguments are chosen to allow the user to call 'remap'
/// without template arguments; the template arguments can be found by ATD
/// by looking at the MultiArrayViews passed in.
///
/// - original_type is the value_type of the array holding the 'original' data over
///   which the interpolation is to be performed
///
/// - result_type is the value_type of the array taking the result of the remap, 
///   namely the values produced by the interpolation. these data must have as
///   many channels as original_type
///
/// - coordinate_type is the type for coordinates at which the interpolation is to
///   be performed. coordinates must have as many components as the input array
///   has dimensions.
///
/// optionally, remap takes a set of boundary condition values and a spline
/// degree, to allow creation of splines for specific use cases beyond the
/// default. I refrain from extending the argument list further; user code with
/// more specific requirements will have to create an evaluator and use 'transform'.
///
/// Note that remap can be called without template arguments, the types will
/// be inferred by ATD from the arguments passed in.
///
/// See the remarks on the two trailing parameters given with the two-array
/// overload of transform.

template < typename original_type ,   // data type of original data
           typename result_type ,     // data type for interpolated data
           typename coordinate_type , // data type for coordinates
           unsigned int cf_dimension ,  // dimensionality of original data
           unsigned int trg_dimension , // dimensionality of result array
           int bcv_dimension = cf_dimension > // see below. g++ ATD needs this.
void remap ( const vigra::MultiArrayView
                          < cf_dimension , original_type > & input ,
             const vigra::MultiArrayView
                          < trg_dimension , coordinate_type > & coordinates ,
             vigra::MultiArrayView
                    < trg_dimension , result_type > & output ,
             bcv_type < bcv_dimension > bcv
              = bcv_type < bcv_dimension > ( MIRROR ) ,
             int degree = 3 ,
             int njobs = vspline::default_njobs ,
             vspline::atomic < bool > * p_cancel = 0 )
{
  static_assert (    vigra::ExpandElementResult < original_type > :: size
                  == vigra::ExpandElementResult < result_type > :: size ,
                  "input and output data type must have same nr. of channels" ) ;
                  
  static_assert (    cf_dimension
                  == vigra::ExpandElementResult < coordinate_type > :: size ,
                  "coordinate type must have the same dimension as input array" ) ;

  // this is silly, but when specifying bcv_type < cf_dimension >, the code failed
  // to compile with g++. So I use a separate template argument bcv_dimension
  // and static_assert it's the same as cf_dimension. TODO this sucks...
                  
  static_assert (    cf_dimension
                  == bcv_dimension ,
                  "boundary condition specification needs same dimension as input array" ) ;

  // check shape compatibility
  
  if ( output.shape() != coordinates.shape() )
  {
    throw shape_mismatch 
    ( "the shapes of the coordinate array and the output array must match" ) ;
  }

  // get a suitable type for the b-spline's coefficients

  typedef typename vigra::PromoteTraits < original_type , result_type >
                          :: Promote _cf_type ;
                          
  typedef typename vigra::NumericTraits < _cf_type >
                          :: RealPromote cf_type ;
  
  // create the bspline object
  
  typedef typename vspline::bspline < cf_type , cf_dimension > spline_type ;
  spline_type bsp ( input.shape() , degree , bcv ) ;
  
  // prefilter, taking data in 'input' as knot point data
  
  bsp.prefilter ( input ) ;

  // since this is a commodity function, we use a 'safe' evaluator.
  // If maximum performance is needed and the coordinates are known to be
  // in range, user code should create a 'naked' vspline::evaluator and
  // use it with vspline::transform.
  // Note how we pass in 'rc_type', the elementary type of a coordinate.
  // We want to allow the user to pass float or double coordinates.
  
  typedef typename vigra::ExpandElementResult < coordinate_type > :: type rc_type ;
  
  auto ev = vspline::make_safe_evaluator < spline_type , rc_type > ( bsp ) ;
  
  // call transform(), passing in the evaluator,
  // the coordinate array and the target array
  
  transform ( ev , coordinates , output , njobs , p_cancel ) ;
}
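
// a minimal sketch of a 'classic' remap with the defaults (cubic
// b-spline, mirror boundary conditions). The coordinate array and
// the target must agree in shape; the array holding the original
// data may be shaped (and even dimensioned) differently.

/*

void call_remap()
{
  vigra::MultiArray < 2 , float > original ( vigra::Shape2 ( 7 , 5 ) ) ;

  vigra::MultiArray < 2 , vigra::TinyVector < float , 2 > >
    coordinates ( vigra::Shape2 ( 3 , 2 ) ) ;

  vigra::MultiArray < 2 , float > target ( coordinates.shape() ) ;

  // ... fill 'original' and 'coordinates' as needed, then:

  vspline::remap ( original , coordinates , target ) ;
}

*/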

// next we have code for evaluation of b-splines over grids of coordinates.
// This code lends itself to some optimizations, since part of the weight
// generation used in the evaluation process is redundant, and by
// precalculating all redundant values and referring to the precalculated
// values during the evaluation a good deal of time can be saved - provided
// that the data involved are nD.

// TODO: as in separable convolution, it might be profitable here to apply
// weights for one axis to the entire array, then repeat with the other axes
// in turn. storing, modifying and rereading the array may still
// come out faster than the rather expensive DDA needed to produce the
// value with weighting in all dimensions applied at once, as the code
// below does (by simply applying the weights in the innermost eval
// which class evaluator offers). The potential gain ought to increase with
// the dimensionality of the data.

// for evaluation over grids of coordinates, we use a vigra::TinyVector
// of 1D MultiArrayViews holding the component coordinates for each
// axis.
// When default-constructed, this object holds default-constructed
// MultiArrayViews, which, when assigned another MultiArrayView,
// will hold another view over the data, rather than copying them.
// initially I was using a small array of pointers for the purpose,
// but that is potentially unsafe and does not allow passing strided
// data.

template < unsigned int dimension , typename rc_ele_type = float >
using grid_spec =
vigra::TinyVector
  < vigra::MultiArrayView < 1 , rc_ele_type > ,
    dimension
  > ;
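
// a brief sketch of setting up a grid_spec, relying on the view
// rebinding behaviour described above: assigning to the default-
// constructed views in the TinyVector makes them refer to the
// per-axis coordinate data rather than copying anything.

/*

void make_grid()
{
  vigra::MultiArray < 1 , float > gx ( vigra::Shape1 ( 4 ) ) ;
  vigra::MultiArray < 1 , float > gy ( vigra::Shape1 ( 3 ) ) ;

  // ... fill gx and gy with per-axis coordinate components ...

  vspline::grid_spec < 2 , float > grid ;

  grid [ 0 ] = gx ; // grid[0] is now a view over gx's data
  grid [ 1 ] = gy ;
}

*/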

// the implementation of grid-based evaluation is in namespace detail;
// I assume that this code is not very useful for users and it would
// only clutter the vspline namespace.

namespace detail
{

/// we need a 'generator functor' to implement grid_eval using the code
/// in wielding.h.
/// this functor precalculates the b-spline evaluation weights corresponding
/// to the coordinates in the grid and stores them in vectorized format,
/// to speed up their use as much as possible.

template < typename ET >
struct grev_generator
{
  typedef ET evaluator_type ;

  static const size_t dimension = evaluator_type::dimension ;
  static const size_t vsize = evaluator_type::vsize ;
  static const size_t channels = evaluator_type::channels ;
  
  typedef typename evaluator_type::inner_type inner_type ;
  typedef typename evaluator_type::math_ele_type weight_type ;

  typedef typename evaluator_type::out_type out_type ;
  typedef typename evaluator_type::out_ele_type out_ele_type ;
  typedef typename evaluator_type::out_nd_ele_type out_nd_ele_type ;
  typedef typename evaluator_type::out_v out_v ;
  typedef typename evaluator_type::out_ele_v out_ele_v ;
  typedef typename evaluator_type::out_nd_ele_v out_nd_ele_v ;
  
  typedef typename vspline::vector_traits
                   < weight_type , vsize > :: type weight_v ;
  typedef typename evaluator_type::rc_ele_type rc_type ;
  typedef vigra::MultiArrayView
          < dimension , typename evaluator_type::trg_type > target_type ;
  typedef vigra::TinyVector < size_t , dimension > shape_type ;

  typedef typename vspline::vector_traits
                   < int , vsize > :: type ofs_ele_v ;

  typedef typename vspline::grid_spec < dimension , rc_type > grid_spec_type ;

  const int spline_order ; // order of the b-spline
  const shape_type shape ; // shape of result array
  const grid_spec_type & grid ; // the grid coordinates
  const evaluator_type & itp ; // b-spline evaluator
  const size_t axis ;       // axis to process, other axes will vary
  const size_t aggregates ; // number of full weight/offset packets
  const size_t rest ;       // number of leftover single coordinates
  const size_t w_pack_sz ;  // number of weight vectors in a packet

  int socket_offset ; // cumulated offset for axes != axis

  // keeps track of ownership of allocated data: it's set to 'this'
  // when the buffers are allocated in the c'tor. If default copy
  // construction or default assignment take place, memory_guard
  // and this don't match anymore and the destructor does not free
  // the data.

  void * memory_guard ;

  // dimension X pointers to sets of weights. Each set of weights
  // consists of spline_order single weights, and there are as many
  // sets as there are corresponding coordinates in the grid spec.

  weight_type * grid_weight [ dimension ] ;

  // for the dimension 'axis', we keep a reshuffled copy of the
  // weights, so that we have groups of spline_order weight vectors
  // followed by single weight sets which don't fit a vectorized
  // weight pack (the latter might be dropped)
  
  weight_type * shuffled_weights ;
  
  // per dimension, one offset per coordinate:

  int * grid_ofs [ dimension ] ;

  // We'll count the vectorized packets up, this state variable
  // holds the next packet's number

  size_t current_pack ;

  // tail_crd will hold the nD coordinate of the first coordinate
  // along the grid coordinates for axis which did not fit into
  // a full packet, so that single-value processing can start there
  // after peeling

  shape_type tail_crd ;
  
  // set of weights for a single coordinate

  vigra::MultiArray < 2 , weight_type > weight ;

  // have storage ready for one set of vectorized weights
  
  using allocator_t
  = typename vspline::allocator_traits < weight_v > :: type ;
  
  vigra::MultiArray < 2 , weight_v , allocator_t > vweight ;
  
  // reset is called when starting another line. The packet number
  // is reset to zero, tail_crd is set to the first unvectorized
  // position (starting point after peeling is complete). Then those
  // components of the coordinate 'crd' which are *not* 'axis' are
  // broadcast to the corresponding vectors in 'vweight', which
  // holds the current packet; they will remain constant for the
  // whole line, whereas the row in vweight corresponding to 'axis'
  // will receive fresh values for each (vectorized) iteration.
  // The extra argument _aggregates will usually be the same as
  // 'aggregates', but a smaller number may be specified here to
  // process subsets (used for 1D operation, TODO needs testing)

  void reset ( shape_type crd , size_t _aggregates )
  {
    // we require starting coordinates to be a multiple of vsize for
    // the processing axis, so that we can fetch vectorized weights,
    // which have been stored in batches of vsize, starting at 0.
    // If we'd allow 'odd' starting coordinates, we'd have to first
    // process a few single coordinates before switching to vectors,
    // which would complicate the logic.

    assert ( crd [ axis ] % vsize == 0 ) ;

    current_pack = crd [ axis ] / vsize ;
    tail_crd [ axis ] = crd [ axis ] + _aggregates * vsize ;
    socket_offset = 0 ;

    for ( size_t d = 0 ; d < dimension ; d++ )
    {
      if ( d != axis )
      {
        tail_crd [ d ] = crd [ d ] ;
        socket_offset += grid_ofs [ d ] [ crd [ d ] ] ;
        for ( size_t k = 0 ; k < spline_order ; k++ )
        {
          vweight [ vigra::Shape2 ( k , d ) ]
            = weight [ vigra::Shape2 ( k , d ) ]
            = grid_weight [ d ] [ crd [ d ] * spline_order + k ] ;
        }
      }
    }
  }

  // fetch_weights copies a set of vectorized weights from
  // 'shuffled_weights' (where it has been put in vectorized form
  // to be picked up by fast SIMD loads, rather than having to
  // deinterleave the data from grid_weight [ axis ])

  void fetch_weights()
  {
    auto p_wgt = shuffled_weights + current_pack * w_pack_sz ;
    auto w_line = vweight.bindAt ( 1 , axis ) ;
    for ( auto & wvr : w_line )
    {
      wvr.load ( p_wgt ) ;
      p_wgt += vsize ;
    }
  }

  // the c'tor sets up all invariant data of the generator. The most
  // important of these are the precalculated weights, which allow for
  // fast evaluation, avoiding the weight generation which is otherwise
  // needed for every evaluation locus.
  // Note that grev_generator objects may be default-copied, retaining their
  // validity.

  grev_generator ( const vspline::grid_spec < dimension , rc_type > & _grid ,
                   const evaluator_type & _itp ,
                   size_t _axis ,
                   shape_type _shape
                 )
  : grid ( _grid ) ,
    itp ( _itp ) ,
    axis ( _axis ) ,
    shape ( _shape ) ,
    aggregates ( _shape [ _axis ] / vsize ) ,
    rest ( _shape [ _axis ] % vsize ) ,
    w_pack_sz ( _itp.inner.get_order() * vsize ) ,
    spline_order ( _itp.inner.get_order() ) ,
    vweight ( vigra::Shape2 ( _itp.inner.get_order() , dimension ) ) ,
    weight ( vigra::Shape2 ( _itp.inner.get_order() , dimension ) ) ,
    memory_guard ( this )
  {
    // get some metrics from the b-spline evaluator needed to translate
    // coordinates to offsets in memory

    vigra::TinyVector < int , dimension >
      estride ( itp.inner.get_estride() ) ;
    
    // allocate space for the per-axis weights and offsets

    for ( int d = 0 ; d < dimension ; d++ )
    {
      grid_weight[d] = new weight_type [ spline_order * shape [ d ] ] ;
      grid_ofs[d] = new int [ shape [ d ] ] ;
    }

    // fill in the weights and offsets, using the interpolator's
    // split() to split the coordinates received in grid_coordinate,
    // the interpolator's obtain_weights() method to produce the
    // weight components, and the strides of the coefficient array
    // to convert the integral parts of the coordinates into offsets.

    for ( int d = 0 ; d < dimension ; d++ )
    {
      int select ;   // integral part of coordinate
      rc_type tune ; // delta, so that select + tune == crd [ d ]

      // split all coordinates received in the grid spec into
      // integral and remainder part, then use the remainder part
      // to obtain a set of weights. The integral part of the
      // coordinates is transformed to an offset in memory.

      for ( int c = 0 ; c < shape [ d ] ; c++ )
      {
        itp.inner.split
            ( grid [ d ] [ c ] , select , tune ) ;

        itp.inner.obtain_weights
            ( grid_weight [ d ] + spline_order * c , d , tune ) ;

        grid_ofs [ d ] [ c ] = select * estride [ d ] ;
      }
    }
    
    // reshuffle the weights along axis 'axis' to vectorized order
    // so that the evaluation can pick them up with SIMD loads.
    // This might be coded more efficiently, but since this
    // calculation is done only once for a whole grid, it's no big deal.
    
    shuffled_weights = new weight_type [ spline_order * shape [ axis ] ] ;

    if ( shape [ axis ] >= vsize )
    {
      for ( size_t a = 0 ; a < aggregates ; a++ )
      {
        auto source = grid_weight[axis] + a * w_pack_sz ;
        auto target = shuffled_weights + a * w_pack_sz ;

        for ( size_t e = 0 ; e < vsize ; e++ )
        {
          for ( size_t k = 0 ; k < spline_order ; k++ )
          {
            target [ vsize * k + e ] = source [ e * spline_order + k ] ; 
          }
        }
      }
    }

    // fill up with weights which didn't fit a pack. might be omitted.

    for ( size_t i = aggregates * w_pack_sz ;
          i < spline_order * shape [ axis ] ;
          i++ )
      shuffled_weights [ i ] = grid_weight [ axis ] [ i ] ;
  }

  // evaluate, using the current packet of weights/offsets. This
  // blindly trusts that the calling code keeps track of the number of
  // packets which can be processed during the peeling phase.

  template < typename = std::enable_if < ( vsize > 1 ) > >
  void eval ( out_v & result )
  {
    // pick up the varying weights into the current package

    fetch_weights() ;

    // load current set offsets for 'axis'

    ofs_ele_v select ;

    select.load ( grid_ofs [ axis ] + current_pack * vsize ) ;

    // add offsets for the remaining axes; these are broadcast,
    // there is only a single offset instead of a whole vector of
    // offsets needed for 'axis'.

    select += socket_offset ;

    // a bit of type artistry to provide a reference to an nD
    // result type, even if the data are single-channel:
    // trg_t is an nD type even for single-channel data, so
    // vector_traits produces an nD vectorized type.
    // The inner evaluator which is used for evaluation uses
    // 'synthetic' data types for all data, and we want to
    // use it for evaluation, so we have to present 'result'
    // in proper guise.

    typedef typename inner_type::trg_type trg_t ;
    typedef typename vspline::vector_traits
                     < trg_t , vsize > :: type trg_v ;

    trg_v & nd_result = reinterpret_cast < trg_v & > ( result ) ;
    
    // finally we perform the evaluation

    itp.inner.eval ( select , vweight , nd_result ) ;

    // and advance current_pack for the next cycle

    current_pack++ ;
  }

  // here we have the single-value evaluation which is used after
  // all full vectorized packs have been processed

  void eval ( out_type & result )
  {
    auto c = tail_crd [ axis ] ;

    int select = socket_offset + grid_ofs [ axis ] [ c ] ;

    for ( int k = 0 ; k < spline_order ; k++ )
    {
      weight [ vigra::Shape2 ( k , axis ) ] =
        grid_weight [ axis ] [ c * spline_order + k ] ;
    }

    // move to the 'synthetic' type

    typedef typename inner_type::trg_type trg_t ;
    trg_t & nd_result = reinterpret_cast < trg_t & > ( result ) ;
    itp.inner.eval ( select , weight , nd_result ) ;

    // count up coordinate along 'axis'

    ++ tail_crd [ axis ] ;
  }

  // evaluation into a buffer. Here the data for one line are produced
  // all at once.
  
  void eval ( vigra::MultiArrayView < 1 , out_ele_v > & vresult ,
              vigra::MultiArrayView < 1 , out_type > & leftover )
  {
    assert ( vresult.shape(0) == aggregates * channels ) ;
    assert ( leftover.shape(0) == rest ) ;
    
    // TODO: would be nice to simply have a MultiArrayView of
    // aggregates * out_v, but that crashes
    // hence the detour via the nD type and storing (and reloading)
    // individual vectors

    out_v vr ;
    out_nd_ele_v & ndvr = reinterpret_cast < out_nd_ele_v & > ( vr ) ;

    for ( size_t a = 0 ; a < aggregates ; a++ )
    {
      eval ( vr ) ;
      for ( size_t ch = 0 ; ch < channels ; ch++ )
        vresult [ a * channels + ch ] = ndvr [ ch ] ;
    }
    for ( size_t a = 0 ; a < rest ; a++ )
    {
      eval ( leftover[a] ) ;
    }
  }

  void eval ( vigra::MultiArrayView < 1 , out_type > & result )
  {
    assert ( result.shape(0) == shape [ axis ] ) ;
    
    for ( size_t a = 0 ; a < shape [ axis ] ; a++ )
    {
      eval ( result[a] ) ;
    }
  }

  // TODO: I also want an evaluator which only applies the weights along
  // 'axis', and no weights pertaining to the other axes - to obtain data
  // which are 'partially evaluated' along 'axis'. If this process is
  // repeated for each axis, the result should be the same. This should
  // be more efficient if evaluation loci in the grid are 'close' so that
  // multiple full evaluations share coefficients, and the advantage
  // should increase with dimensionality.

  ~grev_generator()
  {
    // clean up. memory_guard is only set in the c'tor, so copied objects
    // will not have matching memory_guard and this. Only the object which
    // has initially allocated the memory will free it.

    if ( memory_guard == this )
    {
      for ( int d = 0 ; d < dimension ; d++ )
      {
        delete[] grid_weight[d] ;
        delete[] grid_ofs[d] ;
      }
      delete[] shuffled_weights ;
    }
  }
} ;

/// given the generator functor 'grev_generator' (above), performing
/// the grid_eval becomes trivial: construct the generator and pass it
/// to the wielding code. The signature looks complex, but the types
/// needed can all be inferred by ATD - callers just need to pass a
/// grid_spec object, a b-spline evaluator and a target array to accept
/// the result - plus, optionally, a pointer to a cancellation atomic,
/// which, if given, may be set by the calling code to gracefully abort
/// the calculation at the next convenient occasion.
/// Note that grid_eval is specific to b-spline evaluation and will not
/// work with any evaluator type which is not a 'plain', 'raw' b-spline
/// evaluator. So, even functors gained by calling vspline's factory
/// functions (make_evaluator, make_safe_evaluator) will *not* work here,
/// since they aren't of type vspline::evaluator (they are in fact grok_type).
/// To use arbitrary functors on a grid, use gen_grid_eval, below.

template < typename functor_type >
void transform ( std::true_type , // functor is a b-spline evaluator
                 grid_spec < functor_type::dim_in ,
                             typename functor_type::in_ele_type > & grid ,
                 const functor_type & functor ,
                 vigra::MultiArrayView < functor_type::dim_in ,
                                         typename functor_type::out_type >
                   & result  ,
                 int njobs = vspline::default_njobs ,
                 vspline::atomic < bool > * p_cancel = 0
               )
{
  // create the generator functor

  grev_generator < functor_type > gg ( grid ,
                                       functor ,
                                       0 ,
                                       result.shape() ) ;

  // and 'wield' it

  wielding::generate_wield ( gg , result , njobs , p_cancel ) ;
}

/// grid_eval_functor is used for 'generalized' grid evaluation, where
/// the functor passed in is not a bspline evaluator. For this general
/// case, we can't precalculate anything to speed things up, all we
/// need to do is pick the grid values and put them together to form
/// arguments for calls to the functor.

template < typename _inner_type ,
           typename ic_type = int >
struct grid_eval_functor
: public vspline::unary_functor
         < typename vspline::canonical_type
                    < ic_type , _inner_type::dim_in > ,
           typename _inner_type::out_type ,
           _inner_type::vsize >
{
  typedef _inner_type inner_type ;
  
  enum { vsize = inner_type::vsize } ;
  enum { dimension = inner_type::dim_in } ;
  
  typedef typename vspline::canonical_type
                    < ic_type , dimension > in_type ;
                    
  typedef typename inner_type::out_type out_type ;
  
  typedef typename inner_type::in_ele_type rc_type ;
  
  typedef typename vspline::unary_functor
          < in_type , out_type , vsize > base_type ;
  
  static_assert ( std::is_integral < ic_type > :: value ,
                  "grid_eval_functor: must use integral coordinates" ) ;
  
  const inner_type inner ;
  
  typedef grid_spec < inner_type::dim_in ,
                      typename inner_type::in_ele_type > grid_spec_t ;

  grid_spec_t grid ;

  grid_eval_functor ( grid_spec_t & _grid ,
                      const inner_type & _inner )
  : grid ( _grid ) ,
    inner ( _inner )
  { } ;
  
  void eval ( const in_type & c , out_type & result ) const
  {
    typename inner_type::in_type cc ;
    
    // for uniform access, we use reinterpretations of the coordinates
    // as nD types, even if they are only 1D. This is only used to
    // fill in 'cc', the coordinate to be fed to 'inner'.

    typedef typename base_type::in_nd_ele_type nd_ic_type ;
    typedef typename inner_type::in_nd_ele_type nd_rc_type ;
  
    const nd_ic_type & nd_c ( reinterpret_cast < const nd_ic_type & > ( c ) ) ;
    nd_rc_type & nd_cc ( reinterpret_cast < nd_rc_type & > ( cc ) ) ;
    
    for ( int d = 0 ; d < dimension ; d++ )
      nd_cc [ d ] = grid [ d ] [ nd_c[d] ] ;
    
    inner.eval ( cc , result ) ;
  }
  
  template < typename = typename std::enable_if < ( vsize > 1 ) > :: type >
  void eval ( const typename base_type::in_v & c ,
              typename base_type::out_v & result ) const
  {
    typename inner_type::in_v cc ;
    
    typedef typename base_type::in_nd_ele_v nd_ic_v ;
    typedef typename inner_type::in_nd_ele_v nd_rc_v ;

    const nd_ic_v & nd_c ( reinterpret_cast < const nd_ic_v & > ( c ) ) ;
    nd_rc_v & nd_cc ( reinterpret_cast < nd_rc_v & > ( cc ) ) ;
    
    // TODO: we might optimize in two ways:
    // if the grid data are contiguous, we can issue a gather,
    // and if the coordinates above dimension 0 are equal for all e,
    // we can assign a scalar to nd_cc[d] for d > 0.

    for ( int d = 0 ; d < dimension ; d++ )
      for ( int e = 0 ; e < vsize ; e++ )
        nd_cc[d][e] = grid[d][ nd_c[d][e] ] ;
    
    inner.eval ( cc , result ) ;
  }
} ;
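
// To illustrate the scalar eval above: for a 2D inner functor and a
// grid holding { 1 , 2 } along axis 0 and { 3 , 4 } along axis 1, an
// incoming discrete coordinate c = ( 1 , 0 ) is translated to the
// 'real' coordinate
//
//   cc = ( grid [ 0 ] [ 1 ] , grid [ 1 ] [ 0 ] ) = ( 2 , 3 )
//
// which is then passed on to the inner functor's eval.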

/// generalized grid evaluation. The production of result values from
/// input values is done by an instance of grid_eval_functor, see above.
/// The template argument, functor_type, has to be a functor (usually
/// this will be a vspline::unary_functor). If the functor's in_type
/// has dim_in components, the grid_spec must also contain dim_in 1D
/// arrays, since the functor's input is put together by picking a
/// value from each of the arrays the grid_spec points to. The result
/// array must have dim_in dimensions as well.

template < typename functor_type >
void transform ( std::false_type , // functor is not a b-spline evaluator
                 grid_spec < functor_type::dim_in ,
                             typename functor_type::in_ele_type > & grid ,
                 const functor_type & functor ,
                 vigra::MultiArrayView < functor_type::dim_in ,
                                         typename functor_type::out_type >
                   & result  ,
                 int njobs = vspline::default_njobs ,
                 vspline::atomic < bool > * p_cancel = 0
                )
{
  grid_eval_functor < functor_type > gev ( grid , functor ) ;

  vspline::transform ( gev , result , njobs , p_cancel ) ;
}

} ; // namespace detail

/// This overload of 'transform' takes a vspline::grid_spec as its
/// first parameter. This object defines a grid where each grid point
/// is made up from components taken from the per-axis grid values.
/// Let's say the grid_spec holds
///
/// { 1 , 2 } and { 3 , 4 },
///
/// then we get grid points
///
/// ( 1 , 3 ) , ( 2 , 3 ) , ( 1 , 4 ) , ( 2 , 4 )
///
/// The grid points are taken as input for the functor, and the results
/// are stored in 'result'. So with the simple example above, result
/// would have to be a 2D array and after the transform it would contain
///
/// { { f ( ( 1 , 3 ) ) , f ( ( 2 , 3 ) ) } ,
///
///   { f ( ( 1 , 4 ) ) , f ( ( 2 , 4 ) ) } }
///
/// The grid specification obviously takes up less space than the whole
/// grid would occupy, and if the functor is a b-spline evaluator,
/// additional performance gain is possible by precalculating the
/// evaluation weights. The distinction is made here by dispatching to
/// two specific overloads in namespace detail, which provide the
/// implementation. Note that only functors of type vspline::evaluator
/// will qualify as b-spline evaluators. The functors produced by the
/// factory functions 'make_evaluator' and 'make_safe_evaluator' will
/// *not* qualify; they are of vspline::grok_type and even though they
/// have a b-spline evaluator 'somewhere inside' this can't be accessed
/// due to the type erasure.
///
/// See the remarks on the two trailing parameters given with the two-array
/// overload of transform.

template < typename functor_type >
void transform ( grid_spec < functor_type::dim_in ,
                             typename functor_type::in_ele_type > & grid ,
                 const functor_type & functor ,
                 vigra::MultiArrayView < functor_type::dim_in ,
                                         typename functor_type::out_type >
                   & result  ,
                 int njobs ,
                 vspline::atomic < bool > * p_cancel ,
                 std::false_type ) // is not 1D
{
  // make sure the grid specification has enough coordinates

  for ( int d = 0 ; d < functor_type::dim_in ; d++ )
    assert ( grid[d].size() >= result.shape ( d ) ) ;

  using uses_spline_t = typename
    std::is_base_of < bspline_evaluator_tag , functor_type > :: type ;
    
  detail::transform ( uses_spline_t() , grid , functor , result ,
                      njobs , p_cancel ) ;
}

/// overload of grid-based transform for 1D grids. Here the grid
/// specification holds just a single set of per-axis grid values,
/// so we simply delegate to the array-based 'transform', feeding it
/// that one array as input. This overload is here only for
/// completeness' sake - 'normal' user code will use an array-based
/// transform straight away.

template < typename functor_type >
void transform ( grid_spec < 1 ,
                             typename functor_type::in_ele_type > & grid ,
                 const functor_type & functor ,
                 vigra::MultiArrayView < 1 ,
                                         typename functor_type::out_type >
                   & result  ,
                 int njobs ,
                 vspline::atomic < bool > * p_cancel ,
                 std::true_type ) // is 1D
{
  assert ( grid[0].size() >= result.shape ( 0 ) ) ;
  vspline::transform ( functor , grid[0] , result , njobs , p_cancel ) ;
}

// this is the dispatch routine, differentiating between the 1D and
// the nD case. The two variants are above.

template < typename functor_type >
void transform ( grid_spec < functor_type::dim_in ,
                             typename functor_type::in_ele_type > & grid ,
                 const functor_type & functor ,
                 vigra::MultiArrayView < functor_type::dim_in ,
                                         typename functor_type::out_type >
                   & result  ,
                 int njobs = vspline::default_njobs ,
                 vspline::atomic < bool > * p_cancel = 0 )
{
  static const bool is_1d ( functor_type::dim_in == 1 ) ;

  transform ( grid , functor , result , njobs , p_cancel ,
              std::integral_constant < bool , is_1d > () ) ;
}
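
// To continue the { 1 , 2 } , { 3 , 4 } example from the comment
// further up - a sketch, where 'f' stands for any suitable 2D
// vspline::unary_functor taking float coordinates and producing
// float results:
//
//   vigra::MultiArray < 1 , float > gx ( vigra::Shape1 ( 2 ) ) ;
//   vigra::MultiArray < 1 , float > gy ( vigra::Shape1 ( 2 ) ) ;
//   gx ( 0 ) = 1 ; gx ( 1 ) = 2 ;
//   gy ( 0 ) = 3 ; gy ( 1 ) = 4 ;
//
//   vspline::grid_spec < 2 , float > grid ;
//   grid [ 0 ] = gx ;
//   grid [ 1 ] = gy ;
//
//   vigra::MultiArray < 2 , float > result ( vigra::Shape2 ( 2 , 2 ) ) ;
//   vspline::transform ( grid , f , result ) ;
//
//   // now result ( i , j ) == f ( ( gx ( i ) , gy ( j ) ) )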

/// for backward compatibility, we provide 'grid_eval'. This is deprecated
/// and will eventually go away; new code should use 'transform' above.

template < typename functor_type >
void grid_eval ( grid_spec < functor_type::dim_in ,
                             typename functor_type::in_ele_type > & grid ,
                 const functor_type & functor ,
                 vigra::MultiArrayView < functor_type::dim_in ,
                                         typename functor_type::out_type >
                   & result  ,
                 int njobs = vspline::default_njobs ,
                 vspline::atomic < bool > * p_cancel = 0 )
{
  transform ( grid , functor , result , njobs , p_cancel ) ;
}

/// for backward compatibility, we provide 'gen_grid_eval'. This is deprecated
/// and will eventually go away; new code should use 'transform' above.

template < typename functor_type >
void gen_grid_eval ( grid_spec < functor_type::dim_in ,
                             typename functor_type::in_ele_type > & grid ,
                     const functor_type & functor ,
                     vigra::MultiArrayView < functor_type::dim_in ,
                                             typename functor_type::out_type >
                       & result  ,
                     int njobs = vspline::default_njobs ,
                     vspline::atomic < bool > * p_cancel = 0 )
{
  transform ( grid , functor , result , njobs , p_cancel ) ;
}

/// restore restores the original data from the b-spline coefficients.
/// This is done efficiently using a separable convolution; the kernel
/// is simply a unit-spaced sampling of the basis function.
/// Since the filter uses internal buffering, using this routine
/// in-place is safe - meaning that 'target' may be bspl.core itself.
/// math_ele_type, the elementary data type used for the actual maths
/// on the buffered data, and the type the data are converted to when
/// they are placed into the buffer, can be chosen, but I could not
/// detect any real benefits from using anything but the default
/// (see below).
///
/// an alternative way to restore is running an index-based
/// transform with an evaluator for the spline. This is much
/// less efficient, but the effect is the same:
///
///   auto ev = vspline::make_evaluator ( bspl ) ;
///   vspline::transform ( ev , target ) ;
///
/// Note that vsize, the vectorization width, can be passed explicitly.
/// If Vc is in use and math_ele_type can be used with hardware
/// vectorization, the arithmetic will be done with Vc::SimdArrays
/// of the given size. Otherwise 'goading' will be used: the data are
/// presented as vspline::simd_type of vsize math_ele_type elements,
/// hoping that the compiler will autovectorize the operation.
///
/// 'math_ele_type', the type used for arithmetic inside the filter,
/// defaults to the vigra RealPromote type of value_type's elementary.
/// This ensures appropriate treatment of integral-valued splines.

// TODO hardcoded default vsize

template < unsigned int dimension ,
           typename value_type ,
           typename math_ele_type
             = typename vigra::NumericTraits
                        < typename vigra::ExpandElementResult
                                   < value_type > :: type 
                        > :: RealPromote ,
           size_t vsize
             = vspline::vector_traits < math_ele_type > :: size
         >
void restore ( const vspline::bspline < value_type , dimension > & bspl ,
               vigra::MultiArrayView < dimension , value_type > & target )
{
  if ( target.shape() != bspl.core.shape() )
    throw shape_mismatch
     ( "restore: spline's core shape and target array shape must match" ) ;

  if ( bspl.spline_degree < 2 )
  {
    // we can handle the degree 0 and 1 cases very efficiently,
    // since we needn't apply a filter at all. This is an
    // optimization, the filter code would still perform
    // correctly without it.

    if ( (void*) ( bspl.core.data() ) != (void*) ( target.data() ) )
    {
      // operation is not in-place, copy data to target
      target = bspl.core ;
    }
    return ;
  }
  
  // first assemble the arguments for the filter

  int degree = bspl.spline_degree ;
  int headroom = degree / 2 ;
  int ksize = headroom * 2 + 1 ;

  // KFJ 2019-09-18 now using new convenience function 'get_kernel'

  vigra::MultiArray < 1 , xlf_type > kernel ( ksize ) ;
  get_kernel ( degree , kernel ) ;
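
  // As an example, for a cubic b-spline (degree 3) headroom is 1 and
  // ksize is 3, and the kernel holds the cubic basis function sampled
  // at -1, 0 and 1, which comes out as ( 1/6 , 2/3 , 1/6 ).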

//   xlf_type kernel [ ksize ] ;
//
//   // pick the precomputed basis function values for the kernel.
//   // Note how the values in precomputed_basis_function_values
//   // (see poles.h) are provided at half-unit steps, hence the
//   // index acrobatics.
// 
//   for ( int k = - headroom ; k <= headroom ; k++ )
//   {
//     int pick = 2 * std::abs ( k ) ;
//     kernel [ k + headroom ]
//     = vspline_constants
//       ::precomputed_basis_function_values [ degree ]
//         [ pick ] ;
//   }
  
  // the arguments have to be passed one per axis. While most
  // arguments are the same throughout, the boundary conditions
  // may be different for each axis.

  std::vector < vspline::fir_filter_specs > vspecs ;
  
  for ( int axis = 0 ; axis < dimension ; axis++ )
  {
    vspecs.push_back 
      ( vspline::fir_filter_specs
        ( bspl.bcv [ axis ] , ksize , headroom , kernel.data() ) ) ;
  }
 
  // KFJ 2018-05-08 with the automatic use of vectorization the
  // distinction whether math_ele_type is 'vectorizable' or not
  // is no longer needed: simdized_type will be a Vc::SimdArray
  // if possible, a vspline::simd_type otherwise.
  
  typedef typename vspline::convolve
                            < vspline::simdized_type ,
                              math_ele_type ,
                              vsize
                            > filter_type ;
                            
  // now we have the filter's type; create an instance and
  // use it to effect the restoration of the original data.
  
  vspline::filter
  < value_type , value_type , dimension , filter_type >
  ( bspl.core , target , vspecs ) ;
}

/// overload of 'restore' which writes the result of the operation back
/// to the spline's core array. This looks like an in-place operation,
/// but the data are in fact moved to a buffer stripe by stripe, the
/// arithmetic is done on the buffer, and finally the buffer is written
/// back. This is repeated for each dimension of the array.

template < unsigned int dimension ,
           typename value_type ,
           typename math_ele_type
             = typename vigra::NumericTraits
                        < typename vigra::ExpandElementResult
                                   < value_type > :: type 
                        > :: RealPromote ,
           size_t vsize = vspline::vector_traits<math_ele_type>::size >
void restore
  ( vspline::bspline < value_type , dimension > & bspl )
{
  restore < dimension , value_type , math_ele_type , vsize >
    ( bspl , bspl.core ) ;
}

} ; // end of namespace vspline

#endif // VSPLINE_TRANSFORM_H