File: sumthreshold.h

Package: aoflagger 3.4.0-4

#ifndef SUMTHRESHOLD_H
#define SUMTHRESHOLD_H

#include <cstddef>
#include <cstdlib>  // for free(), used as the unique_ptr deleter below
#include <cstring>
#include <memory>

#include "../structures/image2d.h"
#include "../structures/mask2d.h"

namespace algorithms {

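// Implementation of the SumThreshold RFI detection method: a run of Length
// consecutive samples is flagged in its entirety when its sum exceeds
// Length * threshold (roughly; already-flagged samples receive special
// treatment in the implementation), with passes along both the horizontal
// (time) and vertical (frequency) axis of the dynamic spectrum.
//
// Typical use, as a sketch (image, mask and scratch are assumed to be
// pre-constructed Image2D/Mask2D objects of matching size, and the window
// length of 8 is arbitrary):
//
//   SumThreshold::VerticalScratch vScratch(image.Width(), image.Height());
//   SumThreshold::HorizontalLarge(&image, &mask, &scratch, 8, hThreshold);
//   SumThreshold::VerticalLarge(&image, &mask, &scratch, &vScratch, 8,
//                               vThreshold);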
class SumThreshold {
 public:
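  // Scratch buffers reused across calls by the vertical SumThreshold
  // kernels, avoiding a reallocation on every invocation.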
  struct VerticalScratch {
    VerticalScratch();
    VerticalScratch(size_t width, size_t height);
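    // Running state for the vertical pass. The buffers are released with
    // free(), hence the custom deleter; the allocation presumably uses a
    // malloc-family (possibly aligned) allocator in the implementation file.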
    std::unique_ptr<int[], decltype(&free)> lastFlaggedPos;
    std::unique_ptr<num_t[], decltype(&free)> sum;
    std::unique_ptr<int[], decltype(&free)> count;
  };

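  // SumThreshold passes over a window of Length samples, along the time
  // (Horizontal) or frequency (Vertical) axis.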
  template <size_t Length>
  static void Horizontal(const Image2D* input, Mask2D* mask, num_t threshold);

  template <size_t Length>
  static void Vertical(const Image2D* input, Mask2D* mask, num_t threshold);

  template <size_t Length>
  static void HorizontalLarge(const Image2D* input, Mask2D* mask,
                              Mask2D* scratch, num_t threshold);

/* We always want to compile the SSE code for 64-bit Intel. Note that this
   code is only executed if the CPU the binary runs on supports SSE. However,
   it can only be compiled successfully if either __SSE__ is defined or if
   we're on 64-bit Intel (since we're not cross-compiling). */
#if defined(__SSE__) || defined(__x86_64__)
  template <size_t Length>
  __attribute__((target("sse"))) static void VerticalLargeSSE(
      const Image2D* input, Mask2D* mask, Mask2D* scratch, num_t threshold);

  __attribute__((target("sse"))) static void VerticalLargeSSE(
      const Image2D* input, Mask2D* mask, Mask2D* scratch, size_t length,
      num_t threshold);

  template <size_t Length>
  __attribute__((target("sse"))) static void HorizontalLargeSSE(
      const Image2D* input, Mask2D* mask, Mask2D* scratch, num_t threshold);

  __attribute__((target("sse"))) static void HorizontalLargeSSE(
      const Image2D* input, Mask2D* mask, Mask2D* scratch, size_t length,
      num_t threshold);

#endif  // defined(__SSE__) || defined(__x86_64__)

/* We always want to compile the AVX2 code for 64-bit Intel. Note that this
   code is only executed if the CPU the binary runs on supports AVX2. However,
   it can only be compiled successfully if either __AVX2__ is defined or if
   we're on 64-bit Intel (since we're not cross-compiling). */
#if defined(__AVX2__) || defined(__x86_64__)
  template <size_t Length>
  __attribute__((target("avx2"))) static void VerticalLargeAVX(
      const Image2D* input, Mask2D* mask, Mask2D* scratch, num_t threshold);

  __attribute__((target("avx2"))) static void VerticalLargeAVX(
      const Image2D* input, Mask2D* mask, Mask2D* scratch, size_t length,
      num_t threshold);

  __attribute__((target("avx2"))) static void HorizontalAVXDumas(
      const Image2D* input, Mask2D* mask, size_t length, num_t threshold);

  __attribute__((target("avx2"))) static void VerticalAVXDumas(
      const Image2D* input, Mask2D* mask, VerticalScratch* scratch,
      size_t length, num_t threshold);

  template <size_t Length>
  __attribute__((target("avx2"))) static void HorizontalAVXDumas(
      const Image2D* input, Mask2D* mask, num_t threshold);

  template <size_t Length>
  __attribute__((target("avx2"))) static void VerticalAVXDumas(
      const Image2D* input, Mask2D* mask, VerticalScratch* scratch,
      num_t threshold);

#endif  // defined(__AVX2__) || defined(__x86_64__)

  template <size_t Length>
  static void VerticalLarge(const Image2D* input, Mask2D* mask, Mask2D* scratch,
                            num_t threshold);

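  // Convenience wrapper: one horizontal pass followed by one vertical pass,
  // each with its own threshold.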
  template <size_t Length>
  static void Large(const Image2D* input, Mask2D* mask, num_t hThreshold,
                    num_t vThreshold) {
    HorizontalLarge<Length>(input, mask, hThreshold);
    VerticalLarge<Length>(input, mask, vThreshold);
  }

  static void VerticalLarge(const Image2D* input, Mask2D* mask, Mask2D* scratch,
                            VerticalScratch* vScratch, size_t length,
                            num_t threshold) {
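    // Runtime dispatch: prefer AVX2, then SSE, then the portable reference
    // implementation. __builtin_cpu_supports() tests the CPU the binary is
    // actually running on, which is why the SIMD kernels can be compiled
    // unconditionally on x86-64 yet are only called when supported.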
#if defined(__AVX2__) || defined(__x86_64__)
    if (__builtin_cpu_supports("avx2")) {
      VerticalAVXDumas(input, mask, vScratch, length, threshold);
      return;
    }
#endif
#if defined(__SSE__) || defined(__x86_64__)
    if (__builtin_cpu_supports("sse")) {
      VerticalLargeSSE(input, mask, scratch, length, threshold);
      return;
    }
#endif
    VerticalLargeReference(input, mask, scratch, length, threshold);
  }

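  // Portable scalar implementations, used when neither AVX2 nor SSE is
  // available at runtime.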
  static void VerticalLargeReference(const Image2D* input, Mask2D* mask,
                                     Mask2D* scratch, size_t length,
                                     num_t threshold);

  static void HorizontalLargeReference(const Image2D* input, Mask2D* mask,
                                       Mask2D* scratch, size_t length,
                                       num_t threshold);

  static void HorizontalLarge(const Image2D* input, Mask2D* mask,
                              Mask2D* scratch, size_t length, num_t threshold) {
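    // Same runtime dispatch as VerticalLarge above; with AVX2 available, the
    // Dumas kernel is used only for long windows (length >= 64) and shorter
    // windows take the SSE path.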
#if defined(__AVX2__) || defined(__x86_64__)
    if (__builtin_cpu_supports("avx2")) {
      if (length >= 64)
        HorizontalAVXDumas(input, mask, length, threshold);
      else
        HorizontalLargeSSE(input, mask, scratch, length, threshold);
      return;
    }
#endif
#if defined(__SSE__) || defined(__x86_64__)
    if (__builtin_cpu_supports("sse")) {
      HorizontalLargeSSE(input, mask, scratch, length, threshold);
      return;
    }
#endif
    HorizontalLargeReference(input, mask, scratch, length, threshold);
  }
};

}  // namespace algorithms

#endif  // SUMTHRESHOLD_H