File: otbSampleAugmentation.h

package info (click to toggle)
otb 8.1.1%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 1,030,436 kB
  • sloc: xml: 231,007; cpp: 224,490; ansic: 4,592; sh: 1,790; python: 1,131; perl: 92; makefile: 72
file content (224 lines) | stat: -rw-r--r-- 6,931 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
/*
 * Copyright (C) 2005-2022 Centre National d'Etudes Spatiales (CNES)
 *
 * This file is part of Orfeo Toolbox
 *
 *     https://www.orfeo-toolbox.org/
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef otbSampleAugmentation_h
#define otbSampleAugmentation_h

#ifdef _OPENMP
#include <omp.h>
#endif

#include <vector>
#include <algorithm>
#include <random>
#include <ctime>
#include <cassert>

namespace otb
{

namespace sampleAugmentation
{
using SampleType       = std::vector<double>;
using SampleVectorType = std::vector<SampleType>;

/**
Estimate standard deviations of the components in one pass using
Welford's algorithm
*/
SampleType EstimateStds(const SampleVectorType& samples)
{
  const auto nbSamples    = samples.size();
  const long nbComponents = static_cast<long>(samples[0].size());
  SampleType stds(nbComponents, 0.0);
  SampleType means(nbComponents, 0.0);
  for (size_t i = 0; i < nbSamples; ++i)
  {
    auto norm_factor = 1.0 / (i + 1);
#ifdef _OPENMP
#pragma omp parallel for
#endif
    for (long j = 0; j < nbComponents; ++j)
    {
      const auto mu    = means[j];
      const auto x     = samples[i][j];
      auto       muNew = mu + (x - mu) * norm_factor;
      stds[j] += (x - mu) * (x - muNew);
      means[j] = muNew;
    }
  }
#ifdef _OPENMP
#pragma omp parallel for
#endif
  for (long j = 0; j < nbComponents; ++j)
  {
    stds[j] = std::sqrt(stds[j] / nbSamples);
  }
  return stds;
}

/** Create new samples by replicating input samples. We loop through
* the input samples and add them to the new data set until nbSamples
* are added. The elements of newSamples are removed before proceeding.
*/
void ReplicateSamples(const SampleVectorType& inSamples, const size_t nbSamples, SampleVectorType& newSamples)
{
  newSamples.resize(nbSamples);
  const long long nbSamplesLL = static_cast<long long>(nbSamples);
  size_t          imod{0};
#ifdef _OPENMP
#pragma omp parallel for
#endif
  for (long long i = 0; i < nbSamplesLL; ++i)
  {
    if (imod == inSamples.size())
      imod        = 0;
    newSamples[i] = inSamples[imod++];
  }
}

/** Create new samples by adding noise to existing samples. Gaussian
* noise is added to randomly selected samples. The standard deviation
* of the noise added to each component is the same as the one of the
* input variables divided by stdFactor (defaults to 10). The
* elements of newSamples are removed before proceeding.
*/
void JitterSamples(const SampleVectorType& inSamples, const size_t nbSamples, SampleVectorType& newSamples, float stdFactor = 10,
                   const int seed = std::time(nullptr))
{
  newSamples.resize(nbSamples);
  const long         nbComponents = static_cast<long>(inSamples[0].size());
  std::random_device rd;
  std::mt19937       gen(rd());
  // The input samples are selected randomly with replacement
  std::srand(seed);
  // We use one gaussian distribution per component since they may
  // have different stds
  auto                                         stds = EstimateStds(inSamples);
  std::vector<std::normal_distribution<double>> gaussDis(nbComponents);
#ifdef _OPENMP
#pragma omp parallel for
#endif
  for (long i   = 0; i < nbComponents; ++i)
    gaussDis[i] = std::normal_distribution<double>{0.0, stds[i] / stdFactor};

  for (size_t i = 0; i < nbSamples; ++i)
  {
    newSamples[i] = inSamples[std::rand() % inSamples.size()];
#ifdef _OPENMP
#pragma omp parallel for
#endif
    for (long j = 0; j < nbComponents; ++j)
      newSamples[i][j] += gaussDis[j](gen);
  }
}


struct NeighborType
{
  size_t index;
  double distance;
};

struct NeighborSorter
{
  constexpr bool operator()(const NeighborType& a, const NeighborType& b) const
  {
    return b.distance > a.distance;
  }
};

double ComputeSquareDistance(const SampleType& x, const SampleType& y)
{
  assert(x.size() == y.size());
  double dist{0};
  for (size_t i = 0; i < x.size(); ++i)
  {
    dist += (x[i] - y[i]) * (x[i] - y[i]);
  }
  return dist / (x.size() * x.size());
}

using NNIndicesType = std::vector<NeighborType>;
using NNVectorType  = std::vector<NNIndicesType>;
/** Returns the indices of the nearest neighbors for each input sample
*/
void FindKNNIndices(const SampleVectorType& inSamples, const size_t nbNeighbors, NNVectorType& nnVector)
{
  const long long nbSamples = static_cast<long long>(inSamples.size());
  nnVector.resize(nbSamples);
#ifdef _OPENMP
#pragma omp parallel for
#endif
  for (long long sampleIdx = 0; sampleIdx < nbSamples; ++sampleIdx)
  {
    NNIndicesType nns;
    for (long long neighborIdx = 0; neighborIdx < nbSamples; ++neighborIdx)
    {
      if (sampleIdx != neighborIdx)
        nns.push_back({static_cast<size_t>(neighborIdx), ComputeSquareDistance(inSamples[sampleIdx], inSamples[neighborIdx])});
    }
    std::partial_sort(nns.begin(), nns.begin() + nbNeighbors, nns.end(), NeighborSorter{});
    nns.resize(nbNeighbors);
    nnVector[sampleIdx] = std::move(nns);
  }
}

/** Generate the new sample in the line linking s1 and s2
*/
SampleType SmoteCombine(const SampleType& s1, const SampleType& s2, double position)
{
  auto result = s1;
  for (size_t i = 0; i < s1.size(); ++i)
    result[i]   = s1[i] + (s2[i] - s1[i]) * position;
  return result;
}

/** Create new samples using the SMOTE algorithm
Chawla, N. V., Bowyer, K. W., Hall, L. O., & Kegelmeyer, W. P., Smote:
synthetic minority over-sampling technique, Journal of artificial
intelligence research, 16(), 321–357 (2002).
http://dx.doi.org/10.1613/jair.953
*/
void Smote(const SampleVectorType& inSamples, const size_t nbSamples, SampleVectorType& newSamples, const int nbNeighbors, const int seed = std::time(nullptr))
{
  newSamples.resize(nbSamples);
  const long long nbSamplesLL = static_cast<long long>(nbSamples);
  NNVectorType    nnVector;
  FindKNNIndices(inSamples, nbNeighbors, nnVector);
  // The input samples are selected randomly with replacement
  std::srand(seed);
#ifdef _OPENMP
#pragma omp parallel for
#endif
  for (long long i = 0; i < nbSamplesLL; ++i)
  {
    const auto sampleIdx   = std::rand() % (inSamples.size());
    const auto sample      = inSamples[sampleIdx];
    const auto neighborIdx = nnVector[sampleIdx][std::rand() % nbNeighbors].index;
    const auto neighbor    = inSamples[neighborIdx];
    newSamples[i]          = SmoteCombine(sample, neighbor, std::rand() / double{RAND_MAX});
  }
}

} // end namespaces sampleAugmentation
} // end namespace otb

#endif