File: instrument2Histogram.cpp

package info (click to toggle)
groops 0%2Bgit20250907%2Bds-1
  • links: PTS, VCS
  • area: non-free
  • in suites: forky, sid
  • size: 11,140 kB
  • sloc: cpp: 135,607; fortran: 1,603; makefile: 20
file content (163 lines) | stat: -rw-r--r-- 6,868 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
/***********************************************/
/**
* @file instrument2Histogram.cpp
*
* @brief Compute a histogram (arc-wise) from an instrument file
*
* @author Andreas Kvas
* @date 2017-07-09
*/
/***********************************************/

// Latex documentation
#define DOCSTRING docstring
static const char *docstring = R"(
This program computes the arc-wise histogram from an \file{instrument file}{instrument}.
The output is a \file{matrix}{matrix} with the first column containing the lower bound of each bin.
The other columns contain the histograms for each arc.

\fig{!hb}{0.8}{instrument2Histogram}{fig:instrument2Histogram}{GRACE range-rate residuals of one month (one arc) divided into 50 bins.}
)";

/***********************************************/

#include "programs/program.h"
#include "files/fileMatrix.h"
#include "files/fileInstrument.h"

/***** CLASS ***********************************/

/** @brief Compute a histogram from a time series.
* @ingroup programsGroup */
class Instrument2Histogram
{
public:
  void run(Config &config, Parallel::CommunicatorPtr comm);
};

GROOPS_REGISTER_PROGRAM(Instrument2Histogram, PARALLEL, "compute a histogram from an instrument file", Instrument, Statistics)
GROOPS_RENAMED_PROGRAM(InstrumentComputeHistogram, Instrument2Histogram, date2time(2020, 7, 7))

/***********************************************/

void Instrument2Histogram::run(Config &config, Parallel::CommunicatorPtr comm)
{
  try
  {
    FileName         fileNameIn, fileNameOut;
    UInt             selectData;
    ExpressionVariablePtr exprMin, exprMax;
    Bool             relative = TRUE;
    Bool             cumulative = FALSE;
    UInt             binCount = 0;

    readConfig(config, "outputfileMatrix",     fileNameOut, Config::MUSTSET,  "",  "column 1: lower bin bound; columns 2 to N: histogram of each arc");
    readConfig(config, "inputfileInstrument",  fileNameIn,  Config::MUSTSET,  "",  "");
    readConfig(config, "selectDataField",      selectData,  Config::DEFAULT,  "0", "select channel for histogram computation");
    readConfig(config, "binCount",             binCount,    Config::OPTIONAL, "",  "(default: Freedman-Diaconis' choice, maximum of all channels)");
    readConfig(config, "lowerBound",           exprMin,     Config::DEFAULT,  "dataMin", "lower bound for bins (default: global minimum, data values outside are ignored)");
    readConfig(config, "upperBound",           exprMax,     Config::DEFAULT,  "dataMax", "upper bound for bins (default: global maximum, data values outside are ignored)");
    readConfig(config, "relative",             relative,    Config::DEFAULT,  "1", "output relative frequencies");
    readConfig(config, "cumulative",           cumulative,  Config::DEFAULT,  "0", "accumulate frequencies");
    if(isCreateSchema(config)) return;

    logStatus<<"reading instrument file <"<<fileNameIn<<">"<<Log::endl;
    InstrumentFile instrumentFile(fileNameIn);
    const UInt arcCount = instrumentFile.arcCount();

    // collect all data
    // ----------------
    std::vector<std::vector<Double>> arcWiseData(arcCount);
    Parallel::forEach(arcWiseData, [&](UInt arcNo)
    {
      Matrix data = instrumentFile.readArc(arcNo).matrix();
      std::vector<Double> values(data.rows());
      for(UInt k=0; k<data.rows(); k++)
        values.at(k) = data(k, selectData+1);
      return values;
    }, comm);
    Parallel::broadCast(arcWiseData, 0, comm);

    // determine bins
    // --------------
    std::vector<Double> bins;
    if(Parallel::isMaster(comm))
    {
      std::vector<Double> globalData;
      for(UInt arcNo=0; arcNo<arcWiseData.size(); arcNo++)
        globalData.insert(globalData.end(), arcWiseData[arcNo].begin(), arcWiseData[arcNo].end());
      std::sort(globalData.begin(), globalData.end());

      VariableList varList;
      varList.setVariable("dataMin", globalData.front());
      varList.setVariable("dataMax", globalData.back());
      const Double lowerBound = exprMin->evaluate(varList);
      const Double upperBound = exprMax->evaluate(varList);

      const UInt originalSize = globalData.size();
      globalData.erase(std::remove_if(globalData.begin(), globalData.end(), [lowerBound, upperBound](Double x) {return (x<lowerBound) || (x>upperBound);}), globalData.end());
      logInfo<<"Discarded "<<originalSize-globalData.size()<<" elements."<<Log::endl;

      // compute number of bins based on Freedman-Diaconis' choice
      if(binCount == 0)
      {
        const UInt   count       = (globalData.size()+1)/2;
        const Double q1          = globalData[count / 2];
        const Double q3          = globalData[3 * count / 2];
        const Double binSize     = 2.0 * (q3 - q1) / std::pow(globalData.size(), 1./3.);
        const UInt   binCountMax = 100;
        binCount = std::min(binCountMax, static_cast<UInt>(std::ceil((upperBound-lowerBound)/binSize)));
        if(binCount == binCountMax)
          logWarning << "Bin count set to a maximum of <" << binCountMax << ">" << Log::endl;
      }
      logInfo<<"Sort data into "<<binCount<<" bins in the range of ["<<lowerBound<<", "<<upperBound<<"]"<<Log::endl;

      bins.resize(1, lowerBound); // intervals
      for(UInt k=0; k<binCount; k++)
        bins.push_back(bins.back() + (upperBound-lowerBound)/binCount);
      bins.back() = upperBound; // make sure upper bound is correct
    }
    Parallel::broadCast(bins, 0, comm);

    // compute histogram
    // -----------------
    logStatus<<"compute histogram"<<Log::endl;
    Matrix histogram(bins.size()-1, arcCount+1); // first column: lower bin bound

    Parallel::forEach(arcCount, [&](UInt arcNo)
    {
      std::vector<Double> data = arcWiseData.at(arcNo);
      for(UInt k = 0; k<bins.size()-2; k++)
        histogram(k, arcNo+1) = std::count_if(data.begin(), data.end(), [&, bins, k](Double v){ return (v>=bins[k] && v<bins[k+1]); });

      // last bin includes upper bound
      histogram(bins.size()-2, arcNo+1) = std::count_if(data.begin(), data.end(), [&, bins](Double v){ return (v>=bins[bins.size()-2] && v<=bins.back()); });

      if(relative)
      {
        UInt count = std::count_if(data.begin(), data.end(), [&, bins](Double v){ return (v>=bins.front() && v<=bins.back()); });
        histogram.column(arcNo+1) *= 1./count;
      }
      if(cumulative)
      {
        for(UInt k = 1; k<bins.size()-1; k++)
          histogram(k, arcNo+1) += histogram(k-1, arcNo+1);
      }
    }, comm);
    Parallel::reduceSum(histogram, 0, comm);

    logStatus<<"write histogram to <"<<fileNameOut<<">"<<Log::endl;
    if(Parallel::isMaster(comm))
    {
      for(UInt k = 0; k<bins.size()-1; k++)
        histogram(k, 0) = bins.at(k);
      writeFileMatrix(fileNameOut, histogram);
    }
  }
  catch(std::exception &e)
  {
    GROOPS_RETHROW(e)
  }
}

/***********************************************/