1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184
|
/***********************************************/
/**
* @file instrument2CrossCorrelationFunction.cpp
*
* @brief Empirical computation of the correlation between two instrument files.
*
* @author Andreas Kvas
* @date 2018-01-01
*
*/
/***********************************************/
// Latex documentation
#define DOCSTRING docstring
static const char *docstring = R"(
This program computes the cross correlation between all corresponding data columns
in two \file{instrument files}{instrument}. The instrument files must be synchronized (\program{InstrumentSynchronize}).
The \configFile{outputfileCorrelation}{matrix} is a matrix with the first column containing the time lag followed by
cross-correlation function for each data column. The maximum lag is defined by the maximum arc length.
The correlation is based on the unbiased estimate of the cross-covariance between data columns $x$ and $y$,
\begin{equation}
\sigma_{xy}(h) = \frac{1}{N}\sum_{k=1} x_{k+h} y_k,
\end{equation}
which is averaged over all arcs. From this estimate, the correlation for each lag is then computed via
\begin{equation}
r_{xy}(h) = \frac{\sigma_{xy}(h)}{\sigma_x(0)\sigma_y(0)},
\end{equation}
which is the ratio between the biased estimates of the cross-covariance at lag $h$ and the auto-covariance of the individual data columns.
For instrument with data gaps, lag bins without any data are set to NAN.
)";
/***********************************************/
#include "programs/program.h"
#include "base/fourier.h"
#include "files/fileMatrix.h"
#include "files/fileInstrument.h"
/***** CLASS ***********************************/
/** @brief Empirical computation of the correlation between two instrument files.
* @ingroup programsGroup */
class Instrument2CrossCorrelationFunction
{
public:
void run(Config &config, Parallel::CommunicatorPtr comm);
};
GROOPS_REGISTER_PROGRAM(Instrument2CrossCorrelationFunction, PARALLEL, "Empirical computation of the correlation between two instrument files.", Instrument, Statistics)
GROOPS_RENAMED_PROGRAM(InstrumentComputeCorrelation, Instrument2CrossCorrelationFunction, date2time(2020, 7, 7))
/***********************************************/
void Instrument2CrossCorrelationFunction::run(Config &config, Parallel::CommunicatorPtr comm)
{
try
{
FileName outputName;
FileName inputName, inputNameReference;
readConfig(config, "outputfileCorrelation", outputName, Config::MUSTSET, "", "column 1: time lag, column 2..n cross-correlation");
readConfig(config, "inputfileInstrument", inputName, Config::MUSTSET, "", "");
readConfig(config, "inputfileInstrumentReference", inputNameReference, Config::MUSTSET, "", "");
if(isCreateSchema(config)) return;
// check input
// -----------
InstrumentFile instrumentFile(inputName);
InstrumentFile instrumentFileReference(inputNameReference);
InstrumentFile::checkArcCount({instrumentFile, instrumentFileReference});
const UInt arcCount = instrumentFile.arcCount();
const UInt dataCount = std::min(instrumentFile.dataCount(TRUE/*mustDefined*/), instrumentFileReference.dataCount(TRUE/*mustDefined*/));
// determine arc length and data fields
// ------------------------------------
UInt maxLag;
Double sampling = 1.0;
if(Parallel::isMaster(comm))
{
std::vector<Time> times;
Time maxArcLen;
for(UInt arcNo = 0; arcNo<arcCount; arcNo++)
{
Arc arc = instrumentFile.readArc(arcNo);
if(arc.size() == 0)
continue;
std::vector<Time> arcTimes = arc.times();
maxArcLen = std::max(arcTimes.back() - arcTimes.front(), maxArcLen);
times.insert(times.end(), arcTimes.begin(), arcTimes.end());
}
sampling = medianSampling(times).seconds();
maxLag = static_cast<UInt>(std::round(maxArcLen.seconds()/sampling)+1);
logInfo<<" maximum arc length: "<<maxLag<<" epochs"<<Log::endl;
logInfo<<" median sampling: "<<sampling<<" seconds"<<Log::endl;
}
Parallel::broadCast(maxLag, 0, comm);
Parallel::broadCast(sampling, 0, comm);
// estimate the covariance for each arc, then reduce
// -------------------------------------------------
logStatus<<"Estimate cross-correlation for each arc"<<Log::endl;
Vector count(maxLag*2-1);
Vector autoCovarianceX(dataCount);
Vector autoCovarianceY(dataCount);
Matrix crossCovariance(maxLag*2-1, dataCount+1);
Parallel::forEach(arcCount, [&](UInt arcNo)
{
const Arc arc = instrumentFile.readArc(arcNo);
const Arc arcRef = instrumentFileReference.readArc(arcNo);
if(arc.size() == 0)
return;
const Matrix X = arc.matrix();
const Matrix Y = arcRef.matrix();
for(UInt i=0; i<dataCount; i++)
autoCovarianceX(i) += quadsum(X.column(i+1));
for(UInt i=0; i<dataCount; i++)
autoCovarianceY(i) += quadsum(Y.column(i+1));
std::vector<Time> times = arc.times();
if(isRegular(times)) // fast version possible?
{
count(maxLag-1) += X.rows();
for(UInt col=1; col<crossCovariance.columns(); col++)
crossCovariance(maxLag-1, col) += inner(X.column(col), Y.column(col));
for(UInt h=1; h<X.rows(); h++)
{
count(maxLag-1-h) += X.rows()-h;
count(maxLag-1+h) += X.rows()-h;
for(UInt col=1; col<crossCovariance.columns(); col++)
{
crossCovariance(maxLag-1-h, col) += inner(X.slice(0, col, X.rows()-h, 1), Y.slice(h, col, X.rows()-h, 1));
crossCovariance(maxLag-1+h, col) += inner(X.slice(h, col, X.rows()-h, 1), Y.slice(0, col, X.rows()-h, 1));
}
}
}
else // general case
{
for(UInt i=0; i<X.rows(); i++)
for(UInt k=0; k<Y.rows(); k++)
{
const UInt idx = static_cast<UInt>(std::round((times.at(i)-times.at(k)).seconds()/sampling)+maxLag-1);
count(idx)++;
for(UInt col=1; col<crossCovariance.columns(); col++)
crossCovariance(idx, col) += X(i, col) * Y(k, col);
}
}
}, comm);
Parallel::reduceSum(count, 0, comm);
Parallel::reduceSum(autoCovarianceX, 0, comm);
Parallel::reduceSum(autoCovarianceY, 0, comm);
Parallel::reduceSum(crossCovariance, 0, comm);
if(Parallel::isMaster(comm))
{
autoCovarianceX *= 1./count(maxLag-1);
autoCovarianceY *= 1./count(maxLag-1);
for(UInt i=0; i<count.rows(); i++)
crossCovariance.row(i) *= 1./count(i);
for(UInt i=0; i<dataCount; i++)
crossCovariance.column(i+1) *= 1.0/std::sqrt(autoCovarianceX(i)*autoCovarianceY(i));
for(UInt h=0; h<maxLag; h++)
{
crossCovariance(maxLag-h-1, 0) = -sampling*h;
crossCovariance(h+maxLag-1, 0) = sampling*h;
}
logStatus<<"write cross correlation to <"<<outputName<<">"<<Log::endl;
writeFileMatrix(outputName, crossCovariance);
}
}
catch(std::exception &e)
{
GROOPS_RETHROW(e)
}
}
/***********************************************/
|