File: PCA.hpp

package info (click to toggle)
salmon 0.7.2%2Bds1-2
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 4,352 kB
  • ctags: 5,243
  • sloc: cpp: 42,341; ansic: 6,252; python: 228; makefile: 207; sh: 190
file content (110 lines) | stat: -rw-r--r-- 3,673 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#ifndef __PCA__HPP__
#define __PCA__HPP__

#include <iostream>
#include <unordered_set>
#include "Eigen/Dense"

class PCA {
    public:
    PCA(Eigen::MatrixXd& m) { //Eigen::Map<Eigen::MatrixXd>& m) {
        dat_ = m;
        /*
        dat_ = Eigen::MatrixXd(m.rows(), m.cols());
        for(size_t i = 0; i < m.rows(); ++i) {
            for (size_t j = 0; j < m.cols(); ++j) {
                dat_(i,j) = m(i,j);
            }
        }
        */
    }

    void performDecomposition(bool standardize=true) {
        if (standardize) {
            std::unordered_set<uint32_t> droppedCols;

            size_t nrows = dat_.rows();
            Eigen::VectorXd means(dat_.cols());
            means = dat_.colwise().mean();
            Eigen::VectorXd stdDev(dat_.cols());
            decltype(dat_.cols()) i, j;
            for (i = 0; i < dat_.cols(); ++i) {
                for (j = 0; j < dat_.rows(); ++j) {
                    dat_(j,i) -= means(i);
                }
                auto x = dat_.col(i).dot(dat_.col(i));
                auto sd = std::sqrt(x / nrows);
                if (sd < 1e-20) {
                    droppedCols.insert(i);
                } else {
                    dat_.col(i) /= sd;
                }
            }

            // If we dropped any columns for having a "0" standard deviation,
            // then create a new normalized data matrix without them.
            if (droppedCols.size() > 0) {
                Eigen::MatrixXd tmpDat(dat_.rows(), dat_.cols() - droppedCols.size());
                size_t curCol{0};
                decltype(dat_.cols()) i;
                for (i = 0; i < dat_.cols(); ++i) {
                    if (droppedCols.find(i) == droppedCols.end()) {
                        tmpDat.col(curCol) = dat_.col(i);
                        ++curCol;
                    }
                }
                std::swap(dat_, tmpDat);
            }
        }

        //std::cerr << "DATA = " << dat_ << "\n";
        Eigen::JacobiSVD<Eigen::MatrixXd> svd(dat_,
                                              Eigen::ComputeThinU |
                                              Eigen::ComputeThinV);
        proj_ = svd.matrixV();
        variances_ = svd.singularValues();
        //std::cerr << "V = " << proj_ << "\n";
        //std::cerr << "S = " << variances_;
    }

    uint32_t varFrac(double frac, bool verbose=false) {
        uint32_t cutoff{0};
        bool haveCutoff{false};

        if (verbose) {
            std::cerr << "variances: " << variances_ << "\n";
        }
        Eigen::VectorXd fracs(variances_.rows());
        double totalVar{0.0};
        for (uint32_t i = 0; i < variances_.rows(); ++i) {
            variances_(i) = variances_(i) * variances_(i);
            totalVar += variances_(i);
        }
        double partialSum{0.0};
        for (uint32_t i = 0; i < variances_.rows(); ++i) {
            partialSum += variances_(i);
            fracs(i) = partialSum / totalVar;
            if (!haveCutoff and fracs(i) >= frac) {
                cutoff = i;
                haveCutoff = true;
            }
        }
        if (verbose) {
            std::cerr << "fracs = " << fracs << "\n";
            std::cerr << "include the first " << cutoff + 1 << " components to reach cutoff of " << frac << "\n";
        }
        return cutoff + 1;
    }

    Eigen::MatrixXd projectedData(double frac, bool verbose=false) {
        uint32_t numVec = varFrac(frac, verbose);
        return dat_ * proj_.block(0, 0, proj_.rows(), numVec);
    }

    private:
        Eigen::MatrixXd dat_;
        Eigen::MatrixXd proj_;
        Eigen::VectorXd variances_;
};

#endif //__PCA__HPP__