File: freecontact.cpp

package info (click to toggle)
freecontact 1.0.21-16
links: PTS, VCS
area: main
in suites: forky, sid
size: 3,080 kB
sloc: sh: 11,174; xml: 1,497; cpp: 1,441; perl: 126; makefile: 113; f90: 99
file content (1134 lines) | stat: -rw-r--r-- 50,328 bytes
parent folder | download | duplicates (3)
/*  FreeContact - program to predict protein residue contacts from a sufficiently large multiple alignment
*   Copyright (C) 2012-2013  Laszlo Kajan <lkajan@rostlab.org> Rost Lab, Technical University of Munich, Germany
*   
*   This program is free software: you can redistribute it and/or modify
*   it under the terms of the GNU General Public License as published by
*   the Free Software Foundation, either version 3 of the License, or
*   (at your option) any later version.
*   
*   This program is distributed in the hope that it will be useful,
*   but WITHOUT ANY WARRANTY; without even the implied warranty of
*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
*   GNU General Public License for more details.
*   
*   You should have received a copy of the GNU General Public License
*   along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/
#include "config.h"

#ifdef HAVE_OPENMP
#include <omp.h>
#endif
#include <boost/format.hpp>
#include <cblas.h>
#include <iomanip>
#include <iostream>
//#include <memory> // auto_ptr, unique_ptr
#include <signal.h>
#include <stdio.h>
#include <sys/time.h>
#include <time.h>
#include <unistd.h>
#include "freecontact.h"

// lkajan: EVfold-mfDCA is calculate_evolutionary_constraints.m

namespace bo = boost;

typedef float g_fp_t;
//                          n      S         L         thr       maxIt  msg    warm   X         W         Wd        WXj       info   brk
extern "C" void glassofast_(int *, g_fp_t *, g_fp_t *, g_fp_t *, int *, int *, int *, g_fp_t *, g_fp_t *, g_fp_t *, g_fp_t *, int *, int *);

extern "C" {
    // LAPACK Cholesky
    void spotrf_(const char* UPLO, const int* N, float* A, const int* LDA, int* INFO);

    // LU decomposition of a general matrix
    void sgetrf_(int* M, int* N, float* A, int* LDA, int* IPIV, int* INFO);

    // generate inverse of a matrix given its LU decomposition
    void sgetri_(int* N, float* A, int* lda, int* IPIV, float* WORK, int* LWORK, int* INFO);
}

namespace bo = boost;
using namespace std;
namespace freecontact {

// When you change these, rename BLAS/LAPACK calls for correct precision.
typedef float cov_fp_t;
typedef float fp_t;

/// Square alilen x alilen matrix of _Tp, stored row-major in the std::vector base.
/// Cell (i,j) lives at linear index i*alilen + j.
template<typename _Tp>
class ct_vector : public std::vector<_Tp> {
    public:
    uint16_t alilen; ///< number of rows, which is also the number of columns

    /// Allocates alilen*alilen value-initialized cells.
    // The cell count is formed in size_t: uint16_t operands are otherwise promoted to int and the product can overflow.
    ct_vector(uint16_t __alilen = 0) : std::vector<_Tp>(static_cast<size_t>(__alilen)*__alilen), alilen(__alilen) {}

    // Does not check range.
    // Uses the public data() accessor instead of libstdc++-private members, so the class builds with any standard library.
    inline _Tp& operator()(uint16_t __i, uint16_t __j)
    {
        return this->data()[ static_cast<size_t>(__i)*alilen + __j ];
    }
    // Does not check range.
    inline const _Tp& operator()(uint16_t __i, uint16_t __j) const
    {
        return this->data()[ static_cast<size_t>(__i)*alilen + __j ];
    }
};

/// alilen x q table of _Tp (one row of q letter cells per alignment column), stored row-major in the std::vector base.
/// Cell (i,ai) lives at linear index i*q + ai.
template<typename _Tp>
class af_vector : public std::vector<_Tp> {
    public:
    uint16_t alilen; ///< number of rows
    uint8_t q; // letters

    /// Allocates alilen*q cells, each initialized to __v.
    // The cell count is formed in size_t rather than in promoted int.
    af_vector(uint16_t __alilen, uint8_t __q, _Tp __v) : std::vector<_Tp>(static_cast<size_t>(__alilen)*__q, __v), alilen(__alilen), q(__q) {}

    // Does not check range.
    // Uses the public data() accessor instead of libstdc++-private members, so the class builds with any standard library.
    inline _Tp& operator()(uint16_t __i, uint8_t __ai)
    {
        return this->data()[ static_cast<size_t>(__i)*q + __ai ];
    }
    // Does not check range.
    inline const _Tp& operator()(uint16_t __i, uint8_t __ai) const
    {
        return this->data()[ static_cast<size_t>(__i)*q + __ai ];
    }
};

/// 4-dimensional table [alilen][alilen][q][q] of predictor::pairfreq_t, stored flat in the std::vector base.
/// Cell (i,j,ai,aj) lives at linear index i*d1 + j*d2 + ai*d3 + aj.
class pf_vector : public std::vector<predictor::pairfreq_t> {
    typedef predictor::pairfreq_t   _Tp;
    public:
    uint16_t alilen;
    uint8_t q; // letters
    size_t d3, d2, d1; ///< strides: d3 = q (per ai), d2 = q*q (per j), d1 = alilen*q*q (per i)

    pf_vector() : std::vector<_Tp>(), alilen(0), q(0), d3(0), d2(0), d1(0) {}
    /// Allocates alilen*alilen*q*q cells, each initialized to __v.
    // The cell count is formed in size_t: in promoted int the product overflows (e.g. 4080*4080*21*21 > INT_MAX).
    // The strides are built from the members alilen and q, which are declared (and therefore initialized) before d3, d2, d1.
    pf_vector(uint16_t __alilen, uint8_t __q, _Tp __v) : std::vector<_Tp>(static_cast<size_t>(__alilen)*__alilen*__q*__q, __v), alilen(__alilen), q(__q), d3(q), d2(q*d3), d1(alilen*d2) {}

    // Does not check range.
    // Uses the public data() accessor instead of libstdc++-private members, so the class builds with any standard library.
    inline _Tp& operator()(uint16_t __i, uint16_t __j, uint8_t __ai, uint8_t __aj)
    {
        return this->data()[ (__i)*d1 + (__j)*d2 + (__ai)*d3 + (__aj) ];
    }
    // Does not check range.
    inline const _Tp& operator()(uint16_t __i, uint16_t __j, uint8_t __ai, uint8_t __aj) const
    {
        return this->data()[ (__i)*d1 + (__j)*d2 + (__ai)*d3 + (__aj) ];
    }
};

/// Square (alilen*q) x (alilen*q) matrix of _Tp, stored row-major in the std::vector base.
/// It can be addressed either as a plain 2D matrix (row, col) or as a 4D table (i, j, ai, aj),
/// where row = i*q + ai and col = j*q + aj.
template<typename _Tp>
class cov_vector : public std::vector<_Tp> {
    /// Number of cells, formed in size_t: in promoted int the product overflows (e.g. 4080*21*4080*21 > INT_MAX).
    static size_t _ncells(uint16_t __alilen, uint8_t __q)
    {
        const size_t side = static_cast<size_t>(__alilen)*__q;
        return side*side;
    }
    public:
    uint16_t alilen;
    uint8_t q; // letters
    size_t d1; ///< matrix side length (alilen*q), i.e. the row stride

    cov_vector() : std::vector<_Tp>(), alilen(0), q(0), d1(0) {}
    /// Allocates value-initialized cells.
    cov_vector(uint16_t __alilen, uint8_t __q) : std::vector<_Tp>(_ncells(__alilen, __q)),
        alilen(__alilen), q(__q), d1(static_cast<size_t>(alilen)*q) {}
    /// Allocates cells that are each initialized to __v.
    cov_vector(uint16_t __alilen, uint8_t __q, _Tp __v) : std::vector<_Tp>(_ncells(__alilen, __q), __v),
        alilen(__alilen), q(__q), d1(static_cast<size_t>(alilen)*q) {}

    // Accessors do not check range. They use the public data() accessor instead of libstdc++-private members.
    // 2D
    inline _Tp& operator()(size_t __row, size_t __col)
    {
        return this->data()[ __row*d1 + __col ];
    }
    inline const _Tp& operator()(size_t __row, size_t __col) const
    {
        return this->data()[ __row*d1 + __col ];
    }
    // 4D
    inline _Tp& operator()(uint16_t __i, uint16_t __j, uint8_t __ai, uint8_t __aj)
    {
        return this->data()[ (static_cast<size_t>(__i)*q+__ai)*d1 + static_cast<size_t>(__j)*q+__aj ];
    }
    inline const _Tp& operator()(uint16_t __i, uint16_t __j, uint8_t __ai, uint8_t __aj) const
    {
        return this->data()[ (static_cast<size_t>(__i)*q+__ai)*d1 + static_cast<size_t>(__j)*q+__aj ];
    }
};

/// Calculates raw score for an i,j contact
/// Abstract functor interface: implementations return the raw score of the column pair (i, j).
class _rawscore_calc_t {
    public:
    // Polymorphic base: a virtual destructor keeps destruction through a base pointer/reference well defined.
    virtual ~_rawscore_calc_t() {}
    virtual double operator()(const uint16_t i, const uint16_t j) const = 0;
};

/// 2-dimensional matrix.
/// rows x cols cells of _Tp, stored row-major in the std::vector base; cell (r,c) lives at linear index r*cols + c.
/// _Tri and _Tci are the index types used for rows and columns.
template<typename _Tp, typename _Tri = size_t, typename _Tci = size_t>
class d2matrix : public std::vector<_Tp> {
    public:
    _Tri rows;
    _Tci cols;

    /// Allocates rows*cols cells.
    // std::vector value-initializes its elements (zero for arithmetic types), so no extra memset is needed;
    // memset would also be invalid for a non-trivial _Tp and would receive a null pointer for an empty matrix.
    // The cell count is formed in size_t so that narrow index types cannot overflow.
                    d2matrix(_Tri __rows, _Tci __cols) : std::vector<_Tp>(static_cast<size_t>(__rows)*static_cast<size_t>(__cols)), rows(__rows), cols(__cols)
    {
    }
    // Does not check range.
    inline _Tp& operator()(_Tri __r, _Tci __c)
    {
        return this->data()[ static_cast<size_t>(__r)*static_cast<size_t>(cols) + static_cast<size_t>(__c) ];
    }
    // Does not check range.
    inline const _Tp& operator()(_Tri __r, _Tci __c) const
    {
        return this->data()[ static_cast<size_t>(__r)*static_cast<size_t>(cols) + static_cast<size_t>(__c) ];
    }
};
/// Counter matrix used by predictor::get_seq_weights(): one row per sequence, one column per thread.
typedef d2matrix<uint32_t, uint32_t, int> simcnt_t;

// Forward declarations of file-local helpers; their definitions are not part of this section of the file.

// Called by predictor::run() with the alignment length, a score key (e.g. "MI") and a _rawscore_calc_t functor;
// run() reads __apc_mean[key] right after the call.
// NOTE(review): presumably stores the raw i,j score matrix plus the APC background/mean under __key in the three maps - confirm against the definition.
static void         _raw_score_matrix(map<string, ct_vector<fp_t> > &__raw_ctscore, map<string, vector<double> > &__apc_bg, map<string, double> &__apc_mean,
    const uint16_t __alilen, const string &__key, const _rawscore_calc_t &__fo );

// NOTE(review): judging by name and arguments, turns a raw score matrix into a contact list using the background vector and mean
// (average product correction?), honouring __mincontsep - confirm against the definition.
static vector<contact_t>
                    _apc( const ct_vector<fp_t>& __raw_ctscore, const vector<double>& __apc_bg, const double __apc_mean, const uint16_t __mincontsep, bool __filt );

// NOTE(review): judging by name, returns the raw scores as contacts without correction - confirm against the definition.
static inline vector<contact_t>
                    _raw_as_is( const ct_vector<fp_t>& __raw_ctscore, const uint16_t __mincontsep );

// predictor::get_seq_weights() passes the padded alignment length and uses half of the result as its SSE2 chunk size.
// NOTE(review): presumably the number of sequences of __seqsize bytes that fit into the CPU cache - confirm against the definition.
static inline uint32_t
                    _cache_holds_nseq(uint16_t __seqsize);

#ifndef __SSE2__
/// Stand-in for the SSE2 intrinsic of the same name on builds without SSE2:
/// returns an __m128i whose two long long halves are both zero.
static inline __m128i _mm_setzero_si128()
{
    __m128i zeroed;
    long long *halves = (long long *)&zeroed;
    halves[0] = 0;
    halves[1] = 0;
    return zeroed;
}
#endif


/// One-shot POSIX timer that owns a timer_t for the lifetime of the object.
/// When the timer expires, a notification thread (SIGEV_THREAD) stores 1 into the int whose address was passed to the constructor.
/// The pointed-to int must stay valid for as long as this object exists.
class _glasso_timer {
    timer_t         _timerid;

    /// Expiry callback: raises the flag whose address was registered in the constructor.
    static void     _brk(sigval_t __sigval)
    {
        *static_cast<int*>(__sigval.sival_ptr) = 1;
    }

    // this is a resource - disable copy constructor and copy assignment (declared private, intentionally not defined)
                    _glasso_timer(const _glasso_timer&);
    _glasso_timer&  operator=(const _glasso_timer&);
    public:
    bool            dbg;

    /// Creates the timer and arms it to fire once after __tv_sec seconds.
    /// The timer runs on CLOCK_PROCESS_CPUTIME_ID, falling back to CLOCK_MONOTONIC when the former cannot be used.
    /// @param __sival_ptr  address of the int that is set to 1 on expiry
    /// @param __tv_sec     timeout in seconds
    /// @param __dbg        print diagnostics to cerr
    /// @throws std::runtime_error if the timer cannot be created or armed
                    _glasso_timer(int *__sival_ptr, time_t __tv_sec, bool __dbg = false) : dbg(__dbg)
    {
        struct sigevent sev;
        // Zero the whole structure so that fields not set below do not carry indeterminate values into timer_create().
        memset(&sev, 0, sizeof(sev));
        sev.sigev_notify = SIGEV_THREAD;
        sev.sigev_notify_function = _brk;
        sev.sigev_value.sival_ptr = __sival_ptr;
        sev.sigev_notify_attributes = NULL;
        if (timer_create(CLOCK_PROCESS_CPUTIME_ID, &sev, &_timerid) == -1)
        {
            if (dbg) cerr << "timer_create(CLOCK_PROCESS_CPUTIME_ID, ...): " << strerror(errno) << " at " << __FILE__ << ":" << __LINE__ << "\n";
            // In case timer_create fails with CLOCK_PROCESS_CPUTIME_ID because clock_getcpuclockid() is not supported (e.g. on kfreebsd):
            if (timer_create(CLOCK_MONOTONIC, &sev, &_timerid) == -1) throw std::runtime_error(bo::str(bo::format("%s at %s:%d") % strerror(errno) % __FILE__ % __LINE__ ));
        }
        struct itimerspec its;
        its.it_value.tv_sec = __tv_sec;
        its.it_value.tv_nsec = 0;
        its.it_interval.tv_sec = 0; // zero interval: the timer fires only once
        its.it_interval.tv_nsec = 0;
        if (timer_settime(_timerid, 0, &its, NULL) == -1)
        {
            // The destructor does not run when the constructor throws, so release the timer here.
            const int err = errno;
            timer_delete(_timerid);
            throw std::runtime_error(bo::str(bo::format("%s at %s:%d") % strerror(err) % __FILE__ % __LINE__ ));
        }
    }

    /// Deletes the timer. Never throws: an exception leaving a destructor would terminate the program.
    virtual         ~_glasso_timer()
    {
        if (timer_delete(_timerid) == -1 && dbg) cerr << "timer_delete(): " << strerror(errno) << " at " << __FILE__ << ":" << __LINE__ << "\n";
    }
};

/// Computes one weight per alignment sequence and the total weight of the alignment.
///
/// For every sequence the number of *other* sequences that match it in at least `matchthresh` columns is counted;
/// the weight of sequence i is then 1 / (1 + that count), and __wtot is the sum of all weights.
///
/// @param[out] __aliw        receives one weight per sequence (resized to __ali.seqcnt)
/// @param[out] __wtot        receives the sum of all weights
/// @param      __ali         the alignment; at least 2 sequences, padded length at most 4080
/// @param      __cp          clustering threshold, in [0, 1]
/// @param      __veczw       use the vectorized (SSE2) pair counting when the build supports it
/// @param      __num_threads number of OpenMP threads; 0 means omp_get_max_threads(); ignored without OpenMP
/// @throws alilen_error      if the alignment has fewer than 2 sequences or is longer than 4080
/// @throws std::range_error  if __cp or __num_threads is out of range
void                predictor::get_seq_weights(
                        freq_vec_t &__aliw, double &__wtot,
                        const ali_t& __ali, double __cp,
                        bool __veczw, int __num_threads
                    ) //throw (alilen_error, icme_timeout_error, std::range_error, std::runtime_error)
{
    if (__ali.seqcnt < 2) throw alilen_error(bo::str(bo::format("alignment size (%d) < 2") % __ali.seqcnt));
    // lkajan: the present implementation is limited to alilen 4080.
    if (__ali.alilenpad > 4080) throw alilen_error(bo::str(bo::format("alignment (%d) longer than 4080") % __ali.alilenpad));
    if (__cp < 0 || __cp > 1) throw range_error(bo::str(bo::format("clustering threshold is out of range [0-1] %g") % __cp ));
    if (__num_threads < 0) throw range_error(bo::str(bo::format("number of threads is out of range [0-) %d") % __num_threads ));

    #ifdef HAVE_OPENMP
    int num_threads = __num_threads ? __num_threads : omp_get_max_threads();
    if(dbg) cerr << "will use " << num_threads << " OMP threads\n";
    #else
    int num_threads = 1;
    if(dbg) cerr << "no OMP thread support\n";
    #endif

    // Calculate BLOSUM sequence weights at clustering threshold __cp: "sequence segments that are identical for at least that percentage of amino acids are grouped together".
    // A modification of the method in "Amino acid substitution matrices from protein blocks. Henikoff and Henikoff 1992. PNAS.", because the weight is not established for all members of
    // the cluster from the size of the __cp cluster, but instead individually, from the number of sequences found at >= __cp. This is the method of mfDCA and PSICOV.
    // Gaps and Xs also contribute to matches/mismatches, notably X matches only [JOUX].
    time_t t0 = time(NULL);

    uint16_t mismthresh = __ali.alilen - ceil( __ali.alilen * __cp );
    int matchthresh = __ali.alilenpad - mismthresh; // minimum number of matches for clustering seq

    uint32_t    cache_holds_nseq = _cache_holds_nseq(__ali.alilenpad);
    // One counter column per thread, so threads never increment the same cell; the columns are summed at the end.
    simcnt_t    simcnt(__ali.seqcnt, num_threads);

    // lkajan: allow for other vectorizations as well
    #define CAN_VECW 0
    #ifdef __SSE2__
    #undef CAN_VECW
    #define CAN_VECW 1
    #endif

    if(!CAN_VECW || !__veczw)
    {
        // lkajan: The SSE2 implementation came first. Aim was to keep changes to the minimum here. That's why this implementation looks a bit weird.
        // lkajan: Look out: the iterations take less and less time, so a division by first-half/second-half, as seems to be the default, leads to thread starvation.
        #ifdef HAVE_OPENMP
            #pragma omp parallel for num_threads(num_threads), schedule(dynamic)
        #endif
        for(uint32_t i=0; i<__ali.seqcnt-1; ++i)
        {
            // Progress report, at most once per second, from thread 0 only ("\033[1G\033[K": go to column 1 and erase the line).
            if(dbg 
                #ifdef HAVE_OPENMP
                    && omp_get_thread_num() == 0
                #endif
            )
            { time_t t1 = time(NULL); if( t1 != t0 ) { t0 = t1; fprintf(stderr, "%s%u/%u", "\033[1G\033[K", i, __ali.seqcnt); } } // xterm

            const uint8_t *a0 = &__ali(i,0);

            for(uint32_t j=i+1; j<__ali.seqcnt; ++j)
            {
                const uint8_t *a = a0, *b = &__ali(j,0);
                uint16_t matc128 = 0;

                // Count matching positions over the padded length.
                for(uint16_t k = 0, k_end = __ali.alilenpad; k < k_end; ++k)
                    matc128 += ( a[k] == b[k] );

                if( matc128 >= matchthresh )
                {
                    #ifndef HAVE_OPENMP
                    int thread_num = 0;
                    #else
                    int thread_num = omp_get_thread_num();
                    #endif
                    ++simcnt(i,thread_num);
                    ++simcnt(j,thread_num);
                }
            }
        }
    }
    #ifdef __SSE2__
    else
    {
        // lkajan: Dynamically set chunk size according to (L1) cache size (cat /sys/devices/system/cpu/cpu0/cache/index*/{coherency_line_size,level,size,type}).
        // lkajan: Would prefetching with _mm_prefetch(ptr, _MM_HINT_T0) help?
        // At least 1: a chunk size of 0 would divide by zero below.
        uint32_t wchunk = std::max<uint32_t>(cache_holds_nseq / 2, 1);

        if(dbg) cerr << "SSE2 veczweight, wchunk = "<< wchunk <<"\n";

        __m128i m0 = _mm_setzero_si128();
        __m128i m1 = _mm_cmpeq_epi8( m0, m0 ); // all bits set

        // Number of chunks needed to cover ALL sequences 0..seqcnt-1.
        // The row index i stops at seqcnt-2, but the column index j must reach seqcnt-1: with (seqcnt-2)/wchunk + 1 chunks
        // the last sequence was never compared to anything whenever (seqcnt-1) was a multiple of wchunk.
        // A trailing chunk that starts at seqcnt-1 simply has an empty i range.
        uint32_t chunk_e = (__ali.seqcnt-1)/wchunk + 1;
        #ifdef HAVE_OPENMP
            #pragma omp parallel for num_threads(num_threads), schedule(dynamic)
        #endif
        for(uint32_t chunki=0; chunki<chunk_e; ++chunki)
        {
            // chunkj == chunki - diagonal
            {
                if(dbg 
                        #ifdef HAVE_OPENMP
                        && omp_get_thread_num() == 0
                        #endif
                  )
                { time_t t1 = time(NULL); if( t1 != t0 ) { t0 = t1; fprintf(stderr, "%s%u,%u/%u", "\033[1G\033[K", chunki, chunki, chunk_e); } } // xterm

                uint32_t i_b = chunki*wchunk;
                uint32_t i_e = min( i_b+wchunk, __ali.seqcnt-1 );
                uint32_t j_e = min( i_b+wchunk, __ali.seqcnt );
                for(uint32_t i=i_b; i<i_e; ++i)
                {
                    __m128i *a0 = (__m128i*)&__ali(i,0);

                        // lkajan: This is a macro, because the inline function equivalent is 5% slower, although the timings are slightly weird.
                        // m1: matches, counted /down/ from 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
                        // Each byte lane starts at 0xFF; a matching byte yields 0xFF (-1) from _mm_cmpeq_epi8, so adding it decrements the lane.
                        // _mm_sad_epu8( m1, matc128 ) then sums (0xFF - lane) per 8-byte half, i.e. the number of matches, and the two halves are added up.
                        // lkajan: My attempts to accelerate the following by early break on (matc >= matchthresh || (k<<4) - matc > mismthresh) have failed.
                        // matc128 = _mm_add_epi8( m, matc128 ): look out, good only till seq len <= 4080
#define _count_matches_sse2_helper(__omp_get_thread_num) {\
                        __m128i *a = a0, *b = (__m128i*)&__ali(j,0);\
                        __m128i matc128 = m1;\
\
                        for(uint16_t k = 0, k_end = __ali.alilen16; k < k_end; ++k) \
                        {\
                            __m128i m = _mm_cmpeq_epi8( a[k], b[k] );\
                            matc128 = _mm_add_epi8( m, matc128 );\
                        }\
\
                        matc128 = _mm_sad_epu8( m1, matc128 );\
                        matc128 = _mm_add_epi32( matc128,\
                                _mm_srli_si128( matc128, 8 ) );\
\
                        if( _mm_cvtsi128_si32(matc128) >= matchthresh )\
                        {\
                            int thread_num = (__omp_get_thread_num);\
                            ++simcnt(i,thread_num);\
                            ++simcnt(j,thread_num);\
                        } }
#ifdef HAVE_OPENMP
#define _count_matches_sse2() _count_matches_sse2_helper(omp_get_thread_num())
#else
#define _count_matches_sse2() _count_matches_sse2_helper(0)
#endif
                    for(uint32_t j = i+1; j<j_e; ++j)
                        _count_matches_sse2();
                }
            }

            // off-diagonal chunks
            for(uint32_t chunkj=chunki+1; chunkj<chunk_e; ++chunkj)
            {
                if(dbg 
                        #ifdef HAVE_OPENMP
                        && omp_get_thread_num() == 0
                        #endif
                  )
                { time_t t1 = time(NULL); if( t1 != t0 ) { t0 = t1; fprintf(stderr, "%s%u,%u/%u", "\033[1G\033[K", chunki, chunkj, chunk_e); } } // xterm

                uint32_t i_b = chunki*wchunk;
                uint32_t i_e = min( i_b+wchunk, __ali.seqcnt-1 );
                uint32_t j_b = chunkj*wchunk;
                uint32_t j_e = min( j_b+wchunk, __ali.seqcnt );
                for(uint32_t i=i_b; i<i_e; ++i)
                {
                    __m128i *a0 = (__m128i*)&__ali(i,0);

                    for(uint32_t j=j_b; j<j_e; ++j)
                        _count_matches_sse2();
                }
            }
#undef _count_matches_sse2
#undef _count_matches_sse2_helper
        }// for chunki
    }
    #else // should never happen unless bug
    else throw std::runtime_error(bo::str(bo::format("ooops, there is a bug in %s around %s:%d") % PACKAGE % __FILE__ % __LINE__ ));
    #endif // __SSE2__

    __wtot = 0; // total weight of all alignments, EVfold-mfDCA::Meff
    __aliw = freq_vec_t(__ali.seqcnt); // alignment weight array, EVfold-mfDCA::W

    for(uint32_t i = 0; i<__ali.seqcnt; ++i)
    {
        #ifndef HAVE_OPENMP
        int thread_num = 0;
        uint32_t sci = simcnt(i,thread_num);
        #else
        // Add up the per-thread counters of sequence i.
        uint32_t sci = 0;
        for(int thread_num = 0; thread_num < num_threads; ++thread_num)
            sci += simcnt(i,thread_num);
        #endif
        // The sequence itself counts as one member of its cluster, hence 1 + sci.
        __wtot += (__aliw[i] = 1.0 / (1 + sci));
    }

    if(dbg) fprintf(stderr, "total weight (variation) of alignment = %16.15g\n", __wtot );

    return;
}

predictor::cont_res_t
                    predictor::run(const ali_t &__ali, const freq_vec_t &__aliw, const double __wtot,
                        double __dens, double __gapth, uint16_t __mincontsep,
                        double __pseudocnt, double __pscnt_weight, bool __estimate_ivcov, double __shrink_lambda,
                        bool __cov20, bool __apply_gapth, double __rho,
                        int __num_threads, time_t __icme_timeout, time_res_t *__timing
                    ) //throw (alilen_error, icme_timeout_error, std::range_error, std::runtime_error)
{
    if (__ali.seqcnt < 2) throw alilen_error(bo::str(bo::format("alignment size (%d) < 2") % __ali.seqcnt));
    if (__ali.seqcnt != __aliw.size()) throw alilen_error(bo::str(bo::format("alignment size (%d) != alignment weight vector size (%d)") % __ali.seqcnt % __aliw.size()));
    // operations on unsigned loop bounds make this limit necessary
    if (__ali.alilen < 4) throw alilen_error(bo::str(bo::format("alignment (%d) shorter than 4") % __ali.alilen));
    if (__ali.alilen < __mincontsep+1) throw alilen_error(bo::str(bo::format("alignment (%d) shorter than __mincontsep (%d) + 1") % __ali.alilen % __mincontsep));
    // lkajan: PSICOV exits with failure here if wtot < __ali.alilen.
    //if (__wtot < __ali.alilen) throw alilen_error(bo::str(bo::format("total alignment weight (%g) < alignment length (%d)") % __wtot % __ali.alilen));
    if(__dens < 0 || __dens > 1) throw range_error(bo::str(bo::format("contact density is out of range [0-1] %g") % __dens ));
    if(__gapth < 0 || __gapth > 1) throw range_error(bo::str(bo::format("gap threshold is out of range [0-1] %g") % __gapth ));
    if(__mincontsep < 1) throw range_error(bo::str(bo::format("minimum contact separation is out of range [1-) %d") % __mincontsep ));
    if(__num_threads < 0) throw range_error(bo::str(bo::format("number of threads is out of range [0-) %d") % __num_threads ));
    if(__pseudocnt < 0) throw range_error(bo::str(bo::format("pseudo count is out of range [0-) %g") % __pseudocnt ));
    if(__pscnt_weight < 0 || __pscnt_weight > 1) throw range_error(bo::str(bo::format("pseudo count weight is out of range [0-1] %g") % __pscnt_weight ));
    if(__shrink_lambda < 0 || __shrink_lambda > 1) throw range_error(bo::str(bo::format("shrinkage lambda is out of range [0-1] %g") % __shrink_lambda ));

    #ifdef HAVE_OPENMP
    int num_threads = __num_threads ? __num_threads : omp_get_max_threads();
    if(dbg) cerr << "will use " << num_threads << " OMP threads\n";
    #else
    int num_threads = 1;
    if(dbg) cerr << "no OMP thread support\n";
    #endif

    if(__timing) (*__timing)["num_threads"] = num_threads;
    //timeval t_run; gettimeofday(&t_run, NULL);

    // Calculate column-wise AA frequencies with pseudocount and weighting, 21-letter alphabet.
    const double ps_wtot = __pseudocnt * 21 + __wtot; // lkajan: sum( pa[i][*] ) and sum( pab[i][j][*][*] ) is __wtot, if no pseudocounts are used
    af_vector<freq_t> aafreq( __ali.alilen, 21, __pseudocnt / ps_wtot ); // aafreq[__ali.alilen][21], this is EVfold-mfDCA::Pi_true with __pseudocnt == 0

    freq_t ps_aliw[__ali.seqcnt]; memcpy(ps_aliw, __aliw.data(), __ali.seqcnt*sizeof(freq_t));
    cblas_dscal(__ali.seqcnt, 1/ps_wtot, ps_aliw, 1); // scale vector by a constant

    for(uint32_t k = 0; k < __ali.seqcnt; ++k)
    {
        for(uint16_t j = 0; j < __ali.alilen; ++j)
        {
            uint8_t aa = __ali(k,j);
            if(aa < 21) aafreq(j,aa) += ps_aliw[k]; // if aa is not X or alike
        }
    }

    vector<uint16_t> gap_cols;
    if(__gapth<1)
        for(uint16_t i = 0; i < __ali.alilen; ++i)
        {
            if( aafreq(i,aamap_gapidx) > __gapth ) gap_cols.push_back(i);
        }

    if(dbg) cerr << "calculated column aa frequencies, gap cols = " << gap_cols.size() << "\n";

    // Calculate AA-pair frequencies with pseudocount and weighting, 21-letter alphabet.
    pf_vector pairfreq( __ali.alilen, 21, __pseudocnt/21.0 / ps_wtot ); // pairfreq[__ali.alilen][__ali.alilen][21][21], this is EVfold-mfDCA::Pij_true with __pseudocnt == 0

    timeval t_before;
    gettimeofday(&t_before, NULL);
    {
        for(uint32_t k = 0; k<__ali.seqcnt; ++k)
        {
            freq_t ps_aliwk = ps_aliw[k];
            const uint8_t *alik = &__ali(k,0);

            #ifdef HAVE_OPENMP
                #pragma omp parallel for num_threads(num_threads), schedule(static)
            #endif
            for(uint16_t i = 0; i < (__ali.alilen-2)/2+1; ++i)
            {
#define _fill_pfi(__i) {\
                pairfreq_t *pfi = &pairfreq(__i,0,0,0);\
                for(uint16_t j = __i+1; j < __ali.alilen; ++j)\
                {\
                    uint8_t ai = alik[__i];\
                    uint8_t aj = alik[j];\
                    if(ai < 21 && aj < 21)\
                        pfi[ j*pairfreq.d2 + ai*pairfreq.d3 + aj ] += ps_aliwk;\
                } }
                //
                _fill_pfi(i);

                uint16_t ri = __ali.alilen-2-i;
                if(ri!=i)
                    _fill_pfi(ri);
#undef _fill_pfi
            }
        }

        for(uint16_t i = 0; i < __ali.alilen-1; ++i)
            for(uint16_t j = i+1; j < __ali.alilen; ++j)
            {
                pairfreq_t *pfij = &pairfreq(i,j,0,0), *pfji = &pairfreq(j,i,0,0); // lkajan: this is faster
                for(uint8_t ai = 0; ai < 21; ++ai) // -funroll-loops to unroll such loops? vectorize?
                    for(uint8_t aj = 0; aj < 21; ++aj)
                        pfji[ aj*pairfreq.d3 + ai ] = pfij[ ai*pairfreq.d3 + aj ];
            }

        for(uint16_t i = 0; i < __ali.alilen; ++i)
        {
            // zero out 21x21 matrix
            memset(pairfreq.data() + i*pairfreq.d1 + i*pairfreq.d2, 0, 21*21*sizeof(pairfreq_t));
            for(uint8_t aa = 0; aa < 21; ++aa)
                pairfreq(i,i,aa,aa) = aafreq(i,aa);
        }
    }
    { timeval t_now, t_diff; gettimeofday(&t_now, NULL); timersub(&t_now, &t_before, &t_diff);
        if(__timing) (*__timing)["pairfreq"] = t_diff.tv_sec+t_diff.tv_usec/1e6; if(dbg) fprintf(stderr, "calculated pair frequency table in %g secs\n", t_diff.tv_sec+t_diff.tv_usec/1e6 ); }

    // lkajan: Do MI_true calculation before changing aafreq and pairfreq:
    map<string, ct_vector<fp_t> > raw_ctscore;
    map<string, vector<double> > apc_bg;
    map<string, double> apc_mean;
    // EVfold-mfDCA MI_true
    class _rawscore_MI_true_t : public _rawscore_calc_t {
        const af_vector<freq_t> &_Pi_true;
        const pf_vector &_Pij_true;
      public:
        _rawscore_MI_true_t(const af_vector<freq_t> &__Pi_true, const pf_vector &__Pij_true ) : _Pi_true(__Pi_true), _Pij_true(__Pij_true) {}
        virtual double operator()(const uint16_t i, const uint16_t j) const
        {
            double raw_ij = 0;
            for(uint8_t ai = 0; ai < 21; ++ai)
                for(uint8_t aj = 0; aj < 21; ++aj)
                    if(_Pij_true(i,j,ai,aj) > 0)
                        raw_ij += _Pij_true(i,j,ai,aj) * log( _Pij_true(i,j,ai,aj) / _Pi_true(i,ai) / _Pi_true(j,aj) );
            return raw_ij;
        }
    } _rawscore_MI_true(aafreq, pairfreq);
    _raw_score_matrix(raw_ctscore, apc_bg, apc_mean, __ali.alilen, "MI", _rawscore_MI_true );
    if(dbg) fprintf(stderr, "collected apc_mean[MI] = %16.15g\n", apc_mean["MI"]);

    // EVfold-mfDCA::with_pc() pseudocount weight function implementation
    {
        pf_vector Pij_true = pairfreq;
        if( __pscnt_weight != 0.0 )
        {
            for(size_t i = 0; i < pairfreq.size(); ++i) pairfreq[i] = (1-__pscnt_weight)*pairfreq[i] + __pscnt_weight/21/21; // vectorize?
            for(size_t i = 0; i < aafreq.size(); ++i) aafreq[i] = (1-__pscnt_weight)*aafreq[i] + __pscnt_weight/21; // vectorize?
    
            for(uint16_t i = 0; i < __ali.alilen; ++i)
                for(uint8_t ai = 0; ai < 21; ++ai)
                    for(uint8_t aj = 0; aj < 21; ++aj)
                        pairfreq(i,i,ai,aj) = ai==aj ? (1-__pscnt_weight)*Pij_true(i,i,ai,aj) + __pscnt_weight/21 : 0;
        }
    }

    if(dbg)
    {
        // compute sum of pairfreq matrix
        double aafs = 0; // aafreq sum
        double pfs = 0;
        for(uint16_t i = 0; i < __ali.alilen; ++i)
            for(uint16_t j = 0; j < __ali.alilen; ++j)
                for(uint8_t ai = 0; ai < 21; ++ai)
                {
                    if(j==0) aafs += aafreq(i,ai);
                    for(uint8_t aj = 0; aj < 21; ++aj)
                        pfs += pairfreq(i,j,ai,aj);
                }
        fprintf(stderr, "aa freq sum (cell) = %.15g, pairfreq sum (cell) = %.15g\n", aafs/__ali.alilen, pfs/__ali.alilen/__ali.alilen);
    }

    #ifdef RETURN_AFTER_PAIRFREQ
    cerr << "returning because of RETURN_AFTER_PAIRFREQ " << __FILE__ << ":" << __LINE__ << "\n";
    return map<string, vector<contact_t> >();
    #endif

    // Forming the covariance matrix.
    // lkajan: EVfold-mfDCA: covq = 20, the last letter ('Y') is simply left off. In this code '-' (gap) is left off.
    // lkajan: Gaps and the actual inversion of the covariance matrix (as opposed to estimation with glasso):
    // lkajan: * Leave ( aafreq(i,aamap_gapidx) > __gapth || aafreq(j,aamap_gapidx) > __gapth ) columns and rows entirely out of the cov matrix.
    cov_vector<cov_fp_t>    covm(
        __apply_gapth ? __ali.alilen - gap_cols.size() : __ali.alilen,
        __cov20 ? 20 : 21,
        0.0
    );

    uint16_t i = 0, covi = 0;
    vector<uint16_t>::const_iterator gc_bi = ( __apply_gapth ? gap_cols.begin() : gap_cols.end() ), gc_ei = gap_cols.end();
    for(; i < __ali.alilen; ++i)
    {
        if(gc_bi!=gc_ei && *gc_bi == i){ ++gc_bi; continue; } // too many gaps in this row

        uint16_t j = 0, covj = 0;
        vector<uint16_t>::const_iterator gc_bj = ( __apply_gapth ? gap_cols.begin() : gap_cols.end() ), gc_ej = gap_cols.end();
        for(; j < __ali.alilen; ++j)
        {
            if(gc_bj!=gc_ej && *gc_bj == j){ ++gc_bj; continue; } // too many gaps in this col

            for(uint8_t ai = 0; ai < covm.q; ++ai)
                for(uint8_t aj = 0; aj < covm.q; ++aj)
                    covm(covi,covj,ai,aj) = pairfreq(i,j,ai,aj) - aafreq(i,ai) * aafreq(j,aj);
            ++covj;
        }
        ++covi;
    }
    { pf_vector _empty; pairfreq.swap(_empty); } // lkajan: this trick releases memory, unlike clear()

    // lkajan: psicov shrinks covm in infinite loop without zeroing those cells out:
    if(__estimate_ivcov) // lkajan: control it by matrix inversion/estimation method
        for(uint16_t i = 0; i < covm.alilen; ++i)
            for(uint8_t ai = 0; ai < covm.q; ++ai)
                for(uint8_t aj = 0; aj < covm.q; ++aj)
                    if(ai != aj) covm(i,i,ai,aj) = 0;
    if(dbg) cerr<<"formed covariance matrix ("<<covm.alilen<<"/"<<__ali.alilen<<","<<gap_cols.size()<<")\n";

    // Shrinking the sample covariance matrix for positive definite-ness, testing with LAPACK Cholesky factorization.
    // lkajan: This speeds up glasso/glassofast, positive definite-ness is required (invertibility is not enough).
    // lkajan: How long would a full matrix inversion take? That is pretty fast, actually, with LAPACK, see below.
    if(__shrink_lambda>0)
    {
        gettimeofday(&t_before, NULL);

        double dmean = 0;
        for(size_t i = 0; i < covm.d1; ++i) dmean += covm(i, i);
        dmean /= covm.d1;
        const double& lambda = __shrink_lambda; // lkajan: A constant lambda seems crude. See if a mathematician can think of a better solution.

        for (;;)
        {
            if(dbg) cerr << "cov matrix Cholesky...";

            int cholres;
            {
                vector<float> tempmat = covm;
                int N = covm.d1;
                spotrf_("L",&N,tempmat.data(),&N,&cholres);
                if(cholres < 0) throw std::runtime_error(bo::str(bo::format("illegal value for argument %d") % -cholres ));
            }
            if(cholres==0) break;

            if(dbg) cerr << " not positive definite";

            for(uint16_t i = 0; i < covm.alilen; ++i)
                for(uint16_t j = 0; j < covm.alilen; ++j)
                    for(uint8_t ai = 0; ai < covm.q; ++ai)
                        for(uint8_t aj = 0; aj < covm.q; ++aj)
                            if(i != j)
                                covm(i,j,ai,aj) *= (1.0 - lambda);
                            else if(ai == aj)
                                covm(i,j,ai,aj) = lambda*dmean + (1.0-lambda)*covm(i,j,ai,aj);

            if(dbg)
            {
                double dmean_dev = 0;
                for(size_t i = 0; i < covm.d1; ++i) dmean_dev += pow(dmean - covm(i,i), 2);
                dmean_dev /= covm.d1; dmean_dev = sqrt(dmean_dev);
                fprintf(stderr, ", dmean dev: %g\n", dmean_dev );
            }
        }
        { timeval t_now, t_diff; gettimeofday(&t_now, NULL); timersub(&t_now, &t_before, &t_diff);
            if(__timing) (*__timing)["shrink"] = t_diff.tv_sec+t_diff.tv_usec/1e6; if(dbg) fprintf(stderr, "\nshrank covariance matrix in %g secs\n", t_diff.tv_sec+t_diff.tv_usec/1e6 ); }
    }

        #ifdef LIBFREEC_CHKPT
        {
            FILE *f = fopen("covm.dat", "w");
            do {
                if( fwrite(&covd, sizeof(covd), 1, f) != 1 ) { cerr << "error writing 'covm.dat': " << strerror(errno) << "\n"; fclose(f); unlink("covm.dat"); break; }
                if( fwrite(&galilen, sizeof(galilen), 1, f) != 1 ) { cerr << "error writing 'covm.dat': " << strerror(errno) << "\n"; fclose(f); unlink("covm.dat"); break; }
                if( fwrite( covm.data(), sizeof(*covm.data()), covm.size(), f ) != covm.size() )
                {
                    cerr << "error writing 'covm.dat': " << strerror(errno) << "\n";
                    fclose(f); unlink("covm.dat");
                    break;
                }
                cerr << "wrote 'covm.dat'\n";
                fclose(f);
            } while(false);
        }
        #endif

    // Invert the covariance matrix, aiming for __dens precision matrix density.
    fp_t rho = __rho;
    fp_t rhofac = 0;
    cov_vector<g_fp_t>      wwi( covm.alilen, covm.q ); // inverse covariance matrix

    gettimeofday(&t_before, NULL);
    if(__estimate_ivcov)
    {
        fp_t density = -1, prev_dens = -1;
        do {
            // Set new rho - the bigger rho, the smaller density gets
            if( rho < 0 ) rho = std::max(0.001, 1.0 / __wtot);
            else if(density >= 0)
            {
                if( density == 0 ) rhofac = 0.5;
                else
                {
                    if( rhofac != 0 && prev_dens > 0 && density > 0 )
                        rhofac = pow( rhofac, log( __dens / density ) / log( density / prev_dens ) );
                    else
                        rhofac = __dens > density ? 0.9 : 1.1; // lkajan: be more aggressive here? 0.5 : 2?
                }
                rho *= rhofac;
            }

            int g_ia = 0; // exact solution
            if(dbg) fprintf(stderr, "will try rho %g, %s solution\n", rho, (g_ia ? "approximate" : "exact"));

            if(rho <= 0 || rho >= 1) throw std::runtime_error(bo::str(bo::format("regularization parameter rho is out of expected range (0-1): %g") % rho ));

            cov_vector<g_fp_t> rho_m( covm.alilen, covm.q, rho );

            for(uint16_t i = 0; i < rho_m.alilen; ++i)
                for(uint16_t j = 0; j < rho_m.alilen; ++j)
                    for(uint8_t ai = 0; ai < rho_m.q; ++ai)
                        for(uint8_t aj = 0; aj < rho_m.q; ++aj)
                            // Watch out, __ali.alilen may != galilen; __gapth filtering does not seem to make much sense with 0.5 __pscnt_weight.
                            if( (__ali.alilen==rho_m.alilen && ( aafreq(i,aamap_gapidx) > __gapth || aafreq(j,aamap_gapidx) > __gapth )) ||
                                    (i == j && ai != aj )
                              )
                                rho_m(i,j,ai,aj) = 1e9;

            #ifdef LIBFREEC_CHKPT
            {
                FILE *f = fopen("rho_m.dat", "w");
                do {
                    if( fwrite( rho_m.data(), sizeof(*rho_m.data()), rho_m.size(), f ) != rho_m.size() )
                    {
                        cerr << "error writing 'rho_m.dat': " << strerror(errno) << "\n";
                        fclose(f); unlink("rho_m.dat");
                        break;
                    }
                    cerr << "wrote 'rho_m.dat'\n";
                    fclose(f);
                } while(false);
            }
            #endif

            // lkajan: estimate inverse covariance matrix
            {
                int g_n = covm.d1, g_is = 0, g_msg = dbg, g_maxit = 1e5, g_jerr, g_brk = 0;
                int g_niter;
                g_fp_t g_thr = 1e-4;
                vector<g_fp_t>  ww( g_n*g_n ); // estimated covariance matrix

                timeval t_before; if(dbg) gettimeofday(&t_before, NULL);

                {
                    // Set up a timer to break this process if it overruns some process cputime limit
                    _glasso_timer _gt(&g_brk, __icme_timeout, dbg);

                    // No need to transpose for Fortran, matrix is symmetric.
                    {
                        vector<float>   g_Wd(g_n), g_WXj(g_n);
                        glassofast_(&g_n, covm.data(), rho_m.data(), &g_thr, &g_maxit, &g_msg, &g_is, wwi.data(), ww.data(), g_Wd.data(), g_WXj.data(), &g_jerr, &g_brk);
                        switch (g_jerr) {
                          case 0:
                            break;
                          case 256:
                            throw icme_timeout_error(bo::str(bo::format("inverse covariance matrix estimation exceeded time limit (%d sec)") % __icme_timeout )); break;
                          default:
                            throw std::runtime_error("an error occurred in glassofast_");
                        }
                        g_niter = g_maxit;
                    }
                }

                if(dbg){ timeval t_now, t_diff; gettimeofday(&t_now, NULL); timersub(&t_now, &t_before, &t_diff);
                    fprintf(stderr, "%d iterations of glassofast took %g secs\n", g_niter, t_diff.tv_sec+t_diff.tv_usec/1e6 ); }
            }

            if( __dens == 0 ) break; // Contact density target is not set, no need to calculate density.

            prev_dens = density;
            {
                size_t nz = 0;
                for(size_t i = 0; i < wwi.d1; ++i)
                    for(size_t j = i+1; j < wwi.d1; ++j)
                        if( wwi(i, j) != 0 ) ++nz;

                density = (fp_t)nz / ( wwi.d1 * (wwi.d1-1) / 2 ); // triangle w/o diagonal
            }
            if(dbg) fprintf(stderr, "density = %7g, target sp = %g, |density-target sp|/target sp = %g\n", density, __dens, fabs(density - __dens)/__dens);
        }
        while( __dens == 0 || fabs(__dens - density)/__dens > 0.01 );
        // lkajan: Is 0.01 strict enough? 1% of the 3% still means 17 contacts for a 340-long protein. But does that matter? Perhaps make 0.01 a parameter.
    }
    else
    {
        // lkajan: LAPACK matrix inversion: fast, but does not disregard gaps like glasso* do through high regularization, and no control over density.
        wwi = covm; { cov_vector<cov_fp_t> _empty; covm.swap(_empty); } // lkajan: this trick releases memory, unlike clear()

        int N = wwi.d1, INFO;
        int IPIV[N+1];

        timeval t_before; if(dbg) gettimeofday(&t_before, NULL);

        sgetrf_(&N,&N,wwi.data(),&N,IPIV,&INFO);
        if(INFO) throw std::runtime_error(bo::str(bo::format("LU factorization error %d") % INFO ));
        // lkajan: we could handle the singular matrix case if we wanted to, e.g. by increasing __pscnt_weight.

        if(dbg){ timeval t_now, t_diff; gettimeofday(&t_now, NULL); timersub(&t_now, &t_before, &t_diff);
            fprintf(stderr, "LU factorization took %g secs, ", t_diff.tv_sec+t_diff.tv_usec/1e6 ); }

        int LWORK = -1; vector<float> WORK(1);
        sgetri_(&N,wwi.data(),&N,IPIV,WORK.data(),&LWORK,&INFO);
        LWORK = N*N < WORK[0] ? N*N : WORK[0];
        WORK.resize(LWORK);

        sgetri_(&N,wwi.data(),&N,IPIV,WORK.data(),&LWORK,&INFO);
        if(INFO) throw std::runtime_error(bo::str(bo::format("matrix inversion error %d") % INFO ));

        if(dbg){ timeval t_now, t_diff; gettimeofday(&t_now, NULL); timersub(&t_now, &t_before, &t_diff);
            fprintf(stderr, "inverted matrix (incl LUf) in %g secs\n", t_diff.tv_sec+t_diff.tv_usec/1e6 ); }
    }
    { cov_vector<cov_fp_t> _empty; covm.swap(_empty); } // lkajan: this trick releases memory, unlike clear()
    if(__timing){ timeval t_now, t_diff; gettimeofday(&t_now, NULL); timersub(&t_now, &t_before, &t_diff);
        (*__timing)["inv"] = t_diff.tv_sec+t_diff.tv_usec/1e6; }

    if(dbg)
    {
        size_t nz = 0;
        double checksum = 0;
        for(size_t i = 0; i < wwi.d1; ++i)
            for(size_t j = i+1; j < wwi.d1; ++j)
                if( wwi(i, j) != 0 ){ ++nz; checksum += wwi(i, j); }

        fp_t density = (fp_t)nz / wwi.d1 / (wwi.d1-1) * 2; // upper triangle w/o diagonal
        fprintf(stderr, "density of inverse covariance matrix = %g (cksum %.7g)\n", density, checksum);
    }

    // lkajan: Go back to __ali.alilen*covq x __ali.alilen*covq wwi matrix.
    // lkajan: Well, we could go back to __ali.alilen*21 x __ali.alilen*21 even?
    cov_vector<g_fp_t> wwiwg( __ali.alilen, wwi.q, 0 ); // wwi with gaps (if any)
    if(wwi.alilen != wwiwg.alilen)
    {
        uint16_t i = 0, covi = 0;
        vector<uint16_t>::const_iterator gc_bi = ( __apply_gapth ? gap_cols.begin() : gap_cols.end() ), gc_ei = gap_cols.end();
        for(; i < __ali.alilen; ++i)
        {
            if(gc_bi!=gc_ei && *gc_bi == i){ ++gc_bi; continue; } // gapped row

            uint16_t j = 0, covj = 0;
            vector<uint16_t>::const_iterator gc_bj = ( __apply_gapth ? gap_cols.begin() : gap_cols.end() ), gc_ej = gap_cols.end();
            for(; j < __ali.alilen; ++j)
            {
                if(gc_bj!=gc_ej && *gc_bj == j){ ++gc_bj; continue; } // gapped col

                for(uint8_t ai = 0; ai < wwiwg.q; ++ai)
                    for(uint8_t aj = 0; aj < wwiwg.q; ++aj)
                        wwiwg(i,j,ai,aj) = wwi(covi,covj,ai,aj);
                ++covj;
            }
            ++covi;
        }
        if(dbg) cerr<<"went back to gapped ("<<wwiwg.alilen<<") wwi matrix\n";
    }
    else
        wwiwg.swap(wwi);
    { cov_vector<g_fp_t> _empty; wwi.swap(_empty); } // lkajan: this trick releases memory, unlike clear()

    // lkajan: Idea: how about doing APC over the scores of contacts beyond __mincontsep, instead of 1 (excluding diagonal only)?
    // PSICOV raw scores and background for average product correction (APC) [Dunn et al. 2008]
    // l1-norm (PSICOV) raw scores
    class _rawscore_l1norm_t : public _rawscore_calc_t {
        const cov_vector<g_fp_t> &_wwiwg;
      public:
        _rawscore_l1norm_t(const cov_vector<g_fp_t> &__wwiwg ) : _wwiwg(__wwiwg) {}
        virtual double operator()(const uint16_t i, const uint16_t j) const
        {
            double raw_ij = 0;
            for(uint8_t ai = 0; ai < 20; ++ai) // disregard gap (index 20) - works also if __cov20 is in effect
                for(uint8_t aj = 0; aj < 20; ++aj)
                    raw_ij += fabs(_wwiwg(i,j,ai,aj));
            return raw_ij;
        } 
    } _rawscore_l1norm(wwiwg);
    _raw_score_matrix(raw_ctscore, apc_bg, apc_mean, __ali.alilen, "l1norm", _rawscore_l1norm );
    if(dbg) fprintf(stderr, "collected apc_mean[l1norm] = %16.15g\n", apc_mean["l1norm"]);

    // Frobenius norm
    // EVfold-mfDCA::calculate_cn_scores() up to APC bit: obtain EVfold-mfDCA::fn_scores - courtesy of Thomas Hopf
    // Quoting Thomas: transform to zero-sum gauge according to Ekeberg et al.
    // Quoting Thomas: calculate Frobenius norm for each pair of columns i, j
    class _rawscore_fro_t : public _rawscore_calc_t {
        const cov_vector<g_fp_t> &_wwiwg;
      public:
        _rawscore_fro_t(const cov_vector<g_fp_t> &__wwiwg) : _wwiwg(__wwiwg) {}
        virtual double operator()(const uint16_t i, const uint16_t j) const
        {
            vector<float> W_mf( 21*21, 0.0 );
            vector<double> rmean_W_mf( 21, 0.0 ); // row mean W_mf
            vector<double> cmean_W_mf( 21, 0.0 ); // col mean W_mf
            double mean2_W_mf = 0;
            for(uint8_t ai = 0; ai < 20; ++ai) // lkajan: look out, 20, not covq!
                for(uint8_t aj = 0; aj < 20; ++aj)
                {
                    const double cell = ( W_mf[ ai*21+aj ] = -_wwiwg(i,j,ai,aj) );
                    mean2_W_mf += cell;
                    rmean_W_mf[ai] += cell/21;
                    cmean_W_mf[aj] += cell/21;
                }
            mean2_W_mf /= W_mf.size();

            // transform to zero-sum gauge according to Ekeberg et al.
            double sum_sum_W_mf_zsg2 = 0.0;
            for(uint8_t ai = 0; ai < 21; ++ai)
                for(uint8_t aj = 0; aj < 21; ++aj)
                {
                    double W_mf_zsg = (double)W_mf[ ai*21+aj ] - rmean_W_mf[ai] - cmean_W_mf[aj] + mean2_W_mf;
                    sum_sum_W_mf_zsg2 += W_mf_zsg*W_mf_zsg;
                }

            // calculate Frobenius norm for each pair of columns i, j
            double raw_ij = sqrt(sum_sum_W_mf_zsg2);

            return raw_ij;
        }
    } _rawscore_fro(wwiwg);
    _raw_score_matrix(raw_ctscore, apc_bg, apc_mean, __ali.alilen, "fro", _rawscore_fro );
    if(dbg) fprintf(stderr, "collected apc_mean[fro] = %16.15g\n", apc_mean["fro"]);

    { cov_vector<g_fp_t> _empty; wwiwg.swap(_empty); } // lkajan: this trick releases memory, unlike clear()

    // Calculate final scores with average product correction (APC, Dunn et al., 2008)
    map<string, vector<contact_t> > res;
    // l1norm
    res["l1norm"] = _apc( raw_ctscore["l1norm"], apc_bg["l1norm"], apc_mean["l1norm"], __mincontsep, true );
    // MI_true - report raw values, not apc
    res["MI"] = _raw_as_is( raw_ctscore["MI"], __mincontsep );
    // fro
    res["fro"] = _apc( raw_ctscore["fro"], apc_bg["fro"], apc_mean["fro"], __mincontsep, false );
    //
    //{ timeval t_now, t_diff; gettimeofday(&t_now, NULL); timersub(&t_now, &t_run, &t_diff);
    //    if(__timing) (*__timing)["run"] = t_diff.tv_sec+t_diff.tv_usec/1e6; if(dbg) fprintf(stderr, "run done in %g secs\n", t_diff.tv_sec+t_diff.tv_usec/1e6); }
    return res;
}


/// Predict contacts from an alignment, computing the sequence weights first.
/** Convenience overload: obtains the per-sequence weights and their total with get_seq_weights()
 *  (driven by \a __clustpc, \a __veczw and \a __num_threads) and then passes the alignment, the
 *  weights and all remaining arguments unchanged to the run() overload that takes precomputed weights.
 *
 *  When \a __timing is non-NULL, the seconds spent on weighting are stored under "seqw" and the
 *  seconds spent in this whole call under "all". With dbg set, both are also reported on stderr.
 *
 *  \return the result of the delegated run() overload
 */
predictor::cont_res_t
                    predictor::run(const ali_t& __ali, double __clustpc,
                        double __density, double __gapth, uint16_t __mincontsep,
                        double __pseudocnt, double __pscnt_weight, bool __estimate_ivcov, double __shrink_lambda,
                        bool __cov20, bool __apply_gapth, double __rho,
                        bool __veczw, int __num_threads, time_t __icme_timeout, time_res_t *__timing
                    ) //throw (alilen_error, icme_timeout_error, std::range_error, std::runtime_error)
{
    // Two reference points: one for the whole call, one for the weighting step only.
    timeval t_call_start;
    gettimeofday(&t_call_start, NULL);
    timeval t_weights_start;
    gettimeofday(&t_weights_start, NULL);

    freq_vec_t aliw;
    double wtot;
    get_seq_weights(aliw, wtot, __ali, __clustpc, __veczw, __num_threads);

    {
        timeval t_end, t_elapsed;
        gettimeofday(&t_end, NULL);
        timersub(&t_end, &t_weights_start, &t_elapsed);
        const double secs = t_elapsed.tv_sec+t_elapsed.tv_usec/1e6;

        if(__timing)
            (*__timing)["seqw"] = secs;
        if(dbg)
            fprintf(stderr, "\nseq weight loop for %d seqs took %g secs\n", __ali.seqcnt, secs );
    }

    // Everything after weighting is done by the overload that accepts the weights.
    cont_res_t ret = run(__ali, aliw, wtot,
        __density, __gapth, __mincontsep,
        __pseudocnt, __pscnt_weight, __estimate_ivcov, __shrink_lambda,
        __cov20, __apply_gapth, __rho,
        __num_threads, __icme_timeout, __timing);

    {
        timeval t_end, t_elapsed;
        gettimeofday(&t_end, NULL);
        timersub(&t_end, &t_call_start, &t_elapsed);
        const double secs = t_elapsed.tv_sec+t_elapsed.tv_usec/1e6;

        if(__timing)
            (*__timing)["all"] = secs;
        if(dbg)
            fprintf(stderr, "all done in %g secs\n", secs);
    }

    return ret;
}


/// Append one sequence to the alignment.
/** The storage grows by alilen16 zero-initialised __m128i elements and the bytes of \a __al are
 *  copied to the start of the new row (rows are addressed with a stride of alilenpad bytes).
 *  Whenever the container ends up exactly full, room for 1024 further rows is reserved up front.
 *
 *  \param [in] __al  sequence to append; must hold exactly alilen elements
 *  \return *this, so calls can be chained
 *  \throw alilen_error if \a __al does not have alilen elements
 */
ali_t&          ali_t::push(const std::vector<uint8_t>& __al) //throw (alilen_error)
{
    if( __al.size() != alilen ) throw alilen_error(bo::str(bo::format("alignment length mismatch, expected %d, got %d") % alilen % __al.size() ));

    // Grow first and count afterwards: should resize() throw, seqcnt still matches the stored rows.
    const size_t row = seqcnt;
    resize( (row + 1) * alilen16, _mm_setzero_si128() );
    ++seqcnt;

    // The byte offset is computed in size_t so it cannot wrap in a narrower member type.
    uint8_t *alptr = reinterpret_cast<uint8_t*>(data()) + row * alilenpad;
    // memcpy() must not be handed a null pointer, which data() may be for an empty sequence.
    if( !__al.empty() )
        memcpy(alptr, __al.data(), __al.size());

    if( capacity() == size() )
        reserve( size() + 1024*alilen16 );
    return *this;
}


/// Calculate raw contact scores using given function and collect row/column/overall mean for APC calculation.
/** Every column pair i < j is scored once with \a __fo; the value is stored symmetrically at (i,j) and (j,i).
 *  Entries stored under \a __key in the three maps are replaced.
 *
 *  \param [out] __raw_ctscore  receives, under \a __key, the matrix of raw pair scores
 *  \param [out] __apc_bg       receives, under \a __key, for each column the sum of its pair scores divided by (\a __alilen - 1)
 *  \param [out] __apc_mean     receives, under \a __key, the mean raw score over all pairs i < j;
 *                              left at 0 when \a __alilen < 2, because there are no pairs to average
 *  \param [in]  __alilen       number of alignment columns
 *  \param [in]  __key          name under which the results are stored
 *  \param [in]  __fo           functor returning the raw score of a column pair
 */
void _raw_score_matrix(map<string, ct_vector<fp_t> > &__raw_ctscore, map<string, vector<double> > &__apc_bg, map<string, double> &__apc_mean,
    const uint16_t __alilen, const string &__key, const _rawscore_calc_t &__fo )
{
    ct_vector<fp_t>&    raw_cts =   __raw_ctscore[__key] = ct_vector<fp_t>( __alilen );
    vector<double>&     apc_bg =    __apc_bg[__key] = vector<double>( __alilen );
    double&             mean =      __apc_mean[__key] = 0;

    // No pairs exist: returning here keeps the mean at 0 instead of computing 0/0.
    if( __alilen < 2 ) return;

    // Index and pair-count arithmetic is done in size_t/double; the int products of the
    // promoted uint16_t operands could overflow for very long alignments.
    const size_t ncols = __alilen;

    for(uint16_t i = 0; i < __alilen; ++i)
        for(uint16_t j = i+1; j < __alilen; ++j)
        {
            double raw_ij = __fo(i, j);

            raw_cts[ i*ncols + j ] = raw_cts[ j*ncols + i ] = raw_ij;
            mean += raw_ij;

            double bg_ij = raw_ij / (__alilen-1);
            apc_bg[i] += bg_ij;
            apc_bg[j] += bg_ij;
        }

    const double npairs = 0.5 * ncols * (ncols - 1); // exact: number of pairs i < j
    mean /= npairs;
}

/// Turn raw pair scores into average-product-corrected (APC) contact scores.
/** For every column pair (i,j) with j - i >= \a __mincontsep the reported score is
 *  raw(i,j) - bg[i]*bg[j]/mean.
 *
 *  \param [in] __raw_ctscore  raw pair scores, expected to be an alilen x alilen matrix
 *  \param [in] __apc_bg       per-column background values; its size defines alilen
 *  \param [in] __apc_mean     overall mean raw score used as divisor; not checked for zero here
 *  \param [in] __mincontsep   minimum separation j - i of reported pairs
 *  \param [in] __filt         if true, only pairs with a raw score > 0 are reported
 *  \return the contacts, ordered by i and then by j
 */
vector<contact_t>   _apc( const ct_vector<fp_t>& __raw_ctscore, const vector<double>& __apc_bg, const double __apc_mean, const uint16_t __mincontsep, bool __filt )
{
    const uint16_t alilen = __apc_bg.size();

    // Row i of the loop below visits (rows - i) partners, rows*(rows+1)/2 pairs in total.
    // size_t arithmetic avoids the int overflow of alilen*(alilen-1) for very long alignments.
    const size_t rows =   alilen > __mincontsep ? alilen - __mincontsep : 0;
    const size_t npairs = rows * (rows + 1) / 2;

    // we expect that __raw_ctscore is a alilen*alilen matrix
    vector<contact_t> res;
    // Unfiltered output holds every candidate pair, so reserve exactly that;
    // filtered output keeps the original heuristic of 3% contacts.
    res.reserve( __filt ? static_cast<size_t>( npairs * 0.03 ) : npairs );

    for(int i = 0, e = alilen-__mincontsep; i < e; ++i)
        for(uint16_t j = i+__mincontsep; j < alilen; ++j)
            if(!__filt || __raw_ctscore(i,j) > 0 ) // lkajan: psicov has this - it does strip off a few predictions.
                res.push_back( 
                    contact_t( i, j,
                        __raw_ctscore(i,j) - __apc_bg[i] * __apc_bg[j] / __apc_mean
                    )
                );
    return res;
}

/// Report raw pair scores unchanged, as a list of contacts.
/** \param [in] __raw_ctscore  raw pair scores; its alilen member gives the number of columns
 *  \param [in] __mincontsep   minimum separation j - i of reported pairs
 *  \return one contact per column pair (i,j) with j - i >= \a __mincontsep, ordered by i and then by j
 */
vector<contact_t>   _raw_as_is( const ct_vector<fp_t>& __raw_ctscore, const uint16_t __mincontsep )
{
    const uint16_t alilen = __raw_ctscore.alilen;

    // Row i of the loop below emits (rows - i) contacts, rows*(rows+1)/2 in total. Reserving
    // exactly that honours __mincontsep and avoids the int overflow of alilen*(alilen-1).
    const size_t rows = alilen > __mincontsep ? alilen - __mincontsep : 0;

    vector<contact_t> res;
    res.reserve( rows * (rows + 1) / 2 );

    for(int i = 0, e = alilen-__mincontsep; i < e; ++i)
        for(uint16_t j = i+__mincontsep; j < alilen; ++j)
                res.push_back( 
                    contact_t( i, j,
                        __raw_ctscore(i,j)
                    )
                );
    return res;
}

/// Estimate how many sequences of the given size fit into half of a CPU cache.
/** The cache sizes reported by sysconf() are tried in the order L1 data, L2, L3; the first one whose
 *  half holds at least 4 sequences decides the result. If none does (or none is reported), a 1MB
 *  cache is assumed, and the result is never smaller than 4.
 *
 *  \param [in] __seqsize  size of one sequence in bytes; for 0 the minimum of 4 is returned
 *  \return number of sequences per chunk, at least 4
 */
static inline uint32_t
                    _cache_holds_nseq(uint16_t __seqsize)
{
    const uint32_t min_chunk = 4;

    // Dividing by 0 below would yield infinity, and converting that to uint32_t is undefined.
    if(__seqsize == 0) return min_chunk;

    // Candidates from the smallest/fastest cache upwards; the last entry is the fallback:
    // assume there is a 1MB cache somewhere, even if we can not detect it.
    const long cache_size[] = {
        sysconf(_SC_LEVEL1_DCACHE_SIZE),
        sysconf(_SC_LEVEL2_CACHE_SIZE),
        sysconf(_SC_LEVEL3_CACHE_SIZE),
        1048576
    };
    const size_t ncaches = sizeof(cache_size) / sizeof(cache_size[0]);

    uint32_t wchunk = 0;
    for(size_t k = 0; k < ncaches && wchunk < min_chunk; ++k)
        if(cache_size[k] > 0)
            wchunk = 0.5*cache_size[k] / __seqsize;

    if(wchunk < min_chunk) wchunk = min_chunk;

    // Benchmark notes (chunk size => run time):
    // jobtest2:
    //   100000 => 86s
    //   L3 cache enough for 49152
    //   20000 => 50s
    //   L2 cache enough for 4096
    //   2000 => 42s
    //   L1 cache enough for 512
    //   200 => 42s
    //   100 => 40.5s (the fastest so far, 50 is slower)
    // emak:
    //   100000 => 67s
    //   L2 cache enough for 24576
    //   10000 => 36s
    //   L1 cache enough for 256
    //   100 => 34.6s
    //   50 => 34.4s (the fastest so far, 25 is slower)

    return wchunk;
}

} // namespace freecontact

// vim:et:ts=4:ai:sw=4: