File: Stats.cpp

package info (click to toggle)
clustalx 2.1%2Blgpl-9
links: PTS, VCS
area: main
in suites: bookworm, bullseye, forky, sid, trixie
size: 3,324 kB
sloc: cpp: 40,050; sh: 163; xml: 102; makefile: 16
file content (375 lines) | stat: -rw-r--r-- 9,744 bytes
parent folder | download | duplicates (11)
/**
 *
 * Copyright (c) 2007 Des Higgins, Julie Thompson and Toby Gibson.
 */
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <stdio.h>
#include <cstdlib>
#include <iostream>
#include <algorithm>
#include <string>
#include <stdio.h>
#include "../alignment/Alignment.h"
#ifdef HAVE_MHASH_H
#include "mhash.h"
#endif

#include "Stats.h"


using namespace std;

namespace clustalw
{


Stats::Stats()
{
    enabled = false;
}


Stats::~Stats()
{
}




/* adopted from Sean Eddy'ssquid:aligneval.c */
/* Function: PairwiseIdentity()
 * 
 * Purpose:  Calculate the pairwise fractional identity between
 *           two aligned sequences s1 and s2. This is simply
 *           (idents / MIN(len1, len2)).
 *
 *           Note how many ways there are to calculate pairwise identity,
 *           because of the variety of choices for the denominator:
 *           idents/(idents+mismat) has the disadvantage that artifactual
 *             gappy alignments would have high "identities".
 *           idents/(AVG|MAX)(len1,len2) both have the disadvantage that 
 *             alignments of fragments to longer sequences would have
 *             artifactually low "identities".
 *           
 *           Case sensitive; also, watch out in nucleic acid alignments; 
 *           U/T RNA/DNA alignments will be counted as mismatches!
 */
/* float
PairwiseIdentity(char @s1, char @s2)
{
  int     idents;		/@ total identical positions  @/
  int     len1, len2;		/@ lengths of seqs            @/
  int     x;			/@ position in aligned seqs   @/

  idents = len1 = len2 = 0;
  for (x = 0; s1[x] != '\0' && s2[x] != '\0'; x++) 
  {
      if (!isgap(s1[x])) {
          len1++;
          if (s1[x] == s2[x]) idents++; 
      }
      if (!isgap(s2[x])) len2++;
  }
  if (len2 < len1) len1 = len2;
  return (len1 == 0 ? 0.0 : (float) idents / (float) len1);
}
*/


/* s1/s2 are unit-offset as usual
 *
 */
float
Stats::pairwiseIdentity(Alignment *alnObj, int s1, int s2)
{
    int idents;     /* total identical positions */
    int len1, len2; /* real lengths of seqs      */
    int x;          /* position in aligned seqs  */
    
    const vector<int>* seq1 = alnObj->getSequence(s1);
    const vector<int>* seq2 = alnObj->getSequence(s2);
    
    idents = len1 = len2 = 0;
    // cerr << "comparing " << alnObj->getName(s1).c_str() << ":" << alnObj->getName(s2).c_str() << " " << s1 << ":" << s2  << "\n";
    
    // sequence length should be identical, but be paranoid
    for (x = 1; x<=alnObj->getSeqLength(s1) && x<=alnObj->getSeqLength(s2); x++)
    {
        if (! alnObj->isGap(s1, x)) {
            len1++;
            //cerr << " pos " << x << ": " << (*seq1)[x] << ":" << (*seq2)[x] << "\n";
            if ((*seq1)[x] == (*seq2)[x])
                idents++; 
        }
        //DEBUG
        //else {
        //cerr << " gap at pos " << x << " (" << s1 << ")\n";
        //}
        if (! alnObj->isGap(s2, x))
            len2++;
        //DEBUG
        //else
        //    cerr << " gap at pos " << x << " (" << s2 << ")\n";
    }
    if (len2 < len1)
        len1 = len2;
    return (len1 == 0 ? 0.0 : (float) idents / (float) len1);
}
  


#ifdef HAVE_MHASH_H

string
Stats::ConcatInputHash(Alignment *alnObj)
{
    vector<string> rawSeqArray;
    string ret;
    char *hash;

    // collect all sequences and sort 
    const clustalw::SeqArray* seqArray = alnObj->getSeqArray();
    for(int s = 1; s <= alnObj->getNumSeqs(); s++)
    {
        string seq;
        for(int r = 1; r <= alnObj->getSeqLength(s); r++)
        {
            int val = (*seqArray)[s][r];
            seq.append(1, clustalw::userParameters->getAminoAcidCode(val));
        }
        rawSeqArray.push_back(seq);
    }
    std::sort(rawSeqArray.begin(), rawSeqArray.end());

    // concatenate sorted seqs
    string concatSeq;
    std::vector<string>::iterator iter;
    for(iter=rawSeqArray.begin(); iter != rawSeqArray.end(); ++iter)
        concatSeq.append(*iter);

    // build hash and return
    hash = Md5Hash(concatSeq.c_str());
    if (hash==NULL)
    {
        ret="HASHING_FAILURE";
    } else {
        for (int i=0; i<strlen(hash); i++)
        ret.append(1, hash[i]);
        free(hash);
    }
    return ret;
}


string
Stats::Md5ForSeq(Alignment *alnObj, int s)
{
    string seq;
    const clustalw::SeqArray* seqArray = alnObj->getSeqArray();
    char *hash;
    string ret;
    
    for(int l = 1; l <= alnObj->getSeqLength(s); l++)
    {
        int val = (*seqArray)[s][l];
        // continue if gap
        if((val < 0) || (val > userParameters->getMaxAA()))
            continue;
        seq.append(1, clustalw::userParameters->getAminoAcidCode(val));
    }
    std::transform(seq.begin(), seq.end(), seq.begin(), ::toupper);

    hash = Md5Hash(seq.c_str());
    if (hash==NULL)
    {
        ret = "HASHING_FAILURE";
    } else {
        ret = hash;
    }
    return ret;
}

/* create md5 hash from input string
 * returns NULL on failure
 * user must free returned string
 */
char *
Stats::Md5Hash(const char *thread)
{
    MHASH td;
    char *tmpcstr;
    int i;
    unsigned char *hash;
    char *rethash;
    
    td = mhash_init(MHASH_MD5);
    if (td == MHASH_FAILED)
        return NULL;

    if (thread==NULL)
        return NULL;

    mhash(td, thread, strlen(thread));
    //mhash_deinit(td, hash);
    hash = (unsigned char*) mhash_end(td);
    
    rethash = (char*) calloc(mhash_get_block_size(MHASH_MD5)*2+1, sizeof(char));
    for (i = 0; i < mhash_get_block_size(MHASH_MD5); i++) {
        sprintf(&rethash[i*2], "%.2x", hash[i]);
    }

    return rethash;
}

#endif
// HAVE_MHASH_H




void
Stats::logCmdLine(int argc, char **argv)
{
    FILE *fp = fopen(logfilename.c_str(), "a");
    if (fp == NULL)
    {
        cerr << "couldn't open file " << logfilename << " for logging of stats\n";
        return;
    }

    if (argc > 1)
    {
        for (int i=1; i<argc; i++)
        {
            // remove non-interesting stuff
            if (strstr(argv[i], "-infile=")==NULL &&
                strstr(argv[i], "-outfile=")==NULL &&
                strstr(argv[i], "-stats=")==NULL &&
                strstr(argv[i], "-align")==NULL)
            {
                fprintf(fp, "cmdline non-default arg: %s\n", argv[i]);
            }
        }
    }
}


/* log some statistics for input sequences, i.e. alnObj should hold
 * the unaligned sequences
 *
 */
void
Stats::logInputSeqStats(Alignment *alnObj)
{
    int i;
    std::vector<double> lengths;
    time_t t = time(NULL);
    tm s = *localtime(&t);
    int shortest;
    string hash;
    
    FILE *fp = fopen(logfilename.c_str(), "a");
    if (fp == NULL)
    {
        cerr << "couldn't open file " << logfilename << " for logging of stats\n";
        return;
    }

    fprintf(fp, "logging job: %s on %s", userParameters->getSeqName().c_str(), asctime(&s));
    fprintf(fp, "clustal version: %s\n", userParameters->getRevisionLevel().c_str());

    fprintf(fp, "seq type: ");
    if (userParameters->getDNAFlag())
        fprintf(fp, "DNA");
    else
        fprintf(fp, "protein");
    fprintf(fp, "\n");
    
    fprintf(fp, "numseqs: %d\n", alnObj->getNumSeqs());

    // create a vector of seq lengths for later
    // and get shortest seq at the same time
    shortest=alnObj->getLengthLongestSequence();
    for (i = 1; i <= alnObj->getNumSeqs(); i++) {
        int l = alnObj->getSeqLength(i);
        lengths.push_back(l);
        if (l<shortest)
            shortest=l;
    }
    
    fprintf(fp, "seqlen longest: %d\n", alnObj->getLengthLongestSequence());
    fprintf(fp, "seqlen shortest: %d\n", shortest);

    fprintf(fp, "seqlen avg: %.2f\n", utilityObject->average(lengths));
    fprintf(fp, "seqlen std-dev: %.2f\n", utilityObject->stdDev(lengths));
    fprintf(fp, "seqlen median: %.2f\n", utilityObject->median(lengths));

    
#ifdef HAVE_MHASH_H
    //hash = concatInputHash(alnObj);
    //fprintf(fp, "seq hash: %s\n", hash.c_str());
    
    for (int s = 1; s <= alnObj->getNumSeqs(); s++) {
        string md5 = Md5ForSeq(alnObj, s);
        fprintf(fp, "md5 for seq %d: %s\n", s, md5.c_str());

    }
#else
    fprintf(fp, "md5: disabled\n");    
#endif

    fclose(fp);
}



/* log some statistics for aligned sequences, i.e. alnObj should hold
 * the already aligned sequences
 *
 */
void
Stats::logAlignedSeqStats(Alignment *alnObj)
{
    FILE *fp = fopen(logfilename.c_str(), "a");
    if (fp == NULL)
    {
        cerr << "couldn't open file " << logfilename << " for logging of stats\n";
        return;
    }
    
    // alignment length is the length of any sequence
    fprintf(fp, "aln len: %d\n", alnObj->getSeqLength(1));


    std::vector<double> pwIdents;
    double lowestPwId = 1.0;
    double hightestPwId = 0.0;

    // create vector of pairwise identities
    for(int s1 = 1; s1 <= alnObj->getNumSeqs(); s1++)
    {        
        for(int s2 = s1+1; s2 <= alnObj->getNumSeqs(); s2++)
        {
            double thisPwId = pairwiseIdentity(alnObj, s1, s2);
            pwIdents.push_back(thisPwId);
            if (thisPwId>hightestPwId)
                hightestPwId=thisPwId;
            if (thisPwId<lowestPwId)
                lowestPwId=thisPwId;
            //fprintf(fp, "ident %s:%s %d:%d=%f\n", alnObj->getName(s1).c_str(), alnObj->getName(s2).c_str(), s1, s2, PairwiseIdentity(alnObj, s1, s2));
        }
    }
    fprintf(fp, "aln pw-id highest: %.2f\n", hightestPwId);
    fprintf(fp, "aln pw-id lowest: %.2f\n", lowestPwId);
    fprintf(fp, "aln pw-id avg: %.2f\n",  utilityObject->average(pwIdents));
    fprintf(fp, "aln pw-id std-dev: %.2f\n",  utilityObject->stdDev(pwIdents));
    fprintf(fp, "aln pw-id median: %.2f\n",  utilityObject->median(pwIdents));

    fclose(fp);
}

}