File: sam-stats.cpp

package info (click to toggle)
ea-utils 1.1.2%2Bdfsg-4
links: PTS, VCS
area: main
in suites: stretch
size: 836 kB
ctags: 1,256
sloc: ansic: 11,459; cpp: 3,752; perl: 426; makefile: 74; sh: 42
file content (1121 lines) | stat: -rw-r--r-- 37,421 bytes
parent folder | download | duplicates (5)
/*
# Copyright (c) 2011 Erik Aronesty
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#
# ALSO, IT WOULD BE NICE IF YOU LET ME KNOW YOU USED IT.
*/

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <errno.h>
#include <assert.h>
#include <math.h>
#include <stdarg.h>
#include <sys/stat.h>

#include <string>
#include <google/sparse_hash_map> // or sparse_hash_set, dense_hash_map, ...
#include <google/dense_hash_map>  // or sparse_hash_set, dense_hash_map, ...

#include <samtools/sam.h>         // samtools api

#include "fastq-lib.h"

const char * VERSION = "1.38";

#define SVNREV atoi(strchr("$LastChangedRevision: 681 $", ':')+1)

using namespace std;

void usage(FILE *f);

#define MAX_MAPQ 300
// this factor is based on a quick empirical look at a few bam files....
#define VFACTOR 1.5

//#define max(a,b) (a>b?a:b)
//#define min(a,b) (a<b?a:b)
#define meminit(l) (memset(&l,0,sizeof(l)))
#define debugout(s,...) if (debug) fprintf(stderr,s,##__VA_ARGS__)
#undef warn
#define warn(s,...) ((++errs), fprintf(stderr,s,##__VA_ARGS__))
#define stdev(cnt, sum, ssq) sqrt((((double)cnt)*ssq-pow((double)sum,2)) / ((double)cnt*((double)cnt-1)))

template <class vtype> 
    double quantile(const vtype &vec, double p);

template <class itype> 
    double quantile(const vector<itype> &vec, double p);

std::string string_format(const std::string &fmt, ...);

int debug=0;
int errs=0;
extern int optind;
int histnum=30;
bool isbwa=false;
int rnamode = 0;
bool allow_no_reads = false;

// from http://programerror.com/2009/10/iterative-calculation-of-lies-er-stats/
class cRunningStats
{
private:
  double m_n;  // count
  double m_m1; // mean
  double m_m2; // second moment
  double m_m3; // third moment
  double m_m4; // fourth moment
public:
  cRunningStats() : m_n(0.0), m_m1(0.0), m_m2(0.0), m_m3(0.0), m_m4(0.0)
    { ; }
  void Push(double x)
  {
    m_n++;
    double d = (x - m_m1);
    double d_n = d / m_n;
    double d_n2 = d_n * d_n;
    m_m4 += d * d_n2 * d_n * ((m_n - 1) * ((m_n * m_n) - 3 * m_n + 3)) +
            6 * d_n2 * m_m2 - 4 * d_n * m_m3;
    m_m3 += d * d_n2 * ((m_n - 1) * (m_n - 2)) - 3 * d_n * m_m2;
    m_m2 += d * d_n * (m_n - 1);
    m_m1 += d_n;
  }
  double Mean() { return m_m1; }
  double StdDeviation() { return sqrt(Variance()); }
  double StdError() { return (m_n > 1.0) ? sqrt(Variance() / m_n) : 0.0; }
  double Variance() { return (m_n > 1.0) ? (m_m2 / (m_n - 1.0)) : 0.0; }
  double Skewness() { return sqrt(m_n) * m_m3 / pow(m_m2, 1.5); }
  double Kurtosis() { return m_n * m_m4 / (m_m2 * m_m2); }
};

/// if we use this a lot may want to make it variable size
class scoverage {
public:
	scoverage() {mapb=reflen=0; dist.resize(histnum+2); mapr=0;};
	long long int mapb;
	long int mapr;
    cRunningStats spos;
	int reflen;
	vector <int> dist;
};

// sorted integer bucket ... good for ram with small max size, slow to access
class ibucket {
public:
	int tot;
	vector<int> dat;
	ibucket(int max) {dat.resize(max+1);tot=0;}
	int size() const {return tot;};

	int operator[] (int n) const {
		assert(n < size());
		int i; 
		for (i=0;i<dat.size();++i) {
			if (n < dat[i]) {
				return i;
			}
			n-=dat[i];
		}
	}

	void push(int v) {
		assert(v<dat.size());
		++dat[v];
		++tot;
	}
};

class fqent {
    public:
    int bits; 
    std::string r;
    std::string q;
};

class sstats {
public:
	ibucket vmapq;			// all map qualities
	sstats() : vmapq(MAX_MAPQ) {
		memset((void*)&dat,0,sizeof(dat));
		covr.set_empty_key("-");
		petab.set_deleted_key("-");
	}
	~sstats() {
		covr.clear();
	}
	struct {
		int n, mapn, mapzero;		// # of entries, # of mapped entries, 
		int lenmin, lenmax; double lensum, lenssq;	// read length stats
		double mapsum, mapssq;	// map quality sum/ssq 
		double nmnz, nmsum;	// # of mismatched reads, sum of mismatch lengths 
		long long int nbase;
		int qualmax, qualmin;	// num bases samples, min/max qual 
		double qualsum, qualssq;	// sum quals, sum-squared qual
		int nrev, nfor;		// rev reads, for reads
		double tmapb;		// number of mapped bases
		long long int basecnt[5];
		int del, ins;		// length total dels/ins found
		bool pe;		// paired-end ? 0 or 1	
		int disc;
		int disc_pos;
		int dupmax;		// max dups found
	} dat;
	vector<int> visize;		// all insert sizes
	google::dense_hash_map<std::string, scoverage> covr;	// # mapped per ref seq
	google::sparse_hash_map<std::string, int> dups;		// alignments by read-id (not necessary for some pipes)
	google::sparse_hash_map<std::string, fqent> petab;		// peread table

	// file-format neutral ... called per read... warning seq/qual are not necessarily null-terminated
	void dostats(string name, int rlen, int bits, const string &ref, int pos, int mapq, const string &materef, int nmate, const string &seq, const char *qual, int nm, int del, int ins);

	// read a bam/sam file and call dostats over and over
	bool parse_bam(const char *in);
	bool parse_sam(FILE *f);
};

#define T_A 0
#define T_C 1
#define T_G 2
#define T_T 3
#define T_N 4

void build_basemap();

int dupreads = 1000000;
int max_chr = 1000;
bool trackdup=0;
FILE *sefq = NULL;
FILE *pefq1 = NULL;
FILE *pefq2 = NULL;
int basemap[256];
int main(int argc, char **argv) {
	const char *ext = NULL;
	bool multi=0, newonly=0, inbam=0;
    int fq_out=0;
    const char *rnafile = NULL;
	char c;
	optind = 0;
    struct option long_options[] = {
               {"fastq", no_argument, NULL, 'o'},
               {0,0,0,0},
    };
    int long_index=0;
    const char *prefix;

    while ( (c = getopt_long(argc, argv, "?BzArR:Ddx:MhS:", long_options, &long_index)) != -1) {
                switch (c) {
                case 'd': ++debug; break;                                       // increment debug level
                case 'D': ++trackdup; break;
                case 'B': inbam=1; break;
                case 'A': max_chr=1000000; break;                               // max chrom
                case 'R': rnafile=optarg;                                       // pass through
                case 'r': max_chr=1000000; rnamode=1; if (histnum < 60) histnum=60; break;
                case 'O': prefix=optarg; break;
                case 'S': histnum=atoi(optarg); break;
                case 'x': ext=optarg; break;
                case 'M': newonly=1; break;
                case 'z': allow_no_reads = true; break;
                case 'o': fq_out=1; trackdup=1; break;                     // output suff
                case 'h': usage(stdout); return 0;
                case '?':
                     if (!optopt) {
                        usage(stdout); return 0;
                     } else if (optopt && strchr("ox", optopt))
                       fprintf (stderr, "Option -%c requires an argument.\n", optopt);
                     else if (isprint(optopt))
                       fprintf (stderr, "Unknown option `-%c'.\n", optopt);
                     else
                       fprintf (stderr, "Unknown option character `\\x%x'.\n", optopt);
                     usage(stderr);
                     return 1;
                }
    }

	// recompute argc owing to getopt (is this necessary? i don't think so)
	const char *stdv[3] = {argv[0],"-",NULL}; 
	if (!argv[optind]) {
		argc=2;
		argv = (char **) stdv;
		optind=1;
	}

	multi = (argc-optind-1) > 0;                // more than 1 input? 
	if (multi && !ext) 
		ext = "stats";                          // force serial processed extension-mode
	
	build_basemap();                            // precompute matrices for rabit base->integer (A->0, C->1,. ...etc) lookups

	debugout("argc:%d, argv[1]:%s, multi:%d, ext:%s\n", argc,argv[optind],multi,ext);

    FILE *rnao = NULL;


	const char *p;
	// for each input file
	for (;optind < argc;++optind) {
		sstats s;
		const char *in = argv[optind];
		FILE *f;
		FILE *o=NULL;
		FILE *rnao=NULL;
		bool needpclose = 0;

		// decide input format
		string out;

		if (!strcmp(in,"-")) {
			// read sam/bam from stdin
			if (ext||fq_out) {
				warn("Can't use file extension with stdin\n");
				continue;
			}
			f = stdin;
			o = stdout;
		} else {
			if ((p = strrchr(in,'.')) && !strcmp(p, ".gz")) {
				// maybe this is a gzipped sam file...
				string cmd = string_format("gunzip -c '%s'", in);
				f = popen(cmd.c_str(), "r");
				needpclose=1;
				if (f) {
					char c;
					if (!inbam) {
						// guess file format with 1 char
						c=getc(f); ungetc(c,f);
						if (c==-1) {
							warn("Can't unzip %s\n", in);
							pclose(f);
							continue;
						}
						if (c==31) {
							// bam file... reopen to reset stream... can't pass directly
							string cmd = string_format("gunzip -c '%s'", in);
							f = popen(cmd.c_str(), "r");
							inbam=1;
						}
					} else 
						c = 31;	// user forced bam, no need to check/reopen

					if (inbam) {
						// why did you gzip a bam... weird? 
						if (dup2(fileno(f),0) == -1) {
						      warn("Can't dup2 STDIN\n");
						      continue;
						}
						in = "-";
					}
				} else {
					warn("Can't unzip %s: %s\n", in, strerror(errno));
					continue;
				}
				// extension mode... output to file minus .gz
				if (ext||fq_out) 
					out=string(in, p-in);
			} else {
	 			f = fopen(in, "r");
				if (!f) {
					warn("Can't open %s: %s\n", in, strerror(errno));
					continue;
				}
				// extension mode... output to file
				if (ext||fq_out) 
					out=in;
			}
            if (fq_out) {
                sefq=fopen((out+".fq").c_str(),"w");
                pefq1=fopen((out+".fq1").c_str(),"w");
                pefq2=fopen((out+".fq2").c_str(),"w");
            }
			if (ext) {
				( out += '.') += ext;
				o=fopen(out.c_str(), "w");
				if (!o) {
					warn("Can't write %s: %s\n", out.c_str(), strerror(errno));
					continue;
				}
			} else
				o=stdout;
		}

		// more guessing
		debugout("file:%s, f: %lx\n", in, (long int) f);
		char c;
		if (!inbam) {
			// guess file format
			c=getc(f); ungetc(c,f);
			if (c==31 && !strcmp(in,"-")) {
				// if bamtools api allowed me to pass a stream, this wouldn't be an issue....
				warn("Specify -B to read a bam file from standard input\n");
				continue;
			}
		} else 
			c = 31;		// 31 == bam

        if (rnafile) {
            rnao=fopen(rnafile,"w");
            if (!rnao) {
                warn("Can't write %s: %s\n", rnafile, strerror(errno));
                return 1;
            }
        } else {
            rnao=o;
        }
 
		// parse sam or bam as needed
		if (c != 31) {
			// (could be an uncompressed bam... but can't magic in 1 char)
			if (!s.parse_sam(f)) {
				if (needpclose) pclose(f); else fclose(f);
				warn("Invalid or corrupt sam file %s\n", in);
				continue;
			}
		} else {
			if (!s.parse_bam(in)) {
				if (needpclose) pclose(f); else fclose(f);
				warn("Invalid or corrupt bam file %s\n", in);
				continue;
			}
		}
        int ret;
		if (needpclose) ret=pclose(f); else ret=fclose(f);
        if (ret!=0) {
            warn("Error closing '%s': %s\n", in, strerror(errno));
            continue;
        } 

        if (fq_out) {
            if(sefq && s.dat.pe) {
                fclose(sefq);
                unlink((out+".fq").c_str());
            }
            if (pefq1 && !s.dat.pe) {
                fclose(pefq1);
                fclose(pefq2);
                unlink((out+".fq1").c_str());
                unlink((out+".fq2").c_str());
            }
        }

		// sort sstats
		sort(s.visize.begin(), s.visize.end());

		int phred = s.dat.qualmin < 64 ? 33 : 64;
		if (!s.dat.n && ! allow_no_reads) {
			warn("No reads in %s\n", in);
			continue;
		}
		fprintf(o, "reads\t%d\n", s.dat.n);
		fprintf(o, "version\t%s.%d\n", VERSION, SVNREV);

		// mapped reads is the number of reads that mapped at least once (either mated or not)
		if (s.dat.mapn > 0) {
			if (trackdup && s.dat.dupmax > (s.dat.pe+1)) {
				google::sparse_hash_map<string,int>::iterator it = s.dups.begin();
				vector<int> vtmp;
				int amb = 0;
				int sing = 0;
				while(it!=s.dups.end()) {
					// *not* making the distinction between 2 singleton mappings and 1 paired here
					if (it->second > (s.dat.pe+1)) {
						++amb;
					}
					if (it->second == 1 && s.dat.pe) {
						++sing;	
					}
					++it;
				}
                int mapped = (int) s.dups.size()*(s.dat.pe+1)-sing;

				fprintf(o,"mapped reads\t%d\n", mapped);
				if (amb > 0) {
                    int unmapped=s.dat.n-s.dat.mapn;
					fprintf(o,"pct align\t%.6f\n", 100.0*((double)mapped/(double)(mapped+unmapped)));
					fprintf(o,"ambiguous\t%d\n", amb*(s.dat.pe+1));
					fprintf(o,"pct ambiguous\t%.6f\n", 100.0*((double)amb/(double)s.dups.size()));
					fprintf(o,"max dup align\t%.d\n", s.dat.dupmax-s.dat.pe);
				} else {
                    // no ambiguous mappings... simple
				    fprintf(o, "pct align\t%.6f\n", 100.0*(double)s.dat.mapn/(double)s.dat.n);
                }
				if (sing)
					fprintf(o,"singleton mappings\t%.d\n", sing);
				// number of total mappings
				fprintf(o, "total mappings\t%d\n", s.dat.mapn);
			} else {
				// dup-id's not tracked
				fprintf(o, "mapped reads\t%d\n", s.dat.mapn);
				fprintf(o, "pct align\t%.6f\n", 100.0*(double)s.dat.mapn/(double)s.dat.n);
				// todo: add support for bwa's multiple alignment tag
				// fprintf(o, "total mappings\t%d\n", s.dat.mapn);
			}
		} else {
			fprintf(o, "mapped reads\t%d\n", s.dat.mapn);
		}

        if (s.dat.mapzero > 0) {
			fprintf(o, "skipped mappings\t%d\n", s.dat.mapzero);
        }

		fprintf(o, "mapped bases\t%.0f\n", s.dat.tmapb);
		if (s.dat.pe) {
			fprintf(o, "library\tpaired-end\n");
		}
		if (s.dat.disc > 0) {
			fprintf(o, "discordant mates\t%d\n", s.dat.disc);
		}
		if (s.dat.disc_pos > 0) {
			fprintf(o, "distant mates\t%d\n", s.dat.disc_pos);
		}

		if (s.dat.mapn > 0) {
           if (s.dat.mapn > 100) {
                // at least 100 mappings to call a meaningful "percentage" 
    			fprintf(o, "pct forward\t%.3f\n", 100*(s.dat.nfor/(double)(s.dat.nfor+s.dat.nrev)));
            }

			fprintf(o, "phred\t%d\n", phred);
			fprintf(o, "forward\t%d\n", s.dat.nfor);
			fprintf(o, "reverse\t%d\n", s.dat.nrev);
			if (s.dat.lenmax != s.dat.lenmin) {
				fprintf(o, "len max\t%d\n", s.dat.lenmax);	
				fprintf(o, "len mean\t%.4f\n", s.dat.lensum/s.dat.mapn);	
				fprintf(o, "len stdev\t%.4f\n", stdev(s.dat.mapn, s.dat.lensum, s.dat.lenssq));	
			} else {
				fprintf(o, "len max\t%d\n", s.dat.lenmax);	
			}
			fprintf(o, "mapq mean\t%.4f\n", s.dat.mapsum/s.dat.mapn);
			fprintf(o, "mapq stdev\t%.4f\n", stdev(s.dat.mapn, s.dat.mapsum, s.dat.mapssq));

			fprintf(o, "mapq Q1\t%.2f\n", quantile(s.vmapq,.25));
			fprintf(o, "mapq median\t%.2f\n", quantile(s.vmapq,.50));
			fprintf(o, "mapq Q3\t%.2f\n", quantile(s.vmapq,.75));

			if (s.dat.lensum > 0) {
				fprintf(o, "snp rate\t%.6f\n", s.dat.nmsum/s.dat.lensum);
				if (s.dat.ins >0 ) fprintf(o, "ins rate\t%.6f\n", s.dat.ins/s.dat.lensum);
				if (s.dat.del >0 ) fprintf(o, "del rate\t%.6f\n", s.dat.del/s.dat.lensum);
				fprintf(o, "pct mismatch\t%.4f\n", 100.0*((double)s.dat.nmnz/s.dat.mapn));
			}

			if (s.visize.size() > 0) {
				double p10 = quantile(s.visize, .10);
				double p90 = quantile(s.visize, .90);
				double matsum=0, matssq=0;
				int matc = 0;
				int i;
				for(i=0;i<s.visize.size();++i) {
					int v = s.visize[i];
					if (v >= p10 && v <= p90) {
						++matc;
						matsum+=v;
						matssq+=v*v;
					}
				}
				fprintf(o, "insert mean\t%.4f\n", matsum/matc);
				if (matc > 1) {
					fprintf(o, "insert stdev\t%.4f\n", stdev(matc, matsum, matssq));
					fprintf(o, "insert Q1\t%.2f\n", quantile(s.visize, .25));
					fprintf(o, "insert median\t%.2f\n", quantile(s.visize, .50));
					fprintf(o, "insert Q3\t%.2f\n", quantile(s.visize, .75));
				}
			}

			if (s.dat.nbase >0) {
				fprintf(o,"base qual mean\t%.4f\n", (s.dat.qualsum/s.dat.nbase)-phred);
				fprintf(o,"base qual stdev\t%.4f\n", stdev(s.dat.nbase, s.dat.qualsum, s.dat.qualssq));
				fprintf(o,"%%A\t%.4f\n", 100.0*((double)s.dat.basecnt[T_A]/(double)s.dat.nbase));
				fprintf(o,"%%C\t%.4f\n", 100.0*((double)s.dat.basecnt[T_C]/(double)s.dat.nbase));
				fprintf(o,"%%G\t%.4f\n", 100.0*((double)s.dat.basecnt[T_G]/(double)s.dat.nbase));
				fprintf(o,"%%T\t%.4f\n", 100.0*((double)s.dat.basecnt[T_T]/(double)s.dat.nbase));
				if (s.dat.basecnt[T_N] > 0) {
					fprintf(o,"%%N\t%.4f\n", 100.0*((double)s.dat.basecnt[T_N]/(double)s.dat.nbase));
				}
			}
			// how many ref seqs have mapped bases?
			int mseq=0;
			google::dense_hash_map<string,scoverage>::iterator it = s.covr.begin();
			vector<string> vtmp;
			bool haverlen = 0;
			while (it != s.covr.end()) {
				if (it->second.mapb > 0) {
					++mseq;								// number of mapped refseqs
					if (mseq <= max_chr) vtmp.push_back(it->first);		// don't bother if too many chrs
					if (it->second.reflen > 0) haverlen = 1;
				}
				++it;
			}
			// don't print per-seq percentages if size is huge, or is 1
			if ((haverlen || mseq > 1) && mseq <= max_chr) {			// worth reporting
				// sort the id's
				sort(vtmp.begin(),vtmp.end());
				vector<string>::iterator vit=vtmp.begin();
				double logb=log(2);
                vector<double> vcovrvar;
                vector<double> vcovr;
                vector<double> vskew;
                // for each chromosome or reference sequence...
				while (vit != vtmp.end()) {
					scoverage &v = s.covr[*vit];                    // coverage vector
					if (v.reflen && histnum > 0) {                  // user asked for histogram
						string sig;
						int d; double logd, lsum=0, lssq=0;

						for (d=0;d<histnum;++d) {                   // log counts for each portion of the histogram
                            logd = log(1+v.dist[d])/logb;
                            lsum+=logd;
                            lssq+=logd*logd;
							sig += ('0' + (int) logd);
						}
                        if (rnamode) {
                            // variability of coverage
                            double cv = stdev(histnum, lsum, lssq)/(lsum/histnum);
                            // percent coverage estimated using historgram... maybe track real coverage some day, for now this is fine
                            double covr = 0;
                            for (d=0;d<histnum;++d) {
                                // VFAC = % greater than 1 that a bin must be to be considered 100%
                                if (v.dist[d] > VFACTOR*v.reflen/histnum) {
                                    ++covr;     // 100% covered this bin
                                } else {
                                    // calc bases/(factor * size of bin)
                                    covr += ((double)v.dist[d] / ((double)VFACTOR*v.reflen/histnum));
                                }
                            }
                            double origcovr = covr;
                            covr /= (double) histnum;
                            covr = min(100.0*((double)v.mapb/v.reflen),100.0*covr);
                            // when dealing with "position skewness", you need to anchor things
                            v.spos.Push(v.reflen);
                            v.spos.Push(1);
                            double skew = -v.spos.Skewness();
                            // if there's some coverage
                            if (v.mapr > 0) {
                                if (v.mapr > 10) {
                                    // summary stats
                                    vcovr.push_back(covr);              // look at varition
                                    vcovrvar.push_back(cv);             // look at varition
                                    vskew.push_back(skew);              // and skew
                                }
                                if (rnao) {                         // "rna mode"  = more detailed output of coverage and skewness of coverage
        						    fprintf(rnao,"%s\t%d\t%ld\t%.2f\t%.4f\t%.4f\t%s\n", vit->c_str(), v.reflen, v.mapr, covr, skew, cv, sig.c_str());
                                }
                            }
                        } else if (max_chr < 100) {                 // normal dna mode, just print percent alignment to each
    						fprintf(o,"%%%s\t%.2f\t%s\n", vit->c_str(), 100.0*((double)v.mapb/s.dat.lensum), sig.c_str());
                        } else {
    						fprintf(o,"%%%s\t%.6f\t%s\n", vit->c_str(), 100.0*((double)v.mapb/s.dat.lensum), sig.c_str());
                        }
					} else {
                        if (max_chr < 100) {
						    fprintf(o,"%%%s\t%.2f\n", vit->c_str(), 100.0*((double)v.mapb/s.dat.lensum));
                        } else {
						    fprintf(o,"%%%s\t%.6f\n", vit->c_str(), 100.0*((double)v.mapb/s.dat.lensum));
                        }
					}
					++vit;
				}
                if (rnamode) {
		            sort(vcovr.begin(), vcovr.end());
		            sort(vcovrvar.begin(), vcovrvar.end());
		            sort(vskew.begin(), vskew.end());
                    double medcovrvar = quantile(vcovrvar,.5);
                    double medcovr = quantile(vcovr,.5);
                    double medskew = quantile(vskew,.5);
                    fprintf(o,"median skew\t%.2f\n", medskew);
                    fprintf(o,"median coverage cv\t%.2f\n", medcovrvar);
                    fprintf(o,"median coverage\t%.2f\n", medcovr);
                }
			}
			if (s.covr.size() > 1) {
				fprintf(o,"num ref seqs\t%d\n", (int) s.covr.size());
				fprintf(o,"num ref aligned\t%d\n", (int) mseq);
			}
		} else {
			if (s.covr.size() > 1) {
				fprintf(o,"num ref seqs\t%d\n", (int) s.covr.size());
			}
		}
	}
	return errs ? 1 : 0;
}

#define S_ID 0
#define S_BITS 1
#define S_NMO 2
#define S_POS 3
#define S_MAPQ 4
#define S_CIG 5
#define S_MATEREF 6
#define S_MATE 8
#define S_READ 9
#define S_QUAL 10
#define S_TAG 11

void sstats::dostats(string name, int rlen, int bits, const string &ref, int pos, int mapq, const string &materef, int nmate, const string &seq, const char *qual, int nm, int del, int ins) {

	++dat.n;

	if (bits & 0x04) return;       // bits say ... query was not mapped

	if (pos<=0) {
	    ++dat.mapzero;             // quantify weird errors
        return;				       // not mapped well enough to count
    }

	++dat.mapn;                    // mapped query

    // TODO: build a histogram of read lengths using the integer bucket

    // read length min/max
	if (rlen > dat.lenmax) dat.lenmax = rlen;
	if ((rlen < dat.lenmin) || dat.lenmin==0) dat.lenmin = rlen;

    // read length sum/ssq
	dat.lensum += rlen;
	dat.lenssq += rlen*rlen;

    // TODO: allow for alternate paired-end layouts besides Illumina's

    // reverse stranded query
	if (bits & 16) 
	    if (bits & 0x40)            // first read in the pair
    		++dat.nrev;             // reverse
		else
            ++dat.nfor;             // second read? actually was a forward alignment
	else
	    if (bits & 0x40)            // first read in the pair
		    ++dat.nfor;             
        else
		    ++dat.nrev;

    // mapping quality mean/stdev
	dat.mapsum += mapq;
	dat.mapssq += mapq*mapq;

    // mapping quality histogram
    vmapq.push(mapq);

    // TODO: NM histogram maybe?

    // number of mismateches
	if (nm > 0) {
        // nm is snp+ins+del... which is silly
		dat.nmnz += 1;                          // how many read are not perfect matches?
		dat.nmsum += nm-del-ins;                // mismatch sum
	}
	dat.del+=del;                               // deletion sum
	dat.ins+=ins;                               // insert sum

    // if we know about the reference sequence
	if (ref.length()) {
		scoverage *sc = &(covr[ref]);
		if (sc) {                               // and we have ram for coverage
			sc->mapb+=rlen;                     // total up mapped bases in that ref
            if (rnamode) {                      // more detailed
                int i;
			    sc->mapr+=1;
                for (i=0;i<rlen;++i) {          // walk along read
                    sc->spos.Push(pos+i);       // per-position stats
                }
			    if (histnum > 0 && sc->reflen > 0) {                                // if we're making a histogram
                    for (i=0;i<rlen;++i) {                                          // walk along read
				        int x = histnum * ((double)(pos+i) / sc->reflen);           // find the bucket this base is in
                        if (x < histnum) {                                          
                            sc->dist[x]+=1;                                         // add 1 to that bucket
                        } else {
                            // out of bounds.... what to do?
                            sc->dist[histnum] += 1;                                 // out of bounds bases (fall off the edge) = extra bucket
                        }
                    }
                }
            } else if (histnum > 0 && sc->reflen > 0) {                             // lightweight... don't deal with each base, ok becauss CHRs are big
				int x = histnum * ((double)pos / sc->reflen);
				if (debug > 1) { 
					warn("chr: %s, hn: %d, pos: %d, rl: %d, x: %x\n", ref.c_str(), histnum, pos, sc->reflen, x);
				}
				if (x < histnum) {
                    sc->dist[x]+=rlen;
				} else {
					// out of bounds.... what to do?
					sc->dist[histnum] +=rlen;
				}
			}
		}
	}
    // total mapped bases += read length
	dat.tmapb+=rlen;
	if (nmate>0) {
        // insert size histogram
		visize.push_back(nmate);
		dat.pe=1;
	}

    // mate reference chromosome is not the same as my own?
	if (materef.size() && (materef != "=" && materef != "*" && materef != ref)) {
        // this is a discordant read
		dat.disc++;
	} else {
    // mate reference chromosome is far (>50kb) from my own?
		if (abs(nmate) > 50000) {
            // this is discordant-by position
			dat.disc_pos++;
		}
	}

    // walk along sequence, add qualities to overall min/max/mean/stdev 
	int i, j;
	for (i=0;i<seq.length();++i) {
		if (qual[i]>dat.qualmax) dat.qualmax=qual[i];
		if (qual[i]<dat.qualmin) dat.qualmin=qual[i];
		dat.qualsum+=qual[i];
		dat.qualssq+=qual[i]*qual[i];
        // also count bases
		++dat.basecnt[basemap[seq[i]]];
        // total number of bases counted (this should be the same as tmapb???   get rid of it???)
		++dat.nbase;
	}

    // TODO: we should be able to use the "non primary" bit field
    //       need to test to see if this works for all aligners
    //       then have a mode that only report stats for primary alignments... for example, and no need for this 
    //       expensive, giant hash table

    // duplicate tracking turned on?
	if (trackdup) {
		size_t p;
        // illumina mode... check for a space in the name, and ignore stuff after it
		if ((p = name.find_first_of(' '))!=string::npos) 
			name.resize(p);

        // count dups for that id
		int x=++dups[name];

        // keep track of max dups
		if (x>dat.dupmax) 
			dat.dupmax=x;

        // fastq-output mode... 
        if (sefq) {
            // if the data isn't paired end or if we're not sure yet
            if (!dat.pe || dat.mapn < 1000) {
                // output a single end fq
                fprintf(sefq,"@%s\n%s\n+\n%s\n",name.c_str(), seq.c_str(), qual);
            }
        }

        // if we're outputting paired-end fastq's and if there's not a lot of dups
        if (pefq1 && x < 4 && (dat.pe || dat.mapn < 1000)) {
            fqent fq;
            google::sparse_hash_map<string,fqent>::iterator it=petab.find(name);
            // find my mate?
            if (it == petab.end()) {
                // no, add me
                fq.r=seq;
                fq.q=qual;
                fq.bits=bits&0x40;                  // mate flag
                petab[name]=fq;
            } else if (it->second.bits != bits) {
                // yes? remove me
                fq=it->second;
                fprintf(pefq1,"@%s 1\n%s\n+\n%s\n",name.c_str(), fq.r.c_str(), fq.q.c_str());
                fprintf(pefq2,"@%s 2\n%s\n+\n%s\n",name.c_str(), seq.c_str(), qual);
                petab.erase(it); 
            }
        }
	}
}

// parse a sam file... maybe let samtools do this, and then handle stats in "bam mode"... faster for sure
bool sstats::parse_sam(FILE *f) {
	line l; meminit(l);
    int lineno=0;
    int warnings=0;
	while (read_line(f, l)>0)  {
        ++lineno;
		char *sp;
		if (l.s[0]=='@') {
			if (!strncmp(l.s,"@SQ\t",4)) {
				char *t=strtok_r(l.s, "\t", &sp);
				string sname; int slen=0;
				while(t) {
					if (!strncmp(t,"SN:",3)) {
						sname=&(t[3]);
						if (slen) 
							break;
					} else if (!strncmp(t,"LN:",3)) {
						slen=atoi(&t[3]);
						if (sname.length()) 
							break;
					}
					t=strtok_r(NULL, "\t", &sp);
				}
				covr[sname].reflen=slen;
			}
			continue;
		}
		char *t=strtok_r(l.s, "\t", &sp);
		char *d[100]; meminit(d);
		int n =0;
		while(t) {
			d[n++]=t;
			t=strtok_r(NULL, "\t", &sp);
		}
		int nm=0;
		int i;
		// get # mismatches
		for (i=S_TAG;i<n;++i){
			if (d[i] && !strncasecmp(d[i],"NM:i:",5)) {
				nm=atoi(&d[i][5]);
			}
		}

		if (!d[S_BITS] || !isdigit(d[S_BITS][0]) 
		 || !d[S_POS]  || !isdigit(d[S_POS][0])
		   ) {
            if (warnings < 5) {
                warn("Line %d, missing bits/position information\n", lineno);
                ++warnings;
            }
			// invalid sam
			return false;
		}

		int ins = 0, del = 0;	
		char *p=d[S_CIG];
		// sum the cig
		while (*p) {
			int n=strtod(p, &sp);
			if (sp==p) {
				break;
			}
			if (*sp == 'I') 
				ins+=n;
			else if (*sp == 'D') 
				del+=n;
			p=sp+1;
		}

        // force unmapped to position negative one
		if (d[S_CIG][0] == '*') d[S_POS] = (char *) "-1";

        // as-if it were a bam...
		dostats(d[S_ID],strlen(d[S_READ]),atoi(d[S_BITS]),d[S_NMO],atoi(d[S_POS]),atoi(d[S_MAPQ]),d[S_MATEREF],atoi(d[S_MATE]),d[S_READ],d[S_QUAL],nm, ins, del);
	}
	return true;
}

// let samtools parse the bam
bool sstats::parse_bam(const char *in) {
    samfile_t *fp;
    if (!(fp=samopen(in, "rb", NULL))) {
            warn("Error reading '%s': %s\n", in, strerror(errno));
            return false;
    }
    if (fp->header) {
        int i;
        for (i = 0; i < fp->header->n_targets; ++i) {
            covr[fp->header->target_name[i]].reflen=fp->header->target_len[i];
        }
    }
	bam1_t *al=bam_init1();
    int ret=0;
    while ( (ret=samread(fp, al)) > 0 ) {
        uint32_t *cig = bam1_cigar(al);
        char *name = bam1_qname(al);
        int len = al->core.l_qseq;
        uint8_t *tag=bam_aux_get(al, "NM");     // NM tag
		int nm = tag ? bam_aux2i(tag) : 0;
		int ins=0, del=0;
		int i;

        // count inserts and deletions
		for (i=0;i<al->core.n_cigar;++i) {
            int op = cig[i] & BAM_CIGAR_MASK;
			if (op == BAM_CINS) {
				ins+=(cig[i] >> BAM_CIGAR_SHIFT);
			} else if (op == BAM_CDEL) {
				del+=(cig[i] >> BAM_CIGAR_SHIFT);
			}
		}

        // crappy cigar?
		if (al->core.n_cigar == 0) 
			al->core.pos=-1;                    // not really a match if there's no cigar string... this deals with bwa's issue

        char *qual = (char *) bam1_qual(al);    // qual string
        uint8_t * bamseq = bam1_seq(al);        // sequence string
        string seq; seq.resize(len);            // ok... really make it a string
        for (i=0;i<len;++i) {
            seq[i] = bam_nt16_rev_table[bam1_seqi(bamseq, i)];
            qual[i] += 33;
        }

        // now do stats
		dostats(name,len,al->core.flag,al->core.tid>=0?fp->header->target_name[al->core.tid]:"",al->core.pos+1,al->core.qual, al->core.mtid>=0?fp->header->target_name[al->core.mtid]:"", al->core.isize, seq, qual, nm, ins, del);
	}
    if (ret < -2) {
            // no stats .. corrupt file
            return false;
    }
    if (ret < -1) {
        ++errs;
        // truncated file, output stats, but return error code
        return true;
    }
	return true;
}

void usage(FILE *f) {
        fprintf(f,
"Usage: sam-stats [options] [file1] [file2...filen]\n"
"Version: %s.%d\n"
"\n"
"Produces lots of easily digested statistics for the files listed\n"
"\n"
"Options (default in parens):\n"
"\n"
"-D             Keep track of multiple alignments\n"
"-O PREFIX      Output prefix enabling extended output (see below)\n"
"-R FIL         Coverage/RNA output (coverage, 3' bias, etc, implies -A)\n"
"-A             Report all chr sigs, even if there are more than 1000\n"
"-b INT         Number of reads to sample for per-base stats (1M)\n"
"-S INT         Size of ascii-signature (30)\n"
"-x FIL         File extension for handling multiple files (stats)\n"
"-M             Only overwrite if newer (requires -x, or multiple files)\n"
"-B             Input is bam, don't bother looking at magic\n"
"-z             Don't fail when zero entries in sam\n"
"\n"
"OUTPUT:\n"
"\n"
"If one file is specified, then the output is to standard out.  If\n"
"multiple files are specified, or if the -x option is supplied,\n"
"the output file is <filename>.<ext>.  Default extension is 'stats'.\n"
"\n"
"Complete Stats:\n"
"\n"
"  <STATS>           : mean, max, stdev, median, Q1 (25 percentile), Q3\n"
"  reads             : # of entries in the sam file, might not be # reads\n"
"  phred             : phred scale used\n"
"  bsize             : # reads used for qual stats\n"
"  mapped reads      : number of aligned reads (unique probe id sequences)\n"
"  mapped bases      : total of the lengths of the aligned reads\n"
"  forward           : number of forward-aligned reads\n"
"  reverse           : number of reverse-aligned reads\n"
"  snp rate          : mismatched bases / total bases (snv rate)\n"
"  ins rate          : insert bases / total bases\n"
"  del rate          : deleted bases / total bases\n"
"  pct mismatch      : percent of reads that have mismatches\n"
"  pct align         : percent of reads that aligned\n"
"  len <STATS>       : read length stats, ignored if fixed-length\n"
"  mapq <STATS>      : stats for mapping qualities\n"
"  insert <STATS>    : stats for insert sizes\n"
"  %%<CHR>           : percentage of mapped bases per chr, followed by a signature\n"
"\n"
"Subsampled stats (1M reads max):\n"
"  base qual <STATS> : stats for base qualities\n"
"  %%A,%%T,%%C,%%G       : base percentages\n"
"\n"
"Meaning of the per-chromosome signature:\n"
"  A ascii-histogram of mapped reads by chromosome position.\n"
"  It is only output if the original SAM/BAM has a header. The values\n"
"  are the log2 of the # of mapped reads at each position + ascii '0'.\n"
"\n"
"Extended output mode produces a set of files:\n"
"  .stats           : primary output\n"
"  .fastx           : fastx-toolkit compatible output\n"
"  .rcov            : per-reference counts & coverage\n"
"  .xdist           : mismatch distribution\n"
"  .ldist           : length distribution (if applicable)\n"
"  .mqdist          : mapping quality distribution\n"
"\n"
        ,VERSION, SVNREV);
}

std::string string_format(const std::string &fmt, ...) {
       int n, size=100;
       std::string str;
       va_list ap;
       while (1) {
       str.resize(size);
       va_start(ap, fmt);
       int n = vsnprintf((char *)str.c_str(), size, fmt.c_str(), ap);
       va_end(ap);
       if (n > -1 && n < size)
           return str;
       if (n > -1)
           size=n+1;
       else
           size*=2;
       }
}

// R-compatible quantile code : TODO convert to template

template <class vtype>
double quantile(const vtype &vec, double p) {
        int l = vec.size();
        if (!l) return 0;
        double t = ((double)l-1)*p;
        int it = (int) t;
        int v=vec[it];
        if (t > (double)it) {
                return (v + (t-it) * (vec[it+1] - v));
        } else {
                return v;
        }
}

template <class itype>
double quantile(const vector<itype> &vec, double p) {
        int l = vec.size();
        if (!l) return 0;
        double t = ((double)l-1)*p;
        int it = (int) t;
        itype v=vec[it];
        if (t > (double)it) {
                return (v + (t-it) * (vec[it+1] - v));
        } else {
                return v;
        }
}

void build_basemap() {
	int cb,j;
	for (cb=0;cb<256;++cb) {
		switch(cb) {
			case 'A': case 'a':
				j=T_A; break;
			case 'C': case 'c':
				j=T_C; break;
			case 'G': case 'g':
				j=T_G; break;
				case 'T': case 't':
					j=T_T; break;
				default:
					j=T_N; break;
		}
		basemap[cb]=j;
	}
}