1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145
|
#include "pysam.h"
/* This program demonstrates how to generate pileup from multiple BAMs
* simutaneously, to achieve random access and to use the BED interface.
* To compile this program separately, you may:
*
* gcc -g -O2 -Wall -o bam2depth -D_MAIN_BAM2DEPTH bam2depth.c -L. -lbam -lz
*/
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>
#include "bam.h"
typedef struct { // auxiliary data structure
bamFile fp; // the file handler
bam_iter_t iter; // NULL if a region not specified
int min_mapQ, min_len; // mapQ filter; length filter
} aux_t;
void *bed_read(const char *fn); // read a BED or position list file
void bed_destroy(void *_h); // destroy the BED data structure
int bed_overlap(const void *_h, const char *chr, int beg, int end); // test if chr:beg-end overlaps
// This function reads a BAM alignment from one BAM file.
static int read_bam(void *data, bam1_t *b) // read level filters better go here to avoid pileup
{
aux_t *aux = (aux_t*)data; // data in fact is a pointer to an auxiliary structure
int ret = aux->iter? bam_iter_read(aux->fp, aux->iter, b) : bam_read1(aux->fp, b);
if (!(b->core.flag&BAM_FUNMAP)) {
if ((int)b->core.qual < aux->min_mapQ) b->core.flag |= BAM_FUNMAP;
else if (aux->min_len && bam_cigar2qlen(&b->core, bam1_cigar(b)) < aux->min_len) b->core.flag |= BAM_FUNMAP;
}
return ret;
}
int read_file_list(const char *file_list,int *n,char **argv[]);
#ifdef _MAIN_BAM2DEPTH
int main(int argc, char *argv[])
#else
int main_depth(int argc, char *argv[])
#endif
{
int i, n, tid, beg, end, pos, *n_plp, baseQ = 0, mapQ = 0, min_len = 0, nfiles;
const bam_pileup1_t **plp;
char *reg = 0; // specified region
void *bed = 0; // BED data structure
char *file_list = NULL, **fn = NULL;
bam_header_t *h = 0; // BAM header of the 1st input
aux_t **data;
bam_mplp_t mplp;
// parse the command line
while ((n = getopt(argc, argv, "r:b:q:Q:l:f:")) >= 0) {
switch (n) {
case 'l': min_len = atoi(optarg); break; // minimum query length
case 'r': reg = strdup(optarg); break; // parsing a region requires a BAM header
case 'b': bed = bed_read(optarg); break; // BED or position list file can be parsed now
case 'q': baseQ = atoi(optarg); break; // base quality threshold
case 'Q': mapQ = atoi(optarg); break; // mapping quality threshold
case 'f': file_list = optarg; break;
}
}
if (optind == argc && !file_list) {
fprintf(pysamerr, "\n");
fprintf(pysamerr, "Usage: samtools depth [options] in1.bam [in2.bam [...]]\n");
fprintf(pysamerr, "Options:\n");
fprintf(pysamerr, " -b <bed> list of positions or regions\n");
fprintf(pysamerr, " -f <list> list of input BAM filenames, one per line [null]\n");
fprintf(pysamerr, " -l <int> minQLen\n");
fprintf(pysamerr, " -q <int> base quality threshold\n");
fprintf(pysamerr, " -Q <int> mapping quality threshold\n");
fprintf(pysamerr, " -r <chr:from-to> region\n");
fprintf(pysamerr, "\n");
return 1;
}
// initialize the auxiliary data structures
if (file_list)
{
if ( read_file_list(file_list,&nfiles,&fn) ) return 1;
n = nfiles;
argv = fn;
optind = 0;
}
else
n = argc - optind; // the number of BAMs on the command line
data = calloc(n, sizeof(void*)); // data[i] for the i-th input
beg = 0; end = 1<<30; tid = -1; // set the default region
for (i = 0; i < n; ++i) {
bam_header_t *htmp;
data[i] = calloc(1, sizeof(aux_t));
data[i]->fp = bam_open(argv[optind+i], "r"); // open BAM
data[i]->min_mapQ = mapQ; // set the mapQ filter
data[i]->min_len = min_len; // set the qlen filter
htmp = bam_header_read(data[i]->fp); // read the BAM header
if (i == 0) {
h = htmp; // keep the header of the 1st BAM
if (reg) bam_parse_region(h, reg, &tid, &beg, &end); // also parse the region
} else bam_header_destroy(htmp); // if not the 1st BAM, trash the header
if (tid >= 0) { // if a region is specified and parsed successfully
bam_index_t *idx = bam_index_load(argv[optind+i]); // load the index
data[i]->iter = bam_iter_query(idx, tid, beg, end); // set the iterator
bam_index_destroy(idx); // the index is not needed any more; phase out of the memory
}
}
// the core multi-pileup loop
mplp = bam_mplp_init(n, read_bam, (void**)data); // initialization
n_plp = calloc(n, sizeof(int)); // n_plp[i] is the number of covering reads from the i-th BAM
plp = calloc(n, sizeof(void*)); // plp[i] points to the array of covering reads (internal in mplp)
while (bam_mplp_auto(mplp, &tid, &pos, n_plp, plp) > 0) { // come to the next covered position
if (pos < beg || pos >= end) continue; // out of range; skip
if (bed && bed_overlap(bed, h->target_name[tid], pos, pos + 1) == 0) continue; // not in BED; skip
fputs(h->target_name[tid], stdout); printf("\t%d", pos+1); // a customized printf() would be faster
for (i = 0; i < n; ++i) { // base level filters have to go here
int j, m = 0;
for (j = 0; j < n_plp[i]; ++j) {
const bam_pileup1_t *p = plp[i] + j; // DON'T modfity plp[][] unless you really know
if (p->is_del || p->is_refskip) ++m; // having dels or refskips at tid:pos
else if (bam1_qual(p->b)[p->qpos] < baseQ) ++m; // low base quality
}
printf("\t%d", n_plp[i] - m); // this the depth to output
}
putchar('\n');
}
free(n_plp); free(plp);
bam_mplp_destroy(mplp);
bam_header_destroy(h);
for (i = 0; i < n; ++i) {
bam_close(data[i]->fp);
if (data[i]->iter) bam_iter_destroy(data[i]->iter);
free(data[i]);
}
free(data); free(reg);
if (bed) bed_destroy(bed);
if ( file_list )
{
for (i=0; i<n; i++) free(fn[i]);
free(fn);
}
return 0;
}
|