1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167
|
#include <zlib.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <stdlib.h>
#include "maqmap.h"
#include "stdhash.hh"
#include "main.h"
#include "stdhash.hh"
#include "bfa.h"
/*
Here is a delicate example. ref_nt=ATTAAC(RBRBG), read_cs=RBBOG. If we
decode as ATTGAC(RBGOG), there are one color change and one nt change;
if we decode as ATTAAC(RBRBG), there are two color changes.
In DP, if color quality is smaller than COLOR_MM, we will use COLOR_MM
as the penalty; otherwise, we will use color quality as the
penalty. This means we always prefer two consistent color changes over
a nt change, but if a color has high quality, we may prefer one nt
change.
In the above example, the penalties of the two types of decoding are
q(B)+25 and q(B)+q(O), respectively. If q(O)>25, we prefer the first;
otherwise the second. Note that no matter what we choose, the fourth
base will get a low nt quality.
*/
#define COLOR_MM 19
#define NUCL_MM 25
const int nst_ntnt2cs_table[] = { 4, 0, 0, 1, 0, 2, 3, 4, 0, 3, 2, 4, 1, 4, 4, 4 };
const int nst_ntcs2nt_table[] = { 0, 1, 2, 3, 1, 0, 3, 2, 2, 3, 0, 1, 3, 2, 1, 0 }; // useless at the moment
static inline int get_readseq(const nst_bfa1_t *l, int pos, int size, bit8_t *ntread)
{
int k;
bit64_t *s, *m;
if (pos + size >= l->ori_len) return 1;
s = l->seq + (pos>>5); m = l->mask + (pos>>5);
for (k = 0; k <= size; ++k) {
int i = (31 - ((pos+k)&0x1f)) << 1;
ntread[k] = ((*m>>i&0x3) == 3)? (*s>>i&0x3) : 4;
if (31 - ((pos+k)&0x1f) == 0) {
++s; ++m;
}
}
return 0;
}
static void cs2nt_dp(int size, const bit8_t *nt_ref, const bit8_t *csseq, bit8_t *nt_read, bit8_t *btarray)
{
int h[8], curr, last;
int x, y, xmin, hmin, k;
// recursion: initial value
if (nt_ref[0] >= 4) memset(h, 0, sizeof(int) << 2);
else {
for (x = 0; x != 4; ++x) h[x] = NUCL_MM;
h[nt_ref[0]] = 0;
}
// recursion: main loop
curr = 1; last = 0;
for (k = 1; k <= size; ++k) {
for (x = 0; x != 4; ++x) {
int min = 0x7fffffff, ymin = 0;
for (y = 0; y != 4; ++y) {
int s = h[last<<2|y];
if (csseq[k-1] && csseq[k-1]>>6 != nst_ntnt2cs_table[1<<x|1<<y])
s += ((csseq[k-1]&0x3f) < COLOR_MM)? COLOR_MM : (csseq[k-1]&0x3f); // color mismatch
if (nt_ref[k] < 4 && nt_ref[k] != x) s += NUCL_MM; // nt mismatch
if (s < min) {
min = s; ymin = y;
}
}
h[curr<<2|x] = min; btarray[k<<2|x] = ymin;
}
last = curr; curr = 1 - curr; // swap
}
// back trace
hmin = 0x7fffffff; xmin = 0;
for (x = 0; x != 4; ++x) {
if (h[last<<2|x] < hmin) {
hmin = h[last<<2|x]; xmin = x;
}
}
nt_read[size] = xmin;
for (k = size - 1; k >= 0; --k)
nt_read[k] = btarray[(k+1)<<2 | nt_read[k+1]];
}
static void cal_nt_qual(int size, const bit8_t *nt_read, bit8_t *seq, bit8_t *tarray)
{
int k, c1, c2;
bit8_t *t2array = tarray + size;
// get the color sequence of nt_read
c1 = nt_read[0];
for (k = 1; k <= size; ++k) {
c2 = nt_read[k]; // in principle, there is no 'N' in nt_read[]
tarray[k-1] = (c1 >= 4 || c2 >= 4)? 4 : nst_ntnt2cs_table[1<<c1 | 1<<c2];
c1 = c2;
}
for (k = 1; k != size; ++k) {
int q = 0;
if (tarray[k-1] == seq[k-1]>>6 && tarray[k] == seq[k]>>6) {
q = int(seq[k-1]&0x3f) + int(seq[k]&0x3f) + 10;
} else if (tarray[k-1] == seq[k-1]>>6) {
q = int(seq[k-1]&0x3f) - int(seq[k]&0x3f);
} else if (tarray[k] == seq[k]>>6) {
q = int(seq[k]&0x3f) - int(seq[k-1]&0x3f);
} // else, q = 0
if (q < 1) q = 1;
if (q > 63) q = 63;
t2array[k] = nt_read[k]<<6 | q;
if (seq[k-1] == 0 && seq[k] == 0) t2array[k] = 0;
}
memcpy(seq, t2array+1, size-1); // t2array[0] and t2array[size] are not copied
seq[size-1] = 0;
}
static void csmap2nt_core(gzFile fpout, FILE *fpbfa, gzFile fpmap)
{
nst_bfa1_t *l = 0;
bit32_t seqid = 0;
int k;
hash_map_char<bit32_t> *hash_map = new hash_map_char<bit32_t>;
maqmap_t *mm = maqmap_read_header(fpmap); // skip the sequence names
maqmap1_t mm1, *m1;
bit8_t nt_ref[MAX_READLEN+1], nt_read[MAX_READLEN+1], tarray[(MAX_READLEN+1) * 4];
m1 = &mm1;
for (k = 0; k != mm->n_ref; ++k) hash_map->insert(mm->ref_name[k], k);
maqmap_write_header(fpout, mm);
maqmap_read1(fpmap, m1);
while ((l = nst_load_bfa1(fpbfa)) != 0) {
if (!hash_map->find(l->name, &seqid)) {
nst_delete_bfa1(l);
continue;
}
do {
if (m1->seqid != seqid) break;
if (get_readseq(l, m1->pos>>1, m1->size, nt_ref)) continue;
cs2nt_dp(m1->size, nt_ref, m1->seq, nt_read, tarray);
cal_nt_qual(m1->size, nt_read, m1->seq, tarray);
m1->pos += 2;
--m1->size;
gzwrite(fpout, m1, sizeof(maqmap1_t));
} while (maqmap_read1(fpmap, m1));
nst_delete_bfa1(l);
if (gzeof(fpmap)) break; // end of alignment file
}
maq_delete_maqmap(mm);
delete hash_map;
}
int maq_csmap2nt(int argc, char *argv[])
{
gzFile fpin, fpout;
FILE *fp_bfa;
if (argc < 4) {
fprintf(stderr, "Usage: maq csmap2nt <out.nt.map> <in.ref.nt.bfa> <in.cs.map>\n");
return 1;
}
fpout = (strcmp(argv[1], "-") == 0)? gzdopen(fileno(stdout), "w") : gzopen(argv[1], "w");
fp_bfa = fopen(argv[2], "r");
fpin = (strcmp(argv[3], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[3], "r");
assert(fpout && fpin && fp_bfa);
csmap2nt_core(fpout, fp_bfa, fpin);
gzclose(fpin); gzclose(fpout); fclose(fp_bfa);
return 0;
}
|