iVar
call_consensus.cpp
1 #include "htslib/vcf.h"
2 
3 #include<iostream>
4 #include<fstream>
5 
6 struct maj {
7  int *ind;
8  int n;
9 };
10 
11 #define GET_MAX_INDICES(type_t, pu, no, m) { \
12  type_t v = 1; \
13  type_t *p = (type_t*) (pu); \
14  for (int k = 0; k < no; ++k){ \
15  if(p[k] > v){ \
16  v = p[k]; \
17  m.n = 0; \
18  m.ind[m.n] = k; \
19  } else if(p[k] == v) { \
20  m.n++; \
21  m.ind[m.n] = k; \
22  } \
23  } \
24  }
25 
26 #define INDEL_SIZE 30
27 
28 #define GAP 'N'
29 
30 // From bcftools.h - https://github.com/samtools/bcftools/blob/b0376dff1ed70603c9490802f37883b9009215d2/bcftools.h#L48
31 static inline char gt2iupac(char a, char b)
32 {
33  static const char iupac[4][4] = { {'A','M','R','W'},{'M','C','S','Y'},{'R','S','G','K'},{'W','Y','K','T'} };
34  if ( a>='a' ) a -= 'a' - 'A';
35  if ( b>='a' ) b -= 'a' - 'A';
36  if ( a=='A' ) a = 0;
37  else if ( a=='C' ) a = 1;
38  else if ( a=='G' ) a = 2;
39  else if ( a=='T' ) a = 3;
40  else return 'N';
41  if ( b=='A' ) b = 0;
42  else if ( b=='C' ) b = 1;
43  else if ( b=='G' ) b = 2;
44  else if ( b=='T' ) b = 3;
45  else return 'N';
46  return iupac[(int)a][(int)b];
47 }
48 
49 void get_consensus_indel(char **allele, maj m, char *&nuc){
50  int max_len = 0, l = 0;
51  uint8_t q = 0;
52  for (int k = 0; k <= m.n; ++k){
53  if(strlen(allele[m.ind[k]]) > max_len)
54  max_len = strlen(allele[m.ind[k]]);
55  }
56  char n;
57  for (int j = 0; j < max_len; ++j){ // Iterate over base positions
58  n = 0;
59  for (int k = 0; k <= m.n ; ++k){ // Iterate over major alleles
60  if(j < strlen(allele[m.ind[k]])){
61  if(n == 0){
62  n = allele[m.ind[k]][j];
63  } else {
64  n = gt2iupac(n, allele[m.ind[k]][j]);
65  }
66  }
67  }
68  if(n == 0)
69  nuc[l] = GAP;
70  else
71  nuc[l] = n;
72  l++;
73  if(l % INDEL_SIZE == 0){
74  nuc = (char*) realloc(nuc, INDEL_SIZE * ((l/INDEL_SIZE) + 1) * sizeof(char));
75  }
76  }
77  nuc[l] = 0;
78 }
79 
80 int main_old(int argc, char* argv[]) {
81  std::cout << "Path " << argv[1] <<std::endl;
82  std::string bcf = std::string(argv[1]);
83  std::string region_;
84  std::string out_path = std::string(argv[2]);
85  if(argc > 3) {
86  region_ = std::string(argv[3]);
87  }
88  if(!bcf.empty()) {
89  //open BCF for reading
90  htsFile *in = bcf_open(bcf.c_str(), "r");
91  std::ofstream out(out_path);
92  out << ">" << out_path << "\n";
93  if(in == NULL) {
94  throw std::runtime_error("Unable to open BCF/VCF file.");
95  }
96  int ad_id;
97  //Get the header
98  bcf_hdr_t *header = bcf_hdr_read(in);
99  if(header == NULL) {
100  bcf_close(in);
101  throw std::runtime_error("Unable to open BCF header.");
102  }
103  ad_id = bcf_hdr_id2int(header, BCF_DT_ID, "AD");
104  std::cout << ad_id << " " << std::endl;
105  //Initialize iterator
106  bcf1_t *v = bcf_init();
107  int ctr = 0, prev_pos = 0, s, d;
108  maj m;
109  char *nuc =(char *) malloc(INDEL_SIZE * sizeof(char)), *r;
110  while(bcf_read(in, header, v) ==0){
111  // std::cout << "Position: " << v->pos << std::endl;
112  // std::cout << "Number of Alleles: " << v->n_allele << std::endl;
113  for (int i = prev_pos; i < v->pos - 1; ++i){
114  out << GAP;
115  }
116  bcf_unpack(v, BCF_UN_FMT);
117  bcf_unpack(v, BCF_UN_STR);
118  bcf_fmt_t *fmt = v->d.fmt;
119  for (int i = 0; i < v->n_fmt; ++i){
120  if(strcmp(header->id[BCF_DT_ID][fmt[i].id].key, "AD") == 0){
121  m.ind = (int *) malloc(fmt[i].n * sizeof(int));
122  m.n = -1;
123  switch(fmt[i].type){
124  case BCF_BT_INT8: GET_MAX_INDICES(int8_t, fmt[i].p, fmt[i].n, m); break;
125  case BCF_BT_INT16: GET_MAX_INDICES(int16_t, fmt[i].p, fmt[i].n, m); break;
126  case BCF_BT_INT32: GET_MAX_INDICES(int32_t, fmt[i].p, fmt[i].n, m); break;
127  case BCF_BT_FLOAT: GET_MAX_INDICES(float, fmt[i].p, fmt[i].n, m); break;
128  default: hts_log_error("Unexpected type %d", fmt[i].type); exit(1);
129  }
130  if(m.n == -1){
131  out << GAP;
132  } else {
133  if(bcf_get_variant_types(v) == VCF_REF){
134  out << v->d.allele[0];
135  } else if(bcf_get_variant_types(v) == VCF_SNP || bcf_get_variant_types(v) == VCF_MNP){
136  nuc =(char *) malloc(INDEL_SIZE * sizeof(char));
137  get_consensus_indel(v->d.allele, m, nuc);
138  out << nuc;
139  } else { // For INDEL insert difference.between reference and consensus allele
140  nuc =(char *) malloc(INDEL_SIZE * sizeof(char));
141  get_consensus_indel(v->d.allele, m, nuc);
142  s = strlen(v->d.allele[0]) - strlen(nuc); // > 0 Deletion. < 0 INSERTION
143  d = (s > 0) ? strlen(v->d.allele[0]) : strlen(nuc);
144  r = (s > 0) ? v->d.allele[0] : nuc;
145  s = (s < 0) ? -1 * s : s;
146  if(v->pos == 499)
147  std::cout << "Number: " << m.n << " Maj Allele: " << v->d.allele[m.ind[0]] << strlen(nuc) << " - Nuc Length" << nuc << " " << s << " " << d << std::endl;
148  for (int i = 0; i < s; ++i){
149  out << r[d - s];
150  }
151  free(m.ind);
152  free(nuc);
153  // std::cout << nuc << std::endl;
154  }
155  }
156  }
157  }
158  prev_pos = v->pos;
159  ctr++;
160  if(ctr % 1000 == 0){
161  std::cout << ctr << std::endl;
162  }
163  }
164  bcf_destroy(v);
165  bcf_hdr_destroy(header);
166  bcf_close(in);
167  out.close();
168  }
169  return 0;
170 }