iVar
trim_primer_quality.cpp
1 #include "htslib/sam.h"
2 #include "htslib/bgzf.h"
3 
4 #include <iostream>
5 #include <stdexcept>
6 #include <string>
7 #include <vector>
8 #include <fstream>
9 #include <sstream>
10 #include <cstring>
11 
12 #include "trim_primer_quality.h"
13 #include "primer_bed.h"
14 
15 int32_t get_pos_on_query(uint32_t *cigar, uint32_t ncigar, int32_t pos, int32_t ref_start){
16  int cig;
17  int32_t n;
18  int32_t ql = 0, rl = ref_start;
19  for (uint32_t i = 0; i < ncigar; ++i){
20  cig = bam_cigar_op(cigar[i]);
21  n = bam_cigar_oplen(cigar[i]);
22  if (bam_cigar_type(cig) & 2) { // Only reference consuming
23  if (pos <= rl + n) {
24  if (bam_cigar_type(cig) & 1) // Only query consuming
25  ql += (pos - rl); // n consumed reference, check if it consumes query too.
26  return ql;
27  }
28  rl += n;
29  }
30  if (bam_cigar_type(cig) & 1) // Only query consuming
31  ql += n;
32  }
33  return ql;
34 }
35 
36 int32_t get_pos_on_reference(uint32_t *cigar, uint32_t ncigar, uint32_t pos, uint32_t ref_start){
37  int cig;
38  int32_t n;
39  uint32_t ql = 0, rl = ref_start;
40  for (uint32_t i = 0; i < ncigar; ++i){
41  cig = bam_cigar_op(cigar[i]);
42  n = bam_cigar_oplen(cigar[i]);
43  if (bam_cigar_type(cig) & 1) { // Only query consuming
44  if (pos <= ql + n) {
45  if (bam_cigar_type(cig) & 2) // Only reference consuming
46  rl += (pos - ql); // n consumed reference, check if it consumes query too.
47  return rl;
48  }
49  ql += n;
50  }
51  if (bam_cigar_type(cig) & 2) // Only reference consuming
52  rl += n;
53  }
54  return rl;
55 }
56 
57 void reverse_qual(uint8_t *q, int l){
58  for (int i = 0; i < l/2; ++i){
59  q[i]^=q[l-i-1];
60  q[l-i-1]^=q[i];
61  q[i]^=q[l-i-1];
62  }
63 }
64 
65 void reverse_cigar(uint32_t *cigar, int l){
66  for (int i = 0; i < l/2; ++i){
67  cigar[i]^=cigar[l-i-1];
68  cigar[l-i-1]^=cigar[i];
69  cigar[i]^=cigar[l-i-1];
70  }
71 }
72 
73 double mean_quality(uint8_t *a, int s, int e){
74  double m = 0;
75  for (int i = s; i < e; ++i){
76  m += (double)a[i];
77  }
78  m = m/(e-s);
79  return m;
80 }
81 
82 cigar_ quality_trim(bam1_t* r, uint8_t qual_threshold, uint8_t sliding_window){
83  uint32_t *ncigar = (uint32_t*) malloc(sizeof(uint32_t) * (r->core.n_cigar + 1)), // Maximum edit is one more element with soft mask
84  *cigar = bam_get_cigar(r);
85  uint8_t *qual = bam_get_qual(r);
86  int32_t start_pos;
87  if(bam_is_rev(r)){
88  reverse_qual(qual, r->core.l_qseq);
89  }
90  double m = 60;
91  int del_len, cig, temp;
92  uint32_t i = 0, j = 0;
93  while(m >= qual_threshold && (i < r->core.l_qseq)){
94  m = mean_quality(qual, i, i+sliding_window);
95  i++;
96  if(i > r->core.l_qseq - sliding_window)
97  sliding_window--;
98  }
99  del_len = r->core.l_qseq - i;
100  start_pos = get_pos_on_reference(cigar, r->core.n_cigar, del_len, r->core.pos); // For reverse reads need to set core->pos
101  if(bam_is_rev(r) && start_pos <= r->core.pos) {
102  return {
103  cigar,
104  r->core.n_cigar,
105  r->core.pos
106  };
107  }
108  int32_t n;
109  i = 0;
110  if(bam_is_rev(r)){
111  reverse_cigar(cigar, r->core.n_cigar);
112  }
113  reverse_cigar(cigar, r->core.n_cigar); // Reverse cigar and trim the beginning of read.
114  while(i < r->core.n_cigar){
115  if (del_len == 0){
116  ncigar[j] = cigar[i];
117  i++;
118  j++;
119  continue;
120  }
121  cig = bam_cigar_op(cigar[i]);
122  n = bam_cigar_oplen(cigar[i]);
123  if ((bam_cigar_type(cig) & 1)){ // Consumes Query
124  if(del_len >= n ){
125  ncigar[j] = bam_cigar_gen(n, BAM_CSOFT_CLIP);
126  } else if (del_len < n){
127  ncigar[j] = bam_cigar_gen(del_len, BAM_CSOFT_CLIP);
128  }
129  j++;
130  temp = n;
131  n = std::max(n - del_len, 0);
132  del_len = std::max(del_len - temp, 0);
133  if(n > 0){
134  ncigar[j] = bam_cigar_gen(n, cig);
135  j++;
136  }
137  }
138  i++;
139  }
140  reverse_cigar(ncigar, j); // Reverse Back
141  if(bam_is_rev(r)){
142  reverse_cigar(ncigar, j);
143  }
144  return {
145  ncigar,
146  j,
147  start_pos
148  };
149 }
150 
151 void print_cigar(uint32_t *cigar, int nlength){
152  for (int i = 0; i < nlength; ++i){
153  std::cout << ((cigar[i]) & BAM_CIGAR_MASK);
154  std::cout << "-" << ((cigar[i]) >> BAM_CIGAR_SHIFT) << " ";
155  }
156 }
157 
158 cigar_ primer_trim(bam1_t *r, int32_t new_pos){
159  uint32_t *ncigar = (uint32_t*) malloc(sizeof(uint32_t) * (r->core.n_cigar + 1)), // Maximum edit is one more element with soft mask
160  *cigar = bam_get_cigar(r);
161  uint32_t i = 0, j = 0;
162  int del_len, cig, temp;
163  if (bam_is_rev(r)){
164  del_len = bam_cigar2qlen(r->core.n_cigar, bam_get_cigar(r)) - get_pos_on_query(cigar, r->core.n_cigar, new_pos, r->core.pos) - 1;
165  // print_cigar(cigar, r->core.n_cigar);
166  reverse_cigar(cigar, r->core.n_cigar);
167  // print_cigar(cigar, r->core.n_cigar);
168  // std::cout << "Del Length " << del_len << std::endl;
169  } else {
170  del_len = get_pos_on_query(cigar, r->core.n_cigar, new_pos, r->core.pos);
171  }
172  int32_t n;
173  while(i < r->core.n_cigar){
174  if (del_len == 0){
175  ncigar[j] = cigar[i];
176  i++;
177  j++;
178  continue;
179  }
180  cig = bam_cigar_op(cigar[i]);
181  n = bam_cigar_oplen(cigar[i]);
182  if ((bam_cigar_type(cig) & 1)){ // Consumes Query
183  if(del_len >= n ){
184  ncigar[j] = bam_cigar_gen(n, BAM_CSOFT_CLIP);
185  } else if (del_len < n){
186  ncigar[j] = bam_cigar_gen(del_len, BAM_CSOFT_CLIP);
187  }
188  j++;
189  temp = n;
190  n = std::max(n - del_len, 0);
191  del_len = std::max(del_len - temp, 0);
192  if(n > 0){
193  ncigar[j] = bam_cigar_gen(n, cig);
194  j++;
195  }
196  }
197  i++;
198  }
199  if(bam_is_rev(r)){
200  reverse_cigar(ncigar, j);
201  }
202  cigar_ t = {
203  ncigar,
204  j
205  };
206  return t;
207 }
208 
209 void replace_cigar(bam1_t *b, int n, uint32_t *cigar){
210  if (n != b->core.n_cigar) {
211  int o = b->core.l_qname + b->core.n_cigar * 4;
212  if (b->l_data + (n - b->core.n_cigar) * 4 > b->m_data) {
213  b->m_data = b->l_data + (n - b->core.n_cigar) * 4;
214  kroundup32(b->m_data);
215  b->data = (uint8_t*)realloc(b->data, b->m_data);
216  }
217  memmove(b->data + b->core.l_qname + n * 4, b->data + o, b->l_data - o);
218  memcpy(b->data + b->core.l_qname, cigar, n * 4);
219  b->l_data += (n - b->core.n_cigar) * 4;
220  b->core.n_cigar = n;
221  } else memcpy(b->data + b->core.l_qname, cigar, n * 4);
222 }
223 
224 int16_t get_overlapping_primer_indice(bam1_t* r, std::vector<primer> primers){
225  uint32_t query_pos, start_pos, *cigar = bam_get_cigar(r);
226  if(bam_is_rev(r)){
227  start_pos = bam_endpos(r)-1;
228  query_pos = start_pos + (bam_cigar2qlen(r->core.n_cigar, cigar) - get_pos_on_query(cigar, r->core.n_cigar, start_pos, r->core.pos)) - 1;
229  } else {
230  start_pos = r->core.pos;
231  query_pos = start_pos - get_pos_on_query(cigar, r->core.n_cigar, start_pos, r->core.pos);
232  }
233  for(std::vector<int>::size_type i = 0; i!=primers.size();i++){
234  if(query_pos >= primers[i].get_start() && query_pos <= primers[i].get_end() && start_pos >= primers[i].get_start() && start_pos <= primers[i].get_end()) // Change int to int32_t in primer_bed.cpp
235  return i;
236  }
237  return -1;
238 }
239 
240 cigar_ remove_trailing_query_ref_consumption(uint32_t* cigar, uint32_t n){
241  int i = 0, len = 0, cig, start_pos = 0;
242  while(i < n){
243  cig = bam_cigar_op(cigar[i]);
244  len = bam_cigar_oplen(cigar[i]);
245  if((bam_cigar_type(cig) & 2) && (bam_cigar_type(cig) & 1))
246  break;
247  if(bam_cigar_type(cig) & 1){ // Consumes only query - insertion
248  cigar[i] = bam_cigar_gen(len, BAM_CSOFT_CLIP);
249  }
250  if(bam_cigar_type(cig) & 2){ // Consumes only reference - deletion
251  for (int j = i; j < n-1; ++j){
252  cigar[j] = cigar[j+1];
253  }
254  n--;
255  i--;
256  start_pos += len;
257  }
258  i++;
259  }
260  reverse_cigar(cigar, n);
261  i = 0, len = 0;
262  while(i < n){
263  cig = bam_cigar_op(cigar[i]);
264  len = bam_cigar_oplen(cigar[i]);
265  if((bam_cigar_type(cig) & 2) && (bam_cigar_type(cig) & 1))
266  break;
267  if(bam_cigar_type(cig) & 1){ // Consumes only query - insertion
268  cigar[i] = bam_cigar_gen(len, BAM_CSOFT_CLIP);
269  }
270  if(bam_cigar_type(cig) & 2){ // Consumes only reference - deletion
271  for (int j = i; j < n-1; ++j){
272  cigar[j] = cigar[j+1];
273  }
274  n--;
275  i--;
276  // start_pos += len;
277  }
278  i++;
279  }
280  reverse_cigar(cigar, n);
281  return {
282  cigar,
283  n,
284  start_pos
285  };
286 }
287 
288 cigar_ condense_cigar(uint32_t* cigar, uint32_t n){
289  int i = 0, len = 0, cig, next_cig, start_pos = 0;
290  cigar_ t = remove_trailing_query_ref_consumption(cigar, n);
291  cigar = t.cigar;
292  n = t.nlength;
293  start_pos = t.start_pos;
294  while(i< n -1){
295  cig = bam_cigar_op(cigar[i]);
296  next_cig = bam_cigar_op(cigar[i+1]);
297  if(cig == next_cig){
298  len = bam_cigar_oplen(cigar[i])+bam_cigar_oplen(cigar[i+1]);
299  cigar[i] = bam_cigar_gen(len, bam_cigar_op(cigar[i]));
300  for(int j = i+1; j < n - 1; j++){
301  cigar[j] = cigar[j+1];
302  }
303  n--;
304  } else {
305  i++;
306  }
307  }
308  return {
309  cigar,
310  n,
311  start_pos
312  };
313 }
314 
315 int trim_bam_qual_primer(std::string bam, std::string bed, std::string bam_out, std::string region_, uint8_t min_qual, uint8_t sliding_window, int min_length = 30){
316  std::vector<primer> primers = populate_from_file(bed);
317  if(bam.empty()){
318  std::cout << "Bam file in empty." << std::endl;
319  return -1;
320  }
321  bam_out += ".bam";
322  samFile *in = hts_open(bam.c_str(), "r");
323  BGZF *out = bgzf_open(bam_out.c_str(), "w");
324  if(in == NULL) {
325  std::cout << ("Unable to open BAM/SAM file.") << std::endl;
326  return -1;
327  }
328  //Load the index
329  hts_idx_t *idx = sam_index_load(in, bam.c_str());
330  if(idx == NULL) {
331  std::cout << ("Unable to open BAM/SAM index.") << std::endl; // TODO: Generate index
332  return -1;
333  }
334  //Get the header
335  bam_hdr_t *header = sam_hdr_read(in);
336  if(header == NULL) {
337  sam_close(in);
338  std::cout << "Unable to open BAM/SAM header." << std::endl;
339  }
340  if(bam_hdr_write(out, header) < 0){
341  std::cout << "Unable to write BAM header to path." << std::endl;
342  sam_close(in);
343  return -1;
344  }
345  if (region_.empty()){
346  std::cout << "Number of references: " << header->n_targets << std::endl;
347  for (int i = 0; i < header->n_targets; ++i){
348  std::cout << "Reference Name: " << header->target_name[i] << std::endl;
349  std::cout << "Reference Length: " << header->target_len[i] << std::endl;
350  if(i==0){
351  region_.assign(header->target_name[i]);
352  }
353  }
354  std::cout << "Using Region: " << region_ << std::endl;
355  }
356  std::string hdr_text(header->text);
357  if (hdr_text.find(std::string("SO:coordinate"))) {
358  std::cout << "Sorted By Coordinate" << std::endl; // Sort by coordinate
359  } if(hdr_text.find(std::string("SO:queryname"))) {
360  std::cout << "Sorted By Query Name" << std::endl; // Sort by name
361  } else {
362  std::cout << "Not sorted" << std::endl;
363  }
364  //Initialize iterator
365  hts_itr_t *iter = NULL;
366  //Move the iterator to the region we are interested in
367  iter = sam_itr_querys(idx, header, region_.c_str());
368  if(header == NULL || iter == NULL) {
369  sam_close(in);
370  std::cout << "Unable to iterate to region within BAM/SAM." << std::endl;
371  return -1;
372  }
373  //Initiate the alignment record
374  bam1_t *aln = bam_init1();
375  int ctr = 0;
376  cigar_ t;
377  int16_t *p = (int16_t*)malloc(sizeof(int16_t));
378  uint32_t primer_trim_count = 0;
379  while(sam_itr_next(in, iter, aln) >= 0) {
380  *p = get_overlapping_primer_indice(aln, primers);
381  if(*p != -1){
382  primer_trim_count++;
383  if(bam_is_rev(aln)){
384  t = primer_trim(aln, primers[*p].get_start() - 1);
385  } else {
386  t = primer_trim(aln, primers[*p].get_end() + 1);
387  aln->core.pos = primers[*p].get_end() + 1;
388  }
389  replace_cigar(aln, t.nlength, t.cigar);
390  }
391  t = quality_trim(aln, min_qual, sliding_window); // Quality Trimming
392  if(bam_is_rev(aln))
393  aln->core.pos = t.start_pos;
394  t = condense_cigar(t.cigar, t.nlength);
395  aln->core.pos += t.start_pos;
396  replace_cigar(aln, t.nlength, t.cigar);
397  if(bam_cigar2rlen(aln->core.n_cigar, bam_get_cigar(aln)) >= min_length){
398  if(*p != -1)
399  bam_aux_append(aln, "xa", 'i', 4, (uint8_t*) p);
400  if(bam_write1(out, aln) < 0){
401  std::cout << "Not able to write to BAM" << std::endl;
402  hts_itr_destroy(iter);
403  hts_idx_destroy(idx);
404  bam_destroy1(aln);
405  bam_hdr_destroy(header);
406  sam_close(in);
407  bgzf_close(out);
408  return -1;
409  };
410  }
411  ctr++;
412  if(ctr % 1000000 == 0){
413  std::cout << "Processed " << ctr << "reads ... " << std::endl;
414  }
415  }
416  std::cout << "Results: " << std::endl;
417  std::cout << "Trimmed primers from " << primer_trim_count << " reads." << std::endl;
418  hts_itr_destroy(iter);
419  hts_idx_destroy(idx);
420  bam_destroy1(aln);
421  bam_hdr_destroy(header);
422  sam_close(in);
423  bgzf_close(out);
424  return 0;
425 }
426 
427 // int main_old(int argc, char* argv[]) {
428 // std::cout << "Path " << argv[1] <<std::endl;
429 // std::string bam = std::string(argv[1]);
430 // std::string region_;
431 // std::string bed = std::string(argv[2]);
432 // std::string bam_out = std::string(argv[3]);
433 // std::vector<primer> primers = populate_from_file(bed);
434 // if(argc > 4) {
435 // region_ = std::string(argv[4]);
436 // }
437 // if(!bam.empty()) {
438 // //open BAM for reading
439 // samFile *in = hts_open(bam.c_str(), "r");
440 // BGZF *out = bgzf_open(bam_out.c_str(), "w");
441 // if(in == NULL) {
442 // throw std::runtime_error("Unable to open BAM/SAM file.");
443 // }
444 // //Load the index
445 // hts_idx_t *idx = sam_index_load(in, bam.c_str());
446 // if(idx == NULL) {
447 // throw std::runtime_error("Unable to open BAM/SAM index."); // Generate index
448 // }
449 // //Get the header
450 // bam_hdr_t *header = sam_hdr_read(in);
451 // bam_hdr_write(out, header);
452 // if(header == NULL) {
453 // sam_close(in);
454 // throw std::runtime_error("Unable to open BAM header.");
455 // }
456 // if (region_.empty()){
457 // std::cout << "Number of references: " << header->n_targets << std::endl;
458 // for (int i = 0; i < header->n_targets; ++i){
459 // std::cout << "Reference Name: " << header->target_name[i] << std::endl;
460 // std::cout << "Reference Length: " << header->target_len[i] << std::endl;
461 // if(i==0){
462 // region_.assign(header->target_name[i]);
463 // }
464 // }
465 // std::cout << "Using Region: " << region_ << std::endl;
466 // }
467 // std::string hdr_text(header->text);
468 // if (hdr_text.find(std::string("SO:coordinate"))) {
469 // std::cout << "Sorted By Coordinate" << std::endl; // Sort by coordinate
470 // } if(hdr_text.find(std::string("SO:queryname"))) {
471 // std::cout << "Sorted By Query Name" << std::endl; // Sort by name
472 // } else {
473 // std::cout << "Not sorted" << std::endl;
474 // }
475 // //Initialize iterator
476 // hts_itr_t *iter = NULL;
477 // //Move the iterator to the region we are interested in
478 // iter = sam_itr_querys(idx, header, region_.c_str());
479 // if(header == NULL || iter == NULL) {
480 // sam_close(in);
481 // throw std::runtime_error("Unable to iterate to region within BAM.");
482 // }
483 // //Initiate the alignment record
484 // bam1_t *aln = bam_init1();
485 // int ctr = 0;
486 // cigar_ t;
487 // int16_t *p = (int16_t*)malloc(sizeof(int16_t));
488 // while(sam_itr_next(in, iter, aln) >= 0) {
489 // // std::cout << "Query Length: " << bam_cigar2qlen(aln->core.n_cigar, cigar) << std::endl;
490 // // std::cout << "Read Length: " << bam_cigar2rlen(aln->core.n_cigar, cigar) << std::endl;
491 // // std::cout << "Query Start: " << get_query_start(aln) << std::endl;
492 // // std::cout << "Query End: " << get_query_end(aln) << std::endl;
493 // *p = get_overlapping_primer_indice(aln, primers);
494 // if(*p == -1)
495 // continue;
496 // if(bam_is_rev(aln)){
497 // t = primer_trim(aln, primers[*p].get_start() - 1);
498 // } else {
499 // t = primer_trim(aln, primers[*p].get_end() + 1);
500 // aln->core.pos = primers[*p].get_end() + 1;
501 // }
502 // // std::cout << *p << std::endl;
503 // replace_cigar(aln, t.nlength, t.cigar);
504 // // Quality Trimming
505 // t = quality_trim(aln);
506 // if(bam_is_rev(aln))
507 // aln->core.pos = t.start_pos;
508 // // std::cout << "Old: " << t.nlength << std::endl;
509 // // print_cigar(t.cigar, t.nlength);
510 // // std::cout << std::endl;
511 // t = condense_cigar(t.cigar, t.nlength);
512 // aln->core.pos += t.start_pos;
513 // // std::cout << "New: " << t.nlength << std::endl;
514 // // print_cigar(t.cigar, t.nlength);
515 // // std::cout << std::endl;
516 // replace_cigar(aln, t.nlength, t.cigar);
517 // // std::cout << "Name: " << name << std::endl;
518 // // std::cout << "Seq: " << seq << "\tQual: " << qual;
519 // // std::cout << std::endl;
520 // // if (strcmp(bam_get_qname(aln), "M01244:143:000000000-BHWC7:1:1104:13925:8758") == 0){
521 // // std::cout << "Start Pos: " << aln->core.pos << std::endl;
522 // // uint8_t *q = bam_get_qual(aln);
523 // // for (int i = 0; i < aln->core.l_qseq -4; ++i){
524 // // std::cout << mean_quality(q, i, i+4) << " ";
525 // // }
526 // // std::cout << std::endl;
527 // // }
528 // int min_length = 30;
529 // if(bam_cigar2rlen(aln->core.n_cigar, bam_get_cigar(aln)) >= min_length){
530 // // bam1_t *b, const char tag[2], char type, int len, const uint8_t *data
531 // bam_aux_append(aln, "xa", 'i', 4, (uint8_t*) p);
532 // bam_write1(out, aln);
533 // }
534 // ctr++;
535 // if(ctr % 100000 == 0){
536 // std::cout << ctr << std::endl;
537 // }
538 // }
539 // hts_itr_destroy(iter);
540 // hts_idx_destroy(idx);
541 // bam_destroy1(aln);
542 // bam_hdr_destroy(header);
543 // sam_close(in);
544 // bgzf_close(out);
545 // }
546 // return 0;
547 // }