1 #include "htslib/sam.h" 2 #include "htslib/bgzf.h" 12 #include "trim_primer_quality.h" 13 #include "primer_bed.h" 15 int32_t get_pos_on_query(uint32_t *cigar, uint32_t ncigar, int32_t pos, int32_t ref_start){
18 int32_t ql = 0, rl = ref_start;
19 for (uint32_t i = 0; i < ncigar; ++i){
20 cig = bam_cigar_op(cigar[i]);
21 n = bam_cigar_oplen(cigar[i]);
22 if (bam_cigar_type(cig) & 2) {
24 if (bam_cigar_type(cig) & 1)
30 if (bam_cigar_type(cig) & 1)
36 int32_t get_pos_on_reference(uint32_t *cigar, uint32_t ncigar, uint32_t pos, uint32_t ref_start){
39 uint32_t ql = 0, rl = ref_start;
40 for (uint32_t i = 0; i < ncigar; ++i){
41 cig = bam_cigar_op(cigar[i]);
42 n = bam_cigar_oplen(cigar[i]);
43 if (bam_cigar_type(cig) & 1) {
45 if (bam_cigar_type(cig) & 2)
51 if (bam_cigar_type(cig) & 2)
57 void reverse_qual(uint8_t *q,
int l){
58 for (
int i = 0; i < l/2; ++i){
65 void reverse_cigar(uint32_t *cigar,
int l){
66 for (
int i = 0; i < l/2; ++i){
67 cigar[i]^=cigar[l-i-1];
68 cigar[l-i-1]^=cigar[i];
69 cigar[i]^=cigar[l-i-1];
73 double mean_quality(uint8_t *a,
int s,
int e){
75 for (
int i = s; i < e; ++i){
82 cigar_ quality_trim(bam1_t* r, uint8_t qual_threshold, uint8_t sliding_window){
83 uint32_t *ncigar = (uint32_t*) malloc(
sizeof(uint32_t) * (r->core.n_cigar + 1)),
84 *cigar = bam_get_cigar(r);
85 uint8_t *qual = bam_get_qual(r);
88 reverse_qual(qual, r->core.l_qseq);
91 int del_len, cig, temp;
92 uint32_t i = 0, j = 0;
93 while(m >= qual_threshold && (i < r->core.l_qseq)){
94 m = mean_quality(qual, i, i+sliding_window);
96 if(i > r->core.l_qseq - sliding_window)
99 del_len = r->core.l_qseq - i;
100 start_pos = get_pos_on_reference(cigar, r->core.n_cigar, del_len, r->core.pos);
101 if(bam_is_rev(r) && start_pos <= r->core.pos) {
111 reverse_cigar(cigar, r->core.n_cigar);
113 reverse_cigar(cigar, r->core.n_cigar);
114 while(i < r->core.n_cigar){
116 ncigar[j] = cigar[i];
121 cig = bam_cigar_op(cigar[i]);
122 n = bam_cigar_oplen(cigar[i]);
123 if ((bam_cigar_type(cig) & 1)){
125 ncigar[j] = bam_cigar_gen(n, BAM_CSOFT_CLIP);
126 }
else if (del_len < n){
127 ncigar[j] = bam_cigar_gen(del_len, BAM_CSOFT_CLIP);
131 n = std::max(n - del_len, 0);
132 del_len = std::max(del_len - temp, 0);
134 ncigar[j] = bam_cigar_gen(n, cig);
140 reverse_cigar(ncigar, j);
142 reverse_cigar(ncigar, j);
151 void print_cigar(uint32_t *cigar,
int nlength){
152 for (
int i = 0; i < nlength; ++i){
153 std::cout << ((cigar[i]) & BAM_CIGAR_MASK);
154 std::cout <<
"-" << ((cigar[i]) >> BAM_CIGAR_SHIFT) <<
" ";
158 cigar_ primer_trim(bam1_t *r, int32_t new_pos){
159 uint32_t *ncigar = (uint32_t*) malloc(
sizeof(uint32_t) * (r->core.n_cigar + 1)),
160 *cigar = bam_get_cigar(r);
161 uint32_t i = 0, j = 0;
162 int del_len, cig, temp;
164 del_len = bam_cigar2qlen(r->core.n_cigar, bam_get_cigar(r)) - get_pos_on_query(cigar, r->core.n_cigar, new_pos, r->core.pos) - 1;
166 reverse_cigar(cigar, r->core.n_cigar);
170 del_len = get_pos_on_query(cigar, r->core.n_cigar, new_pos, r->core.pos);
173 while(i < r->core.n_cigar){
175 ncigar[j] = cigar[i];
180 cig = bam_cigar_op(cigar[i]);
181 n = bam_cigar_oplen(cigar[i]);
182 if ((bam_cigar_type(cig) & 1)){
184 ncigar[j] = bam_cigar_gen(n, BAM_CSOFT_CLIP);
185 }
else if (del_len < n){
186 ncigar[j] = bam_cigar_gen(del_len, BAM_CSOFT_CLIP);
190 n = std::max(n - del_len, 0);
191 del_len = std::max(del_len - temp, 0);
193 ncigar[j] = bam_cigar_gen(n, cig);
200 reverse_cigar(ncigar, j);
209 void replace_cigar(bam1_t *b,
int n, uint32_t *cigar){
210 if (n != b->core.n_cigar) {
211 int o = b->core.l_qname + b->core.n_cigar * 4;
212 if (b->l_data + (n - b->core.n_cigar) * 4 > b->m_data) {
213 b->m_data = b->l_data + (n - b->core.n_cigar) * 4;
214 kroundup32(b->m_data);
215 b->data = (uint8_t*)realloc(b->data, b->m_data);
217 memmove(b->data + b->core.l_qname + n * 4, b->data + o, b->l_data - o);
218 memcpy(b->data + b->core.l_qname, cigar, n * 4);
219 b->l_data += (n - b->core.n_cigar) * 4;
221 }
else memcpy(b->data + b->core.l_qname, cigar, n * 4);
224 int16_t get_overlapping_primer_indice(bam1_t* r, std::vector<primer> primers){
225 uint32_t query_pos, start_pos, *cigar = bam_get_cigar(r);
227 start_pos = bam_endpos(r)-1;
228 query_pos = start_pos + (bam_cigar2qlen(r->core.n_cigar, cigar) - get_pos_on_query(cigar, r->core.n_cigar, start_pos, r->core.pos)) - 1;
230 start_pos = r->core.pos;
231 query_pos = start_pos - get_pos_on_query(cigar, r->core.n_cigar, start_pos, r->core.pos);
233 for(std::vector<int>::size_type i = 0; i!=primers.size();i++){
234 if(query_pos >= primers[i].get_start() && query_pos <= primers[i].get_end() && start_pos >= primers[i].get_start() && start_pos <= primers[i].get_end())
240 cigar_ remove_trailing_query_ref_consumption(uint32_t* cigar, uint32_t n){
241 int i = 0, len = 0, cig, start_pos = 0;
243 cig = bam_cigar_op(cigar[i]);
244 len = bam_cigar_oplen(cigar[i]);
245 if((bam_cigar_type(cig) & 2) && (bam_cigar_type(cig) & 1))
247 if(bam_cigar_type(cig) & 1){
248 cigar[i] = bam_cigar_gen(len, BAM_CSOFT_CLIP);
250 if(bam_cigar_type(cig) & 2){
251 for (
int j = i; j < n-1; ++j){
252 cigar[j] = cigar[j+1];
260 reverse_cigar(cigar, n);
263 cig = bam_cigar_op(cigar[i]);
264 len = bam_cigar_oplen(cigar[i]);
265 if((bam_cigar_type(cig) & 2) && (bam_cigar_type(cig) & 1))
267 if(bam_cigar_type(cig) & 1){
268 cigar[i] = bam_cigar_gen(len, BAM_CSOFT_CLIP);
270 if(bam_cigar_type(cig) & 2){
271 for (
int j = i; j < n-1; ++j){
272 cigar[j] = cigar[j+1];
280 reverse_cigar(cigar, n);
288 cigar_ condense_cigar(uint32_t* cigar, uint32_t n){
289 int i = 0, len = 0, cig, next_cig, start_pos = 0;
290 cigar_ t = remove_trailing_query_ref_consumption(cigar, n);
293 start_pos = t.start_pos;
295 cig = bam_cigar_op(cigar[i]);
296 next_cig = bam_cigar_op(cigar[i+1]);
298 len = bam_cigar_oplen(cigar[i])+bam_cigar_oplen(cigar[i+1]);
299 cigar[i] = bam_cigar_gen(len, bam_cigar_op(cigar[i]));
300 for(
int j = i+1; j < n - 1; j++){
301 cigar[j] = cigar[j+1];
315 int trim_bam_qual_primer(std::string bam, std::string bed, std::string bam_out, std::string region_, uint8_t min_qual, uint8_t sliding_window,
int min_length = 30){
316 std::vector<primer> primers = populate_from_file(bed);
318 std::cout <<
"Bam file in empty." << std::endl;
322 samFile *in = hts_open(bam.c_str(),
"r");
323 BGZF *out = bgzf_open(bam_out.c_str(),
"w");
325 std::cout << (
"Unable to open BAM/SAM file.") << std::endl;
329 hts_idx_t *idx = sam_index_load(in, bam.c_str());
331 std::cout << (
"Unable to open BAM/SAM index.") << std::endl;
335 bam_hdr_t *header = sam_hdr_read(in);
338 std::cout <<
"Unable to open BAM/SAM header." << std::endl;
340 if(bam_hdr_write(out, header) < 0){
341 std::cout <<
"Unable to write BAM header to path." << std::endl;
345 if (region_.empty()){
346 std::cout <<
"Number of references: " << header->n_targets << std::endl;
347 for (
int i = 0; i < header->n_targets; ++i){
348 std::cout <<
"Reference Name: " << header->target_name[i] << std::endl;
349 std::cout <<
"Reference Length: " << header->target_len[i] << std::endl;
351 region_.assign(header->target_name[i]);
354 std::cout <<
"Using Region: " << region_ << std::endl;
356 std::string hdr_text(header->text);
357 if (hdr_text.find(std::string(
"SO:coordinate"))) {
358 std::cout <<
"Sorted By Coordinate" << std::endl;
359 }
if(hdr_text.find(std::string(
"SO:queryname"))) {
360 std::cout <<
"Sorted By Query Name" << std::endl;
362 std::cout <<
"Not sorted" << std::endl;
365 hts_itr_t *iter = NULL;
367 iter = sam_itr_querys(idx, header, region_.c_str());
368 if(header == NULL || iter == NULL) {
370 std::cout <<
"Unable to iterate to region within BAM/SAM." << std::endl;
374 bam1_t *aln = bam_init1();
377 int16_t *p = (int16_t*)malloc(
sizeof(int16_t));
378 uint32_t primer_trim_count = 0;
379 while(sam_itr_next(in, iter, aln) >= 0) {
380 *p = get_overlapping_primer_indice(aln, primers);
384 t = primer_trim(aln, primers[*p].get_start() - 1);
386 t = primer_trim(aln, primers[*p].get_end() + 1);
387 aln->core.pos = primers[*p].get_end() + 1;
389 replace_cigar(aln, t.nlength, t.cigar);
391 t = quality_trim(aln, min_qual, sliding_window);
393 aln->core.pos = t.start_pos;
394 t = condense_cigar(t.cigar, t.nlength);
395 aln->core.pos += t.start_pos;
396 replace_cigar(aln, t.nlength, t.cigar);
397 if(bam_cigar2rlen(aln->core.n_cigar, bam_get_cigar(aln)) >= min_length){
399 bam_aux_append(aln,
"xa",
'i', 4, (uint8_t*) p);
400 if(bam_write1(out, aln) < 0){
401 std::cout <<
"Not able to write to BAM" << std::endl;
402 hts_itr_destroy(iter);
403 hts_idx_destroy(idx);
405 bam_hdr_destroy(header);
412 if(ctr % 1000000 == 0){
413 std::cout <<
"Processed " << ctr <<
"reads ... " << std::endl;
416 std::cout <<
"Results: " << std::endl;
417 std::cout <<
"Trimmed primers from " << primer_trim_count <<
" reads." << std::endl;
418 hts_itr_destroy(iter);
419 hts_idx_destroy(idx);
421 bam_hdr_destroy(header);