File: parseAlignment.cpp

package info (click to toggle)
bitseq 0.7.5%2Bdfsg-5
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 676 kB
  • sloc: cpp: 7,043; python: 562; makefile: 150; sh: 52
file content (612 lines) | stat: -rw-r--r-- 27,582 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
// DECLARATIONS: {{{
#include<cmath>
#include<set>

using namespace std;

#include "ArgumentParser.h"
#include "misc.h"
#include "MyTimer.h"
#include "ReadDistribution.h"
#include "TranscriptExpression.h"
#include "TranscriptInfo.h"
#include "TranscriptSequence.h"

#include "common.h"
//}}}

//#define DEBUG_AT(x) message(x) 
#define DEBUG_AT(x)

namespace ns_parseAlignment {
class TagAlignment{//{{{
   protected:
      int_least32_t trId;
//      bool strand; // true = forward; false = reverse
      double prob,lowProb;
   public:
      TagAlignment(long t=0,double p = 0,double lp = 0){
         trId=(int_least32_t)t;
//         strand=s;
         prob=p;
         lowProb=lp;
      }
      long getTrId()const {return trId;}
      double getProb()const {return prob;}
      double getLowProb()const {return lowProb;}
      void setProb(double p){prob=p;}
}; //}}}

// Check if next fragment is different.
bool nextFragDiffers(const ns_rD::fragmentP curF, const ns_rD::fragmentP nextF, bool mateNamesDiffer);
// String comparison allowing last cmpEPS bases different as long as length
// is the same.
long readNameCmp(const char *str1, const char *str2);
// Read Fragment from SAM file.
// Copies data from 'next' fragment into 'cur' fragment and reads new fragment information into 'next'.
// Fragment is either both paired-ends or just single read.
bool readNextFragment(samfile_t* samData, ns_rD::fragmentP &cur, ns_rD::fragmentP &next);

// Determine input format base either on --format flag or on the file extension.
// Sets format to bam/sam and returns true, or returns false if format is unknown.
bool setInputFormat(const ArgumentParser &args, string *format);

bool openSamFile(const string &name, const string &inFormat, samfile_t **samFile);

bool initializeInfoFile(const ArgumentParser &args, samfile_t *samFile, TranscriptInfo **trInfo, long *M);
} // namespace ns_parseAlignment

extern "C" int parseAlignment(int *argc,char* argv[]){
string programDescription =
"Pre-computes probabilities of (observed) reads' alignments.\n\
   [alignment file] should be in either SAM or BAM format.\n";
   TranscriptInfo *trInfo=NULL;
   TranscriptSequence *trSeq=NULL;
   TranscriptExpression *trExp=NULL;
   MyTimer timer;
   timer.start();
   timer.start(7);
   long Ntotal = 0, Nmap = 0, M=0, i;
   string inFormat;
   samfile_t *samData=NULL;
   ReadDistribution readD;
   ns_rD::fragmentP curF = new ns_rD::fragmentT, nextF = new ns_rD::fragmentT, validAF = new ns_rD::fragmentT;
   // This could be changed to either GNU's hash_set or C++11's unsorted_set, once it's safe.
   set<string> ignoredReads;
   long ignoredMaxAlignments = 0, ignoredSingletons = 0;
   // Intro: {{{
   // Set options {{{
   ArgumentParser args(programDescription,"[alignment file]",1);
   args.addOptionS("o","outFile","outFileName",1,"Name of the output file.");
   args.addOptionS("f","format","format",0,"Input format: either SAM, BAM.");
   args.addOptionS("t","trInfoFile","trInfoFileName",0,"File to save transcript information extracted from [BS]AM file and reference.");
   //args.addOptionS("t","trInfoFile","trInfoFileName",0,"If transcript(reference sequence) information is contained within SAM file, program will write this information into <trInfoFile>, otherwise it will look for this information in <trInfoFile>.");
   args.addOptionS("s","trSeqFile","trSeqFileName",1,"Transcript sequence in FASTA format --- for non-uniform read distribution estimation.");
   args.addOptionS("","trSeqHeader","trSeqHeader",0,"Transcript sequence header format enables gene name extraction (standard/gencode).","standard");
   args.addOptionS("e","expressionFile","expFileName",0,"Transcript relative expression estimates --- for better non-uniform read distribution estimation.");
   args.addOptionL("N","readsN","readsN",0,"Total number of reads. This is not necessary if [SB]AM contains also reads with no valid alignments.");
   args.addOptionS("","failed","failed",0,"File name where to save names of reads that failed to align.");
   args.addOptionB("","uniform","uniform",0,"Use uniform read distribution.");
   args.addOptionD("","lenMu","lenMu",0,"Set mean of log fragment length distribution. (l_frag ~ LogNormal(mu,sigma^2))");
   args.addOptionD("","lenSigma","lenSigma",0,"Set sigma^2 (or variance) of log fragment length distribution. (l_frag ~ LogNormal(mu,sigma^2))");
   args.addOptionS("","distributionFile","distributionFileName",0,"Name of file to which read-distribution should be saved.");
   args.addOptionL("P","procN","procN",0,"Maximum number of threads to be used. This provides speedup mostly when using non-uniform read distribution model (i.e. no --uniform flag).",4);
   args.addOptionB("V","veryVerbose","veryVerbose",0,"Very verbose output.");
   args.addOptionL("","noiseMismatches","numNoiseMismatches",0,"Number of mismatches to be considered as noise.",ns_rD::LOW_PROB_MISSES);
   args.addOptionL("l","limitA","maxAlignments",0,"Limit maximum number of alignments per read. (Reads with more alignments are skipped.)");
   args.addOptionB("","unstranded","unstranded",0,"Paired read are not strand specific.");
   args.addOptionB("","show1warning","show1warning",0,"Show first alignments that are considered wrong (TID unknown, TID mismatch, wrong strand).");
   args.addOptionB("","excludeSingletons","excludeSingletons",0,"Exclude single mate alignments for paired-end reads.");
   args.addOptionB("","mateNamesDiffer","mateNamesDiffer",0,"Mates from paired-end reads have different names.");
   if(!args.parse(*argc,argv))return 0;
   if(args.verbose)buildTime(argv[0],__DATE__,__TIME__);
   readD.setProcN(args.getL("procN"));
   if(args.flag("show1warning"))readD.showFirstWarnings();
   // }}}
   if(!ns_parseAlignment::setInputFormat(args, &inFormat))return 1;
   if(!ns_parseAlignment::openSamFile(args.args()[0], inFormat, &samData))return 1;
   if(!ns_parseAlignment::initializeInfoFile(args, samData, &trInfo, &M))return 1;
   // Read expression and initialize transcript sequence {{{
   if(args.verbose)message("Initializing fasta sequence reader.\n");
   // Initialize fasta sequence reader.
   trSeq = new TranscriptSequence();
   if(args.getLowerS("trSeqHeader") == "gencode"){
      trSeq->readSequence(args.getS("trSeqFileName"), GENCODE); 
   }else{
      trSeq->readSequence(args.getS("trSeqFileName"), STANDARD); 
   }
   // Check numbers for transcripts match.
   if(trSeq->getM() != M){
      error("Main: Number of transcripts in the alignment file and the sequence file are different: %ld vs %ld\n",M,trSeq->getM());
      return 1;
   }
   // Check that length of each transcript matches.
   for(i=0;i<M;i++){
      if(trInfo->L(i) != (long)(trSeq->getTr(i).size())){
         error("Main: Transcript info length and sequence length of transcript %ld DO NOT MATCH! (%ld %d)\n",i,trInfo->L(i),(int)(trSeq->getTr(i).size()));
         return 1;
      }
   }
   // If there were gene names in transcript sequence, assign them to transcript info.
   if(trSeq->hasGeneNames() && (trSeq->getG()>1)){
      if(trInfo->getG() == 1){
         // If just one gene present, then assign gene names.
         if(args.verbose)message("Found gene names in sequence file, updating transcript information.\n");
         trInfo->updateGeneNames(trSeq->getGeneNames());
      }else{
         // If there is more than one gene name already, don't fix.
         if(trInfo->getG() != trSeq->getG()){
            warning("Main: Different number of genes detected in transcript information and sequence file (%ld %ld).\n   You might want to check your data.\n", trInfo->getG(), trSeq->getG());
         }
      }
   }
   // If format is GENCODE and transcript names were extracted, update.
   if((args.getLowerS("trSeqHeader") == "gencode")&&(trSeq->hasTrNames())){
      if(args.flag("veryVerbose"))message("Updating transcript names.\n");
      if(!trInfo->updateTrNames(trSeq->getTrNames())){
         if(args.flag("veryVerbose"))warning("Transcript names update failed.\n");
      }
   }
   if(!args.flag("uniform")){
      // Try loading expression file from previous estimation for non-uniform read model.
      if(args.isSet("expFileName")){
         if(args.verbose)message("Loading transcript initial expression data.\n");
         trExp = new TranscriptExpression(args.getS("expFileName"), GUESS);
         if(trExp->getM() != M){
            error("Main: Number of transcripts in the alignment file and the expression file are different: %ld vs %ld\n",M,trExp->getM());
            return 1;
         }
      }
   }
   //}}}
   timer.split(0,'m');
   //}}}

   // Estimating probabilities {{{
   bool analyzeReads = false;

   if(args.isSet("lenMu") && args.isSet("lenSigma")){
      readD.setLength(args.getD("lenMu"),args.getD("lenSigma"));
   }else{
      analyzeReads = true;
   }
   if(args.flag("uniform")){
      if(args.verbose)message("Using uniform read distribution.\n");
      readD.initUniform(M,trInfo,trSeq,args.flag("veryVerbose"));
   }else{
      if(args.verbose)message("Estimating non-uniform read distribution.\n");
      readD.init(M,trInfo,trSeq,trExp,args.flag("unstranded"),args.flag("veryVerbose"));
      if(args.flag("veryVerbose"))message(" ReadDistribution initialization done.\n");
      analyzeReads = true;
   }
   if(args.isSet("numNoiseMismatches")){
      readD.setLowProbMismatches(args.getL("numNoiseMismatches"));
   }
   // fill in "next" fragment:
   // Counters for all, Good Alignments; and weird alignments
   long observeN, pairedGA, firstGA, secondGA, singleGA, weirdGA, allGA, pairedBad;
   bool storedValidA = false;
   long RE_noEndInfo, RE_weirdPairdInfo, RE_nameMismatch;
   long maxAlignments = 0;
   if(args.isSet("maxAlignments") && (args.getL("maxAlignments")>0))
      maxAlignments = args.getL("maxAlignments");
   // start counting (and possibly estimating):
   observeN = pairedGA = firstGA = secondGA = singleGA = weirdGA = pairedBad = 0;
   RE_noEndInfo = RE_weirdPairdInfo = RE_nameMismatch = 0;
   ns_parseAlignment::readNextFragment(samData, curF, nextF);
   while(ns_parseAlignment::readNextFragment(samData,curF,nextF)){
      R_INTERUPT;
      if( !(curF->first->core.flag & BAM_FUNMAP) ){
         // (at least) The first read was mapped.
         if( curF->paired ) {
            // Fragment's both reads are mapped as a pair.
            // Check mates' names.
            if((ns_parseAlignment::readNameCmp(bam1_qname(curF->first), bam1_qname(curF->second))==0) || 
               (args.flag("mateNamesDiffer"))){
               pairedGA++;
            }else{
               pairedBad++;
               if(RE_nameMismatch == 0){
                  warning("Paired read name mismatch: %s %s\n",bam1_qname(curF->first), bam1_qname(curF->second));
               }
               RE_nameMismatch++;
               if(RE_nameMismatch>10)break;
            }
         }else {
            if (curF->first->core.flag & BAM_FPAIRED) {
               // Read was part of pair (meaning that the other is unmapped).
               if (curF->first->core.flag & BAM_FREAD1) {
                  firstGA++;
               } else if (curF->first->core.flag & BAM_FREAD2) {
                  secondGA++;
               } else weirdGA ++;
            } else {
               // Read is single end, with valid alignment.
               singleGA++;
            }
         }
         // Unless pairedBad>0 the alignment is valid.
         // If excludeSingletons is set, only use paired alignment and alignments of single-end reads.
         if((!storedValidA) && 
            (((!args.flag("excludeSingletons")) && (pairedBad == 0)) ||
             (pairedBad + firstGA + secondGA + weirdGA == 0))){
            validAF->copyFragment(curF);
            storedValidA=true;
         }
      }
      // Next fragment is different.
      if(ns_parseAlignment::nextFragDiffers(curF, nextF, args.flag("mateNamesDiffer"))){
         Ntotal++;
         allGA = singleGA + pairedGA + firstGA +secondGA+ weirdGA;
         if( allGA == 0 ){ // No good alignment.
            // Just in case:
            storedValidA=false;
            pairedBad = 0;
            continue;
         }
         Nmap ++;
         if(weirdGA)RE_noEndInfo++;
         if((singleGA>0) && (pairedGA>0)) RE_weirdPairdInfo++;
         // If it's good uniquely aligned fragment/read, add it to the observation.
         if(( allGA == 1) && analyzeReads && (pairedBad == 0) && storedValidA){
            if(readD.observed(validAF))observeN++;
         }else if(maxAlignments && (allGA>maxAlignments)) {
            // This read will be ignored.
            ignoredReads.insert(bam1_qname(curF->first));
            ignoredMaxAlignments++;
            Nmap --;
         }else if(args.flag("excludeSingletons") && (pairedGA + singleGA == 0)){
            // When excluding singletons only alignments of full pair or single-end read count.
            ignoredReads.insert(bam1_qname(curF->first));
            ignoredSingletons++;
            Nmap --;
         }
         pairedGA = firstGA = secondGA = singleGA = weirdGA = pairedBad = 0;
         storedValidA = false;
      }
   }
   if(RE_nameMismatch>10){
      error("Names of paired mates didn't match at least 10 times.\n"
            "   Something is possibly wrong with your data or the reads have to be renamed.\n");
      return 1;
   }
   message("Reads: all(Ntotal): %ld  mapped(Nmap): %ld\n",Ntotal,Nmap);
   if(args.verbose)message("  %ld reads were used to estimate empirical distributions.\n",observeN);
   if(ignoredMaxAlignments>0)message("  %ld reads are skipped due to having more than %ld alignments.\n",ignoredMaxAlignments, maxAlignments);
   if(ignoredSingletons>0)message("  %ld reads skipped due to having just single mate alignments.\n",ignoredSingletons);
   if(RE_noEndInfo)warning("  %ld reads that were paired, but do not have \"end\" information.\n  (is your alignment file valid?)", RE_noEndInfo);
   if(RE_weirdPairdInfo)warning("  %ld reads that were reported as both paired and single end.\n  (is your alignment file valid?)", RE_weirdPairdInfo);
   readD.writeWarnings();
   if(args.flag("veryVerbose"))timer.split(0,'m');
   // Normalize read distribution:
   if(args.flag("veryVerbose"))message("Normalizing read distribution.\n");
   readD.normalize();
   if(args.isSet("distributionFileName")){
      readD.logProfiles(args.getS("distributionFileName"));
   }
   timer.split(0,'m');
   // }}}

   // Writing probabilities: {{{
   // Re-opening alignment file 
   if(!ns_parseAlignment::openSamFile(args.args()[0], inFormat, &samData))return 1;
   if(args.verbose)message("Writing alignment probabilities.\n");
   double prob,probNoise,minProb;
   prob = probNoise = 0;
   set<string> failedReads;
   vector<ns_parseAlignment::TagAlignment> alignments;
   // Open and initialize output file {{{
   ofstream outF(args.getS("outFileName").c_str());
   if(!outF.is_open()){
      error("Main: Unable to open output file.\n");
      return 1;
   }
   outF<<"# Ntotal "<<Ntotal<<"\n# Nmap "<<Nmap<<"\n# M "<<M<<endl;
   outF<<"# LOGFORMAT (probabilities saved on log scale.)\n# r_name num_alignments (tr_id prob )^*{num_alignments}"<<endl;
   outF.precision(9);
   outF<<scientific;
   // }}}
   
   // start reading:
   timer.start(1);
   bool invalidAlignment = false;
   long readC, pairedN, singleN, firstN, secondN, weirdN, invalidN, noN;
   readC = pairedN = singleN = firstN = secondN = weirdN = invalidN = noN = 0;
   RE_nameMismatch = 0 ;
   // fill in "next" fragment:
   ns_parseAlignment::readNextFragment(samData, curF, nextF);
   while(ns_parseAlignment::readNextFragment(samData,curF,nextF)){
      R_INTERUPT;
      // Skip all alignments of this read.
      if(ignoredReads.count(bam1_qname(curF->first))>0){
         DEBUG_AT(" ignore\n");
         // Read reads while the name is the same.
         while(ns_parseAlignment::readNextFragment(samData,curF,nextF)){
            DEBUG_AT(" ignore\n");
            if(ns_parseAlignment::nextFragDiffers(curF, nextF, args.flag("mateNamesDiffer")))
               break;
         }
         readC++;
         if(args.verbose){ if(progressLog(readC,Ntotal,10,' '))timer.split(1,'m');}
         continue;
      }
      if( !(curF->first->core.flag & BAM_FUNMAP) ){
         DEBUG_AT("M");
         // (at least) The first read was mapped.
         // Check mates' names.
         if(curF->paired && (ns_parseAlignment::readNameCmp(bam1_qname(curF->first), bam1_qname(curF->second))!=0) && (!args.flag("mateNamesDiffer"))){
            if(RE_nameMismatch == 0){
               warning("Paired read name mismatch: %s %s\n",bam1_qname(curF->first), bam1_qname(curF->second));
            }
            RE_nameMismatch++;
            if(RE_nameMismatch>10)break;
            invalidAlignment = true;
         }else if((!args.flag("excludeSingletons")) || curF->paired || (! (curF->first->core.flag & BAM_FPAIRED))){
            // We only calculate probabilties and add alignments if: 
            // (singletons are not exlucded) OR  (it is a proper paired alignments) OR (it is single-end read)
            if(readD.getP(curF, prob, probNoise)){
               // We calculated valid probabilities for this alignment.   
               // Add alignment:
               alignments.push_back(ns_parseAlignment::TagAlignment(curF->first->core.tid+1, prob, probNoise));
               // Update counters:
               if( curF->paired ) {
                  // Fragment's both reads are mapped as a pair.
                  pairedN++;
                  DEBUG_AT(" P\n");
               }else {
                  if (curF->first->core.flag & BAM_FPAIRED) {
                     // Read was part of pair (meaning that the other is unmapped).
                     if (curF->first->core.flag & BAM_FREAD1) {
                        firstN++;
                        DEBUG_AT(" 1\n");
                     } else if (curF->first->core.flag & BAM_FREAD2) {
                        secondN++;
                        DEBUG_AT(" 2\n");
                     } else {
                        weirdN ++;
                        DEBUG_AT(" W\n");
                     }
                  } else {
                     // Read is single end, with valid alignment.
                     singleN++;
                     DEBUG_AT(" S\n");
                  }
               }
            } else {
               // Calculation of alignment probabilities failed.
               invalidAlignment = true;
            }
         }
      }else DEBUG_AT("UNMAP\n");
      // next fragment has different name
      if(ns_parseAlignment::nextFragDiffers(curF, nextF, args.flag("mateNamesDiffer"))){
         DEBUG_AT("  last\n");
         readC++;
         if(args.verbose){ if(progressLog(readC,Ntotal,10,' '))timer.split(1,'m');}
         if(!alignments.empty()){
            outF<<bam1_qname(curF->first)<<" "<<alignments.size()+1;
            minProb = 1;
            for(i=0;i<(long)alignments.size();i++){
               if(minProb>alignments[i].getLowProb())minProb = alignments[i].getLowProb();
               outF<<" "<<alignments[i].getTrId()
//                   <<" "<<getStrandC(alignments[i].getStrand())
                   <<" "<<alignments[i].getProb();
            }
            outF<<" 0 "<<minProb<<endl;
            alignments.clear();
         }else{
            // read has no valid alignments:
            if(invalidAlignment){
               // If there were invalid alignments, write a mock record in order to keep Nmap consistent.
               invalidN++;
               outF<<bam1_qname(curF->first)<<" 1 0 0"<<endl;
            }else {
               noN++;
            }
            if(args.isSet("failed")){
               // Save failed reads.
               failedReads.insert(bam1_qname(curF->first));
               if(curF->paired)failedReads.insert(bam1_qname(curF->second));
            }
         }
         invalidAlignment = false;
      }
   }
   if(RE_nameMismatch>10){
      error("Names of paired mates didn't match at least 10 times.\n"
            "   Something is possibly wrong with your data or the reads have to be renamed.\n");
      return 1;
   }
   outF.close();
   timer.split(0,'m');
   if(args.verbose){
      message("Analyzed %ld reads:\n",readC);
      if(ignoredMaxAlignments>0)message(" %ld ignored due to --limitA flag\n",ignoredMaxAlignments);
      if(invalidN>0)message(" %ld had only invalid alignments (see warnings)\n",invalidN);
      if(noN>0)message(" %ld had no alignments\n",noN);
      message("The rest had %ld alignments:\n",pairedN+singleN+firstN+secondN+weirdN);
      if(pairedN>0)message(" %ld paired alignments\n",pairedN);
      if(firstN+secondN+weirdN>0)
         message(" %ld half alignments (paired-end mates aligned independently)\n",firstN+secondN+weirdN);
      if(singleN>0)message(" %ld single-read alignments\n",singleN);
      //flushStdout();
      messageFlush();
   }else {
      messageF("Alignments: %ld.\n",pairedN+singleN+firstN+secondN+weirdN);
   }
   readD.writeWarnings();
   if(args.flag("veryVerbose")){
      message("Number of weights cached: %ld\n",readD.getWeightNormCount());
   }
   // Deal with reads that failed to align {{{
   if(args.isSet("failed")){
      outF.open(args.getS("failed").c_str());
      if(outF.is_open()){
         for(set<string>::iterator setIt=failedReads.begin(); setIt!=failedReads.end();setIt++)
            outF<<*setIt<<endl;
         outF.close();
      }
   } //}}}
   // Compute effective length and save transcript info {{{
   if(args.isSet("trInfoFileName")){
      if(args.verbose)messageF("Computing effective lengths.\n");
      trInfo->setEffectiveLength(readD.getEffectiveLengths());
      if(! trInfo->writeInfo(args.getS("trInfoFileName"))){
         warning("Main: File %s probably already exists.\n"
                 "   Will save new transcript info into %s-NEW.\n",(args.getS("trInfoFileName")).c_str(),(args.getS("trInfoFileName")).c_str());
         if(! trInfo->writeInfo(args.getS("trInfoFileName")+"-NEW", true)){ // DO OVERWRITE
            warning("Main: Writing into %s failed!.",(args.getS("trInfoFileName")+"-NEW").c_str());
         }
      }else {
         if(args.verbose)message("Transcript information saved into %s.\n",(args.getS("trInfoFileName")).c_str());
      }
      if(args.verbose)timer.split(0,'m');
   } //}}}
   // Close, free and write failed reads if filename provided {{{
   delete curF;
   delete nextF;
   delete validAF;
   delete trInfo;
   delete trSeq;
   delete trExp;
   samclose(samData);
   // }}}
   // }}}
   if(args.verbose)message("DONE. ");
   timer.split(7,'m');
   return 0;
}

#ifndef BIOC_BUILD
int main(int argc,char* argv[]){
   return parseAlignment(&argc,argv);
}
#endif

namespace ns_parseAlignment {

bool nextFragDiffers(const ns_rD::fragmentP curF, const ns_rD::fragmentP nextF, bool mateNamesDiffer){//{{{
   if(readNameCmp(bam1_qname(curF->first), bam1_qname(nextF->first))==0) return false;
   if(nextF->paired && mateNamesDiffer && (readNameCmp(bam1_qname(curF->first), bam1_qname(nextF->second))==0)) return false;
   return true;
}//}}}

long readNameCmp(const char *str1, const char *str2){//{{{
   // Check first character(so that we can look back later).
   if(*str1 != *str2)return *str1 - *str2;
   while(*str1 || *str2){
      if(*str1 != *str2){
         // They can differ in last character if its preceeeded by /:_.
         if(*str1 && *str2 && (!*(str1+1)) && (!*(str2+1)) && 
            ((*(str1-1) == '/') || (*(str1-1) == ':') || (*(str1-1) == '_'))){
            return 0;
         }
         return *str1 - *str2;
      }
      str1++;
      str2++;
   }
   return 0;
}//}}}

bool readNextFragment(samfile_t* samData, ns_rD::fragmentP &cur, ns_rD::fragmentP &next){//{{{
   static ns_rD::fragmentP tmpF = NULL;
   bool currentOK = true;
   // switch current to next:
   tmpF = cur;
   cur = next;
   next = tmpF;
   // check if current fragment is valid
   if( !cur->first->data || ( *(cur->first->data) == '\0')){
      // current fragment is invalid
      currentOK = false;
   }
   // try reading next fragment:
   if(samread(samData,next->first)<0){
      // read failed: set next reads name to empty string
      *(next->first->data) = '\0';
      return currentOK;
   }
   // Read proper pairs OR pairs with both mates unmapped into one fragment.
   if((next->first->core.flag & BAM_FPROPER_PAIR) ||
      ((next->first->core.flag & BAM_FPAIRED) &&
       (next->first->core.flag & BAM_FUNMAP) &&
       (next->first->core.flag & BAM_FMUNMAP))){
      next->paired = true;
      // Try reading second mate.
      if(samread(samData,next->second)<0) next->paired = false;
   }else{
      next->paired = false;
   }
   /* Note:
    * Relying on BAM_FREAD2 as being the last read of template probably does not work.
    */
   return currentOK;
}//}}}

bool setInputFormat(const ArgumentParser &args, string *format){//{{{
   if(args.isSet("format")){
      *format = args.getLowerS("format");
      if((*format =="sam")||(*format == "bam")){
         return true;
      }
      warning("Unknown format '%s'.\n",format->c_str());
   }
   string fileName = args.args()[0];
   string extension = fileName.substr(fileName.rfind(".")+1);
   *format = ns_misc::toLower(extension);
   if((*format =="sam")||(*format == "bam")){
      if(args.verb())message("Assuming alignment file in '%s' format.\n",format->c_str());
      return true;
   }
   message("Unknown extension '%s'.\n",extension.c_str());
   error("Couldn't determine the type of input file, please use --format and check your input.\n");
   return false;
}//}}}

bool openSamFile(const string &name, const string &inFormat, samfile_t **samFile){//{{{
   if(*samFile != NULL)samclose(*samFile);
   if(inFormat=="bam") *samFile = samopen(name.c_str(), "rb" , NULL);
   else *samFile = samopen(name.c_str(), "r" , NULL);
   if(*samFile == NULL){
      error("Failed re-reading alignments.\n");
      return false;
   }
   return true;
}//}}}

bool initializeInfoFile(const ArgumentParser &args, samfile_t *samFile, TranscriptInfo **trInfo, long *M){//{{{
   if((samFile->header == NULL)||(samFile->header->n_targets == 0)){
      if(! args.isSet("trInfoFileName")){
         error("Main: alignment file does not contain header, or the header is empty.\n"
               "  Please either include header in alignment file or provide transcript information file.\n"
               "  (option --trInfoFile, file should contain lines with <gene name> <transcript name> <transcript length>.\n");
         return false;
      }else{
         if(args.verb())message("Using %s for transcript information.\n",(args.getS("trInfoFileName")).c_str());
         if((*trInfo = new TranscriptInfo(args.getS("trInfoFileName"))) && (*trInfo)->isOK()){
            *M=(*trInfo)->getM();
         }else {
            error("Main: Can't get transcript information.\n");
            return false;
         }
      }
   }else{
      if(args.verbose)message("Using alignments' header for transcript information.\n");
      *M = samFile->header->n_targets;
      vector<string> trNames(*M);
      vector<long> trLengths(*M);
      for(long i=0;i<*M;i++){
         trNames[i] = samFile->header->target_name[i];
         trLengths[i] = samFile->header->target_len[i]; 
      }
      *trInfo = new TranscriptInfo();
      if(! (*trInfo)->setInfo(vector<string>(*M,"none"), trNames, trLengths)){
         error("TranscriptInfo not initialized.\n");
         return false;
      }
   }
   return true;
}//}}}

} // namespace ns_parseAlignment