File: SamToM4.cpp

package info (click to toggle)
blasr 5.3.5%2Bdfsg-6
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 1,196 kB
  • sloc: cpp: 8,412; ansic: 806; python: 331; sh: 178; java: 158; makefile: 36
file content (221 lines) | stat: -rw-r--r-- 8,236 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
/*
 * =====================================================================================
 *
 *       Filename:  SamToM4.cpp
 *
 *    Description:  Convert a sam file to a blasr m4 file.
 *
 *        Version:  1.0
 *        Created:  04/03/2013 01:19:43 PM
 *       Revision:  none
 *       Compiler:  gcc
 *
 *         Author:  Yuan Li (yli), yli@pacificbiosciences.com
 *        Company:  Pacific Biosciences
 *
 * =====================================================================================
 */

#include <iostream>

#include <alignment/algorithms/alignment/AlignmentUtils.hpp>
#include <alignment/algorithms/alignment/DistanceMatrixScoreFunction.hpp>
#include <alignment/datastructures/alignment/AlignmentCandidate.hpp>
#include <alignment/datastructures/alignment/SAMToAlignmentCandidateAdapter.hpp>
#include <alignment/format/IntervalPrinter.hpp>
#include <pbdata/ChangeListID.hpp>
#include <pbdata/CommandLineParser.hpp>
#include <pbdata/FASTAReader.hpp>
#include <pbdata/FASTASequence.hpp>
#include <pbdata/sam/SAMReader.hpp>

char VERSION[] = "v0.1.0";
char PERFORCE_VERSION_STRING[] = "$Change: 126414 $";

int main(int argc, char* argv[])
{
    std::string program = "samtom4";
    std::string versionString = VERSION;
    AppendPerforceChangelist(PERFORCE_VERSION_STRING, versionString);

    std::string samFileName, refFileName, outFileName;
    bool printHeader = false;
    bool parseSmrtTitle = false;
    bool useShortRefName = false;

    CommandLineParser clp;
    clp.SetProgramName(program);
    clp.SetVersion(versionString);
    clp.SetProgramSummary("Converts a SAM file generated by blasr to M4 format.");
    clp.RegisterStringOption("in.sam", &samFileName, "Input SAM file, which is produced by blasr.");
    clp.RegisterStringOption("reference.fasta", &refFileName,
                             "Reference used to generate file.sam.");
    clp.RegisterStringOption("out.m4", &outFileName, "Output in blasr M4 format.");
    clp.RegisterPreviousFlagsAsHidden();
    clp.RegisterFlagOption("header", &printHeader, "Print M4 header.");
    clp.RegisterFlagOption("useShortRefName", &useShortRefName,
                           "Use abbreviated reference names obtained "
                           "from file.sam instead of using full names "
                           "from reference.fasta.");
    //clp.SetExamples(program + " file.sam reference.fasta out.m4");

    clp.ParseCommandLine(argc, argv);

    std::ostream* outFilePtr = &std::cout;
    std::ofstream outFileStrm;
    if (outFileName != "") {
        CrucialOpen(outFileName, outFileStrm, std::ios::out);
        outFilePtr = &outFileStrm;
    }

    SAMReader<SAMFullReferenceSequence, SAMReadGroup, SAMAlignment> samReader;
    FASTAReader fastaReader;

    //
    // Initialize samReader and fastaReader.
    //
    samReader.Initialize(samFileName);
    fastaReader.Initialize(refFileName);

    //
    // Configure the file log.
    //
    std::string command;
    CommandLineParser::CommandLineToString(argc, argv, command);

    //
    // Read necessary input.
    //
    std::vector<FASTASequence> references;
    fastaReader.ReadAllSequences(references);

    AlignmentSet<SAMFullReferenceSequence, SAMReadGroup, SAMAlignment> alignmentSet;
    samReader.ReadHeader(alignmentSet);

    //
    // The order of references in std::vector<FASTASequence> references and
    // AlignmentSet<, , >alignmentSet.references can be different.
    // Rearrange alignmentSet.references such that it is ordered in
    // exactly the same way as std::vector<FASTASequence> references.
    //
    alignmentSet.RearrangeReferences(references);

    //
    // Map short names for references obtained from file.sam to
    // full names obtained from reference.fasta
    //
    std::map<std::string, std::string> shortRefNameToFull;
    std::map<std::string, std::string>::iterator it;
    assert(references.size() == alignmentSet.references.size());
    if (!useShortRefName) {
        for (size_t i = 0; i < references.size(); i++) {
            std::string shortRefName = alignmentSet.references[i].GetSequenceName();
            std::string fullRefName(references[i].title);
            if (shortRefNameToFull.find(shortRefName) != shortRefNameToFull.end()) {
                std::cout << "ERROR, Found more than one reference " << shortRefName
                          << "in sam header" << std::endl;
                std::exit(EXIT_FAILURE);
            }
            shortRefNameToFull[shortRefName] = fullRefName;
            alignmentSet.references[i].sequenceName = fullRefName;
        }
    }

    // Map reference name obtained from SAM file to indices
    std::map<std::string, int> refNameToIndex;
    for (size_t i = 0; i < references.size(); i++) {
        std::string refName = alignmentSet.references[i].GetSequenceName();
        refNameToIndex[refName] = i;
    }

    //
    // Store the alignments.
    //
    SAMAlignment samAlignment;
    size_t alignIndex = 0;

    //
    // For 150K, each chip produces about 300M sequences
    // (not including quality values and etc.).
    // Let's assume that the sam file and reference data can
    // fit in the memory.
    // Need to scale for larger sequal data in the future.
    //
    if (printHeader) IntervalOutput::PrintHeader(*outFilePtr);

    // The socre matrix does not matter because we will use the
    // aligner's score from SAM file anyway.
    DistanceMatrixScoreFunction<DNASequence, DNASequence> distScoreFn;

    while (samReader.GetNextAlignment(samAlignment)) {
        if (samAlignment.rName == "*") {
            continue;
        }

        if (!useShortRefName) {
            //convert shortRefName to fullRefName
            it = shortRefNameToFull.find(samAlignment.rName);
            if (it == shortRefNameToFull.end()) {
                std::cout << "ERROR, Could not find " << samAlignment.rName
                          << " in the reference repository." << std::endl;
                std::exit(EXIT_FAILURE);
            }
            samAlignment.rName = (*it).second;
        }

        // The padding character 'P' is not supported
        if (samAlignment.cigar.find('P') != std::string::npos) {
            std::cout << "WARNING. Could not process sam record with 'P' in its cigar string."
                      << std::endl;
            continue;
        }

        std::vector<AlignmentCandidate<> > convertedAlignments;

        //
        // Keep reference as forward.
        // So if IsReverseComplement(sam.flag)==true, then qStrand is reverse
        // and tStrand is forward.
        //
        bool keepRefAsForward = false;

        SAMAlignmentsToCandidates(samAlignment, references, refNameToIndex, convertedAlignments,
                                  parseSmrtTitle, keepRefAsForward);

        if (convertedAlignments.size() > 1) {
            std::cout << "WARNING. Ignore an alignment which has multiple segments." << std::endl;
            continue;
        }

        //all alignments are unique single-ended alignments.
        for (int i = 0; i < 1; i++) {
            AlignmentCandidate<>& alignment = convertedAlignments[i];

            ComputeAlignmentStats(alignment, alignment.qAlignedSeq.seq, alignment.tAlignedSeq.seq,
                                  distScoreFn);

            // Use aligner's score from SAM file anyway.
            alignment.score = samAlignment.as;
            alignment.mapQV = samAlignment.mapQV;

            // Since SAM only has the aligned sequence, many info of the
            // original query (e.g. the full length) is missing.
            // Overwrite alignment.qLength (which is length of the query
            // in the SAM alignment) with xq (which is the length of the
            // original query sequence saved by blasr) right before printing
            // the output so that one can reconstruct a blasr m4 record from
            // a blasr sam alignment.
            if (samAlignment.xq != 0) alignment.qLength = samAlignment.xq;

            IntervalOutput::PrintFromSAM(alignment, *outFilePtr);

            alignment.FreeSubsequences();
        }
        ++alignIndex;
    }

    if (outFileName != "") {
        outFileStrm.close();
    }
    return 0;
}