File: SamToCmpH5.cpp

package info (click to toggle)
blasr 5.3.5%2Bdfsg-6
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 1,196 kB
  • sloc: cpp: 8,412; ansic: 806; python: 331; sh: 178; java: 158; makefile: 36
file content (208 lines) | stat: -rw-r--r-- 8,998 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
#include <iostream>

#include <alignment/datastructures/alignment/AlignmentCandidate.hpp>
#include <alignment/datastructures/alignment/SAMToAlignmentCandidateAdapter.hpp>
#include <alignment/datastructures/alignmentset/AlignmentSetToCmpH5Adapter.hpp>
#include <alignment/format/StickAlignmentPrinter.hpp>
#include <hdf/HDFCmpFile.hpp>
#include <pbdata/ChangeListID.hpp>
#include <pbdata/CommandLineParser.hpp>
#include <pbdata/FASTAReader.hpp>
#include <pbdata/FASTASequence.hpp>
#include <pbdata/sam/SAMReader.hpp>
#include <pbdata/utils/TimeUtils.hpp>

char VERSION[] = "v1.0.0";
char PERFORCE_VERSION_STRING[] = "$Change: 141782 $";

int main(int argc, char* argv[])
{
    std::string program = "samtoh5";
    std::string versionString = VERSION;
    AppendPerforceChangelist(PERFORCE_VERSION_STRING, versionString);
    std::string samFileName, cmpFileName, refFileName;
    bool parseSmrtTitle = false;
    bool useShortRefName = false;
    bool copyQVs = false;
    CommandLineParser clp;
    std::string readType = "standard";
    int verbosity = 0;

    clp.SetProgramName(program);
    clp.SetProgramSummary("Converts in.sam file to out.cmp.h5 file.");
    clp.SetVersion(versionString);

    clp.RegisterStringOption("in.sam", &samFileName, "Input SAM file.", true);
    clp.RegisterStringOption("reference.fasta", &refFileName, "Reference used to generate reads.",
                             true);
    clp.RegisterStringOption("out.cmp.h5", &cmpFileName, "Output cmp.h5 file.", true);
    clp.RegisterPreviousFlagsAsHidden();
    clp.RegisterFlagOption("smrtTitle", &parseSmrtTitle,
                           "Use this option when converting alignments "
                           "generated from reads produced by the "
                           "pls2fasta from bas.h5 files by parsing read "
                           "coordinates from the SMRT read title.  The title "
                           "is in the format /name/hole/coordinates, where "
                           "coordinates are in the format \\d+_\\d+, and "
                           "represent the interval of the read that was "
                           "aligned.");
    clp.RegisterStringOption("readType", &readType,
                             "Set the read type: 'standard', 'strobe', 'CCS', "
                             "or 'cDNA'");
    clp.RegisterIntOption("verbosity", &verbosity, "Set desired verbosity.",
                          CommandLineParser::PositiveInteger);
    clp.RegisterFlagOption("useShortRefName", &useShortRefName,
                           "Use abbreviated reference names obtained "
                           "from file.sam instead of using full names "
                           "from reference.fasta.");
    clp.RegisterFlagOption("copyQVs", &copyQVs,
                           "Copy all QVs available in the SAM file into the "
                           "cmp.h5 file. This includes things like InsertionQV "
                           "and DeletionTag.");
    std::string description =
        ("Because SAM has optional tags that have different "
         "meanings in different programs, careful usage is required in order to "
         "have proper output. The \"xs\" tag in bwa-sw is used to show the "
         "suboptimal score, but in PacBio SAM (blasr) it is defined as the start "
         "in the query sequence of the alignment.\nWhen \"-smrtTitle\" is "
         "specified, the xs tag is ignored, but when it is not specified, the "
         "coordinates given by the xs and xe tags are used to define the interval "
         "of a read that is aligned. The CIGAR string is relative to this interval.");
    clp.SetExamples(description);

    clp.ParseCommandLine(argc, argv);

    if (readType != "standard" and readType != "strobe" and readType != "cDNA" and
        readType != "CCS") {
        std::cout << "ERROR. Read type '" << readType
                  << "' must be one of either 'standard', 'strobe', 'cDNA' or 'CCS'." << std::endl;
        std::exit(EXIT_FAILURE);
    }

    std::cerr << "[INFO] " << GetTimestamp() << " [" << program << "] started." << std::endl;

    SAMReader<SAMFullReferenceSequence, SAMReadGroup, SAMPosAlignment> samReader;
    FASTAReader fastaReader;
    HDFCmpFile<AlignmentCandidate<FASTASequence, FASTASequence> > cmpFile;

    //
    // Initialize input/output files.
    //
    samReader.Initialize(samFileName);
    fastaReader.Initialize(refFileName);
    cmpFile.Create(cmpFileName);

    //
    // Configure the file log.
    //
    std::string command;
    CommandLineParser::CommandLineToString(argc, argv, command);
    std::string log = "Convert sam to cmp.h5";
    cmpFile.fileLogGroup.AddEntry(command, log, program, GetTimestamp(), versionString);

    //
    // Set the readType
    //
    cmpFile.SetReadType(readType);

    //
    // Read necessary input.
    //

    std::vector<FASTASequence> references;
    fastaReader.ReadAllSequences(references);

    //
    // This should probably be handled by the alignmentSetAdapter, but
    // time constraints...
    //
    AlignmentSet<SAMFullReferenceSequence, SAMReadGroup, SAMPosAlignment> alignmentSet;
    samReader.ReadHeader(alignmentSet);

    //
    // The order of references in std::vector<FASTASequence> references and
    // AlignmentSet<, , >alignmentSet.references can be different.
    // Rearrange alignmentSet.references such that it is ordered in
    // exactly the same way as std::vector<FASTASequence> references.
    //
    alignmentSet.RearrangeReferences(references);

    //
    // Always recompute the MD5 values even if they exist in the input
    // sam file. Because MD5 is defined differently in sam and cmp.h5 files.
    // The SAM convention uppercases and normalizes before computing the MD5.
    // For cmp.h5, we compute the MD5 on the sequence 'as is'.
    //
    for (size_t i = 0; i < alignmentSet.references.size(); i++) {
        MakeMD5((const char*)&references[i].seq[0], (unsigned int)references[i].length,
                alignmentSet.references[i].md5);
    }

    //
    // Map short names for references obtained from file.sam to full names obtained from reference.fasta
    //
    std::map<std::string, std::string> shortRefNameToFull;
    std::map<std::string, std::string>::iterator it;
    assert(references.size() == alignmentSet.references.size());
    if (!useShortRefName) {
        for (size_t i = 0; i < references.size(); i++) {
            std::string shortRefName = alignmentSet.references[i].GetSequenceName();
            std::string fullRefName(references[i].title);
            if (shortRefNameToFull.find(shortRefName) != shortRefNameToFull.end()) {
                std::cout << "ERROR, Found more than one reference " << shortRefName
                          << "in sam header" << std::endl;
                std::exit(EXIT_FAILURE);
            }
            shortRefNameToFull[shortRefName] = fullRefName;
            alignmentSet.references[i].sequenceName = fullRefName;
        }
    }

    //
    // Start setting up the cmp.h5 file.
    //
    AlignmentSetToCmpH5Adapter<HDFCmpFile<AlignmentCandidate<FASTASequence, FASTASequence> > >
        alignmentSetAdapter;
    alignmentSetAdapter.Initialize();
    alignmentSetAdapter.StoreReferenceInfo(alignmentSet.references, cmpFile);

    //
    // Store the alignments.
    //
    SAMAlignment samAlignment;
    while (samReader.GetNextAlignment(samAlignment)) {
        if (samAlignment.rName == "*") {
            continue;
        }
        if (!useShortRefName) {
            //convert shortRefName to fullRefName
            it = shortRefNameToFull.find(samAlignment.rName);
            if (it == shortRefNameToFull.end()) {
                std::cout << "ERROR, Could not find " << samAlignment.rName
                          << " in the reference repository." << std::endl;
                std::exit(EXIT_FAILURE);
            }
            samAlignment.rName = (*it).second;
        }
        std::vector<AlignmentCandidate<> > convertedAlignments;
        if (verbosity > 0) {
            std::cout << "Storing alignment for " << samAlignment.qName << std::endl;
        }
        SAMAlignmentsToCandidates(samAlignment,
                                  // Order of references and alignmentSetAdapter.RefInfoGroup
                                  // should be exactly the same.
                                  references, alignmentSetAdapter.refNameToRefInfoIndex,
                                  convertedAlignments, parseSmrtTitle, false, copyQVs);

        // -1: moleculeID will be computed dynamically.
        // o.w., the value will be assigned as moleculeID.
        alignmentSetAdapter.StoreAlignmentCandidateList(convertedAlignments, cmpFile, -1, copyQVs);

        for (size_t a = 0; a < convertedAlignments.size(); a++) {
            convertedAlignments[a].FreeSubsequences();
        }
    }

    std::cerr << "[INFO] " << GetTimestamp() << " [" << program << "] ended." << std::endl;
    return 0;
}