File: error_rate_from_sam.cpp

package info (click to toggle)
seqan2 2.5.2-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 228,748 kB
  • sloc: cpp: 257,602; ansic: 91,967; python: 8,326; sh: 1,056; xml: 570; makefile: 229; awk: 51; javascript: 21
file content (171 lines) | stat: -rw-r--r-- 5,542 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
#include <iostream>
#include <map>

#include <seqan/basic.h>
#include <seqan/sequence.h>
#include <seqan/stream.h>
#include <seqan/bam_io.h>

// USAGE: error_rate_from_sam IN.sam > OUT.txt

// There must only be one alignment per read.
//
// We ignore unaligned reads, and just count them.

int main(int argc, char const ** argv)
{
    using namespace seqan2;

    // Checking command line parameters.

    if (argc != 2)
    {
        std::cerr << "USAGE: error_rate IN.sam > OUT.txt\n";
        return 1;
    }

    // Opening file and record reader.

    std::fstream in(argv[1], std::ios::in | std::ios::binary);
    RecordReader<std::fstream, SinglePass<> > reader(in);

    // Allocating BAM header, reference name store, and cache.

    typedef StringSet<CharString>      TNameStore;
    typedef NameStoreCache<TNameStore> TNameStoreCache;

    TNameStore refNameStore;
    TNameStoreCache refNameStoreCache(refNameStore);
    BamIOContext<TNameStore> context(refNameStore, refNameStoreCache);

    // Read header.

    BamHeader header;
    int res = 0;
    res = readRecord(header, context, reader, Sam());
    if (res != 0)
    {
        std::cerr << "Could not read SAM header!\n";
        return 1;
    }

    // Check that the SAM file is sorted by QNAME.
    CharString sortOrder;
    for (unsigned i = 0; i < length(header.records); ++i)
    {
        if (header.records[i].type != BAM_HEADER_FIRST)
            continue;
        unsigned idx = 0;
        if (findTagKey(idx, "SO", header.records[i]))
            sortOrder = header.records[i].tags[idx].i2;
    }
    if (sortOrder != "queryname")
    {
        std::cerr << "SAM file not sorted by 'queryname'!\n";
        return 1;
    }

    // Allocate data structures for counting / histogram building.
    unsigned totalBaseCount = 0;            // Number of bases read.
    unsigned totalErrorCount = 0;           // Number of errors read.
    unsigned totalReadCount = 0;            // Number of reads read.
    unsigned totalErrorneousReadCount = 0;  // Number of reads with errors read, excluding unaligned reads.
    unsigned totalUnalignedReadCount = 0;   // Number of reads without alignments.
    std::map<unsigned, unsigned> histo;     // Histogram error count -> num occurrences.

    // Read records

    BamAlignmentRecord record;
    CharString previousQName;
    bool previousIsFirst = false;

    while (!atEnd(reader))
    {
        res = readRecord(record, context, reader, Sam());
        if (res != 0)
        {
            std::cerr << "Error reading SAM record!\n";
            return 1;
        }

        // Skip secondary records.
        if (hasFlagSecondary(record))
            continue;

        // Skip non-aligning records.
        if (hasFlagUnaligned(record))
            continue;

        // Check that this is not a duplicate non-secondary record.
        if (record.qName == previousQName && hasFlagFirst(record) == previousIsFirst)
        {
            std::cerr << "ERROR: Duplicate non-secondary record for " << record.qName << "\n";
            return 1;
        }

        BamTagsDict bamTags(record.tags);

        // Counter: Total reads, unaligned reads.
        if (record.rId == BamAlignmentRecord::INVALID_REFID)
        {
            totalUnalignedReadCount += 1;
            continue;
        }
        totalReadCount += 1;

        // Get tag with edit distance, must be present for aligned reads.
        unsigned idx = 0;
        if (!findTagKey(idx, bamTags, "NM"))
        {
            std::cerr << "ERROR: Could not find NM tag!\n";
            return 1;
        }
        int editDistance = 0;
        if (!extractTagValue(editDistance, bamTags, idx))
        {
            std::cerr << "ERROR: Could not cast NM tag to int!\n";
            return 1;
        }

        // Count: Reads with errors.
        totalErrorneousReadCount += (editDistance != 0);

        // Count: Bases.
        totalBaseCount += length(record.seq);
        totalErrorCount += editDistance;

        // Register with histogram.
        histo[editDistance] += 1;

        // Update previous QNAME and is-first flag.
        previousQName = record.qName;
        previousIsFirst = hasFlagFirst(record);
    }

    // Print results.

    // The quick stats are what we need for the table in the paper.
    std::cout << "QUICK STATS\n"
              << "by base %\tby read %\n";
    fprintf(stdout, "%5.2f\t\t%5.2f\n\n", 100.0 * totalErrorCount / totalBaseCount, 100.0 * totalErrorneousReadCount / totalReadCount);

    // Print detailed statistics and histogram.
    std::cout << "STATISTICS\n"
              << "total read count      " << totalReadCount << "\t\t(excludes unaligned reads)\n"
              << "unaligned read count  " << totalUnalignedReadCount << "\n"
              << "erroneous read count " << totalErrorneousReadCount << "\n"
              << "per read error rate   " << 100.0 * totalErrorneousReadCount / totalReadCount << "\n"
              << "\n"
              << "total bases           " << totalBaseCount << "\n"
              << "total errors          " << totalErrorCount << "\n"
              << "per base error rate   " << 100.0 * totalErrorCount / totalBaseCount << "\n"
              << "\n"
              << "HISTOGRAM\n"
              << "errors\tcount\t\tpercent\n";
    for (std::map<unsigned, unsigned>::const_iterator it = histo.begin(); it != histo.end(); ++it)
    {
        fprintf(stdout, "%3d\t%12u\t%5.2f\n", it->first, it->second, 100.0 * it->second / totalReadCount);
    }

    return 0;
}