1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171
|
#include <iostream>
#include <map>
#include <seqan/basic.h>
#include <seqan/sequence.h>
#include <seqan/stream.h>
#include <seqan/bam_io.h>
// USAGE: error_rate_from_sam IN.sam > OUT.txt
// There must only be one alignment per read.
//
// We ignore unaligned reads, and just count them.
int main(int argc, char const ** argv)
{
using namespace seqan2;
// Checking command line parameters.
if (argc != 2)
{
std::cerr << "USAGE: error_rate IN.sam > OUT.txt\n";
return 1;
}
// Opening file and record reader.
std::fstream in(argv[1], std::ios::in | std::ios::binary);
RecordReader<std::fstream, SinglePass<> > reader(in);
// Allocating BAM header, reference name store, and cache.
typedef StringSet<CharString> TNameStore;
typedef NameStoreCache<TNameStore> TNameStoreCache;
TNameStore refNameStore;
TNameStoreCache refNameStoreCache(refNameStore);
BamIOContext<TNameStore> context(refNameStore, refNameStoreCache);
// Read header.
BamHeader header;
int res = 0;
res = readRecord(header, context, reader, Sam());
if (res != 0)
{
std::cerr << "Could not read SAM header!\n";
return 1;
}
// Check that the SAM file is sorted by QNAME.
CharString sortOrder;
for (unsigned i = 0; i < length(header.records); ++i)
{
if (header.records[i].type != BAM_HEADER_FIRST)
continue;
unsigned idx = 0;
if (findTagKey(idx, "SO", header.records[i]))
sortOrder = header.records[i].tags[idx].i2;
}
if (sortOrder != "queryname")
{
std::cerr << "SAM file not sorted by 'queryname'!\n";
return 1;
}
// Allocate data structures for counting / histogram building.
unsigned totalBaseCount = 0; // Number of bases read.
unsigned totalErrorCount = 0; // Number of errors read.
unsigned totalReadCount = 0; // Number of reads read.
unsigned totalErrorneousReadCount = 0; // Number of reads with errors read, excluding unaligned reads.
unsigned totalUnalignedReadCount = 0; // Number of reads without alignments.
std::map<unsigned, unsigned> histo; // Histogram error count -> num occurrences.
// Read records
BamAlignmentRecord record;
CharString previousQName;
bool previousIsFirst = false;
while (!atEnd(reader))
{
res = readRecord(record, context, reader, Sam());
if (res != 0)
{
std::cerr << "Error reading SAM record!\n";
return 1;
}
// Skip secondary records.
if (hasFlagSecondary(record))
continue;
// Skip non-aligning records.
if (hasFlagUnaligned(record))
continue;
// Check that this is not a duplicate non-secondary record.
if (record.qName == previousQName && hasFlagFirst(record) == previousIsFirst)
{
std::cerr << "ERROR: Duplicate non-secondary record for " << record.qName << "\n";
return 1;
}
BamTagsDict bamTags(record.tags);
// Counter: Total reads, unaligned reads.
if (record.rId == BamAlignmentRecord::INVALID_REFID)
{
totalUnalignedReadCount += 1;
continue;
}
totalReadCount += 1;
// Get tag with edit distance, must be present for aligned reads.
unsigned idx = 0;
if (!findTagKey(idx, bamTags, "NM"))
{
std::cerr << "ERROR: Could not find NM tag!\n";
return 1;
}
int editDistance = 0;
if (!extractTagValue(editDistance, bamTags, idx))
{
std::cerr << "ERROR: Could not cast NM tag to int!\n";
return 1;
}
// Count: Reads with errors.
totalErrorneousReadCount += (editDistance != 0);
// Count: Bases.
totalBaseCount += length(record.seq);
totalErrorCount += editDistance;
// Register with histogram.
histo[editDistance] += 1;
// Update previous QNAME and is-first flag.
previousQName = record.qName;
previousIsFirst = hasFlagFirst(record);
}
// Print results.
// The quick stats are what we need for the table in the paper.
std::cout << "QUICK STATS\n"
<< "by base %\tby read %\n";
fprintf(stdout, "%5.2f\t\t%5.2f\n\n", 100.0 * totalErrorCount / totalBaseCount, 100.0 * totalErrorneousReadCount / totalReadCount);
// Print detailed statistics and histogram.
std::cout << "STATISTICS\n"
<< "total read count " << totalReadCount << "\t\t(excludes unaligned reads)\n"
<< "unaligned read count " << totalUnalignedReadCount << "\n"
<< "erroneous read count " << totalErrorneousReadCount << "\n"
<< "per read error rate " << 100.0 * totalErrorneousReadCount / totalReadCount << "\n"
<< "\n"
<< "total bases " << totalBaseCount << "\n"
<< "total errors " << totalErrorCount << "\n"
<< "per base error rate " << 100.0 * totalErrorCount / totalBaseCount << "\n"
<< "\n"
<< "HISTOGRAM\n"
<< "errors\tcount\t\tpercent\n";
for (std::map<unsigned, unsigned>::const_iterator it = histo.begin(); it != histo.end(); ++it)
{
fprintf(stdout, "%3d\t%12u\t%5.2f\n", it->first, it->second, 100.0 * it->second / totalReadCount);
}
return 0;
}
|