1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151
|
/*****************************************************************************
NewGenomeFile.cpp
(c) 2009 - Aaron Quinlan
Hall Laboratory
Department of Biochemistry and Molecular Genetics
University of Virginia
aaronquinlan@gmail.com
Licensed under the GNU General Public License 2.0 license.
******************************************************************************/
#include "NewGenomeFile.h"
#include "ParseTools.h"
#include "Tokenizer.h"
NewGenomeFile::NewGenomeFile(const string &genomeFilename)
: _maxId(-1)
{
_genomeFileName = genomeFilename;
loadGenomeFileIntoMap();
}
NewGenomeFile::NewGenomeFile(const BamTools::RefVector &refVector)
: _maxId(-1)
{
size_t i = 0;
for (; i < refVector.size(); ++i) {
string chrom = refVector[i].RefName;
CHRPOS length = refVector[i].RefLength;
_maxId++;
_chromSizeIds[chrom] = pair<CHRPOS, CHRPOS>(length, _maxId);
_chromList.push_back(chrom);
}
// Special: BAM files can have unmapped reads, which show as no chromosome, or an empty chrom string.
// Add in an empty chrom so these don't error.
_maxId++;
_chromSizeIds[""] = pair<CHRPOS, int>(0, _maxId);
_chromList.push_back("");
}
// Destructor
NewGenomeFile::~NewGenomeFile(void) {
}
void NewGenomeFile::loadGenomeFileIntoMap() {
ifstream genFile(_genomeFileName.c_str());
if (!genFile.good()) {
cerr << "Error: Can't open genome file" << _genomeFileName << "Exiting..." << endl;
exit(1);
}
string sLine;
Tokenizer fieldTokens;
CHRPOS chrSize = 0;
string chrName;
while (!genFile.eof()) {
sLine.clear();
chrSize = 0;
chrName.clear();
getline(genFile, sLine);
int numFields = fieldTokens.tokenize(sLine.c_str());
if (numFields != 2) {
continue;
}
chrName = fieldTokens.getElem(0);
chrSize = str2chrPos(fieldTokens.getElem(1));
_maxId++;
_chromSizeIds[chrName] = pair<CHRPOS, int>(chrSize, _maxId);
_startOffsets.push_back(_genomeLength);
_genomeLength += chrSize;
_chromList.push_back(chrName);
}
if (_maxId == -1) {
cerr << "Error: The genome file " << _genomeFileName << " has no valid entries. Exiting." << endl;
exit(1);
}
// Special: BAM files can have unmapped reads, which show as no chromosome, or an empty chrom string.
// Add in an empty chrom so these don't error.
_maxId++;
_chromSizeIds[""] = pair<CHRPOS, int>(0, _maxId);
_chromList.push_back("");
_startOffsets.push_back(_genomeLength); //insert the final length as the last element
//to help with the lower_bound call in the projectOnGenome method.
genFile.close();
}
bool NewGenomeFile::projectOnGenome(CHRPOS genome_pos, string &chrom, CHRPOS &start) {
// search for the chrom that the position belongs on.
// add 1 to genome position b/c of zero-based, half open.
vector<CHRPOS>::const_iterator low =
lower_bound(_startOffsets.begin(), _startOffsets.end(), genome_pos + 1);
// use the iterator to identify the appropriate index
// into the chrom name and start vectors
CHRPOS i = CHRPOS(low-_startOffsets.begin());
if (i >= _chromList.size()) {
return false; //position not on genome
}
chrom = _chromList[i - 1];
start = genome_pos - _startOffsets[i - 1];
return true;
}
CHRPOS NewGenomeFile::getChromSize(const string &chrom) {
if (chrom == _currChromName) {
return _currChromSize;
}
lookupType::const_iterator iter= _chromSizeIds.find(chrom);
if (iter != _chromSizeIds.end()) {
_currChromName = iter->first;
_currChromSize = iter->second.first;
_currChromId = iter->second.second;
return _currChromSize;
}
cerr << "Error: chromosome " << chrom << " is not in the genome file " << _genomeFileName << ". Exiting." << endl;
return INT_MAX;
}
CHRPOS NewGenomeFile::getChromSize(const string &chrom) const {
if (chrom == _currChromName) {
return _currChromSize;
}
lookupType::const_iterator iter= _chromSizeIds.find(chrom);
if (iter != _chromSizeIds.end()) {
return iter->second.first;
}
cerr << "Error: chromosome " << chrom << " is not in the genome file " << _genomeFileName << ". Exiting." << endl;
return INT_MAX;
}
CHRPOS NewGenomeFile::getChromId(const string &chrom) {
if (chrom == _currChromName) {
return _currChromId;
}
lookupType::const_iterator iter= _chromSizeIds.find(chrom);
if (iter != _chromSizeIds.end()) {
_currChromName = iter->first;
_currChromSize = iter->second.first;
_currChromId = iter->second.second;
return _currChromId;
} else {
cerr << "Error: requested chromosome " << chrom << " does not exist in the genome file " << _genomeFileName << ". Exiting." << endl;
exit(1);
}
return INT_MAX;
}
|