File: NewGenomeFile.cpp

package info (click to toggle)
bedtools 2.26.0%2Bdfsg-3
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 55,328 kB
  • sloc: cpp: 37,989; sh: 6,930; makefile: 2,225; python: 163
file content (151 lines) | stat: -rw-r--r-- 4,715 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
/*****************************************************************************
  NewGenomeFile.cpp

  (c) 2009 - Aaron Quinlan
  Hall Laboratory
  Department of Biochemistry and Molecular Genetics
  University of Virginia
  aaronquinlan@gmail.com

  Licensed under the GNU General Public License 2.0 license.
******************************************************************************/
#include "NewGenomeFile.h"
#include "ParseTools.h"
#include "Tokenizer.h"

NewGenomeFile::NewGenomeFile(const string &genomeFilename)
: _maxId(-1)
{
    _genomeFileName = genomeFilename;
    loadGenomeFileIntoMap();
}

NewGenomeFile::NewGenomeFile(const BamTools::RefVector &refVector)
: _maxId(-1)
{
	size_t i = 0;
    for (; i < refVector.size(); ++i) {
        string chrom = refVector[i].RefName;
        CHRPOS length = refVector[i].RefLength;
        _maxId++;
        _chromSizeIds[chrom] = pair<CHRPOS, CHRPOS>(length, _maxId);
		_chromList.push_back(chrom);
    }
	// Special: BAM files can have unmapped reads, which show as no chromosome, or an empty chrom string.
	// Add in an empty chrom so these don't error.
	_maxId++;
	_chromSizeIds[""] = pair<CHRPOS, int>(0, _maxId);
	_chromList.push_back("");

}

// Destructor
NewGenomeFile::~NewGenomeFile(void) {
}


void NewGenomeFile::loadGenomeFileIntoMap() {


	ifstream genFile(_genomeFileName.c_str());
	if (!genFile.good()) {
		cerr << "Error: Can't open genome file" << _genomeFileName << "Exiting..." << endl;
		exit(1);
	}
	string sLine;
	Tokenizer fieldTokens;
	CHRPOS chrSize = 0;
	string chrName;
	while (!genFile.eof()) {
		sLine.clear();
		chrSize = 0;
		chrName.clear();
		getline(genFile, sLine);
		int numFields = fieldTokens.tokenize(sLine.c_str());
		if (numFields != 2) {
			continue;
		}
		chrName = fieldTokens.getElem(0);
		chrSize = str2chrPos(fieldTokens.getElem(1));
		_maxId++;
		_chromSizeIds[chrName] = pair<CHRPOS, int>(chrSize, _maxId);
		_startOffsets.push_back(_genomeLength);
		_genomeLength += chrSize;
		_chromList.push_back(chrName);
	}
	if (_maxId == -1) {
		cerr << "Error: The genome file " << _genomeFileName << " has no valid entries. Exiting." << endl;
		exit(1);
	}
	// Special: BAM files can have unmapped reads, which show as no chromosome, or an empty chrom string.
	// Add in an empty chrom so these don't error.
	_maxId++;
	_chromSizeIds[""] = pair<CHRPOS, int>(0, _maxId);
	_chromList.push_back("");


	_startOffsets.push_back(_genomeLength); //insert the final length as the last element
	//to help with the lower_bound call in the projectOnGenome method.
	genFile.close();
}

bool NewGenomeFile::projectOnGenome(CHRPOS genome_pos, string &chrom, CHRPOS &start) {
    // search for the chrom that the position belongs on.
    // add 1 to genome position b/c of zero-based, half open.
    vector<CHRPOS>::const_iterator low =
        lower_bound(_startOffsets.begin(), _startOffsets.end(), genome_pos + 1);
    
    // use the iterator to identify the appropriate index 
    // into the chrom name and start vectors
    CHRPOS i = CHRPOS(low-_startOffsets.begin());
    if (i >= _chromList.size()) {
    	return false; //position not on genome
    }
    chrom = _chromList[i - 1];
    start = genome_pos - _startOffsets[i - 1];
    return true;
}
    
CHRPOS NewGenomeFile::getChromSize(const string &chrom) {
	if (chrom == _currChromName) {
		return _currChromSize;
	}
	lookupType::const_iterator iter= _chromSizeIds.find(chrom);
    if (iter != _chromSizeIds.end()) {
    	_currChromName = iter->first;
    	_currChromSize = iter->second.first;
    	_currChromId = iter->second.second;
    	return _currChromSize;
    }
    cerr << "Error: chromosome " << chrom << " is not in the genome file " << _genomeFileName << ". Exiting." << endl;
    return INT_MAX;
}

CHRPOS NewGenomeFile::getChromSize(const string &chrom) const {
	if (chrom == _currChromName) {
		return _currChromSize;
	}
	lookupType::const_iterator iter= _chromSizeIds.find(chrom);
    if (iter != _chromSizeIds.end()) {
    	return iter->second.first;
    }
    cerr << "Error: chromosome " << chrom << " is not in the genome file " << _genomeFileName << ". Exiting." << endl;
    return INT_MAX;
}

CHRPOS NewGenomeFile::getChromId(const string &chrom) {
	if (chrom == _currChromName) {
		return _currChromId;
	}
	lookupType::const_iterator iter= _chromSizeIds.find(chrom);
    if (iter != _chromSizeIds.end()) {
    	_currChromName = iter->first;
    	_currChromSize = iter->second.first;
    	_currChromId = iter->second.second;
    	return _currChromId;
    } else {
    	cerr << "Error: requested chromosome " << chrom << " does not exist in the genome file " << _genomeFileName << ". Exiting." << endl;
    	exit(1);
    }
    return INT_MAX;
}