File: sequence.cpp

package info (click to toggle)
fastml 3.11-4
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 5,772 kB
  • sloc: cpp: 48,522; perl: 3,588; ansic: 819; makefile: 386; python: 83; sh: 55
file content (192 lines) | stat: -rw-r--r-- 5,364 bytes parent folder | download | duplicates (10)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
// $Id: sequence.cpp 7627 2010-03-06 21:56:30Z cohenofi $

#include "sequence.h"

#include <algorithm>
using namespace std;


sequence::sequence(const string& str,
				   const string& name,
				   const string& remark,
				   const int id,
				   const alphabet* inAlph)
: _alphabet(inAlph->clone()), _remark(remark), _name(name),_id(id) 
{
	for (int k=0; k < str.size() ;k += _alphabet->stringSize()) {
		int charId = inAlph->fromChar(str, k);
		if (charId == -99) {
			string textToPrint = "unable to read sequence: " + name;
			errorMsg::reportError(textToPrint);
		}

		_vec.push_back(charId);
	}
}


sequence::sequence(const sequence& other) 
: _vec(other._vec), _alphabet(other._alphabet->clone()), 
  _remark(other._remark), _name(other._name),_id(other._id) 
{
	
}
// convert the other sequence to the alphabet inAlph.
sequence::sequence(const sequence& other,const alphabet* inAlph)
: _alphabet(inAlph->clone()), _remark(other._remark), _name(other._name), _id(other._id)
{
	const mulAlphabet* pMulAlphabet;
	// if the other.alphabet is amino or nucleotide and the inAlph is indel
	
	if ( (other._alphabet->size() == 20 && inAlph->size() == 2)
		|| (other._alphabet->size() == 4 && inAlph->size() == 2) )
	{
		for (int k=0; k < other.seqLen() ;k += other._alphabet->stringSize()) 
		{
			int charId = other._vec[k];
			
			if (charId == other._alphabet->gap())
					_vec.push_back(inAlph->fromChar("-",0));
			else
				_vec.push_back(inAlph->fromChar("X",0)); //also converts "." (charId==-3) to "X"
				//	unknown amino/nucleotide is converted to "X" and not to "?"
		}
	}
		
	// if the other.alphabet is amino or nucleotide and the inAlph is mulAlphabet
	else if ( (other._alphabet->size() == 20 && inAlph->size()%20 == 0)
		|| (other._alphabet->size() == 4 && inAlph->size()%4 == 0) )
	{
		for (int k=0; k < other.seqLen() ;++k) 
		{
			int charId = other._vec[k];
			string ch = other._alphabet->fromInt(charId);
			int mulCharId = _alphabet->fromChar(ch,0);
			_vec.push_back(mulCharId);
		}
	//	 debug OZ
			//cout << "other sequence: " << other << endl;
			//cout << "mul sequence " << (*this) << endl;
	//	 end of debug
	}
	// if the other.alphabet is mulAlphabet and the inAlph is it's baseAlphabet
	// (for example, if other.alphabet is a multiplied-amino and inAlph is amino, then the converted sequence
	// will have alphabet amino)
	else if ( ((inAlph->size() == 20) && (other._alphabet->size()%20 == 0))
		|| (inAlph->size() == 4) && (other._alphabet->size()%4 == 0))
	{
		pMulAlphabet=(mulAlphabet*)(other._alphabet);
		for (int k=0; k < other.seqLen() ;++k) 
			{
				int mulCharId = other._vec[k];
				int baseId = pMulAlphabet->convertToBasedAlphaInt(mulCharId);
				_vec.push_back(baseId);
			}
	}

	// for gainLoss project - {0,1} in both, hence no conversion needed.
	// it should be the same for all cases with same alphabet
	else if ( inAlph->size() == other._alphabet->size() )
	{
		pMulAlphabet=(mulAlphabet*)(other._alphabet);
		for (int k=0; k < other.seqLen() ;++k) 
		{
			int mulCharId = other._vec[k];
			//int baseId = pMulAlphabet->convertToBasedAlphaInt(mulCharId);
			_vec.push_back(mulCharId);
		}
	}	
	// I tried to implement it using dynamic_cast but it doesn't work...
	/*else if 
			(
				(pMulAlphabet = dynamic_cast<const mulAlphabet*>(other._alphabet)) != NULL
				)
		{ 
			if			(pMulAlphabet->getBaseAlphabet()->size() == inAlph->size())
		 {
			for (int k=0; k < other.seqLen() ;++k) 
			{
				int mulCharId = other._vec[k];
				int baseId = pMulAlphabet->convertToBasedAlphaInt(mulCharId);
				_vec.push_back(baseId);
			}
		}
		}*/

	// (currently, there is no implimentions for other converts)
	else
	{
		string error = "unable to convert this kind of alphabet";
		errorMsg::reportError(error);
	}
}

sequence::~sequence()
{
	if (_alphabet) 
		delete _alphabet;
}

void sequence::resize(const int k, const int* val) {
	if (val == NULL) {
		_vec.resize(k,_alphabet->unknown());
	}
	else {
		_vec.resize(k,*val);
	}
}

string sequence::toString() const{
	string tmp;
	for (int k=0; k < _vec.size() ; ++k ){
		tmp+= _alphabet->fromInt(_vec[k]);
	}
	return tmp;
}

string sequence::toString(const int pos) const{
	return _alphabet->fromInt(_vec[pos]);
}	

void sequence::addFromString(const string& str) {
	for (int k=0; k < str.size() ; k+=_alphabet->stringSize()) {
		_vec.push_back(_alphabet->fromChar(str,k));
	}
}

class particip {
public:
	explicit particip()  {}
	bool operator()(int i) {
		return (i==-1000);
	}
};

//removePositions: the poitions to be removed are marked as '1' in posToRemoveVec
//all othehr positions are '0' 
void sequence::removePositions(const vector<int> & posToRemoveVec) 
{
	if(posToRemoveVec.size() != seqLen())
		errorMsg::reportError("the input vector must be same size as sequence length. in sequence::removePositions");
	for (int k=0; k < posToRemoveVec.size(); ++k) {
		if (posToRemoveVec[k] == 1) 
			_vec[k] = -1000;
	}
	vector<int>::iterator vec_iter;
	vec_iter =  remove_if(_vec.begin(),_vec.end(),particip());
	_vec.erase(vec_iter,_vec.end()); // pg 1170, primer.
}

//return the number of sites that are specific = not unknown, nor ambiguity, nor gap (for example, for nucleotides it will true for A,C,G, or T).
int sequence::seqLenSpecific() const
{
	int res = 0;
	for (int pos = 0; pos < seqLen(); ++pos)
	{
		if (isSpecific(pos))
			++res;
	}
	return res;
}