File: readTree.cpp

package info (click to toggle)
fastml 3.11-4
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 5,772 kB
  • sloc: cpp: 48,522; perl: 3,588; ansic: 819; makefile: 386; python: 83; sh: 55
file content (178 lines) | stat: -rw-r--r-- 5,251 bytes parent folder | download | duplicates (10)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
// $Id: readTree.cpp 5525 2008-12-19 20:17:05Z itaymay $

#include "definitions.h"
#include "errorMsg.h"
#include "someUtil.h"
#include "readTree.h"
#include <iostream>
using namespace std;





// forward declarations

//----------------------------------------------------------------------------------------------
//	about reading tree topology from files:
//	usually a tree topology is represented by a line like this
//	(((Langur:0.8,Baboon:0.55):0.3,Human:0.44):0.5,Rat:0.02,(Cow:0.2,Horse:0.04):0.03);
//	the syntax of such a line is (part, part, part, part)
//	where part is either (part,part, part, ...):distace or name:distance
// or without the distance!
//	it should notice that the tree is unrooted.
//	if we look at the above file format, one can notice that the number of comas (",") is 
//	always one less than the number of leaves (synonyms for leaves are OTUs and external nodes)
//	the function GetNumberOfLeaves counts the numnber of comas and returns the number of leaves.
//	in the example below there are 6 leaves.

//*******************************************************************************
// constructors
//*******************************************************************************





vector<char> PutTreeFileIntoVector(istream &in) {
	vector<char> tree_contents;
	bool endWithDotComa = false;
	char chTemp;
	while (( !in.eof()) && (tree_contents.size() < MAX_FILE_SIZE))
	{
		in.get(chTemp);
#ifdef WIN32
		if (chTemp == -52) return tree_contents; //tal addition.
#endif
		if ( !isspace( chTemp ) )
			tree_contents.push_back(chTemp);
		if (chTemp == ';') {
			endWithDotComa = true;
			break;
		}
	}

	if (tree_contents.size() >= MAX_FILE_SIZE) {
		vector<string> err;
		err.push_back("Error reading tree file. The tree file is too large");
		errorMsg::reportError(err,1); // also quit the program
	}
	if (endWithDotComa == false) tree_contents.clear(); // remove junk from the last ; till the end of the file.
	return tree_contents;
}




int GetNumberOfLeaves(const vector<char> &tree_contents) {
	int iCommasCounter = 0;
	vector<char>::const_iterator itCurrent = tree_contents.begin();
	for ( ; itCurrent != tree_contents.end(); ++itCurrent ) {
		if (*itCurrent==COMMA)
			++iCommasCounter;
	}
	return ++iCommasCounter; //#leaves is always one more than number of comas
}

int GetNumberOfInternalNodes(const vector<char> &tree_contents) {
	int iCloseCounter = 0;
	vector<char>::const_iterator itCurrent = tree_contents.begin();
	for ( ; itCurrent != tree_contents.end(); ++itCurrent ) {
		if (*itCurrent==CLOSING_BRACE) ++iCloseCounter;
		if (*itCurrent==CLOSING_BRACE2) ++iCloseCounter;
	}
	return iCloseCounter; //number of HTUs is always the number of ")"
}


bool verifyChar(vector<char>::const_iterator &p_itCurrent, const char p_cCharToFind) {
	if ( (*p_itCurrent)==p_cCharToFind ) 	return true;
	return false;
}




// IsAtomicPart decides whether we will now read a taxa name (return true),
// or read an OPENING_BRACE which will say us, that we will read a complicated strucure.
bool IsAtomicPart(const vector<char>::const_iterator p_itCurrent) {
	if ( (*p_itCurrent)==OPENING_BRACE ) return false;
	else if ( (*p_itCurrent)==OPENING_BRACE2 ) return false;
	return true;
}

//-----------------------------------------------------------------------------
// there are 2 options for the tree format.
// either (name1:0.43, name2: 0.45 , (name3 : 2 , name 4: 5) : 3.332)
// or without the distances (name1, name2 , (name3 , name4) )
// here we return true if the tree file is with the distance, or false, if the tree file
// has not distances. 
// if distances exist: after the name there will always be a colon
// if distance exist, also move the iterator, to the beggining of the number
//-----------------------------------------------------------------------------
bool DistanceExists(vector<char>::const_iterator& p_itCurrent) {

	if ((*p_itCurrent)==COLON ) 	{
		++p_itCurrent;
		return true;
	}
	return false;
}

void clearPosibleComment(vector<char>::const_iterator& p_itCurrent) {
  if ((*p_itCurrent)=='[' ) {
	while (*(++p_itCurrent) != ']');
	++p_itCurrent;				// move over "]"
  }
}

string readPosibleComment(vector<char>::const_iterator& p_itCurrent) {
  string comment = "";

  if ((*p_itCurrent)=='[' ) 
  {
	vector<char>::const_iterator tmp= (p_itCurrent+1);
	if ((*tmp++)=='&' &&
		(*tmp++)=='&' &&
		(*tmp++)=='N' &&
		(*tmp++)=='H' &&
		(*tmp++)=='X') // see http://www.genetics.wustl.edu/eddy/forester/NHX.pdf
	// [&&NHX...]
	{			
		p_itCurrent += 5;
		while (*(++p_itCurrent) != ']')
		{
		  comment += *(p_itCurrent);
		}
		++p_itCurrent;				// move over "]"
	}
	else // [...]
	{
		// Skip over the text in []
		++p_itCurrent;
		while (*(p_itCurrent) != ']')
            ++p_itCurrent;				
		++p_itCurrent;	// move over "]"
		
	}
  }
  if (comment.size())
	LOG(10,<<"comment ="<<comment<<endl);

  return comment;
}



MDOUBLE getDistance(vector<char>::const_iterator &p_itCurrent) {
	string sTempNumber;
	for ( ; isdigit(*p_itCurrent) || (*p_itCurrent)==PERIOD || (*p_itCurrent)=='E'|| (*p_itCurrent)=='e'|| (*p_itCurrent)=='-' || (*p_itCurrent)=='+'; ++p_itCurrent)	
		sTempNumber += (*p_itCurrent);
	MDOUBLE dDistance = string2double(sTempNumber);
	return dDistance;
}