1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178
|
// $Id: readTree.cpp 5525 2008-12-19 20:17:05Z itaymay $
#include "definitions.h"
#include "errorMsg.h"
#include "someUtil.h"
#include "readTree.h"
#include <iostream>
using namespace std;
// forward declarations
//----------------------------------------------------------------------------------------------
// about reading tree topology from files:
// usually a tree topology is represented by a line like this
// (((Langur:0.8,Baboon:0.55):0.3,Human:0.44):0.5,Rat:0.02,(Cow:0.2,Horse:0.04):0.03);
// the syntax of such a line is (part, part, part, part)
// where part is either (part,part, part, ...):distace or name:distance
// or without the distance!
// it should notice that the tree is unrooted.
// if we look at the above file format, one can notice that the number of comas (",") is
// always one less than the number of leaves (synonyms for leaves are OTUs and external nodes)
// the function GetNumberOfLeaves counts the numnber of comas and returns the number of leaves.
// in the example below there are 6 leaves.
//*******************************************************************************
// constructors
//*******************************************************************************
vector<char> PutTreeFileIntoVector(istream &in) {
vector<char> tree_contents;
bool endWithDotComa = false;
char chTemp;
while (( !in.eof()) && (tree_contents.size() < MAX_FILE_SIZE))
{
in.get(chTemp);
#ifdef WIN32
if (chTemp == -52) return tree_contents; //tal addition.
#endif
if ( !isspace( chTemp ) )
tree_contents.push_back(chTemp);
if (chTemp == ';') {
endWithDotComa = true;
break;
}
}
if (tree_contents.size() >= MAX_FILE_SIZE) {
vector<string> err;
err.push_back("Error reading tree file. The tree file is too large");
errorMsg::reportError(err,1); // also quit the program
}
if (endWithDotComa == false) tree_contents.clear(); // remove junk from the last ; till the end of the file.
return tree_contents;
}
int GetNumberOfLeaves(const vector<char> &tree_contents) {
int iCommasCounter = 0;
vector<char>::const_iterator itCurrent = tree_contents.begin();
for ( ; itCurrent != tree_contents.end(); ++itCurrent ) {
if (*itCurrent==COMMA)
++iCommasCounter;
}
return ++iCommasCounter; //#leaves is always one more than number of comas
}
int GetNumberOfInternalNodes(const vector<char> &tree_contents) {
int iCloseCounter = 0;
vector<char>::const_iterator itCurrent = tree_contents.begin();
for ( ; itCurrent != tree_contents.end(); ++itCurrent ) {
if (*itCurrent==CLOSING_BRACE) ++iCloseCounter;
if (*itCurrent==CLOSING_BRACE2) ++iCloseCounter;
}
return iCloseCounter; //number of HTUs is always the number of ")"
}
bool verifyChar(vector<char>::const_iterator &p_itCurrent, const char p_cCharToFind) {
if ( (*p_itCurrent)==p_cCharToFind ) return true;
return false;
}
// IsAtomicPart decides whether we will now read a taxa name (return true),
// or read an OPENING_BRACE which will say us, that we will read a complicated strucure.
bool IsAtomicPart(const vector<char>::const_iterator p_itCurrent) {
if ( (*p_itCurrent)==OPENING_BRACE ) return false;
else if ( (*p_itCurrent)==OPENING_BRACE2 ) return false;
return true;
}
//-----------------------------------------------------------------------------
// there are 2 options for the tree format.
// either (name1:0.43, name2: 0.45 , (name3 : 2 , name 4: 5) : 3.332)
// or without the distances (name1, name2 , (name3 , name4) )
// here we return true if the tree file is with the distance, or false, if the tree file
// has not distances.
// if distances exist: after the name there will always be a colon
// if distance exist, also move the iterator, to the beggining of the number
//-----------------------------------------------------------------------------
bool DistanceExists(vector<char>::const_iterator& p_itCurrent) {
if ((*p_itCurrent)==COLON ) {
++p_itCurrent;
return true;
}
return false;
}
void clearPosibleComment(vector<char>::const_iterator& p_itCurrent) {
if ((*p_itCurrent)=='[' ) {
while (*(++p_itCurrent) != ']');
++p_itCurrent; // move over "]"
}
}
string readPosibleComment(vector<char>::const_iterator& p_itCurrent) {
string comment = "";
if ((*p_itCurrent)=='[' )
{
vector<char>::const_iterator tmp= (p_itCurrent+1);
if ((*tmp++)=='&' &&
(*tmp++)=='&' &&
(*tmp++)=='N' &&
(*tmp++)=='H' &&
(*tmp++)=='X') // see http://www.genetics.wustl.edu/eddy/forester/NHX.pdf
// [&&NHX...]
{
p_itCurrent += 5;
while (*(++p_itCurrent) != ']')
{
comment += *(p_itCurrent);
}
++p_itCurrent; // move over "]"
}
else // [...]
{
// Skip over the text in []
++p_itCurrent;
while (*(p_itCurrent) != ']')
++p_itCurrent;
++p_itCurrent; // move over "]"
}
}
if (comment.size())
LOG(10,<<"comment ="<<comment<<endl);
return comment;
}
MDOUBLE getDistance(vector<char>::const_iterator &p_itCurrent) {
string sTempNumber;
for ( ; isdigit(*p_itCurrent) || (*p_itCurrent)==PERIOD || (*p_itCurrent)=='E'|| (*p_itCurrent)=='e'|| (*p_itCurrent)=='-' || (*p_itCurrent)=='+'; ++p_itCurrent)
sTempNumber += (*p_itCurrent);
MDOUBLE dDistance = string2double(sTempNumber);
return dDistance;
}
|