1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176
|
/////////////////////////////////////////////////////////////////
// EvolutionaryTree.h
//
// Utilities for reading/writing multiple sequence data.
/////////////////////////////////////////////////////////////////
#ifndef EVOLUTIONARYTREE_H
#define EVOLUTIONARYTREE_H
#include <string>
#include <list>
#include <stdio.h>
#include "SafeVector.h"
#include "MultiSequence.h"
#include "Sequence.h"
using namespace std;
/////////////////////////////////////////////////////////////////
// TreeNode
//
// The fundamental unit for representing an alignment tree. The
// guide tree is represented as a binary tree.
/////////////////////////////////////////////////////////////////
class TreeNode {
int sequenceLabel; // sequence label
TreeNode *left, *right, *parent; // pointers to left, right children
/////////////////////////////////////////////////////////////////
// TreeNode::PrintNode()
//
// Internal routine used to print out the sequence comments
// associated with the evolutionary tree, using a hierarchical
// parenthesized format.
/////////////////////////////////////////////////////////////////
void PrintNode (ostream &outfile, const MultiSequence *sequences) const {
// if this is a leaf node, print out the associated sequence comment
if (sequenceLabel >= 0)
outfile << sequences->GetSequence (sequenceLabel)->GetHeader();
// otherwise, it must have two children; print out their subtrees recursively
else {
assert (left);
assert (right);
outfile << "(";
left->PrintNode (outfile, sequences);
outfile << " ";
right->PrintNode (outfile, sequences);
outfile << ")";
}
}
public:
/////////////////////////////////////////////////////////////////
// TreeNode::TreeNode()
//
// Constructor for a tree node. Note that sequenceLabel = -1
// implies that the current node is not a leaf in the tree.
/////////////////////////////////////////////////////////////////
TreeNode (int sequenceLabel) : sequenceLabel (sequenceLabel),
left (NULL), right (NULL), parent (NULL) {
assert (sequenceLabel >= -1);
}
/////////////////////////////////////////////////////////////////
// TreeNode::~TreeNode()
//
// Destructor for a tree node. Recursively deletes all children.
/////////////////////////////////////////////////////////////////
~TreeNode (){
if (left){ delete left; left = NULL; }
if (right){ delete right; right = NULL; }
parent = NULL;
}
// getters
int GetSequenceLabel () const { return sequenceLabel; }
TreeNode *GetLeftChild () const { return left; }
TreeNode *GetRightChild () const { return right; }
TreeNode *GetParent () const { return parent; }
// setters
void SetSequenceLabel (int sequenceLabel){ this->sequenceLabel = sequenceLabel; assert (sequenceLabel >= -1); }
void SetLeftChild (TreeNode *left){ this->left = left; }
void SetRightChild (TreeNode *right){ this->right = right; }
void SetParent (TreeNode *parent){ this->parent = parent; }
/////////////////////////////////////////////////////////////////
// TreeNode::ComputeTree()
//
// Routine used to compute an evolutionary tree based on the
// given distance matrix. We assume the distance matrix has the
// form, distMatrix[i][j] = expected accuracy of aligning i with j.
/////////////////////////////////////////////////////////////////
static TreeNode *ComputeTree (const VVF &distMatrix){
int numSeqs = distMatrix.size(); // number of sequences in distance matrix
VVF distances (numSeqs, VF (numSeqs)); // a copy of the distance matrix
SafeVector<TreeNode *> nodes (numSeqs, NULL); // list of nodes for each sequence
SafeVector<int> valid (numSeqs, 1); // valid[i] tells whether or not the ith
// nodes in the distances and nodes array
// are valid
// initialization: make a copy of the distance matrix
for (int i = 0; i < numSeqs; i++)
for (int j = 0; j < numSeqs; j++)
distances[i][j] = distMatrix[i][j];
// initialization: create all the leaf nodes
for (int i = 0; i < numSeqs; i++){
nodes[i] = new TreeNode (i);
assert (nodes[i]);
}
// repeat until only a single node left
for (int numNodesLeft = numSeqs; numNodesLeft > 1; numNodesLeft--){
float bestProb = -1;
pair<int,int> bestPair;
// find the closest pair
for (int i = 0; i < numSeqs; i++) if (valid[i]){
for (int j = i+1; j < numSeqs; j++) if (valid[j]){
if (distances[i][j] > bestProb){
bestProb = distances[i][j];
bestPair = make_pair(i, j);
}
}
}
// merge the closest pair
TreeNode *newParent = new TreeNode (-1);
newParent->SetLeftChild (nodes[bestPair.first]);
newParent->SetRightChild (nodes[bestPair.second]);
nodes[bestPair.first]->SetParent (newParent);
nodes[bestPair.second]->SetParent (newParent);
nodes[bestPair.first] = newParent;
nodes[bestPair.second] = NULL;
// now update the distance matrix
for (int i = 0; i < numSeqs; i++) if (valid[i]){
distances[bestPair.first][i] = distances[i][bestPair.first]
= (distances[i][bestPair.first] + distances[i][bestPair.second]) * bestProb / 2;
}
// finally, mark the second node entry as no longer valid
valid[bestPair.second] = 0;
}
assert (nodes[0]);
return nodes[0];
}
/////////////////////////////////////////////////////////////////
// TreeNode::Print()
//
// Print out the subtree associated with this node in a
// parenthesized representation.
/////////////////////////////////////////////////////////////////
void Print (ostream &outfile, const MultiSequence *sequences) const {
outfile << "Alignment tree: ";
PrintNode (outfile, sequences);
outfile << endl;
}
};
#endif
|