File: EvolutionaryTree.h

package info (click to toggle)
probcons 1.12-15
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 664 kB
  • sloc: cpp: 7,263; xml: 567; makefile: 114; sh: 21
file content (176 lines) | stat: -rw-r--r-- 6,052 bytes parent folder | download | duplicates (27)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
/////////////////////////////////////////////////////////////////
// EvolutionaryTree.h
//
// Utilities for reading/writing multiple sequence data.
/////////////////////////////////////////////////////////////////

#ifndef EVOLUTIONARYTREE_H
#define EVOLUTIONARYTREE_H

#include <string>
#include <list>
#include <stdio.h>
#include "SafeVector.h"
#include "MultiSequence.h"
#include "Sequence.h"

using namespace std;

/////////////////////////////////////////////////////////////////
// TreeNode
//
// The fundamental unit for representing an alignment tree.  The
// guide tree is represented as a binary tree.
/////////////////////////////////////////////////////////////////

class TreeNode {
  int sequenceLabel;                  // sequence label
  TreeNode *left, *right, *parent;    // pointers to left, right children

  /////////////////////////////////////////////////////////////////
  // TreeNode::PrintNode()
  //
  // Internal routine used to print out the sequence comments
  // associated with the evolutionary tree, using a hierarchical
  // parenthesized format.
  /////////////////////////////////////////////////////////////////

  void PrintNode (ostream &outfile, const MultiSequence *sequences) const {

    // if this is a leaf node, print out the associated sequence comment
    if (sequenceLabel >= 0)
      outfile << sequences->GetSequence (sequenceLabel)->GetHeader();

    // otherwise, it must have two children; print out their subtrees recursively
    else {
      assert (left);
      assert (right);

      outfile << "(";
      left->PrintNode (outfile, sequences);
      outfile << " ";
      right->PrintNode (outfile, sequences);
      outfile << ")";
    }
  }

 public:

  /////////////////////////////////////////////////////////////////
  // TreeNode::TreeNode()
  //
  // Constructor for a tree node.  Note that sequenceLabel = -1
  // implies that the current node is not a leaf in the tree.
  /////////////////////////////////////////////////////////////////

  TreeNode (int sequenceLabel) : sequenceLabel (sequenceLabel),
    left (NULL), right (NULL), parent (NULL) {
    assert (sequenceLabel >= -1);
  }

  /////////////////////////////////////////////////////////////////
  // TreeNode::~TreeNode()
  //
  // Destructor for a tree node.  Recursively deletes all children.
  /////////////////////////////////////////////////////////////////

  ~TreeNode (){
    if (left){ delete left; left = NULL; }
    if (right){ delete right; right = NULL; }
    parent = NULL;
  }


  // getters
  int GetSequenceLabel () const { return sequenceLabel; }
  TreeNode *GetLeftChild () const { return left; }
  TreeNode *GetRightChild () const { return right; }
  TreeNode *GetParent () const { return parent; }

  // setters
  void SetSequenceLabel (int sequenceLabel){ this->sequenceLabel = sequenceLabel; assert (sequenceLabel >= -1); }
  void SetLeftChild (TreeNode *left){ this->left = left; }
  void SetRightChild (TreeNode *right){ this->right = right; }
  void SetParent (TreeNode *parent){ this->parent = parent; }

  /////////////////////////////////////////////////////////////////
  // TreeNode::ComputeTree()
  //
  // Routine used to compute an evolutionary tree based on the
  // given distance matrix.  We assume the distance matrix has the
  // form, distMatrix[i][j] = expected accuracy of aligning i with j.
  /////////////////////////////////////////////////////////////////

  static TreeNode *ComputeTree (const VVF &distMatrix){

    int numSeqs = distMatrix.size();                 // number of sequences in distance matrix
    VVF distances (numSeqs, VF (numSeqs));           // a copy of the distance matrix
    SafeVector<TreeNode *> nodes (numSeqs, NULL);    // list of nodes for each sequence
    SafeVector<int> valid (numSeqs, 1);              // valid[i] tells whether or not the ith
                                                     // nodes in the distances and nodes array
                                                     // are valid

    // initialization: make a copy of the distance matrix
    for (int i = 0; i < numSeqs; i++)
      for (int j = 0; j < numSeqs; j++)
        distances[i][j] = distMatrix[i][j];

    // initialization: create all the leaf nodes
    for (int i = 0; i < numSeqs; i++){
      nodes[i] = new TreeNode (i);
      assert (nodes[i]);
    }

    // repeat until only a single node left
    for (int numNodesLeft = numSeqs; numNodesLeft > 1; numNodesLeft--){
      float bestProb = -1;
      pair<int,int> bestPair;

      // find the closest pair
      for (int i = 0; i < numSeqs; i++) if (valid[i]){
        for (int j = i+1; j < numSeqs; j++) if (valid[j]){
          if (distances[i][j] > bestProb){
            bestProb = distances[i][j];
            bestPair = make_pair(i, j);
          }
        }
      }

      // merge the closest pair
      TreeNode *newParent = new TreeNode (-1);
      newParent->SetLeftChild (nodes[bestPair.first]);
      newParent->SetRightChild (nodes[bestPair.second]);
      nodes[bestPair.first]->SetParent (newParent);
      nodes[bestPair.second]->SetParent (newParent);
      nodes[bestPair.first] = newParent;
      nodes[bestPair.second] = NULL;

      // now update the distance matrix
      for (int i = 0; i < numSeqs; i++) if (valid[i]){
        distances[bestPair.first][i] = distances[i][bestPair.first]
          = (distances[i][bestPair.first] + distances[i][bestPair.second]) * bestProb / 2;
      }

      // finally, mark the second node entry as no longer valid
      valid[bestPair.second] = 0;
    }

    assert (nodes[0]);
    return nodes[0];
  }

  /////////////////////////////////////////////////////////////////
  // TreeNode::Print()
  //
  // Print out the subtree associated with this node in a
  // parenthesized representation.
  /////////////////////////////////////////////////////////////////

  void Print (ostream &outfile, const MultiSequence *sequences) const {
    outfile << "Alignment tree: ";
    PrintNode (outfile, sequences);
    outfile << endl;
  }
};

#endif