1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286
|
/*
* suffixtree.cpp
*
*
* Created by Pat Schloss on 12/15/08.
* Copyright 2008 Patrick D. Schloss. All rights reserved.
*
* This is my half-assed attempt to implement a suffix tree. This is a cobbled together algorithm using materials that
* I found at http://marknelson.us/1996/08/01/suffix-trees/ and:
*
* Ukkonen E. (1995). On-line construction of suffix trees. Algorithmica 14 (3): 249--260
* Gusfield, Dan (1999). Algorithms on Strings, Trees and Sequences: Computer Science and Computational Biology.
* USA: Cambridge University Press
*
* The Ukkonen paper is the seminal paper describing the on-line method of constructing a suffix tree.
*
* I have chosen to store the nodes of the tree as a vector of pointers to SuffixNode objects. The root is stored at
* nodeVector[0]. Each tree also stores the sequence name and the string that corresponds to the actual sequence.
* Finally, this class provides a way of counting the number of suffixes that are needed in one tree to generate a new
* sequence (countSuffixes). This method is used to determine similarity between sequences and was inspired by the
* article and Perl source code provided at http://www.ddj.com/web-development/184416093.
*
*/
#include "sequence.hpp"
#include "suffixnodes.hpp"
#include "suffixtree.hpp"
//********************************************************************************************************************
inline bool compareParents(SuffixNode* left, SuffixNode* right){// this is necessary to print the tree and to sort the
return (left->getParentNode() < right->getParentNode()); // nodes in order of their parent
}
//********************************************************************************************************************
SuffixTree::SuffixTree(){ m = MothurOut::getInstance(); }
//********************************************************************************************************************
SuffixTree::~SuffixTree(){
for(int i=0;i<nodeVector.size();i++){ delete nodeVector[i]; }
nodeVector.clear();
}
//********************************************************************************************************************
void SuffixTree::loadSequence(Sequence seq){
nodeCounter = 0; // initially there are 0 nodes in the tree
activeStartPosition = 0;
activeEndPosition = -1;
seqName = seq.getName();
sequence = seq.convert2ints();
sequence += '5'; // this essentially concatenates a '$' to the end of the sequence to
int seqLength = sequence.length(); // make it a cononical suffix tree
nodeVector.push_back(new SuffixBranch(-1, 0, -1)); // enter the root of the suffix tree
activeNode = root = 0;
string hold;
for(int i=0;i<seqLength;i++){
addPrefix(i); // step through the sequence adding each prefix
}
}
//********************************************************************************************************************
string SuffixTree::getSeqName() {
return seqName;
}
//********************************************************************************************************************
void SuffixTree::print(){
vector<SuffixNode*> hold = nodeVector;
sort(hold.begin(), hold.end(), compareParents);
m->mothurOut("Address\t\tParent\tNode\tSuffix\tStartC\tEndC\tSuffix"); m->mothurOutEndLine();
for(int i=1;i<=nodeCounter;i++){
hold[i]->print(sequence, i);
}
}
//********************************************************************************************************************
int SuffixTree::countSuffixes(string compareSequence, int& minValue){ // here we count the number of suffix parts
// we need to rewrite a user supplied sequence. if the
int numSuffixes = 0; // count exceeds the supplied minValue, bail out. The
int seqLength = compareSequence.length(); // time complexity should be O(L)
int position = 0;
int presentNode = 0;
while(position < seqLength){ // while the position in the query sequence isn't at the end...
if(numSuffixes > minValue) { return 1000000; } // bail if the count gets too high
int newNode = nodeVector[presentNode]->getChild(compareSequence[position]); // see if the current node has a
// child that matches the next character in the query
if(newNode == -1){
if(presentNode == 0){ position++; } // if not, go back to the root and increase the count
numSuffixes++; // by one.
presentNode = 0;
}
else{ // if there is, move to that node and see how far down
presentNode = newNode; // it we can get
for(int i=nodeVector[newNode]->getStartCharPos(); i<=nodeVector[newNode]->getEndCharPos(); i++){
if(compareSequence[position] == sequence[i]){
position++; // as long as the query and branch agree, keep going
}
else{
numSuffixes++; // if there is a mismatch, increase the number of
presentNode = 0; // suffixes and go back to the root
break;
}
}
}
// if we get all the way through the node we'll go to the top of the while loop and find the child node
// that corresponds to what we are interested in
}
numSuffixes--; // the method puts an extra count on numSuffixes
if(numSuffixes < minValue) { minValue = numSuffixes; } // if the count is less than the previous minValue,
return numSuffixes; // change the value and return the number of suffixes
}
//********************************************************************************************************************
int SuffixTree::countSuffixes(string compareSequence){ // here we count the number of suffix parts
// we need to rewrite a user supplied sequence. if the
int numSuffixes = 0; // count exceeds the supplied minValue, bail out. The
int seqLength = compareSequence.length(); // time complexity should be O(L)
int position = 0;
int presentNode = 0;
while(position < seqLength){ // while the position in the query sequence isn't at the end...
int newNode = nodeVector[presentNode]->getChild(compareSequence[position]); // see if the current node has a
// child that matches the next character in the query
if(newNode == -1){
if(presentNode == 0){ position++; } // if not, go back to the root and increase the count
numSuffixes++; // by one.
presentNode = 0;
}
else{ // if there is, move to that node and see how far down
presentNode = newNode; // it we can get
for(int i=nodeVector[newNode]->getStartCharPos(); i<=nodeVector[newNode]->getEndCharPos(); i++){
if(compareSequence[position] == sequence[i]){
position++; // as long as the query and branch agree, keep going
}
else{
numSuffixes++; // if there is a mismatch, increase the number of
presentNode = 0; // suffixes and go back to the root
break;
}
}
}
// if we get all the way through the node we'll go to the top of the while loop and find the child node
// that corresponds to what we are interested in
}
numSuffixes--; // the method puts an extra count on numSuffixes
return numSuffixes; // change the value and return the number of suffixes
}
//********************************************************************************************************************
void SuffixTree::canonize(){ // if you have to ask how this works, you don't really want to know and this really
// isn't the place to ask.
if ( isExplicit() == 0 ) { // if the node has no children...
int tempNodeIndex = nodeVector[activeNode]->getChild(sequence[activeStartPosition]);
SuffixNode* tempNode = nodeVector[tempNodeIndex];
int span = tempNode->getEndCharPos() - tempNode->getStartCharPos();
while ( span <= ( activeEndPosition - activeStartPosition ) ) {
activeStartPosition = activeStartPosition + span + 1;
activeNode = tempNodeIndex;
if ( activeStartPosition <= activeEndPosition ) {
tempNodeIndex = nodeVector[tempNodeIndex]->getChild(sequence[activeStartPosition]);
tempNode = nodeVector[tempNodeIndex];
span = tempNode->getEndCharPos() - tempNode->getStartCharPos();
}
}
}
}
//********************************************************************************************************************
int SuffixTree::split(int nodeIndex, int position){ // leaves stay leaves, etc, to split a leaf we make a new interior
// node and reconnect everything
SuffixNode* node = nodeVector[nodeIndex]; // get the node that needs to be split
SuffixNode* parentNode = nodeVector[node->getParentNode()]; // get it's parent node
parentNode->eraseChild(sequence[node->getStartCharPos()]); // erase the present node from the registry of its parent
nodeCounter++;
SuffixNode* newNode = new SuffixBranch(node->getParentNode(), node->getStartCharPos(), node->getStartCharPos() + activeEndPosition - activeStartPosition); // create a new node that will link the parent with the old child
parentNode->setChildren(sequence[newNode->getStartCharPos()], nodeCounter);// give the parent the new child
nodeVector.push_back(newNode);
node->setParentNode(nodeCounter); // give the original node the new node as its parent
newNode->setChildren(sequence[node->getStartCharPos() + activeEndPosition - activeStartPosition + 1], nodeIndex);
// put the original node in the registry of the new node's children
newNode->setSuffixNode(activeNode);//link the new node with the old active node
// recalculate the startCharPosition of the outermost node
node->setStartCharPos(node->getStartCharPos() + activeEndPosition - activeStartPosition + 1 );
return node->getParentNode();
}
//********************************************************************************************************************
void SuffixTree::makeSuffixLink(int& previous, int present){
// here we link the nodes that are suffixes of one another to rapidly speed through the tree
if ( previous > 0 ) { nodeVector[previous]->setSuffixNode(present); }
else { /* do nothing */ }
previous = present;
}
//********************************************************************************************************************
void SuffixTree::addPrefix(int prefixPosition){
int lastParentNode = -1; // we need to place a new prefix in the suffix tree
int parentNode = 0;
while(1){
parentNode = activeNode;
if(isExplicit() == 1){ // if the node is explicit (has kids), try to follow it down the branch if its there...
if(nodeVector[activeNode]->getChild(sequence[prefixPosition]) != -1){ // break out and get next prefix...
break;
}
else{ // ...otherwise continue, we'll need to make a new node later on...
}
}
else{ // if it's not explicit (no kids), read through and see if all of the chars agree...
int tempNode = nodeVector[activeNode]->getChild(sequence[activeStartPosition]);
int span = activeEndPosition - activeStartPosition;
if(sequence[nodeVector[tempNode]->getStartCharPos() + span + 1] == sequence[prefixPosition] ){
break; // if the existing suffix agrees with the new one, grab a new prefix...
}
else{
parentNode = split(tempNode, prefixPosition); // ... otherwise we need to split the node
}
}
nodeCounter++; // we need to generate a new node here if the kid didn't exist, or we split a node
SuffixNode* newSuffixLeaf = new SuffixLeaf(parentNode, prefixPosition, sequence.length()-1);
nodeVector[parentNode]->setChildren(sequence[prefixPosition], nodeCounter);
nodeVector.push_back(newSuffixLeaf);
makeSuffixLink( lastParentNode, parentNode ); // make a suffix link for the parent node
if(nodeVector[activeNode]->getParentNode() == -1){ // move along the start position for the tree
activeStartPosition++;
}
else {
activeNode = nodeVector[activeNode]->getSuffixNode();
}
canonize(); // frankly, i'm not entirely clear on what canonize does.
}
makeSuffixLink( lastParentNode, parentNode );
activeEndPosition++; // move along the end position for the tree
canonize(); // frankly, i'm not entirely clear on what canonize does.
}
//********************************************************************************************************************
|