1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288
|
/*
BOOSTER: BOOtstrap Support by TransfER:
BOOSTER is an alternative method to compute bootstrap branch supports
in large trees. It uses transfer distance between bipartitions, instead
of perfect match.
Copyright (C) 2017 Frederic Lemoine, Jean-Baka Domelevo Entfellner, Olivier Gascuel
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
#ifndef _TREE_H_
#define _TREE_H_
#include "hashtables_bfields.h" /* for the hashtables to store taxa names on the branches */
#include "hashmap.h"
#include "io.h"
#include <ctype.h>
#define TRUE 1
#define FALSE 0
#define MIN_BRLEN 1e-8
#define MAX_TREELENGTH 10000000 /* more or less 10MB for a tree file in NH format */
#define MAX_NODE_DEPTH 100000 /* max depth for nodes in the tree */
#define MAX_NAMELENGTH 255 /* max length of a taxon name */
#define MAX_COMMENTLENGTH 255 /* max length of a comment string in NHX format */
/* TYPES */
/* Every node in our binary trees has several neighbours with indices 0, 1, 2.... We allow polytomies of any degree.
An internal node with no multifurcation has 3 outgoing directions/neighbours.
In rooted trees, the interpretation is the following:
- for internal nodes, direction 0 is to the father, other directions are to the sons
- for tips (leaves), direction 0 is to the father, no other neighbours
- the root has only two neighbours.
So it's easy to check whether a node is a leaf (one neighbour) or the root of a rooted tree (two neighbours).
For unrooted trees, the interpretation is the same, except that no node has two neighbours.
The pseudo-root (from the NH import) is three-furcated, so behaves exactly like any other internal node.
It is not advisable to have several nodes of degree two in the same tree.
The root or pseudo-root is ALWAYS assigned id 0 at beginning. May change later, upon rerooting.
In any case, it is always pointed to by tree->node0. */
typedef struct __Node {
char* name;
char* comment; /* for further use: store any comment (e.g. from NHX format) */
int id; /* unique id attributed to the node */
short int nneigh; /* number of neighbours */
struct __Node** neigh; /* neighbour nodes */
struct __Edge** br; /* corresponding branches going from this node */
double depth; /* the depth of a node is its min distance to a leaf */
} Node;
/* Every edge connects two nodes.
By convention, all terminal branches will have the tip on their RIGHT end */
typedef struct __Edge {
int id;
Node *left, *right; /* in rooted trees the right end will always be the descendant.
In any case, a leaf is always on the right side of its branch. */
double brlen;
double branch_support;
int* subtype_counts[2]; /* first index is 0 for the left of the branch, 1 for its right side */
id_hash_table_t *hashtbl[2]; /* hashtables containing the ids of the taxa in each subtree */
/* index 0 corresponds to the left of the branch, index 1 to its right.
following our implementation, we only keep hashtbl[1] populated. */
short int had_zero_length; /* set at the moment when we read the tree, even though
we then immediately set the branch length to MIN_BRLEN */
short int has_branch_support;
int topo_depth; /* the topological depth is the number of taxa on the lightest side of the bipar */
} Edge;
typedef struct __Tree {
Node** a_nodes; /* array of node pointers */
Edge** a_edges; /* array of edge pointers */
Node* node0; /* the root or pseudo-root node */
int nb_nodes;
int nb_edges;
int nb_taxa;
char** taxa_names; /* store only once the taxa names */
int length_hashtables; /* the number of chained lists in the hashtables on the edges */
int next_avail_node_id;
int next_avail_edge_id;
int next_avail_taxon_id;
char** taxname_lookup_table;
} Tree;
/* FUNCTIONS */
/* UTILS/DEBUG: counting specific branches or nodes in the tree */
int count_zero_length_branches(Tree* tree);
int count_leaves(Tree* tree);
int count_roots(Tree* tree);
int count_multifurcations(Tree* tree);
int dir_a_to_b(Node* a, Node* b);
/* various statistics on branch support */
double mean_bootstrap_support(Tree* tree);
double median_bootstrap_support(Tree* tree);
int summary_bootstrap_support(Tree* tree, double* result);
/* parsing utils: discovering and dealing with tokens */
int index_next_toplevel_comma(char* in_str, int begin, int end);
int count_outer_commas(char* in_str, int begin, int end);
void strip_toplevel_parentheses(char* in_str, int begin, int end, int* pair);
int index_toplevel_colon(char* in_str, int begin, int end);
void parse_double(char* in_str, int begin, int end, double* location);
/* creating a node, a branch, a tree: to create a tree from scratch, not from parsing */
Node* new_node(const char* name, Tree* t, int degree);
Edge* new_edge(Tree* t);
Tree* new_tree(int nb_taxa, const char* name);
Node* graft_new_node_on_branch(Edge* target_edge, Tree* tree, double ratio_from_left, double new_edge_length, char* node_name);
/* collapsing a branch */
void collapse_branch(Edge* branch, Tree* tree);
/**
This function removes a taxon from the tree (identified by its taxon_id)
And recomputed the branch length of the branch it was branched on.
Also, the bootstrap support (if any) is recomputed, taking the maximum of the
supports of the joined branches
Be careful: The taxnames_lookup_table is modified after this function!
Do not use this function if you share the same taxnames_lookup_table in
several trees.
*/
void remove_taxon(int taxon_id, Tree* tree);
/**
If there remains 2 neighbors to connect_node
We connect them directly and delete connect_node
We keep l_edge and delete r_edge
-> If nneigh de connect node != 2 : Do nothing
This function deletes a node like that:
connect_node
l_edge r_edge
l_node *-------*--------* r_node
=> Careful: After this function, you may want to call
=> recompute_identifiers()
*/
void remove_single_node(Tree *tree, Node *connect_node);
/**
This method recomputes all the identifiers
of the nodes and of the edges
for which the tree->a_nodes is not null
or tree->a_edges is not null
It also recomputes the total number of edges
and nodes in the tree
*/
void recompute_identifiers(Tree *tree);
/**
This function shuffles the taxa of an input tree
*/
void shuffle_taxa(Tree *tree);
/* (re)rooting a tree */
void reroot_acceptable(Tree* t);
void unrooted_to_rooted(Tree* t);
/* To be called after a reroot*/
void reorient_edges(Tree *t);
void reorient_edges_recur(Node *n, Node *prev, Edge *e);
/* utility functions to deal with NH files */
unsigned int tell_size_of_one_tree(char* filename);
int copy_nh_stream_into_str(FILE* nh_stream, char* big_string);
/* actually parsing a tree */
void process_name_and_brlen(Node* son_node, Edge* edge, Tree* current_tree, char* in_str, int begin, int end);
Node* create_son_and_connect_to_father(Node* current_node, Tree* current_tree, int direction, char* in_str, int begin, int end);
void parse_substring_into_node(char* in_str, int begin, int end, Node* current_node, int has_father, Tree* current_tree);
Tree* parse_nh_string(char* in_str);
/* complete parse tree: parse NH string, update hashtables and subtype counts */
Tree *complete_parse_nh(char* big_string, char*** taxname_lookup_table);
/* taxname lookup table functions */
char** build_taxname_lookup_table(Tree* tree);
map_t build_taxid_hashmap(char** taxname_lookup_table, int nb_taxa);
void free_taxid_hashmap(map_t taxmap);
int free_hashmap_data(any_t arg,any_t key, any_t elemt);
char** get_taxname_lookup_table(Tree* tree);
Taxon_id get_tax_id_from_tax_name(char* str, char** lookup_table, int length);
/* (unnecessary/deprecated) multifurcation treatment */
void regraft_branch_on_node(Edge* branch, Node* target_node, int dir);
/***************************************************************
******************* neatly implementing tree traversals ******
***************************************************************/
void post_order_traversal_recur(Node* current, Node* origin, Tree* tree, void (*func)(Node*, Node*, Tree*));
void post_order_traversal(Tree* t, void (*func)(Node*, Node*, Tree*));
/* post order traversal with any data passed to the function call */
void post_order_traversal_data_recur(Node* current, Node* origin, Tree* tree, void*, void (*func)(Node*, Node*, Tree*, void*));
void post_order_traversal_data(Tree* t, void*, void (*func)(Node*, Node*, Tree*, void*));
void pre_order_traversal_recur(Node* current, Node* origin, Tree* tree, void (*func)(Node*, Node*, Tree*));
void pre_order_traversal(Tree* t, void (*func)(Node*, Node*, Tree*));
/* pre order traversal with any data passed to the function call */
void pre_order_traversal_data_recur(Node* current, Node* origin, Tree* tree, void* data, void (*func)(Node*, Node*, Tree*, void*));
void pre_order_traversal_data(Tree* t, void* data, void (*func)(Node*, Node*, Tree*, void*));
/* bootstrap values */
void update_bootstrap_supports_from_node_names(Tree* tree);
void update_bootstrap_supports_doer(Node* current, Node* origin, Tree* tree);
/* node depths */
void update_node_depths_post_doer(Node* target, Node* orig, Tree* t);
void update_node_depths_pre_doer(Node* target, Node* orig, Tree* t);
void update_node_depths_post_alltree(Tree* tree);
void update_node_depths_pre_alltree(Tree* tree);
/* topological depths */
void update_all_topo_depths_from_hashtables(Tree* tree);
int greatest_topo_depth(Tree* tree);
/* WORKING WITH HASHTABLES */
void update_hashtables_post_doer(Node* current, Node* orig, Tree* t);
void update_hashtables_pre_doer(Node* current, Node* orig, Tree* t);
void update_hashtables_post_alltree(Tree* tree);
void update_hashtables_pre_alltree(Tree* tree);
/* UNION AND INTERSECT CALCULATIONS FOR THE TRANSFER METHOD (from Bréhélin/Gascuel/Martin 2008) */
void update_i_c_post_order_ref_tree(Tree* ref_tree, Node* orig, Node* target, Tree* boot_tree, short unsigned** i_matrix, short unsigned** c_matrix);
void update_all_i_c_post_order_ref_tree(Tree* ref_tree, Tree* boot_tree, short unsigned** i_matrix, short unsigned** c_matrix);
void update_i_c_post_order_boot_tree(Tree* ref_tree, Tree* boot_tree, Node* orig, Node* target, short unsigned** i_matrix, short unsigned** c_matrix, short unsigned** hamming, short unsigned* min_dist, short unsigned* min_dist_edge);
void update_all_i_c_post_order_boot_tree(Tree* ref_tree, Tree* boot_tree, short unsigned** i_matrix, short unsigned** c_matrix, short unsigned** hamming, short unsigned* min_dist, short unsigned* min_dist_edge);
/*Generate Random Tree*/
/**
- nbr_taxa: Number of leaves in the output tree
- taxa_names: array of leaf names: must be NULL if you do not want to use it
(names will be numbered in this case)
- The output tree has branch lengths attributed using a normal distribution N(0.1,0.05), and any br len < 0 is set to 0
*/
Tree * gen_rand_tree(int nbr_taxa, char **taxa_names);
/* writing a tree */
void write_nh_tree(Tree* tree, FILE* stream);
void write_subtree_to_stream(Node* node, Node* node_from, FILE* stream);
/* freeing stuff */
void free_edge(Edge* edge);
void free_node(Node* node);
void free_tree(Tree* tree);
#endif /* _TREE_H_ */
|