File: tree_to_labels.R

package info (click to toggle)
r-cran-rotl 3.1.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 1,384 kB
  • sloc: sh: 9; makefile: 5
file content (47 lines) | stat: -rw-r--r-- 1,453 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
## Function to extract tip and edge labels from newick formatted strings
## useful when the tree is too small to be read in by ape/rncl.
## tr needs to be a newick formatted tree string
## - missing tips are removed (OK for OTL as it won't happen)
tree_to_labels <- function(tr, remove_quotes = TRUE) {
  n_right <- unlist(gregexpr("\\)", tr))
  n_left <- unlist(gregexpr("\\(", tr))

  if (n_right[1] == -1) n_right <- 0 else n_right <- length(n_right)
  if (n_left[1] == -1) n_left <- 0 else n_left <- length(n_left)

  if (!identical(n_right, n_left)) {
    stop("invalid newick string, numbers of ( and ) don't match")
  }

  ## remove white spaces
  tr <- gsub("\\s+", "", tr)

  ## remove branch lengths
  tr <- gsub(":[0-9]+(\\.[0-9]+)?", "", tr)

  ## TODO?: remove comments

  if (n_right < 1) {
    ## if only 1 tip
    tip_lbl <- gsub(";$", "", tr)
    edge_lbl <- character(0)
  } else {
    ## extract edge labels
    edge_lbl <- unlist(strsplit(tr, ")"))
    edge_lbl <- grep("^[^\\(]", edge_lbl, value = T)
    edge_lbl <- gsub("(,|;).*$", "", edge_lbl)
    edge_lbl <- edge_lbl[nzchar(edge_lbl)]

    ## extract tips
    tip_lbl <- unlist(strsplit(tr, ","))
    tip_lbl <- gsub("^\\(*", "", tip_lbl)
    tip_lbl <- gsub("\\).*$", "", tip_lbl)
    tip_lbl <- tip_lbl[nzchar(tip_lbl)]
  }

  if (remove_quotes) {
    tip_lbl <- gsub("^(\\\"|\\\')(.+)(\\\'|\\\")$", "\\2", tip_lbl)
  }

  list(tip_label = tip_lbl, edge_label = edge_lbl)
}