File: xml_structure.R

package info (click to toggle)
r-cran-xml2 1.1.0-1~bpo8%2B1
  • links: PTS, VCS
  • area: main
  • in suites: jessie-backports
  • size: 852 kB
  • sloc: cpp: 1,057; xml: 115; sh: 53; ansic: 12; makefile: 6
file content (92 lines) | stat: -rw-r--r-- 2,610 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
#' Show the structure of an html/xml document.
#'
#' Show the structure of an html/xml document without displaying any of
#' the values. This is useful if you want to get a high level view of the
#' way a document is organised. Compared to \code{xml_structure},
#' \code{html_structure} prints the id and class attributes.
#'
#' @param x HTML/XML document (or part there of)
#' @param indent Number of spaces to ident
#' @export
#' @examples
#' xml_structure(read_xml("<a><b><c/><c/></b><d/></a>"))
#'
#' rproj <- read_html(system.file("extdata","r-project.html", package = "xml2"))
#' xml_structure(rproj)
#' xml_structure(xml_find_all(rproj, ".//p"))
#'
#' h <- read_html("<body><p id = 'a'></p><p class = 'c d'></p></body>")
#' html_structure(h)
xml_structure <- function(x, indent = 2) {
  tree_structure(x, indent = indent, html = FALSE)
}

#' @export
#' @rdname xml_structure
html_structure <- function(x, indent = 2) {
  tree_structure(x, indent = indent, html = TRUE)
}

tree_structure <- function(x, indent = 2, html = FALSE) {
  UseMethod("tree_structure")
}

#' @export
tree_structure.xml_missing <- function(x, indent = 2, html = FALSE) {
  NA_character_
}

#' @export
tree_structure.xml_nodeset <- function(x, indent = 2, html = FALSE) {
  for (i in seq_along(x)) {
    cat("[[", i, "]]\n", sep = "")
    print_xml_structure(x[[i]], indent = indent, html = html)
    cat("\n")
  }

  invisible()
}

#' @export
tree_structure.xml_node <-  function(x, indent = 2, html = FALSE) {
  print_xml_structure(x, indent = indent, html = html)
  invisible()
}

print_xml_structure <- function(x, prefix = 0, indent = 2, html = FALSE) {
  padding <- paste(rep(" ", prefix), collapse = "")
  type <- xml_type(x)

  if (type == "element") {

    attr <- xml_attrs(x)
    if (html) {
      html_attrs <- list()
      if ("id" %in% names(attr)) {
        html_attrs$id <- paste0("#", attr[["id"]])
        attr <- attr[setdiff(names(attr), "id")]
      }

      if ("class" %in% names(attr)) {
        html_attrs$class <- paste0(".", gsub(" ", ".", attr[["class"]]))
        attr <- attr[setdiff(names(attr), "class")]
      }

      attr_str <- paste(unlist(html_attrs), collapse = " ")
    } else {
      attr_str <- ""
    }

    if (length(attr) > 0) {
      attr_str <- paste0(attr_str, " [", paste0(names(attr), collapse = ", "), "]")
    }

    node <- paste0("<", xml_name(x), attr_str, ">")

    cat(padding, node, "\n", sep = "")
    lapply(xml_contents(x), print_xml_structure, prefix = prefix + indent,
      indent = indent, html = html)
  } else {
    cat(padding, "{", type, "}\n", sep = "")
  }
}