File: xml_structure.R

package info (click to toggle)
r-cran-xml2 1.4.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 976 kB
  • sloc: cpp: 1,826; xml: 333; javascript: 238; ansic: 178; sh: 71; makefile: 6
file content (100 lines) | stat: -rw-r--r-- 2,955 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
#' Show the structure of an html/xml document.
#'
#' Show the structure of an html/xml document without displaying any of
#' the values. This is useful if you want to get a high level view of the
#' way a document is organised. Compared to `xml_structure`,
#' `html_structure` prints the id and class attributes.
#'
#' @param x HTML/XML document (or part there of)
#' @param indent Number of spaces to ident
#' @inheritParams base::cat
#' @export
#' @examples
#' xml_structure(read_xml("<a><b><c/><c/></b><d/></a>"))
#'
#' rproj <- read_html(system.file("extdata", "r-project.html", package = "xml2"))
#' xml_structure(rproj)
#' xml_structure(xml_find_all(rproj, ".//p"))
#'
#' h <- read_html("<body><p id = 'a'></p><p class = 'c d'></p></body>")
#' html_structure(h)
xml_structure <- function(x, indent = 2, file = "") {
  cat(file = file)
  tree_structure(x, indent = indent, html = FALSE, file = file)
}

#' @export
#' @rdname xml_structure
html_structure <- function(x, indent = 2, file = "") {
  cat(file = file)
  tree_structure(x, indent = indent, html = TRUE, file = file)
}

tree_structure <- function(x, indent = 2, html = FALSE, file = "") {
  UseMethod("tree_structure")
}

#' @export
tree_structure.xml_missing <- function(x, indent = 2, html = FALSE, file = "") {
  NA_character_
}

#' @export
tree_structure.xml_nodeset <- function(x, indent = 2, html = FALSE, file = "") {
  for (i in seq_along(x)) {
    cat("[[", i, "]]\n", sep = "", file = file, append = TRUE)
    print_xml_structure(x[[i]], indent = indent, html = html, file = file)
    cat("\n", file = file, append = TRUE)
  }

  invisible()
}

#' @export
tree_structure.xml_node <- function(x, indent = 2, html = FALSE, file = "") {
  print_xml_structure(x, indent = indent, html = html, file = file)
  invisible()
}

print_xml_structure <- function(x, prefix = 0, indent = 2, html = FALSE, file = "") {
  padding <- paste(rep(" ", prefix), collapse = "")
  type <- xml_type(x)

  if (type == "element") {
    attr <- xml_attrs(x)
    if (html) {
      html_attrs <- list()
      if ("id" %in% names(attr)) {
        html_attrs$id <- paste0("#", attr[["id"]])
        attr <- attr[setdiff(names(attr), "id")]
      }

      if ("class" %in% names(attr)) {
        html_attrs$class <- paste0(".", gsub(" ", ".", attr[["class"]]))
        attr <- attr[setdiff(names(attr), "class")]
      }

      attr_str <- paste(unlist(html_attrs), collapse = " ")
    } else {
      attr_str <- ""
    }

    if (length(attr) > 0) {
      attr_str <- paste0(attr_str, " [", paste0(names(attr), collapse = ", "), "]")
    }

    node <- paste0("<", xml_name(x), attr_str, ">")

    cat(padding, node, "\n", sep = "", file = file, append = TRUE)
    lapply(
      xml_contents(x),
      print_xml_structure,
      prefix = prefix + indent,
      indent = indent,
      html = html,
      file = file
    )
  } else {
    cat(padding, "{", type, "}\n", sep = "", file = file, append = TRUE)
  }
}