File: xml_parse.R

package info (click to toggle)
r-cran-xml2 1.1.0-1~bpo8%2B1
  • links: PTS, VCS
  • area: main
  • in suites: jessie-backports
  • size: 852 kB
  • sloc: cpp: 1,057; xml: 115; sh: 53; ansic: 12; makefile: 6
file content (128 lines) | stat: -rw-r--r-- 4,829 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
#' Read HTML or XML.
#'
#' @param x A string, a connection, or a raw vector.
#'
#'   A string can be either a path, a url or literal xml. Urls will
#'   be converted into connections either using \code{base::url} or, if
#'   installed, \code{curl::curl}. Local paths ending in \code{.gz},
#'   \code{.bz2}, \code{.xz}, \code{.zip} will be automatically uncompressed.
#'
#'   If a connection, the complete connection is read into a raw vector before
#'   being parsed.
#' @param encoding Specify a default encoding for the document. Unless
#'   otherwise specified XML documents are assumed to be in UTF-8 or
#'   UTF-16. If the document is not UTF-8/16, and lacks an explicit
#'   encoding directive, this allows you to supply a default.
#' @param ... Additional arguments passed on to methods.
#' @param as_html Optionally parse an xml file as if it's html.
#' @param base_url When loading from a connection, raw vector or literal
#'   html/xml, this allows you to specify a base url for the document. Base
#'   urls are used to turn relative urls into absolute urls.
#' @param n If \code{file} is a connection, the number of bytes to read per
#'    iteration. Defaults to 64kb.
#' @param verbose When reading from a slow connection, this prints some
#'    output on every iteration so you know its working.
#' @param options Set parsing options for the libxml2 parser. Zero of more of
#' \Sexpr[results=rd]{xml2:::describe_options(xml2:::xml_parse_options())}
#' @return An XML document. HTML is normalised to valid XML - this may not
#'   be exactly the same transformation performed by the browser, but it's
#'   a reasonable approximation.
#' @export
#' @examples
#' # Literal xml/html is useful for small examples
#' read_xml("<foo><bar /></foo>")
#' read_html("<html><title>Hi<title></html>")
#' read_html("<html><title>Hi")
#'
#' # From a local path
#' read_html(system.file("extdata", "r-project.html", package = "xml2"))
#'
#' # From a url
#' cd <- read_xml("http://www.xmlfiles.com/examples/cd_catalog.xml")
#' me <- read_html("http://had.co.nz")
read_xml <- function(x, encoding = "", ..., as_html = FALSE, options = "NOBLANKS") {
  UseMethod("read_xml")
}

#' @export
#' @rdname read_xml
read_html <- function(x, encoding = "", ..., options = c("RECOVER", "NOERROR", "NOBLANKS")) {
  UseMethod("read_html")
}

#' @export
read_html.default <- function(x, encoding = "", ..., options = c("RECOVER", "NOERROR", "NOBLANKS")) {
  options <- parse_options(options, xml_parse_options())

  suppressWarnings(read_xml(x, encoding = encoding, ..., as_html = TRUE, options = options))
}

#' @export
read_html.response <- function(x, encoding = "", options = c("RECOVER",
    "NOERROR", "NOBLANKS"), ...) {
  need_package("httr")

  options <- parse_options(options, xml_parse_options())
  content <- httr::content(x, as = "raw")
  xml2::read_html(content, encoding = encoding, options = options, ...)
}

#' @export
#' @rdname read_xml
read_xml.character <- function(x, encoding = "", ..., as_html = FALSE,
                               options = "NOBLANKS") {

  options <- parse_options(options, xml_parse_options())
  if (grepl("<|>", x)) {
    read_xml.raw(charToRaw(enc2utf8(x)), "UTF-8", ..., as_html = as_html, options = options)
  } else {
    con <- path_to_connection(x)
    if (inherits(con, "connection")) {
      read_xml.connection(con, encoding = encoding, ..., as_html = as_html,
        base_url = x, options = options)
    } else {
      doc <- doc_parse_file(con, encoding = encoding, as_html = as_html,
        options = options)
      xml_document(doc)
    }
  }
}

#' @export
#' @rdname read_xml
read_xml.raw <- function(x, encoding = "", base_url = "", ...,
                         as_html = FALSE, options = "NOBLANKS") {
  options <- parse_options(options, xml_parse_options())

  doc <- doc_parse_raw(x, encoding = encoding, base_url = base_url,
    as_html = as_html, options = options)
  xml_document(doc)
}

#' @export
#' @rdname read_xml
read_xml.connection <- function(x, encoding = "", n = 64 * 1024,
                                verbose = FALSE, ..., base_url = "",
                                as_html = FALSE, options = "NOBLANKS") {
  options <- parse_options(options, xml_parse_options())

  if (!isOpen(x)) {
    open(x, "rb")
    on.exit(close(x))
  }

  raw <- read_connection_(x, n)
  read_xml.raw(raw, encoding = encoding, base_url = base_url, as_html =
    as_html, options = options)
}

#' @export
read_xml.response <- function(x, encoding = "", base_url = "", ...,
                              as_html = FALSE, options = "NOBLANKS") {
  need_package("httr")

  options <- parse_options(options, xml_parse_options())
  content <- httr::content(x, as = "raw")
  xml2::read_xml(content, encoding = encoding, base_url = base_url,
    as_html = as_html, option = options, ...)
}