File: htmlLinks.R

package info
r-cran-xml 3.99-0.18-1
  • area: main
  • in suites: trixie
  • size: 3,688 kB
  • sloc: ansic: 6,656; xml: 2,890; asm: 486; sh: 12; makefile: 2
file content (50 lines, 1,214 bytes)
readHTMLLinks = getHTMLLinks =
function(doc, externalOnly = TRUE, xpQuery = "//a/@href", baseURL = docName(doc),
          relative = FALSE)
{
  if(is.character(doc))
     doc = htmlParse(doc)

    # put a . in front of the xpQuery if we have a node rather than a document.
  if(is(doc, "XMLInternalNode") && grepl("^/", xpQuery))
     xpQuery = sprintf(".%s", xpQuery)

  links = as.character(getNodeSet(doc, xpQuery))
  links = if(externalOnly)
             grep("^#", links, value = TRUE, invert = TRUE)
          else
             links

        #XXX Put base URL onto these links, relative!
  if(relative)
    sapply(links, getRelativeURL, baseURL)
  else
    links
}
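
  # Example usage (illustrative sketch only; the URL below is hypothetical and
  # htmlParse() needs network access for remote documents):
  #
  #   doc = htmlParse("http://www.example.org/index.html")
  #   getHTMLLinks(doc)                          # href values, minus within-page "#..." anchors
  #   getHTMLLinks(doc, externalOnly = FALSE)    # keep the "#..." anchors as well
  #   getHTMLLinks(doc, relative = TRUE)         # resolve the links against baseURL (docName(doc) by default)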



getHTMLExternalFiles =
function(doc, xpQuery = c("//img/@src", "//link/@href", "//script/@src", "//embed/@src"),
           baseURL = docName(doc), relative = FALSE, asNodes = FALSE, recursive = FALSE)
{
  if(is.character(doc))
     doc = htmlParse(doc)

     # when returning nodes, strip the trailing /@attr from each query so we select the elements themselves.
  if(asNodes)
     xpQuery = gsub("/@[a-zA-Z-]+$", "", xpQuery)

  nodes = getNodeSet(doc, xpQuery)

  if(asNodes)
    return(nodes)

  nodes = as.character(nodes)
  
  ans = if(relative)
          getRelativeURL(nodes, baseURL)
        else
          nodes
   # XXX the recursive argument is accepted but not yet implemented.
  ans
}
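
  # Example usage (illustrative sketch only; the URL below is hypothetical):
  #
  #   doc = htmlParse("http://www.example.org/index.html")
  #   getHTMLExternalFiles(doc)                  # src/href values from img, link, script and embed nodes
  #   getHTMLExternalFiles(doc, asNodes = TRUE)  # return the matching nodes rather than the attribute values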