File: plain_tweets.R

package info (click to toggle)
r-cran-rtweet 1.1.0%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 18,224 kB
  • sloc: sh: 13; makefile: 2
file content (78 lines) | stat: -rw-r--r-- 2,154 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#' Convert tweet text to plain ASCII text
#'
#' Strips URL links, swaps line breaks and tabs for ordinary spaces,
#' straightens curly apostrophes, decodes `&amp;` and converts the result
#' to ASCII. This function is deprecated and is planned to become defunct in
#' the next release, because better text processing tools exist.
#' @param x A character vector, or a data frame/list holding the text to
#'   clean in a named column/element called "text".
#' @return `x` with its text converted to ASCII, with plain ampersands and
#'   without URL links, line breaks, tabs or curly apostrophes. For a data
#'   frame/list input only the "text" column/element is modified.
#' @export
plain_tweets <- function(x) {
  lifecycle::deprecate_warn("1.0.0", "plain_tweets()")
  # Anything that is not a list (data frames are lists too) is treated as
  # the text itself.
  if (!is.list(x)) {
    return(plain_tweets_(x))
  }
  # Lists and data frames must carry the text under the name "text".
  if (!has_name_(x, "text")) {
    stop("Couldn't find \"text\" variable.", call. = FALSE)
  }
  x$text <- plain_tweets_(x$text)
  x
}

# Internal worker behind plain_tweets(): cleans a character vector.
#
# x: character vector, or a factor (converted to character first); any other
#    type fails the stopifnot() check.
# Returns the vector after running every cleaning helper in sequence.
plain_tweets_ <- function(x) {
  if (is.factor(x)) {
    x <- as.character(x)
  }
  stopifnot(is.character(x))
  # The helpers are applied one after another in exactly this order.
  cleaners <- list(
    rm_links,
    rm_linebreaks,
    rm_fancy_spaces,
    rm_fancy_apostrophes,
    rm_amp,
    enc2ascii,
    trim_ws
  )
  for (clean in cleaners) {
    x <- clean(x)
  }
  x
}


##----------------------------------------------------------------------------##
##                  remove/replace tricky chars and URL links                 ##
##----------------------------------------------------------------------------##

# Replace every occurrence of code point 8217 (U+2019, the curly apostrophe)
# in `x` with a plain ASCII apostrophe.
rm_fancy_apostrophes <- function(x) {
  curly_apostrophe <- intToUtf8(8217)
  gsub(curly_apostrophe, "'", x)
}

# Replace code point 65039 (U+FE0F) and tab characters in `x` with a single
# ordinary space each.
rm_fancy_spaces <- function(x) {
  selector <- intToUtf8(65039)
  # First the U+FE0F code point, then tabs.
  without_selector <- gsub(selector, " ", x)
  gsub("\\t", " ", without_selector)
}

# Remove URL links from a character vector.
#
# x: character vector to clean.
# Returns `x` after two substitutions:
#   1. "http"/"https" followed by a run of visible (graph) characters, plus
#      one optional leading whitespace character, is deleted.
#   2. A run of visible characters ending in .com/.net/.gov/.io/.org at a
#      word boundary, plus one optional leading whitespace character, is
#      deleted.
rm_links <- function(x) {
  # The `+` quantifier is required: without it only "http(s)" and a single
  # following character were removed, leaving "//host/path" behind.
  x <- gsub("\\s?https?[[:graph:]]+", "", x)
  gsub("\\s?\\b[[:graph:]]+(\\.com|\\.net|\\.gov|\\.io|\\.org)\\b", "", x)
}

# Replace each newline in `x` with `y` (a single space by default).
rm_linebreaks <- function(x, y = " ") {
  gsub(pattern = "\\n", replacement = y, x = x)
}

# Convert `x` to ASCII with iconv(); `y` is passed as iconv()'s `sub`
# argument, i.e. the substitute for bytes that cannot be converted
# (default "", which drops them).
enc2ascii <- function(x, y = "") {
  ascii_text <- iconv(x = x, to = "ascii", sub = y)
  ascii_text
}

# Replace the HTML entity "&amp;" in `x` with `y` (default "&").
# A NULL `y` is treated as the empty string, which deletes the entity.
rm_amp <- function(x, y = "&") {
  replacement <- if (is.null(y)) "" else y
  gsub("&amp;", replacement, x)
}

# Collapse runs of two or more spaces in `x` into one space, then strip
# leading and trailing spaces. Only the space character is affected.
trim_ws <- function(x) {
  squeezed <- gsub("[ ]{2,}", " ", x)
  gsub("^[ ]+|[ ]+$", "", squeezed)
}