File: functions-utils.R

package info (click to toggle)
r-bioc-ensembldb 2.14.0%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 2,764 kB
  • sloc: perl: 331; sh: 15; makefile: 5
file content (241 lines) | stat: -rw-r--r-- 8,882 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
############################################################
## Utility functions

############################################################
## orderDataFrameBy
##
## Simply orders the data.frame x based on the columns specified
## with by.
orderDataFrameBy <- function(x, by = "", decreasing = FALSE) {
    if (all(by == "") | all(is.null(by)))
        return(x)
    return(x[do.call(order,
                     args = c(list(method = "radix",
                                   decreasing = decreasing),
                              as.list(x[, by, drop = FALSE]))), ])
}

############################################################
## checkOrderBy
##
## Check the orderBy argument.
## o orderBy can be a character vector or a , separated list.
## o Ensure that the columns are valid by comparing with 'supported'.
## Returns a character vector, each element representing a column
## on which sorting should be performed.
checkOrderBy <- function(orderBy, supported = character()) {
    if (is.null(orderBy) | all(orderBy == "")) {
        return(orderBy)
    }
    if (length(orderBy) == 1 & length(grep(orderBy, pattern = ",")) > 0) {
        orderBy <- unlist(strsplit(orderBy, split = ","), use.names = FALSE)
        orderBy <- gsub(orderBy, pattern = " ", replacement = "", fixed = TRUE)
    }
    not_supported <- !(orderBy %in% supported)
    if (any(not_supported)) {
        warning("Columns in 'order.by' (",
                paste(orderBy[not_supported], collapse = ", "),
                ") are not in 'columns' and were thus removed.")
        orderBy <- orderBy[!not_supported]
        if (length(orderBy) == 0)
            orderBy <- ""
    }
    return(orderBy)
}

############################################################
## addFilterColumns
##
## This function checks the filter objects and adds, depending on the
## returnFilterColumns setting of the EnsDb, also columns for each of the
## filters, ensuring that:
## a) "Symlink" filters are added correctly (the column returned by the
##    column call without db are added).
## b) GRangesFilter: the feature is set based on the specified feature parameter
## Args:
addFilterColumns <- function(cols, filter = AnnotationFilterList(), edb) {
    if (missing(cols))
        cols <- NULL
    gimmeAll <- returnFilterColumns(edb)
    if (!gimmeAll)
        return(cols)
    ## Put filter into an AnnotationFilterList if it's not already one
    if (is(filter, "AnnotationFilter"))
        filter <- AnnotationFilterList(filter)
    ## Or alternatively process the filters and add columns.
    symFilts <- c("SymbolFilter")
    addC <- unlist(lapply(filter, function(z) {
        if(class(z) %in% symFilts)
            return(z@field)
        if (is(z, "AnnotationFilterList"))
            return(addFilterColumns(cols = cols, filter = z, edb))
        return(ensDbColumn(z))
    }))
    return(unique(c(cols, addC)))
}

############################################################
## SQLiteName2MySQL
##
## Convert the SQLite database name (file name) to the corresponding
## MySQL database name.
SQLiteName2MySQL <- function(x) {
    tolower(gsub(x, pattern = ".", replacement = "_", fixed = TRUE))
}


## running the shiny web app.
runEnsDbApp <- function(...){
    if(requireNamespace("shiny", quietly=TRUE)){
        message("Starting the EnsDb shiny web app. Use Ctrl-C to stop.")
        shiny::runApp(appDir=system.file("shinyHappyPeople",
                                         package="ensembldb"), ...)
    }else{
        stop("Package shiny not installed!")
    }
}

############################################################
## anyProteinColumns
##
## Check if any of 'x' are protein columns.
anyProteinColumns <- function(x){
    return(any(x %in% unlist(.ensdb_protein_tables(), use.names = FALSE)))
}

############################################################
## listProteinColumns
##
#' @description The \code{listProteinColumns} function allows to conveniently
#'     extract all database columns containing protein annotations from
#'     an \code{\linkS4class{EnsDb}} database.
#' 
#' @return The \code{listProteinColumns} function returns a character vector
#'     with the column names containing protein annotations or throws an error
#'     if no such annotations are available.
#' 
#' @rdname ProteinFunctionality
#' 
#' @examples
#'
#' ## List all columns containing protein annotations
#' library(EnsDb.Hsapiens.v86)
#' edb <- EnsDb.Hsapiens.v86
#' if (hasProteinData(edb))
#'     listProteinColumns(edb)
listProteinColumns <- function(object) {
    if (missing(object))
        stop("'object' is missing with no default.")
    if (!is(object, "EnsDb"))
        stop("'object' has to be an instance of an 'EnsDb' object.")
    if (!hasProteinData(object))
        stop("The provided EnsDb database does not contain protein annotations!")
    return(listColumns(object, c("protein", "uniprot", "protein_domain")))
}

############################################################
## .ProteinsFromDataframe
#' @param x \code{EnsDb} object.
#' 
#' @param data \code{data.frame} with the results from a call to the
#'     \code{proteins} method; has to have required columns \code{"protein_id"}
#'     and \code{"protein_sequence"}.
#' 
#' @noRd
.ProteinsFromDataframe <- function(x, data) {
    if (!all(c("protein_id", "protein_sequence") %in% colnames(data)))
        stop("Reguired columns 'protein_id' and 'protein_sequence' not in 'data'!")
    ## Get the column names for uniprot and protein_domain
    uniprot_cols <- listColumns(x, "uniprot")
    uniprot_cols <- uniprot_cols[uniprot_cols != "protein_id"]
    uniprot_cols <- uniprot_cols[uniprot_cols %in% colnames(data)]
    if (length(uniprot_cols) > 0)
        warning("Don't know yet how to handle the 1:n mapping between",
                " protein_id and uniprot_id!")

    prot_dom_cols <- listColumns(x, "protein_domain")
    prot_dom_cols <- prot_dom_cols[prot_dom_cols != "protein_id"]
    prot_dom_cols <- prot_dom_cols[prot_dom_cols %in% colnames(data)]

    ## Create the protein part of the object, i.e. the AAStringSet.
    ## Use all columns other than protein_id, protein_sequence
    prot_cols <- colnames(data)
    prot_cols <- prot_cols[!(prot_cols %in% c(uniprot_cols, prot_dom_cols))]
    protein_sub <- unique(data[, prot_cols, drop = FALSE])
    aass <- AAStringSet(protein_sub$protein_sequence)
    names(aass) <- protein_sub$protein_id
    prot_cols <- prot_cols[!(prot_cols %in% c("protein_id", "protein_sequence"))]
    if (length(prot_cols) > 0) {
        mcols(aass) <- DataFrame(protein_sub[, prot_cols, drop = FALSE])
        ## drop these columns from data to eventually speed up splits
        data <- data[, !(colnames(data) %in% prot_cols), drop = FALSE]
    }

    ## How to process the Uniprot here??? have a 1:n mapping!

    ## Create the protein domain part
    if (length(prot_dom_cols) > 0) {
        message("Processing protein domains not yet implemented!")
        ## Split the dataframe by protein_id
        ## process this list to create the IRangesList.
        ## pranges should have the same order and the same names
    } else {
        pranges <- IRangesList(replicate(length(aass), IRanges()))
        names(pranges) <- names(aass)
    }
    metadata <- list(created = date())

    ##return(new("Proteins", aa = aass, pranges = pranges, metadata = metadata))
}

## map chromosome strand...
strand2num <- function(x){
    if (is.numeric(x)) {
        if (x >= 0) return(1)
        else return(-1)
    }
    xm <- x
    if(xm == "+" | xm == "-")
        xm <- paste0(xm, 1)
    xm <- as.numeric(xm)
    if (is.na(xm))
        stop("'", x, "' can not be converted to a strand!")
    return(xm)
}

num2strand <- function(x){
    if(x < 0){
        return("-")
    }else{
        return("+")
    }
}

#' @description Collapses entries in the \code{"entrezid"} column of a
#'     \code{data.frame} or \code{DataFrame} making the rest of \code{x} unique.
#'
#' @param x Either a \code{data.frame} or a \code{DataFrame}.
#'
#' @param by \code{character(1)} defining the column by which the
#'     \code{"entrezid"} column should be splitted.
#' 
#' @author Johannes Rainer
#' 
#' @noRd
.collapseEntrezidInTable <- function(x, by = "gene_id") {
    ## Slow version: use unique call.
    eg_idx <- which(colnames(x) == "entrezid")
    if (length(eg_idx)) {
        ## Avoid an additional lapply unique call.
        tmp <- unique(x[, c(by, "entrezid")])
        egs <- split(tmp[, "entrezid"],
                     f = factor(tmp[, by], levels = unique(tmp[, by])))
        ## Use a unique call.
        ## x_sub <- x[match(names(egs), x[, by]), , drop = FALSE] would be much
        ## faster but does not work e.g. for exons or transcripts.
        x_sub <- unique(x[, -eg_idx, drop = FALSE])
        x_sub$entrezid <- egs[x_sub[, by]]
        return(x_sub)
    }
    x
}