File: get_known_signatures.R

package info (click to toggle)
r-bioc-mutationalpatterns 3.16.0%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 5,360 kB
  • sloc: sh: 8; makefile: 2
file content (261 lines) | stat: -rw-r--r-- 9,482 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
#' Get known signatures
#'
#' This function loads a signature matrix of pre-defined signatures.
#' It can retrieve signatures for different types of mutations.
#' It can also retrieve signatures from different sources.
#' Additionally, different signature types can be retrieved.
#' (The possible types are: Reference, tissue specific or drug exposure signatures.)
#' For the COSMIC signatures both GRCh37, GRCh38 and mm10 versions of the signatures can be used.
#' Finally, the user can choose whether to include possible artifacts.
#' If no signatures have been defined for a specific combination of options,
#' then an error is given.
#'
#' Possible combinations:
#' COSMIC:
#' - all muttypes. (tsb_snv is the same as in version 3.1)
#' - reference
#' - Can include possible artifacts for SNVs
#' - For the SNVs and DBSs both GRCh37 and GRCh38 versions are available
#' 
#' COSMIC_v3.1:
#' - all muttypes.
#' - reference
#' - Can include possible artifacts for SNVs
#' 
#' SIGNAL:
#'  - SNV. (+ DBS, when using exposure signatures.)
#'  - all signature types
#'  - Can include possible artifacts for reference SNVs
#'
#' SPARSE:
#'  - SNV
#'  - reference
#'
#'  Artifacts can be included when using reference signatures for
#'  SNVs with COSMIC and SIGNAL
#'
#'
#' The signatures bundled in this package came from several sources.
#' Please cite the associated papers if you use them.
#'
#' The COSMIC signatures were downloaded from:
#' https://cancer.sanger.ac.uk/signatures
#' Currently, both version 3.2 and 3.1 are available.
#' Paper:  Alexandrov, L.B. et al., 2020, Nature
#'
#' The SIGNAL signatures were downloaded from:
#' https://signal.mutationalsignatures.com/
#' They were downloaded on: 03 July 2020.
#' Paper: Andrea Degasperi et al., 2020, Nature Cancer
#' Exposure paper: Jill E Kucab et al., 2019, Cell
#'
#' The SPARSE signatures were downloaded from:
#' https://www.biorxiv.org/content/10.1101/384834v2
#' They were downloaded on: 03 July 2020.
#' Paper: Daniele Ramazzotti et al., 2019, Bioarchive
#'
#' @param muttype The type of mutations. Possible values:
#'              * 'snv' (default);
#'              * 'dbs';
#'              * 'indel';
#'              * 'tsb_snv' transcription strand bias snv;
#' @param source The signature source. Possible values:
#'              * 'COSMIC' (default. Currently v3.2);
#'              * 'COSMIC_v3.1';
#'              * 'COSMIC_v3.2';
#'              * 'SIGNAL';
#'              * 'SPARSE';
#' @param sig_type The type of signature. Possible values:
#'              * 'reference' (default);
#'              * 'exposure';
#'              * 'tissue';
#' @param incl_poss_artifacts Whether to include possible
#' artifacts. (default: FALSE)
#' @param tissue_type The specific tissue to select signatures from.
#' Can only be used when looking at tissue specific signatures.
#' Keep this at NA to see tissue specific signatures for all tissues.
#' @param genome The genome version that is used. This only works for COSMIC signatures.
#'              * 'GRCh37' (default);
#'              * 'GRCh38';
#'              * 'mm10';
#' @return A signature matrix
#' @export
#'
#' @examples
#'
#' ## Get reference snv signature from COSMIC
#' get_known_signatures()
#'
#' ## Get reference snv signature from COSMIC,
#' ## including potential artifacts.
#' get_known_signatures(incl_poss_artifacts = TRUE)
#'
#' ## Get a GRCh38 version of the signatures
#' get_known_signatures(genome = "GRCh38")
#' 
#' ## Get DBS signatures
#' get_known_signatures("dbs")
#'
#' ## Get indel signatures
#' get_known_signatures("indel")
#'
#' ## Get transcription strand bias snv signatures
#' get_known_signatures("tsb_snv")
#'
#' ## Get COSMIC version 3.1 of the signatures
#' get_known_signatures(source = "COSMIC_v3.1")
#'
#' ## Get reference signatures from SIGNAL
#' get_known_signatures(source = "SIGNAL")
#'
#' ## Get reference signatures from SIGNAL,
#' ## including potential artifacts
#' get_known_signatures(source = "SIGNAL", incl_poss_artifacts = TRUE)
#'
#' ## Get exposure signatures from SIGNAL
#' get_known_signatures(source = "SIGNAL", sig_type = "exposure")
#'
#' ## Get DBS exposure signatures from SIGNAL
#' get_known_signatures("dbs", source = "SIGNAL", sig_type = "exposure")
#'
#' ## Get all tissue specific signatures from SIGNAL
#' get_known_signatures(source = "SIGNAL", sig_type = "tissue")
#'
#' ## Get Bladder specific signatures from SIGNAL
#' get_known_signatures(
#'   source = "SIGNAL",
#'   sig_type = "tissue",
#'   tissue_type = "Bladder"
#' )
#'
#' ## If you use an incorrect tissue_type an error is given.
#'
#' ## Get sparse signatures
#' get_known_signatures(source = "SPARSE")
get_known_signatures <- function(muttype = c("snv", 
                                             "dbs", 
                                             "indel", 
                                             "tsb_snv"),
                                 source = c("COSMIC", 
                                            "SIGNAL", 
                                            "SPARSE", 
                                            "COSMIC_v3.1", 
                                            "COSMIC_v3.2"),
                                 sig_type = c("reference", 
                                              "exposure", 
                                              "tissue"),
                                 incl_poss_artifacts = FALSE,
                                 tissue_type = c(
                                   NA, "Biliary", "Bladder", "Bone",
                                   "Breast", "Cervix", "CNS",
                                   "Colorectal", "Esophagus", "Head",
                                   "Kidney", "Liver", "Lung",
                                   "Lymphoid", "Myeloid", "Ovary",
                                   "Pancreas", "Prostate", "Skin",
                                   "Stomach", "Thyroid", "Uterus"
                                 ),
                                 genome = c("GRCh37", "GRCh38", "mm10")) {

  # Match arguments
  muttype <- match.arg(muttype)
  source <- match.arg(source)
  sig_type <- match.arg(sig_type)
  tissue_type <- match.arg(tissue_type)
  genome <- match.arg(genome)

  # Use the newest COSMIC version as the default
  if (source == "COSMIC"){
    source = "COSMIC_v3.2"
  }
  
  # Provide a message that the old COSMIC version will not be supported in the future.
  if (source == "COSMIC_v3.1"){
    message(paste0("You are currently attempting to use an older COSMIC version. ",
            "This version will likely be removed in future releases. ",
            "Consider switching to a newer version or manually saving the matrix you are using now."))
  }
  
  # Check that a correct combination of arguments is used.
  if (!.is_na(tissue_type) & sig_type != "tissue") {
    stop("tissue_type can only be used with `sig_type = 'tissue'`",
      call. = FALSE
    )
  }
  
  if (source != "COSMIC_v3.2" & (genome == "GRCh38" | genome == "mm10")){
    stop("genome can only be used with `source = 'COSMIC' or `source = 'COSMIC_v3.2'",
         call. = FALSE)
  }
  
  if ((genome == "GRCh38" | genome == "mm10") & (muttype == "indel" | muttype == "tsb_snv")){
    stop(paste0("There are no COSMIC indel or tsb_snv signatures that are ",
                "normalized for GRCh38 or mm10.\n",
                "Use the GRCh37 signatures instead."),
         call. = FALSE)
  }

  # Use the older COSMIC version for tsb_snv, 
  # because there is no 3.2 version for this signature type.
  if (source == "COSMIC_v3.2" & muttype == "tsb_snv"){
    source = "COSMIC_v3.1"
  }
  
  # Determine signature file name
  basename_sig <- paste0(muttype, "_", source, "_", sig_type, ".txt")
  if (source == "COSMIC_v3.2"){
    basename_sig <- paste0(muttype, "_", source, "_", sig_type, "_", genome, ".txt")
  }
  #basename_sig <- stringr::str_remove(basename_sig, "_v3.1|_v3.2")
  fname_sig <- file.path("extdata", "signatures", basename_sig)
  fname_sig <- system.file(fname_sig, package = "MutationalPatterns")

  # Give error if file doesn't exist.
  if (!file.exists(fname_sig)) {
    stop(paste0(
      "The signature file: ", fname_sig, " does not exist.\n",
      "Look at the documentation of 'get_known_signatures()' for",
      " all the possible combinations of arguments."
    ),
    call. = FALSE
    )
  }

  # Read in signature file
  signatures <- read.table(fname_sig, sep = "\t", header = TRUE)


  # Remove meta columns
  if (muttype == "snv") {
    meta_cols <- c(1, 2)
  } else if (muttype == "tsb_snv") {
    meta_cols <- c(1, 2, 3)
  } else {
    meta_cols <- 1
  }
  signatures <- as.matrix(signatures[, -meta_cols, drop = FALSE])

  # Remove possible artifacts
  if (!incl_poss_artifacts) {
    if (source == "SIGNAL" & sig_type == "reference") {
      good_cols <- grep("Ref.Sig.N[0-9]{0-2}",
        colnames(signatures),
        invert = TRUE
      )
      signatures <- signatures[, good_cols, drop = FALSE]
    }

    if ((source == "COSMIC_v3.2" | source == "COSMIC_v3.1") & muttype == "snv") {
      bad_sigs <- paste0("SBS", c(27, 43, seq(45, 60)))
      good_cols <- !colnames(signatures) %in% bad_sigs
      signatures <- signatures[, good_cols, drop = FALSE]
    }
  }

  # Select signatures of the specified tissue type
  if (!.is_na(tissue_type)) {
    tissue_cols <- grep(paste0("^", tissue_type, "_"), colnames(signatures))
    signatures <- signatures[, tissue_cols, drop = FALSE]
  }

  return(signatures)
}