File: coordinate-mapping-use-cases.R

package info (click to toggle)
r-bioc-ensembldb 2.6.5%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 2,720 kB
  • sloc: perl: 326; makefile: 5
file content (152 lines) | stat: -rw-r--r-- 7,045 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
## ----biocstyle, echo = FALSE, results = "asis", message = FALSE------------
library(BiocStyle)
BiocStyle::markdown()

## ----load-libs, message = FALSE--------------------------------------------
library(ensembldb)
library(EnsDb.Hsapiens.v86)
edb <- EnsDb.Hsapiens.v86

## Retrieve the genes
gns <- genes(edb, filter = ~ protein_domain_id == "PF00010" & seq_name == "21")

## ----gene-GRanges----------------------------------------------------------
gns

## ----olig-tx-models, message = FALSE---------------------------------------
## Change chromosome naming style to UCSC
seqlevelsStyle(edb) <- "UCSC"


## ----edb-subset, message = FALSE, echo = FALSE-----------------------------
## Subset the EnsDb to speed up vignette processing
edb <- filter(edb, filter = ~ seq_name %in% c("chr21", "chr16"))

## ----retrieve-olig-tx-models, message = FALSE------------------------------
## Retrieve the transcript models for OLIG1 and OLIG2 that encode the
## the protein domain
txs <- getGeneRegionTrackForGviz(
    edb, filter = ~ genename %in% c("OLIG1", "OLIG2") &
             protein_domain_id == "PF00010")

## ----olig-prot-doms, message = FALSE---------------------------------------
pdoms <- proteins(edb, filter = ~ tx_id %in% txs$transcript &
                           protein_domain_source == "pfam",
                  columns = c("protein_domain_id", "prot_dom_start",
                              "prot_dom_end"))
pdoms

## ----olig-proteinToGenome, message = FALSE---------------------------------
pdoms_rng <- IRanges(start = pdoms$prot_dom_start, end = pdoms$prot_dom_end,
                     names = pdoms$protein_id)

pdoms_gnm <- proteinToGenome(pdoms_rng, edb)

## ----olig-proteinToGenome-result, message = FALSE--------------------------
pdoms_gnm


## ----olig-proteinToGenome-reorganize, message = FALSE----------------------
## Convert the list to a GRanges with grouping information
pdoms_gnm_grng <- unlist(GRangesList(pdoms_gnm))
pdoms_gnm_grng$id <- rep(pdoms$protein_domain_id, lengths(pdoms_gnm))
pdoms_gnm_grng$grp <- rep(1:nrow(pdoms), lengths(pdoms_gnm))

pdoms_gnm_grng

## ----olig-plot, message = FALSE, warning = FALSE, fig.align = "center", fig.width = 8, fig.height = 2, fig.cap = "Transcripts of genes OLIG1 and OLIG2 encoding a helix-loop-helix protein domain. Shown are all transcripts of the genes OLIG1 and OLIG2 that encode a protein with a helix-loop-helix protein domain (PF00010). Genomic positions encoding protein domains defined in Pfam are shown in light blue.", fig.pos = "h!"----
library(Gviz)

## Define the individual tracks:
## - Ideagram
ideo_track <- IdeogramTrack(genome = "hg38", chromosome = "chr21")
## - Genome axis
gaxis_track <- GenomeAxisTrack()
## - Transcripts
gene_track <- GeneRegionTrack(txs, showId = TRUE, just.group = "right",
                              name = "", geneSymbol = TRUE, size = 0.5)
## - Protein domains
pdom_track <- AnnotationTrack(pdoms_gnm_grng, group = pdoms_gnm_grng$grp,
                              id = pdoms_gnm_grng$id, groupAnnotation = "id",
                              just.group = "right", shape = "box",
                              name = "Protein domains", size = 0.5)

## Generate the plot
plotTracks(list(ideo_track, gaxis_track, gene_track, pdom_track))


## ----sim2-fetch, message = FALSE-------------------------------------------
## Fetch all SIM2 transcripts encoding PF00010
txs <- getGeneRegionTrackForGviz(edb, filter = ~ genename == "SIM2" &
                                          protein_domain_id == "PF00010")
## Fetch all Pfam protein domains within these transcripts
pdoms <- proteins(edb, filter = ~ tx_id %in% txs$transcript &
                           protein_domain_source == "pfam",
                  columns = c("protein_domain_id", "prot_dom_start",
                              "prot_dom_end"))


## ----sim2-plot, message = FALSE, warning = FALSE, fig.align = "center", fig.width = 8, fig.height = 2, fig.cap = "Transcripts of the gene SIM2 encoding a helix-loop-helix domain. Shown are all transcripts of SIM2 encoding a protein with a helix-loop-helix protein domain (PF00010). Genomic positions encoding protein domains defined in Pfam are shown in light blue.", echo = FALSE, fig.pos = "h!"----
pdoms_rng <- IRanges(start = pdoms$prot_dom_start, end = pdoms$prot_dom_end,
                     names = pdoms$protein_id)
pdoms_gnm <- proteinToGenome(pdoms_rng, edb)

## Convert the list to a GRanges with grouping information
pdoms_gnm_grng <- unlist(GRangesList(pdoms_gnm))
pdoms_gnm_grng$id <- rep(pdoms$protein_domain_id, lengths(pdoms_gnm))
pdoms_gnm_grng$grp <- rep(1:nrow(pdoms), lengths(pdoms_gnm))

gene_track <- GeneRegionTrack(txs, showId = TRUE, just.group = "right",
                              name = "", geneSymbol = TRUE, size = 0.5)
## - Protein domains
pdom_track <- AnnotationTrack(pdoms_gnm_grng, group = pdoms_gnm_grng$grp,
                              id = pdoms_gnm_grng$id, groupAnnotation = "id",
                              just.group = "right", shape = "box",
                              name = "Protein domains", size = 0.5)

## Generate the plot
plotTracks(list(ideo_track, gaxis_track, gene_track, pdom_track))


## ----ex2-map, message = FALSE, warning = FALSE-----------------------------
gnm_pos <- GRanges("chr16", IRanges(89920138, width = 1))
prt_pos <- genomeToProtein(gnm_pos, edb)
prt_pos

## ----ex2-select, message = FALSE-------------------------------------------
select(edb, keys = ~ protein_id == names(prt_pos[[1]]), columns = "SYMBOL")

## ----ex2-plot, message = FALSE, warning = FALSE, fig.cap = "Transcripts overlapping, or close to, the genomic position of interest. Shown are all transcripts The genomic position of the variant is highlighted in red.", fig.width = 8, fig.height = 4, fig.pos = "h!"----
## Get transcripts overlapping the genomic position.
txs <- getGeneRegionTrackForGviz(edb, filter = GRangesFilter(gnm_pos))

## Get all transcripts within the region from the start of the most 5'
## and end of the most 3' exon.
all_txs <- getGeneRegionTrackForGviz(
    edb, filter = GRangesFilter(range(txs), type = "within"))

## Plot the data
## - Ideogram
ideo_track <- IdeogramTrack(genome = "hg38", chromosome = "chr16")
## - Genome axis
gaxis_track <- GenomeAxisTrack()
## - Transcripts
gene_track <- GeneRegionTrack(all_txs, showId = TRUE, just.group = "right",
                              name = "", geneSymbol = TRUE, size = 0.5)
## - highlight the region.
hl_track <- HighlightTrack(list(gaxis_track, gene_track), range = gnm_pos)

## Generate the plot
plotTracks(list(ideo_track, hl_track))


## ----ex2-res, message = FALSE----------------------------------------------
## Get the amino acid sequences for the 3 transcripts
prt_seq <- proteins(edb, return.type = "AAStringSet",
                    filter = ~ protein_id == names(prt_pos[[1]]))
## Extract the amino acid at position 294
subseq(prt_seq, start = 294, end = 294)

## ----sessionInfo-----------------------------------------------------------
sessionInfo()