1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43
|
Evaluate whether GC content calculated on the RNA sequences using the Ensembl
Perl API matches the one calculated using R and the genomic DNA sequence.
```{r}
library(ensembldb)
edb <- EnsDb("/Users/jo/Projects/EnsDbs/98/EnsDb.Hsapiens.v98.sqlite")
chrs <- c(1:22, "X", "Y")
txs <- transcripts(edb, filter = ~ seq_name %in% chrs)
any(is.na(mcols(txs)$gc_content))
```
Get the sequences using R functionality.
```{r}
library(GenomicFeatures)
dna <- getGenomeTwoBitFile(edb)
tx_exons <- exonsBy(edb, "tx", filter = ~ seq_name %in% chrs)
tx_seqs <- extractTranscriptSeqs(dna, tx_exons)
library(Biostrings)
gc_prop <- letterFrequency(tx_seqs, letters = "GC", as.prob = TRUE)[, 1]
names(gc_prop) <- names(tx_seqs)
```
Compare GC contents...
```{r}
gc_prop_perl <- mcols(txs)$gc_content
names(gc_prop_perl) <- names(txs)
stopifnot(all(names(gc_prop) %in% names(gc_prop_perl)))
gc_prop_perl <- gc_prop_perl[names(gc_prop)]
head(gc_prop_perl)
head(gc_prop)
library(testthat)
expect_equal(gc_prop_perl, 100 * gc_prop)
```
|