File: extractUpstreamSeqs.Rd

package info (click to toggle)
r-bioc-genomicfeatures 1.50.4%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 4,056 kB
  • sloc: makefile: 6; sh: 2
file content (168 lines) | stat: -rw-r--r-- 6,346 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
\name{extractUpstreamSeqs}

\alias{extractUpstreamSeqs}
\alias{extractUpstreamSeqs,GenomicRanges-method}
\alias{extractUpstreamSeqs,TxDb-method}
\alias{extractUpstreamSeqs,GRangesList-method}


\title{Extract sequences upstream of a set of genes or transcripts}

\description{
  \code{extractUpstreamSeqs} is a generic function for extracting
  sequences upstream of a supplied set of genes or transcripts.
}

\usage{
extractUpstreamSeqs(x, genes, width=1000, ...)

## Dispatch is on the 2nd argument!

\S4method{extractUpstreamSeqs}{GenomicRanges}(x, genes, width=1000)

\S4method{extractUpstreamSeqs}{TxDb}(x, genes, width=1000, exclude.seqlevels=NULL)
}

\arguments{
  \item{x}{
    An object containing the chromosome sequences from which to extract the
    upstream sequences. It can be a \link[BSgenome]{BSgenome},
    \link[rtracklayer]{TwoBitFile}, or \link[Rsamtools]{FaFile} object,
    or any \emph{genome sequence container}.
    More formally, \code{x} must be an object for which
    \code{\link[GenomeInfoDb]{seqinfo}} and \code{\link[Biostrings]{getSeq}}
    are defined.
  }
  \item{genes}{
    An object containing the locations (i.e. chromosome name, start, end, and
    strand) of the genes or transcripts with respect to the reference genome.
    Only \link[GenomicRanges]{GenomicRanges} and \link{TxDb} objects
    are supported at the moment. If the latter, the gene locations are obtained
    by calling the \code{\link{genes}} function on the \link{TxDb}
    object internally.
  }
  \item{width}{
    How many bases to extract upstream of each TSS (transcription start site).
  }
  \item{...}{
    Additional arguments, for use in specific methods.
  }
  \item{exclude.seqlevels}{
    A character vector containing the chromosome names (a.k.a. sequence levels)
    to exclude when the genes are obtained from a \link{TxDb} object.
  }
}

\value{
  A \link[Biostrings]{DNAStringSet} object containing one upstream sequence
  per gene (or per transcript if \code{genes} is a
  \link[GenomicRanges]{GenomicRanges} object containing transcript ranges).

  More precisely, if \code{genes} is a \link[GenomicRanges]{GenomicRanges}
  object, the returned object is \emph{parallel} to it, that is, the i-th
  element in the returned object is the upstream sequence corresponding to
  the i-th gene (or transcript) in \code{genes}. Also the names on the
  \link[GenomicRanges]{GenomicRanges} object are propagated to the returned
  object.

  If \code{genes} is a \link{TxDb} object, the names on the returned
  object are the gene IDs found in the \link{TxDb} object. To see the
  type of gene IDs (i.e. Entrez gene ID or Ensembl gene ID or ...), you can
  display \code{genes} with \code{show(genes)}.

  In addition, the returned object has the following metadata columns
  (accessible with \code{\link{mcols}}) that provide some information about
  the gene (or transcript) corresponding to each upstream sequence:
  \itemize{
    \item \code{gene_seqnames}: the chromosome name of the gene (or
          transcript);
    \item \code{gene_strand}: the strand of the gene (or transcript);
    \item \code{gene_TSS}: the transcription start site of the gene (or
          transcript).
  }
}

\note{
  IMPORTANT: Always make sure to use a TxDb package (or \link{TxDb}
  object) that contains a gene model compatible with the \emph{genome sequence
  container} \code{x}, that is, a gene model based on the exact same reference
  genome as \code{x}.

  See
  \url{http://bioconductor.org/packages/release/BiocViews.html#___TxDb}
  for the list of TxDb packages available in the current release of
  Bioconductor.
  Note that you can make your own custom \link{TxDb} object from
  various annotation resources by using one of the \code{makeTxDbFrom*()}
  functions listed in the "See also" section below.
}

\author{Hervé Pagès}

\seealso{
  \itemize{
    \item \code{\link{makeTxDbFromUCSC}}, \code{\link{makeTxDbFromBiomart}},
          and \code{\link{makeTxDbFromEnsembl}}, for making a \link{TxDb}
          object from online resources.

    \item \code{\link{makeTxDbFromGRanges}} and \code{\link{makeTxDbFromGFF}}
          for making a \link{TxDb} object from a \link[GenomicRanges]{GRanges}
          object, or from a GFF or GTF file.

    \item The \code{\link[BSgenome]{available.genomes}} function in the
          \pkg{BSgenome} package for checking avaibility of BSgenome
          data packages (and installing the desired one).

    \item The \link[BSgenome]{BSgenome}, \link[rtracklayer]{TwoBitFile}, and
          \link[Rsamtools]{FaFile} classes, defined and documented
          in the \pkg{BSgenome}, \pkg{rtracklayer}, and \pkg{Rsamtools}
          packages, respectively.

    \item The \link{TxDb} class.

    \item The \code{\link{genes}} function for extracting gene ranges from
          a \link{TxDb} object.

    \item The \link[GenomicRanges]{GenomicRanges} class defined and documented
          in the \pkg{GenomicRanges} package.

    \item The \link[Biostrings]{DNAStringSet} class defined and documented
          in the \pkg{Biostrings} package.

    \item The \code{\link[GenomeInfoDb]{seqinfo}} getter defined and documented
          in the \pkg{GenomeInfoDb} package.

    \item The \code{\link[Biostrings]{getSeq}} function for extracting
          subsequences from a sequence container.
  }
}

\examples{
## Load a genome:
library(BSgenome.Dmelanogaster.UCSC.dm3)
genome <- BSgenome.Dmelanogaster.UCSC.dm3
genome

## Use a TxDb object:
library(TxDb.Dmelanogaster.UCSC.dm3.ensGene)
txdb <- TxDb.Dmelanogaster.UCSC.dm3.ensGene
txdb  # contains Ensembl gene IDs

## Because the chrU and chrUextra sequences are made of concatenated
## scaffolds (see http://genome.ucsc.edu/cgi-bin/hgGateway?db=dm3),
## extracting the upstream sequences for genes located on these
## scaffolds is not reliable. So we exclude them:
exclude <- c("chrU", "chrUextra")
up1000seqs <- extractUpstreamSeqs(genome, txdb, width=1000,
                                  exclude.seqlevels=exclude)
up1000seqs  # the names are Ensembl gene IDs
mcols(up1000seqs)

## Upstream sequences for genes close to the chromosome bounds can be
## shorter than 1000 (note that this does not happen for circular
## chromosomes like chrM):
table(width(up1000seqs))
mcols(up1000seqs)[width(up1000seqs) != 1000, ]
}

\keyword{manip}