1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119
|
\name{toPWM}
\docType{methods}
\alias{toPWM}
\alias{toPWM,character-method}
\alias{toPWM,DNAStringSet-method}
\alias{toPWM,matrix-method}
\alias{toPWM,PFMatrix-method}
\alias{toPWM,PFMatrixList-method}
\title{toPWM method}
\description{
Converts a raw position frequency matrix (PFMatrix) to a position weight matrix (PWMatrix).
It takes the type of PWM, the background frequencies of the bases and the pseudocounts as parameters.
}
\usage{
toPWM(x, type=c("log2probratio", "prob"), pseudocounts=0.8,
bg=c(A=0.25, C=0.25, G=0.25, T=0.25))
}
\arguments{
\item{x}{
For \code{toPWM}, a \link{PFMatrix},
a rectangular \link{DNAStringSet} object ("rectangular" means that
all elements have the same number of characters) with
no IUPAC ambiguity letters,
a rectangular \link{character} vector or
a \link{matrix} with rownames containing at least A, C, G and T,
or a \link{PFMatrixList} object.
}
\item{type}{
The type of PWM generated, should be one of "log2probratio" or "prob".
"log2probratio" will generate the PWM matrix in log-scale,
while "prob" will give the PWM matrix in probability scale of 0 to 1.
}
\item{pseudocounts}{
pseudocounts is a non-negative numeric vector,
which means you can specify different pseudocounts for each site.
The values will be recycled if the vector is shorter than the number of sites.
0.8 is recommended. See the reference below for more details.
In the TFBS perl module, the square root of the column sum of the matrix,
i.e., the number of motifs used to construct the PFM, is used.
}
\item{bg}{
bg is a vector of background frequencies of four bases
with names containing A, C, G, T.
When toPWM is applied to a \code{PFMatrix}, if bg is not specified,
it will use the bg information contained in \code{PFMatrix}.
}
}
\details{
The raw position frequency matrix (PFM) is usually converted into
a position weight matrix (PWM),
also known as position specific scoring matrix (PSSM).
The PWM provides the probability of each base at a certain position and
is used for scanning genomic sequences.
The implementation here differs slightly from \code{PWM} in
the \code{Biostrings} package in the choice of pseudocounts.
Pseudocounts are necessary for correcting for a small number of counts
or eliminating zero values before the log transformation.
\deqn{postProbs = \frac{PFM + bg * pseudocounts}{colSums(PFM) + sum(bg) * pseudocounts}}{%
postProbs = (PFM + bg * pseudocounts) / (colSums(PFM) + sum(bg) * pseudocounts)}
\deqn{priorProbs = bg / sum(bg)}{%
priorProbs = bg / sum(bg)}
\deqn{PWM_{log2probratio} = \log_2\frac{postProbs}{priorProbs}}{%
PWM_log2probratio = log2(postProbs / priorProbs)}
\deqn{PWM_{prob} = postProbs}{%
PWM_prob = postProbs}
}
\value{
A \code{PWMatrix} object that contains the background frequency and
pseudocounts used.
}
\references{
Wasserman, W. W., & Sandelin, A. (2004). Applied bioinformatics for the identification of regulatory elements. Nature Reviews Genetics, 5(4), 276-287. doi:10.1038/nrg1315

Nishida, K., Frith, M. C., & Nakai, K. (2009). Pseudocounts for transcription factor binding sites. Nucleic Acids Research, 37(3), 939-944. doi:10.1093/nar/gkn1019
}
\author{
Ge Tan
}
\seealso{
\code{\link{toICM}},
\code{\linkS4class{XMatrix}}
}
\examples{
## Construct a PFMatrix
pfm <- PFMatrix(ID="MA0004.1", name="Arnt", matrixClass="Zipper-Type",
strand="+", bg=c(A=0.25, C=0.25, G=0.25, T=0.25),
tags=list(family="Helix-Loop-Helix", species="10090",
tax_group="vertebrates",
medline="7592839", type="SELEX", ACC="P53762",
pazar_tf_id="TF0000003",
TFBSshape_ID="11", TFencyclopedia_ID="580"),
profileMatrix=matrix(c(4L, 19L, 0L, 0L, 0L, 0L,
16L, 0L, 20L, 0L, 0L, 0L,
0L, 1L, 0L, 20L, 0L, 20L,
0L, 0L, 0L, 0L, 20L, 0L),
byrow=TRUE, nrow=4,
dimnames=list(c("A", "C", "G", "T")))
)
## Convert it into a PWMatrix
pwm <- toPWM(pfm, type="log2probratio", pseudocounts=0.8)
## Conversion on a PFMatrixList
data(MA0003.2)
data(MA0004.1)
pfmList <- PFMatrixList(pfm1=MA0003.2, pfm2=MA0004.1, use.names=TRUE)
pwmList <- toPWM(pfmList, pseudocounts=0.8)
}
\keyword{methods}
|