File: toPWM-methods.Rd

package info (click to toggle)
r-bioc-tfbstools 1.28.0%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 940 kB
  • sloc: xml: 1,137; ansic: 590; asm: 54; sh: 13; makefile: 2
file content (119 lines) | stat: -rw-r--r-- 4,579 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
\name{toPWM}
\docType{methods}
\alias{toPWM}
\alias{toPWM,character-method}
\alias{toPWM,DNAStringSet-method}
\alias{toPWM,matrix-method}
\alias{toPWM,PFMatrix-method}
\alias{toPWM,PFMatrixList-method}

\title{toPWM method}
\description{
  Converts a raw frequency matrix (PFMatrix) to a position weight matrix (PWMatrix).
  It takes the PWM type, the background base frequencies, and the pseudocounts as parameters.
}

\usage{
  toPWM(x, type=c("log2probratio", "prob"), pseudocounts=0.8, 
        bg=c(A=0.25, C=0.25, G=0.25, T=0.25))
}

\arguments{
  \item{x}{
    For \code{toPWM}, a \link{PFMatrix}, 
    a rectangular \link{DNAStringSet} object ("rectangular" means that 
    all elements have the same number of characters) with 
    no IUPAC ambiguity letters,
    a rectangular \link{character} vector or 
    a \link{matrix} with rownames containing at least A, C, G and T,
    or a \link{PFMatrixList} object.
  }
  \item{type}{
    The type of PWM generated, should be one of "log2probratio" or "prob".
    "log2probratio" will generate the PWM matrix in log-scale,
    while "prob" will give the PWM matrix in probability scale of 0 to 1.
  }
  \item{pseudocounts}{
    A non-negative numeric vector of pseudocounts; 
    a different pseudocount can be specified for each site. 
    The values are recycled if the vector is shorter than the number of sites. 
    0.8 is recommended. See the reference below for more details.
    In the TFBS perl module, the square root of the column sum of the matrix 
    (i.e., of the number of motifs used to construct the PFM) is used.
  }
  \item{bg}{
    bg is a vector of background frequencies of four bases 
    with names containing A, C, G, T. 
    When toPWM is applied to a \code{PFMatrix}, if bg is not specified, 
    it will use the bg information contained in \code{PFMatrix}. 
  }
}

\details{
  The raw position frequency matrix (PFM) is usually converted into 
  a position weight matrix (PWM), 
  also known as position specific scoring matrix (PSSM).
  The PWM provides the probability of each base at each position and 
  is used for scanning genomic sequences.
  The implementation here differs slightly from \code{PWM} in 
  the \code{Biostrings} package in the choice of pseudocounts. 
  Pseudocounts are necessary for correcting for small numbers of counts 
  or eliminating zero values before the log transformation.

  \deqn{postProbs = \frac{PFM + bg * pseudocounts}{colSums(PFM) + sum(bg) * pseudocounts}}{%
  postProbs = (PFM + bg * pseudocounts) / (colSums(PFM) + sum(bg) * pseudocounts)}
  \deqn{priorProbs = bg / sum(bg)}{%
  priorProbs = bg / sum(bg)}
  \deqn{PWM_{log2probratio} = \log_2\frac{postProbs}{priorProbs}}{%
  PWM_log2probratio = log2(postProbs / priorProbs)}
  \deqn{PWM_{prob} = postProbs}{%
  PWM_prob = postProbs}
}

\value{
  A \code{PWMatrix} object that contains the background frequency and 
  pseudocounts used.
}

\references{
Wasserman, W. W., & Sandelin, A. (2004). Applied bioinformatics for the identification of regulatory elements. Nature Reviews Genetics, 5(4), 276-287. doi:10.1038/nrg1315

Nishida, K., Frith, M. C., & Nakai, K. (2009). Pseudocounts for transcription factor binding sites. Nucleic acids research, 37(3), 939-944. doi:10.1093/nar/gkn1019
}

\author{
Ge Tan
}

\seealso{
\code{\link{toICM}},
\code{\linkS4class{XMatrix}}
}

\examples{
  ## Construct a PFMatrix
  pfm <- PFMatrix(ID="MA0004.1", name="Arnt", matrixClass="Zipper-Type", 
                  strand="+", bg=c(A=0.25, C=0.25, G=0.25, T=0.25), 
                  tags=list(family="Helix-Loop-Helix", species="10090", 
                            tax_group="vertebrates",
                            medline="7592839", type="SELEX", ACC="P53762", 
                            pazar_tf_id="TF0000003",
                            TFBSshape_ID="11", TFencyclopedia_ID="580"),
                  profileMatrix=matrix(c(4L,  19L, 0L,  0L,  0L,  0L,
                                         16L, 0L,  20L, 0L,  0L,  0L,
                                         0L,  1L,  0L,  20L, 0L,  20L,
                                         0L,  0L,  0L,  0L,  20L, 0L), 
                                       byrow=TRUE, nrow=4, 
                                       dimnames=list(c("A", "C", "G", "T")))
                  )
  ## Convert it into a PWMatrix
  pwm <- toPWM(pfm, type="log2probratio", pseudocounts=0.8)
  
  ## Conversion on PFMatrixList
  data(MA0003.2)
  data(MA0004.1)
  pfmList <- PFMatrixList(pfm1=MA0003.2, pfm2=MA0004.1, use.names=TRUE)
  pwmList <- toPWM(pfmList, pseudocounts=0.8)
}

\keyword{methods}