File: Motif.py

package info (click to toggle)
python-biopython 1.42-2
  • links: PTS
  • area: main
  • in suites: etch, etch-m68k
  • size: 17,584 kB
  • ctags: 12,272
  • sloc: python: 80,461; xml: 13,834; ansic: 7,902; cpp: 1,855; sql: 1,144; makefile: 203
file content (207 lines) | stat: -rw-r--r-- 6,374 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
# Copyright 2003 by Bartek Wilczynski.  All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.
"""
Implementation of sequence motifs.
"""

from __future__ import generators
from Bio.SubsMat import FreqTable

class Motif(object):
    """
    A class representing sequence motifs.
    """
    def __init__(self):
        self.instances = []
        self.score = 0.0
        self.mask = []
        self._pwm_is_current = 0
        self._pwm = []
        self.alphabet=None
        self.length=None

    def _check_length(self, len):
        if self.length==None:
            self.length = len
        elif self.length != len:
            raise ValueError, "You can't change the length of the motif"

    def _check_alphabet(self,alphabet):
        if self.alphabet==None:
            self.alphabet=alphabet
        elif self.alphabet != alphabet:
                raise ValueError, "Wrong Alphabet"
        
    def add_instance(self,instance):
        """
        adds new instance to the motif
        """
        self._check_alphabet(instance.alphabet)
        self._check_length(len(instance))
        self.instances.append(instance)
        self._pwm_is_current = False

    def set_mask(self,mask):
        """
        sets the mask for the motif

        The mask should be a string containing asterisks in the position of significant columns and spaces in other columns
        """
        self._check_length(len(mask))
        self.mask=[]
        for char in mask:
            if char=="*":
                self.mask.append(1)
            elif char==" ":
                self.mask.append(0)
            else:
                raise ValueError("Mask should contain only '*' or ' ' and not a '%s'"%char)

    def pwm(self):
        """
        returns the PWM computed for the set of instances
        """
        
        if self._pwm_is_current:
            return self._pwm
        #we need to compute new pwm
        self._pwm = []
        for i in xrange(len(self.mask)):
            dict = {}
            #filling the dict with 0's
            for letter in self.alphabet.letters:
                dict[letter]=0
            #counting the occurences of letters in instances
            for seq in self.instances:
                dict[seq[i]]=dict[seq[i]]+1
            self._pwm.append(FreqTable.FreqTable(dict,FreqTable.COUNT,self.alphabet)) 
        self._pwm_is_current=True
        return self._pwm

    def search_instances(self,sequence):
        """
        a generator function, returning found positions of instances of the motif in a given sequence
        """
        for pos in xrange(0,len(sequence)-self.length+1):
            for instance in self.instances:
                if instance.tostring()==sequence[pos:pos+self.length].tostring():
                    yield(pos,instance)
                    break # no other instance will fit (we don't want to return multiple hits)

    def score_hit(self,sequence,position,normalized=1,masked=0):
        """
        give the pwm score for a given position
        """
        score = 0.0
        for pos in xrange(self.length):
            if not masked or self.mask[pos]:
                score += self.pwm()[pos][sequence[position+pos]]
        if normalized:
            if not masked:
                score/=self.length
            else:
                score/=len(filter(lambda x: x, self.mask))
        return score
    
    def search_pwm(self,sequence,threshold=0.0,normalized=1,masked=1):
        """
        a generator function, returning found hits in a given sequence with the pwm score higher than the threshold
        """

        for pos in xrange(0,len(sequence)-self.length+1):
            score = self.score_hit(sequence,pos,normalized,masked)
            if score > threshold:
                yield (pos,score)

    
    def sim(self, motif, masked = 0):
        """
        return the similarity score for the given motif against self.

        We use the Pearson's correlation of the respective probabilities.
        If the motifs have different length or mask raise the ValueError.
        """

        from math import sqrt
        
        if self.alphabet != motif.alphabet:
            raise ValueError("Wrong alphabet")
        
        if self.length != motif.length:
            raise ValueError("Wrong length")
        
        if masked and self.mask!=motif.mask:
            raise ValueError("Wrong mask")

        sxx = 0 # \sum x^2
        sxy = 0 # \sum x \cdot y
        sx = 0  # \sum x
        sy = 0  # \sum y
        syy = 0 # \sum x^2

        for pos in xrange(self.length):
            if not masked or self.mask:
                for l in self.alphabet.letters:
                    xi = self.pwm()[pos][l]
                    yi = motif.pwm()[pos][l]
                    sx = sx + xi
                    sy = sy + yi
                    sxx = sxx + xi * xi
                    syy = syy + yi * yi
                    sxy = sxy + xi * yi
                    
        if masked:
            norm = len(filter(lambda x: x,self.mask))
        else:
            norm = self.length
            
        norm *= len(self.alphabet.letters)
        s1 = (sxy - sx*sy*1.0/norm)
        s2 = (sxx - sx*sx*1.0/norm)*(syy- sy*sy*1.0/norm)
        
        return s1/sqrt(s2)
            
    def read(self,stream):
        """
        reads the motif from the stream

        the self.alphabet variable must be set before
        """
        from Bio.Seq import Seq
        while 1:
            ln = stream.readline()
            if "*" in ln:
                self.set_mask(ln.strip("\n\c"))
                break
            self.add_instance(Seq(ln.strip(),self.alphabet))
        
    def __str__(self):
        """
        string representation of motif
        """
        str = ""
        for inst in self.instances:
            str = str + inst.tostring() + "\n"

        for i in xrange(self.length):
            if self.mask[i]:
                str = str + "*"
            else:
                str = str + " "
        str = str + "\n"

        return str

    def write(self,stream):
        """
        writes the motif to the stream
        """

        stream.write(self.__str__())