File: Pattern.py

package info (click to toggle)
python-biopython 1.68%2Bdfsg-3
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 46,860 kB
  • ctags: 13,237
  • sloc: python: 160,306; xml: 93,216; ansic: 9,118; sql: 1,208; makefile: 155; sh: 63
file content (259 lines) | stat: -rw-r--r-- 9,024 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.
#

"""Generic functionality useful for all gene representations.

This module contains classes which can be used for all the different
types of patterns available for representing gene information (i.e. motifs,
signatures and schemas). These are the general classes which should be
able to handle any of the different specific patterns.
"""
# standard library
import random

# biopython
from Bio.Alphabet import _verify_alphabet
from Bio.Seq import Seq, MutableSeq


class PatternIO(object):
    """Simple persistence of patterns to and from file handles.

    Each pattern occupies one line. A signature (a list or tuple of
    items) is stored on a single line with its items joined by
    ``separator``; it is read back as a tuple.
    """
    def __init__(self, alphabet=None):
        """Set up the reader/writer.

        Arguments:

        - alphabet - Optional alphabet the patterns should follow. When
          given, every pattern read by ``read`` is checked against it.

        Attributes:

        - separator - The string placed between the items of a signature
          when written, and used to split signatures when read. It should
          not occur in the alphabet of the patterns themselves, otherwise
          ordinary patterns would be split apart on reading.
        """
        self._alphabet = alphabet

        self.separator = ";"

    def write(self, pattern_list, output_handle):
        """Write every pattern in pattern_list to output_handle, one per line.

        Lists and tuples are treated as signatures and joined with the
        separator; anything else is written as it is.
        """
        for entry in pattern_list:
            is_signature = isinstance(entry, (list, tuple))
            if is_signature:
                line_text = self.separator.join(entry)
            else:
                line_text = entry

            output_handle.write("%s\n" % line_text)

    def write_seq(self, seq_pattern_list, output_handle):
        """Write Seq or MutableSeq patterns to output_handle as strings.

        Raises ValueError for an item which is neither a Seq nor a
        MutableSeq; the conversion of all items happens before anything
        is written.
        """
        as_strings = []

        for item in seq_pattern_list:
            if isinstance(item, MutableSeq):
                # go through an immutable Seq before taking the string
                as_strings.append(str(item.toseq()))
            elif isinstance(item, Seq):
                as_strings.append(str(item))
            else:
                raise ValueError("Unexpected pattern type %r" % item)

        self.write(as_strings, output_handle)

    def read(self, input_handle):
        """Read patterns from input_handle and return them as a list.

        Lines are consumed with readline() until it returns an empty
        value. Trailing whitespace is stripped from each line, and a line
        containing the separator becomes a tuple of its items.

        Raises ValueError if an alphabet was set and some pattern (or an
        item of a signature) does not verify against it.
        """
        patterns = []

        line = input_handle.readline()
        while line:
            pattern = line.rstrip()
            # lines holding the separator are signatures
            if self.separator in pattern:
                pattern = tuple(pattern.split(self.separator))

            if self._alphabet is not None:
                # wrap plain patterns so both kinds are checked item by item
                if isinstance(pattern, tuple):
                    pieces = pattern
                else:
                    pieces = [pattern]
                for piece in pieces:
                    candidate = Seq(piece, self._alphabet)
                    if not _verify_alphabet(candidate):
                        raise ValueError("Pattern %s not matching alphabet %s"
                                         % (pattern, self._alphabet))

            patterns.append(pattern)
            line = input_handle.readline()

        return patterns


class PatternRepository(object):
    """Hold a collection of patterns found in sequences, ranked by count.

    This is designed to be a general holder for a set of patterns and
    should be subclassed for specific implementations (i.e. holding Motifs
    or Signatures).
    """
    def __init__(self, pattern_info):
        """Initialize a repository with patterns.

        Arguments:

        - pattern_info - A representation of all of the patterns found in
          a finder search. This should be a dictionary, where the keys
          are patterns, and the values are the number of times a pattern is
          found.

        The patterns are represented internally as a list of two-tuples,
        where the first element is the number of times a pattern occurs,
        and the second is the pattern itself. The list is kept sorted from
        the highest count to the lowest, which makes it easy to return the
        top N patterns.
        """
        self._pattern_dict = pattern_info

        # (count, pattern) pairs: plain tuple ordering then ranks by count
        # first and by the pattern itself on ties
        self._pattern_list = [(self._pattern_dict[pattern_name], pattern_name)
                              for pattern_name in self._pattern_dict]
        self._pattern_list.sort(reverse=True)

    def get_all(self):
        """Retrieve all of the patterns in the repository, most frequent first.
        """
        return [pattern for _, pattern in self._pattern_list]

    def get_random(self, num_patterns):
        """Retrieve the specified number of distinct patterns randomly.

        Arguments:

        - num_patterns - The total number of patterns to return.

        Raises ValueError if more patterns are requested than the
        repository currently holds, since that many distinct patterns
        can never be collected.
        """
        available = len(self._pattern_list)
        if num_patterns > available:
            raise ValueError("Cannot pick %s random patterns, only %s available"
                             % (num_patterns, available))

        all_patterns = []
        # patterns are dictionary keys, so they can be tracked in a set
        already_picked = set()

        while len(all_patterns) < num_patterns:
            # pick a pattern, and only add it if it is not already present
            pattern = random.choice(self._pattern_list)[1]

            if pattern not in already_picked:
                already_picked.add(pattern)
                all_patterns.append(pattern)

        return all_patterns

    def get_top_percentage(self, percent):
        """Return a percentage of the patterns.

        This returns the top 'percent' fraction of the patterns in the
        repository; the number returned is len(patterns) * percent,
        truncated to an integer.
        """
        all_patterns = self.get_all()

        num_to_return = int(len(all_patterns) * percent)

        return all_patterns[:num_to_return]

    def get_top(self, num_patterns):
        """Return the specified number of most frequently occurring patterns.

        Arguments:

        - num_patterns - The number of patterns to return.
        """
        return [pattern for _, pattern in self._pattern_list[:num_patterns]]

    def get_differing(self, top_num, bottom_num):
        """Retrieve patterns that are at the extreme ranges.

        This returns both patterns at the top of the list (i.e. the same as
        returned by get_top) and at the bottom of the list. This
        is especially useful for patterns that are the differences between
        two sets of patterns.

        Arguments:

        - top_num - The number of patterns to take from the top of the list.

        - bottom_num - The number of patterns to take from the bottom of
          the list.

        If top_num and bottom_num together exceed the number of stored
        patterns, the two ranges overlap and patterns in the overlap are
        returned twice.
        """
        from_top = self._pattern_list[:top_num]

        # a slice of [-0:] selects the whole list rather than nothing, so
        # a request for zero bottom patterns is handled explicitly
        if bottom_num:
            from_bottom = self._pattern_list[-bottom_num:]
        else:
            from_bottom = []

        return [pattern for _, pattern in from_top + from_bottom]

    def remove_polyA(self, at_percentage=.9):
        """Remove patterns which are likely due to polyA tails from the lists.

        This is just a helper function to remove patterns which are likely
        just due to polyA tails, and thus are not really great motifs.
        This will also get rid of stuff like ATATAT, which might be a
        useful motif, so use at your own discretion.

        Only the ranked list used by the get_* methods is changed; the
        counts reported by count() are left untouched.

        XXX Could we write a more general function, based on info content
        or something like that?

        Arguments:

        - at_percentage - The fraction of A and T residues in a pattern
          above which it is removed.
        """
        kept = []
        for pattern_info in self._pattern_list:
            pattern = pattern_info[1]
            length = len(pattern)
            # an empty pattern has no defined AT fraction, so it is kept
            if length:
                at_count = pattern.count('A') + pattern.count('T')
                if float(at_count) / length > at_percentage:
                    continue
            kept.append(pattern_info)

        # update in place so the list object itself stays the same
        self._pattern_list[:] = kept

    def count(self, pattern):
        """Return the number of times the specified pattern is found.

        Patterns which are not in the repository have a count of 0.
        """
        try:
            return self._pattern_dict[pattern]
        except KeyError:
            return 0