File: pwm.py

package info (click to toggle)
python-bx 0.13.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 5,000 kB
  • sloc: python: 17,136; ansic: 2,326; makefile: 24; sh: 8
file content (160 lines) | stat: -rw-r--r-- 5,402 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
"""
Classes for working with position specific matrices.
"""

from copy import copy

import numpy
from numpy import (
    float32,
    int16,
    log2,
    maximum,
    nan,
    newaxis,
    ones,
    zeros,
)

from . import _pwm


class BaseMatrix:
    """
    Base class for position specific matrices.
    """

    def __init__(self, alphabet=None, sorted_alphabet=None, char_to_index=None, values=None):
        self.alphabet = alphabet
        self.sorted_alphabet = sorted_alphabet
        self.char_to_index = char_to_index
        self.values = values

    @classmethod
    def from_rows(Class, alphabet, rows):
        """
        Create a new matrix for a sequence over alphabet `alphabet` taking
        values from `rows` which is a list whose length is the width of the
        matrix, and whose elements are lists of values associated with each
        character (in the order those characters appear in alphabet).
        """
        # Sorted alphabet
        sorted_alphabet = sorted(alphabet)
        # Character to index mapping (initialized to -1)
        char_to_index = zeros((256), int16) - 1
        for i, ch in enumerate(sorted_alphabet):
            char_to_index[ord(ch)] = i
        # Array
        values = zeros((len(rows), len(alphabet)), float32)
        for i, row in enumerate(rows):
            assert len(row) == len(alphabet)
            for ch, val in zip(alphabet, row):
                values[i, char_to_index[ord(ch)]] = val
        # Matrix
        matrix = Class()
        matrix.alphabet = alphabet
        matrix.sorted_alphabet = sorted_alphabet
        matrix.char_to_index = char_to_index
        matrix.values = values
        return matrix

    @classmethod
    def create_from_other(Class, other, values=None):
        """
        Create a new Matrix with attributes taken from `other` but with the
        values taken from `values` if provided
        """
        m = Class()
        m.alphabet = other.alphabet
        m.sorted_alphabet = other.sorted_alphabet
        m.char_to_index = other.char_to_index
        if values is not None:
            m.values = values
        else:
            m.values = other.values
        return m

    @property
    def width(self):
        """
        Return the width (size along the sequence axis) of this matrix.
        """
        return self.values.shape[0]

    def reverse_complement(self):
        """
        Create the reverse complement of this matrix. The result probably
        only makese sense if the alphabet is that of DNA ('A','C','G','T').
        """
        rval = copy(self)
        # Conveniently enough, reversing rows and columns is exactly what we
        # want, since this results in A swapping with T and C swapping with G.
        rval.values = self.values[::-1, ::-1].copy()
        return rval


class FrequencyMatrix(BaseMatrix):
    """
    A position specific count/frequency matrix.
    """

    DEFAULT_CORRECTION = 0.0000000001
    """
    Default value to use for correcting when dealing with counts of zero,
    chosen to produce scoring matrices that are the same as produced by CREAD.
    """

    def to_logodds_scoring_matrix(self, background=None, correction=DEFAULT_CORRECTION):
        """
        Create a standard logodds scoring matrix.
        """
        alphabet_size = len(self.alphabet)
        if background is None:
            background = ones(alphabet_size, float32) / alphabet_size
        # Row totals as a one column array
        totals = numpy.sum(self.values, 1)[:, newaxis]
        values = log2(maximum(self.values, correction)) - log2(totals) - log2(maximum(background, correction))
        return ScoringMatrix.create_from_other(self, values.astype(float32))

    def to_stormo_scoring_matrix(self, background=None):
        """
        Create a scoring matrix from this count matrix using the method from:

        Hertz, G.Z. and G.D. Stormo (1999). Identifying DNA and protein patterns with statistically
        significant alignments of multiple sequences. Bioinformatics 15(7): 563-577.
        """
        alphabet_size = len(self.alphabet)
        if background is None:
            background = ones(alphabet_size, float32) / alphabet_size
        # Row totals as a one column array
        totals = numpy.sum(self.values, 1)[:, newaxis]
        values = log2(self.values + background) - log2(totals + 1) - log2(background)
        return ScoringMatrix.create_from_other(self, values.astype(float32))


class ScoringMatrix(BaseMatrix):
    """
    A position specific matrix containing values that are suitable for
    scoring a sequence.
    """

    def score_string(self, string):
        """
        Score each valid position in `string` using this scoring matrix.
        Positions which were not scored are set to nan.
        """
        rval = zeros(len(string), float32)
        rval[:] = nan
        _pwm.score_string(self.values, self.char_to_index, string, rval)
        return rval

    def score_string_with_gaps(self, string):
        """
        Score each valid position in `string` using this scoring matrix.
        Positions which were not scored are set to nan. Gap characters are
        ignored (matrices score across them).
        """
        rval = zeros(len(string), float32)
        rval[:] = nan
        _pwm.score_string_with_gaps(self.values, self.char_to_index, string, rval)
        return rval