File: score.py

package info (click to toggle)
python-bx 0.13.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 5,000 kB
  • sloc: python: 17,136; ansic: 2,326; makefile: 24; sh: 8
file content (370 lines) | stat: -rw-r--r-- 12,035 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
"""
Support for scoring alignments using arbitrary scoring matrices, arbitrary
alphabets, and affine gap penalties.
"""

from numpy import (
    float32,
    int32,
    ones,
    zeros,
)


class ScoringScheme:
    # note that gap_open and gap_extend are penalties, which means you should make them positive
    def __init__(
        self,
        gap_open,
        gap_extend,
        default=-100,
        alphabet1="ACGT",
        alphabet2=None,
        gap1="-",
        gap2=None,
        text1_range=128,
        text2_range=None,
        typecode=int32,
    ):
        if text2_range is None:
            text2_range = text1_range
        if alphabet2 is None:
            alphabet2 = alphabet1
        if gap2 is None:
            gap2 = gap1  # (scheme with gap1=gap2=None is legit)
        if isinstance(alphabet1, str):
            alphabet1 = list(alphabet1)
        if isinstance(alphabet2, str):
            alphabet2 = list(alphabet2)
        self.table = ones((text1_range, text2_range), typecode)
        self.table *= default
        self.gap_open = gap_open
        self.gap_extend = gap_extend
        self.gap1 = gap1
        self.gap2 = gap2
        self.alphabet1 = alphabet1
        self.alphabet2 = alphabet2
        # private _set_score and _get_score allow subclasses to override them to
        # implement a different underlying table object

    def _set_score(self, a_b_pair, val):
        (a, b) = a_b_pair
        self.table[a, b] = val

    def _get_score(self, a_b_pair):
        (a, b) = a_b_pair
        return self.table[a, b]

    def set_score(self, a, b, val, foldcase1=False, foldcase2=False):
        self._set_score((a, b), val)
        if foldcase1:
            aCh = chr(a)
            if aCh.isupper():
                aa = ord(aCh.lower())
            elif aCh.islower():
                aa = ord(aCh.upper())
            else:
                foldcase1 = False
        if foldcase2:
            bCh = chr(b)
            if bCh.isupper():
                bb = ord(bCh.lower())
            elif bCh.islower():
                bb = ord(bCh.upper())
            else:
                foldcase2 = False
        if foldcase1 and foldcase2:
            self._set_score((aa, b), val)
            self._set_score((a, bb), val)
            self._set_score((aa, bb), val)
        elif foldcase1:
            self._set_score((aa, b), val)
        elif foldcase2:
            self._set_score((a, bb), val)

    def score_alignment(self, a):
        return score_alignment(self, a)

    def score_texts(self, text1, text2):
        return score_texts(self, text1, text2)

    def __str__(self):
        isDna1 = "".join(self.alphabet1) == "ACGT"
        isDna2 = "".join(self.alphabet2) == "ACGT"
        labelRows = not (isDna1 and isDna2)
        width = 3
        for a in self.alphabet1:
            for b in self.alphabet2:
                score = self._get_score((ord(a), ord(b)))
                if isinstance(score, float):
                    s = f"{score:8.6f}"
                else:
                    s = f"{score}"
                if len(s) + 1 > width:
                    width = len(s) + 1
        lines = []
        line = []
        if labelRows:
            if isDna1:
                line.append(" ")
            else:
                line.append("  ")
        for b in self.alphabet2:
            if isDna2:
                s = b
            else:
                s = f"{ord(b):02X}"
            line.append("%*s" % (width, s))
        lines.append(("".join(line)) + "\n")
        for a in self.alphabet1:
            line = []
            if labelRows:
                if isDna1:
                    line.append(a)
                else:
                    line.append(f"{ord(a):02X}")
            for b in self.alphabet2:
                score = self._get_score((ord(a), ord(b)))
                if isinstance(score, float):
                    s = f"{score:8.6f}"
                else:
                    s = f"{score}"
                line.append("%*s" % (width, s))
            lines.append(("".join(line)) + "\n")
        return "".join(lines)


def read_scoring_scheme(f, gap_open, gap_extend, gap1="-", gap2=None, **kwargs):
    """
    Initialize scoring scheme from a file containint a blastz style text blob.
    f can be either a file or the name of a file.
    """
    close_it = False
    if isinstance(f, str):
        f = open(f)
        close_it = True
    ss = build_scoring_scheme("".join(list(f)), gap_open, gap_extend, gap1=gap1, gap2=gap2, **kwargs)
    if close_it:
        f.close()
    return ss


def build_scoring_scheme(s, gap_open, gap_extend, gap1="-", gap2=None, **kwargs):
    """
    Initialize scoring scheme from a blastz style text blob, first line
    specifies the bases for each row/col, subsequent lines contain the
    corresponding scores.  Slaw extensions allow for unusual and/or
    asymmetric alphabets.  Symbols can be two digit hex, and each row
    begins with symbol.  Note that a row corresponds to a symbol in text1
    and a column to a symbol in text2.

    examples:

       blastz                       slaw

          A    C    G    T               01   02    A    C    G    T
         91 -114  -31 -123          01  200 -200  -50  100  -50  100
       -114  100 -125  -31          02 -200  200  100  -50  100  -50
        -31 -125  100 -114
       -123  -31 -114   91
    """
    # perform initial parse to determine alphabets and locate scores
    bad_matrix = "invalid scoring matrix"
    s = s.rstrip("\n")
    lines = s.split("\n")
    rows = []
    symbols2 = lines.pop(0).split()
    symbols1 = None
    rows_have_syms = False
    a_la_blastz = True
    for i, line in enumerate(lines):
        row_scores = line.split()
        if len(row_scores) == len(symbols2):  # blastz-style row
            if symbols1 is None:
                if len(lines) != len(symbols2):
                    raise bad_matrix
                symbols1 = symbols2
            elif rows_have_syms:
                raise bad_matrix
        elif len(row_scores) == len(symbols2) + 1:  # row starts with symbol
            if symbols1 is None:
                symbols1 = []
                rows_have_syms = True
                a_la_blastz = False
            elif not rows_have_syms:
                raise bad_matrix
            symbols1.append(row_scores.pop(0))
        else:
            raise bad_matrix
        rows.append(row_scores)
    # convert alphabets from strings to characters
    try:
        alphabet1 = [sym_to_char(sym) for sym in symbols1]
        alphabet2 = [sym_to_char(sym) for sym in symbols2]
    except ValueError:
        raise bad_matrix
    if (alphabet1 != symbols1) or (alphabet2 != symbols2):
        a_la_blastz = False
    if a_la_blastz:
        alphabet1 = [ch.upper() for ch in alphabet1]
        alphabet2 = [ch.upper() for ch in alphabet2]
    # decide if rows and/or columns should reflect case
    if a_la_blastz:
        foldcase1 = foldcase2 = True
    else:
        foldcase1 = "".join(alphabet1) == "ACGT"
        foldcase2 = "".join(alphabet2) == "ACGT"
    # create appropriately sized matrix
    text1_range = text2_range = 128
    if ord(max(alphabet1)) >= 128:
        text1_range = 256
    if ord(max(alphabet2)) >= 128:
        text2_range = 256
    typecode = int32
    for i, row_scores in enumerate(rows):
        for j, score in enumerate(map(int_or_float, row_scores)):
            if isinstance(score, float):
                typecode = float32
    if isinstance(gap_open, float):
        typecode = float32
    if isinstance(gap_extend, float):
        typecode = float32
    ss = ScoringScheme(
        gap_open,
        gap_extend,
        alphabet1=alphabet1,
        alphabet2=alphabet2,
        gap1=gap1,
        gap2=gap2,
        text1_range=text1_range,
        text2_range=text2_range,
        typecode=typecode,
        **kwargs,
    )
    # fill matrix
    for i, row_scores in enumerate(rows):
        for j, score in enumerate(map(int_or_float, row_scores)):
            ss.set_score(ord(alphabet1[i]), ord(alphabet2[j]), score)
            if foldcase1 and foldcase2:
                ss.set_score(ord(alphabet1[i].lower()), ord(alphabet2[j].upper()), score)
                ss.set_score(ord(alphabet1[i].upper()), ord(alphabet2[j].lower()), score)
                ss.set_score(ord(alphabet1[i].lower()), ord(alphabet2[j].lower()), score)
            elif foldcase1:
                ss.set_score(ord(alphabet1[i].lower()), ord(alphabet2[j]), score)
            elif foldcase2:
                ss.set_score(ord(alphabet1[i]), ord(alphabet2[j].lower()), score)
    return ss


def int_or_float(s):
    try:
        return int(s)
    except ValueError:
        return float(s)


# convert possible two-char symbol to a single character


def sym_to_char(sym):
    if len(sym) == 1:
        return sym
    elif len(sym) != 2:
        raise ValueError
    else:
        return chr(int(sym, base=16))


def score_alignment(scoring_scheme, a):
    score = 0
    ncomps = len(a.components)
    for i in range(ncomps):
        for j in range(i + 1, ncomps):
            score += score_texts(scoring_scheme, a.components[i].text, a.components[j].text)
    return score


def score_texts(scoring_scheme, text1, text2):
    rval = 0
    last_gap_a = last_gap_b = False
    for i in range(len(text1)):
        a = text1[i]
        b = text2[i]
        # Ignore gap/gap pair
        if a == scoring_scheme.gap1 and b == scoring_scheme.gap2:
            continue
        # Gap in first species
        elif a == scoring_scheme.gap1:
            rval -= scoring_scheme.gap_extend
            if not last_gap_a:
                rval -= scoring_scheme.gap_open
                last_gap_a = True
                last_gap_b = False
        # Gap in second species
        elif b == scoring_scheme.gap2:
            rval -= scoring_scheme.gap_extend
            if not last_gap_b:
                rval -= scoring_scheme.gap_open
                last_gap_a = False
                last_gap_b = True
        # Aligned base
        else:
            rval += scoring_scheme._get_score((ord(a), ord(b)))
            last_gap_a = last_gap_b = False
    return rval


def accumulate_scores(scoring_scheme, text1, text2, skip_ref_gaps=False):
    """
    Return cumulative scores for each position in alignment as a 1d array.

    If `skip_ref_gaps` is False positions in returned array correspond to each
    column in alignment, if True they correspond to each non-gap position (each
    base) in text1.
    """
    if skip_ref_gaps:
        rval = zeros(len(text1) - text1.count(scoring_scheme.gap1))
    else:
        rval = zeros(len(text1))
    score = 0
    pos = 0
    last_gap_a = last_gap_b = False
    for i in range(len(text1)):
        a = text1[i]
        b = text2[i]
        # Ignore gap/gap pair
        if a == scoring_scheme.gap1 and b == scoring_scheme.gap2:
            continue
        # Gap in first species
        elif a == scoring_scheme.gap1:
            score -= scoring_scheme.gap_extend
            if not last_gap_a:
                score -= scoring_scheme.gap_open
                last_gap_a = True
                last_gap_b = False
        # Gap in second species
        elif b == scoring_scheme.gap2:
            score -= scoring_scheme.gap_extend
            if not last_gap_b:
                score -= scoring_scheme.gap_open
                last_gap_a = False
                last_gap_b = True
        # Aligned base
        else:
            score += scoring_scheme._get_score((ord(a), ord(b)))
            last_gap_a = last_gap_b = False
        if not (skip_ref_gaps) or a != scoring_scheme.gap1:
            rval[pos] = score
            pos += 1
    return rval


hox70 = build_scoring_scheme(
    """  A    C    G    T
                                  91 -114  -31 -123
                                -114  100 -125  -31
                                 -31 -125  100 -114
                                -123  -31 -114   91 """,
    400,
    30,
)