File: paml_matrix.py

package info (click to toggle)
python-cogent 1.5.3-2
  • links: PTS, VCS
  • area: main
  • in suites: jessie, jessie-kfreebsd
  • size: 16,424 kB
  • ctags: 24,343
  • sloc: python: 134,200; makefile: 100; ansic: 17; sh: 10
file content (50 lines) | stat: -rw-r--r-- 1,583 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
#!/usr/bin/env python

import numpy

Float = numpy.core.numerictypes.sctype2char(float)

from cogent.evolve import substitution_model

__author__ = "Matthew Wakefield"
__copyright__ = "Copyright 2007-2012, The Cogent Project"
__credits__ = ["Matthew Wakefield", "Gavin Huttley"]
__license__ = "GPL"
__version__ = "1.5.3"
__maintainer__ = "Matthew Wakefield"
__email__ = "wakefield@wehi.edu.au"
__status__ = "Production"

three_letter_order = 'ARNDCQEGHILKMFPSTWYV'
aa_order = numpy.array([ord(aa) for aa in three_letter_order])
reorder = numpy.argsort(aa_order)

def numbers_in(f):
    for line in f:
        for word in line.split():
            yield float(word)

def PamlMatrixParser(f):
    """Parses a matrix of amino acid transition probabilities and amino acid
    frequencies in the format used by PAML and returns a symetric array in single
    letter alphabetical order and a dictionary of frequencies for use by
    substitution_model.EmpiricalProteinMatrix"""
    matrix = numpy.zeros([20,20], Float)
    next_number = numbers_in(f).next
    for row in range(1,20):
        for col in range(0, row):
            matrix[row,col] = matrix[col,row] = next_number()
            
    freqs = [next_number() for i in range(20)]
    total = sum(freqs)
    assert abs(total-1) < 0.001, freqs
    freqs = [freq/total for freq in freqs]
    
    matrix = numpy.take(matrix, reorder, 0)
    matrix = numpy.take(matrix, reorder, 1)
    
    assert numpy.alltrue(matrix == numpy.transpose(matrix))
    
    freqs = dict(zip(three_letter_order, freqs))
    
    return (matrix, freqs)