1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
|
#!/usr/bin/env python
import numpy
Float = numpy.core.numerictypes.sctype2char(float)
from cogent.evolve import substitution_model
__author__ = "Matthew Wakefield"
__copyright__ = "Copyright 2007-2012, The Cogent Project"
__credits__ = ["Matthew Wakefield", "Gavin Huttley"]
__license__ = "GPL"
__version__ = "1.5.3"
__maintainer__ = "Matthew Wakefield"
__email__ = "wakefield@wehi.edu.au"
__status__ = "Production"
three_letter_order = 'ARNDCQEGHILKMFPSTWYV'
aa_order = numpy.array([ord(aa) for aa in three_letter_order])
reorder = numpy.argsort(aa_order)
def numbers_in(f):
for line in f:
for word in line.split():
yield float(word)
def PamlMatrixParser(f):
"""Parses a matrix of amino acid transition probabilities and amino acid
frequencies in the format used by PAML and returns a symetric array in single
letter alphabetical order and a dictionary of frequencies for use by
substitution_model.EmpiricalProteinMatrix"""
matrix = numpy.zeros([20,20], Float)
next_number = numbers_in(f).next
for row in range(1,20):
for col in range(0, row):
matrix[row,col] = matrix[col,row] = next_number()
freqs = [next_number() for i in range(20)]
total = sum(freqs)
assert abs(total-1) < 0.001, freqs
freqs = [freq/total for freq in freqs]
matrix = numpy.take(matrix, reorder, 0)
matrix = numpy.take(matrix, reorder, 1)
assert numpy.alltrue(matrix == numpy.transpose(matrix))
freqs = dict(zip(three_letter_order, freqs))
return (matrix, freqs)
|