File: paml_matrix.py

package info (click to toggle)

python-cogent 1.5.3-2

links: PTS, VCS
area: main
in suites: jessie, jessie-kfreebsd
size: 16,424 kB
ctags: 24,343
sloc: python: 134,200; makefile: 100; ansic: 17; sh: 10

file content (50 lines) | stat: -rw-r--r-- 1,583 bytes

#!/usr/bin/env python

import numpy

Float = numpy.core.numerictypes.sctype2char(float)

from cogent.evolve import substitution_model

__author__ = "Matthew Wakefield"
__copyright__ = "Copyright 2007-2012, The Cogent Project"
__credits__ = ["Matthew Wakefield", "Gavin Huttley"]
__license__ = "GPL"
__version__ = "1.5.3"
__maintainer__ = "Matthew Wakefield"
__email__ = "wakefield@wehi.edu.au"
__status__ = "Production"

three_letter_order = 'ARNDCQEGHILKMFPSTWYV'
aa_order = numpy.array([ord(aa) for aa in three_letter_order])
reorder = numpy.argsort(aa_order)

def numbers_in(f):
    for line in f:
        for word in line.split():
            yield float(word)

def PamlMatrixParser(f):
    """Parses a matrix of amino acid transition probabilities and amino acid
    frequencies in the format used by PAML and returns a symetric array in single
    letter alphabetical order and a dictionary of frequencies for use by
    substitution_model.EmpiricalProteinMatrix"""
    matrix = numpy.zeros([20,20], Float)
    next_number = numbers_in(f).next
    for row in range(1,20):
        for col in range(0, row):
            matrix[row,col] = matrix[col,row] = next_number()
            
    freqs = [next_number() for i in range(20)]
    total = sum(freqs)
    assert abs(total-1) < 0.001, freqs
    freqs = [freq/total for freq in freqs]
    
    matrix = numpy.take(matrix, reorder, 0)
    matrix = numpy.take(matrix, reorder, 1)
    
    assert numpy.alltrue(matrix == numpy.transpose(matrix))
    
    freqs = dict(zip(three_letter_order, freqs))
    
    return (matrix, freqs)