1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110
|
#!/usr/bin/env python
"""
Returns the likelihood resulting from a model in which motif probabilities
are assumed to be equal to the observed motif frequencies, as described by
Goldman (1993). This model is not informative for inferring the evolutionary
process, but its likelihood indicates the maximum possible likelihood value
for a site-independent evolutionary process.
"""
from __future__ import division
import re
from numpy import log
from cogent import LoadSeqs
__author__ = "Helen Lindsay, Gavin Huttley"
__copyright__ = "Copyright 2007-2009, The Cogent Project"
__credits__ = ["Helen Lindsay", "Gavin Huttley", "Daniel McDonald"]
cite = "Goldman, N. (1993). Statistical tests of models of DNA substitution. J Mol Evol, 36: 182-98"
__license__ = "GPL"
__version__ = "1.4.1"
__maintainer__ = "Gavin Huttley"
__email__ = "gavin.huttley@anu.edu.au"
__status__ = "Production"
def _transpose(array):
new_array = []
num_cols = len(array[0])
for col in range(num_cols):
new_row = []
for row in array:
new_row += [row[col]]
new_array.append(new_row)
return new_array
def _take(array, indices):
new_array = []
for index in indices:
new_array.append(array[index])
return new_array
def aligned_columns_to_rows(aln, motif_len, exclude_chars = '-N?'):
"""return alignment broken into motifs as a transposed list with
sequences as columns and aligned columns as rows
Arguments:
exclude_chars: a character string suitable for use as a regular expression"""
exclude_chars = re.compile("[%s]" % exclude_chars)
exclude_indices = set()
array = []
for name in aln.Names:
motifs = list(aln.getGappedSeq(name).getInMotifSize(motif_len))
array.append(motifs)
for motif_index, motif in enumerate(motifs):
if exclude_chars.findall(motif):
exclude_indices.update([motif_index])
include_indices = set(range(len(array[0]))).difference(exclude_indices)
include_indices = list(include_indices)
include_indices.sort()
array = _transpose(array)
array = _take(array, include_indices)
return array
def count_column_freqs(columns_list):
"""return the frequency of columns"""
col_freq_dict = {}
for column in columns_list:
column = ' '.join(column)
col_freq_dict[column] = col_freq_dict.get(column, 0) + 1
return col_freq_dict
def get_ML_probs(columns_list, with_patterns=False):
"""returns the column log-likelihoods and frequencies
Argument:
- with_patterns: the site patterns are returned"""
n = len(columns_list)
col_freq_dict = count_column_freqs(columns_list)
col_lnL_freqs = []
for column_pattern, freq in col_freq_dict.items():
# note, the behaviour of / is changed due to the __future__ import
if with_patterns:
row = [column_pattern, freq/n, freq]
else:
row = [freq/n, freq]
col_lnL_freqs.append(row)
return col_lnL_freqs
def get_G93_lnL_from_array(columns_list):
"""return the best log likelihood for a list of aligned columns"""
col_stats = get_ML_probs(columns_list)
log_likelihood = 0
for freq, num in col_stats:
pattern_lnL = log(freq)*num
log_likelihood += pattern_lnL
return log_likelihood
def BestLogLikelihood(aln, alphabet, exclude_chars = '-N?'):
"""returns the best log-likelihood according to Goldman 1993.
Arguments:
- exclude_chars: a character string suitable for use as a regular
expression
"""
# need to use the alphabet, so we can enforce character compliance
aln = LoadSeqs(data=aln.todict(), moltype=alphabet.MolType)
motif_len = alphabet.getMotifLen()
columns = aligned_columns_to_rows(aln, motif_len, exclude_chars)
log_likelihood = get_G93_lnL_from_array(columns)
return log_likelihood
|