File: best_likelihood.py

package info (click to toggle)
python-cogent 1.4.1-1.2
  • links: PTS, VCS
  • area: non-free
  • in suites: squeeze
  • size: 13,260 kB
  • ctags: 20,087
  • sloc: python: 116,163; ansic: 732; makefile: 74; sh: 9
file content (110 lines) | stat: -rw-r--r-- 3,923 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/usr/bin/env python
"""
Returns the likelihood resulting from a model in which motif probabilities 
are assumed to be equal to the observed motif frequencies, as described by 
Goldman (1993).  This model is not informative for inferring the evolutionary 
process, but its likelihood indicates the maximum possible likelihood value 
for a site-independent evolutionary process.
"""
from __future__ import division
import re
from numpy import log

from cogent import LoadSeqs

__author__ = "Helen Lindsay, Gavin Huttley"
__copyright__ = "Copyright 2007-2009, The Cogent Project"
__credits__ = ["Helen Lindsay", "Gavin Huttley", "Daniel McDonald"]
cite = "Goldman, N. (1993).  Statistical tests of models of DNA substitution.  J Mol Evol, 36: 182-98"
__license__ = "GPL"
__version__ = "1.4.1"
__maintainer__ = "Gavin Huttley"
__email__ = "gavin.huttley@anu.edu.au"
__status__ = "Production"

def _transpose(array):
    new_array = []
    num_cols = len(array[0])
    for col in range(num_cols):
        new_row = []
        for row in array:
            new_row += [row[col]]
        new_array.append(new_row)
    return new_array

def _take(array, indices):
    new_array = []
    for index in indices:
        new_array.append(array[index])
    return new_array

def aligned_columns_to_rows(aln, motif_len, exclude_chars = '-N?'):
    """return alignment broken into motifs as a transposed list with
    sequences as columns and aligned columns as rows
    
    Arguments:
        exclude_chars: a character string suitable for use as a regular expression"""
    exclude_chars = re.compile("[%s]" % exclude_chars)
    exclude_indices = set()
    array = []
    for name in aln.Names:
        motifs = list(aln.getGappedSeq(name).getInMotifSize(motif_len))
        array.append(motifs)
        for motif_index, motif in enumerate(motifs):
            if exclude_chars.findall(motif):
                exclude_indices.update([motif_index])
    include_indices = set(range(len(array[0]))).difference(exclude_indices)
    include_indices = list(include_indices)
    include_indices.sort()
    array = _transpose(array)
    array = _take(array, include_indices)
    return array

def count_column_freqs(columns_list):
    """return the frequency of columns"""
    col_freq_dict = {}
    for column in columns_list:
        column = ' '.join(column)
        col_freq_dict[column] = col_freq_dict.get(column, 0) + 1
    return col_freq_dict

def get_ML_probs(columns_list, with_patterns=False):
    """returns the column log-likelihoods and frequencies
    
    Argument:
        - with_patterns: the site patterns are returned"""
    n = len(columns_list)
    col_freq_dict = count_column_freqs(columns_list)
    col_lnL_freqs = []
    for column_pattern, freq in col_freq_dict.items():
        # note, the behaviour of / is changed due to the __future__ import
        if with_patterns:
            row = [column_pattern, freq/n, freq]
        else:
            row = [freq/n, freq]
        col_lnL_freqs.append(row)
    return col_lnL_freqs

def get_G93_lnL_from_array(columns_list):
    """return the best log likelihood for a list of aligned columns"""
    col_stats = get_ML_probs(columns_list)
    log_likelihood = 0
    for freq, num in col_stats:
        pattern_lnL = log(freq)*num
        log_likelihood += pattern_lnL
    return log_likelihood

def BestLogLikelihood(aln, alphabet, exclude_chars = '-N?'):
    """returns the best log-likelihood according to Goldman 1993.
    
    Arguments:
        - exclude_chars: a character string suitable for use as a regular
          expression
    """
    # need to use the alphabet, so we can enforce character compliance
    aln = LoadSeqs(data=aln.todict(), moltype=alphabet.MolType)
    motif_len = alphabet.getMotifLen()
    columns = aligned_columns_to_rows(aln, motif_len, exclude_chars)
    log_likelihood = get_G93_lnL_from_array(columns)
    return log_likelihood