File: __init__.py | Debian Sources

File: init.py

package info (click to toggle)
python-biopython 1.42-2
links: PTS
area: main
in suites: etch, etch-m68k
size: 17,584 kB
ctags: 12,272
sloc: python: 80,461; xml: 13,834; ansic: 7,902; cpp: 1,855; sql: 1,144; makefile: 203
file content (629 lines) | stat: -rw-r--r-- 22,572 bytes
parent folder | download | duplicates (3)
"""Substitution matrices, log odds matrices, and operations on them.
"""
import re
import string
import sys
import copy
import math
# BioPython imports
from Bio import Alphabet
from Bio.SubsMat import FreqTable
# from Bio.Tools import statfns

log = math.log
# Matrix types
NOTYPE = 0
ACCREP = 1
OBSFREQ = 2
SUBS = 3
EXPFREQ = 4
LO = 5
EPSILON = 0.00000000000001
class BadMatrix(Exception):
   """Exception raised when verifying a matrix"""
   def __str__(self):
      return "Bad Matrix"
BadMatrixError = BadMatrix()

# 5/2001 added the following:
# * Methods for subtraction, addition and multiplication of matrices
# * Generation of an expected frequency table from an observed frequency matrix
# * Calculation of linear correlation coefficient between two matrices. Needs
#   Bio.Tools.statfns
# * Calculation of relative entropy is now done using the _make_relative_entropy method
#   and is stored in the member self.relative_entropy
# * Calculation of entropy is now done using the _make_entropy method and is stored in
#   the member self.entropy
# * Jensen-Shannon distance between the distributions from which the matrices are
#   derived. This is a distance function based on the distribution's entropies.
#
# Substitution matrix routines
# Iddo Friedberg idoerg@cc.huji.ac.il
# Biopython license applies (http://biopython.org)
# 
# General:
# -------
# You should have python 2.0 or above. 
# http://www.python.org
# You should have biopython (http://biopython.org) installed.
# 
# This module provides a class and a few routines for generating
# substitution matrices, similar ot BLOSUM or PAM matrices, but based on
# user-provided data.
# The class used for these matrices is SeqMat
# 
# Matrices are implemented as a user dictionary. Each index contains a
# 2-tuple, which are the two residue/nucleotide types replaced. The value
# differs according to the matrix's purpose: e.g in a log-odds frequency
# matrix, the value would be log(Pij/(Pi*Pj)) where:
# Pij: frequency of substitution of letter (residue/nucletide) i by j 
# Pi, Pj: expected frequencies of i and j, respectively.
# 
# Usage:
# -----
# The following section is layed out in the order by which most people wish
# to generate a log-odds matrix. Of course, interim matrices can be
# generated and investigated. Most people just want a log-odds matrix,
# that's all.
# 
# Generating an Accepted Replacement Matrix:
# -----------------------------------------
#  Initially, you should generate an accepted replacement matrix
#  (ARM) from your data. The values in ARM are the _counted_ number of
#  replacements according to your data. The data could be a set of pairs
#  or multiple alignments. So for instance if Alanine was replaced by
#  Cysteine 10 times, and Cysteine by Alanine 12 times, the corresponding
#  ARM entries would be:
#  ['A','C']: 10, ['C','A'] 12
#  as order doesn't matter, user can already provide only one entry:
#  ['A','C']: 22 
#  A SeqMat instance may be initialized with either a full (first
#  method of counting: 10, 12) or half (the latter method, 22) matrices. A
#  Full protein alphabet matrix would be of the size 20x20 = 400. A Half
#  matrix of that alphabet would be 20x20/2 + 20/2 = 210. That is because
#  same-letter entries don't change. (The matrix diagonal). Given an
#  alphabet size of N:
#  Full matrix size:N*N
#  Half matrix size: N(N+1)/2
#  
#  If you provide a full matrix, the constructore will create a half-matrix
#  automatically.
#  If you provide a half-matrix, make sure
#  of a (low, high) sorted order in the keys: there should only be 
#  a ('A','C') not a ('C','A').
#
# Internal functions:
# 
# Generating the observed frequency matrix (OFM):
# ----------------------------------------------
#  Use: OFM = _build_obs_freq_mat(ARM)
#  The OFM is generated from the ARM, only instead of replacement counts, it
#  contains replacement frequencies.
# Generating an expected frequency matrix (EFM):
# ---------------------------------------------
#  Use: EFM = _build_exp_freq_mat(OFM,exp_freq_table)
#  exp_freq_table: should be a freqTableC instantiation. See freqTable.py for
#  detailed information. Briefly, the expected frequency table has the
#  frequencies of appearance for each member of the alphabet
# Generating a substitution frequency matrix (SFM):
# ------------------------------------------------
#  Use: SFM = _build_subs_mat(OFM,EFM)
#  Accepts an OFM, EFM. Provides the division product of the corresponding
#  values. 
# Generating a log-odds matrix (LOM):
# ----------------------------------
#  Use: LOM=_build_log_odds_mat(SFM[,logbase=10,factor=10.0,roundit=1])
#  Accepts an SFM. logbase: base of the logarithm used to generate the
#  log-odds values. factor: factor used to multiply the log-odds values.
#  roundit: default - true. Whether to round the values.
#  Each entry is generated by log(LOM[key])*factor
#  And rounded if required.
#
# External:
# ---------
# In most cases, users will want to generate a log-odds matrix only, without
# explicitly calling the OFM --> EFM --> SFM stages. The function
# build_log_odds_matrix does that. User provides an ARM and an expected
# frequency table. The function returns the log-odds matrix
#
class SeqMat(dict):
   """A Generic sequence matrix class
   The key is a 2-tuple containing the letter indices of the matrix. Those
   should be sorted in the tuple (low, high). Because each matrix is dealt
   with as a half-matrix."""

   def _alphabet_from_matrix(self):
      ab_dict = {}
      s = ''
      for i in self.keys():
         ab_dict[i[0]] = 1
         ab_dict[i[1]] = 1
      letters_list = ab_dict.keys()
      letters_list.sort()
      for i in letters_list:
         s = s + i
      self.alphabet.letters = s

   def __init__(self,data=None, alphabet=None,
             mat_type=NOTYPE,mat_name='',build_later=0):
      # User may supply:
      # data: matrix itself
      # mat_type: its type. See below
      # mat_name: its name. See below.
      # alphabet: an instance of Bio.Alphabet, or a subclass. If not
      # supplied, constructor builds its own from that matrix."""
      # build_later: skip the matrix size assertion. User will build the
      # matrix after creating the instance. Constructor builds a half matrix
      # filled with zeroes.

      assert type(mat_type) == type(1)
      assert type(mat_name) == type('')

      # "data" may be:
      # 1) None --> then self.data is an empty dictionary
      # 2) type({}) --> then self.data takes the items in data
      # 3) An instance of SeqMat
      # This whole creation-during-execution is done to avoid changing
      # default values, the way Python does because default values are
      # created when the function is defined, not when it is created.
      assert (type(data) == type({}) or isinstance(data,dict) or
              data == None)
      if data == None:
         data = {}
      else:
         self.update(data)
      if alphabet == None:
         alphabet = Alphabet.Alphabet()
      assert Alphabet.generic_alphabet.contains(alphabet)
      self.alphabet = alphabet

      # If passed alphabet is empty, use the letters in the matrix itself
      if not self.alphabet.letters:
         self._alphabet_from_matrix()
      # Assert matrix size: half or full
      if not build_later:
         N = len(self.alphabet.letters)
         assert len(self) == N**2 or len(self) == N*(N+1)/2
      self.ab_list = list(self.alphabet.letters)
      self.ab_list.sort()
      # type can be: ACCREP, OBSFREQ, SUBS, EXPFREQ, LO
      self.mat_type = mat_type
      # Names: a string like "BLOSUM62" or "PAM250"
      self.mat_name = mat_name
      if build_later:
         self._init_zero()
      else:
			# Convert full to half if matrix is not already a log-odds matrix
         if self.mat_type <> LO:
            self._full_to_half()
         self._correct_matrix()
      self.sum_letters = {}
      self.relative_entropy = 0

   def _correct_matrix(self):
      keylist = self.keys()
      for key in keylist:
         if key[0] > key[1]:
            self[(key[1],key[0])] = self[key]
            del self[key]

   def _full_to_half(self):
      """
      Convert a full-matrix to a half-matrix
      """
      # For instance: two entries ('A','C'):13 and ('C','A'):20 will be summed
      # into ('A','C'): 33 and the index ('C','A') will be deleted
      # alphabet.letters:('A','A') and ('C','C') will remain the same.

      N = len(self.alphabet.letters)
      # Do nothing if this is already a half-matrix
      if len(self) == N*(N+1)/2:
         return
      for i in self.ab_list:
         for j in self.ab_list[:self.ab_list.index(i)+1]:
            if i <> j:
               self[j,i] = self[j,i] + self[i,j]
               del self[i,j]

   def _init_zero(self):
      for i in self.ab_list:
         for j in self.ab_list[:self.ab_list.index(i)+1]:
            self[j,i] = 0.

   def make_relative_entropy(self,obs_freq_mat):
      """if this matrix is a log-odds matrix, return its entropy
      Needs the observed frequency matrix for that"""
      ent = 0.
      if self.mat_type == LO:
         for i in self.keys():
            ent += obs_freq_mat[i]*self[i]/log(2)
      elif self.mat_type == SUBS:
         for i in self.keys():
            if self[i] > EPSILON:
               ent += obs_freq_mat[i]*log(self[i])/log(2)
      else:
         raise TypeError,"entropy: substitution or log-odds matrices only"
      self.relative_entropy = ent
   #
   def make_entropy(self):
      self.entropy = 0
      for i in self.keys():
         if self[i] > EPSILON:
            self.entropy += self[i]*log(self[i])/log(2)
      self.entropy = -self.entropy
   def letter_sum(self,letter):
      assert letter in self.alphabet.letters
      sum = 0.
      for i in self.keys():
         if letter in i:
            if i[0] == i[1]:
               sum += self[i]
            else:
               sum += (self[i] / 2.)
      return sum

   def all_letters_sum(self):
      for letter in self.alphabet.letters:
         self.sum_letters[letter] = self.letter_sum(letter)
   def print_full_mat(self,f=None,format="%4d",topformat="%4s",
              alphabet=None,factor=1,non_sym=None):
      f = f or sys.stdout 
      # create a temporary dictionary, which holds the full matrix for
      # printing
      assert non_sym == None or type(non_sym) == type(1.) or \
      type(non_sym) == type(1)
      full_mat = copy.copy(self)
      for i in self:
         if i[0] <> i[1]:
            full_mat[(i[1],i[0])] = full_mat[i]
      if not alphabet:
         alphabet = self.ab_list
      topline = ''
      for i in alphabet:
         topline = topline + topformat % i
      topline = topline + '\n'
      f.write(topline)
      for i in alphabet:
         outline = i
         for j in alphabet:
            if alphabet.index(j) > alphabet.index(i) and non_sym <> None:
               val = non_sym
            else:
               val = full_mat[i,j]
               val *= factor
            if val <= -999:
               cur_str = '  ND' 
            else:
               cur_str = format % val
            
            outline = outline+cur_str
         outline = outline+'\n'
         f.write(outline)

   def print_mat(self,f=None,format="%4d",bottomformat="%4s",
              alphabet=None,factor=1):
      """Print a nice half-matrix. f=sys.stdout to see on the screen
      User may pass own alphabet, which should contain all letters in the
      alphabet of the matrix, but may be in a different order. This
      order will be the order of the letters on the axes"""
      
      f = f or sys.stdout
      if not alphabet:
         alphabet = self.ab_list
      bottomline = ''
      for i in alphabet:
         bottomline = bottomline + bottomformat % i
      bottomline = bottomline + '\n'
      for i in alphabet:
         outline = i
         for j in alphabet[:alphabet.index(i)+1]:
            try:
               val = self[j,i]
            except KeyError:
               val = self[i,j]
            val *= factor
            if val == -999:
               cur_str = '  ND' 
            else:
               cur_str = format % val
            
            outline = outline+cur_str
         outline = outline+'\n'
         f.write(outline)
      f.write(bottomline)
   def __sub__(self,other):
      """ returns a number which is the subtraction product of the two matrices"""
      mat_diff = 0
      for i in self.keys():
         mat_diff += (self[i] - other[i])
      return mat_diff
   def __mul__(self,other):
      """ returns a matrix for which each entry is the multiplication product of the
      two matrices passed"""
      new_mat = copy.copy(self)
      for i in self.keys():
         new_mat[i] *= other[i]
      return new_mat
   def __sum__(self, other):
      new_mat = copy.copy(self)
      for i in self.keys():
         new_mat[i] += other[i]
      return new_mat

def _build_obs_freq_mat(acc_rep_mat):
   """
   build_obs_freq_mat(acc_rep_mat):
   Build the observed frequency matrix, from an accepted replacements matrix
   The accRep matrix should be generated by the user.
   """
   # Note: acc_rep_mat should already be a half_matrix!!
   sum = 0.
   for i in acc_rep_mat.values():
      sum += i
   obs_freq_mat = SeqMat(alphabet=acc_rep_mat.alphabet,build_later=1)
   for i in acc_rep_mat.keys():
      obs_freq_mat[i] = acc_rep_mat[i]/sum
   obs_freq_mat.mat_type = OBSFREQ
   return obs_freq_mat

def _exp_freq_table_from_obs_freq(obs_freq_mat):
   exp_freq_table = {}
   for i in obs_freq_mat.alphabet.letters:
      exp_freq_table[i] = 0.
   for i in obs_freq_mat.keys():
      if i[0] == i[1]:
         exp_freq_table[i[0]] += obs_freq_mat[i]
      else:
         exp_freq_table[i[0]] += obs_freq_mat[i] / 2.
         exp_freq_table[i[1]] += obs_freq_mat[i] / 2.
   return FreqTable.FreqTable(exp_freq_table,FreqTable.FREQ)

def _build_exp_freq_mat(exp_freq_table):
   """Build an expected frequency matrix
   exp_freq_table: should be a FreqTable instance
   """
   exp_freq_mat = SeqMat(alphabet=exp_freq_table.alphabet,build_later=1)
   for i in exp_freq_mat.keys():
      if i[0] == i[1]:
         exp_freq_mat[i] = exp_freq_table[i[0]]**2
      else:
         exp_freq_mat[i] = 2.0*exp_freq_table[i[0]]*exp_freq_table[i[1]]
   exp_freq_mat.mat_type = EXPFREQ
   return exp_freq_mat
#
# Build the substitution matrix
#
def _build_subs_mat(obs_freq_mat,exp_freq_mat):
   """ Build the substitution matrix """
   if obs_freq_mat.ab_list <> exp_freq_mat.ab_list:
      raise ValueError, "Alphabet mismatch in passed matrices"
   subs_mat = SeqMat(obs_freq_mat)
   for i in obs_freq_mat.keys():
      subs_mat[i] = obs_freq_mat[i]/exp_freq_mat[i]

   subs_mat.mat_type = SUBS
   return subs_mat

#
# Build a log-odds matrix
#
def _build_log_odds_mat(subs_mat,logbase=2,factor=10.0,round_digit=0,keep_nd=0):
   """_build_log_odds_mat(subs_mat,logbase=10,factor=10.0,round_digit=1):
   Build a log-odds matrix
   logbase=2: base of logarithm used to build (default 2)
   factor=10.: a factor by which each matrix entry is multiplied
   round_digit: roundoff place after decimal point
   keep_nd: if true, keeps the -999 value for non-determined values (for which there
   are no substitutions in the frequency substitutions matrix). If false, plants the
   minimum log-odds value of the matrix in entries containing -999
   """
   lo_mat = SeqMat(subs_mat)
   for i in subs_mat.keys():
      if subs_mat[i] < EPSILON:
         lo_mat[i] = -999
      else:
         lo_mat[i] = round(factor*log(subs_mat[i])/log(logbase),round_digit)
   lo_mat.mat_type = LO
   mat_min = min(lo_mat.values())
   if not keep_nd:
      for i in lo_mat.keys():
         if lo_mat[i] <= -999:
            lo_mat[i] = mat_min
   return lo_mat

#
# External function. User provides an accepted replacement matrix, and,
# optionally the following: expected frequency table, log base, mult. factor,
# and rounding factor. Generates a log-odds matrix, calling internal SubsMat
# functions.
#
def make_log_odds_matrix(acc_rep_mat,exp_freq_table=None,logbase=2,
                    factor=1.,round_digit=9,keep_nd=0):
   obs_freq_mat = _build_obs_freq_mat(acc_rep_mat)
   if not exp_freq_table:
      exp_freq_table = _exp_freq_table_from_obs_freq(obs_freq_mat)
   exp_freq_mat = _build_exp_freq_mat(exp_freq_table)
   subs_mat = _build_subs_mat(obs_freq_mat, exp_freq_mat)
   lo_mat = _build_log_odds_mat(subs_mat,logbase,factor,round_digit,keep_nd)
   lo_mat.make_relative_entropy(obs_freq_mat)
   return lo_mat
def observed_frequency_to_substitution_matrix(obs_freq_mat):
   exp_freq_table = _exp_freq_table_from_obs_freq(obs_freq_mat)
   exp_freq_mat = _build_exp_freq_mat(exp_freq_table)
   subs_mat = _build_subs_mat(obs_freq_mat, exp_freq_mat)
   return subs_mat
def read_text_matrix(data_file,mat_type=NOTYPE):
   matrix = {}
   tmp = string.split(data_file.read(),"\n")
   table=[]
   for i in tmp: 
      table.append(string.split(i))
   # remove records beginning with ``#''
   for rec in table[:]:
      if (rec.count('#') > 0):
         table.remove(rec)

   # remove null lists
   while (table.count([]) > 0):
      table.remove([])
   # build a dictionary
   alphabet = table[0]
   j = 0
   for rec in table[1:]:
      # print j
      row = alphabet[j]
      # row = rec[0]
      if re.compile('[A-z\*]').match(rec[0]):
         first_col = 1
      else:
         first_col = 0
      i = 0
      for field in rec[first_col:]:
         col = alphabet[i]
         matrix[(row,col)] = string.atof(field)
         i += 1
      j += 1
   # delete entries with an asterisk
   for i in matrix.keys():
      if '*' in i: del(matrix[i])
   ret_mat = SeqMat(matrix,mat_type=mat_type)
   return ret_mat

diagNO = 1
diagONLY = 2
diagALL = 3

def two_mat_relative_entropy(mat_1,mat_2,logbase=2,diag=diagALL):
   rel_ent = 0.
   key_list_1 = mat_1.keys(); key_list_2 = mat_2.keys()
   key_list_1.sort(); key_list_2.sort()
   key_list = []
   sum_ent_1 = 0.; sum_ent_2 = 0.
   for i in key_list_1:
      if i in key_list_2:
         key_list.append(i)
   if len(key_list_1) <> len(key_list_2):
   
      sys.stderr.write("Warning:first matrix has more entries than the second\n")
   if key_list_1 <> key_list_2:
      sys.stderr.write("Warning: indices not the same between matrices\n")
   for key in key_list:
      if diag == diagNO and key[0] == key[1]:
         continue
      if diag == diagONLY and key[0] <> key[1]:
         continue
      if mat_1[key] > EPSILON and mat_2[key] > EPSILON:
         sum_ent_1 += mat_1[key]
         sum_ent_2 += mat_2[key]
         
   for key in key_list:
      if diag == diagNO and key[0] == key[1]:
         continue
      if diag == diagONLY and key[0] <> key[1]:
         continue
      if mat_1[key] > EPSILON and mat_2[key] > EPSILON:
         val_1 = mat_1[key] / sum_ent_1
         val_2 = mat_2[key] / sum_ent_2
#         rel_ent += mat_1[key] * log(mat_1[key]/mat_2[key])/log(logbase)
         rel_ent += val_1 * log(val_1/val_2)/log(logbase)
   return rel_ent

## Gives the linear correlation coefficient between two matrices
#def two_mat_correlation(mat_1, mat_2):
# Wait for the statistical package before uncommenting this
#
#   corr_list = []
#   assert mat_1.ab_list == mat_2.ab_list
#   for ab_pair in mat_1.keys():
#      try:
#         corr_list.append((mat_1[ab_pair], mat_2[ab_pair]))
#      except KeyError:
#         sys.stderr.write("Error:two_mat_correlation: %s is not a common key\n" %
#                           mat_1)
#   return statfns.corr(corr_list)

# Jensen-Shannon Distance
# Need to input observed frequency matrices
def two_mat_DJS(mat_1,mat_2,pi_1=0.5,pi_2=0.5):
   assert mat_1.ab_list == mat_2.ab_list
   assert pi_1 > 0 and pi_2 > 0 and pi_1< 1 and pi_2 <1
   assert not (pi_1 + pi_2 - 1.0 > EPSILON)
   sum_mat = SeqMat(build_later=1)
   sum_mat.ab_list = mat_1.ab_list
   for i in mat_1.keys():
      sum_mat[i] = pi_1 * mat_1[i] + pi_2 * mat_2[i]
   sum_mat.make_entropy()
   mat_1.make_entropy()
   mat_2.make_entropy()
   # print mat_1.entropy, mat_2.entropy
   dJS = sum_mat.entropy - pi_1 * mat_1.entropy - pi_2 *mat_2.entropy
   return dJS
      
"""
This isn't working yet. Boo hoo!
def two_mat_print(mat_1, mat_2, f=None,alphabet=None,factor_1=1, factor_2=1,
                  format="%4d",bottomformat="%4s",topformat="%4s",
                  topindent=7*" ", bottomindent=1*" "):
   f = f or sys.stdout
   if not alphabet:
      assert mat_1.ab_list == mat_2.ab_list
      alphabet = mat_1.ab_list
   len_alphabet = len(alphabet)
   print_mat = {}
   topline = topindent
   bottomline = bottomindent
   for i in alphabet:
      bottomline += bottomformat % i
      topline += topformat % alphabet[len_alphabet-alphabet.index(i)-1]
   topline += '\n'
   bottomline += '\n'
   f.write(topline)
   for i in alphabet:
      for j in alphabet:
         print_mat[i,j] = -999
   diag_1 = {}; diag_2 = {}
   for i in alphabet:
      for j in alphabet[:alphabet.index(i)+1]:
         if i == j:
            diag_1[i] = mat_1[(i,i)] 
            diag_2[i] = mat_2[(alphabet[len_alphabet-alphabet.index(i)-1],
                   alphabet[len_alphabet-alphabet.index(i)-1])]
         else:
            if i > j:
               key = (j,i)
            else:
               key = (i,j)
            mat_2_key = [alphabet[len_alphabet-alphabet.index(key[0])-1],
                   alphabet[len_alphabet-alphabet.index(key[1])-1]]
            # print mat_2_key
            mat_2_key.sort(); mat_2_key = tuple(mat_2_key)
            # print key ,"||",  mat_2_key
            print_mat[key] = mat_2[mat_2_key] 
            print_mat[(key[1],key[0])] = mat_1[key]
   for i in alphabet:
      outline = i
      for j in alphabet:
         if i == j:
            if diag_1[i] == -999:
               val_1 = ' ND'
            else:
               val_1 = format % (diag_1[i]*factor_1)
            if diag_2[i] == -999:
               val_2 = ' ND'
            else:
               val_2 = format % (diag_2[i]*factor_2)
            cur_str = val_1 + "  " + val_2
         else:
            if print_mat[(i,j)] == -999:
               val = ' ND'
            elif alphabet.index(i) > alphabet.index(j):
               val = format % (print_mat[(i,j)]*factor_1)
            else:
               val = format % (print_mat[(i,j)]*factor_2)
            cur_str = val
         outline += cur_str
      outline += bottomformat % (alphabet[len_alphabet-alphabet.index(i)-1] +
                                 '\n')
      f.write(outline)
   f.write(bottomline)
"""