1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264
|
# Copyright 2004 by Jason A. Hackney. All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
from Bio import Seq
from Bio.Alphabet import IUPAC
from math import sqrt
import sys
class Motif:
"""A generic motif class.
A Motif has information about the alphabet used in the motif,
the length of the motif, the motif name, the number of occurrences,
and the individual instances of the motif. Each of the instances
is an object of the Instance class.
Methods:
add_instance(instance): adds a new instance of the motif.
get_instance_by_name(name): returns an instance with the specified name.
"""
def __init__(self):
self.instances = []
self.alphabet = None
self.length = 0
self.num_occurrences = 0
self.name = ""
self.consensus = ""
self.pssm = []
def add_instance (self, instance):
if isinstance(instance, Instance):
if self.length:
instance._length(self.length)
if self.name:
instance._motifname(self.name)
self.instances.append(instance)
def get_instance_by_name (self,name):
for i in set.instances:
if i.seqname == name:
return i
return None
def _alphabet (self, alphabet):
if alphabet == IUPAC.unambiguous_dna or alphabet == IUPAC.protein or alphabet == IUPAC.ambiguous_dna:
self.alphabet = alphabet
else:
return -1
def _length (self, length):
if type(length) == int:
self.length = length
else:
length = int(length)
self.length = length
def _name (self, name):
self.name = name
def _consensus (self, consensus):
if self.alphabet:
self.consensus = Seq.Seq(consensus, self.alphabet)
else:
self.consensus = consensus
def _numoccurrences (self, number):
if type(number) == int:
self.num_occurrences = number
else:
number = int(number)
self.num_occurrences = number
def make_pssm (self):
if self.alphabet == None:
raise ValueError, "Alphabet for motif has not been set"
moieties = ''
if self.alphabet == IUPAC.unambiguous_dna:
moieties = 'ACGT'
if self.alphabet == IUPAC.protein:
moieties = 'ACDEFGHIKLMNPQRSTVWY'
pssm = []
if self.instances and self.instances[0].sequence:
for position in self.instances[0].sequence:
pos = []
for m in moieties:
pos.append(0.0)
pssm.append(pos)
for instance in self.instances:
for position in range(self.length):
my_moiety = instance.sequence[position]
try:
moiety_index = moieties.index(my_moiety)
except ValueError:
moiety_index = 0
pssm[position][moiety_index] += 1.0
pssm = [[x/len(self.instances) for x in y] for y in pssm ]
pssm = [tuple(x) for x in pssm]
self.pssm = pssm
else:
self.pssm = None
def make_consensus (self, minimum_frequency = 0.6):
if not self.pssm:
self.make_pssm()
consensus = ''
null_character = 'N'
moieties = 'ACGT'
if self.alphabet == IUPAC.protein:
null_character = 'X'
moieties = 'ACDEFGHIKLMNPQRSTVWY'
for position in self.pssm:
this_position = null_character
vals = zip(position,moieties)
good_values = filter(lambda x: x[0] >= minimum_frequency, vals)
if good_values:
letters = [str(x[1]) for x in good_values]
my_letter = '/'.join(letters)
else:
my_letter = null_character
consensus += my_letter
self.consensus = consensus
class MEMEMotif (Motif):
"""A subclass of Motif used in parsing MEME (and MAST) output.
This sublcass defines functions and data specific to MEME motifs.
This includes the evalue for a motif and the PSSM of the motif.
Methods:
add_instance_from_values (name = 'default', pvalue = 1, sequence = 'ATA', start = 0, strand = +): create a new instance of the motif with the specified values.
add_to_pssm (position): add a new position to the pssm. The position should be a list of nucleotide/amino acid frequencies
add_to_logodds (position): add a new position to the log odds matrix. The position should be a tuple of log odds values for the nucleotide/amino acid at that position.
compare_motifs (other_motif): returns the maximum correlation between this motif and other_motif
"""
def __init__ (self):
Motif.__init__(self)
self.evalue = 0.0
self.pssm = []
self.logodds = []
def add_instance_from_values (self, name = 'default', pvalue = 1, sequence = 'ATA', start = 0, strand = '+'):
inst = Instance()
inst._sequence(sequence)
inst._pvalue(pvalue)
inst._seqname(name)
inst._start(start)
inst._strand(strand)
if self.length:
inst._length(self.length)
if self.name:
inst._motifname(self.name)
self.instances.append(inst)
def _evalue (self, evalue):
if type(evalue) == float:
self.evalue = evalue
else:
evalue = float(evalue)
self.evalue = evalue
def add_to_pssm (self, thisposition):
self.pssm.append(thisposition)
def add_to_logodds (self, thisposition):
self.logodds.append(thisposition)
def compare_motifs (self, motif):
if isinstance(motif, MEMEMotif):
if not self.pssm:
raise ValueError, 'This motif does not have a PSSM'
if not motif.pssm:
raise ValueError, 'The other motif does not have a PSSM'
mylen = len(self.pssm)
yourlen = len(motif.pssm)
myr = None
if mylen == yourlen:
myr = _corr(self.pssm, motif.pssm)
elif mylen < yourlen:
diff = yourlen - mylen
for i in range(0,diff):
yourpssm = motif.pssm[i:mylen+i]
r = _corr(self.pssm, yourpssm)
if r > myr:
myr = r
else:
diff = mylen - yourlen
for i in range(0, diff):
mypssm = self.pssm[i:yourlen+i]
r = _corr(mypssm, motif.pssm)
if r > myr:
myr = r
return myr
else:
sys.stderr.write(str(m2))
return None
class Instance:
"""A class describing the instances of a motif, and the data thereof.
"""
def __init__ (self):
self.sequence = None
self.sequence_name = ""
self.start = 0
self.pvalue = 1.0
self.strand = 0
self.length = 0
self.motif_name = ""
def _sequence (self, seq):
self.sequence = seq
def _seqname (self, name):
self.sequence_name = name
def _motifname (self, name):
self.motif_name = name
def _start (self,start):
start = int(start)
self.start = start
def _pvalue (self,pval):
pval = float(pval)
self.pvalue = pval
def _score (self, score):
score = float(score)
self.score = score
def _strand (self, strand):
self.strand = strand
def _length (self, length):
self.length = length
def _corr (x,y):
sx = 0
sy = 0
sxx = 0
syy = 0
sxy = 0
#make the two-dimensional lists one dimensional
if type(x[0] == list):
x = reduce(lambda a,b: a+b, x)
if type(y[0] == list):
y = reduce(lambda a,b: a+b, y)
sq = lambda t: t*t
length = len(x)
for a,b in zip(x,y):
sx += a
sy += b
sxx += sq(a)
syy += sq(b)
sxy += a*b
s1 = sxy - sx * sy * 1.0/length
s2 = (sxx - sx * sy * 1.0/length) * (syy - sx * sy * 1.0/length)
r = s1 / sqrt (s2)
return r
|