1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224
|
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
#
"""Find and deal with signatures in biological sequence data.
In addition to representing sequences according to motifs (see Motif.py
for more information), we can also use Signatures, which are conserved
regions that are not necessarily consecutive. This may be useful in
the case of very diverged sequences, where signatures may pick out
important conservation that can't be found by motifs (hopefully!).
"""
# biopython
from Bio.Alphabet import _verify_alphabet
from Bio.Seq import Seq
# local stuff
from .Pattern import PatternRepository
class SignatureFinder(object):
"""Find Signatures in a group of sequence records.
In this simple implementation, signatures are just defined as a
two motifs separated by a gap. We need something a lot smarter than
this to find more complicated signatures.
"""
def __init__(self, alphabet_strict=1):
"""Initialize a finder to get signatures.
Arguments:
o alphabet_strict - Specify whether signatures should be required
to have all letters in the signature be consistent with the
alphabet of the original sequence. This requires that all Seqs
used have a consistent alphabet. This helps protect against getting
useless signatures full of ambiguity signals.
"""
self._alphabet_strict = alphabet_strict
def find(self, seq_records, signature_size, max_gap):
"""Find all signatures in a group of sequences.
Arguments:
o seq_records - A list of SeqRecord objects we'll use the sequences
from to find signatures.
o signature_size - The size of each half of a signature (ie. if this
is set at 3, then the signature could be AGC-----GAC)
o max_gap - The maximum gap size between two parts of a signature.
"""
sig_info = self._get_signature_dict(seq_records, signature_size,
max_gap)
return PatternRepository(sig_info)
def _get_signature_dict(self, seq_records, sig_size, max_gap):
"""Return a dictionary with all signatures and their counts.
This internal function does all of the hard work for the
find_signatures function.
"""
if self._alphabet_strict:
alphabet = seq_records[0].seq.alphabet
else:
alphabet = None
# loop through all records to find signatures
all_sigs = {}
for seq_record in seq_records:
# if we are working with alphabets, make sure we are consistent
if alphabet is not None:
assert seq_record.seq.alphabet == alphabet, \
"Working with alphabet %s and got %s" % \
(alphabet, seq_record.seq.alphabet)
# now start finding signatures in the sequence
largest_sig_size = sig_size * 2 + max_gap
for start in range(len(seq_record.seq) - (largest_sig_size - 1)):
# find the first part of the signature
first_sig = str(seq_record.seq[start:start + sig_size])
# now find all of the second parts of the signature
for second in range(start + 1, (start + 1) + max_gap):
second_sig = str(seq_record.seq[second: second + sig_size])
# if we are being alphabet strict, make sure both parts
# of the sig fall within the specified alphabet
if alphabet is not None:
first_seq = Seq(first_sig, alphabet)
second_seq = Seq(second_sig, alphabet)
if _verify_alphabet(first_seq) \
and _verify_alphabet(second_seq):
all_sigs = self._add_sig(all_sigs,
(first_sig, second_sig))
# if we are not being strict, just add the motif
else:
all_sigs = self._add_sig(all_sigs,
(first_sig, second_sig))
return all_sigs
def _add_sig(self, sig_dict, sig_to_add):
"""Add a signature to the given dictionary.
"""
# incrememt the count of the signature if it is already present
if sig_to_add in sig_dict:
sig_dict[sig_to_add] += 1
# otherwise add it to the dictionary
else:
sig_dict[sig_to_add] = 1
return sig_dict
class SignatureCoder(object):
"""Convert a Sequence into its signature representatives.
This takes a sequence and a set of signatures, and converts the
sequence into a list of numbers representing the relative amounts
each signature is seen in the sequence. This allows a sequence to
serve as input into a neural network.
"""
def __init__(self, signatures, max_gap):
"""Initialize with the signatures to look for.
Arguments:
o signatures - A complete list of signatures, in order, that
are to be searched for in the sequences. The signatures should
be represented as a tuple of (first part of the signature,
second_part of the signature) -- ('GATC', 'GATC').
o max_gap - The maximum gap we can have between the two
elements of the signature.
"""
self._signatures = signatures
self._max_gap = max_gap
# check to be sure the signatures are all the same size
# only do this if we actually have signatures
if len(self._signatures) > 0:
first_sig_size = len(self._signatures[0][0])
second_sig_size = len(self._signatures[0][1])
assert first_sig_size == second_sig_size, \
"Ends of the signature do not match: %s" \
% self._signatures[0]
for sig in self._signatures:
assert len(sig[0]) == first_sig_size, \
"Got first part of signature %s, expected size %s" % \
(sig[0], first_sig_size)
assert len(sig[1]) == second_sig_size, \
"Got second part of signature %s, expected size %s" % \
(sig[1], second_sig_size)
def representation(self, sequence):
"""Convert a sequence into a representation of its signatures.
Arguments:
o sequence - A Seq object we are going to convert into a set of
signatures.
Returns:
A list of relative signature representations. Each item in the
list corresponds to the signature passed in to the initializer and
is the number of times that the signature was found, divided by the
total number of signatures found in the sequence.
"""
# check to be sure we have signatures to deal with,
# otherwise just return an empty list
if len(self._signatures) == 0:
return []
# initialize a dictionary to hold the signature counts
sequence_sigs = {}
for sig in self._signatures:
sequence_sigs[sig] = 0
# get a list of all of the first parts of the signatures
all_first_sigs = []
for sig_start, sig_end in self._signatures:
all_first_sigs.append(sig_start)
# count all of the signatures we are looking for in the sequence
sig_size = len(self._signatures[0][0])
smallest_sig_size = sig_size * 2
for start in range(len(sequence) - (smallest_sig_size - 1)):
# if the first part matches any of the signatures we are looking
# for, then expand out to look for the second part
first_sig = str(sequence[start:start + sig_size])
if first_sig in all_first_sigs:
for second in range(start + sig_size,
(start + sig_size + 1) + self._max_gap):
second_sig = str(sequence[second:second + sig_size])
# if we find the motif, increase the counts for it
if (first_sig, second_sig) in sequence_sigs:
sequence_sigs[(first_sig, second_sig)] += 1
# -- normalize the signature info to go between zero and one
min_count = min(sequence_sigs.values())
max_count = max(sequence_sigs.values())
# as long as we have some signatures present, normalize them
# otherwise we'll just return 0 for everything
if max_count > 0:
for sig in sequence_sigs:
sequence_sigs[sig] = (float(sequence_sigs[sig] - min_count) /
float(max_count))
# return the relative signature info in the specified order
sig_amounts = []
for sig in self._signatures:
sig_amounts.append(sequence_sigs[sig])
return sig_amounts
|