1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178
|
"""
Sequence distance metrics (:mod:`skbio.sequence.distance`)
==========================================================
.. currentmodule:: skbio.sequence.distance
This module contains functions for computing distances between scikit-bio
``Sequence`` objects. These functions can be used directly or supplied to other
parts of the scikit-bio API that accept a sequence distance metric as input,
such as :meth:`skbio.sequence.Sequence.distance` and
:meth:`skbio.stats.distance.DistanceMatrix.from_iterable`.
Functions
---------
.. autosummary::
:toctree: generated/
hamming
kmer_distance
"""
# ----------------------------------------------------------------------------
# Copyright (c) 2013--, scikit-bio development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------
import numpy as np
import scipy.spatial.distance
import skbio
from skbio.util._decorator import experimental
@experimental(as_of='0.4.2')
def hamming(seq1, seq2):
"""Compute Hamming distance between two sequences.
The Hamming distance between two equal-length sequences is the proportion
of differing characters.
Parameters
----------
seq1, seq2 : Sequence
Sequences to compute Hamming distance between.
Returns
-------
float
Hamming distance between `seq1` and `seq2`.
Raises
------
TypeError
If `seq1` and `seq2` are not ``Sequence`` instances.
TypeError
If `seq1` and `seq2` are not the same type.
ValueError
If `seq1` and `seq2` are not the same length.
See Also
--------
scipy.spatial.distance.hamming
Notes
-----
``np.nan`` will be returned if the sequences do not contain any characters.
This function does not make assumptions about the sequence alphabet in use.
Each sequence object's underlying sequence of characters are used to
compute Hamming distance. Characters that may be considered equivalent in
certain contexts (e.g., `-` and `.` as gap characters) are treated as
distinct characters when computing Hamming distance.
Examples
--------
>>> from skbio import Sequence
>>> from skbio.sequence.distance import hamming
>>> seq1 = Sequence('AGGGTA')
>>> seq2 = Sequence('CGTTTA')
>>> hamming(seq1, seq2)
0.5
"""
_check_seqs(seq1, seq2)
# Hamming requires equal length sequences. We are checking this here
# because the error you would get otherwise is cryptic.
if len(seq1) != len(seq2):
raise ValueError(
"Hamming distance can only be computed between sequences of equal "
"length (%d != %d)" % (len(seq1), len(seq2)))
# scipy throws a RuntimeWarning when computing Hamming distance on length 0
# input.
if not seq1:
distance = np.nan
else:
distance = scipy.spatial.distance.hamming(seq1.values, seq2.values)
return float(distance)
@experimental(as_of='0.5.0')
def kmer_distance(seq1, seq2, k, overlap=True):
"""Compute the kmer distance between a pair of sequences
The kmer distance between two sequences is the fraction of kmers that are
unique to either sequence.
Parameters
----------
seq1, seq2 : Sequence
Sequences to compute kmer distance between.
k : int
The kmer length.
overlap : bool, optional
Defines whether the kmers should be overlapping or not.
Returns
-------
float
kmer distance between `seq1` and `seq2`.
Raises
------
ValueError
If `k` is less than 1.
TypeError
If `seq1` and `seq2` are not ``Sequence`` instances.
TypeError
If `seq1` and `seq2` are not the same type.
Notes
-----
kmer counts are not incorporated in this distance metric.
``np.nan`` will be returned if there are no kmers defined for the
sequences.
Examples
--------
>>> from skbio import Sequence
>>> seq1 = Sequence('ATCGGCGAT')
>>> seq2 = Sequence('GCAGATGTG')
>>> kmer_distance(seq1, seq2, 3) # doctest: +ELLIPSIS
0.9230769230...
"""
_check_seqs(seq1, seq2)
seq1_kmers = set(map(str, seq1.iter_kmers(k, overlap=overlap)))
seq2_kmers = set(map(str, seq2.iter_kmers(k, overlap=overlap)))
all_kmers = seq1_kmers | seq2_kmers
if not all_kmers:
return np.nan
shared_kmers = seq1_kmers & seq2_kmers
number_unique = len(all_kmers) - len(shared_kmers)
fraction_unique = number_unique / len(all_kmers)
return fraction_unique
def _check_seqs(seq1, seq2):
# Asserts both sequences are skbio.sequence objects
for seq in seq1, seq2:
if not isinstance(seq, skbio.Sequence):
raise TypeError(
"`seq1` and `seq2` must be Sequence instances, not %r"
% type(seq).__name__)
# Asserts sequences have the same type
if type(seq1) is not type(seq2):
raise TypeError(
"Sequences must have matching type. Type %r does not match type %r"
% (type(seq1).__name__, type(seq2).__name__))
|