File: _embedding.py

package info (click to toggle)
python-skbio 0.6.2-4
  • links: PTS, VCS
  • area: main
  • in suites: trixie
  • size: 9,312 kB
  • sloc: python: 60,482; ansic: 672; makefile: 224
file content (372 lines) | stat: -rw-r--r-- 10,892 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
# ----------------------------------------------------------------------------
# Copyright (c) 2013--, scikit-bio development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE.txt, distributed with this software.
# ----------------------------------------------------------------------------

import numpy as np
import pandas as pd
from scipy.linalg import svd

from skbio.sequence import Sequence
from skbio._base import SkbioObject
from skbio.stats.ordination import OrdinationResults
from skbio.diversity import beta_diversity


def _repr_helper(rstr, org_name, new_name, dim_name, regex_match, shape):
    rstr = rstr.replace(org_name, new_name)
    n_indent = 4  # see Sequence.__repr__
    indent = " " * n_indent
    rstr = rstr.replace(
        regex_match,
        f"{dim_name} dimension: {shape}\n{indent}has gaps",
    )
    return rstr


class Embedding(SkbioObject):
    r"""Embedding for a biological object.

    Parameters
    ----------
    embedding : array_like
        Embedding matrix where the first axis is indexed by `ids`.
    ids : array_like
        IDs of biological objects.
    **kwargs : dict
        Accepted for signature compatibility with subclasses; not used by this
        base class.

    Raises
    ------
    ValueError
        If the first axis of `embedding` and `ids` differ in length.

    """

    @property
    def embedding(self):
        r"""The embedding tensor."""
        return self._embedding

    @property
    def ids(self):
        r"""IDs corresponding to each row of the embedding."""
        # each embedding row corresponds to an id
        return self._ids

    def __init__(self, embedding, ids, **kwargs):
        # Convert before validating so that any array_like input (nested
        # lists, tuples, ...) exposes ``.shape``, not only ndarrays.
        embedding = np.asarray(embedding)

        # make sure that the embedding has one row per id
        ids_len = len(ids)
        if embedding.shape[0] != ids_len:
            raise ValueError(
                f"The embedding ({embedding.shape[0]}) must have the "
                f"same length as the ids ({ids_len})."
            )

        self._embedding = embedding
        self._ids = np.asarray(ids)

    def __str__(self):
        raise NotImplementedError("This method should be implemented by subclasses.")

    def bytes(self):
        r"""Bytes representation of string encoding.

        Returns
        -------
        ndarray of uint8
            ASCII codes of ``str(self)``.

        """
        seq = np.frombuffer(str(self).encode("ascii"), dtype=np.uint8)
        return seq


class SequenceEmbedding(Embedding):
    r"""Per-character embedding of a biological sequence.

    Parameters
    ----------
    embedding : array_like
        Embedding matrix of the sequence; each row holds the latent coordinates
        of one character.
    sequence : str, Sequence, or 1D ndarray
        The characters of the sequence being embedded.

    See Also
    --------
    Embedding
    skbio.sequence.Sequence

    """

    def __init__(self, embedding, sequence, **kwargs):
        # Normalize the input step by step: Sequence -> str -> ASCII bytes.
        # Anything else is handed to ``np.frombuffer`` as is.
        if isinstance(sequence, Sequence):
            sequence = str(sequence)
        raw = sequence.encode("ascii") if isinstance(sequence, str) else sequence
        codes = np.frombuffer(raw, dtype=np.uint8)
        super().__init__(embedding, codes, **kwargs)

    def __str__(self):
        r"""Return the underlying sequence as a string."""
        # the ids are stored as ASCII codes, one per character
        return self._ids.tobytes().decode("ascii")

    @property
    def sequence(self):
        r"""The underlying sequence as a string."""
        return str(self)

    def __repr__(self):
        r"""Return a string representation of the SequenceEmbedding object.

        The representation is derived from that of a ``Sequence`` built from
        the stored characters.

        Returns
        -------
        str
            String representation of the SequenceEmbedding object.

        See Also
        --------
        skbio.sequence.Sequence

        """
        base_repr = repr(Sequence(self.sequence))
        n_dims = self.embedding.shape[1]
        return _repr_helper(
            base_repr,
            "Sequence",
            "SequenceEmbedding",
            "embedding",
            regex_match="length",
            shape=n_dims,
        )


class EmbeddingVector(Embedding):
    r"""Vector representation for a biological entity.

    Parameters
    ----------
    vector : 1D or 2D array_like
        The vector representation of the entity. It is passed unchanged to
        ``Embedding``, so its first axis must have the same length as `obj`.
    obj : array_like
        Identifier(s) of the entity. The first element is used as the string
        representation of the object.
    **kwargs : dict
        Forwarded to ``Embedding``.

    See Also
    --------
    Embedding

    """

    def __init__(self, vector, obj, **kwargs):
        super().__init__(vector, obj, **kwargs)

    def __str__(self):
        r"""Return the first ID as a string."""
        first = self._ids[0]
        # Subclasses in this module store ASCII-encoded bytes, but IDs given
        # directly as text have no ``decode`` method and must not crash here.
        if isinstance(first, bytes):
            return first.decode("ascii")
        return str(first)

    @property
    def vector(self):
        r"""Vector representation for the biological entity."""
        return self._embedding.squeeze()

    @property
    def embedding(self):
        r"""The embedding tensor, reshaped to a single row."""
        return self._embedding.reshape(1, -1)


class SequenceVector(EmbeddingVector):
    r"""Vector representation for a biological sequence.

    Parameters
    ----------
    vector : 1D or 2D array_like
        The vector representation of the sequence. Typically a 1D array. Can also be a
        2D array with only one row.
    sequence : str, Sequence, or 1D ndarray
        Characters representing the sequence itself.
    **kwargs : dict
        Forwarded to ``EmbeddingVector``.

    Raises
    ------
    ValueError
        If `vector` contains more than one row.

    See Also
    --------
    EmbeddingVector
    skbio.sequence.Sequence

    """

    def __init__(self, vector, sequence, **kwargs):
        vector = np.atleast_2d(vector)
        if vector.shape[0] != 1:
            raise ValueError("Only one vector per sequence is allowed.")

        if isinstance(sequence, Sequence):
            sequence = str(sequence)
        if isinstance(sequence, str):
            sequence = sequence.encode("ascii")
        elif isinstance(sequence, np.ndarray):
            # Without this, wrapping an ndarray in a list below would create a
            # 2-D object array whose first element cannot be decoded.
            # NOTE(review): assumes the array holds one ASCII code per byte
            # (e.g. dtype uint8 or "S1") -- confirm against callers.
            sequence = sequence.tobytes()
        # the whole sequence is stored as the single ID of this vector
        sequence = np.array([sequence], dtype="O")

        super().__init__(vector, sequence, **kwargs)

    @property
    def sequence(self):
        r"""String representation of the underlying sequence."""
        return str(self)

    def __repr__(self):
        r"""Return a string representation of the SequenceVector object.

        Returns
        -------
        str
            A string representation of the SequenceVector object.

        See Also
        --------
        skbio.sequence.Sequence

        """
        seq = Sequence(str(self))
        rstr = _repr_helper(
            repr(seq),
            "Sequence",
            "SequenceVector",
            "vector",
            regex_match="length",
            shape=self.embedding.shape[1],
        )
        return rstr


def embed_vec_to_numpy(vectors, validate=True):
    r"""Convert an iterable of EmbeddingVector objects to a NumPy array.

    Parameters
    ----------
    vectors : iterable of EmbeddingVector objects
        An iterable of EmbeddingVector objects, or objects that
        subclass EmbeddingVector. One-shot iterables such as generators are
        supported.
    validate : bool, optional
        If ``True``, validate that all vectors have the same length
        and are valid types.

    Returns
    -------
    ndarray of shape (n_objects, n_features)
        A NumPy array where n_features corresponds to the dimensionality of the latent
        space.

    Raises
    ------
    ValueError
        If validation is enabled and the objects are not EmbeddingVector
        instances, are not all of the same type, or do not have the same
        length.

    """
    # Materialize once: the input is traversed several times below, which
    # would silently exhaust a generator after the first pass.
    vectors = list(vectors)

    if validate:
        if not all(isinstance(ev, EmbeddingVector) for ev in vectors):
            raise ValueError(
                "Input iterable contains objects that "
                "do not subclass EmbeddingVector."
            )

        first_type = type(vectors[0]) if vectors else None
        if any(type(ev) is not first_type for ev in vectors):
            raise ValueError("All objects must be of the same type.")

        # ``vector`` is squeezed, so a one-dimensional embedding is a 0-d
        # array on which ``len`` fails; ``atleast_1d`` gives it length 1.
        lengths = {len(np.atleast_1d(ev.vector)) for ev in vectors}
        if len(lengths) > 1:
            raise ValueError("All vectors must have the same length.")

    return np.vstack([ev.vector for ev in vectors])


def embed_vec_to_distances(vectors, metric="euclidean", validate=True):
    r"""Convert EmbeddingVector objects to a DistanceMatrix object.

    Parameters
    ----------
    vectors : iterable of EmbeddingVector objects
        An iterable of EmbeddingVector objects, or objects that
        subclass EmbeddingVector. One-shot iterables such as generators are
        supported.
    metric : str or callable, optional
        The distance metric to use. Must be a valid metric for
        ``scipy.spatial.distance.pdist``.
    validate : bool, optional
        If ``True``, validate that all vectors have the same length
        and are valid types.

    Returns
    -------
    DistanceMatrix
        A distance matrix representing pairwise distances among objects calculated by
        the given metric.

    See Also
    --------
    skbio.stats.distance.DistanceMatrix

    """
    # Materialize once: the input is traversed twice (values, then IDs), and a
    # generator would otherwise be empty by the time IDs are collected.
    vectors = list(vectors)
    data = embed_vec_to_numpy(vectors, validate=validate)
    ids = [str(ev) for ev in vectors]
    # NOTE(review): ``beta_diversity`` is designed for count data and may apply
    # its own input validation -- confirm it accepts arbitrary embedding values.
    return beta_diversity(metric, data, ids)


def embed_vec_to_ordination(vectors, validate=True):
    r"""Convert EmbeddingVector objects to an Ordination object.

    A singular value decomposition (SVD) is performed on the data.

    Parameters
    ----------
    vectors : iterable of EmbeddingVector objects
        An iterable of EmbeddingVector objects, or objects that subclass
        EmbeddingVector. One-shot iterables such as generators are supported.
    validate : bool, optional
        If ``True``, validate that all vectors have the same length and are valid
        types.

    Returns
    -------
    OrdinationResults
        Ordination results with objects as samples and latent variables as features.

    See Also
    --------
    skbio.stats.ordination.OrdinationResults

    """
    # Materialize once: the input is traversed twice (values, then IDs), and a
    # generator would otherwise be empty by the time IDs are collected.
    vectors = list(vectors)
    data = embed_vec_to_numpy(vectors, validate=validate)
    ids = [str(ev) for ev in vectors]

    u, s, vh = svd(data, full_matrices=False)
    eigvals = s**2
    short_name = "SVD"
    long_name = "Singular Value Decomposition"
    # Sample coordinates are the left singular vectors scaled by the singular
    # values; feature coordinates are the unscaled right singular vectors.
    # Broadcasting ``u * s`` scales column j of ``u`` by ``s[j]``, which is
    # what ``u @ np.diag(s)`` computes without building the diagonal matrix.
    ordr = OrdinationResults(
        short_method_name=short_name,
        long_method_name=long_name,
        eigvals=eigvals,
        proportion_explained=eigvals / eigvals.sum(),
        samples=pd.DataFrame(u * s, index=ids),
        features=pd.DataFrame(vh.T, index=range(data.shape[1])),
    )
    return ordr


def embed_vec_to_dataframe(vectors, validate=True):
    r"""Convert an iterable of EmbeddingVector objects to a pandas DataFrame.

    Parameters
    ----------
    vectors : iterable of EmbeddingVector objects
        An iterable of EmbeddingVector objects, or objects that
        subclass EmbeddingVector. One-shot iterables such as generators are
        supported.
    validate : bool, optional
        If ``True``, validate that all vectors have the same length
        and are valid types.

    Returns
    -------
    pd.DataFrame
        Data frame with one row per object, indexed by the string
        representation of each object, and one column per latent dimension.

    See Also
    --------
    pd.DataFrame

    """
    # Materialize once: the input is traversed twice (values, then IDs), and a
    # generator would otherwise be empty by the time the index is built.
    vectors = list(vectors)
    data = embed_vec_to_numpy(vectors, validate=validate)
    return pd.DataFrame(data, index=[str(ev) for ev in vectors])