File: biom.py

package info (click to toggle)
python-skbio 0.6.2-4
  • links: PTS, VCS
  • area: main
  • in suites: trixie
  • size: 9,312 kB
  • sloc: python: 60,482; ansic: 672; makefile: 224
file content (130 lines) | stat: -rw-r--r-- 3,861 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
r"""BIOM-Format (:mod:`skbio.io.format.biom`)
============================================

.. currentmodule:: skbio.io.format.biom

The BIOM-Format (format v2.1.0) is an HDF5-based format to represent sample/feature
counts or relative abundances. It is designed specifically for sparse data.
Internally, it stores the data in both compressed sparse row and compressed
sparse column representations. It additionally supports sample and feature
metadata.

.. note::

   Internally, BIOM describes features as observations, whereas scikit-bio uses the
   term features.

Format Support
--------------
**Has Sniffer: Yes**

+------+------+-------------------------------------------------------+
|Reader|Writer|                      Object Class                     |
+======+======+=======================================================+
|Yes   |Yes   |:mod:`skbio.table.Table`                               |
+------+------+-------------------------------------------------------+

Format Specification
--------------------
The official format specification for BIOM-Format can be found at [1]_.

Examples
--------
Here we will write an existing BIOM table, and re-read it. Note that the Table
from ``biom`` implicitly gets the ``.write`` method from the IO registry. This
``BytesIO`` object can be replaced with a file path in a regular use case.

>>> import io, skbio
>>> f = io.BytesIO()
>>> skbio.table.example_table.write(f)  # doctest: +ELLIPSIS
<_io.BytesIO object at ...>
>>> roundtrip = skbio.read(f, into=skbio.Table)
>>> roundtrip
2 x 3 <class 'biom.table.Table'> with 5 nonzero entries (83% dense)

References
----------
.. [1] http://biom-format.org/documentation/format_versions/biom-2.1.html

"""  # noqa: D205, D415

# ----------------------------------------------------------------------------
# Copyright (c) 2013--, scikit-bio development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE.txt, distributed with this software.
# ----------------------------------------------------------------------------

import h5py

import skbio
from skbio.io import create_format
from skbio.table import Table

from .. import BIOMFormatError


# Format object for "biom", declared with binary encoding. The sniffer,
# readers and writer defined below are attached to it via its decorators.
biom = create_format("biom", encoding="binary")


@biom.sniffer()
def _biom_sniffer(fh):
    """Sniff whether ``fh`` holds a BIOM-Format v2.1 (HDF5) table.

    Parameters
    ----------
    fh : binary file handle
        Must support ``peek``; it is also handed to ``h5py.File`` for reading.

    Returns
    -------
    tuple of (bool, dict)
        ``(True, {})`` when the content starts with the HDF5 signature and the
        root attributes ``format-url`` and ``format-version`` identify BIOM
        2.1; ``(False, {})`` otherwise.

    """
    # this can be buffered, in which case .peek will return the buffer
    # so slice just in case
    magic = fh.peek(8)[:8]

    # From https://en.wikipedia.org/wiki/Hierarchical_Data_Format
    # Note that Wikipedia specifies: "\211HDF\r\n\032\n" which uses octal
    # escapes; they are the same bytes as the hex escapes used below:
    # >>> ord('\211')
    # 137
    # >>> ord('\x89')
    # 137
    # >>> ord('\032')
    # 26
    # >>> ord('\x1a')
    # 26
    if magic != b"\x89HDF\r\n\x1a\n":
        return False, {}

    # Close the HDF5 handle deterministically instead of leaving it to garbage
    # collection; only the two root attributes are needed past this point.
    with h5py.File(fh, "r") as fp:
        url = fp.attrs.get("format-url")
        version = fp.attrs.get("format-version")

    if url is None or version is None:
        return False, {}
    if url != "http://biom-format.org":
        return False, {}
    if list(version) != [2, 1]:
        return False, {}

    return True, {}


@biom.reader(Table)
def _biom_to_table_into(fh):
    """Read ``fh`` as BIOM-Format and return the resulting ``Table``.

    This is the reader registered for ``Table``; the actual parsing is done by
    ``_biom_to_table``.
    """
    table = _biom_to_table(fh)
    return table


@biom.reader(None)
def _biom_to_table_default(fh):
    """Read ``fh`` as BIOM-Format when no target class was requested.

    Returns a one-item tuple holding the table parsed by ``_biom_to_table``.
    """
    # A format-only read such as skbio.read('foo.biom', format='biom') hands
    # back a generator that subsequently iterates the table. Wrapping the
    # table in a single-item tuple yields the expected behavior:
    #     next(skbio.read('foo.biom', format='biom')) == Table
    table = _biom_to_table(fh)
    return (table,)


def _biom_to_table(fh):
    """Parse an HDF5 BIOM file handle into a ``Table``.

    Parameters
    ----------
    fh : binary file handle
        Opened read-only through ``h5py.File``.

    Returns
    -------
    Table
        The object produced by ``Table.from_hdf5``.

    """
    # Close the HDF5 handle as soon as parsing finishes rather than relying on
    # garbage collection.
    # NOTE(review): assumes Table.from_hdf5 loads the table fully into memory,
    # so the result stays usable after the handle is closed -- confirm.
    with h5py.File(fh, "r") as h5grp:
        return Table.from_hdf5(h5grp)


@biom.writer(Table)
def _sktable_to_biom(obj, fh):
    """Write ``obj`` to ``fh`` in BIOM-Format.

    This is the writer registered for ``Table``; the actual serialization is
    done by ``_table_to_biom``.
    """
    _table_to_biom(obj, fh)


def _table_to_biom(obj, fh):
    """Serialize a table to ``fh`` as an HDF5 BIOM file.

    Parameters
    ----------
    obj : object with a ``to_hdf5`` method
        The table to write.
    fh : binary file handle
        Opened for writing through ``h5py.File``.

    """
    # Leaving the ``with`` block closes the HDF5 handle, so the file is
    # finalized before returning instead of whenever the handle happens to be
    # garbage collected.
    with h5py.File(fh, "w") as h5grp:
        obj.to_hdf5(h5grp, f"Written by scikit-bio version {skbio.__version__}")