File: table_subsetter.py

package info (click to toggle)
python-biom-format 2.1.7%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 51,820 kB
  • sloc: python: 12,757; makefile: 155; sh: 79
file content (142 lines) | stat: -rw-r--r-- 5,381 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
# -----------------------------------------------------------------------------
# Copyright (c) 2011-2017, The BIOM Format Development Team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# -----------------------------------------------------------------------------

from __future__ import division

import click

from biom.cli import cli
from biom.parse import (get_axis_indices, direct_slice_data, direct_parse_key,
                        generatedby)
from biom.table import Table
from biom.util import biom_open, HAVE_H5PY


@cli.command(name='subset-table')
@click.option('-i', '--input-hdf5-fp', default=None,
              type=click.Path(exists=True, dir_okay=False),
              help='the input hdf5 BIOM table filepath to subset')
@click.option('-j', '--input-json-fp', default=None,
              type=click.Path(exists=True, dir_okay=False),
              help='the input json BIOM table filepath to subset')
@click.option('-a', '--axis', required=True,
              type=click.Choice(['sample', 'observation']),
              help='the axis to subset over, either sample or observation')
@click.option('-s', '--ids', required=True,
              type=click.Path(exists=True, dir_okay=False),
              help='a file containing a single column of IDs to retain '
                   '(either sample IDs or observation IDs, depending on the '
                   'axis)')
@click.option('-o', '--output-fp', required=True,
              type=click.Path(writable=True, dir_okay=False),
              help='the output BIOM table filepath')
def subset_table(input_hdf5_fp, input_json_fp, axis, ids, output_fp):
    """Subset a BIOM table.

    Subset a BIOM table, over either observations or samples, without fully
    parsing it. This command is intended to assist in working with very large
    tables when tight on memory, or as a lightweight way to subset a full
    table. Currently, it is possible to produce tables with rows or columns
    (observations or samples) that are fully zeroed.

    Example usage:

    Choose a subset of the observations in table.biom (JSON) and write them to
    subset.biom:

    $ biom subset-table -j table.biom -a observations -s observation_ids.txt \
           -o subset.biom

    Choose a subset of the observations in table.biom (HDF5) and write them to
    subset.biom:

    $ biom subset-table -i table.biom -a observations -s observation_ids.txt \
           -o subset.biom

    """
    if input_json_fp is not None:
        with open(input_json_fp, 'U') as f:
            input_json_fp = f.read()

    with open(ids, 'U') as f:
        ids = []
        for line in f:
            if not line.startswith('#'):
                ids.append(line.strip().split('\t')[0])

    table, format_ = _subset_table(input_hdf5_fp, input_json_fp, axis, ids)

    if format_ == 'json':
        with open(output_fp, 'w') as f:
            for line in table:
                f.write(line)
                f.write('\n')
    else:
        if HAVE_H5PY:
            import h5py
        else:
            # This should never be raised here
            raise ImportError("h5py is not available, cannot write HDF5!")

        with h5py.File(output_fp, 'w') as f:
            table.to_hdf5(f, generatedby())


def _subset_table(hdf5_biom, json_table_str, axis, ids):
    if axis not in ['sample', 'observation']:
        raise ValueError("Invalid axis '%s'. Must be either 'sample' or "
                         "'observation'." % axis)

    if hdf5_biom is None and json_table_str is None:
        raise ValueError("Must specify an input table")
    elif hdf5_biom is not None and json_table_str is not None:
        raise ValueError("Can only specify one input table")

    if json_table_str is not None:
        idxs, new_axis_md = get_axis_indices(json_table_str, ids, axis)
        new_data = direct_slice_data(json_table_str, idxs, axis)

        # multiple walks over the string. bad form, but easy right now
        # ...should add a yield_and_ignore parser or something.
        def subset_generator():
            yield "{"
            yield direct_parse_key(json_table_str, "id")
            yield ","
            yield direct_parse_key(json_table_str, "format")
            yield ","
            yield direct_parse_key(json_table_str, "format_url")
            yield ","
            yield direct_parse_key(json_table_str, "type")
            yield ","
            yield direct_parse_key(json_table_str, "generated_by")
            yield ","
            yield direct_parse_key(json_table_str, "date")
            yield ","
            yield direct_parse_key(json_table_str, "matrix_type")
            yield ","
            yield direct_parse_key(json_table_str, "matrix_element_type")
            yield ","
            yield new_data
            yield ","
            yield new_axis_md
            yield ","

            if axis == "observation":
                yield direct_parse_key(json_table_str, "columns")
            else:
                yield direct_parse_key(json_table_str, "rows")
            yield "}"

        format_ = 'json'
        table = subset_generator()
    else:
        with biom_open(hdf5_biom) as f:
            table = Table.from_hdf5(f, ids=ids, axis=axis)
        format_ = 'hdf5'

    return table, format_