1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142
|
# -----------------------------------------------------------------------------
# Copyright (c) 2011-2017, The BIOM Format Development Team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# -----------------------------------------------------------------------------
from __future__ import division
import click
from biom.cli import cli
from biom.parse import (get_axis_indices, direct_slice_data, direct_parse_key,
generatedby)
from biom.table import Table
from biom.util import biom_open, HAVE_H5PY
@cli.command(name='subset-table')
@click.option('-i', '--input-hdf5-fp', default=None,
type=click.Path(exists=True, dir_okay=False),
help='the input hdf5 BIOM table filepath to subset')
@click.option('-j', '--input-json-fp', default=None,
type=click.Path(exists=True, dir_okay=False),
help='the input json BIOM table filepath to subset')
@click.option('-a', '--axis', required=True,
type=click.Choice(['sample', 'observation']),
help='the axis to subset over, either sample or observation')
@click.option('-s', '--ids', required=True,
type=click.Path(exists=True, dir_okay=False),
help='a file containing a single column of IDs to retain '
'(either sample IDs or observation IDs, depending on the '
'axis)')
@click.option('-o', '--output-fp', required=True,
type=click.Path(writable=True, dir_okay=False),
help='the output BIOM table filepath')
def subset_table(input_hdf5_fp, input_json_fp, axis, ids, output_fp):
"""Subset a BIOM table.
Subset a BIOM table, over either observations or samples, without fully
parsing it. This command is intended to assist in working with very large
tables when tight on memory, or as a lightweight way to subset a full
table. Currently, it is possible to produce tables with rows or columns
(observations or samples) that are fully zeroed.
Example usage:
Choose a subset of the observations in table.biom (JSON) and write them to
subset.biom:
$ biom subset-table -j table.biom -a observations -s observation_ids.txt \
-o subset.biom
Choose a subset of the observations in table.biom (HDF5) and write them to
subset.biom:
$ biom subset-table -i table.biom -a observations -s observation_ids.txt \
-o subset.biom
"""
if input_json_fp is not None:
with open(input_json_fp, 'U') as f:
input_json_fp = f.read()
with open(ids, 'U') as f:
ids = []
for line in f:
if not line.startswith('#'):
ids.append(line.strip().split('\t')[0])
table, format_ = _subset_table(input_hdf5_fp, input_json_fp, axis, ids)
if format_ == 'json':
with open(output_fp, 'w') as f:
for line in table:
f.write(line)
f.write('\n')
else:
if HAVE_H5PY:
import h5py
else:
# This should never be raised here
raise ImportError("h5py is not available, cannot write HDF5!")
with h5py.File(output_fp, 'w') as f:
table.to_hdf5(f, generatedby())
def _subset_table(hdf5_biom, json_table_str, axis, ids):
if axis not in ['sample', 'observation']:
raise ValueError("Invalid axis '%s'. Must be either 'sample' or "
"'observation'." % axis)
if hdf5_biom is None and json_table_str is None:
raise ValueError("Must specify an input table")
elif hdf5_biom is not None and json_table_str is not None:
raise ValueError("Can only specify one input table")
if json_table_str is not None:
idxs, new_axis_md = get_axis_indices(json_table_str, ids, axis)
new_data = direct_slice_data(json_table_str, idxs, axis)
# multiple walks over the string. bad form, but easy right now
# ...should add a yield_and_ignore parser or something.
def subset_generator():
yield "{"
yield direct_parse_key(json_table_str, "id")
yield ","
yield direct_parse_key(json_table_str, "format")
yield ","
yield direct_parse_key(json_table_str, "format_url")
yield ","
yield direct_parse_key(json_table_str, "type")
yield ","
yield direct_parse_key(json_table_str, "generated_by")
yield ","
yield direct_parse_key(json_table_str, "date")
yield ","
yield direct_parse_key(json_table_str, "matrix_type")
yield ","
yield direct_parse_key(json_table_str, "matrix_element_type")
yield ","
yield new_data
yield ","
yield new_axis_md
yield ","
if axis == "observation":
yield direct_parse_key(json_table_str, "columns")
else:
yield direct_parse_key(json_table_str, "rows")
yield "}"
format_ = 'json'
table = subset_generator()
else:
with biom_open(hdf5_biom) as f:
table = Table.from_hdf5(f, ids=ids, axis=axis)
format_ = 'hdf5'
return table, format_
|