File: metadata_adder.py

package info (click to toggle)
python-biom-format 2.1.7%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 51,820 kB
  • sloc: python: 12,757; makefile: 155; sh: 79
file content (186 lines) | stat: -rw-r--r-- 7,632 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
# -----------------------------------------------------------------------------
# Copyright (c) 2011-2017, The BIOM Format Development Team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# -----------------------------------------------------------------------------

from __future__ import division

import click

from biom import load_table
from biom.cli import cli
from biom.cli.util import write_biom_table
from biom.parse import MetadataMap
from biom.util import HAVE_H5PY


@cli.command(name='add-metadata')
@click.option('-i', '--input-fp', required=True,
              type=click.Path(exists=True, dir_okay=False),
              help='The input BIOM table')
@click.option('-o', '--output-fp', required=True,
              type=click.Path(exists=False, dir_okay=False),
              help='The output BIOM table')
@click.option('-m', '--sample-metadata-fp', required=False,
              type=click.Path(exists=True, dir_okay=False),
              help='The sample metadata mapping file (will add sample '
                   'metadata to the input BIOM table, if provided).')
@click.option('--observation-metadata-fp', required=False,
              type=click.Path(exists=True, dir_okay=False),
              help='The observation metadata mapping file (will add '
                   'observation metadata to the input BIOM table, if '
                   'provided).')
@click.option('--sc-separated', required=False, type=click.STRING,
              help='Comma-separated list of the metadata fields to split '
                   'on semicolons. This is useful for hierarchical data such '
                   'as taxonomy or functional categories.')
@click.option('--sc-pipe-separated', required=False, type=click.STRING,
              help='Comma-separated list of the metadata fields to split '
                   'on semicolons and pipes ("|"). This is useful for '
                   'hierarchical data such as functional categories with '
                   'one-to-many mappings (e.g. x;y;z|x;y;w)).')
@click.option('--int-fields', required=False, type=click.STRING,
              help='Comma-separated list of the metadata fields to cast '
                   'to integers. This is useful for integer data such as '
                   '"DaysSinceStart".')
@click.option('--float-fields', required=False, type=click.STRING,
              help='Comma-separated list of the metadata fields to cast '
                   'to floating point numbers. This is useful for real number '
                   'data such as "pH".')
@click.option('--sample-header', required=False, type=click.STRING,
              help='Comma-separated list of the sample metadata field '
                   'names. This is useful if a header line is not provided '
                   'with the metadata, if you want to rename the fields, or '
                   'if you want to include only the first n fields where n is '
                   'the number of entries provided here.')
@click.option('--observation-header', required=False, type=click.STRING,
              help='Comma-separated list of the observation metadata '
                   'field names. This is useful if a header line is not '
                   'provided with the metadata, if you want to rename the '
                   'fields, or if you want to include only the first n fields '
                   'where n is the number of entries provided here.')
@click.option('--output-as-json', default=not HAVE_H5PY, is_flag=True,
              help='Write the output file in JSON format.')
def add_metadata(input_fp, output_fp, sample_metadata_fp,
                 observation_metadata_fp, sc_separated, sc_pipe_separated,
                 int_fields, float_fields, sample_header, observation_header,
                 output_as_json):
    """Add metadata to a BIOM table.

    Add sample and/or observation metadata to BIOM-formatted files. See
    examples here: http://biom-format.org/documentation/adding_metadata.html

    Example usage:

    Add sample metadata to a BIOM table:

    $ biom add-metadata -i otu_table.biom -o table_with_sample_metadata.biom
      -m sample_metadata.txt
    """
    # NOTE: the docstring above doubles as the command's --help text, so
    # implementation notes live in comments instead.
    #
    # Flow: load the table from input_fp, open whichever metadata mapping
    # files were supplied, turn the comma-separated option strings into
    # lists, delegate to _add_metadata(), then write the result to output_fp
    # as JSON (when output_as_json is set) or HDF5 otherwise.
    table = load_table(input_fp)

    def _split_fields(value):
        # Options that were not given on the command line arrive as None and
        # must stay None so _add_metadata() can tell they were omitted.
        return None if value is None else value.split(',')

    sample_metadata_f = None
    observation_metadata_f = None
    try:
        # Plain 'r' is used rather than the legacy 'U' mode: 'U' was
        # deprecated in Python 3 and removed in 3.11, where it raises
        # ValueError.
        if sample_metadata_fp is not None:
            sample_metadata_f = open(sample_metadata_fp, 'r')
        if observation_metadata_fp is not None:
            observation_metadata_f = open(observation_metadata_fp, 'r')

        result = _add_metadata(table, sample_metadata_f,
                               observation_metadata_f,
                               _split_fields(sc_separated),
                               _split_fields(sc_pipe_separated),
                               _split_fields(int_fields),
                               _split_fields(float_fields),
                               _split_fields(sample_header),
                               _split_fields(observation_header))
    finally:
        # The mapping files are only needed while _add_metadata() runs;
        # close them even if parsing fails so the handles are not leaked.
        for handle in (sample_metadata_f, observation_metadata_f):
            if handle is not None:
                handle.close()

    fmt = 'json' if output_as_json else 'hdf5'

    write_biom_table(result, fmt, output_fp)


def _split_on_semicolons(x):
    return [e.strip() for e in x.split(';')]


def _split_on_semicolons_and_pipes(x):
    return [[e.strip() for e in y.split(';')] for y in x.split('|')]


def _int(x):
    try:
        return int(x)
    except ValueError:
        return x


def _float(x):
    try:
        return float(x)
    except ValueError:
        return x


def _add_metadata(table, sample_metadata=None, observation_metadata=None,
                  sc_separated=None, sc_pipe_separated=None, int_fields=None,
                  float_fields=None, sample_header=None,
                  observation_header=None):

    if sample_metadata is None and observation_metadata is None:
        raise ValueError('Must specify sample_metadata and/or '
                         'observation_metadata.')

    # define metadata processing functions, if any
    process_fns = {}
    if sc_separated is not None:
        process_fns.update(dict.fromkeys(sc_separated,
                                         _split_on_semicolons))

    if sc_pipe_separated is not None:
        process_fns.update(dict.fromkeys(sc_pipe_separated,
                           _split_on_semicolons_and_pipes))

    if int_fields is not None:
        process_fns.update(dict.fromkeys(int_fields, _int))

    if float_fields is not None:
        process_fns.update(dict.fromkeys(float_fields, _float))

    # parse mapping files
    if sample_metadata is not None:
        sample_metadata = MetadataMap.from_file(sample_metadata,
                                                process_fns=process_fns,
                                                header=sample_header)

    if observation_metadata is not None:
        observation_metadata = MetadataMap.from_file(
            observation_metadata,
            process_fns=process_fns,
            header=observation_header)

    # NAUGHTY: this is modifying the input table IN PLACE!!! And then
    # RETURNING IT! MetadataAdder is angry!

    # add metadata as necessary
    if sample_metadata:
        table.add_metadata(sample_metadata, axis='sample')

    if observation_metadata:
        table.add_metadata(observation_metadata, axis='observation')

    return table