File: table_summarizer.py

package info (click to toggle)
python-biom-format 2.1.7%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 51,820 kB
  • sloc: python: 12,757; makefile: 155; sh: 79
file content (146 lines) | stat: -rw-r--r-- 5,382 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
# -----------------------------------------------------------------------------
# Copyright (c) 2011-2017, The BIOM Format Development Team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# -----------------------------------------------------------------------------

from __future__ import division

from operator import itemgetter
import locale

import click
from numpy import std

from biom import load_table
from biom.cli import cli
from biom.util import compute_counts_per_sample_stats


@cli.command(name='summarize-table')
@click.option('-i', '--input-fp', required=True,
              type=click.Path(exists=True, dir_okay=False),
              help='The input BIOM table')
@click.option('-o', '--output-fp', default=None,
              type=click.Path(writable=True, dir_okay=False),
              help='An output file-path')
@click.option('--qualitative', default=False, is_flag=True,
              help="Present counts as number of unique observation ids per"
                   " sample, rather than counts of observations per sample.")
@click.option('--observations', default=False, is_flag=True,
              help="Summarize over observations")
def summarize_table(input_fp, output_fp, qualitative, observations):
    """Summarize sample or observation data in a BIOM table.

    Provides details on the observation counts per sample, including summary
    statistics, as well as metadata categories associated with samples and
    observations.

    Example usage:

    Write a summary of table.biom to table_summary.txt:

    $ biom summarize-table -i table.biom -o table_summary.txt

    """
    # Load the table and render the whole report up front; the only thing
    # left to decide afterwards is where the text goes.
    summary = _summarize_table(load_table(input_fp), qualitative,
                               observations)

    # No output path given: print the report through click and stop.
    if not output_fp:
        click.echo(summary)
        return

    # Otherwise write the report to the requested file exactly as built.
    with open(output_fp, 'w') as out:
        out.write(summary)


def _summarize_table(table, qualitative=False, observations=False):
    """Build the plain-text summary report for a table.

    Parameters
    ----------
    table
        The table to summarize, e.g. as returned by ``biom.load_table``.
    qualitative : bool, optional
        Forwarded to ``compute_counts_per_sample_stats``. When set, the
        "Total count" and "Table density" lines are omitted and the section
        headers read "Observations/sample" instead of "Counts/sample".
    observations : bool, optional
        When set, the table is transposed before the statistics are
        computed; the sample/observation labels in the report are swapped
        back so they still describe the table as it was passed in.

    Returns
    -------
    str
        The report lines joined with newlines (no trailing newline).

    Notes
    -----
    Calls ``locale.setlocale(locale.LC_ALL, '')``, so the process-wide
    locale is switched to the user's default; number grouping in the report
    follows that locale.
    """
    def _fmt(spec, value):
        # locale.format() was deprecated in Python 3.7 and removed in 3.12.
        # locale.format_string() accepts the same arguments and yields the
        # same text, and is available on every supported Python version.
        return locale.format_string(spec, value, grouping=True)

    locale.setlocale(locale.LC_ALL, '')

    if observations:
        table = table.transpose()

    min_counts, max_counts, median_counts, mean_counts, counts_per_samp = \
        compute_counts_per_sample_stats(table, qualitative)
    counts_per_sample_values = list(counts_per_samp.values())

    num_observations = len(table.ids(axis='observation'))
    num_samples = len(table.ids())

    sample_md = table.metadata()
    if sample_md is None:
        sample_md_keys = ["None provided"]
    else:
        sample_md_keys = sample_md[0].keys()

    observation_md = table.metadata(axis='observation')
    if observation_md is None:
        observation_md_keys = ["None provided"]
    else:
        observation_md_keys = observation_md[0].keys()

    if observations:
        # The statistics above were taken from the transposed table, so what
        # it calls "samples" are the caller's observations and vice versa.
        # Swap once here so the labels below match the original orientation.
        num_samples, num_observations = num_observations, num_samples
        sample_md_keys, observation_md_keys = \
            observation_md_keys, sample_md_keys

    lines = [
        'Num samples: ' + _fmt('%d', num_samples),
        'Num observations: ' + _fmt('%d', num_observations),
    ]

    if not qualitative:
        total_count = sum(counts_per_sample_values)
        lines.append('Total count: ' + _fmt('%d', total_count))
        lines.append('Table density (fraction of non-zero values): %1.3f' %
                     table.get_table_density())

    lines.append('')

    if not qualitative:
        lines.append('Counts/sample summary:')
    elif observations:
        lines.append('Sample/observations summary:')
    else:
        lines.append('Observations/sample summary:')

    lines.append(' Min: ' + _fmt('%1.3f', min_counts))
    lines.append(' Max: ' + _fmt('%1.3f', max_counts))
    lines.append(' Median: ' + _fmt('%1.3f', median_counts))
    lines.append(' Mean: ' + _fmt('%1.3f', mean_counts))
    lines.append(' Std. dev.: ' +
                 _fmt('%1.3f', std(counts_per_sample_values)))

    lines.append(' Sample Metadata Categories: %s' %
                 '; '.join(sample_md_keys))
    lines.append(' Observation Metadata Categories: %s' %
                 '; '.join(observation_md_keys))
    lines.append('')

    # NOTE(review): unlike the summary header, the detail header does not
    # change wording when ``observations`` is set; kept as-is so the report
    # text stays identical.
    if qualitative:
        lines.append('Observations/sample detail:')
    else:
        lines.append('Counts/sample detail:')

    # One line per id, ordered by ascending count.
    for ident, count in sorted(counts_per_samp.items(), key=itemgetter(1)):
        lines.append('%s: ' % ident + _fmt('%1.3f', count))

    return "\n".join(lines)