File: table_summarizer.py

package info (click to toggle)
python-biom-format 2.1+dfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: jessie, jessie-kfreebsd
  • size: 1,548 kB
  • ctags: 1,051
  • sloc: python: 6,257; makefile: 137; sh: 52
file content (159 lines) | stat: -rw-r--r-- 5,727 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
#!/usr/bin/env python

# -----------------------------------------------------------------------------
# Copyright (c) 2011-2013, The BIOM Format Development Team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# -----------------------------------------------------------------------------

from __future__ import division
from pyqi.core.command import (Command, CommandIn, CommandOut,
                               ParameterCollection)

from numpy import std
from operator import itemgetter
from biom.util import compute_counts_per_sample_stats

# Module authorship, licensing, and maintainer contact metadata.
__author__ = "Greg Caporaso"
__copyright__ = "Copyright 2011-2013, The BIOM Format Development Team"
__credits__ = ["Greg Caporaso", "Daniel McDonald", "Jose Antonio Navas Molina"]
__license__ = "BSD"
__maintainer__ = "Greg Caporaso"
__email__ = "gregcaporaso@gmail.com"


class TableSummarizer(Command):

    """Summarize a BIOM table as a list of human-readable text lines.

    The command takes the table wrapped in a 2-tuple and returns a dict whose
    'biom_summary' entry holds the report lines (counts, summary statistics
    and metadata category names).

     Example usage:
      from biom.commands.table_summarizer import TableSummarizer
      from biom.parse import parse_biom_table
      c = TableSummarizer()
      table_f = open("table.biom")
      t = parse_biom_table(table_f)
      table_f.seek(0)
      result = c(table=(t,None))
      result = c(table=(t,None),qualitative=True)
      result = c(table=(t,table_f),qualitative=True)
      table_f.close()
    """
    BriefDescription = "Summarize sample or observation data in a BIOM table"
    LongDescription = ("Provides details on the observation counts per sample,"
                       " including summary statistics, as well as metadata "
                       "categories associated with samples and observations.")

    CommandIns = ParameterCollection([
        CommandIn(Name='table',
                  DataType=tuple,
                  Description='the input BIOM table',
                  Required=True),
        CommandIn(Name='qualitative',
                  DataType=bool,
                  Description=('Present counts as number of unique '
                               'observation ids per sample, rather than '
                               'counts of observations per sample.'),
                  Required=False,
                  Default=False),
        CommandIn(Name='observations',
                  DataType=bool,
                  Default=False,
                  Description=('Summarize over observations'))
    ])

    CommandOuts = ParameterCollection([
        CommandOut(Name='biom_summary',
                   DataType=list,
                   Description='The table summary')
    ])

    def run(self, **kwargs):
        """Build the summary report for the supplied table.

        Keyword arguments read from ``kwargs``:
          table -- a 2-tuple; the first item is the table object, the second
                   item is unpacked but not used here.
          qualitative -- forwarded to ``compute_counts_per_sample_stats``;
                   when true the total count and table density lines are
                   omitted and the section headers change wording.
          observations -- when true the table is transposed before any
                   statistic is computed, and the sample/observation
                   labels in the report are swapped back accordingly.

        Returns a dict with the single key 'biom_summary' mapped to the list
        of report lines.
        """
        qualitative = kwargs['qualitative']
        by_observations = kwargs['observations']
        # Unpacking (rather than indexing) keeps the 2-tuple requirement.
        table, _ = kwargs['table']

        if by_observations:
            table = table.transpose()

        min_counts, max_counts, median_counts, mean_counts, counts_per_samp =\
            compute_counts_per_sample_stats(table, qualitative)
        num_observations = len(table.ids(axis='observation'))

        # Materialize the values: on Python 3 ``.values()`` is a view object,
        # which numpy.std cannot reduce (it becomes a 0-d object array).
        counts_per_sample_values = list(counts_per_samp.values())

        sample_md = table.metadata()
        if sample_md is None:
            sample_md_keys = ["None provided"]
        else:
            sample_md_keys = sample_md[0].keys()

        observation_md = table.metadata(axis='observation')
        if observation_md is None:
            observation_md_keys = ["None provided"]
        else:
            observation_md_keys = observation_md[0].keys()

        num_samples = len(table.ids())

        if by_observations:
            # ``table`` is the transpose of the input, so what it calls
            # samples are the input's observations and vice versa; swap the
            # values so the labels below refer to the original table.
            num_samples, num_observations = num_observations, num_samples
            sample_md_keys, observation_md_keys = \
                observation_md_keys, sample_md_keys

        lines = [
            'Num samples: %d' % num_samples,
            'Num observations: %d' % num_observations,
        ]

        if not qualitative:
            total_count = sum(counts_per_sample_values)
            lines.append('Total count: %d' % total_count)
            lines.append('Table density (fraction of non-zero values): %1.3f' %
                         table.get_table_density())

        lines.append('')

        if not qualitative:
            lines.append('Counts/sample summary:')
        elif by_observations:
            lines.append('Sample/observations summary:')
        else:
            lines.append('Observations/sample summary:')

        lines.append(' Min: %r' % min_counts)
        lines.append(' Max: %r' % max_counts)
        lines.append(' Median: %1.3f' % median_counts)
        lines.append(' Mean: %1.3f' % mean_counts)
        lines.append(' Std. dev.: %1.3f' % std(counts_per_sample_values))

        lines.append(
            ' Sample Metadata Categories: %s' %
            '; '.join(sample_md_keys))
        lines.append(
            ' Observation Metadata Categories: %s' %
            '; '.join(observation_md_keys))
        lines.append('')

        if qualitative:
            lines.append('Observations/sample detail:')
        else:
            lines.append('Counts/sample detail:')

        # One line per id, ordered by ascending count.
        for k, v in sorted(counts_per_samp.items(), key=itemgetter(1)):
            lines.append(' %s: %r' % (k, v))

        return {'biom_summary': lines}

# Module-level alias for the command class defined above.
# NOTE(review): presumably the name the pyqi framework looks up to locate
# this module's command -- confirm against pyqi's loader.
CommandConstructor = TableSummarizer