1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
|
# ----------------------------------------------------------------------------
# Copyright (c) 2016-2022, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
import os.path
import pkg_resources
import biom
import q2templates
import numpy as np
import pandas as pd
import seaborn as sns
TEMPLATES = pkg_resources.resource_filename(
'q2_feature_table._core_features', 'core_features_assets')
def core_features(output_dir, table: biom.Table, min_fraction: float = 0.5,
max_fraction: float = 1.0, steps: int = 11) -> None:
if max_fraction < min_fraction:
raise ValueError('min_fraction (%r) parameter must be less than '
'max_fraction (%r) parameter.' %
(min_fraction, max_fraction))
index_fp = os.path.join(TEMPLATES, 'index.html')
context = {
'num_samples': table.shape[1],
'num_features': table.shape[0]
}
if min_fraction == max_fraction:
fractions = [min_fraction]
else:
fractions = np.linspace(min_fraction, max_fraction, steps)
rounded_fractions = _round_fractions(fractions)
data = []
file_links = []
for fraction, rounded_fraction in zip(fractions, rounded_fractions):
core_features = _get_core_features(table, fraction)
core_feature_count = len(core_features)
data.append([fraction, core_feature_count])
if core_feature_count > 0:
core_feature_fn = 'core-features-%s.tsv' % rounded_fraction
core_feature_fp = os.path.join(output_dir, core_feature_fn)
file_links.append("<a href='./%s'>TSV</a>" % core_feature_fn)
core_features.to_csv(core_feature_fp, sep='\t',
index_label='Feature ID')
else:
file_links.append('No core features')
df = pd.DataFrame(data, columns=['Fraction of samples', 'Feature count'])
df['Fraction of features'] = df['Feature count'] / table.shape[0]
df['Feature list'] = file_links
# newer versions of seaborn don't like dataframes with fewer than two rows
if len(fractions) > 1:
ax = sns.regplot(data=df, x='Fraction of samples', y='Feature count',
fit_reg=False)
# matplotlib will issue a UserWarning if attempting to set left and
# right bounds to the same value.
ax.set_xbound(min(fractions), max(fractions))
ax.set_ybound(0, max(df['Feature count']) + 1)
ax.get_figure().savefig(
os.path.join(output_dir, 'core-feature-counts.svg'))
context['show_plot'] = True
context['table_html'] = q2templates.df_to_html(df, index=False,
escape=False)
q2templates.render(index_fp, output_dir, context=context)
def _get_core_features(table, fraction):
filter_f = _get_filter_to_core_f(table, fraction)
feature_filtered_table = table.filter(
filter_f, axis='observation', inplace=False)
index = []
data = []
for values, id_, _ in feature_filtered_table.iter(axis='observation'):
index.append(id_)
data.append(_seven_number_summary(values))
if len(data) > 0:
return pd.DataFrame(data, index=index).sort_values(
by='50%', ascending=False)
else:
return pd.DataFrame()
def _get_filter_to_core_f(table, fraction):
# determine the number of samples that must have a non-zero value for a
# feature to be considered part of the core
min_count = fraction * table.shape[1]
def f(values, *_):
return np.count_nonzero(values) >= min_count
return f
def _seven_number_summary(a):
# TODO this should probably be publicly accessible throughout QIIME 2. it's
# also currently implemented in q2-demux `summarize` and q2-diversity
# `alpha-rarefaction`
stats = pd.Series(a, dtype=float).describe(
percentiles=[0.02, 0.09, 0.25, 0.5, 0.75, 0.91, 0.98])
drop_cols = stats.index.isin(['std', 'mean', 'min', 'max', 'count'])
stats = stats[~drop_cols]
return stats
def _round_fractions(fractions):
"""Format float fractions as rounded fraction strings.
Fractions are rounded as short as possible without losing the precision
necessary to distinguish one fraction from another. The rounding starts at
3 decimal places and continues up to 10 decimal places, if a shorter
rounded representation isn't converged upon earlier. If rounding at 10
decimal places doesn't provide enough precision, each fraction's full
float precision will be used (i.e. as returned by `repr(float)`).
For example, [0.12345, 0.12351] would be rounded to four decimal places:
['0.1234', '0.1235']. 3 decimal places would be tried first but there isn't
enough precision to keep both fractions distinguishable.
"""
for decimals in range(3, 11):
rounded = ['{fraction:1.{decimals}f}'.format(fraction=fraction,
decimals=decimals)
for fraction in fractions]
if len(rounded) == len(set(rounded)):
return rounded
return [repr(fraction) for fraction in fractions]
|