1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
|
#!/usr/bin/env python
# File created on 09 Feb 2010
from __future__ import division
import operator
import numpy
import os
import sys
from qiime.collate_alpha import write_output_file, make_output_row
from qiime.parse import (filter_otus_by_lineage, parse_matrix,
parse_rarefaction_fname)
from qiime.format import format_otu_table, format_matrix
from qiime.util import FunctionWithParams
from qiime.util import parse_command_line_parameters, make_option
__author__ = "Justin Kuczynski"
__copyright__ = "Copyright 2011, The QIIME Project"
__credits__ = ["Justin Kuczynski"]
__license__ = "GPL"
__version__ = "1.4.0"
__maintainer__ = "Justin Kuczynski"
__email__ = "justinak@gmail.com"
__status__ = "Release"
#collate_alpha.py
script_info={}
script_info['brief_description']="""Collate alpha diversity results"""
script_info['script_description']="""When performing batch analyses on the OTU table (e.g. rarefaction followed by alpha diversity), the result of alpha_diversity.py comprises many files, which need to be concatenated into a single file for generating rarefaction curves. This script joins those files.
Input files are:
each file represents one (rarefied) otu table
each row in a file represents one sample
each column in a file represents one diversity metric
Output files are:
each file represents one diversity metric
each row in a file represents one (rarefied) otu table
each column in a file represents one sample
The input directory should contain only otu tables. The output directory should be empty or nonexistant and the example file is optional.
If you have a set of rarefied OTU tables, make sure the example file contains every sample present in the otu talbes. You should typically choose the file with the fewest sequences per sample, to avoid files with sparse samples omitted.
"""
script_info['script_usage']=[]
script_info['script_usage'].append(("""Example:""","""The user inputs the results from batch alpha diversity (e.g. alpha_div_chao1_PD/) and the location where the results should be written (e.g. collated_alpha/), as shown by the following command:""","""collate_alpha.py -i alpha_div_chao1_PD/ -o collated_alpha/"""))
script_info['output_description']="""This script takes the resulting files from batch alpha diversity and collates them into (one file for each metric used).
This script transforms a series of files, named (e.g. alpha_rarefaction_20_0.txt, alpha_rarefaction_20_1.txt, etc.) into a (usually much smaller) set of files named (e.g. chao1.txt, PD_whole_tree.txt, etc.), where the columns correspond to samples and rows to the rarefaction files inputted, as shown by the following:
========================== ==================== ========= ====== ======
\ sequences per sample iteration PC.354 PC.355
========================== ==================== ========= ====== ======
alpha_rarefaction_20_0.txt 20 0 0.925 0.915
alpha_rarefaction_20_1.txt 20 1 0.9 0.89
alpha_rarefaction_20_2.txt 20 2 0.88 0.915
alpha_rarefaction_20_3.txt 20 3 0.91 0.93
... ... ... ... ...
========================== ==================== ========= ====== ======
"""
script_info['required_options']=[\
make_option('-i', '--input_path',
help='input path (a directory)'),
make_option('-o', '--output_path',
help='output path (a directory). will be created if needed')
]
script_info['optional_options']=[\
make_option('-e', '--example_path',
help='example alpha_diversity analysis file, containing all samples'+\
' and all metrics to be included in the collated result'+\
'[Default: chosen automatically (see usage string)]')
]
script_info['version'] = __version__
def main():
option_parser, opts, args = parse_command_line_parameters(**script_info)
if len(args) != 0:
parser.error("Positional argument detected. make sure all"+\
' parameters are identified.' +\
'\ne.g.: include the \"-m\" in \"-m MINIMUM_LENGTH\"')
input_dir = opts.input_path
output_dir = opts.output_path
example_filepath = opts.example_path
if not os.path.exists(output_dir):
os.makedirs(output_dir)
file_names = os.listdir(input_dir)
file_names = [fname for fname in file_names if not fname.startswith('.')]
if example_filepath is None:
# table row is base_name, seqs_per_sam, iters, ext
file_name_table = map(parse_rarefaction_fname, file_names)
# sort on seqs/sam
sorted_fname_table = sorted(file_name_table, key=operator.itemgetter(1))
# now map back to file name
example_fname = file_names[file_name_table.index(sorted_fname_table[0])]
example_filepath = os.path.join(input_dir,example_fname)
f = open(example_filepath, 'U')
all_metrics, all_samples, example_data = parse_matrix(f)
num_cols = len(all_samples)
f.close()
# make the table 1 row at a time
# we're building a rarefaction by sample mtx from
# a sample by metric matrix
# each metric is one output file
for metric in all_metrics:
metric_file_data = []
for fname in file_names:
# f_ here refers to the input file currently being processed
# to distinguish from the output file we're building
f = open(os.path.join(input_dir,fname), 'U')
f_metrics, f_samples, f_data = parse_matrix(f)
f.close()
metric_file_data.append(\
make_output_row(f_metrics, metric, f_samples,
f_data, fname,num_cols,all_samples))
write_output_file(metric_file_data, output_dir, metric, all_samples)
if __name__ == "__main__":
main()
|