1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259
|
#!/usr/bin/env python
# File created on 02 Nov 2011
from __future__ import division
__author__ = "Greg Caporaso"
__copyright__ = "Copyright 2011, The QIIME project"
__credits__ = ["Greg Caporaso"]
__license__ = "GPL"
__version__ = "1.8.0"
__maintainer__ = "Greg Caporaso"
__email__ = "gregcaporaso@gmail.com"
from os import makedirs
from qiime.util import (parse_command_line_parameters,
make_option, get_options_lookup, load_qiime_config)
from qiime.parse import parse_qiime_parameters
from qiime.workflow.util import (validate_and_set_jobs_to_start,
call_commands_serially, print_commands, no_status_updates, print_to_stdout)
from qiime.workflow.pick_open_reference_otus import (
pick_subsampled_open_reference_otus,
iterative_pick_subsampled_open_reference_otus)
qiime_config = load_qiime_config()
options_lookup = get_options_lookup()
script_info = {}
script_info['brief_description'] = ""
script_info['script_description'] = ""
script_info['script_usage'] = []
script_info['script_usage'].append(("", "Run the subsampled open-reference "
"OTU picking workflow on seqs1.fna using refseqs.fna as the reference "
"collection. ALWAYS SPECIFY ABSOLUTE FILE PATHS (absolute path "
"represented here as $PWD, but will generally look something like "
"/home/ubuntu/my_analysis/", "%prog -i $PWD/seqs1.fna -r $PWD/refseqs.fna "
"-o $PWD/ucrss/ -s 0.1 -p $PWD/ucrss_params.txt"))
script_info['script_usage'].append(("", "Run the subsampled open-reference "
"OTU picking workflow on seqs1.fna using refseqs.fna as the reference "
"collection and using usearch61 and usearch61_ref as the OTU picking "
"methods. ALWAYS SPECIFY ABSOLUTE FILE PATHS (absolute path represented "
"here as $PWD, but will generally look something like "
"/home/ubuntu/my_analysis/", "%prog -i $PWD/seqs1.fna -r $PWD/refseqs.fna "
"-o $PWD/ucrss_usearch/ -s 0.1 -p $PWD/ucrss_params.txt -m usearch61"))
script_info['script_usage'].append(("", "Run the subsampled open-reference "
"OTU picking workflow in iterative mode on seqs1.fna and seqs2.fna using "
"refseqs.fna as the initial reference collection. ALWAYS SPECIFY ABSOLUTE "
"FILE PATHS (absolute path represented here as $PWD, but will generally "
"look something like /home/ubuntu/my_analysis/", "%prog "
"-i $PWD/seqs1.fna,$PWD/seqs2.fna -r $PWD/refseqs.fna -o $PWD/ucrss_iter/ "
"-s 0.1 -p $PWD/ucrss_params.txt"))
script_info['script_usage'].append(("", "Run the subsampled open-reference "
"OTU picking workflow in iterative mode on seqs1.fna and seqs2.fna using "
"refseqs.fna as the initial reference collection. This is useful if "
"you're working with marker genes that do not result in useful alignment "
"(e.g., fungal ITS). ALWAYS SPECIFY ABSOLUTE FILE PATHS (absolute path "
"represented here as $PWD, but will generally look something like "
"/home/ubuntu/my_analysis/", "%prog -i $PWD/seqs1.fna,$PWD/seqs2.fna -r "
"$PWD/refseqs.fna -o $PWD/ucrss_iter_no_tree/ -s 0.1 "
"-p $PWD/ucrss_params.txt --suppress_align_and_tree"))
script_info['script_usage'].append(("", "Run the subsampled open-reference "
"OTU picking workflow in iterative mode on seqs1.fna and seqs2.fna using "
"refseqs.fna as the initial reference collection, suppressing assignment "
"of taxonomy. This is useful if you're working with a reference "
"collection without associated taxonomy. ALWAYS SPECIFY ABSOLUTE FILE "
"PATHS (absolute path represented here as $PWD, but will generally look "
"something like /home/ubuntu/my_analysis/", "%prog "
"-i $PWD/seqs1.fna,$PWD/seqs2.fna -r $PWD/refseqs.fna "
"-o $PWD/ucrss_iter_no_tax/ -s 0.1 -p $PWD/ucrss_params.txt "
"--suppress_taxonomy_assignment"))
script_info['script_usage_output_to_remove'] = [
'$PWD/ucrss/', '$PWD/ucrss_iter/', '$PWD/ucrss_usearch/',
'$PWD/ucrss_iter_no_tree/', '$PWD/ucrss_iter_no_tax/'
]
script_info['output_description']= ""
script_info['required_options'] = [
make_option('-i','--input_fps', help='the input sequences filepath or '
'comma-separated list of filepaths', type='existing_filepaths'),
make_option('-r','--reference_fp', type='existing_filepath', help='the '
'reference sequences'),
make_option('-o','--output_dir', type='new_dirpath', help='the output '
'directory'),
]
script_info['optional_options'] = [
make_option('-m','--otu_picking_method', type='choice',
choices=['uclust','usearch61'], help=('The OTU picking method to use '
'for reference and de novo steps. Passing usearch61, for example, '
'means that usearch61 will be used for the de novo steps and '
'usearch61_ref will be used for reference steps. [default: %default]'),
default='uclust'),
make_option('-p','--parameter_fp', type='existing_filepath', help='path '
'to the parameter file, which specifies changes to the default '
'behavior. See http://www.qiime.org/documentation/file_formats.html#'
'qiime-parameters . [if omitted, default values will be used]'),
make_option('--prefilter_refseqs_fp', type='existing_filepath',
help='the reference sequences to use for the prefilter, if different '
'from the reference sequecnces to use for the OTU picking [default: '
'same as passed for --reference_fp]'),
make_option('-n','--new_ref_set_id', default='New', type='string',
help='Unique identifier for OTUs that get created in this ref set '
'(this is useful to support combining of reference sets) '
'[default:%default]'),
make_option('-f','--force', action='store_true', dest='force',
help='Force overwrite of existing output directory (note: existing '
'files in output_dir will not be removed) [default: %default]'),
## print to shell script doesn't work for this workflow as there is a mix of
## command calls and api calls. i need to refactor the workflow scripts...
# make_option('-w','--print_only',action='store_true',\
# dest='print_only',help='Print the commands but don\'t call them -- '+\
# 'useful for debugging [default: %default]',default=False),\
make_option('-a','--parallel', action='store_true', dest='parallel',
default=False, help='Run in parallel where available '
'[default: %default]'),
options_lookup['jobs_to_start_workflow'],
make_option('-s','--percent_subsample', type='float', default='0.001',
help='Percent of failure sequences to include in the subsample to '
'cluster de novo (larger numbers should give more comprehensive '
'results but will be slower) [default:%default]'),
make_option('--prefilter_percent_id', type='float', default='0.60',
help='Sequences are pre-clustered at this percent id against the '
'reference and any reads which fail to hit are discarded (a quality '
'filter); pass 0.0 to disable [default:%default]'),
make_option('--step1_otu_map_fp',type='existing_filepath',
help='reference OTU picking OTU map (to avoid rebuilding if one has '
'already been built)'),
make_option('--step1_failures_fasta_fp', type='existing_filepath',
help='reference OTU picking failures fasta filepath (to avoid '
'rebuilding if one has already been built)'),
make_option('--suppress_step4', action='store_true', default=False,
help='suppress the final de novo OTU picking step (may be necessary '
'for extremely large data sets) [default: %default]'),
make_option('--min_otu_size', type='int', default=2,
help='the minimum otu size (in number of sequences) to retain the otu '
'[default: %default]'),
make_option('--suppress_taxonomy_assignment', action='store_true',
default=False, help='skip the taxonomy assignment step, resulting in '
'an OTU table without taxonomy [default: %default]'),
make_option('--suppress_align_and_tree', action='store_true',
default=False,
help='skip the sequence alignment and tree-building steps [default: '
'%default]')
]
script_info['version'] = __version__
def main():
option_parser, opts, args = parse_command_line_parameters(**script_info)
verbose = opts.verbose
input_fps = opts.input_fps
refseqs_fp = opts.reference_fp
output_dir = opts.output_dir
otu_picking_method = opts.otu_picking_method
verbose = opts.verbose
print_only = False
percent_subsample = opts.percent_subsample
new_ref_set_id = opts.new_ref_set_id
prefilter_refseqs_fp = opts.prefilter_refseqs_fp
prefilter_percent_id = opts.prefilter_percent_id
if prefilter_percent_id == 0.0:
prefilter_percent_id = None
if otu_picking_method == 'uclust':
denovo_otu_picking_method = 'uclust'
reference_otu_picking_method = 'uclust_ref'
elif otu_picking_method == 'usearch61':
denovo_otu_picking_method = 'usearch61'
reference_otu_picking_method = 'usearch61_ref'
else:
# it shouldn't be possible to get here
option_parser.error('Unkown OTU picking method: %s' %
otu_picking_method)
parallel = opts.parallel
# No longer checking that jobs_to_start > 2, but
# commenting as we may change our minds about this.
#if parallel: raise_error_on_parallel_unavailable()
if opts.parameter_fp:
try:
parameter_f = open(opts.parameter_fp, 'U')
except IOError:
raise IOError("Can't open parameters file (%s). Does it exist? "
"Do you have read access?" % opts.parameter_fp)
params = parse_qiime_parameters(parameter_f)
parameter_f.close()
else:
params = parse_qiime_parameters([])
# empty list returns empty defaultdict for now
jobs_to_start = opts.jobs_to_start
default_jobs_to_start = qiime_config['jobs_to_start']
validate_and_set_jobs_to_start(params, jobs_to_start,
default_jobs_to_start, parallel, option_parser)
try:
makedirs(output_dir)
except OSError:
if opts.force:
pass
else:
option_parser.error("Output directory already exists. Please "
"choose a different directory, or force overwrite with -f.")
if print_only:
command_handler = print_commands
else:
command_handler = call_commands_serially
if verbose:
status_update_callback = print_to_stdout
else:
status_update_callback = no_status_updates
if len(input_fps) == 1:
pick_subsampled_open_reference_otus(input_fp=input_fps[0],
refseqs_fp=refseqs_fp, output_dir=output_dir,
percent_subsample=percent_subsample, new_ref_set_id=new_ref_set_id,
command_handler=command_handler, params=params,
min_otu_size=opts.min_otu_size,
run_assign_tax=not opts.suppress_taxonomy_assignment,
run_align_and_tree=not opts.suppress_align_and_tree,
qiime_config=qiime_config,
prefilter_refseqs_fp=prefilter_refseqs_fp,
prefilter_percent_id=prefilter_percent_id,
step1_otu_map_fp=opts.step1_otu_map_fp,
step1_failures_fasta_fp=opts.step1_failures_fasta_fp,
parallel=parallel, suppress_step4=opts.suppress_step4, logger=None,
denovo_otu_picking_method=denovo_otu_picking_method,
reference_otu_picking_method=reference_otu_picking_method,
status_update_callback=status_update_callback)
else:
iterative_pick_subsampled_open_reference_otus(input_fps=input_fps,
refseqs_fp=refseqs_fp, output_dir=output_dir,
percent_subsample=percent_subsample, new_ref_set_id=new_ref_set_id,
command_handler=command_handler, params=params,
min_otu_size=opts.min_otu_size,
run_assign_tax=not opts.suppress_taxonomy_assignment,
run_align_and_tree=not opts.suppress_align_and_tree,
qiime_config=qiime_config,
prefilter_refseqs_fp=prefilter_refseqs_fp,
prefilter_percent_id=prefilter_percent_id,
step1_otu_map_fp=opts.step1_otu_map_fp,
step1_failures_fasta_fp=opts.step1_failures_fasta_fp,
parallel=parallel, suppress_step4=opts.suppress_step4, logger=None,
denovo_otu_picking_method=denovo_otu_picking_method,
reference_otu_picking_method=reference_otu_picking_method,
status_update_callback=status_update_callback)
if __name__ == "__main__":
main()
|