#!/usr/bin/env python
"""A routine to clean up 454 sequencing data."""
from __future__ import division

__author__ = "Jens Reeder"
__copyright__ = "Copyright 2011, The QIIME Project"
__credits__ = ["Jens Reeder", "Rob Knight"]  # remember to add yourself if you make changes
__license__ = "GPL"
__version__ = "1.4.0"
__maintainer__ = "Jens Reeder"
__email__ = "jens.reeder@gmail.com"
__status__ = "Release"

from os.path import exists

from cogent.util.misc import create_dir

from qiime.util import (parse_command_line_parameters, get_options_lookup,
                        get_tmp_filename, make_option)
from qiime.denoiser.preprocess import STANDARD_BACTERIAL_PRIMER
from qiime.denoiser.flowgram_clustering import denoise_seqs, denoise_per_sample
from qiime.denoiser.utils import (files_exist, get_denoiser_data_dir,
                                  cat_sff_files)

options_lookup = get_options_lookup()
DENOISER_DATA_DIR = get_denoiser_data_dir()

script_info = {}
script_info['brief_description'] = """Remove noise from 454 sequencing data"""
script_info['script_description'] = """The denoiser removes the sequencing noise characteristic of pyrosequencing by flowgram clustering. For a detailed explanation of the underlying algorithm see Reeder and Knight, Nature Methods 7(9), 2010."""
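
# Rough outline of the pipeline, as reflected in the options below:
#   Phase I  (preprocessing): filter reads against the optional fasta file,
#            handle the primer, and prefix-dereplicate the flowgrams
#            (optionally with run-length "squeeze" encoding).
#   Phase II (clustering): greedy flowgram clustering using the low cut-off,
#            bailing out of clusters at or below the -b/--bail_out size.
#   Phase III: final clustering pass using the high cut-off.
# See Reeder and Knight (2010) for the authoritative description; this
# summary is only a reading aid inferred from the option help texts.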
script_info['script_usage'] = [
    ("",
     """Run denoiser on flowgrams in 454Reads.sff.txt with read-to-barcode mapping in seqs.fna,
put results into Outdir, log progress in Outdir/denoiser.log""",
     """%prog -i 454Reads.sff.txt -f seqs.fna -v -o Outdir"""),
    ("Multiple sff.txt files",
     """Run denoiser on two flowgram files, 454Reads_1.sff.txt and 454Reads_2.sff.txt,
with read-to-barcode mapping in seqs.fna, put results into Outdir,
log progress in Outdir/denoiser.log""",
     """%prog -i 454Reads_1.sff.txt,454Reads_2.sff.txt -f seqs.fna -v -o Outdir"""),
    ("Denoise multiple libraries separately",
     """Run denoiser on flowgrams in 454Reads.sff.txt with read-to-barcode mapping in seqs.fna,
split the input into per-library sets and process each library separately,
put results into Outdir, log progress in Outdir/denoiser.log""",
     "%prog -S -i 454Reads.sff.txt -f seqs.fna -v -o Outdir"),
    ("Resuming a failed run",
     """Resume a previous denoiser run from the breakpoint stored in Outdir_from_failed_run/checkpoints/checkpoint100.pickle.
The checkpoint option requires the -p/--preprocess_fp option, which can usually be set to the output dir of the failed run.
All other arguments must be identical to those of the failed run.""",
     "%prog -i 454Reads.sff.txt -f seqs.fna -v -o Outdir_resumed -p Outdir_from_failed_run --checkpoint_fp Outdir_from_failed_run/checkpoints/checkpoint100.pickle"),
]
script_info['output_description'] = """
centroids.fasta: The cluster representatives of each cluster.
singletons.fasta: Contains all unclustered reads.
denoiser_mapping.txt: Contains the actual clusters. The cluster centroid is given first;
 the cluster members follow after the ':'.
checkpoints/: Directory with checkpoints.
Note that the centroid and singleton files are disjoint. For most downstream analyses you will want to concatenate the two files.
"""

script_info['required_options'] = [
    make_option('-i', '--input_file', action='store',
                type='string', dest='sff_fp',
                help='path to flowgram file. '
                'Separate multiple files with commas '
                '[REQUIRED]', default=None)
]

script_info['optional_options'] = [
    make_option('-f', '--fasta_fp', action='store',
                type='string', dest='fasta_fp',
                help='path to fasta input file. '
                'Reads not in the fasta file are filtered out '
                'before denoising. File format is as produced by '
                'split_libraries.py '
                '[default: %default]',
                default=None),
    make_option('-o', '--output_dir', action='store',
                type='string', dest='output_dir',
                help='path to output directory '
                '[default: random dir in ./]',
                default=None),
    make_option('-c', '--cluster', action='store_true',
                dest='cluster',
                help='use cluster/multiple CPUs for '
                'flowgram alignments [default: %default]',
                default=False),
    make_option('-p', '--preprocess_fp', action='store',
                type='string', dest='preprocess_fp',
                help='skip preprocessing (phase I) and instead use '
                'the already preprocessed data in PREPROCESS_FP',
                default=None),
    make_option('--checkpoint_fp', action='store',
                type='string', dest='checkpoint_fp',
                help='resume denoising from this checkpoint. '
                'Be careful when changing parameters for '
                'a resumed run. Requires the -p option. '
                '[default: %default]',
                default=None),
    make_option('-s', '--squeeze', action='store_true',
                dest='squeeze',
                help='use run-length encoding for prefix '
                'filtering in phase I [default: %default]',
                default=False),
    make_option('-S', '--split', action='store_true',
                dest='split',
                help='split input into per-library sets '
                'and denoise each separately [default: %default]',
                default=False),
    make_option('--force', action='store_true',
                dest='force',
                help='force overwrite of existing '
                'directory [default: %default]',
                default=False),
    make_option('--primer', action='store',
                type='string', dest='primer',
                help='primer sequence '
                '[default: %default]',
                default=STANDARD_BACTERIAL_PRIMER),
    make_option('-n', '--num_cpus', action='store',
                type='int', dest='num_cpus',
                help='number of CPUs, requires -c '
                '[default: %default]', default=1),
    make_option('-m', '--max_num_iterations', action='store',
                type='int', dest='max_num_iter',
                help='maximum number of iterations in phase II. '
                'None means unlimited iterations '
                '[default: %default]', default=None),
    make_option('-b', '--bail_out', action='store',
                type='int', dest='bail',
                help='stop clustering in phase II when clusters are '
                'smaller than or equal to BAIL '
                '[default: %default]', default=1),
    make_option('--percent_id', action='store',
                type='float', dest='percent_id',
                help='sequence similarity clustering '
                'threshold [default: %default]', default=0.97),
    make_option('--low_cut-off', action='store',
                type='float', dest='low_cutoff',
                help='low clustering threshold for phase II '
                '[default: %default]', default=3.75),
    make_option('--high_cut-off', action='store',
                type='float', dest='high_cutoff',
                help='high clustering threshold for phase III '
                '[default: %default]', default=4.5),
    make_option('--low_memory', action='store_true',
                dest='low_memory',
                help='use the slower, low-memory '
                'method [default: %default]',
                default=False),
    make_option('-e', '--error_profile', action='store',
                type='string', dest='error_profile',
                help='path to error profile '
                '[default: %default]',
                default=DENOISER_DATA_DIR + 'FLX_error_profile.dat'),
    # might be needed once we switch to Titanium as default
    # make_option('--flx', action='store_true',
    #             dest='flx', help='shortcut for '
    #             '-e %sFLX_error_profile.dat --low_cut-off=3.75 '
    #             '--high_cut-off=4.5' % DENOISER_DATA_DIR),
    make_option('--titanium', action='store_true',
                dest='titanium',
                help='shortcut for -e ' + DENOISER_DATA_DIR
                + 'Titanium_error_profile.dat '
                '--low_cut-off=4 --high_cut-off=5. '
                'Warning: overrides all previously set cut-off values '
                '[default: %default]', default=False)
]
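
# Note: the default error profile and cut-offs above correspond to FLX
# chemistry; the --titanium shortcut switches to the Titanium error profile
# and the 4/5 cut-offs in one go (applied in main() below).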
script_info['version'] = __version__
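
# main() validates the command line (flowgram files, optional checkpoint and
# preprocess directories, error profile), applies the --titanium shortcut,
# creates the output directory, and then dispatches to denoise_per_sample
# (-S/--split) or denoise_seqs (default).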
def main(commandline_args=None):
    parser, opts, args = parse_command_line_parameters(**script_info)

    if not opts.sff_fp:
        parser.error('Required option flowgram file path (-i) not specified')
    elif not files_exist(opts.sff_fp):
        parser.error('Flowgram file path does not exist:\n %s \n Pass a valid one via -i.'
                     % opts.sff_fp)

    if opts.checkpoint_fp:
        bp_fp = opts.checkpoint_fp
        if not exists(bp_fp):
            parser.error('Specified checkpoint file does not exist: %s' % bp_fp)

    # peek into the sff.txt files to make sure they are parseable;
    # cat_sff_files is lazy and reads only the header
    flowgrams, header = cat_sff_files(map(open, opts.sff_fp.split(',')))

    if opts.split and opts.preprocess_fp:
        parser.error('Options --split and --preprocess_fp are mutually exclusive')

    if opts.preprocess_fp:
        pp_fp = opts.preprocess_fp
        if not exists(pp_fp):
            parser.error('Specified preprocess directory does not exist: %s' % pp_fp)
        if not files_exist('%s/prefix_mapping.txt,%s/prefix_dereplicated.fasta' % (pp_fp, pp_fp)):
            parser.error('Specified preprocess directory does not contain expected files: '
                         'prefix_mapping.txt and prefix_dereplicated.fasta')
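
    # --titanium is a convenience shortcut: as warned in its help text, it
    # overrides any error profile and cut-off values supplied via
    # -e, --low_cut-off and --high_cut-off.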
    if opts.titanium:
        opts.error_profile = DENOISER_DATA_DIR + 'Titanium_error_profile.dat'
        opts.low_cutoff = 4
        opts.high_cutoff = 5

    if not exists(opts.error_profile):
        parser.error('Specified error profile %s does not exist' % opts.error_profile)

    if opts.output_dir:
        # make sure the path always ends in a '/'
        tmpoutdir = opts.output_dir + "/"
    else:
        # make a random dir in the current dir
        tmpoutdir = get_tmp_filename(tmp_dir="", prefix="denoiser_", suffix="/")

    create_dir(tmpoutdir, not opts.force)

    log_fp = 'denoiser.log'
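
    # Dispatch: with -S/--split each library is denoised separately;
    # otherwise all reads are denoised together, optionally resuming
    # from a checkpoint (--checkpoint_fp).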
    if opts.split:
        denoise_per_sample(opts.sff_fp, opts.fasta_fp, tmpoutdir, opts.cluster,
                           opts.num_cpus, opts.squeeze, opts.percent_id, opts.bail,
                           opts.primer, opts.low_cutoff, opts.high_cutoff, log_fp,
                           opts.low_memory, opts.verbose, opts.error_profile,
                           opts.max_num_iter, opts.titanium)
    else:
        denoise_seqs(opts.sff_fp, opts.fasta_fp, tmpoutdir, opts.preprocess_fp,
                     opts.cluster, opts.num_cpus, opts.squeeze, opts.percent_id,
                     opts.bail, opts.primer, opts.low_cutoff, opts.high_cutoff,
                     log_fp, opts.low_memory, opts.verbose, opts.error_profile,
                     opts.max_num_iter, opts.titanium, opts.checkpoint_fp)


if __name__ == "__main__":
    main()