File: extract_seqs_by_sample_id.py

package info (click to toggle)
qiime 1.4.0-2
links: PTS, VCS
area: main
in suites: wheezy
size: 29,704 kB
sloc: python: 77,837; haskell: 379; sh: 113; makefile: 103
file content (102 lines) | stat: -rwxr-xr-x 4,593 bytes
#!/usr/bin/env python
# File created on 08 Jan 2010.
from __future__ import division

__author__ = "Greg Caporaso"
__copyright__ = "Copyright 2011, The QIIME Project"
__credits__ = ["Greg Caporaso"]
__license__ = "GPL"
__version__ = "1.4.0"
__maintainer__ = "Greg Caporaso"
__email__ = "gregcaporaso@gmail.com"
__status__ = "Release"


from cogent.parse.fasta import MinimalFastaParser
from qiime.util import parse_command_line_parameters, get_options_lookup
from qiime.util import make_option
from qiime.util import extract_seqs_by_sample_id
from qiime.parse import parse_mapping_file
from qiime.filter_by_metadata import (parse_metadata_state_descriptions, 
                                      get_sample_ids)

options_lookup = get_options_lookup()

script_info={}
script_info['brief_description']="""Extract sequences based on the SampleID"""
script_info['script_description']="""This script creates a fasta file which will contain only sequences that ARE associated with a set of sample IDs, OR all sequences that are NOT associated with a set of sample IDs (-n)"""
script_info['script_usage']=[]
script_info['script_usage'].append(("""Examples:""","""Create the file outseqs.fasta (-o), which will be a subset of inseqs.fasta (-i) containing only the sequences THAT ARE associated with sample ids S2, S3, \
S4 (-s). As always, sample IDs are case-sensitive:""","""extract_seqs_by_sample_id.py -i inseqs.fasta -o outseqs.fasta -s S2,S3,S4"""))
script_info['script_usage'].append(("""""","""Create the file outseqs.fasta (-o), which will be a subset of inseqs.fasta (-i) containing only the sequences THAT ARE NOT (-n) associated with sample ids S2, S3, S4 (-s). As always, sample IDs are case-sensitive:""","""extract_seqs_by_sample_id.py -i inseqs.fasta -o outseqs.fasta -s S2,S3,S4 -n"""))

script_info['script_usage'].append(("""""","""Create the file outseqs.fasta (-o), which will be a subset of inseqs.fasta (-i) containing only the sequences THAT ARE associated with sample ids whose "Treatment" value is "Fast" in the mapping file:""","""extract_seqs_by_sample_id.py -i inseqs.fasta -o outseqs.fasta -m map.txt -s "Treatment:Fast" """))

script_info['output_description']="""The script produces a fasta file containing containing only the specified SampleIDs."""

script_info['required_options']=[
 options_lookup['fasta_as_primary_input'],
 make_option('-o','--output_fasta_fp',help='the output fasta file')
]

script_info['optional_options']=[
 make_option('-n','--negate',action='store_true',default=False,
  help='negate the sample ID list (i.e., output sample '+
  'ids not passed via -s) [default: %default]'),
 make_option('-s','--sample_ids',\
  help="comma-separated sample_ids to include in output fasta file"+\
  " (or exclude if --negate), or string describing mapping file states"+\
  " defining sample ids (mapping_fp must be provided for the latter)"),
 options_lookup['mapping_fp']]
script_info['version'] = __version__



def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    
    negate = opts.negate
    sample_ids = opts.sample_ids
    mapping_fp = opts.mapping_fp
    input_fasta_fp = opts.input_fasta_fp
    output_fasta_fp = opts.output_fasta_fp
    
    if not mapping_fp:
        sample_ids = sample_ids.split(',')
    else:
        map_data, map_header, map_comments = parse_mapping_file(mapping_fp)
        sample_ids = get_sample_ids(
                         map_data,
                         map_header,
                         parse_metadata_state_descriptions(sample_ids))
        if len(sample_ids) == 0:
            raise ValueError,\
             "No samples match the search criteria: %s" % valid_states
    
    
    if opts.verbose:
        # This is useful when using the --valid_states feature so you can 
        # find out if a search query didn't work as you expected before a 
        # lot of time is spent
        print "Extracting samples: %s" % ', '.join(sample_ids)
    
    try:
        seqs = MinimalFastaParser(open(input_fasta_fp))
    except IOError:
        option_parser.error('Cannot open %s. Does it exist? Do you have read access?'%\
         input_fasta_fp)
        exit(1)
        
    try:
        output_fasta_f = open(output_fasta_fp,'w')
    except IOError:
        option_parser.error("Cannot open %s. Does path exist? Do you have write access?" %\
         output_fasta_fp)
        exit(1)
    
    for r in extract_seqs_by_sample_id(seqs,sample_ids,negate):
        output_fasta_f.write('>%s\n%s\n' % r)
    output_fasta_f.close()

if __name__ == "__main__":
    main()