File: validate_mapping_file.py

package info (click to toggle)
qiime 1.8.0%2Bdfsg-4
  • links: PTS, VCS
  • area: main
  • in suites: jessie, jessie-kfreebsd
  • size: 130,508 kB
  • ctags: 10,145
  • sloc: python: 110,826; haskell: 379; sh: 169; makefile: 125
file content (161 lines) | stat: -rwxr-xr-x 7,772 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
#!/usr/bin/env python
from __future__ import division

# Module metadata: authorship, license and release information.
# __version__ is also published as script_info['version'] further down.
__author__ = "William Walters"
__copyright__ = "Copyright 2011, The QIIME Project"
__credits__ = ["William Walters"]
__license__ = "GPL"
__version__ = "1.8.0"
__maintainer__ = "William Walters"
__email__ = "William.A.Walters@colorado.edu"
 
from string import letters, digits

from qiime.util import parse_command_line_parameters, get_options_lookup,\
 make_option, create_dir
from qiime.check_id_map import check_mapping_file

# Presumably QIIME's table of shared, pre-defined option definitions.
# NOTE(review): nothing else in this script references options_lookup --
# confirm whether it is still needed before removing it.
options_lookup = get_options_lookup()
# Help and usage text for this script.  The dictionary is handed to
# parse_command_line_parameters() as keyword arguments in main().
script_info = {}

script_info['brief_description'] = (
    "Checks user's metadata mapping file for required data, valid format")

# Field and option names in this text must match the real mapping-file
# headers and the options declared for this script.
script_info['script_description'] = """Specifically, we check that:

    - The BarcodeSequence, LinkerPrimerSequence, and ReversePrimer fields
       have valid IUPAC DNA characters, and BarcodeSequence characters
       are non-degenerate (error)
    - The SampleID, BarcodeSequence, LinkerPrimerSequence, and Description
       headers are present. (error)
    - There are not duplicate header fields (error)
    - There are not duplicate barcodes (error)
    - Barcodes are of the same length.  Suppressed when the
       --variable_len_barcodes flag is passed (warning)
    - The headers do not contain invalid characters (alphanumeric and
       underscore only) (warning)
    - The data fields do not contain invalid characters (alphanumeric,
       underscore, space, and +-%./:,; characters) (warning)
    - SampleID fields are MIENS compliant (only alphanumeric
       and . characters). (warning)
    - There are no duplicates when the primer and variable length
       barcodes are appended (error)
    - There are no duplicates when barcodes and added demultiplex
       fields (-j option) are combined (error)
    - Data fields are not found beyond the Description column (warning)

    Details about the metadata mapping file format can be found here:
    http://www.qiime.org/documentation/file_formats.html#metadata-mapping-files

    Errors and warnings are saved to a log file.  Errors can be caused by
    problems with the headers, invalid characters in barcodes or primers, or
    by duplications in SampleIDs or barcodes.

    Warnings can arise from invalid characters and variable length barcodes that
    are not specified with the --variable_len_barcodes option.
    Warnings will contain a reference to the cell (row,column) that the
    warning arose from.

    In addition to the log file, a "corrected_mapping" file will be created.
    Any invalid characters will be replaced with '.' characters in
    the SampleID fields (to enforce MIENS compliance) and text in other data
    fields will be replaced with the character specified by the -c parameter,
    which is an underscore "_" by default.

    A html file will be created as well, which will show locations of
    warnings and errors, highlighted in yellow and red respectively.  If no
    errors or warnings were present the file will display a message saying
    such.  Header errors can mask other errors, so these should be corrected
    first.

    If pooled primers are used, separate with a comma.  For instance, a pooled
    set of three 27f primers (used to increase taxonomic coverage) could be
    specified in the LinkerPrimerSequence fields as such:
    AGGGTTCGATTCTGGCTCAG,AGAGTTTGATCCTGGCTTAG,AGAATTTGATCTTGGTTCAG
"""

# Each usage entry is a (title, explanation, example command) tuple.  The
# pieces are joined with implicit string concatenation so that source
# indentation does not end up inside the displayed text.
script_info['script_usage'] = []
script_info['script_usage'].append((
    "Example:",
    "Check the Fasting_Map.txt mapping file for problems, supplying the "
    "required mapping file, and output the results in the "
    "validate_mapping_file_output directory",
    "%prog -m Fasting_Map.txt -o validate_mapping_file_output"))

script_info['output_description'] = (
    "A log file, html file, and corrected_mapping.txt file will be written "
    "to the current output directory.")
# Command-line options.  -m is the only required option.
script_info['required_options'] = [
    make_option('-m', '--mapping_fp', type='existing_filepath',
                help='Metadata mapping filepath'),
]

script_info['optional_options'] = [
    make_option('-o', '--output_dir', type='new_dirpath',
                help='Required output directory for log file, corrected '
                'mapping file, and html file. [default: %default]',
                default="./"),
    # -v is a store_false flag whose default is True, so passing -v turns
    # printing OFF.  The help text states the effect of the flag itself.
    make_option('-v', '--verbose',
                help='Use -v to disable printing information to standard '
                'out. [default: printing enabled]',
                default=True, action='store_false'),
    make_option('-c', '--char_replace', type='string',
                help='Changes the default character used to replace invalid '
                'characters found in the mapping file.  Must be a valid '
                'character (alphanumeric, period, or underscore). '
                '[default: %default]',
                default="_"),
    make_option('-b', '--not_barcoded',
                action='store_true', default=False,
                help='Use -b if barcodes are not present.  BarcodeSequence '
                'header still required.  [default: %default]'),
    make_option('-B', '--variable_len_barcodes',
                action='store_true', default=False,
                help='Use -B if variable length barcodes are present to '
                'suppress warnings about barcodes of unequal length. '
                '[default: %default]'),
    make_option('-p', '--disable_primer_check',
                action='store_true', default=False,
                help='Use -p to disable checks for primers.  '
                'LinkerPrimerSequence header still required. '
                '[default: %default]'),
    make_option('-j', '--added_demultiplex_field', type='string',
                help='Use -j to add a field to use in the mapping file as '
                'additional demultiplexing (can be used with or without '
                'barcodes).  All combinations of barcodes/primers and '
                'these fields must be unique. The fields must contain '
                'values that can be parsed from the fasta labels such as '
                '"plate=R_2008_12_09".  In this case, "plate" would be the '
                'column header and "R_2008_12_09" would be the field data '
                '(minus quotes) in the mapping file.  To use the run prefix '
                'from the fasta label, such as ">FLP3FBN01ELBSX", where '
                '"FLP3FBN01" is generated from the run ID, use '
                '"-j run_prefix" and set the run prefix to be used as the '
                'data under the column header "run_prefix".  '
                '[default: %default]'),
    make_option('-s', '--suppress_html',
                action='store_true', default=False,
                help='Use -s to disable html file generation, can be useful '
                'for extremely large mapping files. [default: %default]'),
]

# Report the module's release number as the script version.
script_info['version'] = __version__

def main():
    """Parse the command line, validate the options and check the mapping file.

    Reads the options declared in ``script_info``, verifies that the
    replacement character supplied with -c is exactly one alphanumeric,
    period, or underscore character, calls ``create_dir`` on the output
    directory and then hands everything to ``check_mapping_file``.

    An invalid -c value is reported through ``option_parser.error`` before
    the output directory is created.
    """
    # suppress_verbose=True is passed because this script declares its own
    # -v/--verbose option in script_info['optional_options'].
    option_parser, opts, args = parse_command_line_parameters(
        suppress_verbose=True, **script_info)

    mapping_fp = opts.mapping_fp
    has_barcodes = not opts.not_barcoded
    variable_len_barcodes = opts.variable_len_barcodes
    char_replace = opts.char_replace
    verbose = opts.verbose
    disable_primer_check = opts.disable_primer_check
    added_demultiplex_field = opts.added_demultiplex_field
    suppress_html = opts.suppress_html

    # Validate -c before touching the filesystem.  The length test has to
    # come first: the membership test below is a substring test, so it is
    # only an exact "is this an allowed character" check once the value is
    # known to be a single character.
    # NOTE(review): relies on option_parser.error() terminating the script,
    # as optparse's OptionParser.error does -- confirm for QIIME's parser.
    if len(char_replace) != 1:
        option_parser.error('-c parameter must be a single character.')
    valid_replacement_chars = digits + letters + "_" + "."
    if char_replace not in valid_replacement_chars:
        option_parser.error('-c option requires alphanumeric, period, or '
                            'underscore character.')

    # Create the output directory; the trailing separator is appended so
    # the value passed on always ends with "/".
    output_dir = opts.output_dir + "/"
    create_dir(output_dir)

    check_mapping_file(mapping_fp, output_dir, has_barcodes, char_replace,
                       verbose, variable_len_barcodes,
                       disable_primer_check, added_demultiplex_field,
                       suppress_html)


# Run the validation only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()