File: make_3d_plots.py

package info (click to toggle)
qiime 1.4.0-2
  • links: PTS, VCS
  • area: main
  • in suites: wheezy
  • size: 29,704 kB
  • sloc: python: 77,837; haskell: 379; sh: 113; makefile: 103
file content (413 lines) | stat: -rwxr-xr-x 23,038 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
#!/usr/bin/env python
# File created on 09 Feb 2010
#file make_3d_plots.py

from __future__ import division

__author__ = "Jesse Stombaugh"
__copyright__ = "Copyright 2011, The QIIME Project"
__credits__ = ["Jesse Stombaugh", "Rob Knight", "Micah Hamady", "Dan Knights",
    "Justin Kuczynski", "Antonio Gonzalez Pena"]
__license__ = "GPL"
__version__ = "1.4.0"
__maintainer__ = "Jesse Stombaugh"
__email__ = "jesse.stombaugh@colorado.edu"
__status__ = "Release"
 

from qiime.util import parse_command_line_parameters, get_options_lookup, create_dir
from qiime.util import make_option
from qiime.make_3d_plots import generate_3d_plots, generate_3d_plots_invue
from qiime.parse import parse_coords,group_by_field,group_by_fields
import shutil
import os
from qiime.colors import sample_color_prefs_and_map_data_from_options
from random import choice
from time import strftime
from qiime.util import get_qiime_project_dir
from qiime.make_3d_plots import get_coord,get_map,remove_unmapped_samples, \
                                get_custom_coords, \
                                process_custom_axes, process_coord_filenames, \
                                remove_nans, scale_custom_coords,\
                                validate_coord_files
from qiime.biplots import get_taxa,get_taxa_coords,get_taxa_prevalence,\
    remove_rare_taxa, make_mage_taxa, make_biplot_scores_output
from cogent.util.misc import get_random_directory_name
import numpy as np

options_lookup = get_options_lookup()

#make_3d_plots.py
script_info={}
script_info['brief_description']="""Make 3D PCoA plots"""
script_info['script_description']="""This script automates the construction of 3D plots (kinemage format) from the PCoA output file generated by principal_coordinates.py (e.g. P1 vs. P2 vs. P3, P2 vs. P3 vs. P4, etc., where P1 is the first component)."""
script_info['script_usage']=[]
script_info['script_usage'].append(("""Default Usage:""","""If you just want to use the default output, you can supply the principal coordinates file (i.e., resulting file from principal_coordinates.py) and a user-generated mapping file, where the default coloring will be based on the SampleID as follows:""","""%prog -i beta_div_coords.txt -m Mapping_file.txt"""))
script_info['script_usage'].append(("","""Additionally, the user can supply their mapping file ("-m") and a specific category to color by ("-b") or any combination of categories. When using the -b option, the user can specify the coloring for multiple mapping labels, where each mapping label is separated by a comma, for example: -b 'mapping_column1,mapping_column2'. The user can also combine mapping labels and color by the combined label that is created by inserting an '&&' between the input columns, for example: -b 'mapping_column1&&mapping_column2'.""",""))
script_info['script_usage'].append(("","""If the user would like to color all categories in their metadata mapping file, they can pass 'ALL' to the '-b' option, as follows:""","""%prog -i beta_div_coords.txt -m Mapping_file.txt -b ALL"""))
script_info['script_usage'].append(("","""As an alternative, the user can supply a preferences (prefs) file, using the -p option. The prefs file allows the user to give specific samples their own columns within a given mapping column. This file also allows the user to perform a color gradient, given a specific mapping column.

If the user wants to color by using the prefs file (e.g. prefs.txt), they can use the following code:""","""%prog -i beta_div_coords.txt -m Mapping_file.txt -p prefs.txt
"""))
script_info['script_usage'].append(("""Output Directory:""","""If you want to give an specific output directory (e.g. "3d_plots"), use the following code:""","""%prog -i principal_coordinates-output_file --o 3d_plots/"""))
script_info['script_usage'].append(("""Background Color Example:""","""If the user would like to color the background white they can use the '-k' option as follows:""","""%prog -i beta_div_coords.txt -m Mapping_file.txt -b ALL -k white"""))
script_info['script_usage'].append(("""Jackknifed Principal Coordinates (w/ confidence intervals):""","""If you have created jackknifed PCoA files, you can pass the folder containing those files, instead of a single file.  The user can also specify the opacity of the ellipses around each point "--ellipsoid_opacity", which is a value from 0-1. Currently there are two metrics "--ellipsoid_method" that can be used for generating the ellipsoids, which are 'IQR' and 'sdev'. The user can specify all of these options as follows:""", """%prog -i jackknifed_pcoas/ -m Mapping_file.txt -b \'mapping_column1,mapping_column1&&mapping_column2\' --ellipsoid_opacity=0.5 --ellipsoid_method=IQR"""))
script_info['script_usage'].append(("""Bi-Plots:""","""If the user would like to see which taxa are more prevalent in different areas of the PCoA plot, they can generate Bi-Plots, by passing a principal coordinates file or folder "-i", a mapping file "-m", and a summarized taxa file "-t" from summarize_taxa.py. Can be combined with jacknifed principal coordinates.""", """%prog -i pcoa.txt -m Mapping_file.txt -t otu_table_level3.txt"""))
script_info['output_description']="""By default, the script will plot the first three dimensions in your file. Other combinations can be viewed using the "Views:Choose viewing axes" option in the KiNG viewer (Chen, Davis, & Richardson, 2009), which may require the installation of kinemage software. The first 10 components can be viewed using "Views:Paralled coordinates" option or typing "/". The mouse can be used to modify display parameters, to click and rotate the viewing axes, to select specific points (clicking on a point shows the sample identity in the low left corner), or to select different analyses (upper right window). Although samples are most easily viewed in 2D, the third dimension is indicated by coloring each sample (dot/label) along a gradient corresponding to the depth along the third component (bright colors indicate points close to the viewer)."""
script_info['required_options']=[\
    make_option('-i', '--coord_fname',
        help='Input principal coordinates filepath (i.e.,' +\
        ' resulting file from principal_coordinates.py).  Alternatively,' +\
        ' a directory containing multiple principal coordinates files for' +\
        ' jackknifed PCoA results.',
        type='existing_path'),
    make_option('-m', '--map_fname', dest='map_fname',
        help='Input metadata mapping filepath',
        type='existing_filepath')
    ]
script_info['optional_options']=[\
    make_option('-b', '--colorby', dest='colorby',\
        help='Comma-separated list categories metadata categories' +\
        ' (column headers) ' +\
        'to color by in the plots. The categories must match the name of a ' +\
        'column header in the mapping file exactly. Multiple categories ' +\
        'can be list by comma separating them without spaces. The user can ' +\
        'also combine columns in the mapping file by separating the ' +\
        'categories by "&&" without spaces. [default=color by all]'),
    make_option('-a', '--custom_axes',
        help='This is the category from the metadata mapping file to use as' +\
        ' a custom axis in the plot.  For instance, if there is a pH' +\
        ' category and you would like to see the samples plotted on that' +\
        ' axis instead of PC1, PC2, etc., one can use this option.  It is' +\
        ' also useful for plotting time-series data. Note: if there is any' +\
        ' non-numeric data in the column, it will not be plotted' +\
        ' [default: %default]'),
    make_option('-p', '--prefs_path',
        help='Input user-generated preferences filepath. NOTE: This is a' +\
        ' file with a dictionary containing preferences for the analysis.' +\
        ' [default: %default]',
        type='existing_filepath'),
    make_option('-k', '--background_color',
        help='Background color to use in the plots. [default: %default]',
        default='black',type='choice',choices=['black','white'],),
    make_option('--ellipsoid_smoothness',
        help='Used only when plotting ellipsoids for jackknifed' +\
        ' beta diversity (i.e. using a directory of coord files' +\
        ' instead of a single coord file). Valid choices are 0-3.' +\
        ' A value of 0 produces very coarse "ellipsoids" but is' +\
        ' fast to render. If you encounter a memory' +\
        ' error when generating or displaying the plots, try including' +\
        ' just one metadata column in your plot. If you still have trouble,' +\
        ' reduce the smoothness level to 0. [default: %default]',
        default="1",type="choice",choices=["0","1","2","3"]),
    make_option('--ellipsoid_opacity',
        help='Used only when plotting ellipsoids for jackknifed' +\
        ' beta diversity (i.e. using a directory of coord files' +\
        ' instead of a single coord file). The valid range is between 0-1.' +\
        ' 0 produces completely transparent (invisible) ellipsoids' +\
        ' and 1 produces completely opaque ellipsoids.' +\
        ' [default=%default]', \
        default=0.33,type=float),
    make_option('--ellipsoid_method',
        help='Used only when plotting ellipsoids for jackknifed' +\
        ' beta diversity (i.e. using a directory of coord files' +\
        ' instead of a single coord file). Valid values are "IQR" and' +\
        ' "sdev". [default=%default]',default="IQR",
        type="choice",choices=["IQR","sdev"]),
    make_option('--master_pcoa',
        help='Used only when plotting ellipsoids for jackknifed beta' +\
        ' diversity (i.e. using a directory of coord files' +\
        ' instead of a single coord file). These coordinates will be the' +\
        ' center of each ellipisoid. [default: %default; arbitrarily' +\
        ' chosen PC matrix will define the center point]',default=None,
        type='existing_filepath'),
    #bipot options
    make_option('-t', '--taxa_fname',
        help='Used only when generating BiPlots. Input summarized taxa '+\
        'filepath (i.e., from summarize_taxa.py). '+\
        'Taxa will be plotted with the samples. [default=%default]', 
        default=None,
        type='existing_filepath'),
    make_option('--n_taxa_keep',
        help='Used only when generating BiPlots. This is the number of taxa '+\
        ' to display. Use -1 to display all. [default: %default]',default=10,
        type=int),
    make_option('--biplot_output_file', 
        help='Used only when generating BiPlots. Output coordinates filepath '+\
        ' when generating a biplot. [default: %default]',default=None,
        type='new_filepath'),
    make_option('--output_format',
        help='Output format. If this option is set to invue you will' +\
        ' need to also use the option -b to define which column(s) from the' +\
        ' metadata file the script should use when writing an output file.' +\
        ' [default: %default]', default='king',type='choice',
        choices=['king','invue']),
    # inVUE options
    make_option('-n', '--interpolation_points', type="int", 
        help='Used only when generating inVUE plots. Number of points' +\
        ' between samples for interpolatation. [default: %default]', default=0),
    make_option('--polyhedron_points', type="int", 
        help='Used only when generating inVUE plots. The number of points' +\
        ' to be generated when creating a frame' +\
        ' around the PCoA plots. [default: %default]', default=4),
    make_option('--polyhedron_offset', type="float", 
        help='Used only when generating inVUE plots. The offset to be added' +\
        ' to each point created when using the' +\
        ' --polyhedron_points option. This is only used when' +\
        ' using the invue output_format. [default: %default]', default=1.5),
    # vector analysis options
    make_option('--add_vectors', dest='add_vectors', default=None,
        help='Create vectors based on a column of the mapping file. This.parameter' +\
        ' accepts up to 2 columns: (1) create the vectors, (2) sort them.' +\
        ' If you wanted to group by Species and' +\
        ' order by SampleID you will pass --add_vectors=Species but if you' +\
        ' wanted to group by Species but order by DOB you will pass' +\
        ' --add_vectors=Species,DOB;' +\
        ' this is useful when you use --custom_axes param [default: %default]'),
    make_option('--rms_algorithm', dest='rms_algorithm', default=None,
        help='The algorithm to calculate the RMS, either avg or trajectory;' +\
        ' both algorithms use all the dimensions and weights them using their' +\
        ' percentange explained; return the norm of the created vectors; and their ' +\
        ' confidence using ANOVA. The vectors are created as follows: for' +\
        ' avg it calculates the average at each timepoint (averaging within' +\
        ' a group), then calculates the norm of each point; for trajectory ' +\
        ' calculates the norm from the 1st-2nd, 2nd-3rd, etc. [default: %default]'),
    make_option('--rms_path', dest='rms_path', default='RMS_output.txt',
        help='Name of the file to save the root mean square (RMS) of the vectors' +\
        ' grouped by the column used with the --add_vectors function. Note that' +\
        ' this option only works with --add_vectors. The file is going to be' +\
        ' created inside the output_dir and its name will start with "RMS".' +\
        ' [default: %default]'),
    options_lookup['output_dir'],
]

script_info['option_label']={'coord_fname':'Principal coordinates filepath',
                             'map_fname':'QIIME-formatted mapping filepath',
                             'colorby': 'Colorby category',
                             'prefs_path': 'Preferences filepath',
                             'background_color': 'Background color',
                             'ellipsoid_opacity':'Ellipsoid opacity',
                             'ellipsoid_method':'Ellipsoid method',
                             'ellipsoid_smoothness':'Ellipsoid smoothness',
                             'taxa_fname': 'Summarized Taxa filepath',
                             'n_taxa_keep': '# of taxa to keep',
                             'biplot_output_file':'Output biplot coordinate filepath',
                             'master_pcoa':'Master principal coordinates filepath',
                             'output_dir': 'Output directory',
                             'output_format': 'Output format',
                             'interpolation_points': '# of interpolation points',
                             'polyhedron_points':'# of polyhedron points',
                             'polyhedron_offset':'Polyhedron offset',
                             'custom_axes':'Custom Axis',
                             'add_vectors':'Create vectors based on metadata',
                             'rms_path':'RMS output path calculations',
                             'rms_algorithm':'RMS algorithm'}
script_info['version'] = __version__

def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    prefs, data, background_color, label_color, ball_scale, arrow_colors= \
                            sample_color_prefs_and_map_data_from_options(opts)
                
    if opts.output_format == 'invue':
        # validating the number of points for interpolation
        if (opts.interpolation_points<0):
            option_parser.error('The --interpolation_points should be ' +\
                            'greater or equal to 0.')
                            
        # make sure that coord file has internally consistent # of columns
        coord_files_valid = validate_coord_files(opts.coord_fname)
        if not coord_files_valid:
            option_parser.error('Every line of every coord file must ' +\
                            'have the same number of columns.')
                            
        #Open and get coord data
        data['coord'] = get_coord(opts.coord_fname, opts.ellipsoid_method)
    
        # remove any samples not present in mapping file
        remove_unmapped_samples(data['map'],data['coord'])

        # if no samples overlapped between mapping file and otu table, exit
        if len(data['coord'][0]) == 0:
            print "\nError: OTU table and mapping file had no samples in common\n"
            exit(1)

        if opts.output_dir:
            create_dir(opts.output_dir,False)
            dir_path=opts.output_dir
        else:
            dir_path='./'
        
        filepath=opts.coord_fname
        if os.path.isdir(filepath):
            coord_files = [fname for fname in os.listdir(filepath) if not \
                           fname.startswith('.')]
            filename = os.path.split(coord_files[0])[-1]
        else:
            filename = os.path.split(filepath)[-1]
	
        generate_3d_plots_invue(prefs, data, dir_path, filename, \
            opts.interpolation_points, opts.polyhedron_points, \
            opts.polyhedron_offset)
        
        #finish script
        return

    # Potential conflicts
    if not opts.custom_axes is None and os.path.isdir(opts.coord_fname):
        # can't do averaged pcoa plots _and_ custom axes in the same plot
        option_parser.error("Please supply either custom axes or multiple coordinate \
files, but not both.")
    # check that smoothness is an integer between 0 and 3
    try:
        ellipsoid_smoothness = int(opts.ellipsoid_smoothness)
    except:
        option_parser.error("Please supply an integer ellipsoid smoothness \
value.")
    if ellipsoid_smoothness < 0 or ellipsoid_smoothness > 3:
        option_parser.error("Please supply an ellipsoid smoothness value \
between 0 and 3.")
    # check that opacity is a float between 0 and 1
    try:
        ellipsoid_alpha = float(opts.ellipsoid_opacity)
    except:
        option_parser.error("Please supply a number for ellipsoid opacity.")
    if ellipsoid_alpha < 0 or ellipsoid_alpha > 1:
        option_parser.error("Please supply an ellipsoid opacity value \
between 0 and 1.")
    # check that ellipsoid method is valid
    ellipsoid_methods = ['IQR','sdev']
    if not opts.ellipsoid_method in ellipsoid_methods:
        option_parser.error("Please supply a valid ellipsoid method. \
Valid methods are: " + ', '.join(ellipsoid_methods) + ".")
  
    # gather ellipsoid drawing preferences
    ellipsoid_prefs = {}
    ellipsoid_prefs["smoothness"] = ellipsoid_smoothness
    ellipsoid_prefs["alpha"] = ellipsoid_alpha

    # make sure that coord file has internally consistent # of columns
    coord_files_valid = validate_coord_files(opts.coord_fname)
    if not coord_files_valid:
        option_parser.error('Every line of every coord file must ' +\
                            'have the same number of columns.')

    #Open and get coord data
    data['coord'] = get_coord(opts.coord_fname, opts.ellipsoid_method)
    
    # remove any samples not present in mapping file
    remove_unmapped_samples(data['map'],data['coord'])
    
    # if no samples overlapped between mapping file and otu table, exit
    if len(data['coord'][0]) == 0:
        print "\nError: OTU table and mapping file had no samples in common\n"
        exit(1)

    # process custom axes, if present.
    custom_axes = None
    if opts.custom_axes:	
        custom_axes = process_custom_axes(opts.custom_axes)
        get_custom_coords(custom_axes, data['map'], data['coord'])
        remove_nans(data['coord'])
        scale_custom_coords(custom_axes,data['coord'])

    # process vectors if requeted
    if opts.add_vectors:
        add_vectors={}
        add_vectors['vectors'] = opts.add_vectors.split(',')
        if len(add_vectors)>3:
            raise ValueError, 'You must add maximum 3 columns but %s' % opts.add_vectors
        
        # Validating RMS values
        if opts.rms_algorithm:
            valid_chars = '_.abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
            for c in opts.rms_path:
                if c not in valid_chars:
                    raise ValueError, 'rms_path (%s) has invalid chars' % opts.rms_path
            add_vectors['rms_output'] = {}
            add_vectors['rms_algorithm']=opts.rms_algorithm
            add_vectors['eigvals'] = data['coord'][3]
        else:
            add_vectors['rms_algorithm'] = None
        add_vectors['rms_path'] = opts.rms_path
    else:
    	add_vectors = None

    if opts.taxa_fname != None:
        # get taxonomy counts
        # get list of sample_ids that haven't been removed
        sample_ids = data['coord'][0]
        # get taxa summaries for all sample_ids
        lineages, taxa_counts = get_taxa(opts.taxa_fname, sample_ids)
        data['taxa'] = {}
        data['taxa']['lineages'] = lineages
        data['taxa']['counts'] = taxa_counts

        # get average relative abundance of taxa
        data['taxa']['prevalence'] = get_taxa_prevalence(data['taxa']['counts'])
        remove_rare_taxa(data['taxa'],nkeep=opts.n_taxa_keep)
        # get coordinates of taxa (weighted mean of sample scores)
        data['taxa']['coord'] = get_taxa_coords(data['taxa']['counts'],
            data['coord'][1])
        data['taxa']['coord']

        # write taxa coords if requested
        if not opts.biplot_output_file is None:
            output = make_biplot_scores_output(data['taxa'])            
            fout = open(opts.biplot_output_file,'w')
            fout.write('\n'.join(output))
            fout.close()


    if opts.output_dir:
        create_dir(opts.output_dir,False)
        dir_path=opts.output_dir
    else:
        dir_path='./'
    
    qiime_dir=get_qiime_project_dir()

    jar_path=os.path.join(qiime_dir,'qiime/support_files/jar/')

    data_dir_path = get_random_directory_name(output_dir=dir_path,
                                              return_absolute_path=False)    
    
    try:
        os.mkdir(data_dir_path)
    except OSError:
        pass

    data_file_path=data_dir_path

    jar_dir_path = os.path.join(dir_path,'jar')
    
    try:
        os.mkdir(jar_dir_path)
    except OSError:
        pass
    
    shutil.copyfile(os.path.join(jar_path,'king.jar'), os.path.join(jar_dir_path,'king.jar'))

    filepath=opts.coord_fname
    if os.path.isdir(filepath):
        coord_files = [fname for fname in os.listdir(filepath) if not \
                           fname.startswith('.')]
        filename = os.path.split(coord_files[0])[-1]
    else:
        filename = os.path.split(filepath)[-1]

    try:
        action = generate_3d_plots
    except NameError:
        action = None

    #Place this outside try/except so we don't mask NameError in action
    if action:
        action(prefs,data,custom_axes,background_color,label_color,dir_path, \
                data_file_path,filename,ellipsoid_prefs=ellipsoid_prefs, \
                add_vectors=add_vectors)


if __name__ == "__main__":
    main()