File: transform_coordinate_matrices.py

package info (click to toggle)
qiime 1.4.0-2
  • links: PTS, VCS
  • area: main
  • in suites: wheezy
  • size: 29,704 kB
  • sloc: python: 77,837; haskell: 379; sh: 113; makefile: 103
file content (130 lines) | stat: -rwxr-xr-x 5,798 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#!/usr/bin/env python
# File created on 21 Feb 2010
from __future__ import division

__author__ = "Greg Caporaso"
__copyright__ = "Copyright 2011, The QIIME Project"
__credits__ = ["Greg Caporaso","Justin Kuczynski","Jose Carlos Clemente Litran"]
__license__ = "GPL"
__version__ = "1.4.0"
__maintainer__ = "Greg Caporaso"
__email__ = "gregcaporaso@gmail.com"
__status__ = "Release"

from qiime.util import make_option
from os.path import split, splitext, exists
from os import makedirs
from numpy import log10
from qiime.util import parse_command_line_parameters
from qiime.parse import fields_to_dict
from qiime.format import format_p_value_for_num_iters
from qiime.transform_coordinate_matrices import procrustes_monte_carlo,\
    get_procrustes_results

script_info={}
script_info['brief_description']="""Transform 2 coordinate matrices"""
script_info['script_description']="""This script transforms 2 coordinate matrices (e.g., the output of principal_coordinates.py) using procrustes analysis to minimize the distances between corresponding points. Monte Carlo simulations can additionally be performed (-r random trials are run) to estimate the probability of seeing an M^2 value as extreme as the actual M^2."""
script_info['script_usage']=[]
script_info['script_usage'].append(("Generate monte carlo p-values","","""%prog -r 1000 -i weighted_unifrac_coords.txt,unweighted_unifrac_coords.txt""",))
  
script_info['script_usage'].append(("Write the transformed procrustes matrices to file","","""%prog -o out/ -i weighted_unifrac_coords.txt,unweighted_unifrac_coords.txt"""))

script_info['output_description']="""Two transformed coordinate matrices corresponding to the two input coordinate matrices, and (if -r was specified) a text file summarizing the results of the Monte Carlo simulations."""
script_info['required_options']=[\
 make_option('-i','--input_fps',type='existing_filepaths',help='comma-separated input files'),\
 make_option('-o','--output_dir',type='new_dirpath',help='the output directory'),\
]
script_info['optional_options']=[\
 make_option('-r','--random_trials',type='int',
    help='Number of random permutations of matrix2 to perform. '+
    ' [default: (no Monte Carlo analysis performed)]',default=None),
 make_option('-d','--num_dimensions',type='int',default=3,
    help='Number of dimensions to include in output matrices'+
    ' [default: %default]'),
 make_option('-s','--sample_id_map_fp',
    type='existing_filepath',
    help='Map of original sample ids to new sample ids [default: %default]',
    default=None),
 make_option('--store_trial_details',
    help='Store PC matrices for individual trials [default: %default]',
    default=False,action='store_true'),
]

script_info['version'] = __version__



def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    random_trials = opts.random_trials
    if random_trials != None and random_trials < 10:
        option_parser.error('Must perform >= 10 trails for Monte Carlo analysis.')
        
    output_dir = opts.output_dir
    sample_id_map_fp = opts.sample_id_map_fp
    num_dimensions = opts.num_dimensions
    
    if not exists(output_dir): 
        makedirs(output_dir)
    
    if opts.store_trial_details:
        trial_output_dir = '%s/trial_details/' % output_dir
    else:
        trial_output_dir = None
  
    input_fp1 = opts.input_fps[0]
    input_fp2 = opts.input_fps[1]
    input_fp1_dir, input_fn1 = split(input_fp1)
    input_fp1_basename, input_fp1_ext = splitext(input_fn1)
    input_fp2_dir, input_fn2 = split(input_fp2)
    input_fp2_basename, input_fp2_ext = splitext(input_fn2)
    output_summary_fp = '%s/%s_%s_procrustes_results.txt' %\
     (output_dir,input_fp1_basename,input_fp2_basename)
    output_matrix1_fp = '%s/pc1_transformed.txt' % output_dir
    output_matrix2_fp = '%s/pc2_transformed.txt' % output_dir
    
    if sample_id_map_fp:
        sample_id_map = dict([(k,v[0]) \
         for k,v in fields_to_dict(open(sample_id_map_fp, "U")).items()])
    else:
        sample_id_map = None
    
    transformed_coords1, transformed_coords2, m_squared =\
      get_procrustes_results(open(input_fp1,'U'),\
                             open(input_fp2,'U'),\
                             sample_id_map=sample_id_map,\
                             randomize=False,
                             max_dimensions=num_dimensions)
    output_matrix1_f = open(output_matrix1_fp,'w')
    output_matrix1_f.write(transformed_coords1)
    output_matrix1_f.close()
    output_matrix2_f = open(output_matrix2_fp,'w')
    output_matrix2_f.write(transformed_coords2)
    output_matrix2_f.close()
    
    if random_trials:
        summary_file_lines = ['FP1 FP2 Included_dimensions MC_p_value Count_better M^2']
        coords_f1 = list(open(input_fp1,'U'))
        coords_f2 = list(open(input_fp2,'U'))
        actual_m_squared, trial_m_squareds, count_better, mc_p_value =\
         procrustes_monte_carlo(coords_f1,\
                                coords_f2,\
                                trials=random_trials,\
                                max_dimensions=num_dimensions,
                                sample_id_map=sample_id_map,
                                trial_output_dir=trial_output_dir)
        # truncate the p-value to the correct number of significant
        # digits
        mc_p_value_str = format_p_value_for_num_iters(mc_p_value, random_trials)
        max_dims_str = str(num_dimensions or 'alldim')
        summary_file_lines.append('%s %s %s %s %d %1.3f' %\
         (input_fp1, input_fp2, str(max_dims_str), mc_p_value_str,\
          count_better, actual_m_squared))
        f = open(output_summary_fp,'w')
        f.write('\n'.join(summary_file_lines))
        f.close()


if __name__ == "__main__":
    main()