File: cluster_quality.py

package info (click to toggle)
qiime 1.4.0-2
  • links: PTS, VCS
  • area: main
  • in suites: wheezy
  • size: 29,704 kB
  • sloc: python: 77,837; haskell: 379; sh: 113; makefile: 103
file content (91 lines) | stat: -rwxr-xr-x 3,331 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
#!/usr/bin/env python
# File created on 10 Mar 2011
from __future__ import division

# Module-level metadata for this script. __version__ is also stored under
# script_info['version'] further down in this file.
__author__ = "Justin Kuczynski"
__copyright__ = "Copyright 2011, The QIIME project"
__credits__ = ["Justin Kuczynski"]
__license__ = "GPL"
__version__ = "1.4.0"
__maintainer__ = "Justin Kuczynski"
__email__ = "justinak@gmail.com"
__status__ = "Release"
 

from qiime.util import parse_command_line_parameters
from qiime.util import make_option
from qiime.parse import parse_mapping_file_to_dict, parse_distmat
import os
import sys
import numpy

from qiime.cluster_quality import clust_qual_ratio

# Script metadata and option definitions. main() expands this dict into
# keyword arguments for parse_command_line_parameters().
script_info = {
    'brief_description': "compute the quality of a cluster",
    'script_description': (
        "The input is a distance matrix "
        "(i.e. resulting file from beta_diversity.py)."),
    'script_usage': [
        # Usage example for clustering by the Treatment column.
        (
            "cluster quality based on the treatment category:",
            "to compute the quality of clusters, and print to stdout, "
            "use the following idiom:",
            "cluster_quality.py -i unweighted_unifrac_distance_matrix.txt "
            "-m Fasting_Map.txt -c Treatment",
        ),
        # Placeholder entry made of three empty strings.
        ('', '', ''),
    ],
    'output_description': (
        "The output is either a single number (with -s), or a more "
        "detailed output of the similarity between and within clusters."),
    'required_options': [
        make_option('-i', '--input_path',
                    help='input distance matrix file'),
        make_option('-m', '--map',
                    help='mapping file'),
        make_option('-c', '--category',
                    help='column of mapping file delimiting clusters'),
    ],
    'optional_options': [
        make_option('-o', '--output_path', default=None,
                    help='output path, prints to stdout if omitted'),
        make_option('-s', '--short', action="store_true",
                    help=('print only the ratio of mean dissimilarities '
                          'between/within clusters instead of more '
                          'detailed output')),
        # NOTE: main() in this file never reads opts.metric; the option is
        # only declared here.
        make_option('--metric', default='ratio',
                    help=('choice of quality metric to apply. Currently '
                          'only one option exists, the ratio of '
                          'mean(distances between samples from different '
                          'clusters) to mean(distances between samples '
                          'from the same cluster) Default: %default')),
    ],
    'version': __version__,
}


def main():
    """Run the cluster_quality command-line script.

    Parses the command line described by script_info, reads the distance
    matrix (opts.input_path) and the mapping file (opts.map), and obtains
    the between-cluster and within-cluster dissimilarities from
    clust_qual_ratio for the mapping column opts.category.

    The report is written to opts.output_path, or to stdout when no output
    path was given.  With opts.short set, only the ratio
    mean(between) / mean(within) is written; otherwise the ratio is
    followed by the mean, standard deviation and count of each of the two
    groups of dissimilarities.

    Every file opened here is closed before returning, also on error;
    sys.stdout is never closed.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # Remember whether we own the output handle so that only a file opened
    # here gets closed in the finally clause below.
    write_to_file = opts.output_path is not None
    if write_to_file:
        outf = open(opts.output_path, 'w')
    else:
        outf = sys.stdout

    try:
        # The inputs stay open until clust_qual_ratio has returned, so no
        # assumption is made about the parsers reading their input eagerly.
        with open(opts.input_path, 'U') as distmat_f:
            dists = parse_distmat(distmat_f)
            with open(opts.map, 'U') as map_f:
                map_data = parse_mapping_file_to_dict(map_f)
                diff_dists, same_dists = clust_qual_ratio(
                    dists, map_data, opts.category)

        # Compute each mean once; both feed the ratio and the detailed rows.
        diff_mean = numpy.mean(diff_dists)
        same_mean = numpy.mean(same_dists)
        ratio = diff_mean / same_mean

        if opts.short:
            report = [ratio]
        else:
            between_row = [diff_mean, numpy.std(diff_dists), len(diff_dists)]
            within_row = [same_mean, numpy.std(same_dists), len(same_dists)]
            report = [
                "dissimilarity ratio between/within (large for clustered data):",
                ratio,
                "dissimilarities between clusters: mean, std, num:",
                '\t'.join(map(str, between_row)),
                "dissimilarities within clusters: mean, std, num:",
                '\t'.join(map(str, within_row)),
            ]

        # One entry per line, each formatted with str() as print would do.
        for entry in report:
            outf.write('%s\n' % (entry,))
    finally:
        if write_to_file:
            outf.close()



# Execute main() only when this file is run as a script, not on import.
if __name__ == '__main__':
    main()