File: analyseSRST2.py

package info (click to toggle)
srst2 0.2.0-6
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 8,668 kB
  • sloc: python: 3,119; sh: 50; makefile: 28
file content (349 lines) | stat: -rwxr-xr-x 11,520 bytes parent folder | download | duplicates (7)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
#!/usr/bin/env python

# Python Version 2.7.5
#
# Basic analysis of SRST2 output
# Authors - Kathryn Holt (kholt@unimelb.edu.au)
#
# Dependencies:
#	R

from argparse import (ArgumentParser, FileType)
import logging
from subprocess import call, check_output, CalledProcessError, STDOUT
import os, sys, re, collections, operator
from scipy.stats import binom_test, linregress
from math import log
from itertools import groupby
from operator import itemgetter
from collections import OrderedDict

def parse_args():
	"Parse the input arguments, use '-h' for help"

	parser = ArgumentParser(description='Basic analysis of SRST2 output')

	# options
	parser.add_argument('--prev_output', nargs='+', type=str, required=False, help='SRST2 results files to compile')
	parser.add_argument('--sample_metadata', nargs='+', type=str, required=False, help='Sample metadata to merge into table (tab delimited)')	
	parser.add_argument('--output', type=str, required=True, help='Prefix for the output files')
	parser.add_argument('--log', action="store_true", required=False, help='Switch on logging to file (otherwise log to stdout)')
	
	return parser.parse_args() 

# take list of mlst results dictionaries and/or list of gene results dictionaries
# print table of mlst results + gene results for all samples
def compile_results(args,mlst_results,db_results,sample_metadata_hashes,compiled_output_file):

	o = file(compiled_output_file,"w")
	
	# get list of all samples and genes present in these datasets
	sample_list = [] # each entry is a sample present in at least one db
	gene_list = [] # genes read from gene db results
	variable_list = [] # metadata variables
	mlst_cols = 0
	mlst_header_string = ""
	blank_mlst_section = ""
	
	mlst_results_master = {} # compilation of all MLST results
	db_results_master = collections.defaultdict(dict) # compilation of all gene results
	metadata_master = collections.defaultdict(dict) # compilation of all metadata
	
	st_counts = {} # key = ST, value = count
	
	# store mlst data from dict (as string)
	if len(mlst_results) > 0:
	
		for mlst_result in mlst_results:
		
			# check length of the mlst string
			if "Sample" in mlst_result:
				test_string = mlst_result["Sample"]
				if mlst_cols == 0:
					mlst_header_string = test_string
			else:
				test_string = mlst_result[mlst_result.keys()[0]] # no header line?
			test_string_split = test_string.split("\t")
			this_mlst_cols = len(test_string)
			
			if (mlst_cols == 0) or (mlst_cols == this_mlst_cols):
				mlst_cols = this_mlst_cols
				blank_mlst_section = "\t" * (mlst_cols-1) # blank MLST string in case some samples missing
				# use this data
				for sample in mlst_result:
					mlst_results_master[sample] = mlst_result[sample]
					if sample not in sample_list:
						sample_list.append(sample)
			elif mlst_cols != this_mlst_cols:
				# don't process this data further
				logging.info("Problem reconciling MLST data from, first MLST results encountered had " + str(mlst_cols) + " columns, this one has " + str(this_mlst_cols) + " columns?")
				if args.mlst_db:
					logging.info("Compiled report will contain only the MLST data from this run, not previous outputs")
				else:
					logging.info("Compiled report will contain only the data from the first MLST result set provided")

	# store gene data from dict (as dict of dicts)
	if len(db_results) > 0:
		for results in db_results:
			for sample in results:
				if sample not in sample_list:
					sample_list.append(sample)
				for gene in results[sample]:
					if gene != "failed":
						db_results_master[sample][gene] = results[sample][gene]
						if gene not in gene_list:
							gene_list.append(gene)
							
	# store metadata from dict (as dict of dicts)						
	if len(sample_metadata_hashes) > 0:
		for results in sample_metadata_hashes:
			for sample in results:
				if sample not in sample_list:
					sample_list.append(sample)
				for variable in results[sample]:
#					print metadata_master
#					metadata_master[sample] = results[sample][variable]
					metadata_master[sample][variable] = results[sample][variable]
					if variable not in variable_list:
						variable_list.append(variable)
						
	print variable_list
						
	if "Sample" in sample_list:
		sample_list.remove("Sample")
	sample_list.sort()
	gene_list.sort()
	
	# print header
	header_elements = []
	if len(mlst_results) > 0:
		header_elements.append(mlst_header_string) # mlst column headings
	else:
		header_elements.append("Sample")
	if len(variable_list) > 0:
		header_elements += variable_list # metadata column headings
	if len(gene_list) > 0:
		header_elements += gene_list # gene column headings
	o.write("\t".join(header_elements)+"\n")
	
	# print results for all samples
	for sample in sample_list:
	
		sample_info = [] # first entry is mlst string OR sample name, rest are metadata or genes
		
		# print mlst if provided, otherwise just print sample name
		if len(mlst_results_master) > 0:
			if sample in mlst_results_master:
				sample_info.append(mlst_results_master[sample])
				this_st = mlst_results_master[sample].split("\t")[1]
			else:
				sample_info.append(sample+blank_mlst_section)
				this_st = "unknown"
			# record the MLST result				
			if this_st in st_counts:
				st_counts[this_st] += 1
			else:
				st_counts[this_st] = 1
		else:
			sample_info.append(sample)
		
		# get metadata if provided
		if sample in metadata_master:
			for variable in variable_list:
				if variable in metadata_master[sample]:
					sample_info.append(metadata_master[sample][variable])
				else:
					sample_info.append("-")
		else:
			for variable in variable_list:
				sample_info.append("?") # record no gene data on this strain
				
		# get gene info if provided
		if sample in db_results_master:
			for gene in gene_list:
				if gene in db_results_master[sample]:
					sample_info.append(db_results_master[sample][gene])
				else:
					sample_info.append("-")
		else:
			for gene in gene_list:
				sample_info.append("?") # record no gene data on this strain
			
		o.write("\t".join(sample_info)+"\n")
		
	o.close()
	
	logging.info("Compiled data on " + str(len(sample_list)) + " samples printed to: " + compiled_output_file)
	
	# log ST counts
	if len(mlst_results_master) > 0:
		logging.info("Detected " + str(len(st_counts.keys())) + " STs: ")
		sts = st_counts.keys()
		sts.sort()
		for st in sts:
			logging.info("ST" + st + "\t" + str(st_counts[st]))
	
	return True
	
# read data from file into dict
def read_results_from_file(infile,metadata):
	
	if metadata:
	
		# process as with a genes table
		results = collections.defaultdict(dict) # key1 = sample, key2 = gene, value = allele
		with open(infile) as f:
			header = []
			for line in f:
				line_split = line.rstrip().split("\t")
				if len(header) == 0:
					header = line_split
				else:
					sample = line_split[0]
					for i in range(1,len(line_split)):
						variable = header[i]
						results[sample][variable] = line_split[i]

		dbtype = "metadata"
		dbname = infile
		logging.info("Reading metadata from: " + infile)
				
	else:
	
		results_info = infile.split("__")
		
		if len(results_info) > 1:
	
			if results_info[-1] == "compiledResults.txt":
				dbtype = "compiled"
				dbname = results_info[0] # output identifier
			else:
				dbtype = results_info[1] # mlst or genes
				dbname = results_info[2] # database

			logging.info("Processing " + dbtype + " results from file " + infile)
	
			if dbtype == "genes":	
				results = collections.defaultdict(dict) # key1 = sample, key2 = gene, value = allele
				with open(infile) as f:
					header = []
					for line in f:
						line_split = line.rstrip().split("\t")
						if len(header) == 0:
							header = line_split
						else:
							sample = line_split[0]
							for i in range(1,len(line_split)):
								gene = header[i]
								results[sample][gene] = line_split[i]

			elif dbtype == "mlst":	
				results = {} # key = sample, value = MLST string
				with open(infile) as f:
					for line in f:
						results[line.split("\t")[0]] = line.rstrip() # store header line too (index "Sample")
				
			elif dbtype == "compiled":
				results = collections.defaultdict(dict) # key1 = sample, key2 = gene, value = allele
				with open(infile) as f:
					header = []
					mlst_cols = 0 # INDEX of the last mlst column
					n_cols = 0
					for line in f:
						line_split = line.rstrip().split("\t")
						if len(header) == 0:
							header = line_split
							n_cols = len(header)
							if header[1] == "ST":
								# there is mlst data reported
								mlst_cols = 2 # first locus column
								while header[mlst_cols] != "depth":
									mlst_cols += 1
								results["Sample"]["mlst"] = "\t".join(line_split[0:(mlst_cols+1)])
						else:
							sample = line_split[0]
							if mlst_cols > 0:
								results[sample]["mlst"] = "\t".join(line_split[0:(mlst_cols+1)])
							if n_cols > mlst_cols:
								# read genes component
								for i in range(mlst_cols+1,n_cols):
									# note i=1 if mlst_cols==0, ie we are reading all
									gene = header[i]
									if len(line_split) > i:
										results[sample][gene] = line_split[i]
									else:
										results[sample][gene] = "-"
							
		else:
			results = False
			dbtype = False
			dbname = False
			logging.info("Couldn't decide what to do with file results file provided: " + infile)
				
	return results, dbtype, dbname

	
def main():
	args = parse_args()
	if args.log is True:
		logfile = args.output + ".log"
	else:
		logfile = None
	logging.basicConfig(
		filename=logfile,
		level=logging.DEBUG,
		filemode='w',
		format='%(asctime)s %(message)s',
		datefmt='%m/%d/%Y %H:%M:%S')
	logging.info('program started')
	logging.info('command line: {0}'.format(' '.join(sys.argv)))

	# vars to store results
	mlst_results_hashes = [] # dict (sample->MLST result string) for each MLST output files created/read
	gene_result_hashes = [] # dict (sample->gene->result) for each gene typing output files created/read
	sample_metadata_hashes = [] # dict (sample->variable->value) for each variable read from metadata file

	# read in results from prior results files
	if args.prev_output:
	
		unique_results_files = list(OrderedDict.fromkeys(args.prev_output))
	
		for results_file in unique_results_files:
		
			results, dbtype, dbname = read_results_from_file(results_file, False)
			
			if dbtype == "mlst":
				mlst_results_hashes.append(results)
				
			elif dbtype == "genes":
				gene_result_hashes.append(results)
				
			elif dbtype == "compiled":
				# store mlst in its own db
				mlst_results = {}
				for sample in results:
					if "mlst" in results[sample]:
						mlst_results[sample] = results[sample]["mlst"]
						del results[sample]["mlst"]
				mlst_results_hashes.append(mlst_results)
				gene_result_hashes.append(results)
				
	# read in metadata from file
	if args.sample_metadata:
		for metadata_file in args.sample_metadata:
			results, dbtype, dbname = read_results_from_file(metadata_file, True)
			sample_metadata_hashes.append(results)

	# compile results if multiple databases or datasets provided
	if ( (len(gene_result_hashes) + len(mlst_results_hashes) + len(sample_metadata_hashes)) > 1 ):
		compiled_output_file = args.output + "__compiledResults.txt"
		compile_results(args,mlst_results_hashes,gene_result_hashes,sample_metadata_hashes,compiled_output_file)
	
	elif args.prev_output:
		logging.info('One previous output file was provided, but there is no other data to compile with.')
	
	logging.info('SRST2 has finished.')


if __name__ == '__main__':
	main()