File: getmlst.py

package info
srst2 0.2.0-6
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 8,668 kB
  • sloc: python: 3,119; sh: 50; makefile: 28
file content (198 lines) | stat: -rwxr-xr-x 7,137 bytes
#!/usr/bin/env python

'''
Download MLST datasets from http://pubmlst.org/data/ by parsing the
XML index file at http://pubmlst.org/data/dbases.xml.

Data is downloaded for a species determined by the user:
- profiles (maps STs to allele numbers)
- numbered sequences for each locus in the scheme

In addition, the alleles are concatenated together for use with SRST2.

A log file is also generated in the working directory, detailing the
time, date and location of all files downloaded, as well as the
<retrieved> tag, which records when the XML entry was last updated.

If the species name input by the user matches multiple <species> in the
xml file, the script simply reports the possible matches so the user can
try again.
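
Example usage (a sketch; it assumes this script is executable and on your
PATH, and that the species name matches an entry in the XML index):

	getmlst.py --species "Escherichia coli"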
'''

from argparse import ArgumentParser
import xml.dom.minidom as xml
import urllib2 as url
import re, os, glob
from urlparse import urlparse

def parse_args():
	parser = ArgumentParser(description='Download MLST datasets by species '
										'from pubmlst.org.')

	parser.add_argument('--repository_url',
						metavar = 'URL',
						default = 'http://pubmlst.org/data/dbases.xml',
						help = 'URL for MLST repository XML index')

	parser.add_argument('--species',
						metavar = 'NAME',
						required = True,
						help = 'The name of the species that you want to download (e.g. "Escherichia coli")')
						
	parser.add_argument('--force_scheme_name',
						action="store_true",
						default = False,
						help = 'Flag to force downloading of a specific scheme name (e.g. "Clostridium difficile")')
						
	return parser.parse_args()

# test if a node is an Element and that it has a specific tag name
def testElementTag(node, name):
	return node.nodeType == node.ELEMENT_NODE and node.localName == name

# Get the text from an element node with a text node child
def getText(element):
	result = ''
	for node in element.childNodes:
		if node.nodeType == node.TEXT_NODE:
			result += node.data
	return normaliseText(result)

# remove unwanted whitespace including linebreaks etc.
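# For example, normaliseText('  Escherichia \n coli ') returns 'Escherichia coli'.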
def normaliseText(text):
	return ' '.join(text.split())

# A collection of interesting information about a taxon
class SpeciesInfo(object):
	def __init__(self):
		self.name = None # String name of species
		self.database_url = None # URL as string
		self.retrieved = None # date as string 
		self.profiles_url = None # URL as string 
		self.profiles_count = None # positive integer
		self.loci = [] # list of loci 

class LocusInfo(object):
	def __init__(self):
		self.url = None
		self.name = None
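
# For reference, the fragment of dbases.xml that getSpeciesInfo below walks
# looks roughly like this (a sketch inferred from the tags handled in the
# code, not a verbatim excerpt of the pubmlst.org index):
#
#   <species>Escherichia coli
#     <mlst>
#       <database>
#         <url>...</url>
#         <retrieved>...</retrieved>
#         <profiles><count>...</count><url>...</url></profiles>
#         <loci>
#           <locus>adk<url>...</url></locus>
#           ...
#         </loci>
#       </database>
#     </mlst>
#   </species>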

# retrieve the interesting information for a given species element
def getSpeciesInfo(species_node, species, exact):
	this_name = getText(species_node)
	store = False
	if exact:
		if this_name == species:
			store = True
	else:
		if this_name.startswith(species):
			store = True
	if store:
		info = SpeciesInfo()
		info.name = this_name
		for mlst_node in species_node.getElementsByTagName('mlst'):
			for database_node in mlst_node.getElementsByTagName('database'):
				for database_child_node in database_node.childNodes:
					if testElementTag(database_child_node, 'url'):
						info.database_url = getText(database_child_node)
					elif testElementTag(database_child_node, 'retrieved'):
						info.retrieved = getText(database_child_node)
					elif testElementTag(database_child_node, 'profiles'):
						for profile_count in database_child_node.getElementsByTagName('count'):
							info.profiles_count = getText(profile_count)
						for profile_url in database_child_node.getElementsByTagName('url'):
							info.profiles_url = getText(profile_url)
					elif testElementTag(database_child_node, 'loci'):
						for locus_node in database_child_node.getElementsByTagName('locus'):
							locus_info = LocusInfo()
							locus_info.name = getText(locus_node)
							for locus_url in locus_node.getElementsByTagName('url'):
								locus_info.url = getText(locus_url)
							info.loci.append(locus_info)
		return info
	else:
		return None


def main():
	args = parse_args()
	docFile = url.urlopen(args.repository_url)
	doc = xml.parse(docFile)
	root = doc.childNodes[0]
	found_species = []
	for species_node in root.getElementsByTagName('species'):
		info = getSpeciesInfo(species_node, args.species, args.force_scheme_name)
		if info is not None:
			found_species.append(info)
	if len(found_species) == 0:
		print "No species matched your query."
		exit(1)
	if len(found_species) > 1:
		print "The following {} species match your query, please be more specific:".format(len(found_species))
		for info in found_species:
			print info.name
		exit(2)

	assert len(found_species) == 1
	species_info = found_species[0]
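	# Build a filesystem-safe filename prefix from the species name,
	# e.g. "Escherichia coli" becomes "Escherichia_coli".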
	species_name_underscores = species_info.name.replace(' ', '_')
	species_name_underscores = species_name_underscores.replace('/', '_')

	# Remove any Bowtie/Samtools index files that already exist.
	for filename in glob.glob(species_name_underscores + '*.bt2'):
		os.remove(filename)
	for filename in glob.glob(species_name_underscores + '*.fai'):
		os.remove(filename)

	# output information for the single matching species
	species_all_fasta_filename = species_name_underscores + '.fasta'
	species_all_fasta_file = open(species_all_fasta_filename, 'w')
	log_filename = "mlst_data_download_{}_{}.log".format(species_name_underscores, species_info.retrieved)
	log_file = open(log_filename, "w")
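
	# Download the tab-delimited ST profiles table, recording its filename
	# and source URL in the log.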
	profile_path = urlparse(species_info.profiles_url).path
	profile_filename = profile_path.split('/')[-1]
	log_file.write("definitions: {}\n".format(profile_filename))
	log_file.write("{} profiles\n".format(species_info.profiles_count))
	log_file.write("sourced from: {}\n\n".format(species_info.profiles_url))
	profile_doc = url.urlopen(species_info.profiles_url)
	profile_file = open(profile_filename, 'w')
	profile_file.write(profile_doc.read())
	profile_file.close()
	profile_doc.close()
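
	# Download each locus FASTA: keep a per-locus copy on disk and append the
	# sequences to the combined FASTA used by SRST2.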
	for locus in species_info.loci:
		locus_path = urlparse(locus.url).path
		locus_filename = locus_path.split('/')[-1]
		log_file.write("locus {}\n".format(locus.name))
		log_file.write(locus_filename + '\n')
		log_file.write("Sourced from {}\n\n".format(locus.url))
		locus_doc = url.urlopen(locus.url)
		locus_file = open(locus_filename, 'w')
		locus_fasta_content = locus_doc.read()
		locus_file.write(locus_fasta_content)
		species_all_fasta_file.write(locus_fasta_content)
		locus_file.close()
		locus_doc.close()
	log_file.write("all loci: {}\n".format(species_all_fasta_filename))
	log_file.close()
	species_all_fasta_file.close()
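	# The finished log file looks roughly like this (a sketch based on the
	# write() calls above; angle brackets stand for values filled in at run time):
	#
	#   definitions: <profiles filename>
	#   <count> profiles
	#   sourced from: <profiles url>
	#
	#   locus <name>
	#   <locus filename>
	#   Sourced from <locus url>
	#   ...
	#   all loci: <species name>.fasta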

	print "\n  For SRST2, remember to check what separator is being used in this allele database"
	# Guess the delimiter between locus name and allele number from the first
	# FASTA header, which typically looks like ">adk_1" or ">adk-1".
	with open(species_all_fasta_filename) as all_fasta:
		head = all_fasta.readline().rstrip()
	match = re.match('>(.*)([_-])(\d*)', head)
	if match is not None:
		m = match.groups()
		print
		print "  Looks like --mlst_delimiter '" + m[1] + "'"
		print
		print "  " + head + "  --> -->  ",
		print m
	print
	print "  Suggested srst2 command for use with this MLST database:"
	print
	print "    srst2 --output test --input_pe *.fastq.gz --mlst_db " + species_name_underscores + '.fasta',
	print "--mlst_definitions " + profile_filename,
	if match is not None:
		print "--mlst_delimiter '" + m[1] + "'"
	else:
		print
	print


if __name__ == '__main__':
	main()