#!/usr/bin/env python
'''
Download MLST datasets from this site: http://pubmlst.org/data/ by
parsing an xml file (http://pubmlst.org/data/dbases.xml).
Data is downloaded for a species determined by the user:
- profiles (maps STs to allele numbers)
- numbered sequences for each locus in the scheme
In addition, the alleles are concatenated together for use with SRST2.
A log file is also generated in the working directory, detailing the
time, date and location of all files downloaded, as well as the <retrieved>
tag which tells us when the XML entry was last updated.
If the species name input by the user matches multiple <species> in the
xml file, the script simply reports the possible matches so the user can
try again.
'''
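# A minimal usage sketch (the script filename here is illustrative; the species
# value is the same example given in the --species help text below):
#
#   python getmlst.py --species "Escherichia coli"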
from argparse import ArgumentParser
import xml.dom.minidom as xml
import urllib2 as url
import re, os, glob
from urlparse import urlparse
def parse_args():
    parser = ArgumentParser(description='Download MLST datasets by species '
                            'from pubmlst.org.')
parser.add_argument('--repository_url',
metavar = 'URL',
default = 'http://pubmlst.org/data/dbases.xml',
help = 'URL for MLST repository XML index')
parser.add_argument('--species',
metavar = 'NAME',
required = True,
help = 'The name of the species that you want to download (e.g. "Escherichia coli")')
parser.add_argument('--force_scheme_name',
action="store_true",
default = False,
        help = 'Flag to force downloading of a specific scheme name (e.g. "Clostridium difficile")')
return parser.parse_args()
# test whether a node is an element node with a specific tag name
def testElementTag(node, name):
return node.nodeType == node.ELEMENT_NODE and node.localName == name
# Get the text from an element node with a text node child
def getText(element):
result = ''
for node in element.childNodes:
if node.nodeType == node.TEXT_NODE:
result += node.data
return normaliseText(result)
# remove unwanted whitespace including linebreaks etc.
def normaliseText(text):
    return ' '.join(text.split())
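# e.g. normaliseText('  Escherichia \n coli  ') returns 'Escherichia coli'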
# A collection of interesting information about a single taxon
class SpeciesInfo(object):
def __init__(self):
self.name = None # String name of species
self.database_url = None # URL as string
self.retrieved = None # date as string
self.profiles_url = None # URL as string
self.profiles_count = None # positive integer
self.loci = [] # list of loci
class LocusInfo(object):
def __init__(self):
self.url = None
self.name = None
# retrieve the interesting information for a given species element
def getSpeciesInfo(species_node, species, exact):
this_name = getText(species_node)
store = False
if exact:
if this_name == species:
store = True
else:
if this_name.startswith(species):
store = True
if store:
info = SpeciesInfo()
info.name = this_name
for mlst_node in species_node.getElementsByTagName('mlst'):
for database_node in mlst_node.getElementsByTagName('database'):
for database_child_node in database_node.childNodes:
if testElementTag(database_child_node, 'url'):
info.database_url = getText(database_child_node)
elif testElementTag(database_child_node, 'retrieved'):
info.retrieved = getText(database_child_node)
elif testElementTag(database_child_node, 'profiles'):
for profile_count in database_child_node.getElementsByTagName('count'):
info.profiles_count = getText(profile_count)
for profile_url in database_child_node.getElementsByTagName('url'):
info.profiles_url = getText(profile_url)
elif testElementTag(database_child_node, 'loci'):
for locus_node in database_child_node.getElementsByTagName('locus'):
locus_info = LocusInfo()
locus_info.name = getText(locus_node)
for locus_url in locus_node.getElementsByTagName('url'):
locus_info.url = getText(locus_url)
info.loci.append(locus_info)
return info
else:
return None
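# Sketch of the XML fragment getSpeciesInfo() walks, inferred purely from the
# tag names used above (see http://pubmlst.org/data/dbases.xml for the live index;
# the species and locus names shown are illustrative):
#
#   <species>Escherichia coli
#     <mlst>
#       <database>
#         <url>...</url>
#         <retrieved>...</retrieved>
#         <profiles><count>...</count><url>...</url></profiles>
#         <loci>
#           <locus>adk
#             <url>...</url>
#           </locus>
#           ...
#         </loci>
#       </database>
#     </mlst>
#   </species>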
def main():
args = parse_args()
docFile = url.urlopen(args.repository_url)
doc = xml.parse(docFile)
root = doc.childNodes[0]
found_species = []
for species_node in root.getElementsByTagName('species'):
info = getSpeciesInfo(species_node, args.species, args.force_scheme_name)
        if info is not None:
found_species.append(info)
if len(found_species) == 0:
print "No species matched your query."
exit(1)
if len(found_species) > 1:
print "The following {} species match your query, please be more specific:".format(len(found_species))
for info in found_species:
print info.name
exit(2)
assert len(found_species) == 1
species_info = found_species[0]
species_name_underscores = species_info.name.replace(' ', '_')
species_name_underscores = species_name_underscores.replace('/', '_')
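    # e.g. "Escherichia coli" -> "Escherichia_coli"; the '/' replacement guards
    # against scheme names that contain a slash, which is unsafe in filenames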
# Remove any Bowtie/Samtools index files that already exist.
for filename in glob.glob(species_name_underscores + '*.bt2'):
os.remove(filename)
for filename in glob.glob(species_name_underscores + '*.fai'):
os.remove(filename)
# output information for the single matching species
species_all_fasta_filename = species_name_underscores + '.fasta'
species_all_fasta_file = open(species_all_fasta_filename, 'w')
log_filename = "mlst_data_download_{}_{}.log".format(species_name_underscores, species_info.retrieved)
log_file = open(log_filename, "w")
profile_path = urlparse(species_info.profiles_url).path
profile_filename = profile_path.split('/')[-1]
log_file.write("definitions: {}\n".format(profile_filename))
log_file.write("{} profiles\n".format(species_info.profiles_count))
log_file.write("sourced from: {}\n\n".format(species_info.profiles_url))
profile_doc = url.urlopen(species_info.profiles_url)
profile_file = open(profile_filename, 'w')
profile_file.write(profile_doc.read())
profile_file.close()
profile_doc.close()
for locus in species_info.loci:
locus_path = urlparse(locus.url).path
locus_filename = locus_path.split('/')[-1]
log_file.write("locus {}\n".format(locus.name))
log_file.write(locus_filename + '\n')
log_file.write("Sourced from {}\n\n".format(locus.url))
locus_doc = url.urlopen(locus.url)
locus_file = open(locus_filename, 'w')
locus_fasta_content = locus_doc.read()
locus_file.write(locus_fasta_content)
species_all_fasta_file.write(locus_fasta_content)
locus_file.close()
locus_doc.close()
log_file.write("all loci: {}\n".format(species_all_fasta_filename))
log_file.close()
species_all_fasta_file.close()
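    # Allele FASTA headers in these databases generally look like ">abcZ_1" or
    # ">abcZ-1" (locus name, delimiter, allele number); "abcZ" is only an
    # illustrative locus name. The regex below guesses which delimiter this
    # database uses from the first header in the combined file.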
print "\n For SRST2, remember to check what separator is being used in this allele database"
head = os.popen('head -n 1 ' + species_all_fasta_filename).read().rstrip()
m = re.match('>(.*)([_-])(\d*)',head).groups()
if len(m)==3:
print
print " Looks like --mlst_delimiter '" + m[1] + "'"
print
print " " + head + " --> --> ",
print m
print
print " Suggested srst2 command for use with this MLST database:"
print
print " srst2 --output test --input_pe *.fastq.gz --mlst_db " + species_name_underscores + '.fasta',
print "--mlst_definitions " + format(profile_filename),
print "--mlst_delimiter '" + m[1] + "'"
print
if __name__ == '__main__':
main()