File: consensus_alignment.py

package info (click to toggle)
srst2 0.2.0-6
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 8,668 kB
  • sloc: python: 3,119; sh: 50; makefile: 28
file content (57 lines) | stat: -rw-r--r-- 2,046 bytes parent folder | download | duplicates (7)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
# input = consensus alleles fasta files output by srst2 (*.all_consensus_alleles.fasta), containing >1 locus per file
# output = one fasta file of seqs per locus
# Author: Kat Holt (kholt@unimelb.edu.au)

# modules
import string, re, collections
import os, sys, subprocess
from argparse import (ArgumentParser, FileType)

# BioPython modules for reading and writing sequences
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC

def parse_args():
	"Parse the input arguments, use '-h' for help."

	parser = ArgumentParser(description='SRST2 - Short Read Sequence Typer (v2)')

	parser.add_argument(
		'--input', nargs='+', type=str, required=False,
		help='Input files (should be *.all_consensus_alleles.fasta)')

	parser.add_argument('--pre', type=str, required=True, help='Prefix for output files')
	parser.add_argument('--type', type=str, required=True, help='either mlst or gene')
	parser.add_argument('--mlst_delimiter', type=str, required=False, default="-",
		help='For MLST seqs, specifcy the character(s) separating gene name from allele number in MLST database (default "-", as in arcc-1)')

	return parser.parse_args() 


def main():
	args = parse_args()
	
	# seqs = collections.defaultdict(dict) # key1 = locus, key2 = strain, value = sequence
	
	# read in seqs and print each new locus to a file
	for fasta_file in args.input:
		for record in SeqIO.parse(open(fasta_file, "r"), "fasta"):
			x = record.description # locus strain
			strain = x.split()[1]
			locus = x.split()[0]
			if args.type == "mlst":
				locus_id = locus.split(args.mlst_delimiter)[0]
				allele = locus
			elif args.type == "gene":
				locus_id = locus.split("__")[0] + "__" + locus.split("__")[1]
				allele = locus.split("__")[2] + "__" + locus.split("__")[3]
			out_file = args.pre + "__" + locus_id + ".fasta"
			o = file(out_file,"a") # open seq file for appending
			o.write(">" + strain + " " + locus + "\n")
			o.write(str(record.seq) + "\n")
			o.close()

if __name__ == '__main__':
	main()