File: make_argannot_fasta.py

package info (click to toggle)
kleborate 2.4.1-6
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 175,052 kB
  • sloc: python: 3,498; sh: 76; makefile: 10
file content (71 lines) | stat: -rwxr-xr-x 2,300 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#!/usr/bin/env python3

"""
This script modifies the ARGannot_r3.fasta file (comes with SRST2) for use with Kleborate.
Specifically, it removes alleles which are not in the ARGannot_clustered80_r3.csv file.

It takes two arguments:
  1) SRST2's copy of ARGannot_r3.fasta
  2) the ARGannot_clustered80_r3.csv file made by make_argannot_csv.py

Usage:
  ./make_argannot_fasta.py path/to/srst2/data/ARGannot_r3.fasta ARGannot_clustered80_r3.csv > ARGannot_r3.fasta
"""

import sys


def main():
    argannot_fasta_filename = sys.argv[1]
    argannot_csv_filename = sys.argv[2]

    allele_names = set()
    with open(argannot_csv_filename, 'rt') as argannot_csv_file:
        for line in argannot_csv_file:
            allele_names.add(line.split(',')[4])

    fasta_seqs = load_fasta(argannot_fasta_filename)
    fasta_seqs = sorted(fasta_seqs, key=fasta_sorting_key)

    for name, seq in fasta_seqs:
        name = name.replace('NimB_Nitroimidazole_Gene', 'NimB')  # fix verbose gene name
        name = name.replace('_Colistin__', '_Col__')  # colistin class name to just three letters
        name = name.replace('_Agly__', '_AGly__')  # fix capitalisation
        allele_name = name.split('__')[2]
        if allele_name in allele_names:
            print('>' + name)
            print(seq, end='')


def load_fasta(filename):
    """Returns the names and sequences for the given fasta file."""
    fasta_seqs = []
    with open(filename, 'rt') as fasta_file:
        name = ''
        sequence = ''
        for line in fasta_file:
            stripped_line = line.strip()
            if not stripped_line:
                continue
            if stripped_line[0] == '>':  # Header line = start of new contig
                if name:
                    fasta_seqs.append((name, sequence))
                    sequence = ''
                name = stripped_line[1:]
            else:
                sequence += line
        if name:
            fasta_seqs.append((name, sequence))
    return fasta_seqs


def fasta_sorting_key(fasta_record):
    """
    This function defines a sorting key for fasta so 'TEM-2' will come before 'TEM-10', etc.
    """
    name_parts = fasta_record[0].split()[0].split('__')
    return int(name_parts[0]), int(name_parts[3])


if __name__ == '__main__':
    main()