File: bla_info.py

package info (click to toggle)
kleborate 2.4.1-6
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 175,052 kB
  • sloc: python: 3,498; sh: 76; makefile: 10
file content (187 lines) | stat: -rwxr-xr-x 8,275 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
#!/usr/bin/env python3

"""
This script generates a table of beta-lactamase class information.

It takes two arguments:
  1) The Lahey info file at ftp://ftp.ncbi.nlm.nih.gov/pathogen/betalactamases/Lahey.tab
  2) A GenBank file downloaded from these records:
     https://www.ncbi.nlm.nih.gov/nuccore?term=313047%5BBioProject%5D
     (use 'Send to' then 'File' and set the format to GenBank)

Usage:
  ./bla_info.py Lahey.tab sequence.gb > bla_info_table

The output has three columns:
  * allele name
  * allele description
  * bla class

The bla class column can have these six possible values:
  * Bla_broad (broad spectrum beta-lactamases)
  * Bla_broad_inhR (broad spectrum beta-lactamases with resistance to beta-lactamase inhibitors)
  * Bla_ESBL (extended spectrum beta-lactamases)
  * Bla_ESBL_inhR (extended spectrum beta-lactamases with resistance to beta-lactamase inhibitors)
  * Bla_Carb (carbapenemase)
  * Bla (beta-lactamase that is none of the above)
"""

import sys
import re
from Bio import SeqIO


def main():
    lahey_filename = sys.argv[1]
    genbank_filename = sys.argv[2]

    lahey_descriptions = get_lahey_descriptions(lahey_filename)
    genbank_descriptions = get_genbank_descriptions(genbank_filename)

    all_alleles = sorted(set(list(lahey_descriptions.keys()) + list(genbank_descriptions.keys())),
                         key=allele_name_sorting_key)

    for allele in all_alleles:

        # If the allele is in only one of the two sources, then we just get the description/class
        # from the single source.
        if allele in lahey_descriptions and allele not in genbank_descriptions:
            description = lahey_descriptions[allele]
            bla_class = bla_class_from_description(description)
        elif allele in genbank_descriptions and allele not in lahey_descriptions:
            description = genbank_descriptions[allele]
            bla_class = bla_class_from_description(description)

        # If the allele is in both Lahey and the GenBank...
        else:
            lahey_description = lahey_descriptions[allele]
            genbank_description = genbank_descriptions[allele]
            lahey_bla_class = bla_class_from_description(lahey_description)
            genbank_bla_class = bla_class_from_description(genbank_description)

            # If they agree on the class, then we use the longer description (assuming that the
            # longer one is more descriptive).
            if lahey_bla_class == genbank_bla_class:
                bla_class = lahey_bla_class
                description = lahey_description
                if len(genbank_description) > len(lahey_description):
                    description = genbank_description

            # If they disagree on the class, then we choose whichever class is 'worse'.
            else:
                class_values  = {'Bla': 0,
                                 'Bla_broad': 1, 'Bla_broad_inhR': 2,
                                 'Bla_ESBL': 3, 'Bla_ESBL_inhR': 4,
                                 'Bla_Carb': 5}
                if class_values[lahey_bla_class] > class_values[genbank_bla_class]:
                    bla_class = lahey_bla_class
                    description = lahey_description
                else:
                    bla_class = genbank_bla_class
                    description = genbank_description

        print('\t'.join([allele, description, bla_class]))


def get_lahey_descriptions(lahey_filename):
    descriptions = {}
    with open(lahey_filename, 'rt') as lahey_file:
        for line in lahey_file:
            if line.startswith('#'):
                continue
            line_parts = line.strip().split('\t')
            allele = line_parts[0]
            descriptions[allele] = line_parts[7].rsplit(' ', 1)[0]
    return descriptions


def get_genbank_descriptions(genbank_filename):
    descriptions = {}
    for seq_record in SeqIO.parse(sys.argv[2], 'genbank'):
        for feature in seq_record.features:
            if feature.type == 'CDS':
                allele = feature.qualifiers.get('allele', ['none'])[0]
                gene = feature.qualifiers.get('gene', ['none'])[0]
                product = feature.qualifiers.get('product', [''])[0]
                note = feature.qualifiers.get('note', [''])[0]
                function = feature.qualifiers.get('function', [''])[0]

                if allele.lower().startswith('bla') and gene.lower().startswith('bla'):
                    allele = allele[3:]  # remove 'bla'
                    if allele not in descriptions:  # Don't overwrite the Lahey description.

                        # The product likely ends with the allele name. If so, remove it.
                        allele_name_len = len(allele)
                        if allele != 'none' and allele_name_len and \
                                product.lower().endswith(allele.lower()):
                            product = product[:-allele_name_len]
                        product = product.rstrip()
                        if product.lower().endswith(' bla'):
                            product = product[:-4]

                        # The 'product' feature probably has all of the description we need.
                        # However, if there is additional information in the 'note' or 'function'
                        # features, then add those too.
                        description = product
                        if ('carbapenem' in note and 'carbapenem' not in description) or \
                                ('extended' in note and 'extended' not in description) or \
                                ('broad' in note and 'broad' not in description) or \
                                ('inhibitor' in note and 'inhibitor' not in description):
                            description += ' ' + note
                        if ('carbapenem' in function and 'carbapenem' not in description) or \
                                ('extended' in function and 'extended' not in description) or \
                                ('broad' in function and 'broad' not in description) or \
                                ('inhibitor' in function and 'inhibitor' not in description):
                            description += ' ' + function
                        description = description.replace(',', ' ')
                        description = description.replace(';', ' ')
                        description = re.sub('\s+', ' ', description).strip()

                        # Special case to fix a particular long description.
                        if description == 'subclass B1 metallo-beta-lactamase New Delhi ' + \
                                          'metallo-beta-lactamase+ New Delhi MBL carbapenemase':
                            description = 'subclass B1 metallo-beta-lactamase carbapenemase'

                        descriptions[allele] = description
    return descriptions


def bla_class_from_description(description):
    if 'carbapenem-hydrolyzing' in description or \
            'carbapenemase' in description or \
            'confers carbapenem resistance' in description or \
            'hydrolyses beta-lactams including carbapenems' in description:
        return 'Bla_Carb'
    elif 'extended-spectrum' in description or \
            'extended spectrum' in description or \
            'ESBL' in description:
        if 'inhibitor-resistant' in description:
            return 'Bla_ESBL_inhR'
        else:
            return 'Bla_ESBL'
    elif 'broad-spectrum' in description or \
            'broad spectrum' in description:
        if 'inhibitor-resistant' in description:
            return 'Bla_broad_inhR'
        else:
            return 'Bla_broad'
    return 'Bla'


def allele_name_sorting_key(allele_name):
    """
    This function defines a sorting key for allele names. It puts padding zeros in front of
    numbers so 'TEM-2' will come before 'TEM-10', etc.
    """
    new_name_parts = []
    for name_part in allele_name.split('-'):
        try:
            name_part_int = int(name_part)
            new_name_parts.append('%05d' % name_part_int)
        except ValueError:
            new_name_parts.append(name_part)
    return '-'.join(new_name_parts)


if __name__ == '__main__':
    main()