File: fparseJSONtoRESULTS.py

package info (click to toggle)
resfinder 4.4.2-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 3,236 kB
  • sloc: python: 5,726; sh: 121; makefile: 18
file content (414 lines) | stat: -rw-r--r-- 20,662 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
#!/usr/bin/env python3

import json
import sys
import os
import pandas as pd
from tabulate import tabulate
from argparse import ArgumentParser

# JSON class
class WriteResultFile:
    """Recreation of all ResFinder result files from ResFinder json file."""
    def __init__(self, json_file, db_path, outdir, sample, mp):
        self.json_file = json_file
        self.db_point_path = db_path
        self.outdir = outdir
        self.sample = sample
        self.mapping_method = mp
        # Load json data and check format
        self.data = WriteResultFile.open_and_read_json(self)
        WriteResultFile.check_json_std_format(self)
        # Load global variable
        self.species = self.data.get("provided_species")
        self.gene_lst= WriteResultFile.point_gene_list(self)

    def parseJSON(self):
        """
        Iterating through seq_regions and seq_variations classes in json file to extract information for result files.
        All Resfinder result files will be written in this function.
        """

        # Extract databases to list
        self.db_list = WriteResultFile.extract_databases(self)
        # Extract phenotypes to dict
        self.phenotype_dict,self.pheno_dict,self._pheno_species_dict = WriteResultFile.read_phenotypes(self)
        
        # Initiate result files
        self.ResFinder_results_tab = open(self.outdir + "/ResFinder_results_tab.txt", "w")
        self.ResFinder_results_tab.write("Resistance gene\tIdentity\tAlignment Length/Gene Length\tCoverage\t" +
        "Position in reference\tContig or Depth\tPosition in contig\tPhenotype\tAccession no.\tNotes\n")
        self.ResFinder_disinf_results_tab = open(self.outdir + "/ResFinder_disinf_results_tab.txt", "w")
        self.ResFinder_disinf_results_tab.write("Resistance gene\tIdentity\tAlignment Length/Gene Length\tCoverage\t" +
        "Position in reference\tContig or Depth\tPosition in contig\tPhenotype\tAccession no.\tNotes\n")
        self.PointFinder_results_tab = open(self.outdir + "/PointFinder_results.txt", "w")
        self.PointFinder_results_tab.write("Mutation\tIdentity\tNucleotide change\tAmino acid change\tResistance\tPMID\n")
        self.ResFinder_phenotable = open(self.outdir + "/pheno_table.txt", "w")
        self.ResFinder_phenotable.write("# ResFinder phenotype results.\n")
        self.ResFinder_phenotable = open(self.outdir + "/pheno_table.txt", "w")
        self.ResFinder_phenotable.write("# ResFinder phenotype results.\n")
        self.ResFinder_phenotable_species = open(self.outdir + "/pheno_table_" + self.species +".txt", "w")
        self.ResFinder_phenotable_species.write("# ResFinder phenotype results for " + self.species + ".\n")
        
        
        # Iterate through the json seq regions class
        self.res_matrix = []
        self.point_gene_hits = set()
        for j in self.data["seq_regions"].items():

            self.data_row = []
            if j[1].get("ref_database")[0] == self.db_list[0]:     # pointfinder hit
                self.point_gene_hits.add(j[1].get("name"))
            
            elif j[1].get("ref_database")[0] == self.db_list[1] or j[1].get("ref_database")[0] == self.db_list[2]:

                # Add pheno_table data from Resfinder results
                if float(j[1].get("identity")) == 100.0:
                    if j[1].get("alignment_length") == j[1].get("ref_seq_lenght"):
                        for phenotype in j[1].get("phenotypes"):
                            if phenotype in self.pheno_species_dict:
                                self.pheno_species_dict[phenotype][2].append(3)
                                self.pheno_species_dict[phenotype][3].append("{} ({})".format(j[1].get("name"),j[1].get("ref_id")))
                            self.pheno_dict[phenotype][2].append(3)
                            self.pheno_dict[phenotype][3].append("{} ({})".format(j[1].get("name"),j[1].get("ref_id")))
                    elif j[1].get("alignment_length") < j[1].get("ref_seq_lenght"):
                        for phenotype in j[1].get("phenotypes"):
                            if phenotype in self.pheno_species_dict:
                                self.pheno_species_dict[phenotype][2].append(1)
                                self.pheno_species_dict[phenotype][3].append("{} ({})".format(j[1].get("name"),j[1].get("ref_id")))
                            self.pheno_dict[phenotype][2].append(1)
                            self.pheno_dict[phenotype][3].append("{} ({})".format(j[1].get("name"),j[1].get("ref_id")))
                elif float(j[1].get("identity")) < 100.0:
                    if j[1].get("alignment_length") == j[1].get("ref_seq_lenght"):
                        for phenotype in j[1].get("phenotypes"):
                            if phenotype in self.pheno_species_dict:
                                self.pheno_species_dict[phenotype][2].append(2)
                                self.pheno_species_dict[phenotype][3].append("{} ({})".format(j[1].get("name"),j[1].get("ref_id")))
                            self.pheno_dict[phenotype][2].append(2)
                            self.pheno_dict[phenotype][3].append("{} ({})".format(j[1].get("name"),j[1].get("ref_id")))
                    elif j[1].get("alignment_length") < j[1].get("ref_seq_lenght"):
                        for phenotype in j[1].get("phenotypes"):
                            if phenotype in self.pheno_species_dict:
                                self.pheno_species_dict[phenotype][2].append(1)
                                self.pheno_species_dict[phenotype][3].append("{} ({})".format(j[1].get("name"),j[1].get("ref_id")))
                            self.pheno_dict[phenotype][2].append(1)
                            self.pheno_dict[phenotype][3].append("{} ({})".format(j[1].get("name"),j[1].get("ref_id")))
                
                # Extract informations for tab and txt
                self.data_row.append(j[1].get("name"))
                self.data_row.append(j[1].get("identity"))
                self.data_row.append(str(j[1].get("alignment_length")) + ".."+ str(j[1].get("ref_seq_lenght")))
                self.data_row.append(j[1].get("coverage"))
                self.data_row.append(str(j[1].get("ref_start_pos")) + ".."+ str(j[1].get("ref_end_pos")))
                if self.mapping_method == "kma":
                    self.data_row.append(j[1].get("depth"))
                elif self.mapping_method == "blast":
                    self.data_row.append(j[1].get("query_id"))
                self.data_row.append(str(j[1].get("query_start_pos")) + ".."+ str(j[1].get("query_end_pos")))
                phenotypes = set()
                for pheno in j[1].get("phenotypes"):
                    if pheno in self.phenotype_dict:
                        phenotypes.add(str(self.phenotype_dict.get(j[1].get("phenotypes")[0]))[2:-2].capitalize())
                for i in phenotypes:
                    self.data_row.append(i + " resistance")
                self.data_row.append(j[1].get("ref_acc"))
                self.data_row.append(j[1].get("notes")[0])
                
                if j[1].get("ref_database")[0] == self.db_list[1]:  # resfinder hit
                    self.res_matrix += [self.data_row]
                    # Write Resfinder tab result file
                    WriteResultFile.write_tab(self, self.ResFinder_results_tab)
                elif j[1].get("ref_database")[0] == self.db_list[2]: # disinfectant hit
                    # Write to Disinfectant tab result file
                    WriteResultFile.write_tab(self, self.ResFinder_disinf_results_tab )
                
            else:
                print("Error: unknown reference database has excluded seq region:" + j[0])
        self.ResFinder_results_tab.close()

        # Write pheno_table files
        WriteResultFile.write_pheno_table(self, self.pheno_dict, self.ResFinder_phenotable)
        WriteResultFile.write_pheno_table(self, self.pheno_species_dict, self.ResFinder_phenotable_species)

        # Write Resfinder result txt
        WriteResultFile.write_txt(self)

        # Iterate through the json sequence variations class
        self.point_matrix = []
        self.gene_set = {}
        for j in self.data["seq_variations"].items():

            # Add genes with pointfinder hits
            self.gene_set.add(j[0].split("_",1)[0])

            # Store data if any phenotype
            self.data_row = []
            if j[1].get("phenotypes"):
                self.data_row.append(j[1].get("genes")[0] + j[1].get("seq_var"))
                # RNA mutations
                if j[0].split("_",1)[0] in ("16S","23S"):
                    self.data_row.append(j[1].get("ref_codon").upper() + "->"+ j[1].get("var_codon").upper())
                    self.data_row.append("RNA mutations")
                else:
                    # DNA mutations
                    self.data_row.append(j[1].get("codon_change").upper())
                    self.data_row.append(j[1].get("ref_aa").upper() + "->"+ j[1].get("var_aa").upper())
                self.data_row.append(j[1].get("phenotype").capitalize())
                self.data_row.append(j[1].get("pmids"))
                self.point_matrix += [self.data_row]
                
                # Write PointFinder tab result file
                WriteResultFile.write_tab(self, self.PointFinder_results_tab)
        self.PointFinder_results_tab.close()
        
        # Write PointFinder_table.txt result file
        WriteResultFile.write_point_table(self)

        
    def open_and_read_json(self):
        """Open and load data from json file"""
        self.f = open(self.json_file, 'r')
        data = json.load(self.f)
        self.f.close()
        return data

    def check_json_std_format(self):
        """Checks if all classes in the std ResFinder json format is contained in the json infile."""
        json_file_keys = set(self.data.keys())
        json_std_format_keys = {"type", "databases", "seq_regions", "seq_variations",
        "phenotypes", "software_name", "software_version", "software_commit", "run_date",
        "key", "provided_species" , "result_summary"}
        if not json_file_keys.union(json_std_format_keys):
            sys.exit("Error: json file does not follow the std format.")

    def extract_databases(self):
        """Store reference databases from databases class. Returns a list of database names"""
        self.db_list = []
        for db in self.data.get("databases").keys():
            if db.startswith("P"):      # PointFinder DB
                self.db_list.insert(0,db)
            elif db.startswith("R"):    # ResFinder DB
                self.db_list.insert(1,db)
            elif db.startswith("D"):    # DisinFinder DB
                self.db_list.insert(2,db)
            else:
                return print("Error: Unknown database in json file: " + db)
        return self.db_list

    def read_phenotypes(self):
        """
        Store phenotypes predicted in phneotype class in dicts
        Returns 3 dicts:
            phenotype_dict for Pointfinder and Resfinder hits
            pheno_dict for pheno table overview
            self.pheno_species_dict for species specific pheno table overview
        """
        self.phenotype_dict = {}
        self.pheno_dict = {}
        self.pheno_species_dict = {}
        for j in self.data["phenotypes"].items():

            if j[1].get("amr_resistant"):
                # store data for pheno_table
                if j[1].get("amr_species_relevant"):
                    self.pheno_species_dict[j[1].get("amr_resistance")] = [j[1].get("amr_classes")[0],"Resistant",[],[]]
                    self.pheno_dict[j[1].get("amr_resistance")] = [j[1].get("amr_classes")[0],"Resistant",[],[]]
                else:
                    self.pheno_dict[j[1].get("amr_resistance")] = [j[1].get("amr_classes")[0],"Resistant",[],[]]
                
                if j[1].get("ref_database") == self.db_list[0]:
                    # store phenotype for pointfinder results
                    for k in range(len(j[1].get("seq_variations"))):
                        self.phenotype_dict[j[1].get("seq_variations")[k]] = j[1].get("amr_resistance")
                else:
                    # store phenotype
                    self.phenotype_dict[j[1].get("amr_resistance")] = j[1].get("amr_classes")
            else:
                # store data for pheno_table
                if j[1].get("amr_species_relevant"):
                    self.pheno_species_dict[j[1].get("amr_resistance")] = [j[1].get("amr_classes")[0],"No resistance",[0],[]]
                    self.pheno_dict[j[1].get("amr_resistance")] = [j[1].get("amr_classes")[0],"No resistance",[0],[]]
                else:
                    self.pheno_dict[j[1].get("amr_resistance")] = [j[1].get("amr_classes")[0],"No resistance",[0],[]]

        return self.phenotype_dict, self.pheno_dict, self.pheno_species_dict
    
    def group_matrix(self):
        """Group Pointfinder hits by genes"""
        self.res = {i: [] for i in self.gene_lst}
        for row in self.point_matrix:
            self.res[row[0].split()[0]].append(row)
        return self.res
     
    def write_point_table(self):
        self.PointFinder_table = open(self.outdir + "/PointFinder_table.txt", "w")
        self.PointFinder_table.write("Chromosomal point mutations - Results\nSpecies: {}\nGenes: {}" + 
        "\nMapping methode: {}\n\n\nKnown Mutations\n\n".format(self.species,", ".join(sorted(self.gene_lst)),self.mapping_method))
        res = WriteResultFile.group_matrix(self)
        for gene in res.keys():
            self.PointFinder_table.write(gene + "\n")
            if self.res.get(gene):
                self.PointFinder_table.write("Mutation\tIdentity\tNucleotide change\tAmino acid change\tResistance\tPMID\n")
                WriteResultFile.write_tab(self, self.PointFinder_table)
                self.PointFinder_table.write("\n")
            elif gene in self.point_gene_hits and gene not in self.gene_set:
                self.PointFinder_table.write("No known mutations found in {}\n\n".format(gene))
            else:
                self.PointFinder_table.write("No hit found\n\n")
        self.PointFinder_table.close()

    def write_pheno_table(self, dict, outfile):
        """Write pheno table files from dict."""
        outfile.write("#\n# " + self.sample + "\n" +
        "#The phenotype 'No resistance' should be interpreted with\n" +
        "#caution, as it only means that nothing in the used\n" +
        "# database indicate resistance, but resistance could exist\n" +
        "# from 'unknown' or not yet implemented sources.\n#\n" +
        "# The 'Match' column stores one of the integers 0, 1, 2, 3.\n" +
        "#      0: No match found\n" +
        "#      1: Match < 100% ID AND match length < ref length or Match = 100% ID AND match length < ref length\n" +
        "#      2: Match < 100% ID AND match length = ref length\n" +
        "#      3: Match = 100% ID AND match length = ref length\n" +
        "# If several hits causing the same resistance are found,\n" +
        "# the highest number will be stored in the 'Match' column.\n\n" +
        "# Antimicrobial\tClass\tWGS-predicted phenotype\tMatch\tGenetic background\n")
        for phenotype in dict.keys():
            values = dict.get(phenotype)
            outfile.write("{}\t{}\t{}\t{}\t{}\n".format(phenotype,values[0],values[1],max(values[2]),(", ".join(sorted(values[3])))))
        outfile.close()

    def write_tab(self, outfile):
        """Write a list into a tabseperated file."""
        outfile.write("\t".join(map(str,self.data_row)) + "\n")

    def write_txt(self):
        """Write ResFinder_results.txt from a matrix A=[[]] grouped by phneotypes."""
        df = pd.DataFrame(self.res_matrix, columns=["Resistance gene","Identity",
        "Alignment Length/Gene Length","Coverage","Position in reference",
        "Contig or Depth","Position in contig","Phenotype","Accession no.","Notes"])
        # group by phenotype
        grouped_df = df.sort_values(by=['Phenotype']).groupby(['Phenotype'])

        txt_str = ""
        for key, item in grouped_df:
            title = key
            headers = list(grouped_df.get_group(title).columns)
            rows = list(grouped_df.get_group(title).values)
            table = WriteResultFile.text_table(title, headers, rows)
            txt_str += table

        # print entire table
        with open(self.outdir + "/ResFinder_results.txt", "w") as f:
            f.write(txt_str)

    def text_table(title, headers, rows, table_format='psql'):
        ''' Create text table

        USAGE:
            >>> from tabulate import tabulate
            >>> title = 'My Title'
            >>> headers = ['A','B']
            >>> rows = [[1,2],[3,4]]
            >>> print(text_table(title, headers, rows))
            +-----------+
            | My Title  |
            +-----+-----+
            |    A |    B |
            +=====+=====+
            |    1 |    2 |
            |    3 |    4 |
            +-----+-----+
        '''
        # Create table
        table = tabulate(rows, headers, tablefmt=table_format)
        # Prepare title injection
        width = len(table.split('\n')[0])
        tlen = len(title)
        if tlen + 4 > width:
            # Truncate oversized titles
            tlen = width - 4
            title = title[:tlen]
        spaces = width - 2 - tlen
        left_spacer = ' ' * int(spaces / 2)
        right_spacer = ' ' * (spaces - len(left_spacer))
        # Update table with title
        table = '\n'.join(['+%s+' % ('-' * (width - 2)),
                          '|%s%s%s|' % (left_spacer,
                          title, right_spacer),
                          table, '\n'])
        return table
    
    def point_gene_list(self):
        """Extract genes from resistance overview in Pointfinder 
        for relevant species. A list with genes is returned"""
        gene_set = set()
        file = os.path.join(self.db_point_path, self.species, "resistens-overview.txt")
        with open(file) as f:
            for line in f:
                if line.startswith("#"):
                    continue
                gene = line.split()[0].split("_", maxsplit=1)[0]
                gene_set.add(gene)
        self.gene_lst = list(gene_set)
        return self.gene_lst


##########################################################################
# PARSE COMMAND LINE OPTIONS
##########################################################################

parser = ArgumentParser()
parser.add_argument("-i", "--infile",
                    help="JSON file from ResFinder",
                    required=True)
parser.add_argument("-o", "--outdir",
                    help="Path to output directory",
                    default=".")
parser.add_argument("-db_point", "--db_path_point",
                    help=("Path to the databases for PointFinder."),
                    required=True, default=None)
parser.add_argument("-mp", "--mapping_method",
                    help="Mapping method used for ResFinder run, either KMA or BLAST",
                    required=True)
parser.add_argument("-s", "--sample_name",
                    help="Name of sample file ran with ResFinder",
                    required=True)

args = parser.parse_args()

# Load and check input file(s)
if args.infile is None:
    sys.exit("Input Error: No input file provided!\n")
infile = args.infile
if not os.path.exists(infile):
    sys.exit("Input Error: Input file not found at expected location: %s"%(infile))

# Check if valid output directory is provided
if args.outdir:
    outdir = os.path.abspath(args.outdir)
    if not os.path.exists(outdir):
        os.makedirs(outdir)
else:
    outdir = os.getcwd()

# Check if valid database is provided
if args.db_path_point:
    db_path = os.path.abspath(args.db_path_point)
    if not os.path.exists(args.db_path_point):
       sys.exit("Input Error: The specified database directory does not "
                "exist:" + db_path + "\n") 
# Check existence of the resistens-overview file in a species
res_overview_file = "%s/campylobacter/resistens-overview.txt" % (db_path)
if not os.path.isfile(res_overview_file):
    sys.exit("Resistance-overview.txt not found at expected location: %s"%(res_overview_file))


# Defining varibales
method_path = args.mapping_method
sample = args.sample_name


# Call JSON class to recreate ResFinder result files
results = WriteResultFile(infile, db_path, outdir, sample, method_path)
results.check_json_std_format()
results.parseJSON()