File: curate_alignment.smk

package info (click to toggle)
python-pangolearn 2022-07-09%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, forky, sid, trixie
  • size: 184,720 kB
  • sloc: python: 801; sh: 77; makefile: 16
file content (318 lines) | stat: -rw-r--r-- 11,991 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
import os
import sys
sys.path.insert(0, '/localdisk/home/s1680070/repositories/pangoLEARN')

from Bio import SeqIO

import collections
import hashlib
import collections
import csv
from pangoLEARN.training.get_lineage_positions import get_relevant_positions
from pangoLEARN.training.utils import *

from Bio import SeqIO
from datetime import date
today = date.today()
csv.field_size_limit(sys.maxsize)

repo_path = config["repo_path"].rstrip("/")
pangoLEARN_path = os.path.join(repo_path, "pangoLEARN")
pango_designation_path = os.path.join(repo_path, "pango-designation")

# config["pango_version"] = get_pango_version(pango_designation_path)

data_date = config["data_date"]
config["trim_start"] = 265
config["trim_end"] = 29674
config["lineages_csv"]=f"{pango_designation_path}/lineages.csv"
config["reference"] = f"{pangoLEARN_path}/pangoLEARN/training/reference.fasta"
config["genbank_ref"] = f"{pangoLEARN_path}/pangoLEARN/training/WH04.gb"
config["datadir"]= f"/localdisk/home/shared/raccoon-dog/{data_date}_gisaid/publish/gisaid"

rule all:
    input:
        os.path.join(config["outdir"],"alignment.filtered.fasta"),
        # os.path.join(config["outdir"],"lineage_recall_report.txt"),
        os.path.join(config["outdir"],"pangolearn.init.py"),
        os.path.join(config["outdir"],"pangolin_data.init.py"),
        os.path.join(config["outdir"],"training_summary.rf.txt"),
        os.path.join(config["outdir"],"decision_tree_rules.txt"),
        os.path.join(config["outdir"],"lineage.hash.csv")

rule make_plearn_init:
    output:
        init = os.path.join(config["outdir"],"pangolearn.init.py")
    run:
        pangolearn_new_v = config["pangolearn_version"]
        pango_version = config["pango_version"]
        with open(output.init,"w") as fw:
            fw.write(f'''_program = "pangoLEARN"
__version__ = "{pangolearn_new_v}"
PANGO_VERSION = "{pango_version}"

__all__ = ["training"]

from pangoLEARN import *
''')


rule make_pdata_init:
    output:
        init = os.path.join(config["outdir"],"pangolin_data.init.py")
    run:
        pango_version = config["pango_version"]
        with open(output.init,"w") as fw:
            fw.write(f'''_program = "pangolin_data"
__version__ = "{pango_version}"

''')

rule filter_alignment:
    input:
        csv = config["lineages_csv"],
        fasta = os.path.join(config["datadir"],f"gisaid_{data_date}_all_alignment.fa"),
        full_csv = os.path.join(config["datadir"],f"gisaid_{data_date}_all_metadata.csv")
    output:
        fasta = os.path.join(config["outdir"],"alignment.filtered.fasta"),
        csv = os.path.join(config["outdir"],"lineages.metadata.filtered.csv"),
        csv_all = os.path.join(config["outdir"],"lineages.designated.csv")
    run:
        csv_len = 0
        seqs_len = 0
        lineages = {}
        all_lineages = {}
        all_len = 0
        with open(input.csv,"r") as f:
            for l in f:
                l = l.rstrip("\n")
                name,lineage = l.split(",")
                
                lineages[name]=lineage
                csv_len +=1
                all_lineages[name]=lineage
                all_len +=1
        with open(output.csv,"w") as fw:
            with open(output.csv_all,"w") as fw2:
                with open(input.full_csv,"r") as f:
                
                    reader = csv.DictReader(f)
                    header = reader.fieldnames

                    writer = csv.DictWriter(fw, fieldnames=header, lineterminator="\n")
                    writer.writeheader()

                    writer2 = csv.DictWriter(fw2, fieldnames=header, lineterminator="\n")
                    writer2.writeheader()

                    for row in reader:
                        name = row["sequence_name"].replace("SouthAfrica","South_Africa")
                        if name in lineages:
                            new_row = row
                            new_row["lineage"] = lineages[name]
                            writer.writerow(new_row)
                        if name in all_lineages:
                            all_row = row
                            all_row["lineage"] = all_lineages[name]
                            writer2.writerow(all_row)
        written = {}
        with open(output.fasta,"w") as fw:
            for record in SeqIO.parse(input.fasta, "fasta"):
                record.id = record.id.replace("SouthAfrica","South_Africa")
                if record.id in lineages and not record.id in written:
                    fw.write(f">{record.id}\n{record.seq}\n")

                    written[record.id]=1
                    seqs_len +=1
        
        print("Number of sequences in gisaid designated", all_len)
        print("Number of sequences going into pangolearn training",csv_len)
        print("Number of sequences found on gisaid", seqs_len)
        

rule align_with_minimap2:
    input:
        fasta = os.path.join(config["outdir"],"alignment.filtered.fasta"),
        reference = config["reference"]
    output:
        sam = os.path.join(config["outdir"],"alignment.sam")
    shell:
        """
        minimap2 -a -x asm5 -t {workflow.cores} \
        {input.reference:q} \
        {input.fasta:q} > {output.sam:q}
        """

rule get_variants:
    input:
        sam = os.path.join(config["outdir"],"alignment.sam")
    output:
        csv = os.path.join(config["outdir"],"variants.csv") #gisaid.mutations.csv
    shell:
        """
        gofasta sam variants -t {workflow.cores} \
        --samfile {input.sam:q} \
        --reference {config[reference]} \
        --genbank {config[genbank_ref]} \
        --outfile {output.csv}
        """

rule add_lineage:
    input:
        csv = os.path.join(config["outdir"],"variants.csv"), #gisaid.mutations.csv
        lineages = os.path.join(config["outdir"],"lineages.designated.csv")
    output:
        csv = os.path.join(config["outdir"],"variants.lineages.csv")
    run:
        lineages_dict = {}
        with open(input.lineages,"r") as f:
            reader= csv.DictReader(f)
            for row in reader:
                lineages_dict[row["sequence_name"]] = row["lineage"]
        with open(output.csv, "w") as fw:
            fw.write("sequence_name,nucleotide_variants,lineage,why_excluded\n")
            with open(input.csv,"r") as f:
                for l in f:
                    l = l.strip("\n")
                    name,variants = l.split(",")
                    if name =="query":
                        pass
                    elif name in lineages_dict:
                        fw.write(f"{name},{variants},{lineages_dict[name]},\n")
                        
rule filter_metadata:
    input:
        csv = os.path.join(config["outdir"],"variants.lineages.csv"),
        fasta = os.path.join(config["outdir"],"alignment.filtered.fasta")
    output:
        csv = os.path.join(config["outdir"],"metadata.final.csv")
    run:
        in_list = {}
        for record in SeqIO.parse(input.fasta,"fasta"):
            in_list[record.id] = 1

        with open(output.csv, "w") as fw:
            fw.write("sequence_name,lineage\n")
            with open(input.csv, "r") as f:
                reader = csv.DictReader(f)
                for row in reader:
                    if row["sequence_name"] in in_list:
                        name = row["sequence_name"]
                        lineage = row["lineage"]
                        fw.write(f"{name},{lineage}\n")

rule get_relevant_postions:
    input:
        fasta = os.path.join(config["outdir"],"alignment.filtered.fasta"),
        csv = os.path.join(config["outdir"],"metadata.final.csv"),
        reference = config["reference"]
    output:
        relevant_pos_obj = os.path.join(config["outdir"],"relevantPositions.pickle"),
    run:
        get_relevant_positions(input.csv,input.fasta,input.reference,output.relevant_pos_obj)

rule run_rf_training:
    input:
        fasta = os.path.join(config["outdir"],"alignment.filtered.fasta"),
        csv = os.path.join(config["outdir"],"metadata.final.csv"),
        reference = config["reference"],
        relevant_pos_obj = rules.get_relevant_postions.output.relevant_pos_obj
    params:
        path_to_script = pangoLEARN_path
    output:
        headers = os.path.join(config["outdir"],"randomForestHeaders_v1.joblib"),
        model = os.path.join(config["outdir"],"randomForest_v1.joblib"),
        txt = os.path.join(config["outdir"],"training_summary.rf.txt")
    shell:
        """
        python {params.path_to_script}/pangoLEARN/training/pangoLEARNRandomForest_v1.py \
        {input.csv:q} \
        {input.fasta} \
        {input.reference:q} \
        {config[outdir]} \
        {input.relevant_pos_obj} \
        > {output.txt:q}
        """

rule run_dt_training:
    input:
        fasta = os.path.join(config["outdir"],"alignment.filtered.fasta"),
        csv = os.path.join(config["outdir"],"metadata.final.csv"),
        reference = config["reference"],
        relevant_pos_obj = rules.get_relevant_postions.output.relevant_pos_obj
    params:
        path_to_script = pangoLEARN_path
    output:
        headers = os.path.join(config["outdir"],"decisionTreeHeaders_v1.joblib"),
        model = os.path.join(config["outdir"],"decisionTree_v1.joblib"),
        txt = os.path.join(config["outdir"],"training_summary.dt.txt")
    shell:
        """
        python {params.path_to_script}/pangoLEARN/training/pangoLEARNDecisionTree_v1.py \
        {input.csv:q} \
        {input.fasta} \
        {input.reference:q} \
        {config[outdir]} \
        {input.relevant_pos_obj} \
        > {output.txt:q}
        """

rule get_decisions:
    input:
        headers = os.path.join(config["outdir"],"decisionTreeHeaders_v1.joblib"),
        model = os.path.join(config["outdir"],"decisionTree_v1.joblib"),
        txt = rules.run_dt_training.output.txt
    params:
        path_to_script = pangoLEARN_path
    output:
        txt = os.path.join(config["outdir"],"decision_tree_rules.txt"),
        zipped = os.path.join(config["outdir"],"decision_tree_rules.zip")
    shell:
        """
        python {params.path_to_script}/pangoLEARN/training/getDecisionTreeRules.py \
        {input.model:q} {input.headers:q} {input.txt:q} \
        > {output.txt:q} && zip {output.zipped:q} {output.txt:q}
        """

# rule get_recall:
#     input:
#         txt = rules.run_rf_training.output.txt
#     params:
#         path_to_script = pangoLEARN_path
#     output:
#         txt = os.path.join(config["outdir"],"lineage_recall_report.txt")
#     shell:
#         """
#         python {params.path_to_script}/pangoLEARN/training/processOutputFile.py {input.txt} > {output.txt}
#         """

rule create_hash:
    input:
        fasta = os.path.join(config["outdir"],"alignment.filtered.fasta"),
        lin_designation = os.path.join(config["outdir"],"lineages.designated.csv")
    output:
        csv = os.path.join(config["outdir"],"lineage.hash.csv"),
        fasta = os.path.join(config["outdir"],"lineage.hash.fasta"),
        hashed_designations = os.path.join(config["outdir"],"designations.hash.csv")
    run:
        designated = get_dict(input.lin_designation,"sequence_name","lineage")

        hash_map,seq_hash_dict = add_to_hash(input.fasta)

        with open(output.csv,"w") as fw:
            fw.write("seq_hash,lineage\n")
            for seq_hash in hash_map:
                seq_name = hash_map[seq_hash]
                set_name = designated[seq_name]
                fw.write(f"{seq_hash},{set_name}\n")
        
        num_seqs = 0
        with open(output.hashed_designations, "w") as fw2:
            fw2.write("taxon,lineage\n")
            with open(output.fasta, "w") as fw:
                for seq in seq_hash_dict:
                    num_seqs +=1
                    fw.write(f">{seq_hash_dict[seq]}\n{seq}\n")
                    fw2.write(f"{seq_hash_dict[seq]},{designated[seq_hash_dict[seq]]}\n")

        print("Number of seqs going into training: ",f"{num_seqs}")