# File: classification.snakefile (qcumber 2.3.0)
#############################
#    READ CLASSIFICATION    #
#############################


# Colormap intended for the classification plots.
# NOTE(review): matplotlib.cm.get_cmap is deprecated since matplotlib 3.7
# (removed in 3.9; use matplotlib.colormaps["Set3"] instead) — confirm the
# pinned matplotlib version. `cmap` is not referenced anywhere in this file
# chunk; presumably used by an included snakefile — verify.
cmap = matplotlib.cm.get_cmap('Set3')

def get_kraken_result(filename, outputfile):
    level = "p"
    kraken_counts = read_csv(str(filename), sep="\t", header=None)

    if kraken_counts[0][0]==0 and kraken_counts[1][0]=="unclassified":
        with open(outputfile,"w") as empty_img:
            pass
        return None

    kraken_counts.columns = ["count", "root", "d", "p", "c", "o", "f", "g", "s"]

    nclassified = int(sum(kraken_counts[1:]["count"]))
    pclassified =  round(100* kraken_counts[1:]["count"].sum() / kraken_counts["count"].sum(), 2)

    kraken_counts["perc"] = (100*kraken_counts["count"] / sum(kraken_counts["count"])).round(4)
    new_index = kraken_counts[level]
    for i in new_index[new_index != new_index].index:
        new_index.loc[i] = [x for x in kraken_counts.ix[i, ["root", "d", "p", "c", "o", "f", "g", "s"]] if x == x][-1]

    kraken_counts.index = list(new_index)
    kraken_counts.index.name = "Name"
    kraken_counts = kraken_counts.reset_index().groupby("Name").sum()
    kraken_counts = kraken_counts[kraken_counts["perc"]>0 ]
    kraken_counts["perc"].plot.bar(stacked = True,edgecolor='black', alpha=0.9)
    legend = plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    plt.savefig(outputfile, bbox_extra_artists=(legend,),bbox_inches="tight")
    return OrderedDict({ "kraken_img":outputfile,"#Classified": nclassified, "%Classified": pclassified , "kraken_results":kraken_counts[["perc","count"]].to_latex() })


#--------------------------------------------< RULES >-----------------------------------------------------------------#


# Run kraken over all reads of a sample and write the raw per-read report.
# Empty (gzipped) inputs get an empty report plus an explanatory log line.
rule kraken_classification:
    input:
        db = config["kraken_db"],
        fastq_files = get_all_reads
    output:
        # NOTE(review): the kraken log path is declared both as an output and
        # in the `log:` directive below — newer snakemake versions reject a
        # file that appears in both; confirm against the targeted version.
        report = temp(classification_path + "/{sample}.krakenreport"),
        log = log_path +  "/{sample}.kraken.log"
    log:
        log_path + "/{sample}.kraken.log"
    params:
        preload ="--preload",
        classified = classification_path + "/{sample}.classified_reads.fastq",
        unclassified = classification_path + "/{sample}.unclassified_reads.fastq"
        #type = "--fastq-input"
    threads:
        max_threads
    run:
        #fastq_files = [x for x in list(input.fastq_files) if getsize(x) != 0]
        # Partition inputs: empty files produce a touched report and a log
        # message; non-empty files are collected for classification.
        fastq_files = []
        for fastq in list(input.fastq_files):
            with gzip.open(fastq, "r") as fastq_file:
                # reads the whole file just to test for emptiness — cheap for
                # empty files, wasteful for large ones
                if len(fastq_file.readlines()) ==0:
                    shell("touch {output.report}")
                    shell("echo 'Found empty input.' > {log} ")
                else:
                    fastq_files.append(fastq)
        if len(fastq_files)!=0:
            # NOTE(review): the commands interpolate {input.fastq_files} (ALL
            # inputs), not the filtered fastq_files list built above — confirm
            # whether kraken should really receive the empty files too.
            if (config["kraken_classified_out"]):
                shell("kraken {params.preload} --threads {threads} --db {input.db} {input.fastq_files} --output {output.report} --classified-out {params.classified} --unclassified-out {params.unclassified}  2> {log}")
            else:
                shell("kraken {params.preload} --threads {threads} --db {input.db} {input.fastq_files} --output {output.report}  2> {log}")
# Translate the raw kraken report into MPA-style taxonomy strings.
# An empty report (sample had no reads) just yields an empty output file,
# which rule kraken_csv detects via getsize() == 0.
rule kraken_translate:
    input:
        db = config["kraken_db"],
        raw_report = classification_path + "/{sample}.krakenreport"
    output:
        classification_path + "/{sample}.translated"
    shell:
        "if [ -s {input.raw_report} ]; then kraken-translate {input.raw_report} --db {input.db} --mpa-format  > {output}; else touch {output} ; fi"

# Convert the MPA-translated kraken output into a fixed-column taxonomy count
# table (count, root, d, p, c, o, f, g, s) consumed by the plotting rules.
rule kraken_csv:
    input:
        translated = classification_path + "/{sample}.translated",
        kraken_log = log_path + "/{sample}.kraken.log"
    output:
        classification_path + "/{sample}.csv"
    run:
        # Empty translation -> single placeholder row so downstream parsing
        # (get_kraken_result's empty-input check) still works.
        if getsize(str(input.translated)) == 0:
            with open(str(output), "w") as out:
                out.write('0\tunclassified\t\t\t\t\t\t\t')
        else:
            # NOTE(review): file handle is never closed — use a with-block.
            kraken_log = open(str((input.kraken_log)),"r")
            # Pull the unclassified-read count out of kraken's stderr log.
            for line in kraken_log.readlines():
                # NOTE(review): the dot in "\d+.\d+" is unescaped (matches any
                # character), and `unclassified` remains unbound (NameError
                # below) if no log line matches — confirm the log format.
                pattern = re.match("\s+(?P<n_unclassified>\d+) sequences unclassified \((?P<p_unclassified>\d+.\d+)%\)", line)
                if pattern:
                        unclassified = pattern.group("n_unclassified")
            # Reads per unique taxonomy string (column 1 of the translation).
            report = read_csv(str(input.translated), header=None, sep="\t")[1].value_counts()
            kraken_counts = DataFrame()
            # NOTE(review): DataFrame.append was removed in pandas 2.0 — this
            # run-block requires pandas < 2 (or a rewrite using concat).
            kraken_counts = kraken_counts.append(
                Series({"count": unclassified, "root": "unclassified"}, name="unclassified"))
            for x in report.index:
                # Split "d__Bacteria|p__Firmicutes|..." into rank -> name pairs.
                temp = dict([x.split("__") for x in x.split("|") if x != "root"])
                temp["count"] = report[x]
                if x == "root":
                    temp["root"] = "root"
                # Pad ranks missing from this taxonomy string with "".
                for missing_col in (set(["count", "root", "d", "p", "c", "o", "f", "g", "s"]) - set(temp.keys()) ):
                    temp[missing_col]=""
                kraken_counts = kraken_counts.append(Series(temp, name=x))
            kraken_counts[["count", "root", "d", "p", "c", "o", "f", "g", "s"]].to_csv(str(output), sep="\t", index=False, header=False)

# Merge all per-sample kraken CSVs into one batch summary table and a stacked
# bar chart of the most abundant taxa per sample.
rule kraken_batch_plot:
    input:
        expand(classification_path + "/{sample}.csv", sample= unique_samples.keys())
    params:
        level = "s"  # aggregate at species level
    output:
        csv = classification_path + "/kraken_batch_result.csv",
        png = classification_path + "/kraken_batch.png"
    run:
        kraken_summary = DataFrame()
        for sample in sorted(list(input)):
            kraken_counts = read_csv(str(sample), sep="\t", header=None)
            kraken_counts.columns = ["count","root", "d", "p","c","o","f","g","s"]
            kraken_counts["perc"] = (kraken_counts["count"] / sum(kraken_counts["count"])).round(4) * 100
            # sort kraken_counts by perc and take the first 10 most abundant entries (possible to pass number pre config/parameter)
            kraken_counts.sort_values("perc", ascending=False, inplace=True)
            # NOTE(review): head(10) cannot raise, so this try/except is dead code.
            try:
                kraken_filtered = kraken_counts.head(10)
            except:
                kraken_filtered = kraken_counts
            # Name rows by the chosen rank; where it is NaN fall back to the
            # deepest non-NaN rank (x != x is the NaN test).
            new_index = kraken_filtered[params.level]
            for i in new_index[new_index !=new_index].index:
                # NOTE(review): DataFrame.ix was removed in pandas 1.0 — this
                # requires an old pandas pin, or .loc instead.
                new_index.loc[i] = [x for x in kraken_filtered.ix[i,["root", "d", "p","c","o","f","g","s"]] if x==x][-1]
            kraken_filtered = kraken_filtered["perc"]
            kraken_filtered.index = list(new_index)
            kraken_filtered.index.name = "Name"
            kraken_filtered = kraken_filtered.reset_index().groupby("Name").sum()
            # Everything outside the top 10 is lumped into "other".
            kraken_filtered = kraken_filtered["perc"].append(Series({"other": (100 - sum(kraken_filtered["perc"]))}))
            # Column label = sample name extracted from the csv path.
            kraken_filtered.name = re.search(classification_path + "/(?P<sample>.*).csv", str(sample)).group("sample")
            # NOTE(review): if dropping "unclassified" raises, the bare except
            # also skips dropping "root" — the two drops should be independent.
            try:
                kraken_filtered.drop("unclassified", inplace = True)
                kraken_filtered.drop("root", inplace = True)
            except:
                pass #print("No column unclassified")
            kraken_summary = concat([kraken_summary, kraken_filtered], axis=1)
            # Sort again (with name=column with perc entries) such that written table for kraken batch is sorted by perc
            kraken_summary.sort_values(kraken_filtered.name, ascending=False, inplace=True)
        kraken_summary.to_csv(str(output.csv))

        # Samples become bars (transpose), taxa become the stacked segments.
        kraken_summary.T.plot.bar(stacked = True,edgecolor='black', title = "Classified reads by Kraken [%]", alpha=0.9)
        legend = plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
        plt.savefig(str(output.png), bbox_extra_artists=(legend,), bbox_inches='tight')