1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152
|
#############################
# READ CLASSIFICATION #
#############################
# Qualitative colormap for the classification bar plots.
# NOTE(review): matplotlib.cm.get_cmap is deprecated and removed in matplotlib >= 3.9;
# switch to matplotlib.colormaps['Set3'] when upgrading — confirm installed version.
cmap = matplotlib.cm.get_cmap('Set3')
def get_kraken_result(filename, outputfile, level="p"):
    """Aggregate a per-sample kraken count table at one taxonomic rank and plot it.

    Parameters
    ----------
    filename : str
        Tab-separated kraken csv with columns
        (count, root, d, p, c, o, f, g, s); row 0 is the unclassified bin.
    outputfile : str
        Path of the bar-plot image to write.
    level : str, optional
        Taxonomic rank column to aggregate on (default "p" = phylum).
        Previously hard-coded; the default preserves the old behavior.

    Returns
    -------
    OrderedDict with the image path, number/percentage of classified reads and
    a LaTeX table of per-taxon percentages, or None if nothing was classified
    (in which case an empty placeholder file is written to ``outputfile``).
    """
    kraken_counts = read_csv(str(filename), sep="\t", header=None)
    # Nothing classified at all: emit an empty placeholder image and bail out.
    if kraken_counts[0][0] == 0 and kraken_counts[1][0] == "unclassified":
        with open(outputfile, "w"):
            pass
        return None
    kraken_counts.columns = ["count", "root", "d", "p", "c", "o", "f", "g", "s"]
    # Row 0 is the unclassified bin, so rows 1.. hold the classified reads.
    nclassified = int(kraken_counts[1:]["count"].sum())
    pclassified = round(100 * kraken_counts[1:]["count"].sum() / kraken_counts["count"].sum(), 2)
    kraken_counts["perc"] = (100 * kraken_counts["count"] / kraken_counts["count"].sum()).round(4)
    # Entries with no name at `level` (NaN) fall back to the deepest rank that is set.
    # .copy() avoids mutating the underlying column; .loc replaces the removed .ix.
    new_index = kraken_counts[level].copy()
    for i in new_index[new_index != new_index].index:  # NaN != NaN selects missing labels
        new_index.loc[i] = [x for x in kraken_counts.loc[i, ["root", "d", "p", "c", "o", "f", "g", "s"]] if x == x][-1]
    kraken_counts.index = list(new_index)
    kraken_counts.index.name = "Name"
    kraken_counts = kraken_counts.reset_index().groupby("Name").sum()
    kraken_counts = kraken_counts[kraken_counts["perc"] > 0]
    kraken_counts["perc"].plot.bar(stacked=True, edgecolor='black', alpha=0.9)
    legend = plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    plt.savefig(outputfile, bbox_extra_artists=(legend,), bbox_inches="tight")
    return OrderedDict({"kraken_img": outputfile,
                        "#Classified": nclassified,
                        "%Classified": pclassified,
                        "kraken_results": kraken_counts[["perc", "count"]].to_latex()})
#--------------------------------------------< RULES >-----------------------------------------------------------------#
rule kraken_classification:
    """Classify a sample's reads with kraken, skipping empty fastq inputs."""
    input:
        db = config["kraken_db"],
        fastq_files = get_all_reads
    output:
        report = temp(classification_path + "/{sample}.krakenreport"),
        log = log_path + "/{sample}.kraken.log"
    log:
        log_path + "/{sample}.kraken.log"
    params:
        preload = "--preload",
        classified = classification_path + "/{sample}.classified_reads.fastq",
        unclassified = classification_path + "/{sample}.unclassified_reads.fastq"
        #type = "--fastq-input"
    threads:
        max_threads
    run:
        # Kraken fails on empty fastq files, so keep only the non-empty ones.
        fastq_files = []
        for fastq in list(input.fastq_files):
            with gzip.open(fastq, "r") as fastq_file:
                if len(fastq_file.readlines()) == 0:
                    # Keep the DAG satisfied: produce the (empty) report and log anyway.
                    shell("touch {output.report}")
                    shell("echo 'Found empty input.' > {log} ")
                else:
                    fastq_files.append(fastq)
        if len(fastq_files) != 0:
            # BUG FIX: run kraken on the filtered list {fastq_files} (space-joined by
            # snakemake's shell formatter), not the unfiltered {input.fastq_files} —
            # the original passed empty files through despite the filter above.
            if (config["kraken_classified_out"]):
                shell("kraken {params.preload} --threads {threads} --db {input.db} {fastq_files} --output {output.report} --classified-out {params.classified} --unclassified-out {params.unclassified} 2> {log}")
            else:
                shell("kraken {params.preload} --threads {threads} --db {input.db} {fastq_files} --output {output.report} 2> {log}")
# Translate the raw kraken report into mpa-style taxonomy strings.
# kraken-translate fails on an empty report (sample with no classified reads),
# so the shell command only runs it when the report is non-empty ([ -s ... ])
# and otherwise just touches an empty output so downstream rules can proceed.
rule kraken_translate:
    input:
        db = config["kraken_db"],
        raw_report = classification_path + "/{sample}.krakenreport"
    output:
        classification_path + "/{sample}.translated"
    shell:
        "if [ -s {input.raw_report} ]; then kraken-translate {input.raw_report} --db {input.db} --mpa-format > {output}; else touch {output} ; fi"
rule kraken_csv:
    """Condense the translated kraken report into a per-taxon count table (TSV)."""
    input:
        translated = classification_path + "/{sample}.translated",
        kraken_log = log_path + "/{sample}.kraken.log"
    output:
        classification_path + "/{sample}.csv"
    run:
        if getsize(str(input.translated)) == 0:
            # Empty translation: write a single all-unclassified placeholder row.
            with open(str(output), "w") as out:
                out.write('0\tunclassified\t\t\t\t\t\t\t')
        else:
            # The number of unclassified reads is only reported on kraken's stderr log.
            # BUG FIX: default to 0 so a missing log line no longer raises NameError;
            # the file handle is now closed via the context manager, and the regex
            # is a raw string with the decimal point escaped.
            unclassified = 0
            with open(str(input.kraken_log), "r") as kraken_log:
                for line in kraken_log:
                    pattern = re.match(r"\s+(?P<n_unclassified>\d+) sequences unclassified \((?P<p_unclassified>\d+\.\d+)%\)", line)
                    if pattern:
                        unclassified = pattern.group("n_unclassified")
            report = read_csv(str(input.translated), header=None, sep="\t")[1].value_counts()
            # Collect one Series per taxon, then build the frame in a single step —
            # DataFrame.append in a loop is deprecated (removed in pandas 2.0).
            rows = [Series({"count": unclassified, "root": "unclassified"}, name="unclassified")]
            for taxon in report.index:
                # "d__Bacteria|p__Firmicutes|..." -> {"d": "Bacteria", "p": "Firmicutes", ...}
                row = dict([field.split("__") for field in taxon.split("|") if field != "root"])
                row["count"] = report[taxon]
                if taxon == "root":
                    row["root"] = "root"
                for missing_col in (set(["count", "root", "d", "p", "c", "o", "f", "g", "s"]) - set(row.keys())):
                    row[missing_col] = ""
                rows.append(Series(row, name=taxon))
            kraken_counts = DataFrame(rows)
            kraken_counts[["count", "root", "d", "p", "c", "o", "f", "g", "s"]].to_csv(str(output), sep="\t", index=False, header=False)
rule kraken_batch_plot:
    """Stacked bar plot of the 10 most abundant taxa per sample across the batch."""
    input:
        expand(classification_path + "/{sample}.csv", sample= unique_samples.keys())
    params:
        level = "s"
    output:
        csv = classification_path + "/kraken_batch_result.csv",
        png = classification_path + "/kraken_batch.png"
    run:
        kraken_summary = DataFrame()
        for sample in sorted(list(input)):
            kraken_counts = read_csv(str(sample), sep="\t", header=None)
            kraken_counts.columns = ["count", "root", "d", "p", "c", "o", "f", "g", "s"]
            kraken_counts["perc"] = (kraken_counts["count"] / sum(kraken_counts["count"])).round(4) * 100
            # Keep the 10 most abundant entries (head() never raises — the original
            # try/except around it was dead code).
            kraken_counts.sort_values("perc", ascending=False, inplace=True)
            kraken_filtered = kraken_counts.head(10)
            # Entries with no name at params.level (NaN) fall back to the deepest
            # rank that is set. .copy() avoids mutating the underlying column;
            # .loc replaces the removed .ix.
            new_index = kraken_filtered[params.level].copy()
            for i in new_index[new_index != new_index].index:  # NaN != NaN selects missing labels
                new_index.loc[i] = [x for x in kraken_filtered.loc[i, ["root", "d", "p", "c", "o", "f", "g", "s"]] if x == x][-1]
            kraken_filtered = kraken_filtered["perc"]
            kraken_filtered.index = list(new_index)
            kraken_filtered.index.name = "Name"
            kraken_filtered = kraken_filtered.reset_index().groupby("Name").sum()
            # Lump everything outside the top 10 into "other" (Series.append is
            # deprecated; concat is equivalent).
            kraken_filtered = concat([kraken_filtered["perc"], Series({"other": (100 - sum(kraken_filtered["perc"]))})])
            kraken_filtered.name = re.search(classification_path + "/(?P<sample>.*).csv", str(sample)).group("sample")
            # BUG FIX: drop each label independently — the original bare try/except
            # never dropped "root" when "unclassified" was absent.
            kraken_filtered = kraken_filtered.drop(["unclassified", "root"], errors="ignore")
            kraken_summary = concat([kraken_summary, kraken_filtered], axis=1)
        # Sort rows by the last sample's percentages so the written table is ordered
        # by abundance (kraken_filtered still names the last processed sample here).
        kraken_summary.sort_values(kraken_filtered.name, ascending=False, inplace=True)
        kraken_summary.to_csv(str(output.csv))
        kraken_summary.T.plot.bar(stacked=True, edgecolor='black', title="Classified reads by Kraken [%]", alpha=0.9)
        legend = plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
        plt.savefig(str(output.png), bbox_extra_artists=(legend,), bbox_inches='tight')
|