1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152
|
#############################
# READ CLASSIFICATION #
#############################
# Qualitative colormap for the classification bar plots.
# NOTE(review): matplotlib.cm.get_cmap is deprecated and removed in matplotlib >= 3.9;
# switch to matplotlib.colormaps['Set3'] when upgrading — confirm installed version.
cmap = matplotlib.cm.get_cmap('Set3')
def get_kraken_result(filename, outputfile, level="p"):
    """Aggregate a per-sample kraken count table at one taxonomic rank and plot it.

    Parameters
    ----------
    filename : str
        Tab-separated kraken csv with columns
        (count, root, d, p, c, o, f, g, s); row 0 is the unclassified bin.
    outputfile : str
        Path of the bar-plot image to write.
    level : str, optional
        Taxonomic rank column to aggregate on (default "p" = phylum).
        Previously hard-coded; the default preserves the old behavior.

    Returns
    -------
    OrderedDict with the image path, number/percentage of classified reads and
    a LaTeX table of per-taxon percentages, or None if nothing was classified
    (in which case an empty placeholder file is written to ``outputfile``).
    """
    kraken_counts = read_csv(str(filename), sep="\t", header=None)
    # Nothing classified at all: emit an empty placeholder image and bail out.
    if kraken_counts[0][0] == 0 and kraken_counts[1][0] == "unclassified":
        with open(outputfile, "w"):
            pass
        return None
    kraken_counts.columns = ["count", "root", "d", "p", "c", "o", "f", "g", "s"]
    # Row 0 is the unclassified bin, so rows 1.. hold the classified reads.
    nclassified = int(kraken_counts[1:]["count"].sum())
    pclassified = round(100 * kraken_counts[1:]["count"].sum() / kraken_counts["count"].sum(), 2)
    kraken_counts["perc"] = (100 * kraken_counts["count"] / kraken_counts["count"].sum()).round(4)
    # Entries with no name at `level` (NaN) fall back to the deepest rank that is set.
    # .copy() avoids mutating the underlying column; .loc replaces the removed .ix.
    new_index = kraken_counts[level].copy()
    for i in new_index[new_index != new_index].index:  # NaN != NaN selects missing labels
        new_index.loc[i] = [x for x in kraken_counts.loc[i, ["root", "d", "p", "c", "o", "f", "g", "s"]] if x == x][-1]
    kraken_counts.index = list(new_index)
    kraken_counts.index.name = "Name"
    kraken_counts = kraken_counts.reset_index().groupby("Name").sum()
    kraken_counts = kraken_counts[kraken_counts["perc"] > 0]
    kraken_counts["perc"].plot.bar(stacked=True, edgecolor='black', alpha=0.9)
    legend = plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    plt.savefig(outputfile, bbox_extra_artists=(legend,), bbox_inches="tight")
    return OrderedDict({"kraken_img": outputfile,
                        "#Classified": nclassified,
                        "%Classified": pclassified,
                        "kraken_results": kraken_counts[["perc", "count"]].to_latex()})
#--------------------------------------------< RULES >-----------------------------------------------------------------#
rule kraken_classification:
    """Classify a sample's reads with kraken, skipping empty fastq inputs."""
    input:
        db = config["kraken_db"],
        fastq_files = get_all_reads
    output:
        report = temp(classification_path + "/{sample}.krakenreport"),
        log = log_path + "/{sample}.kraken.log"
    log:
        log_path + "/{sample}.kraken.log"
    params:
        preload = "--preload",
        classified = classification_path + "/{sample}.classified_reads.fastq",
        unclassified = classification_path + "/{sample}.unclassified_reads.fastq"
        #type = "--fastq-input"
    threads:
        max_threads
    run:
        # Kraken fails on empty fastq files, so keep only the non-empty ones.
        fastq_files = []
        for fastq in list(input.fastq_files):
            with gzip.open(fastq, "r") as fastq_file:
                if len(fastq_file.readlines()) == 0:
                    # Keep the DAG satisfied: produce the (empty) report and log anyway.
                    shell("touch {output.report}")
                    shell("echo 'Found empty input.' > {log} ")
                else:
                    fastq_files.append(fastq)
        if len(fastq_files) != 0:
            # BUG FIX: run kraken on the filtered list {fastq_files} (space-joined by
            # snakemake's shell formatter), not the unfiltered {input.fastq_files} —
            # the original passed empty files through despite the filter above.
            if (config["kraken_classified_out"]):
                shell("kraken {params.preload} --threads {threads} --db {input.db} {fastq_files} --output {output.report} --classified-out {params.classified} --unclassified-out {params.unclassified} 2> {log}")
            else:
                shell("kraken {params.preload} --threads {threads} --db {input.db} {fastq_files} --output {output.report} 2> {log}")
# Translate the raw kraken report into mpa-style taxonomy strings.
# kraken-translate fails on an empty report (sample with no classified reads),
# so the shell command only runs it when the report is non-empty ([ -s ... ])
# and otherwise just touches an empty output so downstream rules can proceed.
rule kraken_translate:
    input:
        db = config["kraken_db"],
        raw_report = classification_path + "/{sample}.krakenreport"
    output:
        classification_path + "/{sample}.translated"
    shell:
        "if [ -s {input.raw_report} ]; then kraken-translate {input.raw_report} --db {input.db} --mpa-format > {output}; else touch {output} ; fi"
rule kraken_csv:
    """Condense the translated kraken report into a per-taxon count table (TSV)."""
    input:
        translated = classification_path + "/{sample}.translated",
        kraken_log = log_path + "/{sample}.kraken.log"
    output:
        classification_path + "/{sample}.csv"
    run:
        if getsize(str(input.translated)) == 0:
            # Empty translation: write a single all-unclassified placeholder row.
            with open(str(output), "w") as out:
                out.write('0\tunclassified\t\t\t\t\t\t\t')
        else:
            # The number of unclassified reads is only reported on kraken's stderr log.
            # BUG FIX: default to 0 so a missing log line no longer raises NameError;
            # the file handle is now closed via the context manager, and the regex
            # is a raw string with the decimal point escaped.
            unclassified = 0
            with open(str(input.kraken_log), "r") as kraken_log:
                for line in kraken_log:
                    pattern = re.match(r"\s+(?P<n_unclassified>\d+) sequences unclassified \((?P<p_unclassified>\d+\.\d+)%\)", line)
                    if pattern:
                        unclassified = pattern.group("n_unclassified")
            report = read_csv(str(input.translated), header=None, sep="\t")[1].value_counts()
            # Collect one Series per taxon, then build the frame in a single step —
            # DataFrame.append in a loop is deprecated (removed in pandas 2.0).
            rows = [Series({"count": unclassified, "root": "unclassified"}, name="unclassified")]
            for taxon in report.index:
                # "d__Bacteria|p__Firmicutes|..." -> {"d": "Bacteria", "p": "Firmicutes", ...}
                row = dict([field.split("__") for field in taxon.split("|") if field != "root"])
                row["count"] = report[taxon]
                if taxon == "root":
                    row["root"] = "root"
                for missing_col in (set(["count", "root", "d", "p", "c", "o", "f", "g", "s"]) - set(row.keys())):
                    row[missing_col] = ""
                rows.append(Series(row, name=taxon))
            kraken_counts = DataFrame(rows)
            kraken_counts[["count", "root", "d", "p", "c", "o", "f", "g", "s"]].to_csv(str(output), sep="\t", index=False, header=False)
rule kraken_batch_plot:
    """Stacked bar plot of the 10 most abundant taxa per sample across the batch."""
    input:
        expand(classification_path + "/{sample}.csv", sample= unique_samples.keys())
    params:
        level = "s"
    output:
        csv = classification_path + "/kraken_batch_result.csv",
        png = classification_path + "/kraken_batch.png"
    run:
        kraken_summary = DataFrame()
        for sample in sorted(list(input)):
            kraken_counts = read_csv(str(sample), sep="\t", header=None)
            kraken_counts.columns = ["count", "root", "d", "p", "c", "o", "f", "g", "s"]
            kraken_counts["perc"] = (kraken_counts["count"] / sum(kraken_counts["count"])).round(4) * 100
            # Keep the 10 most abundant entries (head() never raises — the original
            # try/except around it was dead code).
            kraken_counts.sort_values("perc", ascending=False, inplace=True)
            kraken_filtered = kraken_counts.head(10)
            # Entries with no name at params.level (NaN) fall back to the deepest
            # rank that is set. .copy() avoids mutating the underlying column;
            # .loc replaces the removed .ix.
            new_index = kraken_filtered[params.level].copy()
            for i in new_index[new_index != new_index].index:  # NaN != NaN selects missing labels
                new_index.loc[i] = [x for x in kraken_filtered.loc[i, ["root", "d", "p", "c", "o", "f", "g", "s"]] if x == x][-1]
            kraken_filtered = kraken_filtered["perc"]
            kraken_filtered.index = list(new_index)
            kraken_filtered.index.name = "Name"
            kraken_filtered = kraken_filtered.reset_index().groupby("Name").sum()
            # Lump everything outside the top 10 into "other" (Series.append is
            # deprecated; concat is equivalent).
            kraken_filtered = concat([kraken_filtered["perc"], Series({"other": (100 - sum(kraken_filtered["perc"]))})])
            kraken_filtered.name = re.search(classification_path + "/(?P<sample>.*).csv", str(sample)).group("sample")
            # BUG FIX: drop each label independently — the original bare try/except
            # never dropped "root" when "unclassified" was absent.
            kraken_filtered = kraken_filtered.drop(["unclassified", "root"], errors="ignore")
            kraken_summary = concat([kraken_summary, kraken_filtered], axis=1)
        # Sort rows by the last sample's percentages so the written table is ordered
        # by abundance (kraken_filtered still names the last processed sample here).
        kraken_summary.sort_values(kraken_filtered.name, ascending=False, inplace=True)
        kraken_summary.to_csv(str(output.csv))
        kraken_summary.T.plot.bar(stacked=True, edgecolor='black', title="Classified reads by Kraken [%]", alpha=0.9)
        legend = plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
        plt.savefig(str(output.png), bbox_extra_artists=(legend,), bbox_inches='tight')
|