File: sav.snakefile

package info (click to toggle)
qcumber 2.3.0-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, bullseye, sid
  • size: 2,276 kB
  • sloc: python: 3,097; sh: 153; makefile: 18
file content (101 lines) | stat: -rwxr-xr-x 6,775 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
def plot_data_by_cycle(metrics, x, y, title, output, alt_ylabel, caption ):
    df = metrics.df[[x, "%s_A" % y, "%s_C" % y, "%s_T" % y, "%s_G" % y]]
    df.columns = [x.replace(y + "_", "") for x in df.columns]
    df = df.groupby(x).mean()
    y = alt_ylabel
    df.columns.name = y
    df.index.name = x
    df.plot(title=title + " - %s" % df.columns.name, legend=True)
    plt.savefig(output)
    plt.close()
    return {"title" : title + " - %s" % df.columns.name,
            "caption" : caption,
            "base64" : to_base64(output) }

def plot_data_by_lane(metrics, codes, x, y, group, title, output, caption ):
    df = metrics.df[metrics.df.code.isin(codes.keys())]
    df = df[["code", "lane", "value"]]
    for key in codes.keys():
        df.code = df.code.replace(key, codes[key])
    df.index = MultiIndex.from_tuples(list(zip(df["code"], df["lane"])))
    df = df[df.value > 0].append(df[df.value == 0].drop_duplicates())
    df.columns = [group, x, y]
    ax = sns.boxplot(x=x, y=y, hue=group, data=df).set_title(title)
    plt.savefig(output)
    plt.close()
    return {"title" : title,
            "caption" : caption,
            "base64" : to_base64(output)
            }

def get_sav_input(savfolder):
    steps = {}
    infiles = ["CompletedJobInfo.xml", "RunInfo.xml"]
    interopfiles = ["ControlMetricsOut.bin", "CorrectedIntMetricsOut.bin", "ErrorMetricsOut.bin", "ExtractionMetricsOut.bin", "IndexMetricsOut.bin", "QMetricsOut.bin", "TileMetricsOut.bin"]
    steps["savfolder"] = savfolder
    steps["infiles"] = expand("{savfolder}/{infiles}", savfolder=savfolder, infiles=infiles)
    steps["interopfiles"] = expand("{savfolder}/InterOp/{interopfiles}", savfolder=savfolder, interopfiles=interopfiles)
    return steps


if config["sav"]:
    sav_results =  data_path + "/sav.json"

    rule plot_sav:
        input:
            **get_sav_input(config["sav"])
        output:
            #unpack(sav_results)
            pdf = main_path + "/SAV.pdf",
            plots = expand(data_path + "/{img}.png", img = ["data_by_cycle_base","data_by_cycle_fwhm","data_by_cycle_intensity",
                "data_by_lane_phasing","data_by_lane_prephasing", "data_by_lane_cluster", "qscore_distr" , "qscore_heatmap"]),
            json = sav_results
        log:
            log_path + "/sav_report.log"
        run:
            import xmltodict
            shell("Rscript --vanilla {path}/Rscripts/sav.R {input} {outfolder}", path = geninfo_config["QCumber_path"], outfolder = data_path, input=input.savfolder)
            sav_plots =[]
            sav_plots.append(
                {"title": "Data by Cycle - FWHM", "caption": "The average full width of clusters at half maximum (in pixels).",
                 "base64": to_base64(data_path + "/data_by_cycle_fwhm.png"), "filename": data_path + "/data_by_cycle_fwhm.png" })
            sav_plots.append({"title": "Data by Cycle - intensity",
                              "caption": "This plot shows the intensity by color of the 90% percentile of the data for each cycle.",
                              "base64": to_base64(data_path + "/data_by_cycle_intensity.png"), "filename": data_path + "/data_by_cycle_intensity.png"})
            sav_plots.append({"title": "Data by Cycle - %Base",
                              "caption": "The percentage of clusters for which the selected base has been called.",
                              "base64": to_base64(data_path + "/data_by_cycle_base.png"), "filename": data_path + "/data_by_cycle_base.png"})
            sav_plots.append({"title": "Data by Cycle - %>=Q30",
                              "caption": 'The percentage of bases with a quality score of 30 or higher, respectively. This chart is generated after the 25th cycle, and the values represent the current cycle.',
                              "base64": to_base64(data_path + "/qscore_q30.png"), "filename": data_path + "/qscore_q30.png"})
            sav_plots.append({"title": "Data by Lane - %Phasing",
                              "caption": 'The percentage of molecules in a cluster for which sequencing falls behind (phasing) the current cycle within a read The graph is split out per read.',
                              "base64": to_base64(data_path + "/data_by_lane_phasing.png"), "filename": data_path + "/data_by_lane_phasing.png"})
            sav_plots.append({"title": "Data by Lane - %Prephasing",
                      "caption": 'The percentage of molecules in a cluster for which sequencing falls behind (phasing) the current cycle within a read The graph is split out per read.',
                      "base64": to_base64(data_path + "/data_by_lane_prephasing.png"), "filename": data_path + "/data_by_lane_prephasing.png"})
            sav_plots.append({"title": "Data by Lane - Cluster density",
              "caption": 'The density of clusters for each tile (in thousand per mm2)',
              "base64": to_base64(data_path + "/data_by_lane_cluster.png"), "filename": data_path + "/data_by_lane_cluster.png"})
            sav_plots.append({"title": "QScore Heatmap",
                  "caption": 'The Q-score heat map shows the Q-score by cycle for all lanes.',
                  "base64": to_base64(data_path + "/qscore_heatmap.png"), "filename": data_path + "/qscore_heatmap.png"})
            sav_plots.append({"title": "QScore Distribution",
                  "caption": 'The Q-score distribution shows the number of reads by quality score. The quality score os cumulative for current cycle and previous cycles, and only reads that pass the quality filter are included. The Q-score is based on the Phred scale. ',
                  "base64": to_base64(data_path + "/qscore_distr.png"), "filename": data_path + "/qscore_distr.png"})

            xml = OrderedDict()
            xml["tables"] = OrderedDict()
            runinfo = xmltodict.parse( "".join(open(join(str(input.savfolder), "RunInfo.xml"),"r").readlines() ))
            runinfo["RunInfo"]["Run"]["Reads"] = runinfo["RunInfo"]["Run"]["Reads"]["Read"]

            xml["tables"]["RunInfo"] = runinfo["RunInfo"]["Run"]
            try:
                runparam = xmltodict.parse( "".join(open(join(str(input.savfolder),"RunParameters.xml"),"r").readlines() ))
            except:
                runparam = xmltodict.parse( "".join(open(join(str(input.savfolder),"runParameters.xml"),"r").readlines() ))
            runparam['RunParameters'].pop("Reads", None)

            xml["tables"]["RunParameter"] = dict( (key, value) for (key,value) in runparam["RunParameters"].items() if not key.startswith("@xml"))
            xml["img"] = sav_plots
            json.dump(xml, open(str(output.json), "w"))