File: trimming.snakefile

package info (click to toggle)
qcumber 2.3.0-2
links: PTS, VCS
area: main
in suites: bookworm, bullseye, sid, trixie
size: 2,276 kB
sloc: python: 3,097; sh: 153; makefile: 18
file content (311 lines) | stat: -rwxr-xr-x 13,844 bytes
#############
# Functions #
#############

def get_trimmomatic_result(files, params):
    """Summarise Trimmomatic log files and the trimming parameters used.

    Every line starting with "Input Read" in the given logs is parsed as a
    Trimmomatic summary line.  For paired-end data (sample type "PE" in
    ``geninfo_config``) each pair counts as two input reads; surviving pairs
    count twice and forward-only / reverse-only survivors once.  For any
    other sample type the single-end summary layout is expected.

    Args:
        files: Iterable of paths to Trimmomatic log files.
        params: Iterable of paths to ``*.trimmomatic.params`` files; the file
            content is stored under the base name with that suffix removed.

    Returns:
        OrderedDict with the keys "#Remaining Reads", "%Remaining Reads"
        (rounded to two decimals, 0 when no input reads were counted) and
        "Trim parameter" (dict: name -> parameter string).

    Raises:
        ValueError: If a summary line does not have the layout expected for
            the configured sample type.
    """
    # Raw strings keep the regex escapes (\s, \d, \() out of Python's own
    # string-escape handling.  The "." in the percentage stays unescaped so
    # the patterns accept exactly what they accepted before.
    pe_summary = re.compile(
        r"Input Read Pairs:\s+(?P<total>\d+)\s+"
        r"Both Surviving:\s+(?P<nSurvived>\d+) \((?P<pSurvived>\d+.\d+)%\)\s+"
        r"Forward Only Surviving:\s+(?P<forward>\d+) \(\d+.\d+%\)\s+"
        r"Reverse Only Surviving:\s+(?P<reverse>\d+) \(\d+.\d+%\).*")
    se_survived = re.compile(
        r".*Surviving: (?P<nSurvived>\d+) \((?P<survived>\d+.\d+)%\)")
    se_total = re.compile(r"Input Reads: (?P<total>\d+)")

    n_reads = 0
    total_reads = 0

    for log_name in files:
        with open(log_name, "r") as logfile:
            for line in logfile:
                if not line.startswith("Input Read"):
                    continue
                if geninfo_config["Sample information"]["type"] == "PE":
                    pairs = pe_summary.match(line)
                    if pairs is None:
                        raise ValueError(
                            "Unexpected paired-end Trimmomatic summary in "
                            "%s: %r" % (log_name, line))
                    total_reads += int(pairs.group("total")) * 2
                    n_reads += (int(pairs.group("nSurvived")) * 2
                                + int(pairs.group("forward"))
                                + int(pairs.group("reverse")))
                else:
                    survived = se_survived.match(line)
                    total = se_total.match(line)
                    if survived is None or total is None:
                        raise ValueError(
                            "Unexpected single-end Trimmomatic summary in "
                            "%s: %r" % (log_name, line))
                    total_reads += int(total.group("total"))
                    n_reads += int(survived.group("nSurvived"))

    all_params = {}
    for param in params:
        with open(param, "r") as paramfile:
            key = os.path.basename(param).replace(".trimmomatic.params", "")
            all_params[key] = paramfile.read()

    if total_reads != 0:
        perc_remaining = round(100 * (n_reads / total_reads), 2)
    else:
        # No summary line seen at all: avoid dividing by zero.
        perc_remaining = 0

    return OrderedDict([
        ("#Remaining Reads", n_reads),
        ("%Remaining Reads", perc_remaining),
        ("Trim parameter", all_params)])


def get_defaults():
    """Build the baseline Trimmomatic step string from the configuration.

    Returns:
        A string made of zero, one or two space-terminated parts: an
        ``ILLUMINACLIP`` step when the configured technology is "Illumina",
        followed by the configured trim option plus ``MINLEN`` unless only
        adapters are to be trimmed.
    """
    steps = []

    if config["technology"] == "Illumina":
        adapter_clip = (geninfo_config["adapter"], config["illuminaclip"])
        steps.append("ILLUMINACLIP:%s:%s " % adapter_clip)

    if not config["only_trim_adapters"]:
        quality_trim = (config["trimOption"], str(config["minlen"]))
        steps.append("%s MINLEN:%s " % quality_trim)

    return "".join(steps)

def perc_slope(a,b,perc):
    """Tell whether ``a`` and ``b`` differ by more than ``perc`` times ``b``.

    Args:
        a: First value.
        b: Second value; also the reference the threshold is scaled by.
        perc: Fraction of ``b`` used as threshold (anything ``float()`` accepts).

    Returns:
        True if ``abs(b - a) > b * perc``, otherwise False.
    """
    change = abs(numpy.diff([a, b]))
    limit = b * float(perc)
    return bool(change > limit)

def optimize_trimming(filename, outname, perc=0.1):
    """Derive head/tail crop values from FastQC's per-base sequence content.

    Reads a FastQC ``fastqc_data.txt``-style file and writes a one-line
    parameter file of the form ``<headcrop>;<length>`` to ``outname``.

    Args:
        filename: Path of the FastQC data file to analyse.
        outname: Path of the parameter file to write.
        perc: Threshold passed on to ``perc_slope`` (fraction of the
            reference mean a window mean may deviate by).

    Returns:
        True once a parameter file with values has been written.  When
        ``filename`` does not exist an empty ``outname`` is created and the
        function falls off the end, i.e. returns None.

    NOTE(review): ``exists`` and ``rep`` are not defined in this block; they
    must come from elsewhere in the workflow.  If ``rep`` is not provided
    anywhere, the ``print(rep(column))`` call raises NameError (``repr`` may
    have been intended) -- confirm.
    NOTE(review): the rule calling this function treats a falsy result as a
    failure, so the None returned for a missing file looks unintended --
    confirm.
    """
    if not exists(filename):
        # No FastQC data available: leave an empty parameter file behind.
        with open(outname, "w") as paramfile:
            pass
    else:
        mytable=None
        with open(filename, "rb") as fastqcdata:
            table = []
            # Becomes True once the "Per base sequence content" module starts.
            ifwrite = False
            #previous_line = ''
            while True:
                line = fastqcdata.readline()
                if not line:
                    break
                line = line.decode()
                line = line.replace("\n", "")
                if line.startswith(">>END_MODULE") and ifwrite:
                    # End of the collected module: turn the rows into a
                    # structured array whose field names come from the first
                    # collected row (one byte-string column, four floats).
                    try:
                        dtype = {'names': table[0], 'formats': ['|S15', float, float, float, float]}
                        mytable = numpy.asarray([tuple(x) for x in table[1:]], dtype=dtype)
                        break
                    except:
                        # NOTE(review): any conversion error is swallowed and
                        # reading continues; mytable may then still be None
                        # after the loop, which makes the code below fail.
                        pass
                elif re.search("Sequence length\s+\d+", line):
                    # Remember the reported sequence length (kept as string).
                    seq_length = re.search("Sequence length\s+(?P<length>\d+)", line).group("length")
                #elif re.search("Sequence length\s+\d+(?P<length>\d+)", line):
                # The culprit                      ^^^ this killed trimBetter
                #    seq_length = re.search("Sequence length\s+\d+(?P<length>\d+)", line).group("length")
                elif line.startswith(">>Per base sequence content"):
                    #print(line, seq_length)
                    ifwrite = True
                    temp = line.split("\t")
                    # The second tab-separated field is the module status.
                    if temp[1].lower() == "pass":
                        # Nothing to crop: headcrop 0, keep the full length.
                        # NOTE(review): seq_length is unbound here if no
                        # "Sequence length" line preceded this module.
                        with open(outname, "w") as paramfile:
                            paramfile.write("%s;%s" % (0, seq_length))
                        return True
                elif ifwrite:
                    table.append(line.split("\t"))
                # previous_line = line
        # Start with "crop nothing": headcrop 0, tailcrop = number of rows.
        headcrop = 0
        tailcrop = len(mytable["A"])
        # Masked copy; masked positions are excluded from the reference mean.
        column = numpy.ma.array(mytable)
        print(rep(column))
        # Slide a window of up to five rows over roughly the first (and,
        # mirrored, the last) third of the table.
        for i in range(-4, int(round(len(mytable["A"]) / 3, 0)), 1):
            for nucl in ["A", "C", "G", "T"]:
                # Hide the current head window and everything already cropped.
                column[nucl].mask[max(i, 0):i + 5] = True
                if headcrop >0:
                    column[nucl].mask[:headcrop] = True
                if tailcrop < len(mytable["A"]):
                    column[nucl].mask[tailcrop:] = True

                # check headcrop: compare the window mean (unmasked data)
                # with the mean of the remaining, unmasked positions.
                if (perc_slope(numpy.mean(mytable[nucl][max(i, 0):i + 5]), numpy.mean(column[nucl]), perc=perc)) & (headcrop < (i + 5)):
                    headcrop = i + 5
                    trim_bool = True
                elif headcrop < i:
                    # Window is fine and lies beyond the crop: unmask it.
                    column[nucl].mask[max(i, 0):i + 5] = False

                # now crop from the end
                column[nucl].mask[-(i + 5):(min(len(mytable[nucl]), len(mytable[nucl]) - i))] = True
                if (perc_slope(numpy.mean(mytable[nucl][-(i + 6): (min(len(mytable[nucl]) - 1, len(mytable[nucl]) - 1 - i))]), numpy.mean(column[nucl]), perc=perc)) & (tailcrop > len(mytable[nucl]) - (i + 5)):
                    tailcrop = len(mytable[nucl]) - (i + 5)
                    trim_bool = True
                else:
                    column[nucl].mask[-(i + 5): (min(len(mytable["A"]) - 1, len(mytable[nucl]) - 1 - i))] = False
        # Second value is the length kept after removing headcrop rows.
        with open(outname, "w") as paramfile:
            paramfile.write("%s;%s" % (headcrop, tailcrop-headcrop))
        return True

def get_best_params(output, r1, r2=None):
    """Combine per-read crop parameters into one Trimmomatic step string.

    Each parameter file holds ``<headcrop>;<crop>`` or is empty.  When
    values are available the largest head crop and the smallest crop length
    over all given reads are appended to the default steps, followed by a
    final ``MINLEN`` step.

    Args:
        output: Path the resulting parameter string is written to.
        r1: Parameter file of the first (or only) read.
        r2: Optional parameter file of the second read.

    Returns:
        The parameter string that was written to ``output``.  If any of the
        given parameter files is empty, this is just ``get_defaults()``.
    """
    def _read_pair(path):
        # "<headcrop>;<crop>" -> ["<headcrop>", "<crop>"]; empty file -> [""]
        with open(path, "r") as handle:
            return handle.read().replace("\n", "").split(";")

    # Only real parameter sets are collected.  The former +/-infinity
    # placeholders for a missing second read could not pass through int()
    # (OverflowError), so single-end input never got crop parameters.
    crop_sets = [_read_pair(r1)]
    if r2 is not None:
        crop_sets.append(_read_pair(r2))

    if any(pair[0] == "" for pair in crop_sets):
        new_params = get_defaults()
    else:
        headcrop = max(int(pair[0]) for pair in crop_sets)
        crop = min(int(pair[1]) for pair in crop_sets)
        new_params = " ".join([
            get_defaults(),
            "HEADCROP:" + str(headcrop),
            "CROP:" + str(crop),
            "MINLEN:" + str(config["minlen"])])

    with open(output, "w") as outfile:
        outfile.write(new_params)
    return new_params


def get_trimmomatic_input(wildcards):
    """Collect the named input files of the trimmomatic rule for one sample.

    Args:
        wildcards: Wildcards object of the rule; ``wildcards.sample`` is used.

    Returns:
        Dict with "fastq_files" (result of ``get_all_reads``) and, when
        ``config["trimBetter"]`` is set, "params": the optimised parameter
        files, one per read for paired-end data, a single one otherwise.
    """
    sample_inputs = {"fastq_files": get_all_reads(wildcards, True)}

    if not config["trimBetter"]:
        return sample_inputs

    if geninfo_config["Sample information"]["type"] == "PE":
        param_files = expand("{path}/{sample}_{read}.params",
                             read=["R1", "R2"],
                             sample=wildcards.sample,
                             path=trimbetter_path)
    else:
        param_files = expand("{path}/{sample}.params",
                             sample=wildcards.sample,
                             path=trimbetter_path)

    sample_inputs["params"] = list(param_files)
    return sample_inputs

def get_trimmomatic_output(path, is_temp=False):
    """Describe the output files of a Trimmomatic run below ``path``.

    Args:
        path: Directory prefix for the trimmed reads and the parameter file.
        is_temp: True for the intermediate trimBetter run; every trimmed
            file is then wrapped in ``temp()`` and no parameter file is
            declared.

    Returns:
        Dict with "logfile", "trimmed_files" (one file for sample type
        "SE", otherwise the four 1P/1U/2P/2U files) and, when ``is_temp``
        is False, "params_file".
    """
    output = {}

    if not is_temp:
        output["params_file"] = temp(path + "/{sample}.trimmomatic.params")
        output["logfile"] = log_path + "/{sample}.trimmomatic.log"
    else:
        output["logfile"] = log_path + "/{sample}.trimBetter.trimmomatic.log"

    if geninfo_config["Sample information"]["type"] == "SE":
        names = ["/{sample}.fastq.gz"]
    else:
        names = ["/{sample}.1P.fastq.gz",
                 "/{sample}.1U.fastq.gz",
                 "/{sample}.2P.fastq.gz",
                 "/{sample}.2U.fastq.gz"]

    trimmed = [path + name for name in names]
    if is_temp:
        trimmed = [temp(name) for name in trimmed]
    output["trimmed_files"] = trimmed

    return output
#--------------------------------------------< RULES >-----------------------------------------------------------------#


if not config["notrimming"]:
    if config["trimBetter"]:
        rule join_reads_trimBetter:
            # Concatenate the paired (P) and unpaired (U) trimBetter output of
            # each read direction into one temporary file per direction.
            input:
                r1 = [trimbetter_path + "/{sample}.1P.fastq.gz", trimbetter_path + "/{sample}.1U.fastq.gz"],
                r2 = [trimbetter_path + "/{sample}.2P.fastq.gz", trimbetter_path + "/{sample}.2U.fastq.gz"]
            output:
                r1_out = temp(trimbetter_path + "/{sample}_R1.fastq.gz"),
                r2_out = temp(trimbetter_path + "/{sample}_R2.fastq.gz")
            shell:
                # The two commands are independent, so they are chained with
                # "&&" instead of a pipe: no data flows between them and a
                # failure of the first one stops the job.
                "cat {input.r1} > {output.r1_out} && "
                "cat {input.r2} > {output.r2_out} "

        rule fastqc_trimBetter:
            # Run FastQC on one joined trimBetter read file and keep the
            # extracted report directory for the parameter optimisation.
            input:
                fastq_files =  trimbetter_path + "/{sample}_{read}.fastq.gz"
            output:
                temp(trimbetter_path + "/FastQC/{sample}_{read}_fastqc.zip"),
                temp(trimbetter_path + "/FastQC/{sample}_{read}_fastqc.html"),
                fastqc = temp(trimbetter_path + "/FastQC/{sample}_{read}_fastqc"),
                log = temp(trimbetter_path + "/FastQC/{sample}_{read}.fastqc.log")
            threads:
                max_threads
            message:
                "Run FastQC to obtain better trimming paramters."
            run:
                #print('sizes:', ' '.join(['%s:%i|' % (x,os.path.getsize(x))
                #                           for x in input]), file=sys.stderr)
                # If the first line of the decompressed input is empty, FastQC
                # is skipped and all declared outputs are merely touched;
                # otherwise FastQC runs with --extract/--nogroup into the
                # directory of the "fastqc" output and logs to the "log" output.
                # NOTE(review): "touch {output}" also creates the "fastqc"
                # output as a plain file, so no fastqc_data.txt exists below it.
                shell(
                    "if [ `zcat '{input}' | head -n 1 | wc -c ` -eq 0 ]; "
                    "then touch {output}; "
                    "else fastqc {input} -o $(dirname {output.fastqc})"
                    " --extract --nogroup -t {threads} >  {output.log} 2>&1; "
                    "fi; ")

        rule optimize_trimming_parameter:
            # Turn the extracted FastQC report of one read file into a
            # "<headcrop>;<crop>" parameter file via optimize_trimming().
            input:
                trimbetter_path + "/FastQC/{sample}_{read}_fastqc"
            output:
                temp(trimbetter_path + "/{sample}_{read}.params")
            params:
                perc_slope = config["trimBetter_threshold"]
            run: # Apparently os.path.join is already in scope here (imported elsewhere)
                res = optimize_trimming(join(str(input),"fastqc_data.txt"),
                                        str(output), float(params.perc_slope))
                # A falsy result is turned into a failing shell command so
                # that the job is reported as failed.
                if not res:
                    shell('exit 1')

        rule trimmomatic_trimBetter:
            # First Trimmomatic pass with the default steps only; its
            # temporary output is the input of the trimBetter rules.
            input:
                # Raw read files of the sample as listed in the configuration.
                fastq_files = lambda x: geninfo_config["Sample information"]["samples"][x.sample]
            output:
                **get_trimmomatic_output(trimbetter_path, is_temp = True)
            threads:
                max_threads
            log:
                # Same path as the "logfile" entry of the output above.
                get_trimmomatic_output(trimbetter_path, is_temp = True)['logfile']
                #log_path + "/{sample}.trimmomatic.trimBetter.log"
            params:
                get_defaults()
            shell:
                # The sample type is inserted with "%" when the rule is
                # defined and serves as the first trimmomatic argument; the
                # remaining {placeholders} are filled in by Snakemake per job.
                ("trimmomatic %s  -threads {threads} {input.fastq_files}"
                 " {output.trimmed_files} {params}"
                 " 2> {log}") % geninfo_config["Sample information"]["type"]

    #-- end trimbetter

    rule trimmomatic:
        # Final Trimmomatic run.  Uses the optimised HEADCROP/CROP values when
        # parameter files are part of the input, the default steps otherwise,
        # and records the parameter string in the params_file output.
        input:
            unpack(get_trimmomatic_input)
        output:
            **get_trimmomatic_output(trimming_path)
        log:
            log_path + "/{sample}.trimmomatic.log"
        params:
           minlen = config["minlen"],
           trimOption = config["trimOption"]
        threads:
            max_threads
        run:
            #print('sizes:', ' '.join(['%s:%i|' % (x,os.path.getsize(x))
            #                          for x in input]), file=sys.stderr)
            # input.params only exists when get_trimmomatic_input added it;
            # a missing attribute or unusable parameter files fall back to the
            # defaults.  "except Exception" keeps that best-effort behaviour
            # without also swallowing KeyboardInterrupt/SystemExit.
            try:
                new_params = get_best_params(str(output.params_file), *list(input.params))
            except Exception:
                new_params = get_defaults()
            # params.minlen / params.trimOption are not read here; the values
            # reach the command line through get_defaults(), which reads them
            # from config.
            with open(str(output.params_file), "w") as paramfile:
                paramfile.write(new_params)
            shell("trimmomatic %s -threads {threads} "  # -Xmx512m "
                  "{input.fastq_files} {output.trimmed_files} %s 2> {output.logfile}" % (
                      geninfo_config["Sample information"]["type"], new_params))
            #shell("touch {output.pseudo_trimfile}")
            #shell("touch {output.pseudo_trimfile}")



rule join_reads:
    # Concatenate the paired (P) and unpaired (U) trimmed reads of each read
    # direction into one temporary file per direction.
    input:
        r1 = [trimming_path + "/{sample}.1P.fastq.gz",trimming_path + "/{sample}.1U.fastq.gz"],
        r2 = [trimming_path + "/{sample}.2P.fastq.gz",trimming_path + "/{sample}.2U.fastq.gz"]
    output:
        r1_out = temp(trimming_path + "/{sample}_R1.fastq.gz"),
        r2_out = temp(trimming_path + "/{sample}_R2.fastq.gz")
    run:
        # print('sizes:', ' '.join(['%s:%i|' % (x,os.path.getsize(x))
        #                           for x in input]), file=sys.stderr)
        # The two commands are independent, so they are chained with "&&"
        # instead of a pipe: no data flows between them and a failure of the
        # first one stops the job.
        shell(
            "cat {input.r1} > {output.r1_out} && "
            "cat {input.r2} > {output.r2_out}")