File: batchTrim

package info (click to toggle)
trimmomatic 0.39%2Bdfsg-2
links: PTS, VCS
area: main
in suites: bookworm, bullseye, forky, sid, trixie
size: 2,436 kB
sloc: java: 3,748; python: 218; xml: 68; sh: 31; makefile: 13
file content (295 lines) | stat: -rwxr-xr-x 11,228 bytes
parent folder | download | duplicates (4)
#!/usr/bin/python3
# -*- coding: utf-8 -*-
#created by Stephan Fuchs (FG13, RKI), 2015
#
# Wrapper to simplify calls to trimmomatic

#config
version = '0.0.9'
logsep = '=====================================\n'
cmdTrimPE = "TrimmomaticPE"




#dependencies
try:
    import argparse
except ImportError:
    print('The "argparse" module is not available. Use Python 3.2 or better.')
    sys.exit(1)

import os
import re
import sys
import time
import shutil
import subprocess


#processing command line arguments
parser = argparse.ArgumentParser(prog="BATCH_TRIM", description='uses trimmomatic on multiple paired-end read file pairs')
parser.add_argument('file', metavar="FILE(S)", help="input BAM file", type=argparse.FileType('r'), nargs='+')
parser.add_argument('-t', metavar='INT', help="Number of threads/CPUs that should be used. Default is 4." , type=int, action="store", default=4)
arg0group = parser.add_mutually_exclusive_group(required=False)
arg0group.add_argument('--fastqc', help="If set combineFastQC is performed on all trimmed read files.", action="store_true", default=False)
arg0group.add_argument('--fastqcpaired', help="If set combineFastQC is performed only on trimmed paired read files." , action="store_true", default=False)
parser.add_argument('--illuminaclip', metavar='STR', help="Cut adapter and other illumina-specific sequences. Pattern: <fasta containing adapters and sequences>:<maximum of allowed mismatches>:<accuracy for palindromic matches>:<accuracy for simple matches> (Example: NexteraPE-PE.fa:2:30:10). By default ILLUMINACLIP is deactivated." , type=str, action="store", default=False)
arg1group = parser.add_mutually_exclusive_group(required=False)
arg1group.add_argument('--slidingwindow', metavar='STR', help="Performs a sliding window trimming approach. Pattern: <window size>:<required quality> (Example: 4:15). By default slidingwindow 4.15 is used." , type=str, action="store")
arg1group.add_argument('--maxinfo', metavar='STR', help="Performs an adaptive quality trimming approach. Pattern: <target length>:<strictness> (Example: 15:0.5). By default maxinfo is not used." , type=str, action="store")
parser.add_argument('--leading', metavar='INT', help="Cut bases of the start of a read if below a given threshold quality. By default quality threshold is set to 3. Set 0 to turn deactivate." , type=int, action="store", default=3)
parser.add_argument('--trailing', metavar='INT', help="Cut bases of the end of a read if below a given threshold quality. By default quality threshold is set to 3. Set 0 to turn deactivate." , type=int, action="store", default=3)
parser.add_argument('--crop', metavar='INT', help="Cut read to a specific length by removing bases from the end. By default this is deactivated." , type=int, action="store", default=False)
parser.add_argument('--headcrop', metavar='INT', help="Cut read to a specific length by removing bases from the start. By default this is deactivated." , type=int, action="store", default=False)
parser.add_argument('--minlen', metavar='INT', help="Drop the read if it is below a defined length. By default minimal length is set to 36. Set 0 to deactivate." , type=int, action="store", default=36)
parser.add_argument('--avgqual', metavar='INT', help="Drop the read if average quality is below a speciefied level. By default this is deactivated." , type=int, action="store", default=0)
arg2group = parser.add_mutually_exclusive_group(required=False)
arg2group.add_argument('--tophred33', help="Convert quality scores to Phred-33. By default this is deactivated." , action="store_true", default=False)
arg2group.add_argument('--tophred64', help="Convert quality scores to Phred-64. By default this is deactivated." , action="store_true", default=False)

args = parser.parse_args()


if len(args.file) % float(2) != 0:
    exit("ERROR: It is not possible to process odd number of files in PE mode")

if args.slidingwindow == None and args.maxinfo == None:
    args.slidingwindow = "4:15"

#FUNCTIONS
def die (msg=False, path = False, status=1):
    '''Displays message (optional) and exits using the defined status code. If a path is given, this path is deleted (recursively)'''
    if msg:
        print(msg)
    if path:
        shutil.rmtree(path)
    sys.exit(status)



def formatColumns(rows, empty='', space=5):
    '''formats nested list (1. level = rows; 2. level = columns) in equal-sized columns. fields containing integers or floats are aligned to the right'''

    #allowed signs in numbers
    allowed = ['+', '-', '.', ',', '%']

    #str conversion
    rows = [[str(x) for x in row] for row in rows]

    #fill up rows to same number of columns (important for transposing)
    maxlen = max([len(x) for x in rows])
    [x.extend([empty]*(maxlen-len(x))) for x in rows]
    align = []
    output = [''] * len(rows)
    for column in zip(*rows):
      #detect numbers to align right
      string = ''.join(column)
      for i in allowed:
        string = string.replace(i, '')
      if string.strip('+-.,').isdigit():
        align = 'r'
      else:
        align = 'l'

      maxlen = max([len(x) for x in column])
      for i in range(len(column)):
          if align == 'r':
              output[i] += ' ' * (maxlen - len(column[i])) + column[i] + ' ' * space
          else:
              output[i] += column[i] + ' ' * (maxlen - len(column[i])) + ' ' * space

    return '\n'.join(output) + '\n'

#START

#create dir
err = 1
while err > 0:
    timecode = time.strftime("%Y-%m-%d_%H-%M-%S")
    targetpath = "BATCH_TRIM_" + timecode
    if os.path.isdir(targetpath):
        err += 1
        time.sleep(1)
    else:
        os.mkdir(targetpath)
        err = 0
    if err == 30:
        print("ERROR: giving up to find available output file name")
        sys.exit(1)


#working
print("target path created:  " + targetpath)
print(str(len(args.file)) + " files to process ...")

#filing
files = []
for f in args.file:
    files.append(f.name)
    f.close()
files.sort()

#get version of trimmomatic (unfortunately there is no --version option available)
try:
    proc=subprocess.Popen(['apt-cache', 'policy', 'trimmomatic'], shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
except subprocess.CalledProcessError:
    print(subprocess.CalledProcessError)
except OSError:
    print(OSError)

if proc.returncode > 0:
  print(proc.communicate()[1])
  die(targetdir)
else:
  out=proc.communicate()[0]
  match = re.search('Installed: ([\d\.]*)', out)
  if match == None:
      print("trimmomatic version not recognized")
      tversion='unknown'
  else:
      tversion= match.group(1)
      print("trimmomatic version:  " + tversion)


#start logging
loghandle = open(targetpath + "/batch_trim.log", 'w')
loghandle.write("BATCH TRIM\n")
loghandle.write(logsep)
log=[["version:", version], ['time:', timecode], ['result folder:', targetpath]]
loghandle.write(formatColumns(log))

loghandle.write("\nTRIMMOMATIC\n")
loghandle.write(logsep)
log=[["version:", tversion]]


#create trimmomatic command line
cmd = []

if args.illuminaclip != False:
    log.append(["ILLUMINACLIP:", args.illuminaclip])
    cmd.append("ILLUMINACLIP:" + args.illuminaclip)
else:
    log.append(["ILLUMINACLIP:", 'off'])

if args.slidingwindow != None:
    log.append(["SLIDINGWINDOW:", args.slidingwindow])
    cmd.append("SLIDINGWINDOW:" + args.slidingwindow)
elif args.maxinfo != None:
    log.append(["MAXINFO:", args.maxinfo])
    cmd.append("MAXINFO:" + args.maxinfo)

if args.leading > 0:
    log.append(["LEADING:", args.leading])
    cmd.append("LEADING:" + str(args.leading))
else:
    log.append(["LEADING:", 'off'])

if args.trailing > 0:
    log.append(["TRAILING:", args.trailing])
    cmd.append("TRAILING:" + str(args.trailing))
else:
    log.append(["TRAILING:", 'off'])

if args.crop != False:
    log.append(["CROP:", args.crop])
    loghandle.write(str(args.crop) + "\n")
else:
    log.append(["CROP:", 'off'])

if args.headcrop != False:
    log.append(["HEADCROP:", args.crop])
    cmd.append("HEADCROP:" + str(args.headcrop))
else:
    log.append(["HEADCROP:", 'off'])

if args.minlen > 0:
    log.append(["MINLEN:", args.minlen])
    cmd.append("MINLEN:" + str(args.minlen))
else:
    log.append(["MINLEN:", 'off'])

if args.avgqual > 0:
    log.append(["AVGQUAL:", args.avgqual])
    cmd.append("AVGQUAL:" + str(args.avgqual))
else:
    log.append(["AVGQUAL:", 'off'])

if args.tophred33 != False:
    log.append(["to PHRED33 conversion:", 'on'])
    cmd.append("TOPHRED33")
else:
    log.append(["to PHRED33 conversion:", 'off'])

if args.tophred64 != False:
    log.append(["to PHRED64 conversion:", 'on'])
    cmd.append("TOPHRED64")
else:
    log.append(["to PHRED64 conversion:", 'off'])

loghandle.write(formatColumns(log))

#performing trimmomatic
loghandle.write("\nTRIM RESULTS\n")
loghandle.write(logsep)
log = [["forward file", "reverse file", "detected enoding", "input read pairs", "both surviving", "forward only surviving", "reverse only surviving", "dropped", "forward paired read file", "forward unpaired read file", "reverse paired read file", "reverse unpaired read file"]]
i = 0
while i*2 < len(files):
    print("PROCESSING FILE PAIR " + str(i+1) + " OF " + str(len(files)/2))
    print("  forward file: " + files[i*2])
    print("  reverse file: " + files[i*2+1])

    ext=str(os.path.splitext(files[i*2]))
    name=str(os.path.basename(files[i*2]))
    name=name[:-(len(ext[1])+2)]
    paired1=targetpath + '/' + name + '_paired.fq.gz'
    unpaired1=targetpath + '/' + name + '_unpaired.fq.gz'

    ext=str(os.path.splitext(files[i*2+1]))
    name=str(os.path.basename(files[i*2+1]))
    name=name[:-(len(ext[1])+2)]
    paired2=targetpath + '/' + name + '_paired.fq.gz'
    unpaired2=targetpath + '/' + name + '_unpaired.fq.gz'

    cli = [cmdTrimPE, "-threads", str(args.t), files[i*2], files[i*2+1], paired1, unpaired1, paired2, unpaired2]
    cli.extend(cmd)

    print("  trimming...")
    try:
        proc=subprocess.Popen(cli, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except subprocess.CalledProcessError:
        print(subprocess.CalledProcessError)
    except OSError:
        print(OSError)	

    out=proc.communicate()[1] #unfortunatley trimmomatic prints all in stderr and is not using stdout

    if proc.returncode > 0:
        exit(1)
    else:
        out = out.split("\n")
        phred = re.search('phred\d*', out[1])
        matches = re.findall('\d+ *[\d\(\)%\.,]*', out[-3])
        log.append([files[i*2], files[i*2+1], phred.group(0), matches[0], matches[1], matches[2], matches[3], matches[4], os.path.basename(paired1), os.path.basename(unpaired1), os.path.basename(paired2), os.path.basename(unpaired2)])
    i += 1

loghandle.write(formatColumns(log))

#starting combineFastQC
cli = False
if args.fastqc:
    cli = 'batchFastQC.py -t ' + str(args.t) + ' ' + targetpath + '/*.gz'
elif args.fastqcpaired:	
    cli = 'batchFastQC.py -t ' + str(args.t) + ' ' + targetpath + '/*_paired.fq.gz'

if cli:
    print(cli)
    print("starting batchFastQC...\n\n")
    try:
        proc=subprocess.Popen(cli, shell=True)
    except subprocess.CalledProcessError:
        print(subprocess.CalledProcessError)
    except OSError:
        print(OSError)