1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295
|
#!/usr/bin/python3
# -*- coding: utf-8 -*-
#created by Stephan Fuchs (FG13, RKI), 2015
#
# Wrapper to simplify calls to trimmomatic
#config
version = '0.0.9'
logsep = '=====================================\n'
cmdTrimPE = "TrimmomaticPE"
#dependencies
try:
import argparse
except ImportError:
print('The "argparse" module is not available. Use Python 3.2 or better.')
sys.exit(1)
import os
import re
import sys
import time
import shutil
import subprocess
#processing command line arguments
parser = argparse.ArgumentParser(prog="BATCH_TRIM", description='uses trimmomatic on multiple paired-end read file pairs')
parser.add_argument('file', metavar="FILE(S)", help="input BAM file", type=argparse.FileType('r'), nargs='+')
parser.add_argument('-t', metavar='INT', help="Number of threads/CPUs that should be used. Default is 4." , type=int, action="store", default=4)
arg0group = parser.add_mutually_exclusive_group(required=False)
arg0group.add_argument('--fastqc', help="If set combineFastQC is performed on all trimmed read files.", action="store_true", default=False)
arg0group.add_argument('--fastqcpaired', help="If set combineFastQC is performed only on trimmed paired read files." , action="store_true", default=False)
parser.add_argument('--illuminaclip', metavar='STR', help="Cut adapter and other illumina-specific sequences. Pattern: <fasta containing adapters and sequences>:<maximum of allowed mismatches>:<accuracy for palindromic matches>:<accuracy for simple matches> (Example: NexteraPE-PE.fa:2:30:10). By default ILLUMINACLIP is deactivated." , type=str, action="store", default=False)
arg1group = parser.add_mutually_exclusive_group(required=False)
arg1group.add_argument('--slidingwindow', metavar='STR', help="Performs a sliding window trimming approach. Pattern: <window size>:<required quality> (Example: 4:15). By default slidingwindow 4.15 is used." , type=str, action="store")
arg1group.add_argument('--maxinfo', metavar='STR', help="Performs an adaptive quality trimming approach. Pattern: <target length>:<strictness> (Example: 15:0.5). By default maxinfo is not used." , type=str, action="store")
parser.add_argument('--leading', metavar='INT', help="Cut bases of the start of a read if below a given threshold quality. By default quality threshold is set to 3. Set 0 to turn deactivate." , type=int, action="store", default=3)
parser.add_argument('--trailing', metavar='INT', help="Cut bases of the end of a read if below a given threshold quality. By default quality threshold is set to 3. Set 0 to turn deactivate." , type=int, action="store", default=3)
parser.add_argument('--crop', metavar='INT', help="Cut read to a specific length by removing bases from the end. By default this is deactivated." , type=int, action="store", default=False)
parser.add_argument('--headcrop', metavar='INT', help="Cut read to a specific length by removing bases from the start. By default this is deactivated." , type=int, action="store", default=False)
parser.add_argument('--minlen', metavar='INT', help="Drop the read if it is below a defined length. By default minimal length is set to 36. Set 0 to deactivate." , type=int, action="store", default=36)
parser.add_argument('--avgqual', metavar='INT', help="Drop the read if average quality is below a speciefied level. By default this is deactivated." , type=int, action="store", default=0)
arg2group = parser.add_mutually_exclusive_group(required=False)
arg2group.add_argument('--tophred33', help="Convert quality scores to Phred-33. By default this is deactivated." , action="store_true", default=False)
arg2group.add_argument('--tophred64', help="Convert quality scores to Phred-64. By default this is deactivated." , action="store_true", default=False)
args = parser.parse_args()
if len(args.file) % float(2) != 0:
exit("ERROR: It is not possible to process odd number of files in PE mode")
if args.slidingwindow == None and args.maxinfo == None:
args.slidingwindow = "4:15"
#FUNCTIONS
def die (msg=False, path = False, status=1):
'''Displays message (optional) and exits using the defined status code. If a path is given, this path is deleted (recursively)'''
if msg:
print(msg)
if path:
shutil.rmtree(path)
sys.exit(status)
def formatColumns(rows, empty='', space=5):
'''formats nested list (1. level = rows; 2. level = columns) in equal-sized columns. fields containing integers or floats are aligned to the right'''
#allowed signs in numbers
allowed = ['+', '-', '.', ',', '%']
#str conversion
rows = [[str(x) for x in row] for row in rows]
#fill up rows to same number of columns (important for transposing)
maxlen = max([len(x) for x in rows])
[x.extend([empty]*(maxlen-len(x))) for x in rows]
align = []
output = [''] * len(rows)
for column in zip(*rows):
#detect numbers to align right
string = ''.join(column)
for i in allowed:
string = string.replace(i, '')
if string.strip('+-.,').isdigit():
align = 'r'
else:
align = 'l'
maxlen = max([len(x) for x in column])
for i in range(len(column)):
if align == 'r':
output[i] += ' ' * (maxlen - len(column[i])) + column[i] + ' ' * space
else:
output[i] += column[i] + ' ' * (maxlen - len(column[i])) + ' ' * space
return '\n'.join(output) + '\n'
#START
#create dir
err = 1
while err > 0:
timecode = time.strftime("%Y-%m-%d_%H-%M-%S")
targetpath = "BATCH_TRIM_" + timecode
if os.path.isdir(targetpath):
err += 1
time.sleep(1)
else:
os.mkdir(targetpath)
err = 0
if err == 30:
print("ERROR: giving up to find available output file name")
sys.exit(1)
#working
print("target path created: " + targetpath)
print(str(len(args.file)) + " files to process ...")
#filing
files = []
for f in args.file:
files.append(f.name)
f.close()
files.sort()
#get version of trimmomatic (unfortunately there is no --version option available)
try:
proc=subprocess.Popen(['apt-cache', 'policy', 'trimmomatic'], shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
except subprocess.CalledProcessError:
print(subprocess.CalledProcessError)
except OSError:
print(OSError)
if proc.returncode > 0:
print(proc.communicate()[1])
die(targetdir)
else:
out=proc.communicate()[0]
match = re.search('Installed: ([\d\.]*)', out)
if match == None:
print("trimmomatic version not recognized")
tversion='unknown'
else:
tversion= match.group(1)
print("trimmomatic version: " + tversion)
#start logging
loghandle = open(targetpath + "/batch_trim.log", 'w')
loghandle.write("BATCH TRIM\n")
loghandle.write(logsep)
log=[["version:", version], ['time:', timecode], ['result folder:', targetpath]]
loghandle.write(formatColumns(log))
loghandle.write("\nTRIMMOMATIC\n")
loghandle.write(logsep)
log=[["version:", tversion]]
#create trimmomatic command line
cmd = []
if args.illuminaclip != False:
log.append(["ILLUMINACLIP:", args.illuminaclip])
cmd.append("ILLUMINACLIP:" + args.illuminaclip)
else:
log.append(["ILLUMINACLIP:", 'off'])
if args.slidingwindow != None:
log.append(["SLIDINGWINDOW:", args.slidingwindow])
cmd.append("SLIDINGWINDOW:" + args.slidingwindow)
elif args.maxinfo != None:
log.append(["MAXINFO:", args.maxinfo])
cmd.append("MAXINFO:" + args.maxinfo)
if args.leading > 0:
log.append(["LEADING:", args.leading])
cmd.append("LEADING:" + str(args.leading))
else:
log.append(["LEADING:", 'off'])
if args.trailing > 0:
log.append(["TRAILING:", args.trailing])
cmd.append("TRAILING:" + str(args.trailing))
else:
log.append(["TRAILING:", 'off'])
if args.crop != False:
log.append(["CROP:", args.crop])
loghandle.write(str(args.crop) + "\n")
else:
log.append(["CROP:", 'off'])
if args.headcrop != False:
log.append(["HEADCROP:", args.crop])
cmd.append("HEADCROP:" + str(args.headcrop))
else:
log.append(["HEADCROP:", 'off'])
if args.minlen > 0:
log.append(["MINLEN:", args.minlen])
cmd.append("MINLEN:" + str(args.minlen))
else:
log.append(["MINLEN:", 'off'])
if args.avgqual > 0:
log.append(["AVGQUAL:", args.avgqual])
cmd.append("AVGQUAL:" + str(args.avgqual))
else:
log.append(["AVGQUAL:", 'off'])
if args.tophred33 != False:
log.append(["to PHRED33 conversion:", 'on'])
cmd.append("TOPHRED33")
else:
log.append(["to PHRED33 conversion:", 'off'])
if args.tophred64 != False:
log.append(["to PHRED64 conversion:", 'on'])
cmd.append("TOPHRED64")
else:
log.append(["to PHRED64 conversion:", 'off'])
loghandle.write(formatColumns(log))
#performing trimmomatic
loghandle.write("\nTRIM RESULTS\n")
loghandle.write(logsep)
log = [["forward file", "reverse file", "detected enoding", "input read pairs", "both surviving", "forward only surviving", "reverse only surviving", "dropped", "forward paired read file", "forward unpaired read file", "reverse paired read file", "reverse unpaired read file"]]
i = 0
while i*2 < len(files):
print("PROCESSING FILE PAIR " + str(i+1) + " OF " + str(len(files)/2))
print(" forward file: " + files[i*2])
print(" reverse file: " + files[i*2+1])
ext=str(os.path.splitext(files[i*2]))
name=str(os.path.basename(files[i*2]))
name=name[:-(len(ext[1])+2)]
paired1=targetpath + '/' + name + '_paired.fq.gz'
unpaired1=targetpath + '/' + name + '_unpaired.fq.gz'
ext=str(os.path.splitext(files[i*2+1]))
name=str(os.path.basename(files[i*2+1]))
name=name[:-(len(ext[1])+2)]
paired2=targetpath + '/' + name + '_paired.fq.gz'
unpaired2=targetpath + '/' + name + '_unpaired.fq.gz'
cli = [cmdTrimPE, "-threads", str(args.t), files[i*2], files[i*2+1], paired1, unpaired1, paired2, unpaired2]
cli.extend(cmd)
print(" trimming...")
try:
proc=subprocess.Popen(cli, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
except subprocess.CalledProcessError:
print(subprocess.CalledProcessError)
except OSError:
print(OSError)
out=proc.communicate()[1] #unfortunatley trimmomatic prints all in stderr and is not using stdout
if proc.returncode > 0:
exit(1)
else:
out = out.split("\n")
phred = re.search('phred\d*', out[1])
matches = re.findall('\d+ *[\d\(\)%\.,]*', out[-3])
log.append([files[i*2], files[i*2+1], phred.group(0), matches[0], matches[1], matches[2], matches[3], matches[4], os.path.basename(paired1), os.path.basename(unpaired1), os.path.basename(paired2), os.path.basename(unpaired2)])
i += 1
loghandle.write(formatColumns(log))
#starting combineFastQC
cli = False
if args.fastqc:
cli = 'batchFastQC.py -t ' + str(args.t) + ' ' + targetpath + '/*.gz'
elif args.fastqcpaired:
cli = 'batchFastQC.py -t ' + str(args.t) + ' ' + targetpath + '/*_paired.fq.gz'
if cli:
print(cli)
print("starting batchFastQC...\n\n")
try:
proc=subprocess.Popen(cli, shell=True)
except subprocess.CalledProcessError:
print(subprocess.CalledProcessError)
except OSError:
print(OSError)
|