File: batchTrim

package info (click to toggle)
trimmomatic 0.39%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, bullseye, forky, sid, trixie
  • size: 2,436 kB
  • sloc: java: 3,748; python: 218; xml: 68; sh: 31; makefile: 13
file content (295 lines) | stat: -rwxr-xr-x 11,228 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
#!/usr/bin/python3
# -*- coding: utf-8 -*-
#created by Stephan Fuchs (FG13, RKI), 2015
#
# Wrapper to simplify calls to trimmomatic

#config
version = '0.0.9'
logsep = '=====================================\n'
cmdTrimPE = "TrimmomaticPE"




#dependencies
try:
    import argparse
except ImportError:
    print('The "argparse" module is not available. Use Python 3.2 or better.')
    sys.exit(1)

import os
import re
import sys
import time
import shutil
import subprocess


#processing command line arguments
parser = argparse.ArgumentParser(prog="BATCH_TRIM", description='uses trimmomatic on multiple paired-end read file pairs')
parser.add_argument('file', metavar="FILE(S)", help="input BAM file", type=argparse.FileType('r'), nargs='+')
parser.add_argument('-t', metavar='INT', help="Number of threads/CPUs that should be used. Default is 4." , type=int, action="store", default=4)
arg0group = parser.add_mutually_exclusive_group(required=False)
arg0group.add_argument('--fastqc', help="If set combineFastQC is performed on all trimmed read files.", action="store_true", default=False)
arg0group.add_argument('--fastqcpaired', help="If set combineFastQC is performed only on trimmed paired read files." , action="store_true", default=False)
parser.add_argument('--illuminaclip', metavar='STR', help="Cut adapter and other illumina-specific sequences. Pattern: <fasta containing adapters and sequences>:<maximum of allowed mismatches>:<accuracy for palindromic matches>:<accuracy for simple matches> (Example: NexteraPE-PE.fa:2:30:10). By default ILLUMINACLIP is deactivated." , type=str, action="store", default=False)
arg1group = parser.add_mutually_exclusive_group(required=False)
arg1group.add_argument('--slidingwindow', metavar='STR', help="Performs a sliding window trimming approach. Pattern: <window size>:<required quality> (Example: 4:15). By default slidingwindow 4.15 is used." , type=str, action="store")
arg1group.add_argument('--maxinfo', metavar='STR', help="Performs an adaptive quality trimming approach. Pattern: <target length>:<strictness> (Example: 15:0.5). By default maxinfo is not used." , type=str, action="store")
parser.add_argument('--leading', metavar='INT', help="Cut bases of the start of a read if below a given threshold quality. By default quality threshold is set to 3. Set 0 to turn deactivate." , type=int, action="store", default=3)
parser.add_argument('--trailing', metavar='INT', help="Cut bases of the end of a read if below a given threshold quality. By default quality threshold is set to 3. Set 0 to turn deactivate." , type=int, action="store", default=3)
parser.add_argument('--crop', metavar='INT', help="Cut read to a specific length by removing bases from the end. By default this is deactivated." , type=int, action="store", default=False)
parser.add_argument('--headcrop', metavar='INT', help="Cut read to a specific length by removing bases from the start. By default this is deactivated." , type=int, action="store", default=False)
parser.add_argument('--minlen', metavar='INT', help="Drop the read if it is below a defined length. By default minimal length is set to 36. Set 0 to deactivate." , type=int, action="store", default=36)
parser.add_argument('--avgqual', metavar='INT', help="Drop the read if average quality is below a speciefied level. By default this is deactivated." , type=int, action="store", default=0)
arg2group = parser.add_mutually_exclusive_group(required=False)
arg2group.add_argument('--tophred33', help="Convert quality scores to Phred-33. By default this is deactivated." , action="store_true", default=False)
arg2group.add_argument('--tophred64', help="Convert quality scores to Phred-64. By default this is deactivated." , action="store_true", default=False)

args = parser.parse_args()


if len(args.file) % float(2) != 0:
    exit("ERROR: It is not possible to process odd number of files in PE mode")

if args.slidingwindow == None and args.maxinfo == None:
    args.slidingwindow = "4:15"

#FUNCTIONS
def die (msg=False, path = False, status=1):
    '''Displays message (optional) and exits using the defined status code. If a path is given, this path is deleted (recursively)'''
    if msg:
        print(msg)
    if path:
        shutil.rmtree(path)
    sys.exit(status)



def formatColumns(rows, empty='', space=5):
    '''formats nested list (1. level = rows; 2. level = columns) in equal-sized columns. fields containing integers or floats are aligned to the right'''

    #allowed signs in numbers
    allowed = ['+', '-', '.', ',', '%']

    #str conversion
    rows = [[str(x) for x in row] for row in rows]

    #fill up rows to same number of columns (important for transposing)
    maxlen = max([len(x) for x in rows])
    [x.extend([empty]*(maxlen-len(x))) for x in rows]
    align = []
    output = [''] * len(rows)
    for column in zip(*rows):
      #detect numbers to align right
      string = ''.join(column)
      for i in allowed:
        string = string.replace(i, '')
      if string.strip('+-.,').isdigit():
        align = 'r'
      else:
        align = 'l'

      maxlen = max([len(x) for x in column])
      for i in range(len(column)):
          if align == 'r':
              output[i] += ' ' * (maxlen - len(column[i])) + column[i] + ' ' * space
          else:
              output[i] += column[i] + ' ' * (maxlen - len(column[i])) + ' ' * space

    return '\n'.join(output) + '\n'

#START

#create dir
err = 1
while err > 0:
    timecode = time.strftime("%Y-%m-%d_%H-%M-%S")
    targetpath = "BATCH_TRIM_" + timecode
    if os.path.isdir(targetpath):
        err += 1
        time.sleep(1)
    else:
        os.mkdir(targetpath)
        err = 0
    if err == 30:
        print("ERROR: giving up to find available output file name")
        sys.exit(1)


#working
print("target path created:  " + targetpath)
print(str(len(args.file)) + " files to process ...")

#filing
files = []
for f in args.file:
    files.append(f.name)
    f.close()
files.sort()

#get version of trimmomatic (unfortunately there is no --version option available)
try:
    proc=subprocess.Popen(['apt-cache', 'policy', 'trimmomatic'], shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
except subprocess.CalledProcessError:
    print(subprocess.CalledProcessError)
except OSError:
    print(OSError)

if proc.returncode > 0:
  print(proc.communicate()[1])
  die(targetdir)
else:
  out=proc.communicate()[0]
  match = re.search('Installed: ([\d\.]*)', out)
  if match == None:
      print("trimmomatic version not recognized")
      tversion='unknown'
  else:
      tversion= match.group(1)
      print("trimmomatic version:  " + tversion)


#start logging
loghandle = open(targetpath + "/batch_trim.log", 'w')
loghandle.write("BATCH TRIM\n")
loghandle.write(logsep)
log=[["version:", version], ['time:', timecode], ['result folder:', targetpath]]
loghandle.write(formatColumns(log))

loghandle.write("\nTRIMMOMATIC\n")
loghandle.write(logsep)
log=[["version:", tversion]]


#create trimmomatic command line
cmd = []

if args.illuminaclip != False:
    log.append(["ILLUMINACLIP:", args.illuminaclip])
    cmd.append("ILLUMINACLIP:" + args.illuminaclip)
else:
    log.append(["ILLUMINACLIP:", 'off'])

if args.slidingwindow != None:
    log.append(["SLIDINGWINDOW:", args.slidingwindow])
    cmd.append("SLIDINGWINDOW:" + args.slidingwindow)
elif args.maxinfo != None:
    log.append(["MAXINFO:", args.maxinfo])
    cmd.append("MAXINFO:" + args.maxinfo)

if args.leading > 0:
    log.append(["LEADING:", args.leading])
    cmd.append("LEADING:" + str(args.leading))
else:
    log.append(["LEADING:", 'off'])

if args.trailing > 0:
    log.append(["TRAILING:", args.trailing])
    cmd.append("TRAILING:" + str(args.trailing))
else:
    log.append(["TRAILING:", 'off'])

if args.crop != False:
    log.append(["CROP:", args.crop])
    loghandle.write(str(args.crop) + "\n")
else:
    log.append(["CROP:", 'off'])

if args.headcrop != False:
    log.append(["HEADCROP:", args.crop])
    cmd.append("HEADCROP:" + str(args.headcrop))
else:
    log.append(["HEADCROP:", 'off'])

if args.minlen > 0:
    log.append(["MINLEN:", args.minlen])
    cmd.append("MINLEN:" + str(args.minlen))
else:
    log.append(["MINLEN:", 'off'])

if args.avgqual > 0:
    log.append(["AVGQUAL:", args.avgqual])
    cmd.append("AVGQUAL:" + str(args.avgqual))
else:
    log.append(["AVGQUAL:", 'off'])

if args.tophred33 != False:
    log.append(["to PHRED33 conversion:", 'on'])
    cmd.append("TOPHRED33")
else:
    log.append(["to PHRED33 conversion:", 'off'])

if args.tophred64 != False:
    log.append(["to PHRED64 conversion:", 'on'])
    cmd.append("TOPHRED64")
else:
    log.append(["to PHRED64 conversion:", 'off'])

loghandle.write(formatColumns(log))

#performing trimmomatic
loghandle.write("\nTRIM RESULTS\n")
loghandle.write(logsep)
log = [["forward file", "reverse file", "detected enoding", "input read pairs", "both surviving", "forward only surviving", "reverse only surviving", "dropped", "forward paired read file", "forward unpaired read file", "reverse paired read file", "reverse unpaired read file"]]
i = 0
while i*2 < len(files):
    print("PROCESSING FILE PAIR " + str(i+1) + " OF " + str(len(files)/2))
    print("  forward file: " + files[i*2])
    print("  reverse file: " + files[i*2+1])

    ext=str(os.path.splitext(files[i*2]))
    name=str(os.path.basename(files[i*2]))
    name=name[:-(len(ext[1])+2)]
    paired1=targetpath + '/' + name + '_paired.fq.gz'
    unpaired1=targetpath + '/' + name + '_unpaired.fq.gz'

    ext=str(os.path.splitext(files[i*2+1]))
    name=str(os.path.basename(files[i*2+1]))
    name=name[:-(len(ext[1])+2)]
    paired2=targetpath + '/' + name + '_paired.fq.gz'
    unpaired2=targetpath + '/' + name + '_unpaired.fq.gz'

    cli = [cmdTrimPE, "-threads", str(args.t), files[i*2], files[i*2+1], paired1, unpaired1, paired2, unpaired2]
    cli.extend(cmd)

    print("  trimming...")
    try:
        proc=subprocess.Popen(cli, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except subprocess.CalledProcessError:
        print(subprocess.CalledProcessError)
    except OSError:
        print(OSError)	

    out=proc.communicate()[1] #unfortunatley trimmomatic prints all in stderr and is not using stdout

    if proc.returncode > 0:
        exit(1)
    else:
        out = out.split("\n")
        phred = re.search('phred\d*', out[1])
        matches = re.findall('\d+ *[\d\(\)%\.,]*', out[-3])
        log.append([files[i*2], files[i*2+1], phred.group(0), matches[0], matches[1], matches[2], matches[3], matches[4], os.path.basename(paired1), os.path.basename(unpaired1), os.path.basename(paired2), os.path.basename(unpaired2)])
    i += 1

loghandle.write(formatColumns(log))

#starting combineFastQC
cli = False
if args.fastqc:
    cli = 'batchFastQC.py -t ' + str(args.t) + ' ' + targetpath + '/*.gz'
elif args.fastqcpaired:	
    cli = 'batchFastQC.py -t ' + str(args.t) + ' ' + targetpath + '/*_paired.fq.gz'

if cli:
    print(cli)
    print("starting batchFastQC...\n\n")
    try:
        proc=subprocess.Popen(cli, shell=True)
    except subprocess.CalledProcessError:
        print(subprocess.CalledProcessError)
    except OSError:
        print(OSError)