1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93
|
#!/usr/bin/python3
from collections import defaultdict
from operator import itemgetter
from string import rstrip
seqlens = defaultdict(int)
from sys import stdin
from sys import argv
sizes = dict()
outsizes = defaultdict(int)
i1 = open(argv[1],'r')
i2 = open(argv[2],'r')
o1 = open(argv[3],'r')
o2 = open(argv[4],'r')
for f in [i1,i2]:
count = 0
for line in f:
if line.startswith("@") and (count == 4 or count == 0):
count = 0
ins_len = int(line.split('/')[1])
sizes[rstrip(line)] = ins_len
count += 1
i1.close()
i2.close()
for f in [o1,o2]:
next_line_seq = False
count = 0
pid = ''
for line in f:
if line.startswith("@") and (count == 4 or count == 0):
count = 0
pid = rstrip(line)
next_line_seq = True
count += 1
if next_line_seq and count == 2:
next_line_seq == False
outsizes[pid] = len(line)
o1.close()
o2.close()
FP = 0 # trimmed genomic bases
FN = 0 # left adapter behind
TN = 0 # left genomic bases behind
TP = 0 # trimmed adapter
min_len = 30
max_len = 100
#sens = tp/(tp+fn)
#spec = tn/(tn+fp)
for (pid,truelen) in sizes.items():
outlen = outsizes[pid]
## if truelen >= min_len:
if truelen < outlen:
TP += max_len - outlen #trimmed difference between outlen and orilen
FP += 0 #didn't trim any genomic bases
TN += truelen #left length genomic bases
FN += outlen - truelen #difference is adapter
if outlen == truelen:
TP += max_len - outlen
FP += 0
TN += truelen
FN += 0 #got all adapter
if truelen > outlen:
TP += max_len - min(max_len, truelen) #trimmed all of adapter
FP += min(max_len,truelen) - outlen #trimmed the difference of genomic
TN += outlen #didnt trim outlen genomic bases
FN += 0 #since the true length is > outlength there is no adapter left behind
## else:
## if outlen > 0:
## FP += 0 # trimmed genomic bases
## FN += max(outlen - truelen,0) # left adapter behind
## TN += 0 # left genomic bases behind
## TP += max_len - outlen # trimmed adapter
## if outlen == 0:
## FP += 0 # trimmed genomic bases
## FN += 0 # left adapter behind
## TN += 0 # left genomic bases behind
## TP += max_len - truelen # trimmed adapter
print(("Adapter Trimmed:%d" % (TP) ))
print(("Adapter Missed:%d" % (FN) ))
print(("Adapter Total:%d" % (FN+TP) ))
print(("Genomic Bases Trimmed:%d" % (FP) ))
print(("Genomic Bases Total:%d" % (FP + TN) ))
print(("Adapter Trimming Sensitivity: %f" % (float(TP)/(float(TP)+float(FN))) ))
print(("Adapter Trimming Specificity: %f" % (float(TN)/(float(TN)+float(FP))) ))
|