File: ComputeSyntheticStats.py

package info (click to toggle)
rapmap 0.15.0%2Bdfsg-3
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 6,352 kB
  • sloc: cpp: 48,826; ansic: 4,471; sh: 215; python: 82; makefile: 17
file content (100 lines) | stat: -rw-r--r-- 3,492 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
from __future__ import print_function
import argparse

def main(args):
    import pysam
    import sys

    printFP = args.printFalsePositives
    alnFile = pysam.AlignmentFile(args.input, 'r')
    totalNumReads = args.totalNumReads
    currQueryName = ''
    skipToNextAlignment = False

    readsWithTrueAln = 0
    readsSeen = 0

    truePos = 0
    falsePos = 0
    falseNeg = 0
    foundTrueAlignment = False
    ## True Neg are somewhat ill-defined in our context
    prevRec = None
    useNHTag = args.useNHTag
    NHTagSum = 0
    nhData = []
    for rec in alnFile:
        if rec.is_unmapped:
            continue
        qname = rec.qname[:-2]
        # If this is a new read, remember the name
        # and increment the counter
        if qname != currQueryName:
            readsSeen += 1
            if useNHTag:
                tagVal = rec.get_tag('NH')
                NHTagSum += tagVal
                nhData.append(tagVal)
            if readsSeen % 1000000 == 0:
                print("\r\rSaw {} reads --- thp = {:.2%}".format(readsSeen, \
                       float(truePos) / readsSeen), file=sys.stderr, end='')


            currQueryName = qname
            if not foundTrueAlignment:
                falsePos += 1
                if (printFP):
                    print(prevRec)
            skipToNextAlignment = False
            foundTrueAlignment = False

        prevRec = rec
        # If we already found the true hit
        # for this read, don't bother processing
        # this record
        if not skipToNextAlignment:
            # Process the record to find if its the true
            # hit
            currQueryName = qname
            trueTxpName = qname.split(':')[2]
            alignedTxpName = alnFile.getrname(rec.rname)
            if (trueTxpName == alignedTxpName):
                truePos += 1
                skipToNextAlignment = True
                foundTrueAlignment = True

    falseNeg = totalNumReads - readsSeen
    print('\n'.join(["Total Reads = {}" ,
          "Reads Aligned  = {}" ,
          "True Pos = {}" ,
          "False Pos = {}" ,
          "False Neg = {}" ,
          "Precision = {:.2%}",
          "Recall = {:.2%}" ,
          "TPH = {:.2%}",
          "FDR = {:.2%}",
          "F1 Score = {:.2%}"]).format(totalNumReads, readsSeen,
                                  truePos, falsePos,
                                  falseNeg, truePos / float(truePos + falsePos),
                                  truePos / float(truePos + falseNeg),
                                  truePos / float(readsSeen),
                                  falsePos / float(falsePos + truePos),
                                  (2*truePos) / float(2*truePos + falsePos + falseNeg)),
          file=sys.stderr)

    if useNHTag:
        print("Average hits-per-read = {:.2}".format(
              NHTagSum / float(totalNumReads)), file=sys.stderr)
        if args.nhFreqFile:
            import pickle
            pickle.dump(nhData, args.nhFreqFile)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Compute statistics from synthetic SAM file.')
    parser.add_argument('--input', type=str)
    parser.add_argument('--totalNumReads', type=int)
    parser.add_argument('--useNHTag', action='store_true')
    parser.add_argument('--nhFreqFile', nargs='?', type=argparse.FileType('w'))
    parser.add_argument('--printFalsePositives', action='store_true')
    args = parser.parse_args()
    main(args)