File: GenerateFeedback.py

package info (click to toggle)
shasta 0.14.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 29,636 kB
  • sloc: cpp: 82,262; python: 2,348; makefile: 222; sh: 143
file content (137 lines) | stat: -rwxr-xr-x 4,900 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#!/usr/bin/python3

import configparser
import sys
import getopt
import json
import csv

helpMessage = """
Usage:
    GenerateFeedback.py --assemblyDirectory /path/to/assemblyDirectory

Note: The goal of this script is to provide feedback on a de novo assembly done using the
Shasta long read assembler.

Each genome is different and reads differ in quality. So it is likely that you will need
to repeat the `assembly -> feedback -> assembly` process a few times.

If you're not happy with the assembly after a few (3-4) tries, please file a Github issue
and we can help.
"""

def usage():
    print(helpMessage)
    return


def loadAssemblySummary(assemblyDirPath):
    assemblySummary = {}
    assemblySummaryFile = '{}/AssemblySummary.json'.format(assemblyDirPath)
    with open(assemblySummaryFile) as jsonFile:
        assemblySummary = json.load(jsonFile)

    return assemblySummary


def analyze(assemblyDirPath, genomeSize):
    assemblySummary = loadAssemblySummary(assemblyDirPath)
    readsUsedInAssembly = assemblySummary['Reads used in this assembly']
    numberOfReads = readsUsedInAssembly['Number of reads']

    readGraph = assemblySummary['Read graph']
    isolatedReadsFraction = float(readGraph['Isolated reads fraction']['Reads'])

    alignments = assemblySummary['Alignments']
    numberOfAlignmentCandidates = alignments['Number of alignment candidates found by the LowHash algorithm']
    numberOfGoodAlignments = alignments['Number of good alignments']

    assembledSegments = assemblySummary['Assembled segments']
    totalAssembledLength = assembledSegments['Total assembled segment length']
    longestSegmentLength = assembledSegments['Longest assembled segment length']
    segmentsN50 = assembledSegments['Assembled segments N50']

    print()
    print('Number of reads used = {}'.format(numberOfReads))
    print('Isolated reads fraction = {:.2f}'.format(isolatedReadsFraction))
    print('Number of alignment candidates = {}'.format(numberOfAlignmentCandidates))
    print('Number of good alignments = {}'.format(numberOfGoodAlignments))
    print()
    print('Genome fraction assembled = {:.2f} %'.format(totalAssembledLength * 100 / genomeSize))
    print('Longest assembled segment length = {}'.format(longestSegmentLength))
    print('Assembled segments N50 = {}'.format(segmentsN50))
    print()

    avgCandidatesPerRead = numberOfAlignmentCandidates / numberOfReads

    config = getConfig(assemblyDirPath)
    minHashConfig = config['MinHash']
    
    print('Feedback, if any:')

    if (avgCandidatesPerRead < 20):
        print('MinHash phase did not generate enough alignment candidates.')
        print('Try the following in order:')
        print('  (Suggestion) Increase `MinHash.minHashIterationCount` by 10, up to a maximum of 100.')
        if (int(minHashConfig['m']) == 4):
            print('  (Suggestion) Decrease `MinHash.m` to 3.')
    
    else:
        # Enough promising candidate pairs were generated
        avgGoodAlignmentsPerRead = numberOfGoodAlignments / numberOfReads
        if (avgGoodAlignmentsPerRead < 5 or isolatedReadsFraction > 0.5):
            # ... but not enough candidates met the bar of what is considered a good alignment.
            msg = (
                'Not enough good alignments were generated per read. '
                'Try relaxing the definition of what makes a good alignment.'
            )
            print(msg)
            print('Try the following in order:')
            print('  (Suggestion) Decrease `Align.minAlignedFraction` by 0.05, up to a minimum of 0.2.')
            print('  (Suggestion) Decrease `Align.minAlignedMarkerCount` by 20, up to a minimum of 200.')
            print('  (Suggestion) Increase `Align.maxSkip` & `Align.maxDrift` by 10, to allow for larger gaps in alignments.')
        

def getConfig(assemblyDirPath):
    config = configparser.ConfigParser()
    configFilePath = '{}/shasta.conf'.format(assemblyDirPath)
    if not config.read(configFilePath):
        raise Exception('Error reading config file {}.'.format(configFilePath))
    return config


def main(argv):
    assemblyDirPath = ""
    try:
      opts, args = getopt.getopt(argv,"", ["assemblyDirectory="])
    except getopt.GetoptError:
        usage()
        sys.exit(2)

    for opt, arg in opts:
      if opt in ("--assemblyDirectory"):
         assemblyDirPath = arg

    if assemblyDirPath == "":
        usage()
        exit(2)

    print()

    genomeSizeMessage = """
What is the approximate genome size in megabases (Mbp)?
    Examples:
        3000 (for a 3 Gbp genome)
        0.4  (for a 400 Kbp genome)
    """
    print(genomeSizeMessage)
    genomeSize = float(input('Approximate genome size in megabasis (Mbp): '))

    genomeSize = int(genomeSize * 1000 * 1000)
    analyze(assemblyDirPath, genomeSize)
    
    print()
    return

if __name__ == '__main__':
    main(sys.argv[1:])