1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137
|
#!/usr/bin/python3
import configparser
import sys
import getopt
import json
import csv
helpMessage = """
Usage:
GenerateFeedback.py --assemblyDirectory /path/to/assemblyDirectory
Note: The goal of this script is to provide feedback on a de novo assembly done using the
Shasta long read assembler.
Each genome is different and reads differ in quality. So it is likely that you will need
to repeat the `assembly -> feedback -> assembly` process a few times.
If you're not happy with the assembly after a few (3-4) tries, please file a Github issue
and we can help.
"""
def usage():
print(helpMessage)
return
def loadAssemblySummary(assemblyDirPath):
assemblySummary = {}
assemblySummaryFile = '{}/AssemblySummary.json'.format(assemblyDirPath)
with open(assemblySummaryFile) as jsonFile:
assemblySummary = json.load(jsonFile)
return assemblySummary
def analyze(assemblyDirPath, genomeSize):
assemblySummary = loadAssemblySummary(assemblyDirPath)
readsUsedInAssembly = assemblySummary['Reads used in this assembly']
numberOfReads = readsUsedInAssembly['Number of reads']
readGraph = assemblySummary['Read graph']
isolatedReadsFraction = float(readGraph['Isolated reads fraction']['Reads'])
alignments = assemblySummary['Alignments']
numberOfAlignmentCandidates = alignments['Number of alignment candidates found by the LowHash algorithm']
numberOfGoodAlignments = alignments['Number of good alignments']
assembledSegments = assemblySummary['Assembled segments']
totalAssembledLength = assembledSegments['Total assembled segment length']
longestSegmentLength = assembledSegments['Longest assembled segment length']
segmentsN50 = assembledSegments['Assembled segments N50']
print()
print('Number of reads used = {}'.format(numberOfReads))
print('Isolated reads fraction = {:.2f}'.format(isolatedReadsFraction))
print('Number of alignment candidates = {}'.format(numberOfAlignmentCandidates))
print('Number of good alignments = {}'.format(numberOfGoodAlignments))
print()
print('Genome fraction assembled = {:.2f} %'.format(totalAssembledLength * 100 / genomeSize))
print('Longest assembled segment length = {}'.format(longestSegmentLength))
print('Assembled segments N50 = {}'.format(segmentsN50))
print()
avgCandidatesPerRead = numberOfAlignmentCandidates / numberOfReads
config = getConfig(assemblyDirPath)
minHashConfig = config['MinHash']
print('Feedback, if any:')
if (avgCandidatesPerRead < 20):
print('MinHash phase did not generate enough alignment candidates.')
print('Try the following in order:')
print(' (Suggestion) Increase `MinHash.minHashIterationCount` by 10, up to a maximum of 100.')
if (int(minHashConfig['m']) == 4):
print(' (Suggestion) Decrease `MinHash.m` to 3.')
else:
# Enough promising candidate pairs were generated
avgGoodAlignmentsPerRead = numberOfGoodAlignments / numberOfReads
if (avgGoodAlignmentsPerRead < 5 or isolatedReadsFraction > 0.5):
# ... but not enough candidates met the bar of what is considered a good alignment.
msg = (
'Not enough good alignments were generated per read. '
'Try relaxing the definition of what makes a good alignment.'
)
print(msg)
print('Try the following in order:')
print(' (Suggestion) Decrease `Align.minAlignedFraction` by 0.05, up to a minimum of 0.2.')
print(' (Suggestion) Decrease `Align.minAlignedMarkerCount` by 20, up to a minimum of 200.')
print(' (Suggestion) Increase `Align.maxSkip` & `Align.maxDrift` by 10, to allow for larger gaps in alignments.')
def getConfig(assemblyDirPath):
config = configparser.ConfigParser()
configFilePath = '{}/shasta.conf'.format(assemblyDirPath)
if not config.read(configFilePath):
raise Exception('Error reading config file {}.'.format(configFilePath))
return config
def main(argv):
assemblyDirPath = ""
try:
opts, args = getopt.getopt(argv,"", ["assemblyDirectory="])
except getopt.GetoptError:
usage()
sys.exit(2)
for opt, arg in opts:
if opt in ("--assemblyDirectory"):
assemblyDirPath = arg
if assemblyDirPath == "":
usage()
exit(2)
print()
genomeSizeMessage = """
What is the approximate genome size in megabases (Mbp)?
Examples:
3000 (for a 3 Gbp genome)
0.4 (for a 400 Kbp genome)
"""
print(genomeSizeMessage)
genomeSize = float(input('Approximate genome size in megabasis (Mbp): '))
genomeSize = int(genomeSize * 1000 * 1000)
analyze(assemblyDirPath, genomeSize)
print()
return
if __name__ == '__main__':
main(sys.argv[1:])
|