File: GenerateConfig.py

package info (click to toggle)
shasta 0.14.0-1
links: PTS, VCS
area: main
in suites: forky, sid
size: 29,636 kB
sloc: cpp: 82,262; python: 2,348; makefile: 222; sh: 143
file content (170 lines) | stat: -rwxr-xr-x 5,022 bytes
parent folder | download | duplicates (4)
#!/usr/bin/python3

import configparser
import sys
import getopt
import os
import time

helpMessage = """
Usage:
    GenerateConfig.py

Note: The goal of this script is to generate a good starter configuration for the
Shasta long read assembler. 

Each genome is different and reads differ in quality. So it is impossible to generate a perfect
configuration programmatically. If you'd like to get some suggestions on how to tweak the
configuration generated by this script, you should use the `GetFeedback.py` script after 
you run your assembly.

If you're still not happy with the assembly after using these two scripts, please file a Github
issue and we can help.
"""

def usage():
    print(helpMessage)
    return


def generateConfig(
    genomeSize,
    baseCallerId,
    enableDetangling,
    areUltraLongReads,
):
    minReadLength = 10000
    if areUltraLongReads:
        minReadLength = 40000
    
    # The following Align parameters are set to very permissive values to allow the majority of alignments
    # to be assessed during the initial stage of automatic alignment parameter selection
    maxSkip = 100
    maxDrift = 100
    maxTrim = 100
    minAlignedMarkerCount = 10
    minAlignedFraction = 0.1
    # This method uses the observed distribution of alignment stats to choose a cutoff for
    # maxSkip, maxDrift, maxTrim, minAlignedMarkerCount, and minAlignedFraction
    readGraphCreationMethod = 2

    config = configparser.ConfigParser()
    config.optionxform = lambda option: option
    config['Reads'] = {
        'minReadLength': str(minReadLength),
        'desiredCoverage': str(genomeSize * 60), # 60X coverage
        'noCache': 'True',
    }
    config['Kmers'] = {
        'k': '10'
    }
    config['MinHash'] = {
        'minHashIterationCount': '10',
        'minBucketSize': '5',
        'maxBucketSize': '30',
        'minFrequency': '5',
    }
    config['Align'] = {
        'alignMethod': '3',
        'downsamplingFactor': '0.05',
        'matchScore': '6',
        'maxSkip': str(maxSkip),
        'maxDrift': str(maxDrift),
        'maxTrim': str(maxTrim),
        'minAlignedFraction': str(minAlignedFraction),
        'minAlignedMarkerCount': str(minAlignedMarkerCount),
        'sameChannelReadAlignment.suppressDeltaThreshold': '30',
    }
    config['ReadGraph'] = {
        'creationMethod': str(readGraphCreationMethod),
    }
    config['MarkerGraph'] = {
        'simplifyMaxLength': '10,100,1000,10000,100000',
        'refineThreshold': '6',
        'crossEdgeCoverageThreshold': '3',
        'minCoverage': '0', # Auto select
    }
    config['Assembly'] = {
        'consensusCaller': 'Bayesian:guppy-3.6.0-a'    
    }

    if baseCallerId == 2 or baseCallerId == 3:
        # Guppy version less than 3.6.0
        config['Reads']['desiredCoverage'] = str(genomeSize * 80) # 80X coverage
        if baseCallerId == 2:
            config['Assembly']['consensusCaller'] = 'Bayesian:guppy-3.0.5-a'
        else:
            config['Assembly']['consensusCaller'] = 'Modal'
    
    if enableDetangling:
        config['Assembly']['detangleMethod'] = '2'
    
    return config


def main(argv):
    genomeSizeMessage = """
What is the approximate genome size in megabases (Mbp)?
    Examples:
        3000 (for a 3 Gbp genome)
        0.4  (for a 400 Kbp genome)
    """
    print(genomeSizeMessage)
    genomeSize = float(input('Approximate genome size in megabasis (Mbp): '))

    techMessage = """
What technology was used to generate the reads?
    Enter 1 for Oxford Nanopore (ONT) (default)
    Enter 2 for Pac Bio
    """
    print(techMessage)
    techStr = input('Technology : ')
    if techStr == '':
        techId = 1
    else:
        techId = int(techStr)
    
    baseCallerMessage = """
Which basecaller was used to generate reads?
    Enter 1 for Guppy version >= 3.6.0 (default)
    Enter 2 for Guppy version < 3.6.0
    Enter 3 if something other than Guppy was used 
    """

    if techId == 1: # Only applies to ONT reads.
        print(baseCallerMessage)
        baseCallerIdStr = input('Basecaller : ')
        if baseCallerIdStr == '':
            baseCallerId = 1
        else:
            baseCallerId = int(baseCallerIdStr)
    else:
        baseCallerId = 3

    print()

    areUltraLongReads = input('Using Ultra-long reads? [Y/n - default n] : ')
    areUltraLongReads = (areUltraLongReads == 'Y' or areUltraLongReads == 'y')

    print()

    enableDetangling = input('Enable detangling? [Y/n - default Y] : ')
    enableDetangling = (enableDetangling == 'Y' or enableDetangling == 'y' or enableDetangling == '')

    config = generateConfig(
        int(genomeSize * 1000 * 1000),
        baseCallerId,
        enableDetangling,
        areUltraLongReads,
    )

    configFileName = 'generatedShasta.conf'
    with open(configFileName, 'w') as configfile:
        config.write(configfile)

    print("Shasta configuration written out to {}".format(configFileName))
    return


if __name__ == '__main__':
    main(sys.argv[1:])