1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244
|
# Copyright 2013-2017, The James Hutton Insitute
# Author: Leighton Pritchard
#
# This code is part of the pyani package, and is governed by its licence.
# Please see the LICENSE file that should have been included as part of
# this package.
"""Code to implement the ANIm average nucleotide identity method.
Calculates ANI by the ANIm method, as described in Richter et al (2009)
Proc Natl Acad Sci USA 106: 19126-19131 doi:10.1073/pnas.0906412106.
All input FASTA format files are compared against each other, pairwise,
using NUCmer (binary location must be provided). NUCmer output will be stored
in a specified output directory.
The NUCmer .delta file output is parsed to obtain an alignment length
and similarity error count for every unique region alignment. These are
processed to give matrices of aligned sequence lengths, similarity error
counts, average nucleotide identity (ANI) percentages, and minimum aligned
percentage (of whole genome) for each pairwise comparison.
"""
import os
from . import pyani_config
from . import pyani_files
from . import pyani_jobs
from .pyani_tools import ANIResults
# Generate list of Job objects, one per NUCmer run
def generate_nucmer_jobs(
filenames,
outdir=".",
nucmer_exe=pyani_config.NUCMER_DEFAULT,
filter_exe=pyani_config.FILTER_DEFAULT,
maxmatch=False,
jobprefix="ANINUCmer",
):
"""Return a list of Jobs describing NUCmer command-lines for ANIm
- filenames - a list of paths to input FASTA files
- outdir - path to output directory
- nucmer_exe - location of the nucmer binary
- maxmatch - Boolean flag indicating to use NUCmer's -maxmatch option
Loop over all FASTA files, generating Jobs describing NUCmer command lines
for each pairwise comparison.
"""
ncmds, fcmds = generate_nucmer_commands(
filenames, outdir, nucmer_exe, filter_exe, maxmatch
)
joblist = []
for idx, ncmd in enumerate(ncmds):
njob = pyani_jobs.Job("%s_%06d-n" % (jobprefix, idx), ncmd)
fjob = pyani_jobs.Job("%s_%06d-f" % (jobprefix, idx), fcmds[idx])
fjob.add_dependency(njob)
# joblist.append(njob) # not required: dependency in fjob
joblist.append(fjob)
return joblist
# Generate list of NUCmer pairwise comparison command lines from
# passed sequence filenames
def generate_nucmer_commands(
filenames,
outdir=".",
nucmer_exe=pyani_config.NUCMER_DEFAULT,
filter_exe=pyani_config.FILTER_DEFAULT,
maxmatch=False,
):
"""Return a tuple of lists of NUCmer command-lines for ANIm
The first element is a list of NUCmer commands, the second a list
of delta_filter_wrapper.py commands. These are ordered such that
commands are paired. The NUCmer commands should be run before
the delta-filter commands.
- filenames - a list of paths to input FASTA files
- outdir - path to output directory
- nucmer_exe - location of the nucmer binary
- maxmatch - Boolean flag indicating to use NUCmer's -maxmatch option
Loop over all FASTA files generating NUCmer command lines for each
pairwise comparison.
"""
nucmer_cmdlines, delta_filter_cmdlines = [], []
for idx, fname1 in enumerate(filenames[:-1]):
for fname2 in filenames[idx + 1 :]:
ncmd, dcmd = construct_nucmer_cmdline(
fname1, fname2, outdir, nucmer_exe, filter_exe, maxmatch
)
nucmer_cmdlines.append(ncmd)
delta_filter_cmdlines.append(dcmd)
return (nucmer_cmdlines, delta_filter_cmdlines)
# Generate single NUCmer pairwise comparison command line from pair of
# input filenames
def construct_nucmer_cmdline(
fname1,
fname2,
outdir=".",
nucmer_exe=pyani_config.NUCMER_DEFAULT,
filter_exe=pyani_config.FILTER_DEFAULT,
maxmatch=False,
):
"""Returns a tuple of NUCmer and delta-filter commands
The split into a tuple was made necessary by changes to SGE/OGE. The
delta-filter command must now be run as a dependency of the NUCmer
command, and be wrapped in a Python script to capture STDOUT.
NOTE: This command-line writes output data to a subdirectory of the passed
outdir, called "nucmer_output".
- fname1 - query FASTA filepath
- fname2 - subject FASTA filepath
- outdir - path to output directory
- maxmatch - Boolean flag indicating whether to use NUCmer's -maxmatch
option. If not, the -mum option is used instead
"""
outsubdir = os.path.join(outdir, pyani_config.ALIGNDIR["ANIm"])
outprefix = os.path.join(
outsubdir,
"%s_vs_%s"
% (
os.path.splitext(os.path.split(fname1)[-1])[0],
os.path.splitext(os.path.split(fname2)[-1])[0],
),
)
if maxmatch:
mode = "--maxmatch"
else:
mode = "--mum"
nucmercmd = "{0} {1} -p {2} {3} {4}".format(
nucmer_exe, mode, outprefix, fname1, fname2
)
filtercmd = "delta_filter_wrapper.py " + "{0} -1 {1} {2}".format(
filter_exe, outprefix + ".delta", outprefix + ".filter"
)
return (nucmercmd, filtercmd)
# return "{0}; {1}".format(nucmercmd, filtercmd)
# Parse NUCmer delta file to get total alignment length and total sim_errors
def parse_delta(filename):
"""Returns (alignment length, similarity errors) tuple from passed .delta.
- filename - path to the input .delta file
Extracts the aligned length and number of similarity errors for each
aligned uniquely-matched region, and returns the cumulative total for
each as a tuple.
"""
aln_length, sim_errors = 0, 0
for line in [_.strip().split() for _ in open(filename, "r").readlines()]:
if line[0] == "NUCMER" or line[0].startswith(">"): # Skip headers
continue
# We only process lines with seven columns:
if len(line) == 7:
aln_length += abs(int(line[1]) - int(line[0]))
sim_errors += int(line[4])
return aln_length, sim_errors
# Parse all the .delta files in the passed directory
def process_deltadir(delta_dir, org_lengths, logger=None):
"""Returns a tuple of ANIm results for .deltas in passed directory.
- delta_dir - path to the directory containing .delta files
- org_lengths - dictionary of total sequence lengths, keyed by sequence
Returns the following pandas dataframes in an ANIResults object;
query sequences are rows, subject sequences are columns:
- alignment_lengths - symmetrical: total length of alignment
- percentage_identity - symmetrical: percentage identity of alignment
- alignment_coverage - non-symmetrical: coverage of query and subject
- similarity_errors - symmetrical: count of similarity errors
May throw a ZeroDivisionError if one or more NUCmer runs failed, or a
very distant sequence was included in the analysis.
"""
# Process directory to identify input files - as of v0.2.4 we use the
# .filter files that result from delta-filter (1:1 alignments)
deltafiles = pyani_files.get_input_files(delta_dir, ".filter")
# Hold data in ANIResults object
results = ANIResults(list(org_lengths.keys()), "ANIm")
# Fill diagonal NA values for alignment_length with org_lengths
for org, length in list(org_lengths.items()):
results.alignment_lengths[org][org] = length
# Process .delta files assuming that the filename format holds:
# org1_vs_org2.delta
for deltafile in deltafiles:
qname, sname = os.path.splitext(os.path.split(deltafile)[-1])[0].split("_vs_")
# We may have .delta files from other analyses in the same directory
# If this occurs, we raise a warning, and skip the .delta file
if qname not in list(org_lengths.keys()):
if logger:
logger.warning(
"Query name %s not in input " % qname
+ "sequence list, skipping %s" % deltafile
)
continue
if sname not in list(org_lengths.keys()):
if logger:
logger.warning(
"Subject name %s not in input " % sname
+ "sequence list, skipping %s" % deltafile
)
continue
tot_length, tot_sim_error = parse_delta(deltafile)
if tot_length == 0 and logger is not None:
if logger:
logger.warning(
"Total alignment length reported in " + "%s is zero!" % deltafile
)
query_cover = float(tot_length) / org_lengths[qname]
sbjct_cover = float(tot_length) / org_lengths[sname]
# Calculate percentage ID of aligned length. This may fail if
# total length is zero.
# The ZeroDivisionError that would arise should be handled
# Common causes are that a NUCmer run failed, or that a very
# distant sequence was included in the analysis.
try:
perc_id = 1 - float(tot_sim_error) / tot_length
except ZeroDivisionError:
perc_id = 0 # set arbitrary value of zero identity
results.zero_error = True
# Populate dataframes: when assigning data from symmetrical MUMmer
# output, both upper and lower triangles will be populated
results.add_tot_length(qname, sname, tot_length)
results.add_sim_errors(qname, sname, tot_sim_error)
results.add_pid(qname, sname, perc_id)
results.add_coverage(qname, sname, query_cover, sbjct_cover)
return results
|