File: last-tutorial.html

package info (click to toggle)
last-align 1179-1
links: PTS, VCS
area: main
in suites: bullseye
size: 4,004 kB
sloc: cpp: 43,317; python: 3,352; ansic: 1,874; makefile: 495; sh: 305
file content (740 lines) | stat: -rw-r--r-- 31,100 bytes
<?xml version="1.0" encoding="utf-8" ?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="generator" content="Docutils 0.6: http://docutils.sourceforge.net/" />
<title>LAST Cookbook</title>
<style type="text/css">

/*
:Author: David Goodger (goodger@python.org)
:Id: $Id: html4css1.css 5951 2009-05-18 18:03:10Z milde $
:Copyright: This stylesheet has been placed in the public domain.

Default cascading style sheet for the HTML output of Docutils.

See http://docutils.sf.net/docs/howto/html-stylesheets.html for how to
customize this style sheet.
*/

/* used to remove borders from tables and images */
.borderless, table.borderless td, table.borderless th {
  border: 0 }

table.borderless td, table.borderless th {
  /* Override padding for "table.docutils td" with "! important".
     The right padding separates the table cells. */
  padding: 0 0.5em 0 0 ! important }

.first {
  /* Override more specific margin styles with "! important". */
  margin-top: 0 ! important }

.last, .with-subtitle {
  margin-bottom: 0 ! important }

.hidden {
  display: none }

a.toc-backref {
  text-decoration: none ;
  color: black }

blockquote.epigraph {
  margin: 2em 5em ; }

dl.docutils dd {
  margin-bottom: 0.5em }

/* Uncomment (and remove this text!) to get bold-faced definition list terms
dl.docutils dt {
  font-weight: bold }
*/

div.abstract {
  margin: 2em 5em }

div.abstract p.topic-title {
  font-weight: bold ;
  text-align: center }

div.admonition, div.attention, div.caution, div.danger, div.error,
div.hint, div.important, div.note, div.tip, div.warning {
  margin: 2em ;
  border: medium outset ;
  padding: 1em }

div.admonition p.admonition-title, div.hint p.admonition-title,
div.important p.admonition-title, div.note p.admonition-title,
div.tip p.admonition-title {
  font-weight: bold ;
  font-family: sans-serif }

div.attention p.admonition-title, div.caution p.admonition-title,
div.danger p.admonition-title, div.error p.admonition-title,
div.warning p.admonition-title {
  color: red ;
  font-weight: bold ;
  font-family: sans-serif }

/* Uncomment (and remove this text!) to get reduced vertical space in
   compound paragraphs.
div.compound .compound-first, div.compound .compound-middle {
  margin-bottom: 0.5em }

div.compound .compound-last, div.compound .compound-middle {
  margin-top: 0.5em }
*/

div.dedication {
  margin: 2em 5em ;
  text-align: center ;
  font-style: italic }

div.dedication p.topic-title {
  font-weight: bold ;
  font-style: normal }

div.figure {
  margin-left: 2em ;
  margin-right: 2em }

div.footer, div.header {
  clear: both;
  font-size: smaller }

div.line-block {
  display: block ;
  margin-top: 1em ;
  margin-bottom: 1em }

div.line-block div.line-block {
  margin-top: 0 ;
  margin-bottom: 0 ;
  margin-left: 1.5em }

div.sidebar {
  margin: 0 0 0.5em 1em ;
  border: medium outset ;
  padding: 1em ;
  background-color: #ffffee ;
  width: 40% ;
  float: right ;
  clear: right }

div.sidebar p.rubric {
  font-family: sans-serif ;
  font-size: medium }

div.system-messages {
  margin: 5em }

div.system-messages h1 {
  color: red }

div.system-message {
  border: medium outset ;
  padding: 1em }

div.system-message p.system-message-title {
  color: red ;
  font-weight: bold }

div.topic {
  margin: 2em }

h1.section-subtitle, h2.section-subtitle, h3.section-subtitle,
h4.section-subtitle, h5.section-subtitle, h6.section-subtitle {
  margin-top: 0.4em }

h1.title {
  text-align: center }

h2.subtitle {
  text-align: center }

hr.docutils {
  width: 75% }

img.align-left, .figure.align-left{
  clear: left ;
  float: left ;
  margin-right: 1em }

img.align-right, .figure.align-right {
  clear: right ;
  float: right ;
  margin-left: 1em }

.align-left {
  text-align: left }

.align-center {
  clear: both ;
  text-align: center }

.align-right {
  text-align: right }

/* reset inner alignment in figures */
div.align-right {
  text-align: left }

/* div.align-center * { */
/*   text-align: left } */

ol.simple, ul.simple {
  margin-bottom: 1em }

ol.arabic {
  list-style: decimal }

ol.loweralpha {
  list-style: lower-alpha }

ol.upperalpha {
  list-style: upper-alpha }

ol.lowerroman {
  list-style: lower-roman }

ol.upperroman {
  list-style: upper-roman }

p.attribution {
  text-align: right ;
  margin-left: 50% }

p.caption {
  font-style: italic }

p.credits {
  font-style: italic ;
  font-size: smaller }

p.label {
  white-space: nowrap }

p.rubric {
  font-weight: bold ;
  font-size: larger ;
  color: maroon ;
  text-align: center }

p.sidebar-title {
  font-family: sans-serif ;
  font-weight: bold ;
  font-size: larger }

p.sidebar-subtitle {
  font-family: sans-serif ;
  font-weight: bold }

p.topic-title {
  font-weight: bold }

pre.address {
  margin-bottom: 0 ;
  margin-top: 0 ;
  font: inherit }

pre.literal-block, pre.doctest-block {
  margin-left: 2em ;
  margin-right: 2em }

span.classifier {
  font-family: sans-serif ;
  font-style: oblique }

span.classifier-delimiter {
  font-family: sans-serif ;
  font-weight: bold }

span.interpreted {
  font-family: sans-serif }

span.option {
  white-space: nowrap }

span.pre {
  white-space: pre }

span.problematic {
  color: red }

span.section-subtitle {
  /* font-size relative to parent (h1..h6 element) */
  font-size: 80% }

table.citation {
  border-left: solid 1px gray;
  margin-left: 1px }

table.docinfo {
  margin: 2em 4em }

table.docutils {
  margin-top: 0.5em ;
  margin-bottom: 0.5em }

table.footnote {
  border-left: solid 1px black;
  margin-left: 1px }

table.docutils td, table.docutils th,
table.docinfo td, table.docinfo th {
  padding-left: 0.5em ;
  padding-right: 0.5em ;
  vertical-align: top }

table.docutils th.field-name, table.docinfo th.docinfo-name {
  font-weight: bold ;
  text-align: left ;
  white-space: nowrap ;
  padding-left: 0 }

h1 tt.docutils, h2 tt.docutils, h3 tt.docutils,
h4 tt.docutils, h5 tt.docutils, h6 tt.docutils {
  font-size: 100% }

ul.auto-toc {
  list-style-type: none }

</style>
<style type="text/css">

/* Style sheet for LAST HTML documents */
h1 { color: navy }
h2 { color: teal }
div.document { margin-left: auto; margin-right: auto; max-width: 45em }
strong { color: red }
.option-list td { padding-bottom: 1em }
table.field-list { border: thin solid green }

</style>
</head>
<body>
<div class="document" id="last-cookbook">
<h1 class="title">LAST Cookbook</h1>

<p><a class="reference external" href="last.html">LAST</a> is used by running commands in a terminal / command line.  It
has many options: unfortunately, the LAST developers don't know the
best options for every possible alignment task.  Here are some
reasonable starting points.  Feel free to optimize (and share!) search
protocols.</p>
<div class="section" id="a-minimal-example-compare-human-fugu-mitochondrial-genomes">
<h2>A minimal example: compare human &amp; fugu mitochondrial genomes</h2>
<p>Let's find and align similar regions between the human and fugu
mitochondrial genomes.  These FASTA-format files are in LAST's
examples directory: humanMito.fa and fuguMito.fa.  The simplest
possible usage is:</p>
<pre class="literal-block">
lastdb humdb humanMito.fa
lastal humdb fuguMito.fa &gt; myalns.maf
</pre>
<p>The <a class="reference external" href="lastdb.html">lastdb</a> command creates several files whose names begin with
&quot;humdb&quot;.  The <a class="reference external" href="lastal.html">lastal</a> command then compares fuguMito.fa to the humdb
files, and writes the alignments to a file called &quot;myalns.maf&quot;.</p>
</div>
<div class="section" id="understanding-the-output">
<h2>Understanding the output</h2>
<p>The output has very long lines, so you need to view it without
line-wrapping.  For example, you can use:</p>
<pre class="literal-block">
less -S myalns.maf
</pre>
<p>Each alignment looks like this (<a class="reference external" href="http://genome.ucsc.edu/FAQ/FAQformat.html#format5">MAF</a> format):</p>
<pre class="literal-block">
a score=27 EG2=4.7e+04 E=2.6e-05
s humanMito 2170 145 + 16571 AGTAGGCCTAAAAGCAGCCACCAATTAAGAAAGCGTT...
s fuguMito  1648 142 + 16447 AGTAGGCTTAGAAGCAGCCACCA--CAAGAAAGCGTT...
</pre>
<p>The score is a measure of how significant the similarity is.  EG2 and
E are explained at <a class="reference external" href="last-evalues.html">last-evalues</a>.  Lines starting with &quot;s&quot; contain:
the sequence name, the start coordinate of the alignment, the number
of bases spanned by the alignment, the strand, the sequence length,
and the aligned bases.</p>
<p>The start coordinates are zero-based.  This means that, if the
alignment begins right at the start of a sequence, the coordinate is
0.  If the strand is &quot;-&quot;, the start coordinate is the coordinate in
the reverse-complemented sequence (the same as if you were to
reverse-complement the sequence before giving it to LAST).</p>
<p>You can convert MAF to other formats with <a class="reference external" href="maf-convert.html">maf-convert</a>, or use <a class="reference external" href="lastal.html">lastal</a>
option <tt class="docutils literal"><span class="pre">-f</span></tt> to get a few other formats.</p>
</div>
<div class="section" id="more-accurate-learn-substitution-gap-rates">
<h2>More accurate: learn substitution &amp; gap rates</h2>
<p>We can get more accurate alignments between the human and fugu
mitochondrial genomes like this:</p>
<pre class="literal-block">
lastdb humdb humanMito.fa
last-train humdb fuguMito.fa &gt; hufu.train
lastal -p hufu.train humdb fuguMito.fa &gt; myalns.maf
</pre>
<p>The <a class="reference external" href="last-train.html">last-train</a> command finds the rates of deletion, insertion, and
each kind of substitution between these sequences, and writes them to
a file &quot;hufu.train&quot;.  lastal's <tt class="docutils literal"><span class="pre">-p</span></tt> option uses this file to get
more-accurate alignments.</p>
</div>
<div class="section" id="comparing-protein-sequences">
<h2>Comparing protein sequences</h2>
<p>We can compare some query proteins to some reference proteins like
this:</p>
<pre class="literal-block">
lastdb -p -c -R01 refdb ref-prots.fa
lastal refdb query-prots.fa &gt; prot-alns.maf
</pre>
<p><tt class="docutils literal"><span class="pre">-p</span></tt> tells it the sequences are proteins.  (If you forget <tt class="docutils literal"><span class="pre">-p</span></tt> and
the sequences look proteinaceous, you'll get a warning message.)</p>
<p>The other options suppress alignments caused by simple sequence such
as <tt class="docutils literal">APPSPAPPSPAPPSPAPPSPAP</tt>:</p>
<ul>
<li><p class="first"><tt class="docutils literal"><span class="pre">-R01</span></tt> converts the sequence letters to uppercase, then finds
simple regions and converts them to lowercase.  This will be done
for both ref-prots and query-prots.</p>
</li>
<li><p class="first"><tt class="docutils literal"><span class="pre">-c</span></tt> omits alignments that lack a significant amount of
uppercase-to-uppercase alignment.</p>
</li>
</ul>
<p>You can also use <a class="reference external" href="last-train.html">last-train</a>, but we've hardly tested it for
protein-protein alignment, so we're not sure if it helps.</p>
<div class="section" id="find-high-similarity-and-short-protein-alignments">
<h3>Find high-similarity, and short, protein alignments</h3>
<p>If we just want high-similarity alignments, we can use the PAM30 (or
PAM10) <a class="reference external" href="last-matrices.html">scoring scheme</a>:</p>
<pre class="literal-block">
lastdb -p -c -R01 refdb ref-prots.fa
lastal -p PAM30 refdb query-prots.fa &gt; prot-alns.maf
</pre>
<p>This has two advantages:</p>
<ul>
<li><p class="first">It omits low-similarity alignments, or alignment parts.</p>
</li>
<li><p class="first">It can find short similarities, which would be deemed insignificant
(likely to occur by chance between random sequences) unless we focus
the search on high-similarity.</p>
</li>
</ul>
</div>
</div>
<div class="section" id="comparing-dna-to-proteins">
<h2>Comparing DNA to proteins</h2>
<p>We can find related regions of DNA and proteins, allowing for nonsense
mutations and frameshifts.  For example, let's find DNA regions
related to transposon proteins:</p>
<pre class="literal-block">
lastdb -q -c -R01 trandb transposon-prots.fa
last-train --codon trandb dna.fa &gt; codon.train
lastal -p codon.train -m100 -D1e9 -K1 trandb dna.fa &gt; out.maf
</pre>
<p><tt class="docutils literal"><span class="pre">-q</span></tt> appends <tt class="docutils literal">*</tt> meaning STOP to each protein, and treats <tt class="docutils literal">*</tt> as
a 21st protein letter.</p>
<p><tt class="docutils literal"><span class="pre">--codon</span></tt> makes it do DNA-versus-protein.  Here, <a class="reference external" href="last-train.html">last-train</a> tries
to learn 21x64 substitution rates, so it needs a fairly large amount
of data (e.g. a chromosome).</p>
<p><tt class="docutils literal"><span class="pre">-m100</span></tt> makes it more slow-and-sensitive than the default (which is
<tt class="docutils literal"><span class="pre">-m10</span></tt>); see <a class="reference external" href="lastal.html">lastal</a>.</p>
<p><tt class="docutils literal"><span class="pre">-D1e9</span></tt> sets a strict <a class="reference external" href="last-evalues.html">significance</a> threshold.  It means: only
report strong similarities that would be expected to occur by chance,
between random sequences, at most once per 10^9 base-pairs.  The
default is 1e6.</p>
<p><tt class="docutils literal"><span class="pre">-K1</span></tt> streamlines the output by omitting any alignment whose DNA
range lies in that of a higher-scoring alignment.</p>
<p>Another possibility is to add <a class="reference external" href="last-train.html">last-train</a> option <tt class="docutils literal"><span class="pre">-X1</span></tt>, which treats
matches to <tt class="docutils literal">X</tt> (unknown) amino acids as neutral instead of
disfavored.</p>
<p>You can reuse <tt class="docutils literal"><span class="pre">last-train</span></tt> output for different alignment tasks, if
you expect the rates to be roughly the same.</p>
</div>
<div class="section" id="aligning-high-indel-error-long-dna-reads-to-a-genome">
<h2>Aligning high-indel-error long DNA reads to a genome</h2>
<p>Suppose we have DNA reads in either FASTA or <a class="reference external" href="https://doi.org/10.1093/nar/gkp1137">FASTQ</a> format.  This is
sensitive but slow:</p>
<pre class="literal-block">
lastdb -P8 -uNEAR -R01 mydb genome.fa
last-train -P8 -Q0 mydb reads.fastq &gt; reads.train
lastal -P8 -p reads.train mydb reads.fastq | last-split &gt; out.maf
</pre>
<p><tt class="docutils literal"><span class="pre">-P8</span></tt> uses 8 parallel threads; adjust as appropriate for your
computer.  This has no effect on the results.</p>
<p><tt class="docutils literal"><span class="pre">-uNEAR</span></tt> selects a <a class="reference external" href="last-seeds.html">seeding scheme</a> that's better at finding
alignments with few substitutions and/or many gaps.</p>
<p><tt class="docutils literal"><span class="pre">-Q0</span></tt> makes it discard the <a class="reference external" href="https://doi.org/10.1093/nar/gkp1137">fastq</a> quality information (or you can
keep-but-ignore it with <tt class="docutils literal"><span class="pre">-Qkeep</span></tt>).</p>
<p><a class="reference external" href="last-split.html">last-split</a> finds a unique best alignment for each part of each read.</p>
<p>Here we used <tt class="docutils literal"><span class="pre">-R01</span></tt> to lowercase simple sequence like
<tt class="docutils literal">cacacacacacacacacacacaca</tt>.  But we didn't suppress it with <tt class="docutils literal"><span class="pre">-c</span></tt>,
so as not to hide anything from <a class="reference external" href="last-split.html">last-split</a>.  If desired, you can
filter lowercase with <a class="reference external" href="last-postmask.html">last-postmask</a>.</p>
<p>You can go faster by sacrificing a bit of sensitivity.  It depends on
your aim, e.g. slow-and-sensitive seems necessary to find intricate
rearrangements of repeats.  Suggested ways to go faster:</p>
<ul>
<li><p class="first"><a class="reference external" href="https://github.com/mcfrith/last-rna/blob/master/last-long-reads.md">Mask repeats</a>.  This has often worked well.</p>
</li>
<li><p class="first">Add lastal option <tt class="docutils literal"><span class="pre">-k8</span></tt> (or <tt class="docutils literal"><span class="pre">-k16</span></tt> etc).  This makes it faster,
by only finding initial matches starting at every 8th (or 16th etc)
position in the reads.</p>
</li>
<li><p class="first">Replace <tt class="docutils literal"><span class="pre">-uNEAR</span></tt> with <tt class="docutils literal"><span class="pre">-uRY32</span></tt> (or <tt class="docutils literal"><span class="pre">-uRY16</span></tt>, <tt class="docutils literal"><span class="pre">-uRY8</span></tt>,
<tt class="docutils literal"><span class="pre">-uRY4</span></tt>).  This makes it check for initial matches starting at
only ~1/32 (or ~1/16 etc) of positions, in both reads and genome.
Compared to <tt class="docutils literal"><span class="pre">-k</span></tt>: this harms sensitivity slightly more, but
reduces memory use and makes lastdb faster.</p>
</li>
</ul>
<div class="section" id="which-genome-version-to-use">
<h3>Which genome version to use?</h3>
<p>Some genome versions (e.g. for human) have artificial
exactly-duplicated regions, which makes it hard to align reads
uniquely.  To avoid that, look for a genome version called something
like &quot;analysis set&quot;.</p>
</div>
<div class="section" id="aligning-low-error-long-dna-reads-to-a-genome">
<h3>Aligning low-error long DNA reads to a genome</h3>
<p>We can do this the same way as for high-error reads, but perhaps
accelerate more aggressively (e.g. <tt class="docutils literal"><span class="pre">-uRY32</span></tt>).</p>
<p>If repeats are not masked, <a class="reference external" href="lastal.html">lastal</a> option <tt class="docutils literal"><span class="pre">-C2</span></tt> may reduce run time
with little effect on accuracy.</p>
</div>
<div class="section" id="aligning-potentially-spliced-rna-or-cdna-long-reads-to-a-genome">
<h3>Aligning potentially-spliced RNA or cDNA long reads to a genome</h3>
<p>See <a class="reference external" href="https://github.com/mcfrith/last-rna/blob/master/last-long-reads.md">the last-rna guide for long reads</a>.  (For low-error reads, you can probably omit <tt class="docutils literal"><span class="pre">-d90</span></tt> and
<tt class="docutils literal"><span class="pre">-m20</span></tt>.)</p>
</div>
</div>
<div class="section" id="aligning-illumina-dna-reads-to-a-genome">
<h2>Aligning Illumina DNA reads to a genome</h2>
<pre class="literal-block">
lastdb -P8 -uNEAR -R01 -C2 mydb genome.fasta
last-train -P8 -Q1 mydb reads.fastq.gz &gt; reads.train
lastal -P8 -p reads.train mydb reads.fastq.gz | last-split | gzip &gt; out.maf.gz
</pre>
<p>Most LAST commands accept <tt class="docutils literal">.gz</tt> compressed files, and you can
compress output with <tt class="docutils literal">gzip</tt> as above.</p>
<p><a class="reference external" href="lastdb.html">lastdb</a> option <tt class="docutils literal"><span class="pre">-C2</span></tt> makes the alignment a bit faster, but uses more
memory.  This has no effect on the results.  (You could use it in the
other examples too, but it might not be faster.)</p>
<p><tt class="docutils literal"><span class="pre">-Q1</span></tt> makes it use the <a class="reference external" href="https://doi.org/10.1093/nar/gkp1137">fastq</a> quality information to improve the
training and alignment.  LAST <strong>assumes</strong> that the qualities reflect
substitution errors, not insertion/deletion errors.  (For long
non-Illumina reads, we suspect this assumption doesn't hold, so we
didn't use this option.)</p>
<p>This recipe may be excessively slow-and-sensitive.  Adding <a class="reference external" href="lastal.html">lastal</a>
option <tt class="docutils literal"><span class="pre">-C2</span></tt> may make it faster with negligible accuracy loss.  You
can accelerate with e.g. <tt class="docutils literal"><span class="pre">-uRY16</span></tt> or <tt class="docutils literal"><span class="pre">-k16</span></tt> as above.</p>
<div class="section" id="finding-very-short-dna-alignments">
<h3>Finding very short DNA alignments</h3>
<p>By default, LAST only reports <a class="reference external" href="last-evalues.html">significant</a> alignments that will rarely
occur by chance.  In the preceding example, the minimum alignment
length is about 26 bases for a human-size genome (less for smaller
genomes).  To find shorter alignments, add <a class="reference external" href="lastal.html">lastal</a> option <tt class="docutils literal"><span class="pre">-D100</span></tt>
(say), to get alignments that could occur by chance once per hundred
query letters (the default is once per million).  This makes the
minimum alignment length about 20 bases for a human-size genome.</p>
</div>
<div class="section" id="aligning-paired-end-illumina-dna-reads-to-a-genome">
<h3>Aligning paired-end Illumina DNA reads to a genome</h3>
<p>You could use <a class="reference external" href="last-pair-probs.html">last-pair-probs</a>.  It has a disadvantage: it doesn't
allow different parts of one read (i.e. one &quot;end&quot;) to align to
different parts of the genome.  Alternatively, you could align the
reads individually, ignoring the pair relationships:</p>
<pre class="literal-block">
fastq-interleave reads1.fq reads2.fq | lastal -P8 -p reads.train mydb | last-split &gt; out.maf
</pre>
<p><tt class="docutils literal"><span class="pre">fastq-interleave</span></tt> ensures that each read has a unique name (by
appending &quot;/1&quot; and &quot;/2&quot; if necessary).</p>
</div>
<div class="section" id="aligning-potentially-spliced-illumina-reads-to-a-genome">
<h3>Aligning potentially-spliced Illumina reads to a genome</h3>
<p>See <a class="reference external" href="last-split.html">last-split</a> (and <a class="reference external" href="last-pair-probs.html">last-pair-probs</a>).</p>
</div>
</div>
<div class="section" id="aligning-human-chimp-genomes">
<h2>Aligning human &amp; chimp genomes</h2>
<p>This is very slow-and-sensitive:</p>
<pre class="literal-block">
lastdb -P8 -uNEAR -R01 humdb human_no_alt_analysis_set.fa
last-train -P8 --revsym -E0.05 -C2 humdb chimp.fa &gt; humchi.train
lastal -E0.05 -C2 -p humchi.train humdb chimp.fa | last-split &gt; humchi1.maf
</pre>
<p><tt class="docutils literal"><span class="pre">--revsym</span></tt> makes the substitution rates the same on both strands.
For example, it makes A→G equal T→C (because A→G on one strand means
T→C on the other strand).  This is usually appropriate for
genome-genome comparison (but maybe not for mitochondria which have
asymmetric &quot;heavy&quot; and &quot;light&quot; strands).</p>
<p><tt class="docutils literal"><span class="pre">-E0.05</span></tt> means only get <a class="reference external" href="last-evalues.html">significant</a> alignments that would be
expected to occur by chance at a rate ≤ 0.05 times per pair of random
sequences of length 1 billion each.</p>
<p>The result so far is asymmetric: each part of the chimp genome is
aligned to at most one part of the human genome, but not vice-versa.
We can get one-to-one alignments like this:</p>
<pre class="literal-block">
maf-swap humchi1.maf | last-split &gt; humchi2.maf
</pre>
<p>Then we can discard less-confident alignments, and <a class="reference external" href="maf-convert.html">convert</a> to a
compact tabular format:</p>
<pre class="literal-block">
last-postmask humchi2.maf | maf-convert -n tab | awk -F= '$2 &lt;= 1e-5' &gt; humchi.tab
</pre>
<p><a class="reference external" href="last-postmask.html">last-postmask</a> discards alignments caused by simple sequence.  The
<tt class="docutils literal">awk</tt> command gets alignments with <a class="reference external" href="last-split.html">mismap probability</a> ≤ 10^-5.
Finally, we can make a <a class="reference external" href="last-dotplot.html">dotplot</a>:</p>
<pre class="literal-block">
last-dotplot humchi.tab humchi.png
</pre>
<p>To go faster, with minor accuracy loss: replace <tt class="docutils literal"><span class="pre">-uNEAR</span></tt> with
<tt class="docutils literal"><span class="pre">-uRY32</span></tt> and/or <a class="reference external" href="https://github.com/mcfrith/last-rna/blob/master/last-long-reads.md">mask repeats</a>.</p>
<p>To squeeze out the last 0.000...1% of accuracy: add <tt class="docutils literal"><span class="pre">-m50</span></tt> to the
<a class="reference external" href="lastal.html">lastal</a> options.</p>
<div class="section" id="aligning-human-mouse-genomes">
<h3>Aligning human &amp; mouse genomes</h3>
<p>You can do this in the same way as human/chimp, except that <tt class="docutils literal"><span class="pre">-uNEAR</span></tt>
should be omitted.  To increase sensitivity, but also time and memory
use, add lastdb <a class="reference external" href="last-seeds.html">seeding</a> option <tt class="docutils literal"><span class="pre">-uMAM4</span></tt> or <tt class="docutils literal"><span class="pre">-uMAM8</span></tt>.  To
increase them even more, add <a class="reference external" href="lastal.html">lastal</a> option <tt class="docutils literal"><span class="pre">-m100</span></tt> (or as high as
you can bear).</p>
</div>
</div>
<div class="section" id="large-reference-sequences">
<h2>Large reference sequences</h2>
<p>If the sequences that you give to lastdb exceed ~4 billion letters,
consider using 8-byte LAST (<a class="reference external" href="lastdb.html">lastdb8</a> and <a class="reference external" href="lastal.html">lastal8</a>).  Ordinary (4-byte)
LAST can't handle so much sequence at once, so <a class="reference external" href="lastdb.html">lastdb</a> splits it into
&quot;volumes&quot;, which may be inefficient.  8-byte LAST avoids voluming, but
uses more memory.  So <a class="reference external" href="lastdb.html">lastdb8</a> works well with a memory-reducing
option: <tt class="docutils literal"><span class="pre">-uRY</span></tt> or <tt class="docutils literal"><span class="pre">-w</span></tt> or <tt class="docutils literal"><span class="pre">-W</span></tt>.</p>
</div>
<div class="section" id="moar-faster">
<h2>Moar faster</h2>
<ul>
<li><p class="first"><a class="reference external" href="last-parallel.html">Using multiple CPUs / cores</a></p>
</li>
<li><p class="first"><a class="reference external" href="last-tuning.html">Various speed &amp; memory options</a></p>
</li>
</ul>
</div>
<div class="section" id="ambiguity-of-alignment-columns">
<h2>Ambiguity of alignment columns</h2>
<p>Consider this alignment:</p>
<pre class="literal-block">
TGAAGTTAAAGGTATATGAATTCCAATTCTTAACCCCCCTATTAAACGAATATCTTG
|||||||| ||||||  |  ||  | |  |    || ||||||   |||||||||||
TGAAGTTAGAGGTAT--GGTTTTGAGTAGT----CCTCCTATTTTTCGAATATCTTG
</pre>
<p>The middle section has such weak similarity that its precise alignment
cannot be confidently inferred.  We can see the confidence of each
alignment column with <a class="reference external" href="lastal.html">lastal</a> option <tt class="docutils literal"><span class="pre">-j4</span></tt>:</p>
<pre class="literal-block">
lastal -j4 -p hufu.train humdb fuguMito.fa &gt; myalns.maf
</pre>
<p>The output looks like this:</p>
<pre class="literal-block">
a score=17 EG2=9.3e+09 E=5e-06
s seqX 0 57 + 57 TGAAGTTAAAGGTATATGAATTCCAATTCTTAACCCCCCTATTAAACGAATATCTTG
s seqY 0 51 + 51 TGAAGTTAGAGGTAT--GGTTTTGAGTAGT----CCTCCTATTTTTCGAATATCTTG
p                %*.14442011.(%##&quot;%$$$$###&quot;&quot;!!!&quot;&quot;&quot;&quot;&amp;'(*,340.,,.~~~~~~~~~~~
</pre>
<p>The &quot;p&quot; line indicates the probability that each column is wrongly
aligned, using a compact code (the same as <a class="reference external" href="https://doi.org/10.1093/nar/gkp1137">fastq</a> format):</p>
<blockquote>
<table border="1" class="docutils">
<colgroup>
<col width="13%" />
<col width="37%" />
<col width="13%" />
<col width="37%" />
</colgroup>
<tbody valign="top">
<tr><th scope="col">Symbol</th>
<th scope="col">Error probability</th>
<th scope="col">Symbol</th>
<th scope="col">Error probability</th>
</tr>
<tr><td><tt class="docutils literal">!</tt></td>
<td>0.79 -- 1</td>
<td><tt class="docutils literal">0</tt></td>
<td>0.025 -- 0.032</td>
</tr>
<tr><td><tt class="docutils literal">&quot;</tt></td>
<td>0.63 -- 0.79</td>
<td><tt class="docutils literal">1</tt></td>
<td>0.02  -- 0.025</td>
</tr>
<tr><td><tt class="docutils literal">#</tt></td>
<td>0.5  -- 0.63</td>
<td><tt class="docutils literal">2</tt></td>
<td>0.016 -- 0.02</td>
</tr>
<tr><td><tt class="docutils literal">$</tt></td>
<td>0.4  -- 0.5</td>
<td><tt class="docutils literal">3</tt></td>
<td>0.013 -- 0.016</td>
</tr>
<tr><td><tt class="docutils literal">%</tt></td>
<td>0.32 -- 0.4</td>
<td><tt class="docutils literal">4</tt></td>
<td>0.01  -- 0.013</td>
</tr>
<tr><td><tt class="docutils literal">&amp;</tt></td>
<td>0.25 -- 0.32</td>
<td><tt class="docutils literal">5</tt></td>
<td>0.0079 -- 0.01</td>
</tr>
<tr><td><tt class="docutils literal">'</tt></td>
<td>0.2  -- 0.25</td>
<td><tt class="docutils literal">6</tt></td>
<td>0.0063 -- 0.0079</td>
</tr>
<tr><td><tt class="docutils literal">(</tt></td>
<td>0.16 -- 0.2</td>
<td><tt class="docutils literal">7</tt></td>
<td>0.005  -- 0.0063</td>
</tr>
<tr><td><tt class="docutils literal">)</tt></td>
<td>0.13 -- 0.16</td>
<td><tt class="docutils literal">8</tt></td>
<td>0.004  -- 0.005</td>
</tr>
<tr><td><tt class="docutils literal">*</tt></td>
<td>0.1  -- 0.13</td>
<td><tt class="docutils literal">9</tt></td>
<td>0.0032 -- 0.004</td>
</tr>
<tr><td><tt class="docutils literal">+</tt></td>
<td>0.079 -- 0.1</td>
<td><tt class="docutils literal">:</tt></td>
<td>0.0025 -- 0.0032</td>
</tr>
<tr><td><tt class="docutils literal">,</tt></td>
<td>0.063 -- 0.079</td>
<td><tt class="docutils literal">;</tt></td>
<td>0.002  -- 0.0025</td>
</tr>
<tr><td><tt class="docutils literal">-</tt></td>
<td>0.05  -- 0.063</td>
<td><tt class="docutils literal">&lt;</tt></td>
<td>0.0016 -- 0.002</td>
</tr>
<tr><td><tt class="docutils literal">.</tt></td>
<td>0.04  -- 0.05</td>
<td><tt class="docutils literal">=</tt></td>
<td>0.0013 -- 0.0016</td>
</tr>
<tr><td><tt class="docutils literal">/</tt></td>
<td>0.032 -- 0.04</td>
<td><tt class="docutils literal">&gt;</tt></td>
<td>0.001  -- 0.0013</td>
</tr>
</tbody>
</table>
</blockquote>
<p>Note that each alignment is grown from a &quot;core&quot; region, and the
ambiguity estimates assume that the core is correctly aligned.  The
core is indicated by &quot;~&quot; symbols, and it contains exact matches only.</p>
</div>
</div>
</body>
</html>