File: ltrharvest.tex

package info (click to toggle)
genometools 1.6.6%2Bds-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 50,576 kB
  • sloc: ansic: 271,876; ruby: 29,930; python: 5,106; sh: 3,083; makefile: 1,213; perl: 219; pascal: 159; haskell: 37; sed: 5
file content (873 lines) | stat: -rw-r--r-- 32,507 bytes parent folder | download | duplicates (7)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
\documentclass[12pt,titlepage]{article}
\usepackage[a4paper,top=30mm,bottom=30mm,left=20mm,right=20mm]{geometry}
\usepackage{url}
\usepackage{alltt}
\usepackage{xspace}
\usepackage{times}
\usepackage{listings}

\usepackage{verbatim}
%\usepackage[dvips]{hyperref}
\usepackage{optionman}

\newcommand{\LTRharvest}{\textit{LTRharvest}\xspace}
\newcommand{\GenomeTools}{\textit{GenomeTools}\xspace}
\newcommand{\Suffixerator}{\textit{Suffixerator}\xspace}
\newcommand{\Gtltrharvest}{\texttt{gt ltrharvest}\xspace}
\newcommand{\Gt}{\texttt{gt}\xspace}
\newcommand{\Gtsuffixerator}{\texttt{gt suffixerator}\xspace}

\title{\LTRharvest\\
a manual}
\author{\begin{tabular}{c}
         \textit{David Ellinghaus}\\
         \textit{Sascha Steinbiss}\\
         \textit{Stefan Kurtz}\\
         \textit{Ute Willhoeft}\\[1cm]
         Research Group for Genome Informatics\\
         Center for Bioinformatics\\
         University of Hamburg\\
         Bundesstrasse 43\\
         20146 Hamburg\\
         Germany\\[1cm]
         \url{willhoeft@zbh.uni-hamburg.de}\\[1cm]
         \begin{tabular}{p{0.8\textwidth}}
        In any documentation or publication about research using \LTRharvest
        please cite the following paper:\\[5mm]
        D.~Ellinghaus, S.~Kurtz, and U.~Willhoeft.
        \LTRharvest, an efficient and flexible software for
        de novo detection of LTR retrotransposons,
        BMC Bioinformatics 2008, 9:18\\[1mm]
        \url{http://www.biomedcentral.com/1471-2105/9/18}
        \end{tabular}
        \end{tabular}}
\date{26/08/2013}
\pdftrailerid{}
\begin{document}
\maketitle

\section{Introduction} \label{Introduction}

This document describes \LTRharvest \cite{LTRharvesturl}, a software tool
for \textit{de novo} predictions of LTR
retrotransposons in genomic sequences~\cite{EKW07}.
\LTRharvest computes boundary positions of potential
LTR retrotransposons
on a persistent index structure of the genomic target sequence,
the \textit{enhanced suffix array} \cite{AKO04}. Usually
the genomic target sequence is a complete chromosomal DNA sequence
in FASTA format. Nevertheless, every DNA sequence in (multiple)
FASTA format can be passed to the software.

For the prediction, \LTRharvest implements several filters.
These are consecutively applied to the sequence
data to reject candidates that do not conform to the sequence,
length or distance features of LTR retrotransposons.
Since these features are mostly species-specific,
every filter can be switched on or switched off and is free for
parameterisation of a certain LTR retrotransposon model.

\LTRharvest is written in \texttt{C} and it is based
on the \GenomeTools library \cite{genometools}. \LTRharvest is called
as part of the single binary named \Gt.
%The source code is single threaded and can be compiled on 32-bit and 64-bit
%platforms without making any changes to the sources.
\LTRharvest runs on an enhanced suffix array index, which is stored on files.
This index needs to be constructed by the program \texttt{suffixerator}, which
is also part of the \GenomeTools binary \Gt.

\section{Usage} \label{Usage}

Some text is highlighted by different fonts according to the following rules.

\begin{itemize}
\item \texttt{Typewriter font} is used for the names of software tools.
\item \texttt{\small{Small typewriter font}} is used for file names.
\item \begin{footnotesize}\texttt{Footnote sized typewriter font}
      \end{footnotesize} with a leading
      \begin{footnotesize}\texttt{'-'}\end{footnotesize}
      is used for program options.
\item \Showoptionarg{small italic font} is used for the argument(s) of an
      option.
\end{itemize}

\subsection{The options of \LTRharvest} \label{Overview}

Since \LTRharvest is part of \Gt, \LTRharvest is called as follows:

\texttt{gt} $[$\Showoption{j} \Showoptionarg{n} $]$ \texttt{ltrharvest} \Showoption{index} \Showoptionarg{indexname} $[$\emph{options}$]$

where \Showoptionarg{indexname} denotes the enhanced suffix array index of the
target sequence(s) constructed by the program \Suffixerator. If the option
\Showoption{j} is given and \GenomeTools has been compiled with multithreading
support (\texttt{threads=yes}), some calculations will run in parallel, using
\Showoptionarg{n} threads.
An overview of all possible options with a short one-line description of each
option is given in Table \ref{overviewOpt}. All options can be specified only
once.

\begin{table}[htbp]
\caption{Overview of the \LTRharvest options sorted by categories.}
\begin{footnotesize}
\[
\renewcommand{\arraystretch}{0.89}
\begin{tabular}{ll}\hline
\Showoptiongroup{Input options}
\Showoption{index}& specify the name of the enhanced suffix array index
\\
\Showoption{range}& specify the range in the input sequences to be searched for
LTR retrotransposon candidates
\\
\Showoptiongroup{Output options}
\Showoption{out}& specify outputfilename for predictions in multiple FASTA
format
\\
\Showoption{gff3}& specify outputfilename for predictions in GFF3 format
\\
\Showoption{outinner}& specify outputfilename for multiple FASTA file
of the inner regions of predictions
\\
\Showoption{tabout}& output tabular format on stdout (instead of GFF3)
\\
\Showoption{seqids}& output sequence descriptions (instead of numbers)
\\
\Showoption{md5}& output GFF3 with MD5 headers
\\
\Showoptiongroup{Filter options}
\Showoption{seed}& specify minimum seed length for exact maximal repeats
\\
\Showoption{minlenltr}& specify minimum length for each LTR
\\
\Showoption{maxlenltr}& specify maximum length for each LTR
\\
\Showoption{mindistltr}& specify minimum distance of LTR
startpositions
\\
\Showoption{maxdistltr}& specify maximum distance of LTR
startpositions
\\
\Showoption{similar}& specify similaritythreshold in range [1..100\%]
\\
\Showoption{mintsd}& specify minimum length for each TSD
\\
\Showoption{maxtsd}& specify maximum length for each TSD (requires
\Showoption{mintsd})
\\
\Showoption{motif}& specify palindromic motif consisting of 2 nucleotides
startmotif + 2 nucleotides endmotif
\\
\Showoption{motifmis}& specify maximum number of mismatches in motif
(requires \Showoption{motif})
\\
\Showoption{vic}& specify the number of nucleotides (to the left and to
the right)
\\
& that will be searched for TSDs and/or motifs around 5' and 3' boundary
\\
\Showoption{overlaps}& specify no$|$best$|$all
\\
\Showoptiongroup{Alignment options}
\Showoption{xdrop}& specify xdrop value for seed extension
\\
\Showoption{mat}& specify matchscore for seed extension
\\
\Showoption{mis}& specify mismatchscore for seed extension
\\
\Showoption{ins}& specify insertionscore for seed extension
\\
\Showoption{del}& specify deletionscore for seed extension
\\
\Showoptiongroup{Miscellaneous options}
\Showoption{v}& verbose mode
\\
\Showoption{longoutput}& additional motif/TSD output (requires
\Showoption{mintsd} or \Showoption{motif})
\\
\Showoption{help}& show all options
\\
\hline
\end{tabular}
%\input{ltrprogoptions}
\]
\end{footnotesize}
\label{overviewOpt}
\end{table}

%%%%
\subsection{Input options}

\begin{Justshowoptions}
\Option{index}{\Showoptionarg{indexname}}{
Specify the name of the enhanced suffix array index. The index must comprise
the tables $\tt{suf}$, $\tt{lcp}$, $\tt{des}$ and
$\tt{tis}$. An example of how the enhanced suffix array index is constructed
with the program \Suffixerator can be found in section~\ref{Examples}.
}
\end{Justshowoptions}

The option \Showoption{index} is mandatory.

\begin{Justshowoptions}
\Option{range}{\Showoptionarg{$x$ $y$}}{
Specify the range in the input sequence(s) to be searched for LTR
retrotransposon candidates. That is, if $x=1000$ and $y=10000$ then candidates
are only reported if they start after position 1000 and end before position
10000 in their respective sequence coordinates.

If $x=y=0$ (default), then the whole sequences are searched.
}
\end{Justshowoptions}

%%%%
\subsection{Output options}

Results are reported in tabular fashion on stdout and can easily
be written to a file using the notation \texttt{\symbol{62}}
\Showoptionarg{resultfile} as in the following:

\Gtltrharvest \Showoption{index} \Showoptionarg{indexname}
$[$\emph{options}$]$ \texttt{\symbol{62}} \Showoptionarg{resultfile}

In addition, the following options influence how and which information is
output:
\begin{Justshowoptions}

\Option{out}{\Showoptionarg{outputfile}}{
Specify the name of a file where the predictions will be written to.
Each prediction will be represented by an individual FASTA entry.
}

\Option{outinner}{\Showoptionarg{outputfile}}{
Specify the name of a file where the inner regions (the prediction sequences
without the flanking LTR sequences) will be written to.
Each prediction will be represented by an individual FASTA entry.
}

\Option{gff3}{\Showoptionarg{outputfile}}{
Specify the name of a file where the predictions will be written to.
Each prediction will be represented by an individual GFF3~\cite{gff3} entry.
}

\Option{tabout}
{$\Showoptionarg{yes}|\Showoptionarg{no}$}{
If set to $\Showoptionarg{yes}$, the usual tabular format will be printed to
stdout. If set to $\Showoptionarg{no}$, then GFF3 output will be printed to
stdout. Default is $\Showoptionarg{yes}$. This option does not influence the
GFF3 output specified by \Showoption{gff3}.
}

\Option{seqids}
{$\Showoptionarg{yes}|\Showoptionarg{no}$}{
If set to $\Showoptionarg{yes}$, then the original sequence descriptions (FASTA
headers) are output as GFF3 sequence identifiers.
If set to $\Showoptionarg{no}$, then the sequence identifiers will have the
format  \texttt{seq}$n$, where $n$ is the sequence number of the corresponding
sequence in the input index, e.g.\ \texttt{seq0} for the first sequence in the
index. Default is $\Showoptionarg{no}$.
}

\Option{md5}
{$\Showoptionarg{yes}|\Showoptionarg{no}$}{
If set to $\Showoptionarg{yes}$, the sequence regions output in the GFF3 output will
be augmented with additional MD5 fingerprints for each sequence.
That is, the sequence region identifiers are constructed as follows:
``\texttt{md5:}$md5hash$\texttt{:}$seqid$'' where $seqid$ is the original
sequence region identifier according to option \Showoption{seqids}.
Default is $\Showoptionarg{no}$.
}


\end{Justshowoptions}


%%%%
\subsection{Filter options}

The filter options provide the opportunity to exclude predictions with unwanted
sequence, length or distance features. If a particular option is not set
by the user, a default value for this option will be set,
except for the options \Showoption{mintsd}, \Showoption{maxtsd}
and \Showoption{motif}. Thus, if none of the filter options is set by the user,
a prediction of LTR retrotransposons will be
conducted without searching for target site duplications (TSD) or a particular
LTR start-end motif.

\begin{Justshowoptions}

\Option{seed}{\Showoptionarg{$L_{ex}$}}{
Specify the minimum length for the exact maximal repeats. Only those repeats
with the specified minimum length are analyzed in the process of finding
candidate pairs. Exact maximal repeats of length below that threshold are
not taken further into account.
\Showoptionarg{$L_{ex}$} has to be a positive integer. If this
option is not selected by the user, then \Showoptionarg{$L_{ex}$} is set
to $30$ by default.
}

\Option{minlenltr}{\Showoptionarg{$L_{min}$}}{
Specify the minimum length of each LTR. \Showoptionarg{$L_{min}$}
has to be specified as a positive integer.
If this option is not selected by the user, then \Showoptionarg{$L_{min}$}
is set to $100$ by default.
}

\Option{maxlenltr}{\Showoptionarg{$L_{max}$}}{
Specify the maximum length of each LTR. \Showoptionarg{$L_{max}$}
has to be specified as a positive integer.
If this option is not selected by the user, then \Showoptionarg{$L_{max}$}
is set to $1000$ by default.
}

\Option{mindistltr}{\Showoptionarg{$D_{min}$}}{
Specify the minimum distance of LTR starting positions.
\Showoptionarg{$D_{min}$} has to be specified as a positive integer.
If this option is not selected by the user, then \Showoptionarg{$D_{min}$}
is set to $1000$ by default.
}

\Option{maxdistltr}{\Showoptionarg{$D_{max}$}}{
Specify the maximum distance of LTR starting positions.
\Showoptionarg{$D_{max}$} has to be specified as a positive integer.
If this option is not selected by the user, then \Showoptionarg{$D_{max}$}
is set to $15000$ by default.
}

\Option{similar}{\Showoptionarg{similaritythreshold}}{
Specify the minimum similarity value between the two LTRs.
The argument \Showoptionarg{similaritythreshold} has to be chosen
from the range [0,100] and is interpreted as a percentage.
If this option is not selected by the user, the default value is
set to 85\%.
}

\Option{mintsd}{\Showoptionarg{TSDminlen}}{
If this option is selected, a search for target site duplications
(TSDs) will be conducted with a minimum TSD length of
\Showoptionarg{TSDminlen}.
If this option is not selected by the user, a search for TSDs
with minimum TSD length 4 will be conducted.
If this option is set but no maximum TSD length is specified by the
option \Showoption{maxtsd}, then the maximum TSD length is set to $20$
by default.
}

\Option{maxtsd}{\Showoptionarg{TSDmaxlen}}{
This option requires the option \Showoption{mintsd}.
If this option is selected, a search for target site duplications
(TSDs) will be conducted with a maximum TSD length of
\Showoptionarg{TSDmaxlen}.
}

\Option{motif}{\Showoptionarg{expr}}{
Specify 2 nucleotides for the starting motif and 2 nucleotides for
the ending motif at the beginning and the ending of each LTR,
respectively.
Only palindromic motif sequences---where the motif sequence
is equal to its complementary sequence read backwards---%
%- of
%nucleotides a(adenine), c(cytosine), g(guanine), t(thymine)
are allowed, e.g.\ \Showoptionarg{tgca}.
Type the nucleotides without any space separating them.
If this option is not selected by the user, then candidate pairs will not
be checked for the presence of a motif.
If this option is set but no allowed number of mismatches is
specified by the option \Showoption{motifmis}, then a search for
the exact motif will be conducted.
}

\Option{motifmis}{\Showoptionarg{n}}{
This option requires the option \Showoption{motif}.
Specify the allowed number of mismatches by the argument
\Showoptionarg{n}. If this option is not set, then a search for
the exact motif will be conducted. The non-negative integer
\Showoptionarg{n} has to be chosen from the range $[0,3]$.
}

\Option{vic}{\Showoptionarg{l}}{
Specify the number of nucleotide positions \Showoptionarg{l}
to the left and to the right (the vicinity), that
will be searched for TSDs and/or one motif around the
5' and 3' predicted boundary of an LTR retrotransposon. This option
only has an effect if option \Showoption{mintsd} and/or option
\Showoption{motif} is switched on.
If this option is not selected by the user, the default value of $l$
is 60.
}

\Option{overlaps}
{\Showoptionarg{no}$|$\Showoptionarg{best}$|$\Showoptionarg{all}}{
Specify the output with regard to nested and/or overlapping
LTR retrotransposon predictions. If the argument
\Showoptionarg{no} is selected, then neither nested nor overlapping
predictions will be reported in the output.
If the argument \Showoptionarg{best} is
selected, then, in the case of two or more
nested or overlapping predictions,
solely the LTR retrotransposon prediction with the highest
similarity between its LTRs will be reported.
If the argument \Showoptionarg{all} is
selected, then all LTR retrotransposon predictions will be reported
whether there are nested and/or overlapping predictions or not.
If this option is not selected by the user,
the option with argument \Showoptionarg{best} is set by default.
}

\end{Justshowoptions}


%%%%
\subsection{Alignment options}

An $X$-drop extension process permits the search for degenerated LTRs.
The alignment options provide the opportunity to control this extension
of the maximal repeat seeds.
If a particular alignment option is not set
by the user, a default value for this option will be set.

\begin{Justshowoptions}

\Option{xdrop}{\Showoptionarg{X}}{
Specify the xdrop value \Showoptionarg{X} for extending a seed repeat in both
directions allowing for matches, mismatches, insertions, and deletions. The
argument \Showoptionarg{X} must be a positive integer or $0$.
The $X$-drop extension process stops as soon as the extension involving matches,
mismatches, insertions, and deletions has a score smaller than $T-X$ where $T$
denotes the largest score seen so far.
If this option
is not selected by the user, then \Showoptionarg{X} is set to $5$ by default.
}

\Option{mat}{\Showoptionarg{score}}{
Specify the positive match score for the $X$-drop extension process. If the
option is not selected by the user, the default value is $2$.
}

\Option{mis}{\Showoptionarg{score}}{
Specify the negative mismatch score for the $X$-drop extension process.
If this option is not selected by the user, the default value is $-2$.
}

\Option{ins}{\Showoptionarg{score}}{
Specify the negative insertion score for the $X$-drop extension process.
If this option is not selected by the user, the default value is $-3$.
}

\Option{del}{\Showoptionarg{score}}{
Specify the negative deletion score for the $X$-drop extension process.
If this option is not selected by the user, the default value is $-3$.
}

\end{Justshowoptions}

%%%%
\subsection{Miscellaneous options}

\begin{Justshowoptions}

\Option{v}{}{
This option enables the verbose mode. This means that some more information
about the processing will be printed to \texttt{stdout} during the run.
This includes a long list of switched on or switched off options.
}

\Option{longoutput}{}{
This option additionally prints information about the detected
TSD and/or the motif to \texttt{stdout},
if a search for TSD and/or for the motif
has been selected by the user.
This option requires the option \Showoption{mintsd} and/or
\Showoption{motif}.
}

\Option{help}{}{
\LTRharvest will show a summary of all options on
\texttt{stdout} and terminate with exit code $0$.
}

\end{Justshowoptions}
%%%%%%%%%%%%

\section{Examples} \label{Examples}

In this section, example applications of \LTRharvest are presented.
In Subsection~\ref{different-options}, examples for using different options of
\LTRharvest are given. Subsection~\ref{Example-genome} then gives an example
for the prediction of LTR
retrotransposons on the entire \textit{S.~cerevisiae} genome.
In Subsection~\ref{ExampleCluster}, an example for a clustering process of
the \LTRharvest output is shown. Please note that this step is not part of
\LTRharvest and is carried out by
%with a subsequent
%clustering process of the predicted sequences using
\textit{Vmatch}~\cite{vmatch}.
%(which is not part of the \GenomeTools binary \Gt).

\subsection{Using different options of \LTRharvest}
\label{different-options}

As the input sequence file, we use some FASTA file
\texttt{\small{chr02.19970727.fsa.gz}} containing the \textit{S.~cerevisiae}
genome sequence, chromosome 2. Note that the file is a compressed file in gzip
format (because of the ending \texttt{.gz}). This format can be handled by the
program \Suffixerator.

First, we create the enhanced suffix array. We invoke \Gtsuffixerator with
options \Showoption{tis}, \Showoption{suf}, \Showoption{lcp}, \Showoption{des},
\Showoption{ssp} and \Showoption{sds} since \LTRharvest needs the corresponding
tables. Furthermore, we specify \Showoption{dna}, as we process DNA sequences.
Additional creation of the $\tt{ois}$ table using the \Showoption{lossless}
option makes it easier to re-use the created index in later tools (e.g.\
\emph{LTRdigest}) which require this table.

\begin{footnotesize}
\begin{verbatim}
$ gt suffixerator -db chr02.19970727.fsa.gz -indexname chr02.19970727.fsa -tis
-suf -lcp -des -ssp -sds -dna -lossless -v
# dna=yes
# indexname="chr02.19970727.fsa"
.
.
.
# countradixsort=0
# counttqsort=0
\end{verbatim}
\end{footnotesize}

Now we can use the index for \LTRharvest. The first example call will just use
the default parameters for the filter and alignment options
without searching for TSD or an LTR start-end motif.

\begin{footnotesize}
\begin{verbatim}
$ gt ltrharvest -index chr02.19970727.fsa
# args=-index chr02.19970727.fsa
# predictions are reported in the following way
# s(ret) e(ret) l(ret) s(lLTR) e(lLTR) l(lLTR) s(rLTR) e(rLTR) l(rLTR) sim(LTRs) seq-nr
# where:
# s = starting position
# e = ending position
# l = length
# ret = LTR-retrotransposon
# lLTR = left LTR
# rLTR = right LTR
# sim = similarity
# seq-nr = sequence number
29632  35598  5967  29632  29969  338  35259  35598  340  98.53  0
220989  226919  5931  220989  221339  351  226574  226919  346  97.15  0
259532  265448  5917  259532  259863  332  265117  265448  332  99.40  0
427671  430013  2343  427671  428170  500  429522  430013  492  90.20  0
\end{verbatim}
\end{footnotesize}

Each comment line starts with the comment symbol \#.
Each non-comment line denotes an LTR retrotransposon prediction with
starting and ending positions of the whole LTR retrotransposon, the
left LTR instance and the right LTR instance, respectively. Furthermore,
for each of these elements, the corresponding element length is reported
as well as a percentage similarity of the two LTRs. The last integer of
each line denotes the number of the input sequence in which the LTR
retrotransposon prediction occurs. The input sequence numbers are counted from 0.

Invoking \LTRharvest with the optional argument \Showoption{v} gives
more information about enabled and disabled options as well as
additional information about the enhanced suffix array index
and time/space consumption. Moreover, if the options \Showoption{out}
and \Showoption{outinner} are specified, the run produces two multiple FASTA files.

\begin{footnotesize}
\begin{verbatim}
$ gt ltrharvest -index chr02.19970727.fsa -v -out pred-chr02.fsa
-outinner pred-inner-chr02.fsa
# args=-index chr02.19970727.fsa -v -out pred-chr02.fsa -outinner pred-inner-chr02.fsa
# user defined options and values:
#   verbosemode: On
#   indexname: chr02.19970727.fsa
#   outputfile: pred-chr02.fsa
#   outputfile inner region: pred-inner-chr02.fsa
#   xdropbelowscore: 5
#   similaritythreshold: 85.00
#   minseedlength: 30
#   matchscore: 2
#   mismatchscore: -2
#   insertionscore: -3
#   deletionscore: -3
#   minLTRlength: 100
#   maxLTRlength: 1000
#   minLTRdistance: 1000
#   maxLTRdistance: 15000
#   overlaps: best
#   minTSDlength: 4
#   maxTSDlength: 20
#   palindromic motif:
#   motifmismatchesallowed: 4
#   vicinity: 60 nt
# predictions are reported in the following way
# s(ret) e(ret) l(ret) s(lLTR) e(lLTR) l(lLTR) s(rLTR) e(rLTR) l(rLTR) sim(LTRs) seq-nr
# where:
# s = starting position
# e = ending position
# l = length
# ret = LTR-retrotransposon
# lLTR = left LTR
# rLTR = right LTR
# sim = similarity
# seq-nr = sequence number
29632  35598  5967  29632  29969  338  35259  35598  340  98.53  0
220989  226919  5931  220989  221339  351  226574  226919  346  97.15  0
259532  265448  5917  259532  259863  332  265117  265448  332  99.40  0
427671  430013  2343  427671  428170  500  429522  430013  492  90.20  0

\end{verbatim}
\end{footnotesize}

To additionally search for TSDs and an LTR start-end motif, we use
the options \Showoption{mintsd}, \Showoption{maxtsd}, \Showoption{motif} and
\Showoption{motifmis}.

\begin{footnotesize}
\begin{verbatim}
$ gt ltrharvest -index chr02.19970727.fsa -mintsd 5 -maxtsd 20 -motif tgca
-motifmis 0
# args=-index chr02.19970727.fsa -mintsd 5 -maxtsd 20 -motif tgca -motifmis 0
# predictions are reported in the following way
# s(ret) e(ret) l(ret) s(lLTR) e(lLTR) l(lLTR) s(rLTR) e(rLTR) l(rLTR) sim(LTRs) seq-nr
# where:
# s = starting position
# e = ending position
# l = length
# ret = LTR-retrotransposon
# lLTR = left LTR
# rLTR = right LTR
# sim = similarity
# seq-nr = sequence number
29632  35590  5959  29632  29963  332  35259  35590  332  99.70  0
220996  226911  5916  220996  221329  334  226575  226911  337  97.33  0
259532  265448  5917  259532  259863  332  265117  265448  332  99.40  0
\end{verbatim}
\end{footnotesize}

Finally, if we are interested in the sequence and the length of the TSD
as well as the sequence of the motif, we select the option
\Showoption{longoutput}.
This is also an example in which all filter and alignment
options are specified by the user.

\begin{footnotesize}
\begin{verbatim}
$ gt ltrharvest -index chr02.19970727.fsa -seed 30 -xdrop 5 -mat 2 -mis -2 -ins -3
-del -3 -minlenltr 100 -maxlenltr 1000 -mindistltr 1000 -maxdistltr 15000
-similar 90.0 -overlaps all -mintsd 5 -maxtsd 20 -motif tgca -motifmis 0 -vic 60
-longoutput
# args=-index chr02.19970727.fsa -seed 30 -xdrop 5 -mat 2 -mis -2 -ins -3
-del -3 -minlenltr 100 -maxlenltr 1000 -mindistltr 1000 -maxdistltr 15000
-similar 90.0 -overlaps all -mintsd 5 -maxtsd 20 -motif tgca -motifmis 0
-vic 60 -longoutput
# predictions are reported in the following way
# s(ret) e(ret) l(ret) s(lLTR) e(lLTR) l(lLTR) TSD l(TSD) m(lLTR) s(rLTR) e(rLTR) l(rLTR) TSD l(TSD) m(rLTR) sim(LTRs) seq-nr
# where:
# s = starting position
# e = ending position
# l = length
# m = motif
# ret = LTR-retrotransposon
# lLTR = left LTR
# rLTR = right LTR
# TSD = target site duplication
# sim = similarity
# seq-nr = sequence number
29632  35590  5959  29632  29963  332  ATAAT  5  TG..CA  35259  35590
  332  ATAAT  5  TG..CA  99.70  0
220996  226911  5916  220996  221329  334  GGAAT  5  TG..CA  226575  226911
  337  GGAAT  5  TG..CA  97.33  0
259532  265448  5917  259532  259863  332  GTAAT  5  TG..CA  265117  265448
  332  GTAAT  5  TG..CA  99.40  0
\end{verbatim}
\end{footnotesize}

\subsection{Predictions on the entire \textit{S.~cerevisiae} genome}
\label{Example-genome}

As the target sequence file, we use a multiple FASTA file
\texttt{\small{chrAll.19971001.fsa.gz}}, which contains all 16
chromosomal sequences of the release before Oct. 1st, 1997, probably
used by Kim et al.
in their comprehensive survey of retrotransposons~\cite{kim:1998}.

First, we create the enhanced suffix array. We invoke \Gtsuffixerator with
options \Showoption{tis}, \Showoption{suf}, \Showoption{lcp}, \Showoption{des}
and \Showoption{sds}, since \LTRharvest needs the corresponding tables. Furthermore, we specify option
\Showoption{dna}, as we are processing DNA sequences. We also use
\Showoption{lossless} to create the table necessary to enable lossless access to
the sequence.

\begin{footnotesize}
\begin{verbatim}
$ gt suffixerator -db chrAll.19971001.fsa.gz -indexname chrAll.19971001.fsa -tis
-suf -lcp -des -sds -dna -lossless -v
# dna=yes
# indexname="chrAll.19971001.fsa"
.
.
.
# countradixsort=0
# counttqsort=4
\end{verbatim}
\end{footnotesize}

Now we can use the index with \LTRharvest. In addition to the filter and
alignment options, we choose option \Showoption{out} for printing the predicted
LTR retrotransposon sequences to a file.

\begin{footnotesize}
\begin{verbatim}
$ gt ltrharvest -index chrAll.19971001.fsa -seed 100 -minlenltr 100 -maxlenltr 1000
-mindistltr 1000 -maxdistltr 15000 -xdrop 5 -mat 2 -mis -2 -ins -3 -del -3
-similar 90.0 -overlaps best -mintsd 5 -maxtsd 20 -motif tgca -motifmis 0
-vic 60 -longoutput -out pred-chrAll.fsa
# args=-index chrAll.19971001.fsa -seed 100 -minlenltr 100 -maxlenltr 1000
-mindistltr 1000 -maxdistltr 15000 -xdrop 5 -mat 2 -mis -2 -ins -3 -del -3
-similar 90.0 -overlaps best -mintsd 5 -maxtsd 20 -motif tgca -motifmis 0
-vic 60 -longoutput -out pred-chrAll.fsa
# predictions are reported in the following way
# s(ret) e(ret) l(ret) s(lLTR) e(lLTR) l(lLTR) TSD l(TSD) m(lLTR) s(rLTR)
  e(rLTR) l(rLTR) TSD l(TSD) m(rLTR) sim(LTRs) seq-nr
# where:
# s = starting position
# e = ending position
# l = length
# m = motif
# ret = LTR-retrotransposon
# lLTR = left LTR
# rLTR = right LTR
# TSD = target site duplication
# sim = similarity
# seq-nr = sequence number
160239  166163  5925  160239  160575  337  GGTTC  5  TG..CA  165827  166163
  337  GGTTC  5  TG..CA  100.00  0
29632  35590  5959  29632  29963  332  ATAAT  5  TG..CA  35259  35590
  332  ATAAT  5  TG..CA  99.70  1
220996  226911  5916  220996  221329  334  GGAAT  5  TG..CA  226575  226911
  337  GGAAT  5  TG..CA  97.33  1
.
.
.
804640  810557  5918  804640  804973  334  AATTA  5  TG..CA  810224  810557
  334  AATTA  5  TG..CA  100.00  15
844407  850335  5929  844407  844744  338  GAAAT  5  TG..CA  849998  850335
  338  GAAAT  5  TG..CA  100.00  15
850624  856549  5926  850624  850961  338  CAAAA  5  TG..CA  856212  856549
  338  CAAAA  5  TG..CA  99.11  15
\end{verbatim}
\end{footnotesize}

\subsection{Sequence clustering of \LTRharvest output (optional)}
\label{ExampleCluster}

In addition to the prediction process done by \LTRharvest,
a cluster analysis of the predicted
sequences is recommended. Here, we use the single linkage cluster
analysis program from the
\textit{Vmatch} software tool~\cite{vmatch}
(which is not part of the \GenomeTools binary \Gt)
to show how this task can be accomplished.
First, an index of the predicted sequences needs to be constructed with the
program \texttt{mkvtree}, which is part of \textit{Vmatch}.

\begin{footnotesize}
\begin{verbatim}
$ mkvtree -db pred-chrAll.fsa -dna -pl -allout -v
reading file "pred-chrAll.fsa"
.
.
.
overall space peak: main=2.61 MB (10.01 bytes/symbol), secondary=0.53 MB
\end{verbatim}
\end{footnotesize}

Now we can use \texttt{vmatch} (which is part of \textit{Vmatch})
to perform a clustering of the predicted sequences.
A reasonable parameter set is given in
Tab.~\ref{parameters-cluster-default},
page~\pageref{parameters-cluster-default}.
See the \emph{Vmatch manual}~\cite{vmatch} for an explanation of the
options. The following command computes clusters of all (database) sequences in
\texttt{\small pred-chrAll.fsa}. Each cluster is reported as a unique cluster
number, followed by the numbers of the sequences contained in that cluster.

\begin{footnotesize}
\begin{verbatim}
$ vmatch -dbcluster 95 7 Cluster-pred-chrAll -p -d -seedlength 50
-l 1101 -exdrop 9 pred-chrAll.fsa
# args=-dbcluster 95 7 Cluster-pred-chrAll -p -d -seedlength 50
-l 1101 -exdrop 9 pred-chrAll.fsa
# 3 clusters
# 45 elements out of 46 (97.83%) are in clusters
# 1 elements out of 46 (2.17%) are singlets
# 1 cluster of size 2
# 1 cluster of size 3
# 1 cluster of size 40
0:  32 41 42 36 39 13 40 43 37 30 44 38 18 21 17 34 35 31 12 20
33 23 26 27 6 28 3 29 25 1 24 9 19 2 7 22 4 10 5 0
1:  8 15 11
2:  14 45
\end{verbatim}
\end{footnotesize}

%\bibliographystyle{plain}
%\bibliography{references}
\begin{thebibliography}{1}

\bibitem{LTRharvesturl}
  \url{http://www.zbh.uni-hamburg.de/ltrharvest}

\bibitem{gff3}
\textsc{GFF3}, tab-delimited file format for genome annotation.
  \url{http://www.sequenceontology.org/gff3.shtml}.

\bibitem{AKO04}
M.I. Abouelhoda, S.~Kurtz, and E.~Ohlebusch.
\newblock Replacing suffix trees with enhanced suffix arrays.
\newblock {\em Journal of Discrete Algorithms}, 2:53--86, 2004.

\bibitem{EKW07}
D.~Ellinghaus, S.~Kurtz, and U.~Willhoeft.
\newblock \LTRharvest, an efficient and flexible software for de novo
  detection of LTR retrotransposons.
\newblock {\em BMC Bioinformatics}, 9:18, 2008.

\bibitem{genometools}
G.~Gremme.
\newblock The \textsc{GenomeTools} genome analysis system.
  \url{http://genometools.org}.

\bibitem{kim:1998}
J.M. Kim, S.~Vanguri, J.D. Boeke, A.~Gabriel, and D.F. Voytas.
\newblock Transposable elements and genome organization: a comprehensive survey
  of retrotransposons revealed by the complete \textit{Saccharomyces
  cerevisiae} genome sequence.
\newblock {\em Genome Research}, 8(5):464--478, 1998.

\bibitem{vmatch}
S.~Kurtz.
\newblock The \textsc{Vmatch} large scale sequence analysis software.
  \url{http://www.vmatch.de}.

\end{thebibliography}

\begin{table}[htbp]
\caption{A default parameter set for \textit{Vmatch}'s single linkage
  clustering program.}
\vspace{0.25cm}
\begin{tabular}{lcl}\hline
Parameter name & Value & Comment\\\hline
dbcluster    & 95 ($\lfloor(\frac{D_{min}}{D_{max}} \times 100)\rfloor + 1$) & Match covers at least 95\% of the smaller sequence and\\
             &                                                 & (\textit{formula value})\% of the larger sequence\\
d  & & Compute direct matches\\
p & & Compute palindromic matches\\
seedlength & 50 & Minimal length of the exact repeats\\
l & $D_{min} + L_{min}$ & Minimal length of matches\\
exdrop & 9 & Xdrop score when extending a seed in both directions\\\hline
\end{tabular}
\label{parameters-cluster-default}
\end{table}
\end{document}