File: notes.tex

package info (click to toggle)
tigr-glimmer 3.02b-1
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 13,928 kB
  • ctags: 2,492
  • sloc: cpp: 24,416; awk: 232; csh: 220; makefile: 156; sh: 61
file content (1177 lines) | stat: -rw-r--r-- 45,460 bytes parent folder | download | duplicates (7)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
\documentclass[fleqn,titlepage,11pt]{article}

\usepackage{latexsym,delcher}

\PortraitPage
\def\baselinestretch{1.0}
\def\thefootnote{\fnsymbol{footnote}}
\def\thepage{{\footnotesize\arabic{page}}}
\def\today{9~May~2006}

\def\Desc#1{\,\mbox{\emph{#1}}\,}
\def\Glimmer{\textsc{Glimmer}}
\def\Gtwo{\textsc{Glimmer2}}
\def\Gthree{\textsc{Glimmer3}}
\def\PgBICM{\texttt{build-icm}}
\def\Pg#1{\texttt{#1}}


\begin{document}

\RaggedRight
\sloppy

\title{\Glimmer{} Release Notes \\ Version~3.02}
\author{Arthur~L. Delcher\titlepagenote{Copyright \copyright\ 2006 University of Maryland Center for Bioinformatics \& Computational Biology}}

\maketitle


\section{Introduction}

This document describes Version~3.02 of the \Glimmer{}
gene-finding software.  This version incorporates
a nearly complete rewrite of the code, resulting in
improvements in both sensitivity and specificity of
the predictions.

This is a complete version of the software with
all features implemented.  Users discovering
problems or errors are encouraged to report them to
\,\verb`adelcher@umiacs.umd.edu`\,.


\section{About \Glimmer{}}

\Glimmer{} is a collection of programs for identifying genes in
microbial DNA sequences.  The system works by creating a
variable-length Markov model from a training set of genes and
then using that model to attempt to identify all genes in a
given DNA sequence.  Version~1 of \Glimmer{} was described
in~\cite{glimmer1} and Version~2 was described
in~\cite{glimmer2}.  An article describing \Gthree{} is
in preparation.

\Gthree{} is released as OSI Certified Open Source Software
under the Artistic License.  The license is contained in the
file, \Pg{LICENSE}, in the distribution.


\section{What's Changed from Version~2 to Version~3}

Changes have been made in the algorithms to
score and select genes in \Gthree, and also in the
options and output formats:
\bn\RaggedRight
\item
  In both \Gtwo{} and \Gthree{}, orfs are scored, and those scoring
  above the threshold value form the candidate set.

  In \Gtwo{},
  pairwise overlaps between these candidates are examined, and using
  a series of rules, orfs are eliminated or start sites adjusted.
  This continues in an iterative fashion until no further
  changes occur.  In many cases, the rules cannot resolve an
  overlap between two orfs, and both are output in the final
  list of predictions, which have comment tags indicating this.

  In \Gthree{}, a single dynamic-programming, HMM-like algorithm
  is used to select the highest-scoring orfs and their
  start sites.  This algorithm guarantees that the predictions
  have no overlaps longer than the specified length (which
  can be set by the
  \,\verb`-o`\, option).  Thus, there are no longer any comments
  with the \Gthree{} predictions, and in general there are fewer
  predictions, reducing the false-positive rate.  Our tests
  indicate that there is no corresponding increase in false
  negatives for \Gthree{} compared to \Gtwo{}.
  
\item
  \Gthree{} scores orfs in the reverse direction, \ie, 3' to
  5'.  This improves the accuracy of scores near the start codon
  of genes because the trailing context of the ICM is in
  the coding region of the gene (on which it has been trained).

\item
  The \,\verb`long-orfs`\, program now uses an
  amino-acid distribution model to filter
  the set of candidate orfs before a subset of sufficiently
  long, non-overlapping orfs is selected.

\item
  The \,\verb`make`\, system and directory structure have been
  revised so that the source, object and executable files
  are now in separate directories.

\item
  There have been some changes in program parameters, including:
  \bn\RaggedRight
  \item
    Program options are now specified \underline{\emph{before}}
    required parameters, rather than after.  Most options now
    have a long form in addition to the single letter form.
  \item
    \PgBICM{} uses a parameter to specify the output file for
    the ICM, instead of sending it to standard output like
    \Gtwo{}.  This parameter can be ``-'' to direct output to
    standard out, if desired.
  \item
    \Pg{glimmer3} requires a third parameter, which is used as a
    prefix for its output files.
  \en

\item
  There have been some changes in the format and/or meaning of
  output values.  Specifically:
  \bn\RaggedRight
  \item
    \Gthree{} produces two output files:  a file with detailed
    information about all orfs (similar to the first part of
    \Gtwo{} output), and a file containing just the final
    predictions (like the second part of \Gtwo{} output).
  \item
    The prediction coordinates in \Gthree{} now include the
    stop codon.  Thus the end coordinates will differ from
    \Gtwo{} values by 3.
  \item
    Orfs are now printed with a score, which is 100 times the
    log odds per base of the in-frame coding score versus the score
    of the independent, non-coding model.  These scores provide
    a consistent scale to compare scores of different orfs.
  \item
    The \,\verb`-X`\, option will now report genes extending
    past the end of a sequence with a coordinate that is
    either less than or equal to zero, or greater than the
    sequence length.
  \en

\item
  \Gthree{} can now process multiple-sequence input files.
  The outputs for each sequence are preceded by the
  fasta-header line of the sequence in both the \,\verb`.detail`\,
  and the \,\verb`.predict`\, files.

\item
  Two \Gtwo{} options have been eliminated:
\bl{\settowidth{\labelwidth}{\Pg{-p}}\leftmargin=\labelwidth \addtolength{\leftmargin}{1em}\labelsep=1em}\RaggedRight
  \item[\Pg{-p}]
    Was used to specify acceptable overlaps of genes as a percentage
    of their lengths.  This is problematic since the choice of start site
    affects gene length.
  \item[\Pg{-w}]
    Specified the minimum length of an orf that might be considered a gene
    based on scores of intersecting orfs.  Setting a suitably low score
    threshold (with the \Pg{-t} option) effectively includes these orfs.
  \el

\en


\section{Installing and Running \Gthree}

\Glimmer{} software was written for the Linux software
environment.  The following instructions assume a Linux
system.  They also work under Mac OSX.

\subsection{Installation}
To install \Gthree{}, download the compressed tarfile
\,\verb`glimmer302.tar.gz`\,
from the website.  Then uncompress the file by typing
\BSV\begin{verbatim}
  tar xzf glimmer302.tar.gz
\end{verbatim}\ESV
A directory named \,\verb`glimmer3.02`\, should result.
In that directory is a subdirectory named \,\verb`src`\,.
Within the \,\verb`src`\, subdirectory type
\BSV\begin{verbatim}
  make
\end{verbatim}\ESV
(or alternately \Pg{gmake}).
This will compile the \Gthree{} programs and put
the executable files in the directory \,\verb`glimmer3.02/bin`\,.
These files can be copied or moved to whatever directory
is convenient to the user.

\subsubsection{Troubleshooting}
If the make fails, one possibility is that long options are
not installed on your system.  To compile without long options,
edit file \Pg{delcher.hh} in directory \Pg{src/Common}
to change line
\BSV\begin{verbatim}
#define  ALLOW_LONG_OPTIONS  1
\end{verbatim}\ESV
near the top of the file to
\BSV\begin{verbatim}
#define  ALLOW_LONG_OPTIONS  0
\end{verbatim}\ESV
and then retry make.  It also may be necessary to comment-out
or delete the line
\BSV\begin{verbatim}
#include  <getopt.h>
\end{verbatim}\ESV
in this file.  If you turn off long options then only the single-letter
form of program options will work.

Another reason the make may fail is if your version of make
does not support all the features of GNU make.  If this is the
case, you can try an alternative, simplified version of the make
system by going to directory
\BSV\begin{verbatim}
  glimmer3.02/SimpleMake
\end{verbatim}\ESV
and typing \,\verb`make`\, there.

\subsection{Running \Glimmer{}}
Running \Glimmer{} is a two-step process.  First, a
probability model of coding sequences, called an
interpolated context model or ICM, must be built.
This is done by the program \,\verb`build-icm`\, from a set of
training sequences.  These sequences can be obtained
in several ways:
\bn\RaggedRight
\item
  From known genes in the genome, \eg, genes identified
  by homology searches.
\item
  From long, non-overlapping orfs in the genome as
  produced by the program \,\verb`long-orfs`\,.
\item
  From genes in a highly similar species/strain.
\en

Once the probability model is built, the \Pg{glimmer3} program
itself is run to analyze the sequences and make gene
predictions.  \Pg{glimmer3} has a number of different options
that affect its predictions.  One of these (\,\verb`-b`\,)
provides the program with a position weight matrix (PWM)
representing the ribosome binding site for genes and is used to
improve the accuracy of start site predictions.

To obtain the best results with \Glimmer{}, the largest possible
training set of genes should be used from the same genome on
which predictions are to be made.  If genes are known from
homology searches, they can be used.  If only a few such genes
are available, they can be combined with the training set
produced by the \Pg{long-orfs} program (but do not include
duplicate genes in the training set).  If you are running
\Glimmer{} on small genome fragments, the genome
of the nearest available evolutionary relative of the target
organism can be used to provide a training set of genes.

\subsubsection{Speed \& Memory Usage}
The speed and memory usage of \Gthree{} programs will depend on
the system speed and the size and nature of the data files.
The \Pg{build-icm} program takes time roughly proportional
to the size of its input file.
On a 3.0GHz Intel Xeon Linux system, using default parameters,
it takes roughly 10~seconds per megabyte of input.  Its memory
requirement is less than 50~Mb for bacterial-size genomes.
The run-time of the \Pg{glimmer3} program depends both on
the size of the input genome and the number of potential genes
in it.  High-GC genomes, which have more long open reading frames,
take longer to process than low-GC genomes.  Again, using a 3.0GHz
Intel Xeon Linux system as a benchmark, \Pg{glimmer3} on
\emph{Campylobacter jejuni RM1221} (1.77Mb, 30.3\%~GC)
takes about 15~seconds and uses less than 8Mb of memory;
\emph{Pseudomonas fluorescens Pf-5} (7.07Mb, 63\%~GC)
takes less than 2~minutes and uses about 27Mb of memory.

\subsection{Useful Scripts}
  \label{script:sec}
In the \,\verb`scripts`\, subdirectory are several C-shell scripts that
are useful for running \Gthree{}.  At the top of each script
are specified the directory paths to the \Glimmer{} executables
and Awk scripts (the lines beginning with
\,\verb`set glimmerpath`\, and \,\verb`set awkpath`\,).
The user will need to change these entries to the directories
where these files were installed on his/her system.  The first
lines of these files may also need to be modified if the
user's \,\verb`csh`\, and \,\verb`awk`\, programs are in a directory
other than \,\verb`/bin`\,.

\bn\RaggedRight
\exdent
  \Pg{g3-from-scratch.csh} is a sample shell script that first uses
  program \Pg{long-orfs} to find a training set of (putative) genes
  and then runs \Pg{glimmer3} on the result.
  It may be desirable to change the \Pg{glimmer3} options
  on the \,\verb`set glimmeropts`\, line.

  To run the script, say, on the genome sequence in file
  \,\verb`genom.seq`\, and prefix the output files with the tag
  \,\verb`run1`\,, simply type:
\BSV\begin{verbatim}
  g3-from-scratch.csh genom.seq run1
\end{verbatim}\ESV
  The script would then run the commands:
\BSV\begin{verbatim}
  long-orfs -n -t 1.15 genom.seq run1.longorfs
  extract -t genom.seq run1.longorfs > run1.train
  build-icm -r run1.icm < run1.train
  glimmer3 -o50 -g110 -t30 genom.seq run1.icm run1
\end{verbatim}\ESV
  
\exdent
  \Pg{g3-from-training.csh} is a sample shell script that uses a
  given set of gene coordinates to extract a training set and
  then run \Pg{glimmer3}.  This script uses the program \Pg{elph}
  (available from TIGR at \,\verb`www.tigr.org/software/ELPH`\,)
  to create a PWM from the region upstream of the start sites
  in the specified coordinate sets.  It also uses the first codons
  in the training set to estimate the start-codon distribution for
  the genome.

  To run the script on the genome sequence in file
  \Pg{genom.seq}, with file \Pg{train.coords} containing the positions
  of the training sequences in \Pg{genom.seq}, and using tag \Pg{run2}
  to prefix the output files, type:
\BSV\begin{verbatim}
  g3-from-training.csh genom.seq train.coords run2
\end{verbatim}\ESV
  The script would then run the commands:
\BSV\begin{verbatim}
  extract -t genom.seq train.coords > run2.train
  build-icm -r run2.icm < run2.train
  upstream-coords.awk 25 0 train.coords | extract genom.seq - > run2.upstream
  elph run2.upstream LEN=6 | get-motif-counts.awk > run2.motif
  set startuse = `start-codon-distrib -3 genom.seq train.coords`
  glimmer3 -o50 -g110 -t30 -b run2.motif -P $startuse genom.seq run2.icm run2
\end{verbatim}\ESV

\exdent
  \Pg{g3-iterated.csh} is a shell script that combines
  the two preceding scripts.  It uses the predictions from
  the scratch run to create a training set for the second run.
  The reason for a second run is that the output from the
  first run will have a more accurate set of start sites than
  the output from the \Pg{long-orfs} program, which automatically
  uses the most upstream start site.  These start sites allow the
  creation of a PWM for the ribosome binding site and the estimation
  of start-codon usage in the genome.

  To run the script on the genome sequence in file
  \,\verb`genom.seq`\, and prefix the output files with the tag
  \,\verb`run3`\,, type:
\BSV\begin{verbatim}
  g3-iterated.csh genom.seq run3
\end{verbatim}\ESV
  The script would then run the commands:
\BSV\begin{verbatim}
  long-orfs -n -t 1.15 genom.seq run3.longorfs
  extract -t genom.seq run3.longorfs > run3.train
  build-icm -r run3.icm < run3.train
  glimmer3 -o50 -g110 -t30 genom.seq run3.icm run3.run1
  tail +2 run3.run1.predict > run3.coords
  upstream-coords.awk 25 0 run3.coords | extract genom.seq - > run3.upstream
  elph run3.upstream LEN=6 | get-motif-counts.awk > run3.motif
  set startuse = `start-codon-distrib -3 genom.seq run3.coords`
  glimmer3 -o50 -g110 -t30 -b run3.motif -P $startuse genom.seq run3.icm run3
\end{verbatim}\ESV

\en

Several Awk scripts, including those called by the above scripts, are
in the same directory, \,\verb`scripts`\,, as these
C-shell scripts.  Each script has a comment at the beginning describing
what it does.


\section{Sample Run Directory}

A directory containing a sample run of \Gthree{} is provided.
This directory, named \,\Pg{sample-run}\,, contains the genome sequence
for \emph{Treponema pallidum} (file \,\Pg{tpall.fna}\,)
and a list of annotated genes for it (file \,\Pg{tpall.nh}\,),
both downloaded from GenBank.
The files whose names begin \,\Pg{from-scratch}\, are the result of
running the script
\BSV\begin{verbatim}
  g3-from-scratch.csh tpall.fna from-scratch
\end{verbatim}\ESV
The files whose names begin \,\Pg{from-training}\, are the result of
running the script
\BSV\begin{verbatim}
  g3-from-training.csh tpall.fna tpall.nh from-training
\end{verbatim}\ESV
The files whose names begin \,\Pg{iterated}\, are the result of
running the script
\BSV\begin{verbatim}
  g3-iterated.csh tpall.fna iterated
\end{verbatim}\ESV
Users will need to modify the path directories at the top of these
scripts to be able to run them (see Section~\ref{script:sec} above).


\section{Notes on the Programs}

\subsection{\Pg{build-icm} Program}

This program constructs an interpolated context model (ICM)
from an input set of sequences.

\subsubsection{\Pg{build-icm} Parameters \& Options}
The format for invoking \,\Pg{build-icm}\, is:
\bq
  \Pg{build-icm}\, [\Desc{options}] \Desc{output-file} \,\Pg{<}\,\Desc{input-file}
\eq
Sequences are read from standard input, and the ICM is
built and written to \Desc{output-file}.  If \Desc{output-file}
is ``-'', then the output will be sent to standard output.
Since input comes from standard input, one also can ``pipe'' the input
into this program, \eg,
\BSV\begin{verbatim}
  cat abc.in | build-icm xyz.icm
\end{verbatim}\ESV
or even type in the input directly.

Possible \Desc{options} are:
\bl{}\RaggedRight
\exdent
  \verb`-d` \Desc{num} \enskip or \enskip \verb`--depth` \Desc{num}

  Set the depth of the ICM to \Desc{num}.  The depth is the
  maximum number of positions in the context window that
  will be used to determine the probability of the predicted
  position.  The default value is 7.

\exdent
  \verb`-F` \enskip or \enskip \verb`--no_stops`

  Do not use any input strings with in-frame stop codons.
  Stop codons are determined by either the \Pg{-z} or \Pg{-Z}
  option.

\exdent
  \verb`-h` \enskip or \enskip \verb`--help`

  Print the usage message.

\exdent
  \verb`-p` \Desc{num} \enskip or \enskip \verb`--period` \Desc{num}

  Set the period of the ICM to \Desc{num}.  The period is the
  number of different submodels for different positions in the
  text in a cyclic pattern.  \Eg, if the period is 3, the first
  submodel will determine positions $1, 4, 7, \dots$; the second
  submodel will determine positions $2, 5, 8, \dots$; and the third
  submodel will determine positions $3, 6, 9, \dots$.  For a
  non-periodic model, use a value of 1.  The default value
  is 3.

\exdent
  \verb`-r` \enskip or \enskip \verb`--reverse`

  Use the reverse of the input strings to build the ICM.  Note that
  this is merely the reverse and \emph{\underline{NOT}} the
  reverse-complement.  In other words, the model is built in
  the backwards direction.

\exdent
  \verb`-t` \enskip or \enskip \verb`--text`

  Output the model in a text format.  This is for
  informational/debugging purposes only---the \Pg{glimmer3}
  program cannot read models in this form.

  The format of the output is a header line containing the
  parameters of the model, followed by individual
  probability lines.  The entries on each probability line
  are:
  \bq
    \begin{tabular}{cl}
      Column & \quad Description \\
      1 & ID number \\
      2 & Context pattern \\
      3 & Mutual information \\
      4 & Probability of A \\
      5 & Probability of C \\
      6 & Probability of G \\
      7 & Probability of T
    \end{tabular}
  \eq
  The context pattern is divided into codons by the vertical lines (this
  option assumes the default 3-periodic model).
  The ``?'' represents the position being predicted.  Letters represent
  specific values in their respective positions in the context window.
  The asterisk indicates the position that has maximum mutual information
  with the predicted position.

\exdent
  \verb`-v` \Desc{num} \enskip or \enskip \verb`--verbose` \Desc{num}

  Set the verbose level to \Desc{num}.  This controls extra debugging
  output---the higher the value the more output.

\exdent
  \verb`-w` \Desc{num} \enskip or \enskip \verb`--width` \Desc{num}

  Set the width of the ICM to \Desc{num}.  The width includes
  the predicted position.  The default value is 12.

\exdent
  \verb`-z` \Desc{n} \enskip or \enskip \verb`--trans_table` \Desc{n}

  Use Genbank translation table number \Desc{n} to specify stop codons.

\exdent
  \verb`-Z` \Desc{codon-list} \enskip or \enskip \verb`--stop_codons` \Desc{codon-list}

  Specify stop codons as a comma-separated list.
  Sample format:  \,\verb`-Z tag,tga,taa`\,.
  The default stop codons are \Pg{tag}, \Pg{tga} and \Pg{taa}.
\el

\subsection{\Pg{glimmer3} Program}

This is the main program that makes gene predictions.

\subsubsection{\Pg{glimmer3} Parameters \& Options}
The invocation for \,\Pg{glimmer3}\, is:
\bq
  \Pg{glimmer3}\, [\Desc{options}] \Desc{sequence} \Desc{icm} \Desc{tag}
\eq
where \Desc{sequence} is the name of the file containing the DNA
sequence(s) to be analyzed and \Desc{icm} is the name of the file
containing the ICM model produced by \,\verb`build-icm`\,.  \Desc{tag}
is a prefix used to name the two output files:  \Desc{tag}\verb`.detail`
and \Desc{tag}\verb`.predict`.

\Desc{options} can be the following:
\bl{}\RaggedRight
\exdent
  \verb`-A` \Desc{codon-list} \enskip or \enskip \verb`--start_codons` \Desc{codon-list}

  Specify start codons as a comma-separated list.
  Sample format:  \,\verb`-A atg,gtg`\,.
  The default start codons are \Pg{atg}, \Pg{gtg} and \Pg{ttg}.
  Use the \Pg{-P} option to specify the relative proportions of use.
  If \Pg{-P} is not used, then the proportions will be equal.

\exdent
  \verb`-b` \Desc{filename} \enskip or \enskip \verb`--rbs_pwm` \Desc{filename}

  Read a position weight matrix (PWM) from \Desc{filename} to identify
  the ribosome binding site to help choose start sites.  The format of
  this file is indicated by the following example:
\BSV\begin{verbatim}
6
a     212     309      43      36     452     138
c      55      58       0      19      48      26
g     247     141     501     523       5     365
t      64      70      34       0      73      49
\end{verbatim}\ESV
  The first line is the number of positions in the pattern, \ie,
  the number of columns in the matrix (not counting
  the first column of labels).  The column values are the relative
  frequencies of nucleotides at each position.

\exdent
  \verb`-C` \Desc{p} \enskip or \enskip \verb`--gc_percent` \Desc{p}

  Use \Desc{p} as the GC percentage of the independent model, \ie,
  the model of intergenic sequence.
  Note:  \Desc{p} should be a percentage, \eg, \verb`-C 45.2`.

  If this option is not specified, the GC percentage will be
  counted from the input file.

\exdent
  \verb`-E` \Desc{filename} \enskip or \enskip \verb`--entropy` \Desc{filename}

  Read entropy profiles from \Desc{filename}.  The format is one header
  line, then 20 lines of 3 columns each, which is the format produced
  by the program \Pg{entropy-profile} with the \Pg{-b} option.
  The columns are amino acid,
  positive entropy, and negative entropy, respectively.  Rows must be in
  alphabetical order by amino acid code letter.  This currently does
  not affect \Gthree{} predictions, but is used in
  the \Pg{long-orfs} program.  If the option is specified, the
  entropy-distance ratio for each potential gene is printed as the last column
  of the \Pg{.detail} file.  If \Desc{filename} is ``\Pg{\#}'', then
  a set of default entropy profiles, constructed from a wide range of
  species, is used.

\exdent
  \verb`-f` \enskip or \enskip \verb`--first_codon`

  Use the first possible codon in an orf as the start codon
  for initial scoring purposes.  Otherwise, the highest-scoring
  codon will be used.  This only affects the start positions in
  the \,\verb`.detail`\, file.  The final start predictions in
  the \,\verb`.predict`\, file are always based on the scoring
  functions.

\exdent
  \verb`-g` \Desc{n} \enskip or \enskip \verb`--gene_len` \Desc{n}

  Set the minimum gene length to \Desc{n} nucleotides.  This does not include
  the bases in the stop codon.

\exdent
  \verb`-h` \enskip or \enskip \verb`--help`

  Print the usage message.

\exdent
  \verb`-i` \Desc{filename} \enskip or \enskip \verb`--ignore` \Desc{filename}

  \Desc{filename} specifies regions of bases that are off 
  limits, so that no bases within that area will be examined.
  The format for entries in this file is one line per region,
  with the start and end positions of the region specified
  as the first two fields on the line.  The rest of the line
  is regarded as comments.  Additionally, any line beginning
  with a \,\verb`#`\, is regarded as a comment.  \Eg, the
  following file:
\BSV\begin{verbatim}
   1001     1600   Comment here
# The region can be specified high-low as well as low-high
   5600     5001
\end{verbatim}\ESV
  would ignore bases $1001 \ldots 1600$ and $5001 \ldots 5600$
  in the input sequence.  This option should not be used with
  multi-sequence input files.

\exdent
  \verb`-l` \enskip or \enskip \verb`--linear`

  Assume a linear rather than circular genome, \ie, there will
  be no genes that ``wraparound'' between the beginning and end
  of the sequence.

\exdent
  \verb`-L` \Desc{filename} \enskip or \enskip \verb`--orf_coords` \Desc{filename}

  \Desc{filename} specifies a list of orfs that should
  be scored separately, with no attempt to resolve overlaps or
  determine start codons.  The format of the
  list is one orf per line, with entries separated by white space.
  The first entry is an identifier for the orf.  It can be an
  arbitrary string without spaces.  The next two entries are
  the start and end positions of the orf, respectively, (coordinates counting
  from 1), including the stop codon.  The fourth entry is the
  reading frame.  This is used only to determine the direction of
  the orf in cases of circular genomes where the orf might ``wrap
  around'' the end of the input sequence.  If positive the
  orf is presumed to be on the positive DNA strand; otherwise,
  on the negative strand.  Any further entries on the line are ignored.

  The output with this option goes both to the \Pg{.predict} file
  and to the \Pg{.detail} file.

\exdent
  \verb`-M` \enskip or \enskip \verb`--separate_genes`

  \Desc{sequence} is a multifasta file of separate genes to
  be scored separately, with no overlap rules.  Each sequence
  is assumed to be in $5'$ to $3'$ order and to include the stop
  codon.

\exdent
  \verb`-o` \Desc{n} \enskip or \enskip \verb`--max_olap` \Desc{n}

  Set the maximum overlap length to \Desc{n}.  Overlaps of this
  many or fewer bases are allowed between genes.  The new
  dynamic programming algorithm should \underline{\emph{never}}
  output genes that overlap by more than this many bases.

\exdent
  \verb`-P` \Desc{number-list} \enskip or \enskip \verb`--start_probs` \Desc{number-list}

  Specify the probability of different start codons (same number and order
  as in \Pg{-A} option).  If no \Pg{-A} option is given, then there should be 3
  values:  for \Pg{atg}, \Pg{gtg} and \Pg{ttg},
  in that order.  Sample format:  \verb`-P 0.6,0.35,0.05`.
  If \Pg{-A} is specified without \Pg{-P}, then each start codon is equally likely
  (which is very unusual).

\exdent
  \verb`-q` \Desc{n} \enskip or \enskip \verb`--ignore_score_len` \Desc{n}

  Consider any gene \Desc{n} or more bases long as a potential
  gene, regardless of its in-frame score.
  Without this option, this value is calculated automatically to
  be the length such that the expected number of orfs this long
  or longer in a random sequence of a million bases is one.

\exdent
  \verb`-r` \enskip or \enskip \verb`--no_indep`

  Don't use the independent probability score column at all.  Using
  this option will produce more short gene predictions.

\exdent
  \verb`-t` \Desc{n} \enskip or \enskip \verb`--threshold` \Desc{n}

  Set the threshold score for consideration as a gene to \Desc{n}.
  If the in-frame
  score $\ge \Desc{n}$, then the region is given a number and considered
  a potential gene.  Note this is the integer score in the column labelled
  ``InFrm'' in the \,\verb`.detail`\, file, not the decimal score in
  the column labelled ``Raw''.

\exdent
  \verb`-X` \enskip or \enskip \verb`--extend`

  Also score orfs that extend off the end of the sequence(s).  This
  option presumes that the sequence(s) is linear and not circular.
  Reported positions off the end of the sequence are the nearest
  positions in the correct reading frame.  Note that this ignores
  any partial codons at the ends of a sequence.  Suppose, for example,
  that a sequence is 998bp long and an orf in reading frame +1
  starts at position 601 and extends off the end of the sequence.
  Then the end of that gene/orf will be reported at position 999,
  as if the stop codon were in positions 997\ldots999.  This is true
  even if the last two characters of the sequence are, say, \,\verb`cc`\,
  and cannot possibly be part of a stop codon.

  Any scores associated with orfs that extend past the end of a
  sequence are computed using only complete codons contained in
  the sequence.
  
\exdent
  \verb`-z` \Desc{n} \enskip or \enskip \verb`--trans_table` \Desc{n}

  Use Genbank translation table number \Desc{n} to specify stop codons.

\exdent
  \verb`-Z` \Desc{codon-list} \enskip or \enskip \verb`--stop_codons` \Desc{codon-list}

  Specify stop codons as a comma-separated list.
  Sample format:  \verb`-Z tag,tga,taa`.
  The default stop codons are \Pg{tag}, \Pg{tga} and \Pg{taa}.
\el


\subsubsection{\Pg{glimmer3} Output Formats}

\smallskip
\noindent\textbf{\Pg{.detail} File}
\smallskip

The \Pg{.detail} file begins with the command that invoked the program and
a list of the parameters
used by the program.  Here is a sample:
\BSV\begin{verbatim}
Command:  /fs/szgenefinding/Glimmer3/bin/glimmer3 -o 50 -g 110 -t 30 -b iterated.motif -P 
0.603,0.338,0.059 tpall.fna iterated.icm iterated

Sequence file = tpall.fna
Number of sequences = 1
ICM model file = iterated.icm
Excluded regions file = none
List of orfs file = none
Input is NOT separate orfs
Independent (non-coding) scores are used
Circular genome = true
Truncated orfs = false
Minimum gene length = 110 bp
Maximum overlap bases = 50
Threshold score = 30
Use first start codon = false
Start codons = atg,gtg,ttg
Start probs = 0.603,0.338,0.059
Stop codons = taa,tag,tga
GC percentage = 52.8%
Ignore score on orfs longer than 799
\end{verbatim}\ESV

Following that, for each sequence in the input file the
fasta-header line is echoed and followed by a list of orfs
that were long enough for \Pg{glimmer3} to score.  Here is
a sample of the beginning of such a section:
\BSV\begin{verbatim}
>gi|15638995|ref|NC_000919.1| Treponema pallidum subsp. pallidum str. Nichols, complete ge
nome
Sequence length = 1138011

           ----- Start -----           --- Length ----  ------------- Scores -------------
 ID  Frame   of Orf  of Gene     Stop   of Orf of Gene      Raw InFrm F1 F2 F3 R1 R2 R3 NC
        +2       17       20      139      120     117    -4.94     0 99  0  -  0  -  -  0
        +2      140      242      361      219     117     0.99     0 87  0  - 12  -  -  0
        -1      435      417      148      285     267     5.48     2 97  -  -  2  -  -  0
        +2      668      668      790      120     120     2.89     0 99  0  -  -  -  -  0
        -3      899      839      717      180     120    -0.86     1 95  -  -  -  -  1  3
        -1      936      933      808      126     123     0.38    13 78  -  - 13  -  -  8
        -3     1124     1109      918      204     189    -1.32     0 99  -  -  -  -  0  0
0001    +1        4        4     1398     1392    1392     6.61    99 99  -  -  -  -  -  0
        -2     1750     1720     1457      291     261    -0.92     8  -  -  -  -  8  - 91
        -2     1957     1945     1751      204     192    -1.47     1  -  - 70  -  1  - 27
        -3     2078     2063     1908      168     153    -1.88     4  -  - 20  -  -  4 75
        -2     2308     2293     2174      132     117    -0.38     5  -  - 85  -  5  -  9
0002    +3     1542     1641     2756     1212    1113     3.20    99  -  - 99  -  -  -  0
        -3     2807     2774     2616      189     156    -2.08     3  0  -  -  -  -  3 96\end{verbatim}\ESV
Below is a description of the columns.  All positions are counted from the beginning of
the sequence with the first base being position~$1$.
\bl{\settowidth{\labelwidth}{Last Column}\leftmargin=\labelwidth \addtolength{\leftmargin}{1em}\labelsep=1em}\RaggedRight
\item[\Pg{ID}]
  An identification number for a potential gene.  Only orfs whose in-frame (\Pg{InFrm})
  score is above the threshold score (set by the \Pg{-t} option) or are longer
  than the ignore-score length have an entry
  in this column.

\item[\Pg{Frame}]
  The reading frame of the orf---positive for forward strand, negative for reverse strand.
  It is determined by the position of the leftmost base of the stop codon:
  \bn
  \exdent
    frame $+1$ if the stop begins in position $1,4,7,\ldots$;
  \exdent
    frame $+2$ if the stop begins in position $2,5,8,\ldots$; 
  \exdent
    frame $+3$ if the stop begins in position $3,6,9,\ldots$; 
  \exdent
    frame $-1$ if the stop begins in position $3,6,9,\ldots$ (so the leftmost base
    is position $1,4,7,\ldots$);
  \exdent
    frame $-2$ if the stop begins in position $4,7,10,\ldots$ (left base position
    $2,5,8,\ldots$);
  \exdent
    frame $-3$ if the stop begins in position $5,8,11,\ldots$ (left base position
    $3,6,9,\ldots$).
  \en
  Note that if the genome length is not a multiple of $3$, for genes that wrap
  around the end of the sequence the same rules applied
  to the start codon position will not yield the same reading frame.

\item[\Pg{Start}]
  The positions of the first base of the orf and the first base of the start codon of the
  gene.  Note that the gene start may be different for the same orf in the \Pg{.predict} file.

\item[\Pg{Stop}]
  Position of the last base of the stop codon.

\item[\Pg{Length}]
  Number of bases in the orf and in the gene.  It does \underline{\emph{NOT}}
  include the bases of the stop codon.

\item[\Pg{Raw} Score]
  This is 100 times the per-base log-odds ratio of the in-frame coding ICM score to the
  independent (\ie, non-coding) model score.  It gives a rough quantification of how
  well an orf scores that can be compared between any two orfs.
  
\item[\Pg{InFrm} Score]
  The normalized (to the range $ 0\ldots 99$) score of the gene in its
  reading frame.  This is just the appropriate-frame value among the next
  six scores.

\item[Frame Scores]
  The normalized (to the range $ 0\ldots 99$) score of the gene in each reading frame.
  A ``\Pg{-}'' indicates the presence of a stop codon in that reading frame.
  The normalization compares only scores without stop codons and the independent
  (non-coding) \Pg{NC} score.  If the orf is sufficiently long, \ie, longer than
  the value stated in ``\Pg{Ignore score on orfs longer than}\ldots'',
  the score is not used.

\item[\Pg{NC} Score]
  The normalized independent (\ie, non-coding or intergenic) model score.  This model
  is adjusted for the fact that the orf, by definition, has no in-frame stop codons.

\item[\Pg{EDR} Score]
  An additional column of scores is produced if the \Pg{-E}~option
  is specified.
  This is the entropy-distance ratio, \ie, the ratio of the distance
  of the amino-acid distribution from a positive model to the distance
  from a negative model.  Scores below $1.0$ are more likely to be genes;
  scores above $1.0$ less likely to be genes.
  It is not currently used in the scoring process.
\el

\smallskip
\noindent\textbf{\Pg{.predict} File}
\smallskip

This file has the final gene predictions.  Its format is the fasta-header
line of the sequence followed by one line per gene.  Here is a sample of the
beginning of such a file:
\BSV\begin{verbatim}
>gms:3447|cmr:632 chromosome 1 {Mycobacterium smegmatis MC2}
orf00001      499     1692  +1    13.14
orf00004     1721     2614  +2    14.20
orf00006     2624     3778  +2    10.35
orf00009     3775     4359  +1     9.34
\end{verbatim}\ESV
The columns are:
\bl{\settowidth{\labelwidth}{Column 1}\leftmargin=\labelwidth \addtolength{\leftmargin}{1em}\labelsep=1em}\RaggedRight
\item[Column 1]
  The identifier of the predicted gene.  The numeric portion matches the
  number in the \Pg{ID} column of the \Pg{.detail} file.

\item[Column 2]
  The start position of the gene.

\item[Column 3]
  The end position of the gene.  This is the last base of the stop codon, \ie,
  it includes the stop codon.

\item[Column 4]
  The reading frame.

\item[Column 5]
  The per-base ``raw'' score of the gene.  This is slightly different from the
  value in the \Pg{.detail} file, because it includes adjustments for the
  PWM and start-codon frequency.
\el

\subsection{\Pg{long-orfs} Program}

This program identifies long, non-overlapping open reading frames (orfs)
in a DNA sequence file.  These orfs are very likely to contain genes,
and can be used as a set of training sequences for the \Pg{build-icm}
program.  More specifically, among all orfs longer than a minimum length
$\ell$, those that do not overlap any others are output.  The start
codon used for each orf is the first possible one.  The program, by
default, automatically determines the value $\ell$ that maximizes the
number of orfs that are output.  With the \Pg{-t} option, the initial
set of candidate orfs also can be filtered using entropy distance, which
generally produces a larger, more accurate training set, particularly
for high-GC-content genomes.  Entropy distance is described in~\cite{med1}.

\subsubsection{\Pg{long-orfs} Parameters \& Options}
The format for invoking \,\Pg{long-orfs}\, is:
\bq
  \Pg{long-orfs}\, [\Desc{options}] \Desc{sequence} \Desc{output}
\eq
where \Desc{sequence} is the name of the file containing the DNA sequence
to be analyzed and \Desc{output} is the name of the output file of
coordinates.  \Desc{sequence} may contain only one sequence.
If \Desc{output} is ``\Pg{-}'', then the output is directed to
standard output.

Possible \Desc{options} are:
\bl{}\RaggedRight
\exdent
  \verb`-A` \Desc{codon-list} \enskip or \enskip \verb`--start_codons` \Desc{codon-list}

  Specify allowable start codons as a comma-separated list.
  Sample format:  \,\verb`-A atg,gtg`\,.
  The default start codons are \Pg{atg}, \Pg{gtg} and \Pg{ttg}.

\exdent
  \verb`-E` \Desc{filename} \enskip or \enskip \verb`--entropy` \Desc{filename}

  Read entropy profiles from \Desc{filename}.  The format is one header
  line, then 20 lines of 3 columns each, which is the format produced
  by the program \Pg{entropy-profile} with the \Pg{-b} option.
  The columns are amino acid,
  positive entropy, and negative entropy, respectively.  Rows must be in
  alphabetical order by amino acid code letter.

  The entropy profiles are used only if the \Pg{-t} option is specified.

\exdent
  \verb`-f` \enskip or \enskip \verb`--fixed`

  Do \underline{\emph{NOT}} automatically calculate the minimum gene
  length that maximizes the number or length of output regions, but
  instead use either the value specified by the \Pg{-g} option or
  else the default, which is 90.

\exdent
  \verb`-g` \Desc{n} \enskip or \enskip \verb`--min_len` \Desc{n}

  Set the minimum gene length to \Desc{n} nucleotides.  This does not include
  the bases in the stop codon.

\exdent
  \verb`-h` \enskip or \enskip \verb`--help`

  Print the usage message.

\exdent
  \verb`-i` \Desc{filename} \enskip or \enskip \verb`--ignore` \Desc{filename}

  \Desc{filename} specifies regions of bases that are off 
  limits, so that no bases within that area will be examined.
  The format for entries in this file is described above for
  the same option in the \Pg{glimmer3} program.

\exdent
  \verb`-l` \enskip or \enskip \verb`--linear`

  Assume a linear rather than circular genome, \ie, there will
  be no ``wraparound'' genes with part at the beginning of the sequence
  and the rest at the end of the sequence.

\exdent
  \verb`-L` \enskip or \enskip \verb`--length_opt`

  Find and use as the minimum gene length the value that maximizes the
  total \underline{\emph{length}} of non-overlapping genes, instead of
  the default behaviour, which is to maximize the total \underline{\emph{number}}
  of non-overlapping genes.

\exdent
  \verb`-n` \enskip or \enskip \verb`--no_header`

  Do not include the program-settings header information in the
  output file.  With this option, the output file will contain
  only the coordinates of the selected orfs.

\exdent
  \verb`-o` \Desc{n} \enskip or \enskip \verb`--max_olap` \Desc{n}

  Set the maximum overlap length to \Desc{n}.  Overlaps of this
  many or fewer bases between genes are not regarded as overlaps.

\exdent
  \verb`-t` \Desc{x} \enskip or \enskip \verb`--cutoff` \Desc{x}

  Only genes with an entropy distance score less than \Desc{x} will
  be considered.  This cutoff is made before any subsequent steps
  in the algorithm.

\exdent
  \verb`-w` \enskip or \enskip \verb`--without_stops`

  Do \underline{\emph{NOT}} include the stop codon in the region
  described by the output coordinates.  By default it is included.

\exdent
  \verb`-z` \Desc{n} \enskip or \enskip \verb`--trans_table` \Desc{n}

  Use Genbank translation table number \Desc{n} to specify stop codons.

\exdent
  \verb`-Z` \Desc{codon-list} \enskip or \enskip \verb`--stop_codons` \Desc{codon-list}

  Specify allowable stop codons as a comma-separated list.
  Sample format:  \verb`-Z tag,tga`.
  The default stop codons are \Pg{tag}, \Pg{tga} and \Pg{taa}.
\el

\subsection{Other Programs}

A number of other utility programs are included in the \Gthree{}
package.  For all of these programs, running the program with
the ``\Pg{-h}'' option will give a brief description of the
program usage and options.

\subsubsection{\Pg{anomaly} Program}
This program reads a genome sequence and list of gene coordinates
for it and reports genes with bad start codons, bad stop codons,
in-frame stop codons, or frame shifts.
\bq
  \Pg{anomaly}\, [\Desc{options}] \Desc{sequence} \Desc{coords}
\eq

\subsubsection{\Pg{build-fixed} Program}
This program builds a fixed-length interpolated context model
from a set of sequences.  The sequences must all be the same
length.  The model is actually an array of separate ICMs, one
for each position in the fixed-length sequences.
\bq
  \Pg{build-fixed}\, [\Desc{options}] \,\Pg{<}\,\Desc{sequence} \,\Pg{>}\,\Desc{output-model}
\eq

\subsubsection{\Pg{entropy-profile} Program}
This program reads a multi-fasta list of gene sequences and
determines the natural and entropy distributions of all
amino acid residues contained in them.
\bq
  \Pg{entropy-profile}\, [\Desc{options}] \,\Pg{<}\,\Desc{sequences}
\eq

\subsubsection{\Pg{entropy-score} Program}
This program reads a genome sequence and a list of gene coordinates
for it and computes the entropy distance ratio for each gene.
Output goes to standard output and is the same as the coordinate
input with the entropy ratio appended to each line.
\bq
  \Pg{entropy-score}\, [\Desc{options}] \Desc{sequence} \Desc{coords}
\eq

\subsubsection{\Pg{extract} Program}
This program reads a genome sequence and a list of coordinates
for it and outputs a multi-fasta file of the regions specified
by the coordinates.  Output goes to standard output.
\bq
  \Pg{extract}\, [\Desc{options}] \Desc{sequence} \Desc{coords}
\eq

\subsubsection{\Pg{multi-extract} Program}
This program is a multi-fasta version of the preceding program.
The only difference is that the input sequence file can be a
multi-fasta file, and accordingly, the coordinate file must have
an extra field (at the beginning) that specifies to which sequence
the coordinates refer.
\bq
  \Pg{multi-extract}\, [\Desc{options}] \Desc{sequences} \Desc{coords}
\eq

\subsubsection{\Pg{score-fixed} Program}
This program scores a set of fixed-length input sequences using
two fixed-length interpolated context models.  Output goes to
standard output.
\bq
  \Pg{score-fixed}\, [\Desc{options}] \Desc{pos-model} \Desc{neg-model} \,\Pg{<}\,\Desc{sequences}
\eq

\subsubsection{\Pg{start-codon-distrib} Program}
This program reads a genome sequence and list of coordinates
for it and outputs the frequencies of the start codons of the genes.
Output goes to standard output.
\bq
  \Pg{start-codon-distrib}\, [\Desc{options}] \Desc{sequence} \Desc{coords}
\eq

\subsubsection{\Pg{uncovered} Program}
This program reads a genome sequence and list of coordinates
for it and outputs a multi-fasta file containing the regions of the
sequences that are \underline{\emph{NOT}} contained in any of
the regions specified in the coordinates file.
Output goes to standard output.
\bq
  \Pg{uncovered}\, [\Desc{options}] \Desc{sequence} \Desc{coords}
\eq

\subsubsection{\Pg{window-acgt} Program}
This program finds the distribution of nucleotides in each of a
series of windows across a DNA sequence.  The command-line parameters
specify the width of the window and the distance between successive
windows.  The input sequence comes from standard input and the output
goes to standard output.
\bq
  \Pg{window-acgt}\, [\Desc{options}] \Desc{window-len} \Desc{window-skip} \,\Pg{<}\,\Desc{input-file}
\eq


\section{Versions}

\subsection{Version~3.01}
  \bi\RaggedRight
  \item
    Eliminated unused functions.
  \item
    Eliminated \Pg{-p} and \Pg{-w} options.
  \item
    Implemented the \Pg{-X} option allowing orfs extending off the
    end of a (non-circular) sequence to be scored.
  \item
    Changed the width of the PWM in the scripts from 5 to 6.
  \item
    Added the \Pg{g3-iterated} script to combine running \Gthree{} from
    scratch and using the output as a training set for a second run.
  \item
    Lowered default threshold score (\Pg{-t} option) in scripts.
  \ei

\subsection{Version~3.02}
  \bi\RaggedRight
  \item
    Correct error in handling ORFs that wrap around the start/end
    of circular sequences.
  \item
    Change the make system to work on Mac OSX.
  \item
    Implement the \Pg{-L} and \Pg{-M} options.
  \item
    Change the orf scoring not to score the start codon with the
    ICM or with the independent score model.
  \ei

\raggedright
\bibliographystyle{alpha}
\bibliography{notes}

\end{document}