File: murasaki.1

package info (click to toggle)
murasaki 1.68.6-6
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 1,928 kB
  • ctags: 3,100
  • sloc: cpp: 16,010; perl: 8,365; makefile: 186
file content (736 lines) | stat: -rw-r--r-- 31,011 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
.\" Automatically generated by Pod::Man 2.22 (Pod::Simple 3.07)
.\"
.\" Standard preamble:
.\" ========================================================================
.de Sp \" Vertical space (when we can't use .PP)
.if t .sp .5v
.if n .sp
..
.de Vb \" Begin verbatim text
.ft CW
.nf
.ne \\$1
..
.de Ve \" End verbatim text
.ft R
.fi
..
.\" Set up some character translations and predefined strings.  \*(-- will
.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
.\" double quote, and \*(R" will give a right double quote.  \*(C+ will
.\" give a nicer C++.  Capital omega is used to do unbreakable dashes and
.\" therefore won't be available.  \*(C` and \*(C' expand to `' in nroff,
.\" nothing in troff, for use with C<>.
.tr \(*W-
.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
.ie n \{\
.    ds -- \(*W-
.    ds PI pi
.    if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
.    if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\"  diablo 12 pitch
.    ds L" ""
.    ds R" ""
.    ds C` ""
.    ds C' ""
'br\}
.el\{\
.    ds -- \|\(em\|
.    ds PI \(*p
.    ds L" ``
.    ds R" ''
'br\}
.\"
.\" Escape single quotes in literal strings from groff's Unicode transform.
.ie \n(.g .ds Aq \(aq
.el       .ds Aq '
.\"
.\" If the F register is turned on, we'll generate index entries on stderr for
.\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index
.\" entries marked with X<> in POD.  Of course, you'll have to process the
.\" output yourself in some meaningful fashion.
.ie \nF \{\
.    de IX
.    tm Index:\\$1\t\\n%\t"\\$2"
..
.    nr % 0
.    rr F
.\}
.el \{\
.    de IX
..
.\}
.\"
.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
.\" Fear.  Run.  Save yourself.  No user-serviceable parts.
.    \" fudge factors for nroff and troff
.if n \{\
.    ds #H 0
.    ds #V .8m
.    ds #F .3m
.    ds #[ \f1
.    ds #] \fP
.\}
.if t \{\
.    ds #H ((1u-(\\\\n(.fu%2u))*.13m)
.    ds #V .6m
.    ds #F 0
.    ds #[ \&
.    ds #] \&
.\}
.    \" simple accents for nroff and troff
.if n \{\
.    ds ' \&
.    ds ` \&
.    ds ^ \&
.    ds , \&
.    ds ~ ~
.    ds /
.\}
.if t \{\
.    ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
.    ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
.    ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
.    ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
.    ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
.    ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
.\}
.    \" troff and (daisy-wheel) nroff accents
.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
.ds 8 \h'\*(#H'\(*b\h'-\*(#H'
.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
.ds ae a\h'-(\w'a'u*4/10)'e
.ds Ae A\h'-(\w'A'u*4/10)'E
.    \" corrections for vroff
.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
.    \" for low resolution devices (crt and lpr)
.if \n(.H>23 .if \n(.V>19 \
\{\
.    ds : e
.    ds 8 ss
.    ds o a
.    ds d- d\h'-1'\(ga
.    ds D- D\h'-1'\(hy
.    ds th \o'bp'
.    ds Th \o'LP'
.    ds ae ae
.    ds Ae AE
.\}
.rm #[ #] #H #V #F C
.\" ========================================================================
.\"
.IX Title "MURASAKI 1"
.TH MURASAKI 1 "2010-05-31" "perl v5.10.1" "User Contributed Perl Documentation"
.\" For nroff, turn off justification.  Always turn off hyphenation; it makes
.\" way too many mistakes in technical documents.
.if n .ad l
.nh
.SH "NAME"
murasaki \- compute anchors between multiple sequences
.SH "SYNOPSIS"
.IX Header "SYNOPSIS"
.Vb 2
\& murasaki [OPTIONS] \-p[pattern] seq1.fa seq2.gbk [seq3.raw ...] #compute anchors between seq1.fa and seq2.gbk using [pattern]
\& mpirun murasaki [OPTIONS] \-p[pattern] seq1.fa seq2.gbk [seq3.raw ...] #compute anchors between seq1.fa and seq2.gbk using [pattern] in parallel via MPI
.Ve
.SH "DESCRIPTION"
.IX Header "DESCRIPTION"
Murasaki generates anchors based on all supplied sequences based on
the user supplied \fBpattern\fR and hash tables. Essentially each base of
each sequence is masked by the pattern, forming a \fBseed\fR that is
used to generate a hash. The location of the seed is stored in the
\&\fBhash table\fR. Once all seeds have been hashed and stored, Murasaki
scans the hash table, generating anchors for all matching seeds. An
anchor refers to a set of intervals across a subset of the input
sequences. These are stored in \fBname\fR.anchors files, and described in
\&\*(L"\s-1FILE\s0 \s-1FORMATS\s0\*(R". By default anchors are maximally extended until their
minimum pairwise ungapped alignment score drops below a threshold in
the same fashion as the X\-drop parameter in \s-1BLAST\s0 and BLAST-like
searches.
.SS "\s-1PATTERNS\s0"
.IX Subsection "PATTERNS"
Murasaki uses \fBspaced seed patterns\fR in considering seeds. A
\&\fBspaced seed pattern\fR is typically expressed as a string of 1s and 0s
necessarily starting and ending with a 1. 1s indicate that this base
is considered part of the seed, while bases at 0 positions are
not. For example with a pattern \*(L"1011\*(R" the sequence \*(L"\s-1ACGT\s0\*(R" would match
sequences \*(L"\s-1AGGT\s0\*(R" and \*(L"\s-1ATGT\s0\*(R" but not \*(L"\s-1ACTT\s0\*(R". The number of 1s in the
pattern is known as the \*(L"weight\*(R" of the pattern, and the number of 1s
and 0s combined is the \*(L"length\*(R" of the pattern. Murasaki allows the
use of any arbitrary pattern expressed as a string of 1s and 0s, and
also interprets patterns of the form \*(L"x:y\*(R" to mean a "random pattern
of weight \fIx\fR and length \fIy\fR."
.PP
The choice of pattern obviously has an impact on sensitivity and
specificity, but whether one pattern is \*(L"better\*(R" than another depends
on the application and the input sequences under
consideration. Calculating \*(L"maximally sensitive spaced seed patterns\*(R"
is a computationally difficult problem and there are a number of
research papers describing various methods for approximation
(\*(L"\s-1RELATED\s0 \s-1READING\s0\*(R"). In general, however, \*(L"heavier\*(R" spaced seed
patterns are less sensitive, but more specific, than lighter
seeds. Anecdotally we find that seeds with weights approximately 60%
to 75% (with lengths around 24 for bacteria, and 36 to 48 for mammals)
are good for most applications. Extremely similar species (for example
human and chimp) benefit from longer, heavier, seeds.
.SS "\s-1HASH\s0 \s-1FUNCTIONS\s0"
.IX Subsection "HASH FUNCTIONS"
Hash functions (as well as hash parameters) are generated
automatically based on the system environment and input sequences. There
are essentially two types of hash functions available in Murasaki:
adaptive and cryptographic hashes. The adaptive hashes are \s-1XOR\s0
combinations of various bitwise shifts of the seed designed by
analyzing the \fBspaced seed pattern\fR to maximize the entropy of the
resulting hash. Cryptographic hashes are available via the CryptoPP
library and use the \fIentire\fR spaced seed pattern to generate a hash
using one of the common cryptographic hashes like \s-1MD5\s0 or \s-1SHA\-1\s0. The
adaptive hash functions are almost always faster and more efficient
than \s-1MD5\s0 and \s-1SHA\-1\s0, but the cryptographic functions are available for
reference and may be useful as an alternative in the unlikely event
you're dealing with an environment where the adaptive hasher is
unsuitable (for example a sequence consisting of only A and T (leaving
1 out of every 2 bits unutilized)).
.SS "\s-1MEMORY\s0 \s-1SCALING\s0"
.IX Subsection "MEMORY SCALING"
Murasaki can take a lot of memory. Storing the location of each seed
in the hash table is the most costly part of the operation, requiring
approximately \f(CW\*(C`ceil(log_2(N))\*(C'\fR bits per seed where \f(CW\*(C`N\*(C'\fR is the total
sequence length. Locations are, by default, stored in a bitpacked
format to approach theoretical minimum. The second most costly element
is the hash table structure, where each bucket carries a small
overhead and unused buckets are simply wasted space. More hash table buckets
(i.e. a longer hash table) decreases the expected number of
collisions, leading to faster execution time. Therefore Murasaki
tries to use as many buckets as possible by inspecting the available
system memory and using as much as it can while still storing all the
seed locations. If this automatic scaling is ineffective, setting the
hash table size directly via the \-\-hashbits|\-b options can force a
specific hash table size. If the memory of one computer is
insufficient to store the desired hash table, \s-1PARALLELIZATION\s0 can
be used to distribute the hash table across multiple computers.
.SS "\s-1PARALLELIZATION\s0"
.IX Subsection "PARALLELIZATION"
Murasaki is designed to run in parallel using \s-1MPI\s0. Consult the
documentation for the specific variations of your \s-1MPI\s0 implementation,
however in general the execution method looks like:
.PP
.Vb 1
\& mpirun [MPI options] murasaki [murasaki options] \-p[pattern] [seq1 ...]
.Ve
.PP
Murasaki in parallel divides the number of processors available (\s-1NP\s0)
into two groups: hasher nodes and storage nodes. The storage nodes
divide the hash table between themselves, each being responsible
for a different part of the table. Hasher nodes divide the input
sequence between themselves, each hashing a separate portion of the
input sequence, and passing the seed location to the appropriate
storage node for storage.  When all the hasher nodes are finished
hashing, the storage nodes scan their portion of hash table and pass
matching sets of seeds to a hasher node where they are assembled into
anchors and extended. Finally all the hasher nodes combine their
independent anchor sets into one final set in \f(CW\*(C`ceil(log_2(H))\*(C'\fR
iterations (where \f(CW\*(C`H\*(C'\fR is the number of hasher nodes), with each
hasher node number 2h passing its anchors to hasher number 2h\-1 at
each iteration.
.PP
Because almost none of the parallelization steps require communication
between \fIall\fR nodes, and each seed and each anchor can be processed
in parallel, Murasaki scales very well in parallel, running
approximately twice as fast when twice as many nodes are
available. Furthermore, the hash table is automatically grown to take
advantage of the combined memory from multiple machines.
.SH "OPTIONS"
.IX Header "OPTIONS"
Most options can be specified in their long form (e.g. \*(L"\-\-directory
out\*(R" or \*(L"\-\-directory=out\*(R") or short form (e.g. \*(L"\-dout\*(R"). Options
marked by <S> expect a string, <D> an integer, <F> a float, and <B> a
boolean value (\*(L"yes/on/true/1\*(R" for true, \*(L"no/off/false/0\*(R" for
false). Most booleans can omit the value, toggling the value from
whatever it was to the opposite.
.PP
Murasaki has a lot of options. Here we've separated them into
categories to help distinguish the scope of the various options,
however in certain situations certain option choices may have
unforeseen consequences, and of course ultimately if the specified
output is \fIhuge\fR, the required runtime will necessarily be \fIlong\fR.
It is a mistake to think that everything outside of the
\&\*(L"tuning options\*(R" in the Performance section has no bearing on performance.
.SS "Anchor parameter related options"
.IX Subsection "Anchor parameter related options"
These options shape what is considered an \*(L"anchor\*(R".
.IP "\-\-pattern|\-p <S>" 4
.IX Item "--pattern|-p <S>"
.Vb 3
\& specifies the seed pattern (eg. 11101001010011011). using the format
\& C<[<w>:<l>]> automatically generates a random pattern of weight <w>
\& and length <l>
.Ve
.IP "\-\-repeatmask|\-r <B>" 4
.IX Item "--repeatmask|-r <B>"
Skip repeat masked data (ie: lowercase atgc). Be aware that some sequence files
are distributed purely in lower case.
.IP "\-\-seedfilter|\-f <D>" 4
.IX Item "--seedfilter|-f <D>"
Skip seeds that occur more than N times. Exceptionally slow. See
\&\-\-hashfilter for a faster approximation.
.IP "\-\-hashfilter|\-m <D>" 4
.IX Item "--hashfilter|-m <D>"
Like \-\-seedfilter but works on hash keys instead of seeds. May cause
some collateral damage to otherwise unique seeds, but it's
faster.
.IP "\-\-skipfwd|\-F <B>" 4
.IX Item "--skipfwd|-F <B>"
Don't hash/match the forward strands.
.IP "\-\-skiprev|\-R <B>" 4
.IX Item "--skiprev|-R <B>"
Don't hash/match the reverse complement strands.
.IP "\-\-skip1to1|\-1 <B>" 4
.IX Item "--skip1to1|-1 <B>"
Skip matches along the 1:1 line (good for comparing to self).
.IP "\-\-hashonly|\-Q <B>" 4
.IX Item "--hashonly|-Q <B>"
Hash Only. No anchor output, just statistics.
.IP "\-\-hashskip|\-S <D>" 4
.IX Item "--hashskip|-S <D>"
Hashes every n bases. Default is 1 (i.e. hashing all positions). Not
supplying any argument increments the skip amount by 1.
.IP "\-\-join|\-j <D>" 4
.IX Item "--join|-j <D>"
Join anchors within n bases of each other (default: 0). Specifying a negative D implies \-D*patternLength.
.IP "\-\-bitscore|\-B <B>" 4
.IX Item "--bitscore|-B <B>"
toggles computation of a bitscore for all anchors (default is on).
.IP "\-\-seedterms|\-T <B>" 4
.IX Item "--seedterms|-T <B>"
toggles retention of seed terms (defaults to off). These are necessary
for computing TF-IDF scores.
.IP "\-\-sectime|\-e <B>" 4
.IX Item "--sectime|-e <B>"
Always display times in seconds as opposed to human readable \*(L"1d 3h 45m 5s\*(R" style times.
.IP "\-\-mergefilter|\-Y <D>" 4
.IX Item "--mergefilter|-Y <D>"
Filter out matches which would cause more than \fID\fR many anchors
to be generated from 1 seed (default \-Y100).  Use \-Y0 to disable.
.IP "\-\-scorefilter <D>" 4
.IX Item "--scorefilter <D>"
Set a minimum ungapped score for seeds.
.IP "\-\-rifts|\-/ <D>" 4
.IX Item "--rifts|-/ <D>"
Allow anchors to skip D sequences (default 0).
.IP "\-\-islands|\-% <D>" 4
.IX Item "--islands|-% <D>"
Same as \-\-rifts=S\-D (where S is number of input seqs).
.IP "\-\-fuzzyextend|\-z <B>" 4
.IX Item "--fuzzyextend|-z <B>"
Enable (default) or disable fuzzy extension (i.e. ungapped alignment)
of anchors.
.IP "\-\-fuzzyextendlosslimit|\-Z <D>" 4
.IX Item "--fuzzyextendlosslimit|-Z <D>"
Set the cutoff at which to stop extending fuzzy hits (ie. the \s-1BLAST\s0 X
parameter).
.IP "\-\-gappedanchors <B>" 4
.IX Item "--gappedanchors <B>"
Use gapped (true) or ungapped (false (default)) anchors.
.IP "\-\-scorebyminimumpair <B>" 4
.IX Item "--scorebyminimumpair <B>"
Do anchor scoring by minimum pair when appropriate
(default). Alternative is arithmetic mean (seldom useful, but
theoretically faster).
.IP "\-\-rifts|\-/ <D>" 4
.IX Item "--rifts|-/ <D>"
Allow anchors to skip D sequences (default 0).
.IP "\-\-islands|\-% <D>" 4
.IX Item "--islands|-% <D>"
Same as \-\-rifts=S\-D (where S is number of input seqs).
.IP "\-\-fuzzyextend|\-z <B>" 4
.IX Item "--fuzzyextend|-z <B>"
Enable (default) or disable fuzzy extension (i.e. ungapped alignment)
of anchors.
.IP "\-\-fuzzyextendlosslimit|\-Z <D>" 4
.IX Item "--fuzzyextendlosslimit|-Z <D>"
Set the cutoff at which to stop extending fuzzy hits (ie. the \s-1BLAST\s0 X
parameter).
.IP "\-\-gappedanchors <B>" 4
.IX Item "--gappedanchors <B>"
Use gapped (true) or ungapped (false (default)) anchors.
.IP "\-\-scorebyminimumpair <B>" 4
.IX Item "--scorebyminimumpair <B>"
Do anchor scoring by minimum pair when appropriate
(default). Alternative is arithmetic mean (seldom useful, but
theoretically faster).
.SS "Output options"
.IX Subsection "Output options"
These options primarily affect what data is output where.
.IP "\-\-directory|\-d <S>" 4
.IX Item "--directory|-d <S>"
.Vb 1
\& output directory (default: output)
.Ve
.IP "\-\-name|\-n <S>" 4
.IX Item "--name|-n <S>"
.Vb 1
\& alignment name (default: test)
.Ve
.IP "\-\-repeatmap|\-i <B>" 4
.IX Item "--repeatmap|-i <B>"
Toggles keeping of a repeat map when \-\-mergefilter is used (defaults to yes).
.IP "\-\-histogram|\-H <D>" 4
.IX Item "--histogram|-H <D>"
Histogram computation level: (\-H alone implies \-H1)
.RS 4
.IP "0 \- no histogram (default)" 4
.IX Item "0 - no histogram (default)"
.PD 0
.IP "1 \- basic bucketsize/bucketcount histogram data" 4
.IX Item "1 - basic bucketsize/bucketcount histogram data"
.IP "2 \- bucket-based scores to anchors.details" 4
.IX Item "2 - bucket-based scores to anchors.details"
.IP "3 \- perbucket count data" 4
.IX Item "3 - perbucket count data"
.IP "4 \- perbucket + perpattern count data" 4
.IX Item "4 - perbucket + perpattern count data"
.RE
.RS 4
.PD
.Sp
Any values above 2 are purely exploratory and can result in massive
output files.
.RE
.IP "\-\-tfidf|\-k <B>" 4
.IX Item "--tfidf|-k <B>"
Perform accurate tfidf scoring from within murasaki (requires extra
memory at anchor generation time). Default is no.
.SS "Performance/tuning options"
.IX Subsection "Performance/tuning options"
These options primarily affect performance, and don't (in general)
impact output.
.IP "\-\-quickhash|\-q <D>" 4
.IX Item "--quickhash|-q <D>"
.Vb 1
\& specify a hashing function:
.Ve
.RS 4
.IP "0 \- adaptive with S\-boxes (default when there's plenty of hash table to spare)" 4
.IX Item "0 - adaptive with S-boxes (default when there's plenty of hash table to spare)"
.PD 0
.IP "1 \- don't pack bits to make hash (use first word only)" 4
.IX Item "1 - don't pack bits to make hash (use first word only)"
.IP "2 \- naively use the first hashbits worth of pattern" 4
.IX Item "2 - naively use the first hashbits worth of pattern"
.IP "3 \- adaptively find a good hash (default)" 4
.IX Item "3 - adaptively find a good hash (default)"
.IP "**experimental CryptoPP hashes**" 4
.IX Item "**experimental CryptoPP hashes**"
.IP "4 \- \s-1MD5\s0" 4
.IX Item "4 - MD5"
.IP "5 \- \s-1SHA1\s0" 4
.IX Item "5 - SHA1"
.IP "6 \- Whirlpool" 4
.IX Item "6 - Whirlpool"
.IP "7 \- \s-1CRC\-32\s0" 4
.IX Item "7 - CRC-32"
.IP "8 \- Adler\-32" 4
.IX Item "8 - Adler-32"
.RE
.RS 4
.PD
.Sp
Note: 3 and 0 are the only \*(L"recommended\*(R" hash functions, and the only
ones automatically selected. The others are provided merely for
reference. 1, 7, and 8 aren't even expected to utilize the entire hash
space.
.RE
.IP "\-\-hashbits|\-b <D>" 4
.IX Item "--hashbits|-b <D>"
use D bit hashes (for D from 1 to \s-1WORDSIZE\s0; default 26)
.IP "\-\-hashtype|\-t <S>" 4
.IX Item "--hashtype|-t <S>"
select hash table data structure to use:
.RS 4
.IP "OpenHash  \- open sub-word packing of hashbits (default when there's plenty of hash table to spare)" 4
.IX Item "OpenHash  - open sub-word packing of hashbits (default when there's plenty of hash table to spare)"
.PD 0
.IP "EcoHash   \- chained sub-word packing of hashbits (default)" 4
.IX Item "EcoHash   - chained sub-word packing of hashbits (default)"
.IP "ArrayHash \- malloc/realloc (fast but fragmentation-prone)" 4
.IX Item "ArrayHash - malloc/realloc (fast but fragmentation-prone)"
.IP "MSetHash  \- memory exorbitant, almost pointless." 4
.IX Item "MSetHash  - memory exorbitant, almost pointless."
.RE
.RS 4
.RE
.IP "\-\-probing <D>" 4
.IX Item "--probing <D>"
.PD
0 \- linear, 1 \- quadratic (default). Only applicable for \-\-hashtype=OpenHash.
.IP "\-\-hitfilter|\-h <D>" 4
.IX Item "--hitfilter|-h <D>"
Minimum number of hits to be outputted as an anchor (default 1). In
PatternHunter this is 2.
.IP "\-\-rseed|\-s <D>" 4
.IX Item "--rseed|-s <D>"
Random number seed for non-deterministic algorithms (ie: adaptive hash
function generation). If you're doing any performance comparisons,
it's probably imperative that you use the same seed for each run of
the same settings.  Default is obtained from \fItime()\fR (ie: seconds since
1970).
.IP "\-\-memory|\-M [<F>|<S>]" 4
.IX Item "--memory|-M [<F>|<S>]"
Set the target amount of total memory (either in gb or as % total memory).
.IP "\-\-reverseotf|\-o <B>" 4
.IX Item "--reverseotf|-o <B>"
Generate reverse complement on the fly (defaults to on). Turning this
off precomputes all the reverse complement strands and stores them in
memory, which rarely provides a measurable performance improvement.
.IP "\-\-binaryseq <B>" 4
.IX Item "--binaryseq <B>"
Enable (default) or disable binary sequence read/write
.PP
\fIAdaptive hash function related:\fR
.IX Subsection "Adaptive hash function related:"
.PP
Performance options related to adaptive hash function generation.
.IP "\-\-hasherFairEntropy <B>" 4
.IX Item "--hasherFairEntropy <B>"
Use more balanced entropy estimation (default: yes).
.IP "\-\-hasherCorrelationAdjust <B>" 4
.IX Item "--hasherCorrelationAdjust <B>"
Adjust entropy estimates for nearby sources assuming some correlation
(default: yes).
.IP "\-\-hasherTargetGACycles <D>" 4
.IX Item "--hasherTargetGACycles <D>"
Adaptive hash function generation genetic algorithm cycle cutoff.
.IP "\-\-hasherEntropyAgro <F>" 4
.IX Item "--hasherEntropyAgro <F>"
How aggressive to be about pursuing maximum entropy hash functions
(takes a real. default is 1).
.SS "\s-1MPI\s0 Specific:"
.IX Subsection "MPI Specific:"
.IP "\-\-hashers|\-A [<F>|<D>]" 4
.IX Item "--hashers|-A [<F>|<D>]"
Specify the number of processes to be used as hashers (only applies to \s-1MPI\s0. If a number between 0 and 1 it refers to a ratio of np).
.IP "\-\-localhash|\-K <B>" 4
.IX Item "--localhash|-K <B>"
Perform hashing locally on each storage node rather than sending it
over the network (helpful for slow networks).
.IP "\-\-mpidistro|\-L <B>" 4
.IX Item "--mpidistro|-L <B>"
Toggles use of \s-1MPI\s0 to distribute sequence data over (if the sequence
is available on local disk on each node then turning this off may
potentially accelerate the initial sequence loading).
.IP "\-\-waittoanchor|\-w <B>" 4
.IX Item "--waittoanchor|-w <B>"
Postpone actual anchor computation until all location sets have been
received (as opposed to trying to work between receiving seed
packets).
.IP "\-\-buffers|\-u <D>" 4
.IX Item "--buffers|-u <D>"
Maximum number of unfinished buffers to allow while message passing (0
means unlimited). Default is set based on the number of nodes
participating. \s-1MPI\s0 can crash or perform \fIvery\fR poorly if this value
is too high.
.IP "\-\-nobuffers|\-U <B>" 4
.IX Item "--nobuffers|-U <B>"
Same as \-\-buffers=1.
.IP "\-\-bigfirst|\-I <B>" 4
.IX Item "--bigfirst|-I <B>"
Assign hashers to large memory nodes first.
.IP "\-\-hostbalance|\-l <B>" 4
.IX Item "--hostbalance|-l <B>"
.RS 4
.PD 0
.IP "If yes (default): spread out hashers evenly among all nodes." 4
.IX Item "If yes (default): spread out hashers evenly among all nodes."
.IP "If no: ignore host name when assigning jobs." 4
.IX Item "If no: ignore host name when assigning jobs."
.RE
.RS 4
.RE
.IP "\-\-memorybalance|\-a <B>" 4
.IX Item "--memorybalance|-a <B>"
.RS 4
.IP "If yes (default): balance hash storage between nodes based on the amount of available ram." 4
.IX Item "If yes (default): balance hash storage between nodes based on the amount of available ram."
.IP "If no: distribute storage evenly. This is more likely to achieve optimal run times, but might not utilize memory as efficiently." 4
.IX Item "If no: distribute storage evenly. This is more likely to achieve optimal run times, but might not utilize memory as efficiently."
.RE
.RS 4
.RE
.IP "\-\-distmerge|\-< <B>" 4
.IX Item "--distmerge|-< <B>"
.RS 4
.IP "if yes (default): during the merge step, storage nodes send seeds to any available hasher." 4
.IX Item "if yes (default): during the merge step, storage nodes send seeds to any available hasher."
.IP "if no: send all seeds to one node only." 4
.IX Item "if no: send all seeds to one node only."
.RE
.RS 4
.RE
.IP "\-\-distcollect|\-> <B>" 4
.IX Item "--distcollect|-> <B>"
.RS 4
.IP "if yes (default): collect anchor data from all hashers." 4
.IX Item "if yes (default): collect anchor data from all hashers."
.IP "if no: send all seeds to the final assembly node only." 4
.IX Item "if no: send all seeds to the final assembly node only."
.RE
.RS 4
.RE
.IP "\-\-mpiredirectoutput <B>" 4
.IX Item "--mpiredirectoutput <B>"
.RS 4
.IP "if yes (default): each rank redirects its stdout/stderr to a separate file (murasaki\-mpiout\-\fIN\fR)." 4
.IX Item "if yes (default): each rank redirects its stdout/stderr to a separate file (murasaki-mpiout-N)."
.IP "if no: do what comes naturally (ie: managed by mpirun (for OpenMPI see \-\-output\-filename and \-\-tag\-output in \fImpirun\fR\|(1)))." 4
.IX Item "if no: do what comes naturally (ie: managed by mpirun (for OpenMPI see --output-filename and --tag-output in mpirun))."
.RE
.RS 4
.RE
.IP "\-\-keepstdoe <B>" 4
.IX Item "--keepstdoe <B>"
.PD
Don't erase the murasaki-mpiout files on success.
.IP "\-\-sysvipc|\-V <B>" 4
.IX Item "--sysvipc|-V <B>"
Use System V \s-1IPC\s0 to negotiate shared memory regions (saves memory when
one host runs multiple nodes). Default is true.
.SS "Universal options:"
.IX Subsection "Universal options:"
.IP "\-\-verbose|\-v" 4
.IX Item "--verbose|-v"
Increases verbosity.
.IP "\-\-version|\-V" 4
.IX Item "--version|-V"
Prints version information and quits.
.IP "\-\-help|\-?" 4
.IX Item "--help|-?"
Prints a help message and quits.
.SH "FILE FORMATS"
.IX Header "FILE FORMATS"
Murasaki has a wide array of output files, the formats of most of
which are intended to be intuitive. All output files are prefixed by
the value of the \-\-name parameter. The primary output file formats are
described here. Files are line based and tab delimited unless
otherwise specified.
.SS ".seqs"
.IX Subsection ".seqs"
The .seqs shows what sequences were used as input, 1 per line. This
file gets used by various programs in conjunction with the .anchors
file, so it's generally important that the contents reflect the
correct sequence files. Moving anchor results between computers might
result in a change of paths, requiring the user to update the .seqs
file. As an alternative, always using relative paths can alleviate
this problem.
.SS ".anchors files"
.IX Subsection ".anchors files"
These files are 1 anchor per line, with a 3\-tuple per sequence. Each
tuple represents the start and stop coordinates and strand of the
anchored interval on each sequence. The sequence order matches that of
the order in the .seqs file. The coordinates are structured such that
1 refers to the first base in the sequence, 2 to the second,
etc. Negative values refer to the reverse complement sequence where \-1
is the \fIlast\fR base of the reverse complement sequence (ie: the
complement of the first base in the forward sequence). The \*(L"strand\*(R" element
is a '+' or '\-' that merely matches the sign of the coordinates (this
is redundant information, but kept to make parsing or filtering
simpler).
.PP
For example:
.PP
.Vb 1
\& 1       18     +       \-1      \-18       \-
.Ve
.PP
This line describes an anchor where the first 18 bases of the first
sequence match the first 18 bases of the reverse complement of the
second sequence.
.SS ".anchors.details"
.IX Subsection ".anchors.details"
This is an antiquated file format, but used by \s-1GMV\s0 to calculate
statistics like TF-IDF scores, and has been kept around for that
reason. The .anchors.details file has the same format and information
as the .anchors file, however after the anchor tuples are two more
terms: a score, and a comma (,) delimited list of term and count pairs
(written \*(L"term:count\*(R"). The score and count data might be varied
depending on the \f(CW\*(C`\-\-histogram\*(C'\fR option choices.
.SS ".anchors.bitscore"
.IX Subsection ".anchors.bitscore"
The term \*(L"bitscore\*(R" here is a misnomer, but maintained for historical
reasons. In reality, this file contains the mean number of matching
bases and length of each anchor (corresponding line by line to the
\&.anchors file).
.SS ".stats.tfidf"
.IX Subsection ".stats.tfidf"
Contains anchor TF-IDF scores (corresponding line by line to the
\&.anchors file).
.SS ".histogram"
.IX Subsection ".histogram"
Contains a simple histogram of the hash table usage. The first field
is the bucket size, and the second is the frequency. For example a .histogram file like this:
.PP
.Vb 2
\& 1  24
\& 2  1
.Ve
.PP
Would indicate that there were 24 hash buckets that stored only 1
location (i.e. 24 unique seeds), and 1 hash bucket stored 2 locations
(i.e. 1 seed that matched 2 locations (or 2 non-matching seeds that
resulted in a hash collision)).
.SS ".options"
.IX Subsection ".options"
Maintains a record of the options used when running Murasaki.
.SS ".repeats"
.IX Subsection ".repeats"
The .repeats file stores a record of \*(L"repeats\*(R" as defined by the
\&\-\-mergefilter option (i.e. seeds that would have induced more
anchors than permitted). In this file, each repeat record is separated
by a blank line. A repeat record looks like this:
.PP
.Vb 3
\& R: G.GCCTTT.T.ACT.CACAA..AT
\& 0: 2145540494 \-425039256 \-113794380 1998323403
\& 1: 2480929222 \-1874514626 2543723555 \-2550045172
.Ve
.PP
The first line (always prefixed \*(L"R:\*(R") shows the repeating seed itself
(where the . are the bases masked by the pattern). The subsequent
lines show where these seeds occurred in the input sequences (in the
first (0) and second (1) sequences). Note that if there are no hits in
a particular sequence, it doesn't include a blank line for that sequence. For example:
.PP
.Vb 3
\& R: G.GCCTTT.T.ACT.CACAA..AT
\& 0: 2145540494 \-425039256 \-113794380 1998323403
\& 2: 2480929222 \-1874514626 2543723555 \-2550045172
.Ve
.PP
is also a valid .repeats file.
.SH "LICENSE"
.IX Header "LICENSE"
\&\s-1GNU\s0 General Public License, version 3 (GPLv3)
.SH "AVAILABILITY"
.IX Header "AVAILABILITY"
<http://murasaki.sourceforge.net>
.SH "AUTHOR"
.IX Header "AUTHOR"
Kris Popendorf <krisp@dna.bio.keio.ac.jp>
.SH "SEE ALSO"
.IX Header "SEE ALSO"
\&\fImbfa\fR\|(1), \fIgeneparse\fR\|(1)
.SS "\s-1RELATED\s0 \s-1READING\s0"
.IX Subsection "RELATED READING"
.ie n .IP "M. Csuros and B. Ma, ""Rapid Homology Search with Two-Stage Extension and Daughter Seeds"" (2005)." 4
.el .IP "M. Csuros and B. Ma, ``Rapid Homology Search with Two-Stage Extension and Daughter Seeds'' (2005)." 4
.IX Item "M. Csuros and B. Ma, Rapid Homology Search with Two-Stage Extension and Daughter Seeds (2005)."
.PD 0
.ie n .IP "F. P. Preparata and L. Zhang and K. W. Choi, ""Quick, practical selection of effective seeds for homology search"" (2005)." 4
.el .IP "F. P. Preparata and L. Zhang and K. W. Choi, ``Quick, practical selection of effective seeds for homology search'' (2005)." 4
.IX Item "F. P. Preparata and L. Zhang and K. W. Choi, Quick, practical selection of effective seeds for homology search (2005)."
.ie n .IP "\s-1KP\s0 Choi, et al., ""Good spaced seeds for homology search"" (2004)." 4
.el .IP "\s-1KP\s0 Choi, et al., ``Good spaced seeds for homology search'' (2004)." 4
.IX Item "KP Choi, et al., Good spaced seeds for homology search (2004)."