File: gen-inclusions.py

package info (click to toggle)
hmmer 3.3.2%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, bullseye
  • size: 35,432 kB
  • sloc: ansic: 129,659; perl: 10,152; sh: 3,331; makefile: 2,027; python: 1,007
file content (672 lines) | stat: -rwxr-xr-x 34,114 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
#! /usr/bin/env python

# gen-inclusions.py : generate .tex inclusions for user guide
#
#   usage: gen-inclusions.py
#      or: gen-inclusions.py <top_builddir> <top_srcdir>
#
# Runs the tutorial examples in the user guide and extracts output
# snippets for inclusion in .tex files. This saves having to manually
# update the user guide to be consistent with small changes to outputs
# (including version numbers and such).
#
# Each example is run in the current working directory, with input
# and output files in the cwd, some of them symlinked from
# `<top_srcdir>/tutorial`.
#
# Intended to be run infrequently by developers, not users, to update
# the user guide source files for a new release. The resulting .tex
# snippets are checked into git. It can assume that it's working on a
# development machine in a git repo, not in distribution code in the
# field.
#
# Script looks for HMMER3 executables in `<top_builddir>/src`, and
# tutorial files in `<top_srcdir>/tutorial`. The top_builddir and
# top_srcdir path prefixes default to `../../..`, because it is
# normally run in `documentation/userguide/inclusions`.
# 
# If provided, <top_builddir> and <top_srcdir> can be either relative
# or absolute paths.
#
# Options:
#    -t, --tutupdate : update tutorial files from Pfam, UniProt
#        --noclean   : don't clean up the files it generates


import sys
import os
import shutil
import subprocess
import re
import argparse

# Parse command line options and arguments
#
parser = argparse.ArgumentParser(description='generate .tex inclusions for user guide')
parser.add_argument('-t', '--tutupdate', action='store_true')
parser.add_argument(      '--noclean',   action='store_true')
parser.add_argument('top_builddir', nargs='?', default='../../..')
parser.add_argument('top_srcdir',   nargs='?', default='../../..')
args         = parser.parse_args()
top_builddir = os.path.abspath(args.top_builddir)   # absolute to top of build tree e.g.  '/home/seddy/hmmer3/build-osx'
top_srcdir   = args.top_srcdir                      # relative to top of source, e.g.  '../../..'

# Check that <top_builddir>/src seems to contain compiled programs.
#
if (not os.path.exists('{0}/src/hmmscan'.format(top_builddir))) or (not os.path.exists('{0}/src/hmmsearch'.format(top_builddir))):
    sys.exit('no programs found? ./configure; make first')


print('gen-inclusions.py : a fiddly script to generate .tex inclusions for user guide')
print('cross your fingers...')


# optionally, update tutorial files that come from Pfam, UniProt.
# don't update MADE1.sto from Dfam. It's a derivative, a 100-seq subset of the 2000-seq Dfam alignment.
if args.tutupdate:
    subprocess.run('wget -O fn3.sto      https://pfam.xfam.org/family/fn3/alignment/seed',        shell=True, cwd='{0}/tutorial'.format(top_srcdir))
    subprocess.run('wget -O Pkinase.sto  https://pfam.xfam.org/family/Pkinase/alignment/seed',    shell=True, cwd='{0}/tutorial'.format(top_srcdir))
    subprocess.run('wget -O 7LESS_DROME  http://www.uniprot.org/uniprot/P13368.txt',              shell=True, cwd='{0}/tutorial'.format(top_srcdir))



# make symlinks to tutorial files in cwd (i.e. in inclusions/ in the source directory)
# exception: globins4.sto is included verbatim in userguide, so copy it.
#
tutorial_files = [ '7LESS_DROME', 'HBB_HUMAN', 'MADE1.sto', 'Pkinase.sto', 'dna_target.fa', 'fn3.sto', 'globins45.fa' ]
for file in tutorial_files:
    if not os.path.exists(file): os.symlink('{0}/tutorial/{1}'.format(top_srcdir, file), file)
for file in ['globins4.sto']:
    if not os.path.exists(file): shutil.copyfile('{0}/tutorial/{1}'.format(top_srcdir, file), file)



# get a symlink or copy of uniprot_swissprot and its relnotes.txt in cwd too.
#  Default: I keep a copy on my development machine in my ~/data/.
#  Else:    Attempt to download a copy from UniProt to cwd.
swissprot_dir = os.path.expanduser('~/data/Uniprot')
if os.path.exists(swissprot_dir):
    if not os.path.exists('uniprot_sprot.fasta'): os.symlink('{0}/uniprot_sprot.fasta'.format(swissprot_dir), 'uniprot_sprot.fasta')
    if not os.path.exists('relnotes.txt'):        os.symlink('{0}/relnotes.txt'.format(swissprot_dir),        'relnotes.txt')
else:
    subprocess.run('wget ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz', shell=True)
    subprocess.run('wget ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/relnotes.txt',                                  shell=True)
    subprocess.run('gunzip -f uniprot_sprot.fasta.gz',  shell=True)



# uniprot_release  = '2018_02' or some such, from relnotes.txt
# uniprot_nseq     = '556,825' or some such (as string, with commas)
#
def uniprot_relnotes(deffp):
    print('parsing uniprot release info...')

    with open('relnotes.txt', 'r') as f:
        content = f.read()
        m = re.search(r'UniProt Release (\S+)', content)
        if not m: sys.exit('failed to identify UniProt release code')
        uniprot_release = m.group(1)

        m = re.search(r'UniProtKB/Swiss-Prot:\s*(\S+) entries', content)
        if not m: sys.exit('failed to identify UniProt/Swiss-Prot nseq')
        uniprot_nseq     = m.group(1)

    print(r'\newcommand{{\UNIrelease}}{{{0}}}'.format(re.sub('_', r'\_', uniprot_release)), file=deffp)
    print(r'\newcommand{{\UNInseq}}{{{0}}}'.format(uniprot_nseq),                           file=deffp)
    print('', file=deffp)



# intro uses hmmscan -h header
# this also gets version and date, as \HMMERversion, \HMMERdate
#
def hmmscan_noargs(deffp):
    print('running hmmscan (with no args)...')
    r = subprocess.run('hmmscan -h',
                       shell=True, stdout=subprocess.PIPE, encoding='utf-8',  # don't check=True; hmmscan -h returns 1
                       env={"PATH": "{0}/src".format(top_builddir)})
    
    m = re.match(r'((?:#.*\n)+)', r.stdout)
    with open('hmmscan-noargs.out', 'w') as f:
        print(m.group(1), file=f, end='')

    m = re.search('# HMMER\s+(\S+)\s+\((.+?)\);', r.stdout)
    print(r'\newcommand{{\HMMERversion}}{{{0}}}'.format(m.group(1)), file=deffp)
    print(r'\newcommand{{\HMMERdate}}{{{0}}}'.format(m.group(2)),     file=deffp)
    print('', file=deffp)

          


# 0. hmmbuild  (with no arguments)
#    Don't set check=True, because hmmbuild w/ no args returns 1 exit code.
#      hmmbuild-noargs.out : stdout, complete
#
def hmmbuild_noargs():
    print('running hmmbuild (with no args)...')
    r = subprocess.run('hmmbuild > hmmbuild-noargs.out',
                       shell=True, stdout=subprocess.PIPE, encoding='utf-8',
                       env={"PATH": "{0}/src".format(top_builddir)})

    


# 1. hmmbuild globins4.hmm globins4.sto
#
def hmmbuild_globins(deffp):
    print('running hmmbuild globins4.hmm globins4.sto...')
    r = subprocess.run('hmmbuild globins4.hmm globins4.sto',
                       shell=True, stdout=subprocess.PIPE, check=True, encoding='utf-8',
                       env={"PATH": "{0}/src".format(top_builddir)})

    with open("hmmbuild-globins.out", "w") as f:
        print(r.stdout, file=f, end='')

    m = re.search(r'^\d+\s+\S+\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)', r.stdout, flags=re.MULTILINE)
    if not m: sys.exit("bad pattern match to hmmbuild output")
    print(r"\newcommand{{\BGLnseq}}{{{0}}}".format(m.group(1)),                      file=deffp)  # LaTeX macro names only allow letters
    print(r"\newcommand{{\BGLalen}}{{{0}}}".format(m.group(2)),                      file=deffp)  # {{, }} are escapes for {,}                           
    print(r"\newcommand{{\BGLmlen}}{{{0}}}".format(m.group(3)),                      file=deffp)
    print(r"\newcommand{{\BGLgaps}}{{{0}}}".format(int(m.group(2))-int(m.group(3))), file=deffp)
    print(r"\newcommand{{\BGLeffn}}{{{0}}}".format(m.group(4)),                      file=deffp)
    print(r"\newcommand{{\BGLre}}{{{0}}}".format(m.group(5)),                        file=deffp)
    print('', file=deffp)
    
    with open('globins4.hmm', 'r') as f:
        content = f.read()
        # elide both horizontally and vertically. Among the regexp trickery here:
        #   \n matches newline if pattern is not a raw string; but then other regexp elems have to be \\S, etc
        #   (?:) is a non-capturing group, useful when you need the group for a {} repetition operator
        content = re.sub('(\nHMM\\s+A\\s+C.+\n(?:.+\n){7})(?s:.+)\n((?:.+\n){3}//)',   '\\1...\n\\2', content)                       # Cuts every line between 1st and last state.
        content = re.sub(r'(HMM(?:\s+\S\s{3}){7})(?:\s+\S\s{3}){11}(.+)',              r'\1 ... \2',  content)                       # Cuts columns in the HMM line
        content = re.sub(r'^(\s+COMPO(?:\s+\S+){7})(?:\s+\S+){11}(.+)$',               r'\1 ... \2',  content, flags=re.MULTILINE)   # Cuts columns in the COMPO line
        content = re.sub(r'^(\s+\d+(?:\s+\S+){7})(?:\s+\S+){11}(.+)$',                 r'\1 ... \2',  content, flags=re.MULTILINE)   # Cuts columns in the match lines
        content = re.sub(r'^((?:\s+[\d\.*]+){7})(?:\s+\S+){11}((?:\s+[\d\.*]+){2})$',  r'\1 ... \2',  content, flags=re.MULTILINE)   # Cuts columns in the insert, transition lines
    with open('hmmbuild-globins.out2', 'w') as f:
        print(content, file=f, end='')

    # the formats chapter uses this profile too
    with open('globins4.hmm', 'r') as f:
        content = f.read()
        m = re.match(r'HMMER3/(\S)\s+(\[.+\])', content)
        print(r'\newcommand{{\HMMERfmtversion}}{{{0}}}'.format(m.group(1)), file=deffp)
        print(r'\newcommand{{\HMMERsavestamp}}{{{0}}}'.format(m.group(2)),  file=deffp)
        print('', file=deffp)


# 2. hmmsearch globins4.hmm uniprot_sprot.fasta
#
def hmmsearch_globins(deffp):
    print('running hmmsearch globins4.hmm uniprot_sprot.fasta...')
    r = subprocess.run('hmmsearch globins4.hmm uniprot_sprot.fasta',
                       shell=True, stdout=subprocess.PIPE, check=True, encoding='utf-8',
                       env={"PATH": "{0}/src".format(top_builddir)})

    m = re.match('(.+Scores for complete sequences.+?)\n', r.stdout, flags=re.DOTALL)
    if not m: sys.exit('bad first pattern match to hmmsearch output')
    with open("hmmsearch-globins.out", "w") as f:
        print(m.group(1), file=f)

    m = re.search('\n(\\s+--- full sequence ---(.+\n){9})', r.stdout)    # matches 9 lines of output starting with --- full sequence --- line
    if not m: sys.exit('bad top hits pattern match to hmmsearch output')
    with open("hmmsearch-globins.out2", "w") as f:
        print(m.group(1), file=f, end='')                                # end='' because pattern match included a final \n

    m = re.search(r'\n(Internal pipeline statistics.+)', r.stdout, flags=re.DOTALL)
    if not m: sys.exit('bad xpipeline stats pattern match to hmmsearch output')
    with open("hmmsearch-globins.out3", "w") as f:
        print(m.group(1), file=f, end='')

    #                     e-value        score        bias        e-val  sc      bias     exp     N       seqname
    m = re.search(r'^\s*([0-9\.e-]+)\s+([0-9\.]+)\s+([0-9\.]+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+sp\|\S+\|(\S+)', r.stdout, flags=re.MULTILINE)
    if not m: sys.exit('bad third pattern match to hmmsearch output')
    print(r'\newcommand{{\SGUevalue}}{{{0}}}'.format(m.group(1)),                                   file=deffp)
    print(r'\newcommand{{\SGUbitscore}}{{{0}}}'.format(m.group(2)),                                 file=deffp)
    print(r'\newcommand{{\SGUbias}}{{{0}}}'.format(m.group(3)),                                     file=deffp)
    print(r'\newcommand{{\SGUorigscore}}{{{0:.1f}}}'.format(float(m.group(2)) + float(m.group(3))), file=deffp)
    print(r'\newcommand{{\SGUdombitscore}}{{{0}}}'.format(m.group(5)),                              file=deffp)
    print(r'\newcommand{{\SGUseqname}}{{{0}}}'.format(re.sub('_', r'\_', m.group(9))),              file=deffp)  # protect the \ in the swissprot name
    # part of the text assumes that the "full sequence bit score" and "best 1 domain bit score"
    # for the top globin hit aren't identical. They're already .1f strings and can be compared directly.
    assert m.group(2) != m.group(5), "guide assumes that full vs. best 1 domain bit scores for top globin differ slightly"

    m = re.search(r'Passed MSV filter:\s+\d+\s+\((\S+?)\)', r.stdout)
    print(r'\newcommand{{\SGUmsvpass}}{{{0:.1f}}}'.format(100 * round(float(m.group(1)), 3)), file=deffp)    

    m = re.search(r'Passed bias filter:\s+(\d+)', r.stdout)
    print(r'\newcommand{{\SGUbiaspass}}{{{0}}}'.format(m.group(1)), file=deffp)

    m = re.search(r'Passed Vit filter:\s+(\d+)', r.stdout)
    print(r'\newcommand{{\SGUvitpass}}{{{0}}}'.format(m.group(1)), file=deffp)

    m = re.search(r'Passed Fwd filter:\s+(\d+)', r.stdout)
    print(r'\newcommand{{\SGUfwdpass}}{{{0}}}'.format(m.group(1)), file=deffp)

    m = re.search(r'Elapsed: \d+:\d+:(\S+)', r.stdout)
    print(r'\newcommand{{\SGUelapsed}}{{{0:.1f}}}'.format(float(m.group(1))), file=deffp)
    print('', file=deffp)

    


# 3. hmmsearch fn3.hmm 7LESS_DROME
#      hmmsearch-fn3-sevenless.out  : first lines of top hits table
#      hmmsearch-fn3-sevenless.out2 : domain annotation section's starting line
#      hmmsearch-fn3-sevenless.out3 : domain annotation section, table
#
def hmmsearch_fn3_sevenless(deffp):
    print('running hmmbuild, then hmmsearch fn3.hmm 7LESS_DROME...')

    r = subprocess.run('hmmbuild fn3.hmm fn3.sto',
                       shell=True, stdout=subprocess.PIPE, check=True, encoding='utf-8',
                       env={"PATH": "{0}/src".format(top_builddir)})

    r = subprocess.run('hmmsearch fn3.hmm 7LESS_DROME',
                       shell=True, stdout=subprocess.PIPE, check=True, encoding='utf-8',
                       env={"PATH": "{0}/src".format(top_builddir)})

    m = re.search('\n(\\s+--- full sequence ---(?:.+\n){4})', r.stdout)           # .out: per-seq hit table (one hit) from 7LESS_DROME
    if not m: sys.exit('bad first pattern match to fn3 hmmsearch output')
    with open('hmmsearch-fn3-sevenless.out', 'w') as f:
        print(m.group(1), file=f, end='')

    # append parsed variables from per-seq hit table line to .def
    #                     e-value        score        bias        e-val  sc      bias     exp     N     seqname
    m = re.search(r'^\s*([0-9\.e-]+)\s+([0-9\.]+)\s+([0-9\.]+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\S+)', r.stdout, flags=re.MULTILINE)
    if not m: sys.exit('bad second pattern match to fn3 hmmsearch output')
    print(r'\newcommand{{\SFSevalue}}{{{0}}}'.format(m.group(1)),      file=deffp)
    print(r'\newcommand{{\SFSbitscore}}{{{0}}}'.format(m.group(2)),    file=deffp)
    print(r'\newcommand{{\SFSdomevalue}}{{{0}}}'.format(m.group(4)),   file=deffp)
    print(r'\newcommand{{\SFSdombitscore}}{{{0}}}'.format(m.group(5)), file=deffp)
    print(r'\newcommand{{\SFSexpdom}}{{{0}}}'.format(m.group(7)),      file=deffp)
    print(r'\newcommand{{\SFSndom}}{{{0}}}'.format(m.group(8)),        file=deffp)
    print('', file=deffp)

    m = re.search(r'^(Domain annotation.+:\s*)$', r.stdout, flags=re.MULTILINE)   # .out2: "Domain annotation" header line
    if not m: sys.exit('bad domain section line pattern match to fn3 hmmsearch output')
    with open('hmmsearch-fn3-sevenless.out2', 'w') as f:
        print(m.group(1), file=f, end='')

    m = re.search('\n(>> 7LESS_DROME.+\n.+\n.+\n((?:\\s*\\d+.+\n)+))', r.stdout)  # .out3: per-domain hit table (9 domains) for 7LESS_DROME
    if not m: sys.exit('bad domain table pattern match to fn3 hmmsearch output')
    with open('hmmsearch-fn3-sevenless.out3', 'w') as f:
        print(m.group(1), file=f, end='')
    tbl = m.group(2)   # <tbl> = multiline string containing the 7LESS_DROME domain hits table

    m = re.search(r'\n(  ==\s+domain\s+2.+? PP\s*)\n', r.stdout, flags=re.DOTALL)   # .out4: alignment of domain 2
    if not m: sys.exit('bad alignment pattern match to fn3 hmmsearch output')
    with open('hmmsearch-fn3-sevenless.out4', 'w') as f:
        print(m.group(1), file=f, end='')

    # parse and store info on (9) fn3 domains identified in 7LESS_DROME
    domtbl1 = []                                             
    for line in tbl.splitlines():
        fields = line.split()
        dom    = { 'idx'      : int(fields[0]),
                   'sigchar'  : fields[1],
                   'bitscore' : float(fields[2]),
                   'c-evalue' : float(fields[4]),
                   'i-evalue' : float(fields[5]),
                   'hmmfrom'  : int(fields[6]),
                   'hmmto'    : int(fields[7]),
                   'envfrom'  : int(fields[12]),
                   'envto'    : int(fields[13]) }
        domtbl1.append(dom)

    return domtbl1



# 4. hmmsearch fn3.hmm uniprot_sprot
#
def hmmsearch_fn3_uniprot(deffp, domtbl1):
    print('running hmmsearch fn3.hmm uniprot_sprot.fasta...')

    r = subprocess.run('hmmsearch fn3.hmm uniprot_sprot.fasta',
                       shell=True, stdout=subprocess.PIPE, check=True, encoding='utf-8',
                       env={"PATH": "{0}/src".format(top_builddir)})

    # uniprot name is sp|P13368|7LESS_DROME
    m = re.search('\n(>> \S+\|7LESS_DROME.+\n.+\n.+\n((?:\\s*\\d+.+\n)+))', r.stdout)   # .out: per-domain hit table (9 domains) for 7LESS_DROME, Uniprot search version
    if not m: sys.exit('bad pattern match to fn3 hmmsearch output')
    with open('hmmsearch-fn3-uniprot.out', 'w') as f:
        print(m.group(1), file=f, end='')
    tbl = m.group(2)  # <tbl> = multiline string containing the domain hits table

    # parse and store info on (7) fn3 domains identified in 7LESS_DROME in full UniProt search
    # domtbl1 is from just searching 7LESS_DROME alone, which gives more domains, because
    # some aren't statistically significant in the larger search.
    domtbl2 = []                                             
    for line in tbl.splitlines():
        fields = line.split()
        dom    = { 'idx'      : int(fields[0]),
                   'sigchar'  : fields[1],
                   'bitscore' : float(fields[2]),
                   'c-evalue' : float(fields[4]),
                   'i-evalue' : float(fields[5]),
                   'hmmfrom'  : int(fields[6]),
                   'hmmto'    : int(fields[7]),
                   'envfrom'  : int(fields[12]),
                   'envto'    : int(fields[13]) }
        domtbl2.append(dom)

    # parse out <domZ> from the trailer
    # we need it to construct numbers used to explain the conditional E-value calculation
    m = re.search(r'^Domain search space\s+\(domZ\):\s+(\d+)\s+\[number of targets reported over threshold\]', r.stdout, flags=re.MULTILINE)
    if not m: sys.exit('bad second pattern match to fn3 hmmsearch output')
    fn3_domZ = int(m.group(1))

    # capture information about best-scoring domain, from domtbl1 and domtbl2
    bestdom1 = sorted(domtbl1, key=lambda k: k['bitscore'], reverse=True)[0]
    bestdom2 = sorted(domtbl2, key=lambda k: k['bitscore'], reverse=True)[0]

    # construct list of domains in 7LESS_DROME single seq search but not in full Uniprot search
    lostdoms  = []
    insigdoms = []
    i         = 0
    for d in range(len(domtbl2)):
        while (domtbl1[i]['envfrom'] != domtbl2[d]['envfrom']):
            lostdoms.append(domtbl1[i])
            i += 1
        if (domtbl1[i]['sigchar'] == '!' and domtbl2[d]['sigchar'] == '?'):
            insigdoms.append(domtbl1[i])
        i += 1
    assert len(lostdoms)  == 2, "userguide assumes two domains are lost from single 7LESS_DROME seq vs. UniProt search"
    assert len(insigdoms) == 2, "userguide assumes two domains changed from ! to ? in single seq vs. UniProt search"


    # There's a section that says that domains 1,4,5 and maybe 6 are strong;
    # 2 is weak; 3,7 are insignificant, in the Uniprot version of the search.
    # Just check that this is true.
    assert len(domtbl2) == 7
    assert domtbl2[0]['i-evalue'] < 0.01
    assert domtbl2[3]['i-evalue'] < 0.01
    assert domtbl2[4]['i-evalue'] < 0.01
    assert domtbl2[5]['i-evalue'] < 0.2
    assert domtbl2[1]['i-evalue'] > 0.1 and domtbl2[1]['c-evalue'] < 0.01
    assert domtbl2[6]['i-evalue'] > 0.1 and domtbl2[6]['c-evalue'] < 0.1
    assert domtbl2[2]['c-evalue'] > 0.1
    
    print(r'\newcommand{{\SFSmaxdom}}{{{0}}}'.format(bestdom1['idx']),                              file=deffp)
    print(r'\newcommand{{\SFSmaxdomu}}{{{0}}}'.format(bestdom2['idx']),                             file=deffp)
    print(r'\newcommand{{\SFSmaxsc}}{{{0:.1f}}}'.format(bestdom1['bitscore']),                      file=deffp)
    print(r'\newcommand{{\SFSievalue}}{{{0}}}'.format(bestdom1['i-evalue']),                        file=deffp)
    print(r'\newcommand{{\SFSuievalue}}{{{0:.1e}}}'.format(bestdom2['i-evalue']),                   file=deffp)  
    print(r'\newcommand{{\SFSdomZ}}{{{0}}}'.format(fn3_domZ),                                       file=deffp)  
    print(r'\newcommand{{\SFSucevalue}}{{{0:.1e}}}'.format(bestdom2['c-evalue']),                   file=deffp)
    print(r'\newcommand{{\SFSaidx}}{{{0}}}'.format(lostdoms[0]['idx']),                             file=deffp)
    print(r'\newcommand{{\SFSascore}}{{{0:.1f}}}'.format(lostdoms[0]['bitscore']),                  file=deffp)
    print(r'\newcommand{{\SFSaevalue}}{{{0:.2g}}}'.format(lostdoms[0]['i-evalue']),                 file=deffp)
    print(r'\newcommand{{\SFSauevalue}}{{{0:.0f}}}'.format(lostdoms[0]['i-evalue'] * fn3_domZ),     file=deffp)
    print(r'\newcommand{{\SFSacoords}}{{{0}-{1}}}'.format(lostdoms[0]['envfrom'], lostdoms[0]['envto']), file=deffp)
    print(r'\newcommand{{\SFSbidx}}{{{0}}}'.format(lostdoms[1]['idx']),                             file=deffp)
    print(r'\newcommand{{\SFSbscore}}{{{0:.1f}}}'.format(lostdoms[1]['bitscore']),                  file=deffp)
    print(r'\newcommand{{\SFSbevalue}}{{{0:.2g}}}'.format(lostdoms[1]['i-evalue']),                 file=deffp)
    print(r'\newcommand{{\SFSbuevalue}}{{{0:.1f}}}'.format(lostdoms[1]['i-evalue'] * fn3_domZ),     file=deffp)
    print(r'\newcommand{{\SFSbcoords}}{{{0}-{1}}}'.format(lostdoms[1]['envfrom'], lostdoms[1]['envto']), file=deffp)
    print(r'\newcommand{{\SFSainsig}}{{{0:.1f}}}'.format(insigdoms[0]['bitscore']),                 file=deffp)
    print(r'\newcommand{{\SFSbinsig}}{{{0:.1f}}}'.format(insigdoms[1]['bitscore']),                 file=deffp)
    print('', file=deffp)



def sevenless_table():
    print('parsing domain annotation in 7LESS_DROME...')
    with open('7LESS_DROME', 'r') as f:
        content = f.read()
        domtbl  = []
        for m in re.finditer(r'^(FT\s+DOMAIN\s+\d+\s+\d+\s+Fibronectin.+)\nFT\s+(.+)\n', content, flags=re.MULTILINE):
            domtbl.append('{0} {1}'.format(m.group(1), m.group(2)))

    assert len(domtbl) == 7, 'guide assumes that uniprot annotates seven domains in 7LESS_DROME'

    with open('sevenless_domains.out', 'w') as f:
        for domline in domtbl:
            print(domline, file=f)




def jackhmmer_hbb_uniprot(deffp):
    print('running jackhmmer HBB_HUMAN uniprot_sprot.fasta...')

    r = subprocess.run('jackhmmer HBB_HUMAN uniprot_sprot.fasta',
                       shell=True, stdout=subprocess.PIPE, check=True, encoding='utf-8',
                       env={"PATH": "{0}/src".format(top_builddir)})

    m = re.match(r'((?s:.+?)Scores for complete sequences(?:.*\n){10})', r.stdout)
    with open('jackhmmer-hbb-uniprot.out', 'w') as f:
         print(m.group(1), file=f, end='')
         print('...', file=f)

    m = re.search(r'\n((?:.*\n){2}\s*------ inclusion threshold ------(?:.*\n){3})', r.stdout)
    with open('jackhmmer-hbb-uniprot.out2', 'w') as f:
        print(m.group(1), file=f, end='')

    m = re.search(r'\n((?:@@.*\n)+(?:.*\n)(?:@@.*\n)+)', r.stdout)
    with open('jackhmmer-hbb-uniprot.out3', 'w') as f:
        print(m.group(1), file=f, end='')

    m = re.search(r'@@ Included in MSA:\s+(\d+)\s+subsequences\s+\(query\s+\+\s+(\d+) subseqs from \d+ targets', r.stdout)
    print(r'\newcommand{{\JHUninc}}{{{0}}}'.format(m.group(1)), file=deffp)
    print(r'\newcommand{{\JHUnsig}}{{{0}}}'.format(m.group(1)), file=deffp)

    m = re.search(r'@@\s+Round:\s+2\s*\n(?s:.+?)(Scores for complete sequences(?:.*\n){8})', r.stdout)
    with open('jackhmmer-hbb-uniprot.out4', 'w') as f:
        print(m.group(1), file=f, end='')
        print('...', file=f)

    m = re.search(r"""
                      @@\s+Round:\s+2.*       # anchor on round 2 part of the output
                      (?s: .+?)               # accept anything until...
                      \n
                      (                       # start what we're grabbing...
                      (?: .+\n) {2}           #   match exactly two preceding lines
                      \+.+\n                  #   first + line
                      (?: [^\+].+\n ) {1,4}?  #   up to 1-4 intervening lines
                      \+.+\n                  #   a second + line
                      (?: [^\+].+\n ) {1,7}?  #   up to 1-7 intervening lines
                      \+.+\n                  #   a third + line
                      .+\n                    #   and one trailing line
                      )                       # end of our grab
                   """, r.stdout, flags=re.VERBOSE)
    with open('jackhmmer-hbb-uniprot.out5', 'w') as f:
        print('...', file=f)
        print(m.group(1), file=f, end='')
        print('...', file=f)

    m = re.search(r"""
                      @@\s+Round:\s+2         # anchor on round 2 part of the output
                      (?s:.+?)                # accept anything until...
                      \n
                      (                       # start grab:
                      (?:@@.*\n)+             #   a block of @@ info
                      (?:.*\n)                #   blank line(s)
                      (?:@@.*\n)+             #   a second block of @@ info
                      )                       # end grab.
                   """, r.stdout, flags=re.VERBOSE)
    with open('jackhmmer-hbb-uniprot.out6', 'w') as f:
        print(m.group(1), file=f, end='')

    m = re.search(r"""
                    @@\s+Round:\s+4
                    (?s:.+?)
                    (
                      @@\s+New\s+targets\s+included:
                      (?s:.+?)
                      //\n
                      \[ok\]
                    )
                    $
                    """, r.stdout, flags=re.VERBOSE)
    with open('jackhmmer-hbb-uniprot.out7', 'w') as f:
        print(m.group(1), file=f, end='')

    print('', file=deffp)



def hmmscan_sevenless():
    print('building minifam and running hmmscan on 7LESS_DROME...')

    subprocess.run('hmmbuild globins4.hmm globins4.sto',             shell=True, stdout=subprocess.DEVNULL, check=True, env={"PATH": "{0}/src".format(top_builddir)})
    subprocess.run('hmmbuild fn3.hmm fn3.sto',                       shell=True, stdout=subprocess.DEVNULL, check=True, env={"PATH": "{0}/src".format(top_builddir)})
    subprocess.run('hmmbuild Pkinase.hmm Pkinase.sto',               shell=True, stdout=subprocess.DEVNULL, check=True, env={"PATH": "{0}/src".format(top_builddir)})
    subprocess.run('cat globins4.hmm fn3.hmm Pkinase.hmm > minifam', shell=True, stdout=subprocess.DEVNULL, check=True)

    r = subprocess.run('hmmpress -f minifam',
                       shell=True, stdout=subprocess.PIPE, check=True, encoding='utf-8',
                       env={"PATH": "{0}/src".format(top_builddir)})
    with open('hmmpress-minifam.out', 'w') as f:
        print(r.stdout, file=f, end='')
    
    r = subprocess.run('hmmscan minifam 7LESS_DROME',
                       shell=True, stdout=subprocess.PIPE, check=True, encoding='utf-8',
                       env={"PATH": "{0}/src".format(top_builddir)})

    m = re.match(r'((?s:.+?)Scores for complete sequence.*\n(?:.*\n){5})', r.stdout)
    with open('hmmscan-minifam-sevenless.out', 'w') as f:
        print(m.group(1), file=f, end='')

    m = re.search(r'(Domain annotation.+\n>> fn3.+\n.+\n.+\n(?:\s*\d+.+\n)+)', r.stdout)
    with open('hmmscan-minifam-sevenless.out2', 'w') as f:
        print(m.group(1), file=f, end='')
    
    m = re.search(r'\n(  ==\s+domain\s+2(?s:.+?) PP\s*)\n', r.stdout)
    with open('hmmscan-minifam-sevenless.out3', 'w') as f:
        print(m.group(1), file=f, end='')




def hmmstat_minifam():
    print('running hmmstat minifam...')

    r = subprocess.run('hmmstat minifam',  
                       shell=True, stdout=subprocess.PIPE, check=True, encoding='utf-8',
                       env={"PATH": "{0}/src".format(top_builddir)})
    with open('hmmstat-minifam.out', 'w') as f:
        print(r.stdout, file=f, end='')
   




def hmmalign_globins():
    print('running hmmalign globins4.hmm globins45.fa...')

    r = subprocess.run('hmmalign globins4.hmm globins45.fa',
                       shell=True, stdout=subprocess.PIPE, check=True, encoding='utf-8',
                       env={"PATH": "{0}/src".format(top_builddir)})

    m = re.match(r'(# STOCKHOLM(?:.*\n){13})', r.stdout)
    content = re.sub('^(.{80}).*$', r'\1 ...', m.group(1), flags=re.MULTILINE)
    with open('hmmalign-globins.out', 'w') as f:
        print(content, file=f, end='')
        print('...',   file=f)

    m = re.search(r'\n((?:.+\n){4}#=GC PP_cons.+\n#=GC RF.+\n)', r.stdout)
    content = re.sub('^(.{80}).*$', r'\1 ...', m.group(1), flags=re.MULTILINE)
    with open('hmmalign-globins.out2', 'w') as f:
        print('...',   file=f)
        print(content, file=f, end='')


    # Text uses MYG_HORSE's unaligned leading G, PP=8 as an example.
    assert re.search(r'^MYG_HORSE\s+g',             r.stdout, flags=re.MULTILINE)
    assert re.search(r'^#=GR\s+MYG_HORSE\s+PP\s+8', r.stdout, flags=re.MULTILINE)



def hmmbuild_made1():
    print('running hmmbuild MADE1.hmm MADE1.sto')

    r = subprocess.run('hmmbuild MADE1.hmm MADE1.sto',
                       shell=True, stdout=subprocess.PIPE, check=True, encoding='utf-8',
                       env={"PATH": "{0}/src".format(top_builddir)})

    content = re.sub(r'(MADE1 \(MAriner Derived Element 1\), a TcMar-Mariner).+\n', r'\1 ...\n', r.stdout)
    with open('hmmbuild-made1.out', 'w') as f:
        print(content, file=f, end='')



def nhmmer_made1(deffp):
    print('running nhmmer MADE1.hmm dna_target.fa')

    r = subprocess.run('nhmmer MADE1.hmm dna_target.fa',
                       shell=True, stdout=subprocess.PIPE, check=True, encoding='utf-8',
                       env={"PATH": "{0}/src".format(top_builddir)})

    m = re.search(r'\n(\s*E-value\s*score.+\n(?:.+\n)+\s*-+\s*inclusion threshold.+\n.+\n)', r.stdout)
    with open('nhmmer-made1.out', 'w') as f:
        print(m.group(1), file=f, end='')
    
    m = re.search(r'humanchr1.+\s+(3\d+)\s+(3\d+)(?s:.+?)humanchr1.+\s+(3\d+)\s+(3\d+)', r.stdout)
    print(r'\newcommand{{\NMHafrom}}{{{0}}}'.format(m.group(1)), file=deffp)
    print(r'\newcommand{{\NMHato}}{{{0}}}'.format(m.group(2)), file=deffp)
    print(r'\newcommand{{\NMHbfrom}}{{{0}}}'.format(m.group(3)), file=deffp)
    print(r'\newcommand{{\NMHbto}}{{{0}}}'.format(m.group(4)), file=deffp)

    assert int(m.group(1)) < int(m.group(3))   # "one on the forward strand"

    m = re.search(r'^(Annotation.+)\n>>', r.stdout, flags=re.MULTILINE)
    with open('nhmmer-made1.out2', 'w') as f:
        print(m.group(1), file=f)

    m = re.search(r'\n(>> humanchr1(?:.+\n){4})', r.stdout)
    with open('nhmmer-made1.out3', 'w') as f:
        print(m.group(1), file=f)
    
    m = re.search(r'\n\n(\s+Alignment.+\n\s*score.+\n(?:.*\n)+?)\n>>', r.stdout)
    with open('nhmmer-made1.out4', 'w') as f:
        print(m.group(1), file=f, end='')
        
    m = re.search(r'\n(Internal pipeline statistics.+)', r.stdout, flags=re.DOTALL)
    with open("nhmmer-made1.out5", "w") as f:
        print(m.group(1), file=f, end='')

    m = re.search(r'^Target sequences:\s+\d+\s+\((\d+)\s+residues', r.stdout, flags=re.MULTILINE)
    print(r'\newcommand{{\NMHnres}}{{{0}}}'.format(m.group(1)),         file=deffp)
    print(r'\newcommand{{\NMHntop}}{{{0}}}'.format(int(m.group(1))//2), file=deffp)

    m = re.search(r'^Residues passing SSV filter:\s+(\d+)\s+\((\S+?)\)', r.stdout, flags=re.MULTILINE)
    print(r'\newcommand{{\NMHnssv}}{{{0}}}'.format(m.group(1)), file=deffp)
    print(r'\newcommand{{\NMHfracssv}}{{{0:.1f}}}'.format(float(m.group(2))*100), file=deffp)

    m = re.search(r'^Residues passing bias filter:\s+(\d+)\s+\((\S+?)\)', r.stdout, flags=re.MULTILINE)
    print(r'\newcommand{{\NMHnbias}}{{{0}}}'.format(m.group(1)), file=deffp)
    print(r'\newcommand{{\NMHfracbias}}{{{0:.1f}}}'.format(float(m.group(2))*100), file=deffp)

    m = re.search(r'^Residues passing Vit filter:\s+(\d+)\s+\((\S+?)\)', r.stdout, flags=re.MULTILINE)
    print(r'\newcommand{{\NMHnvit}}{{{0}}}'.format(m.group(1)), file=deffp)
    print(r'\newcommand{{\NMHfracvit}}{{{0:.1f}}}'.format(float(m.group(2))*100), file=deffp)

    m = re.search(r'^Residues passing Fwd filter:\s+(\d+)\s+\((\S+?)\)', r.stdout, flags=re.MULTILINE)
    print(r'\newcommand{{\NMHnfwd}}{{{0}}}'.format(m.group(1)), file=deffp)



with open("inclusions.def", "w") as deffp:
    uniprot_relnotes(deffp)
    hmmscan_noargs(deffp)
    hmmbuild_noargs()
    hmmbuild_globins(deffp)
    hmmsearch_globins(deffp)    
    domtbl1 = hmmsearch_fn3_sevenless(deffp)
    hmmsearch_fn3_uniprot(deffp, domtbl1)
    sevenless_table()
    jackhmmer_hbb_uniprot(deffp)
    hmmscan_sevenless()
    hmmstat_minifam()
    hmmalign_globins()
    hmmbuild_made1()
    nhmmer_made1(deffp)

if not args.noclean:
    for fname in [ 'relnotes.txt', 'uniprot_sprot.fasta',
                   'fn3.hmm', 'globins4.hmm', 'Pkinase.hmm', 'MADE1.hmm',
                   'minifam', 'minifam.h3f',  'minifam.h3i', 'minifam.h3m', 'minifam.h3p' ]:
        if os.path.exists(fname): os.remove(fname)
    for fname in tutorial_files:
        if os.path.exists(fname): os.remove(fname)