#!/usr/bin/env python3
# gen-inclusions.py : generate .tex inclusions for user guide
#
# usage: gen-inclusions.py
# or: gen-inclusions.py <top_builddir> <top_srcdir>
#
# Runs the tutorial examples in the user guide and extracts output
# snippets for inclusion in .tex files. This saves having to manually
# update the user guide to be consistent with small changes to outputs
# (including version numbers and such).
#
# Each example is run in the current working directory, with input
# and output files in the cwd, some of them symlinked from
# `<top_srcdir>/tutorial`.
#
# Intended to be run infrequently by developers (not users) to update
# the user guide source files for a new release. The resulting .tex
# snippets are checked into git. The script can therefore assume it is
# running in a git working tree on a development machine, not in
# distributed code in the field.
#
# The script looks for HMMER3 executables in `<top_builddir>/src`, and
# tutorial files in `<top_srcdir>/tutorial`. Both path prefixes default
# to `../../..`, because the script is normally run from
# `documentation/userguide/inclusions`.
#
# If provided, <top_builddir> and <top_srcdir> can be either relative
# or absolute paths.
#
# Options:
# -t, --tutupdate : update tutorial files from Pfam, UniProt
# --noclean : don't clean up the files it generates
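#
# Example invocations (paths are illustrative only):
#   gen-inclusions.py                                # from documentation/userguide/inclusions,
#                                                    # build and source tops both at ../../..
#   gen-inclusions.py ~/hmmer3/build-osx ../../..    # out-of-source build tree given explicitly
#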
import sys
import os
import shutil
import subprocess
import re
import argparse
# Parse command line options and arguments
#
parser = argparse.ArgumentParser(description='generate .tex inclusions for user guide')
parser.add_argument('-t', '--tutupdate', action='store_true', help='update tutorial files from Pfam, UniProt')
parser.add_argument('--noclean', action='store_true', help="don't clean up generated files")
parser.add_argument('top_builddir', nargs='?', default='../../..')
parser.add_argument('top_srcdir', nargs='?', default='../../..')
args = parser.parse_args()
top_builddir = os.path.abspath(args.top_builddir) # absolute to top of build tree e.g. '/home/seddy/hmmer3/build-osx'
top_srcdir = args.top_srcdir # relative to top of source, e.g. '../../..'
# Check that <top_builddir>/src seems to contain compiled programs.
#
if (not os.path.exists('{0}/src/hmmscan'.format(top_builddir))) or (not os.path.exists('{0}/src/hmmsearch'.format(top_builddir))):
    sys.exit('no compiled programs found in {0}/src; run ./configure and make first'.format(top_builddir))
print('gen-inclusions.py : a fiddly script to generate .tex inclusions for user guide')
print('cross your fingers...')
# optionally, update tutorial files that come from Pfam, UniProt.
# don't update MADE1.sto from Dfam. It's a derivative, a 100-seq subset of the 2000-seq Dfam alignment.
if args.tutupdate:
subprocess.run('wget -O fn3.sto https://pfam.xfam.org/family/fn3/alignment/seed', shell=True, cwd='{0}/tutorial'.format(top_srcdir))
subprocess.run('wget -O Pkinase.sto https://pfam.xfam.org/family/Pkinase/alignment/seed', shell=True, cwd='{0}/tutorial'.format(top_srcdir))
subprocess.run('wget -O 7LESS_DROME http://www.uniprot.org/uniprot/P13368.txt', shell=True, cwd='{0}/tutorial'.format(top_srcdir))
# make symlinks to tutorial files in cwd (i.e. in inclusions/ in the source directory)
# exception: globins4.sto is included verbatim in userguide, so copy it.
#
tutorial_files = [ '7LESS_DROME', 'HBB_HUMAN', 'MADE1.sto', 'Pkinase.sto', 'dna_target.fa', 'fn3.sto', 'globins45.fa' ]
for file in tutorial_files:
if not os.path.exists(file): os.symlink('{0}/tutorial/{1}'.format(top_srcdir, file), file)
for file in ['globins4.sto']:
if not os.path.exists(file): shutil.copyfile('{0}/tutorial/{1}'.format(top_srcdir, file), file)
# get a symlink or copy of uniprot_sprot.fasta and its relnotes.txt in cwd too.
# Default: I keep a copy on my development machine in ~/data/Uniprot.
# Else: attempt to download a copy from UniProt to cwd.
swissprot_dir = os.path.expanduser('~/data/Uniprot')
if os.path.exists(swissprot_dir):
if not os.path.exists('uniprot_sprot.fasta'): os.symlink('{0}/uniprot_sprot.fasta'.format(swissprot_dir), 'uniprot_sprot.fasta')
if not os.path.exists('relnotes.txt'): os.symlink('{0}/relnotes.txt'.format(swissprot_dir), 'relnotes.txt')
else:
subprocess.run('wget ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz', shell=True)
subprocess.run('wget ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/relnotes.txt', shell=True)
subprocess.run('gunzip -f uniprot_sprot.fasta.gz', shell=True)
# uniprot_release = '2018_02' or some such, from relnotes.txt
# uniprot_nseq = '556,825' or some such (as string, with commas)
#
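# The relnotes.txt lines that these regexps look for resemble the following
# (values are illustrative, from an old release):
#   UniProt Release 2018_02
#   UniProtKB/Swiss-Prot:  556,825 entries
#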
def uniprot_relnotes(deffp):
print('parsing uniprot release info...')
with open('relnotes.txt', 'r') as f:
content = f.read()
m = re.search(r'UniProt Release (\S+)', content)
if not m: sys.exit('failed to identify UniProt release code')
uniprot_release = m.group(1)
m = re.search(r'UniProtKB/Swiss-Prot:\s*(\S+) entries', content)
if not m: sys.exit('failed to identify UniProt/Swiss-Prot nseq')
uniprot_nseq = m.group(1)
print(r'\newcommand{{\UNIrelease}}{{{0}}}'.format(re.sub('_', r'\_', uniprot_release)), file=deffp)
print(r'\newcommand{{\UNInseq}}{{{0}}}'.format(uniprot_nseq), file=deffp)
print('', file=deffp)
# intro uses hmmscan -h header
# this also gets version and date, as \HMMERversion, \HMMERdate
#
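# The version and date come from the banner line of the -h output, which looks
# something like "# HMMER 3.3 (Nov 2019); http://hmmer.org/".
#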
def hmmscan_noargs(deffp):
print('running hmmscan (with no args)...')
r = subprocess.run('hmmscan -h',
shell=True, stdout=subprocess.PIPE, encoding='utf-8', # don't check=True; hmmscan -h returns 1
env={"PATH": "{0}/src".format(top_builddir)})
m = re.match(r'((?:#.*\n)+)', r.stdout)
with open('hmmscan-noargs.out', 'w') as f:
print(m.group(1), file=f, end='')
    m = re.search(r'# HMMER\s+(\S+)\s+\((.+?)\);', r.stdout)
print(r'\newcommand{{\HMMERversion}}{{{0}}}'.format(m.group(1)), file=deffp)
print(r'\newcommand{{\HMMERdate}}{{{0}}}'.format(m.group(2)), file=deffp)
print('', file=deffp)
# 0. hmmbuild (with no arguments)
# Don't set check=True, because hmmbuild w/ no args returns 1 exit code.
# hmmbuild-noargs.out : stdout, complete
#
def hmmbuild_noargs():
print('running hmmbuild (with no args)...')
r = subprocess.run('hmmbuild > hmmbuild-noargs.out',
shell=True, stdout=subprocess.PIPE, encoding='utf-8',
env={"PATH": "{0}/src".format(top_builddir)})
# 1. hmmbuild globins4.hmm globins4.sto
#
def hmmbuild_globins(deffp):
print('running hmmbuild globins4.hmm globins4.sto...')
r = subprocess.run('hmmbuild globins4.hmm globins4.sto',
shell=True, stdout=subprocess.PIPE, check=True, encoding='utf-8',
env={"PATH": "{0}/src".format(top_builddir)})
with open("hmmbuild-globins.out", "w") as f:
print(r.stdout, file=f, end='')
m = re.search(r'^\d+\s+\S+\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)', r.stdout, flags=re.MULTILINE)
if not m: sys.exit("bad pattern match to hmmbuild output")
print(r"\newcommand{{\BGLnseq}}{{{0}}}".format(m.group(1)), file=deffp) # LaTeX macro names only allow letters
print(r"\newcommand{{\BGLalen}}{{{0}}}".format(m.group(2)), file=deffp) # {{, }} are escapes for {,}
print(r"\newcommand{{\BGLmlen}}{{{0}}}".format(m.group(3)), file=deffp)
print(r"\newcommand{{\BGLgaps}}{{{0}}}".format(int(m.group(2))-int(m.group(3))), file=deffp)
print(r"\newcommand{{\BGLeffn}}{{{0}}}".format(m.group(4)), file=deffp)
print(r"\newcommand{{\BGLre}}{{{0}}}".format(m.group(5)), file=deffp)
print('', file=deffp)
with open('globins4.hmm', 'r') as f:
content = f.read()
# elide both horizontally and vertically. Among the regexp trickery here:
# \n matches newline if pattern is not a raw string; but then other regexp elems have to be \\S, etc
# (?:) is a non-capturing group, useful when you need the group for a {} repetition operator
content = re.sub('(\nHMM\\s+A\\s+C.+\n(?:.+\n){7})(?s:.+)\n((?:.+\n){3}//)', '\\1...\n\\2', content) # Cuts every line between 1st and last state.
content = re.sub(r'(HMM(?:\s+\S\s{3}){7})(?:\s+\S\s{3}){11}(.+)', r'\1 ... \2', content) # Cuts columns in the HMM line
content = re.sub(r'^(\s+COMPO(?:\s+\S+){7})(?:\s+\S+){11}(.+)$', r'\1 ... \2', content, flags=re.MULTILINE) # Cuts columns in the COMPO line
content = re.sub(r'^(\s+\d+(?:\s+\S+){7})(?:\s+\S+){11}(.+)$', r'\1 ... \2', content, flags=re.MULTILINE) # Cuts columns in the match lines
content = re.sub(r'^((?:\s+[\d\.*]+){7})(?:\s+\S+){11}((?:\s+[\d\.*]+){2})$', r'\1 ... \2', content, flags=re.MULTILINE) # Cuts columns in the insert, transition lines
with open('hmmbuild-globins.out2', 'w') as f:
print(content, file=f, end='')
# the formats chapter uses this profile too
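    # the first line of the save file looks something like "HMMER3/f  [3.3 | Nov 2019]";
    # grab the format version letter and the bracketed save stamp from it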
with open('globins4.hmm', 'r') as f:
content = f.read()
m = re.match(r'HMMER3/(\S)\s+(\[.+\])', content)
print(r'\newcommand{{\HMMERfmtversion}}{{{0}}}'.format(m.group(1)), file=deffp)
print(r'\newcommand{{\HMMERsavestamp}}{{{0}}}'.format(m.group(2)), file=deffp)
print('', file=deffp)
# 2. hmmsearch globins4.hmm uniprot_sprot.fasta
#
def hmmsearch_globins(deffp):
print('running hmmsearch globins4.hmm uniprot_sprot.fasta...')
r = subprocess.run('hmmsearch globins4.hmm uniprot_sprot.fasta',
shell=True, stdout=subprocess.PIPE, check=True, encoding='utf-8',
env={"PATH": "{0}/src".format(top_builddir)})
m = re.match('(.+Scores for complete sequences.+?)\n', r.stdout, flags=re.DOTALL)
if not m: sys.exit('bad first pattern match to hmmsearch output')
with open("hmmsearch-globins.out", "w") as f:
print(m.group(1), file=f)
m = re.search('\n(\\s+--- full sequence ---(.+\n){9})', r.stdout) # matches 9 lines of output starting with --- full sequence --- line
if not m: sys.exit('bad top hits pattern match to hmmsearch output')
with open("hmmsearch-globins.out2", "w") as f:
print(m.group(1), file=f, end='') # end='' because pattern match included a final \n
m = re.search(r'\n(Internal pipeline statistics.+)', r.stdout, flags=re.DOTALL)
    if not m: sys.exit('bad pipeline stats pattern match to hmmsearch output')
with open("hmmsearch-globins.out3", "w") as f:
print(m.group(1), file=f, end='')
# e-value score bias e-val sc bias exp N seqname
m = re.search(r'^\s*([0-9\.e-]+)\s+([0-9\.]+)\s+([0-9\.]+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+sp\|\S+\|(\S+)', r.stdout, flags=re.MULTILINE)
if not m: sys.exit('bad third pattern match to hmmsearch output')
print(r'\newcommand{{\SGUevalue}}{{{0}}}'.format(m.group(1)), file=deffp)
print(r'\newcommand{{\SGUbitscore}}{{{0}}}'.format(m.group(2)), file=deffp)
print(r'\newcommand{{\SGUbias}}{{{0}}}'.format(m.group(3)), file=deffp)
print(r'\newcommand{{\SGUorigscore}}{{{0:.1f}}}'.format(float(m.group(2)) + float(m.group(3))), file=deffp)
print(r'\newcommand{{\SGUdombitscore}}{{{0}}}'.format(m.group(5)), file=deffp)
    print(r'\newcommand{{\SGUseqname}}{{{0}}}'.format(re.sub('_', r'\_', m.group(9))), file=deffp) # escape the _ in the Swiss-Prot name for LaTeX
# part of the text assumes that the "full sequence bit score" and "best 1 domain bit score"
# for the top globin hit aren't identical. They're already .1f strings and can be compared directly.
assert m.group(2) != m.group(5), "guide assumes that full vs. best 1 domain bit scores for top globin differ slightly"
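    # the parenthesized value after "Passed MSV filter:" is the fraction of target
    # sequences that passed; convert it to a percentage, rounded to one decimal place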
m = re.search(r'Passed MSV filter:\s+\d+\s+\((\S+?)\)', r.stdout)
print(r'\newcommand{{\SGUmsvpass}}{{{0:.1f}}}'.format(100 * round(float(m.group(1)), 3)), file=deffp)
m = re.search(r'Passed bias filter:\s+(\d+)', r.stdout)
print(r'\newcommand{{\SGUbiaspass}}{{{0}}}'.format(m.group(1)), file=deffp)
m = re.search(r'Passed Vit filter:\s+(\d+)', r.stdout)
print(r'\newcommand{{\SGUvitpass}}{{{0}}}'.format(m.group(1)), file=deffp)
m = re.search(r'Passed Fwd filter:\s+(\d+)', r.stdout)
print(r'\newcommand{{\SGUfwdpass}}{{{0}}}'.format(m.group(1)), file=deffp)
m = re.search(r'Elapsed: \d+:\d+:(\S+)', r.stdout)
print(r'\newcommand{{\SGUelapsed}}{{{0:.1f}}}'.format(float(m.group(1))), file=deffp)
print('', file=deffp)
# 3. hmmsearch fn3.hmm 7LESS_DROME
#    hmmsearch-fn3-sevenless.out  : first lines of top hits table
#    hmmsearch-fn3-sevenless.out2 : domain annotation section's starting line
#    hmmsearch-fn3-sevenless.out3 : domain annotation section, per-domain table
#    hmmsearch-fn3-sevenless.out4 : alignment of domain 2
#
def hmmsearch_fn3_sevenless(deffp):
print('running hmmbuild, then hmmsearch fn3.hmm 7LESS_DROME...')
r = subprocess.run('hmmbuild fn3.hmm fn3.sto',
shell=True, stdout=subprocess.PIPE, check=True, encoding='utf-8',
env={"PATH": "{0}/src".format(top_builddir)})
r = subprocess.run('hmmsearch fn3.hmm 7LESS_DROME',
shell=True, stdout=subprocess.PIPE, check=True, encoding='utf-8',
env={"PATH": "{0}/src".format(top_builddir)})
m = re.search('\n(\\s+--- full sequence ---(?:.+\n){4})', r.stdout) # .out: per-seq hit table (one hit) from 7LESS_DROME
if not m: sys.exit('bad first pattern match to fn3 hmmsearch output')
with open('hmmsearch-fn3-sevenless.out', 'w') as f:
print(m.group(1), file=f, end='')
# append parsed variables from per-seq hit table line to .def
# e-value score bias e-val sc bias exp N seqname
m = re.search(r'^\s*([0-9\.e-]+)\s+([0-9\.]+)\s+([0-9\.]+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\S+)', r.stdout, flags=re.MULTILINE)
if not m: sys.exit('bad second pattern match to fn3 hmmsearch output')
print(r'\newcommand{{\SFSevalue}}{{{0}}}'.format(m.group(1)), file=deffp)
print(r'\newcommand{{\SFSbitscore}}{{{0}}}'.format(m.group(2)), file=deffp)
print(r'\newcommand{{\SFSdomevalue}}{{{0}}}'.format(m.group(4)), file=deffp)
print(r'\newcommand{{\SFSdombitscore}}{{{0}}}'.format(m.group(5)), file=deffp)
print(r'\newcommand{{\SFSexpdom}}{{{0}}}'.format(m.group(7)), file=deffp)
print(r'\newcommand{{\SFSndom}}{{{0}}}'.format(m.group(8)), file=deffp)
print('', file=deffp)
m = re.search(r'^(Domain annotation.+:\s*)$', r.stdout, flags=re.MULTILINE) # .out2: "Domain annotation" header line
if not m: sys.exit('bad domain section line pattern match to fn3 hmmsearch output')
with open('hmmsearch-fn3-sevenless.out2', 'w') as f:
print(m.group(1), file=f, end='')
m = re.search('\n(>> 7LESS_DROME.+\n.+\n.+\n((?:\\s*\\d+.+\n)+))', r.stdout) # .out3: per-domain hit table (9 domains) for 7LESS_DROME
if not m: sys.exit('bad domain table pattern match to fn3 hmmsearch output')
with open('hmmsearch-fn3-sevenless.out3', 'w') as f:
print(m.group(1), file=f, end='')
tbl = m.group(2) # <tbl> = multiline string containing the 7LESS_DROME domain hits table
m = re.search(r'\n( ==\s+domain\s+2.+? PP\s*)\n', r.stdout, flags=re.DOTALL) # .out4: alignment of domain 2
if not m: sys.exit('bad alignment pattern match to fn3 hmmsearch output')
with open('hmmsearch-fn3-sevenless.out4', 'w') as f:
print(m.group(1), file=f, end='')
# parse and store info on (9) fn3 domains identified in 7LESS_DROME
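    # each row of the table splits into whitespace-separated fields roughly like:
    #   idx sigchar score bias c-Evalue i-Evalue hmmfrom hmmto .. alifrom alito .. envfrom envto .. acc
    # hence the hard-coded field indices below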
domtbl1 = []
for line in tbl.splitlines():
fields = line.split()
dom = { 'idx' : int(fields[0]),
'sigchar' : fields[1],
'bitscore' : float(fields[2]),
'c-evalue' : float(fields[4]),
'i-evalue' : float(fields[5]),
'hmmfrom' : int(fields[6]),
'hmmto' : int(fields[7]),
'envfrom' : int(fields[12]),
'envto' : int(fields[13]) }
domtbl1.append(dom)
return domtbl1
# 4. hmmsearch fn3.hmm uniprot_sprot
#
def hmmsearch_fn3_uniprot(deffp, domtbl1):
print('running hmmsearch fn3.hmm uniprot_sprot.fasta...')
r = subprocess.run('hmmsearch fn3.hmm uniprot_sprot.fasta',
shell=True, stdout=subprocess.PIPE, check=True, encoding='utf-8',
env={"PATH": "{0}/src".format(top_builddir)})
# uniprot name is sp|P13368|7LESS_DROME
    m = re.search('\n(>> \\S+\\|7LESS_DROME.+\n.+\n.+\n((?:\\s*\\d+.+\n)+))', r.stdout) # .out: per-domain hit table (7 domains) for 7LESS_DROME, UniProt search version
if not m: sys.exit('bad pattern match to fn3 hmmsearch output')
with open('hmmsearch-fn3-uniprot.out', 'w') as f:
print(m.group(1), file=f, end='')
tbl = m.group(2) # <tbl> = multiline string containing the domain hits table
# parse and store info on (7) fn3 domains identified in 7LESS_DROME in full UniProt search
# domtbl1 is from just searching 7LESS_DROME alone, which gives more domains, because
# some aren't statistically significant in the larger search.
domtbl2 = []
for line in tbl.splitlines():
fields = line.split()
dom = { 'idx' : int(fields[0]),
'sigchar' : fields[1],
'bitscore' : float(fields[2]),
'c-evalue' : float(fields[4]),
'i-evalue' : float(fields[5]),
'hmmfrom' : int(fields[6]),
'hmmto' : int(fields[7]),
'envfrom' : int(fields[12]),
'envto' : int(fields[13]) }
domtbl2.append(dom)
# parse out <domZ> from the trailer
# we need it to construct numbers used to explain the conditional E-value calculation
m = re.search(r'^Domain search space\s+\(domZ\):\s+(\d+)\s+\[number of targets reported over threshold\]', r.stdout, flags=re.MULTILINE)
if not m: sys.exit('bad second pattern match to fn3 hmmsearch output')
fn3_domZ = int(m.group(1))
# capture information about best-scoring domain, from domtbl1 and domtbl2
bestdom1 = sorted(domtbl1, key=lambda k: k['bitscore'], reverse=True)[0]
bestdom2 = sorted(domtbl2, key=lambda k: k['bitscore'], reverse=True)[0]
# construct list of domains in 7LESS_DROME single seq search but not in full Uniprot search
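    # walk domtbl1 (single-seq search) and domtbl2 (UniProt search) in parallel, matching
    # domains on their envelope start coordinate:
    #   - domains present only in domtbl1 go to <lostdoms> (lost entirely in the UniProt search)
    #   - domains in both, but demoted from '!' to '?' in the UniProt search, go to <insigdoms>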
lostdoms = []
insigdoms = []
i = 0
for d in range(len(domtbl2)):
while (domtbl1[i]['envfrom'] != domtbl2[d]['envfrom']):
lostdoms.append(domtbl1[i])
i += 1
if (domtbl1[i]['sigchar'] == '!' and domtbl2[d]['sigchar'] == '?'):
insigdoms.append(domtbl1[i])
i += 1
assert len(lostdoms) == 2, "userguide assumes two domains are lost from single 7LESS_DROME seq vs. UniProt search"
assert len(insigdoms) == 2, "userguide assumes two domains changed from ! to ? in single seq vs. UniProt search"
# There's a section that says that domains 1,4,5 and maybe 6 are strong;
# 2 is weak; 3,7 are insignificant, in the Uniprot version of the search.
# Just check that this is true.
assert len(domtbl2) == 7
assert domtbl2[0]['i-evalue'] < 0.01
assert domtbl2[3]['i-evalue'] < 0.01
assert domtbl2[4]['i-evalue'] < 0.01
assert domtbl2[5]['i-evalue'] < 0.2
assert domtbl2[1]['i-evalue'] > 0.1 and domtbl2[1]['c-evalue'] < 0.01
assert domtbl2[6]['i-evalue'] > 0.1 and domtbl2[6]['c-evalue'] < 0.1
assert domtbl2[2]['c-evalue'] > 0.1
print(r'\newcommand{{\SFSmaxdom}}{{{0}}}'.format(bestdom1['idx']), file=deffp)
print(r'\newcommand{{\SFSmaxdomu}}{{{0}}}'.format(bestdom2['idx']), file=deffp)
print(r'\newcommand{{\SFSmaxsc}}{{{0:.1f}}}'.format(bestdom1['bitscore']), file=deffp)
print(r'\newcommand{{\SFSievalue}}{{{0}}}'.format(bestdom1['i-evalue']), file=deffp)
print(r'\newcommand{{\SFSuievalue}}{{{0:.1e}}}'.format(bestdom2['i-evalue']), file=deffp)
print(r'\newcommand{{\SFSdomZ}}{{{0}}}'.format(fn3_domZ), file=deffp)
print(r'\newcommand{{\SFSucevalue}}{{{0:.1e}}}'.format(bestdom2['c-evalue']), file=deffp)
print(r'\newcommand{{\SFSaidx}}{{{0}}}'.format(lostdoms[0]['idx']), file=deffp)
print(r'\newcommand{{\SFSascore}}{{{0:.1f}}}'.format(lostdoms[0]['bitscore']), file=deffp)
print(r'\newcommand{{\SFSaevalue}}{{{0:.2g}}}'.format(lostdoms[0]['i-evalue']), file=deffp)
print(r'\newcommand{{\SFSauevalue}}{{{0:.0f}}}'.format(lostdoms[0]['i-evalue'] * fn3_domZ), file=deffp)
print(r'\newcommand{{\SFSacoords}}{{{0}-{1}}}'.format(lostdoms[0]['envfrom'], lostdoms[0]['envto']), file=deffp)
print(r'\newcommand{{\SFSbidx}}{{{0}}}'.format(lostdoms[1]['idx']), file=deffp)
print(r'\newcommand{{\SFSbscore}}{{{0:.1f}}}'.format(lostdoms[1]['bitscore']), file=deffp)
print(r'\newcommand{{\SFSbevalue}}{{{0:.2g}}}'.format(lostdoms[1]['i-evalue']), file=deffp)
print(r'\newcommand{{\SFSbuevalue}}{{{0:.1f}}}'.format(lostdoms[1]['i-evalue'] * fn3_domZ), file=deffp)
print(r'\newcommand{{\SFSbcoords}}{{{0}-{1}}}'.format(lostdoms[1]['envfrom'], lostdoms[1]['envto']), file=deffp)
print(r'\newcommand{{\SFSainsig}}{{{0:.1f}}}'.format(insigdoms[0]['bitscore']), file=deffp)
print(r'\newcommand{{\SFSbinsig}}{{{0:.1f}}}'.format(insigdoms[1]['bitscore']), file=deffp)
print('', file=deffp)
def sevenless_table():
print('parsing domain annotation in 7LESS_DROME...')
with open('7LESS_DROME', 'r') as f:
content = f.read()
domtbl = []
for m in re.finditer(r'^(FT\s+DOMAIN\s+\d+\s+\d+\s+Fibronectin.+)\nFT\s+(.+)\n', content, flags=re.MULTILINE):
domtbl.append('{0} {1}'.format(m.group(1), m.group(2)))
assert len(domtbl) == 7, 'guide assumes that uniprot annotates seven domains in 7LESS_DROME'
with open('sevenless_domains.out', 'w') as f:
for domline in domtbl:
print(domline, file=f)
def jackhmmer_hbb_uniprot(deffp):
print('running jackhmmer HBB_HUMAN uniprot_sprot.fasta...')
r = subprocess.run('jackhmmer HBB_HUMAN uniprot_sprot.fasta',
shell=True, stdout=subprocess.PIPE, check=True, encoding='utf-8',
env={"PATH": "{0}/src".format(top_builddir)})
m = re.match(r'((?s:.+?)Scores for complete sequences(?:.*\n){10})', r.stdout)
with open('jackhmmer-hbb-uniprot.out', 'w') as f:
print(m.group(1), file=f, end='')
print('...', file=f)
m = re.search(r'\n((?:.*\n){2}\s*------ inclusion threshold ------(?:.*\n){3})', r.stdout)
with open('jackhmmer-hbb-uniprot.out2', 'w') as f:
print(m.group(1), file=f, end='')
m = re.search(r'\n((?:@@.*\n)+(?:.*\n)(?:@@.*\n)+)', r.stdout)
with open('jackhmmer-hbb-uniprot.out3', 'w') as f:
print(m.group(1), file=f, end='')
m = re.search(r'@@ Included in MSA:\s+(\d+)\s+subsequences\s+\(query\s+\+\s+(\d+) subseqs from \d+ targets', r.stdout)
print(r'\newcommand{{\JHUninc}}{{{0}}}'.format(m.group(1)), file=deffp)
print(r'\newcommand{{\JHUnsig}}{{{0}}}'.format(m.group(1)), file=deffp)
m = re.search(r'@@\s+Round:\s+2\s*\n(?s:.+?)(Scores for complete sequences(?:.*\n){8})', r.stdout)
with open('jackhmmer-hbb-uniprot.out4', 'w') as f:
print(m.group(1), file=f, end='')
print('...', file=f)
m = re.search(r"""
@@\s+Round:\s+2.* # anchor on round 2 part of the output
(?s: .+?) # accept anything until...
\n
( # start what we're grabbing...
(?: .+\n) {2} # match exactly two preceding lines
\+.+\n # first + line
(?: [^\+].+\n ) {1,4}? # up to 1-4 intervening lines
\+.+\n # a second + line
(?: [^\+].+\n ) {1,7}? # up to 1-7 intervening lines
\+.+\n # a third + line
.+\n # and one trailing line
) # end of our grab
""", r.stdout, flags=re.VERBOSE)
with open('jackhmmer-hbb-uniprot.out5', 'w') as f:
print('...', file=f)
print(m.group(1), file=f, end='')
print('...', file=f)
m = re.search(r"""
@@\s+Round:\s+2 # anchor on round 2 part of the output
(?s:.+?) # accept anything until...
\n
( # start grab:
(?:@@.*\n)+ # a block of @@ info
(?:.*\n) # blank line(s)
(?:@@.*\n)+ # a second block of @@ info
) # end grab.
""", r.stdout, flags=re.VERBOSE)
with open('jackhmmer-hbb-uniprot.out6', 'w') as f:
print(m.group(1), file=f, end='')
m = re.search(r"""
@@\s+Round:\s+4
(?s:.+?)
(
@@\s+New\s+targets\s+included:
(?s:.+?)
//\n
\[ok\]
)
$
""", r.stdout, flags=re.VERBOSE)
with open('jackhmmer-hbb-uniprot.out7', 'w') as f:
print(m.group(1), file=f, end='')
print('', file=deffp)
def hmmscan_sevenless():
print('building minifam and running hmmscan on 7LESS_DROME...')
subprocess.run('hmmbuild globins4.hmm globins4.sto', shell=True, stdout=subprocess.DEVNULL, check=True, env={"PATH": "{0}/src".format(top_builddir)})
subprocess.run('hmmbuild fn3.hmm fn3.sto', shell=True, stdout=subprocess.DEVNULL, check=True, env={"PATH": "{0}/src".format(top_builddir)})
subprocess.run('hmmbuild Pkinase.hmm Pkinase.sto', shell=True, stdout=subprocess.DEVNULL, check=True, env={"PATH": "{0}/src".format(top_builddir)})
subprocess.run('cat globins4.hmm fn3.hmm Pkinase.hmm > minifam', shell=True, stdout=subprocess.DEVNULL, check=True)
r = subprocess.run('hmmpress -f minifam',
shell=True, stdout=subprocess.PIPE, check=True, encoding='utf-8',
env={"PATH": "{0}/src".format(top_builddir)})
with open('hmmpress-minifam.out', 'w') as f:
print(r.stdout, file=f, end='')
r = subprocess.run('hmmscan minifam 7LESS_DROME',
shell=True, stdout=subprocess.PIPE, check=True, encoding='utf-8',
env={"PATH": "{0}/src".format(top_builddir)})
m = re.match(r'((?s:.+?)Scores for complete sequence.*\n(?:.*\n){5})', r.stdout)
with open('hmmscan-minifam-sevenless.out', 'w') as f:
print(m.group(1), file=f, end='')
m = re.search(r'(Domain annotation.+\n>> fn3.+\n.+\n.+\n(?:\s*\d+.+\n)+)', r.stdout)
with open('hmmscan-minifam-sevenless.out2', 'w') as f:
print(m.group(1), file=f, end='')
m = re.search(r'\n( ==\s+domain\s+2(?s:.+?) PP\s*)\n', r.stdout)
with open('hmmscan-minifam-sevenless.out3', 'w') as f:
print(m.group(1), file=f, end='')
def hmmstat_minifam():
print('running hmmstat minifam...')
r = subprocess.run('hmmstat minifam',
shell=True, stdout=subprocess.PIPE, check=True, encoding='utf-8',
env={"PATH": "{0}/src".format(top_builddir)})
with open('hmmstat-minifam.out', 'w') as f:
print(r.stdout, file=f, end='')
def hmmalign_globins():
print('running hmmalign globins4.hmm globins45.fa...')
r = subprocess.run('hmmalign globins4.hmm globins45.fa',
shell=True, stdout=subprocess.PIPE, check=True, encoding='utf-8',
env={"PATH": "{0}/src".format(top_builddir)})
m = re.match(r'(# STOCKHOLM(?:.*\n){13})', r.stdout)
content = re.sub('^(.{80}).*$', r'\1 ...', m.group(1), flags=re.MULTILINE)
with open('hmmalign-globins.out', 'w') as f:
print(content, file=f, end='')
print('...', file=f)
m = re.search(r'\n((?:.+\n){4}#=GC PP_cons.+\n#=GC RF.+\n)', r.stdout)
content = re.sub('^(.{80}).*$', r'\1 ...', m.group(1), flags=re.MULTILINE)
with open('hmmalign-globins.out2', 'w') as f:
print('...', file=f)
print(content, file=f, end='')
# Text uses MYG_HORSE's unaligned leading G, PP=8 as an example.
assert re.search(r'^MYG_HORSE\s+g', r.stdout, flags=re.MULTILINE)
assert re.search(r'^#=GR\s+MYG_HORSE\s+PP\s+8', r.stdout, flags=re.MULTILINE)
def hmmbuild_made1():
print('running hmmbuild MADE1.hmm MADE1.sto')
r = subprocess.run('hmmbuild MADE1.hmm MADE1.sto',
shell=True, stdout=subprocess.PIPE, check=True, encoding='utf-8',
env={"PATH": "{0}/src".format(top_builddir)})
content = re.sub(r'(MADE1 \(MAriner Derived Element 1\), a TcMar-Mariner).+\n', r'\1 ...\n', r.stdout)
with open('hmmbuild-made1.out', 'w') as f:
print(content, file=f, end='')
def nhmmer_made1(deffp):
print('running nhmmer MADE1.hmm dna_target.fa')
r = subprocess.run('nhmmer MADE1.hmm dna_target.fa',
shell=True, stdout=subprocess.PIPE, check=True, encoding='utf-8',
env={"PATH": "{0}/src".format(top_builddir)})
m = re.search(r'\n(\s*E-value\s*score.+\n(?:.+\n)+\s*-+\s*inclusion threshold.+\n.+\n)', r.stdout)
with open('nhmmer-made1.out', 'w') as f:
print(m.group(1), file=f, end='')
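    # grab the from/to coordinates of the two humanchr1 hits; the regexp relies on both
    # hits having coordinates that begin with '3' in dna_target.fa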
m = re.search(r'humanchr1.+\s+(3\d+)\s+(3\d+)(?s:.+?)humanchr1.+\s+(3\d+)\s+(3\d+)', r.stdout)
print(r'\newcommand{{\NMHafrom}}{{{0}}}'.format(m.group(1)), file=deffp)
print(r'\newcommand{{\NMHato}}{{{0}}}'.format(m.group(2)), file=deffp)
print(r'\newcommand{{\NMHbfrom}}{{{0}}}'.format(m.group(3)), file=deffp)
print(r'\newcommand{{\NMHbto}}{{{0}}}'.format(m.group(4)), file=deffp)
assert int(m.group(1)) < int(m.group(3)) # "one on the forward strand"
m = re.search(r'^(Annotation.+)\n>>', r.stdout, flags=re.MULTILINE)
with open('nhmmer-made1.out2', 'w') as f:
print(m.group(1), file=f)
m = re.search(r'\n(>> humanchr1(?:.+\n){4})', r.stdout)
with open('nhmmer-made1.out3', 'w') as f:
print(m.group(1), file=f)
m = re.search(r'\n\n(\s+Alignment.+\n\s*score.+\n(?:.*\n)+?)\n>>', r.stdout)
with open('nhmmer-made1.out4', 'w') as f:
print(m.group(1), file=f, end='')
m = re.search(r'\n(Internal pipeline statistics.+)', r.stdout, flags=re.DOTALL)
with open("nhmmer-made1.out5", "w") as f:
print(m.group(1), file=f, end='')
m = re.search(r'^Target sequences:\s+\d+\s+\((\d+)\s+residues', r.stdout, flags=re.MULTILINE)
print(r'\newcommand{{\NMHnres}}{{{0}}}'.format(m.group(1)), file=deffp)
print(r'\newcommand{{\NMHntop}}{{{0}}}'.format(int(m.group(1))//2), file=deffp)
m = re.search(r'^Residues passing SSV filter:\s+(\d+)\s+\((\S+?)\)', r.stdout, flags=re.MULTILINE)
print(r'\newcommand{{\NMHnssv}}{{{0}}}'.format(m.group(1)), file=deffp)
print(r'\newcommand{{\NMHfracssv}}{{{0:.1f}}}'.format(float(m.group(2))*100), file=deffp)
m = re.search(r'^Residues passing bias filter:\s+(\d+)\s+\((\S+?)\)', r.stdout, flags=re.MULTILINE)
print(r'\newcommand{{\NMHnbias}}{{{0}}}'.format(m.group(1)), file=deffp)
print(r'\newcommand{{\NMHfracbias}}{{{0:.1f}}}'.format(float(m.group(2))*100), file=deffp)
m = re.search(r'^Residues passing Vit filter:\s+(\d+)\s+\((\S+?)\)', r.stdout, flags=re.MULTILINE)
print(r'\newcommand{{\NMHnvit}}{{{0}}}'.format(m.group(1)), file=deffp)
print(r'\newcommand{{\NMHfracvit}}{{{0:.1f}}}'.format(float(m.group(2))*100), file=deffp)
m = re.search(r'^Residues passing Fwd filter:\s+(\d+)\s+\((\S+?)\)', r.stdout, flags=re.MULTILINE)
print(r'\newcommand{{\NMHnfwd}}{{{0}}}'.format(m.group(1)), file=deffp)
with open("inclusions.def", "w") as deffp:
uniprot_relnotes(deffp)
hmmscan_noargs(deffp)
hmmbuild_noargs()
hmmbuild_globins(deffp)
hmmsearch_globins(deffp)
domtbl1 = hmmsearch_fn3_sevenless(deffp)
hmmsearch_fn3_uniprot(deffp, domtbl1)
sevenless_table()
jackhmmer_hbb_uniprot(deffp)
hmmscan_sevenless()
hmmstat_minifam()
hmmalign_globins()
hmmbuild_made1()
nhmmer_made1(deffp)
if not args.noclean:
for fname in [ 'relnotes.txt', 'uniprot_sprot.fasta',
'fn3.hmm', 'globins4.hmm', 'Pkinase.hmm', 'MADE1.hmm',
'minifam', 'minifam.h3f', 'minifam.h3i', 'minifam.h3m', 'minifam.h3p' ]:
if os.path.exists(fname): os.remove(fname)
for fname in tutorial_files:
if os.path.exists(fname): os.remove(fname)