1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82
|
#! /usr/bin/env python3
import sys
import os
import subprocess
usage = 'pmark-master.py <top_builddir> <top_srcdir> <resultdir> <ncpu> <benchmark_pfx> <pmark_script>'
if len(sys.argv) != 7: sys.exit('Incorrect number of cmdline args.\nUsage: {}'.format(usage))
(top_builddir, top_srcdir, resultdir, ncpu, benchmark_pfx, pmark_script) = sys.argv[1:]
tblfile = benchmark_pfx + '.tbl'
msafile = benchmark_pfx + '.train.msa'
fafile = benchmark_pfx + '.test.fa'
if os.path.exists(resultdir): sys.exit('results directory {} already exists'.format(resultdir))
if not os.path.isdir(top_builddir): sys.exit("didn't find top_builddir at {}".format(top_builddir))
if not os.path.isdir(top_srcdir): sys.exit("didn't find top_srcdir at {}".format(top_srcdir))
if not os.path.isfile(tblfile): sys.exit('pmark tbl file {} not found'.format(tblfile))
if not os.path.isfile(msafile): sys.exit('pmark training MSA file {} not found'.format(msafile))
if not os.path.isfile(msafile + '.ssi'): sys.exit("msafile {} needs to have an SSI index.\nRun esl-afetch --index on it to create one.".format(msafile))
if not os.path.isfile(fafile): sys.exit('pmark test sequence FASTA file {} not found'.format(fafile))
if not os.access(pmark_script, os.X_OK): sys.exit('driver script {} not found or not executable'.format(pmark_script))
os.mkdir(resultdir)
ncpu = int(ncpu)
# Read the master table, for MSAs with successful splits
#
msaname = []
alen = {}
n = 0
with open(tblfile) as tblfp:
for line in tblfp:
if line[0] == '#': continue
fields = line.split()
if fields[7] == 'ok':
msaname.append(fields[0])
alen[fields[0]] = int(fields[2])
n += 1
# Sort the list of MSA names by length of alignment (in columns),
# to help with load balancing; spread largest MSAs across the subtbls
#
msaname.sort(key=lambda s:alen[s], reverse=True)
# Split the list into <ncpu> subtables
#
subtbl =[ [] for c in range(ncpu) ]
for i in range(n):
subtbl[i % ncpu].append(msaname[i])
# Write the <ncpu> subtables
#
for c in range(ncpu):
with open('{0}/tbl.{1}'.format(resultdir, c), 'w') as f:
for s in subtbl[c]:
f.write(s + '\n')
# Write a SLURM job array script
#
with open('{0}/{0}.sh'.format(resultdir), 'w') as f:
cmd = '{0} {1} {2} {3} {3}/tbl.${{SLURM_ARRAY_TASK_ID}} {4} {5} {3}/tbl.${{SLURM_ARRAY_TASK_ID}}.out'.format(pmark_script, top_builddir, top_srcdir, resultdir, msafile, fafile)
f.write('#!/bin/bash\n')
f.write('#SBATCH -t 6-00:00\n') # 6 days
f.write('#SBATCH --mem 4000\n')
f.write('#SBATCH -p eddy\n')
f.write('#SBATCH -c 1\n') # 1 core
f.write('#SBATCH -N 1\n') # 1 node
f.write('#SBATCH -o {}/tbl.%a.slurm\n'.format(resultdir))
f.write(cmd + '\n')
# Submit the job array
#
cmd = 'sbatch --array=0-{0} {1}/{1}.sh'.format(ncpu-1, resultdir)
subprocess.run(cmd.split())
|