####################################################################
# divide_data.py
# Performs the first break-apart of lamarc data into pieces for
# "poor man's parallelization".
####################################################################
# system imports
import os.path
import sys
from xml.dom.minidom import parse, Document
# imports
import parallelCommon
# Input options: the lamarc input file, the directory the output files are
# written under, and pydir (only handed on to parallelCommon.finalStep here;
# presumably the directory of the companion scripts -- confirm in parallelCommon).
[lamarcfile, lamdir, pydir] = parallelCommon.getOptionsAndVerify(True)
parallelCommon.describeThisScript(
    "divide_data.py", "divide regions and replicates", lamarcfile, lamdir)

# Parse the lamarc input file into a DOM. The "with" block guarantees the
# file handle is closed even when the XML fails to parse.
with open(lamarcfile, 'r') as inlamarc:
    lamDom = parse(inlamarc)
# Find the replicate count. It is read from the "replicates" tag looked up
# under lamarc -> chains; when that lookup yields nothing truthy, a single
# replicate is assumed.
lamarcTag = parallelCommon.getFirstTag(lamDom, "lamarc")
chainsTag = parallelCommon.getFirstTag(lamarcTag, "chains")
replicatesTag = parallelCommon.getFirstTag(chainsTag, "replicates")
originalReplicateCount = 1
if replicatesTag:
    originalReplicateCount = parallelCommon.getLongVal(replicatesTag)
# Find the region count: every "region" element below the "data" tag.
dataElem = parallelCommon.getSingleTag(lamarcTag, "data")
regionElems = dataElem.getElementsByTagName("region")
regionCount = len(regionElems)

# Bail if regions and replicates are both singletons -- there is nothing
# left to split. print is called with one parenthesised argument so the
# output is unchanged under Python 2 and the line also parses under Python 3.
if regionCount == 1 and originalReplicateCount == 1:
    print("ERROR: lamarc input file %s has only one region" % lamarcfile)
    print(" and only one replicate. It cannot be broken down any further")
    sys.exit(2)
# Add a comment as the first child of the lamarc tag to identify the
# output files as generated by this script.
commentNode = lamDom.createComment("Created by divide_data.py")
lamarcTag.insertBefore(commentNode, lamarcTag.firstChild)

# Write out a full copy using writexml. We do this so we can compare
# the original input file content in the same formatting we're using
# to generate the sub-files. The "with" block closes the file even if
# writexml raises part-way through.
lamcopy = os.path.join(lamdir, "infile_copy.xml")
with open(lamcopy, 'w') as outf:
    lamDom.writexml(outf)
# don't do profiles until very last run
parallelCommon.turnProfilesOff(lamDom, lamarcTag)

# Change the replicate number to 1 if the tag exists; the original count
# was captured in originalReplicateCount before this point.
if replicatesTag:
    parallelCommon.setVal(replicatesTag, "1")

# find format tag
formatTag = parallelCommon.getFirstTag(lamarcTag, "format")
# Disconnect the regions from the DOM -- they are put back one at a time
# below so that each output file contains exactly one region.
for region in regionElems:
    dataElem.removeChild(region)

# will store names of single region/single replicate lamarc files
infileList = []

# Output one file per (region, replicate) pair, each in its own run directory.
for regCount, region in enumerate(regionElems):
    dataElem.appendChild(region)
    for repCount in range(originalReplicateCount):
        # The replicate suffix is only used when there are several replicates.
        if originalReplicateCount == 1:
            idString = 'reg%d' % regCount
        else:
            idString = 'reg%d_rep%d' % (regCount, repCount)
        parallelCommon.fixFormatTag(lamDom, formatTag, idString, False)
        runDir = os.path.join(lamdir, idString)
        if not os.path.exists(runDir):
            os.makedirs(runDir)
        regFile = os.path.join(runDir, 'infile_%s.xml' % idString)
        # "with" closes the file even if writexml raises.
        with open(regFile, 'w') as outf:
            lamDom.writexml(outf)
        infileList.append(regFile)
    # remove the region tag so next file output doesn't include it
    dataElem.removeChild(region)
    # EWFIX -- unlink region to save memory ??
# output instructions: files to run, next program to run
parallelCommon.nextStep(lamdir, infileList, False)

# With a single replicate only regions were split, so the follow-up script
# is combine_regions.py; otherwise it is combine_replicates.py.
if originalReplicateCount == 1:
    nextScript = "combine_regions.py"
else:
    nextScript = "combine_replicates.py"
parallelCommon.finalStep(pydir, nextScript, lamarcfile, lamdir)