File: divide_data.py

package info (click to toggle)
lamarc 2.1.10.1%2Bdfsg-3
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 77,052 kB
  • sloc: cpp: 112,339; xml: 16,769; sh: 3,528; makefile: 1,219; python: 420; perl: 260; ansic: 40
file content (108 lines) | stat: -rw-r--r-- 3,522 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
####################################################################
# divide_data.py
#   used to do first break-apart of lamarc data into pieces for
#   "poor man's parallelization"
####################################################################
# system imports
import os.path
import sys
from xml.dom.minidom import parse, Document

# project-local imports
import parallelCommon


# input options: the lamarc input file plus two directories. lamdir is
# where this script writes its output; pydir is handed on to finalStep.
# NOTE(review): the meaning of the True flag is defined in
# parallelCommon.getOptionsAndVerify -- confirm there.
[lamarcfile,lamdir,pydir] = parallelCommon.getOptionsAndVerify(True)

parallelCommon.describeThisScript("divide_data.py","divide regions and replicates",lamarcfile,lamdir)

# parse the lamarc input into a DOM. the with-block guarantees the file
# handle is released even when the XML is malformed and parse() raises
with open(lamarcfile,'r') as inlamarc:
    lamDom = parse(inlamarc)

# find replicate count. the count stays at 1 unless a <replicates> tag
# is found under <lamarc>/<chains>.
lamarcTag = parallelCommon.getFirstTag(lamDom,"lamarc")
chainsTag = parallelCommon.getFirstTag(lamarcTag,"chains")
replicatesTag = parallelCommon.getFirstTag(chainsTag,"replicates")
originalReplicateCount = 1
if replicatesTag:
    originalReplicateCount = parallelCommon.getLongVal(replicatesTag)

# find region count (every <region> element below the <data> tag)
dataElem = parallelCommon.getSingleTag(lamarcTag,"data")
regionElems = dataElem.getElementsByTagName("region")
regionCount = len(regionElems)

# bail if there are no regions at all: no sub-files would be written,
# so there would be nothing for the follow-up steps to run
if regionCount == 0:
    print("ERROR: lamarc input file %s has no regions" % lamarcfile)
    sys.exit(2)

# bail if regions and replicates are both singletons.
# print is called with a single parenthesized argument so the output is
# the same under python 2 and the line still parses under python 3
if (regionCount == 1) and (originalReplicateCount == 1):
    print("ERROR: lamarc input file %s has only one region" % lamarcfile)
    print("       and only one replicate. It cannot be broken down any further")
    sys.exit(2)


# add comment to identify outfile as generated by this script; it is
# inserted as the first child of the <lamarc> tag
commentNode = lamDom.createComment("Created by divide_data.py")
lamarcTag.insertBefore(commentNode,lamarcTag.firstChild)

# write out full copy using writexml. we do this so we can compare
# the original input file content in the same formatting we're using
# to generate the sub-files.
# the with-block closes the copy even if writexml raises part-way
lamcopy = os.path.join(lamdir,"infile_copy.xml")
with open(lamcopy,'w') as outf:
    lamDom.writexml(outf)

# profiles are postponed until the very last run, so they are turned
# off in the DOM that all sub-files are written from
parallelCommon.turnProfilesOff(lamDom,lamarcTag)


# every sub-file runs a single replicate; the value is only rewritten
# when the input actually carried a replicates tag
if replicatesTag:
    parallelCommon.setVal(replicatesTag,"1")

# look up the format tag once; it is handed to fixFormatTag for each
# sub-file that gets written
formatTag = parallelCommon.getFirstTag(lamarcTag,"format")

# detach all regions from the data element -- they are re-attached
# one at a time while the per-region files are written
for detachedRegion in regionElems:
    dataElem.removeChild(detachedRegion)

# will store names of single region/single replicate lamarc files
infileList = []

# output one lamarc input file per (region, replicate) pair. only the
# region currently being written is attached to the data element.
for regCount, region in enumerate(regionElems):
    dataElem.appendChild(region)
    for repCount in range(originalReplicateCount):
        # the replicate suffix is only used when there are several replicates
        if originalReplicateCount == 1:
            idString = 'reg%d' % regCount
        else:
            idString = 'reg%d_rep%d' % (regCount,repCount)

        parallelCommon.fixFormatTag(lamDom,formatTag,idString,False)

        # each sub-run gets its own directory under lamdir
        runDir = os.path.join(lamdir,idString)
        if not os.path.exists(runDir):
            os.makedirs(runDir)
        regFile = os.path.join(runDir,'infile_%s.xml' % idString)
        # the with-block closes the sub-file even if writexml raises
        with open(regFile,'w') as outf:
            lamDom.writexml(outf)
        infileList.append(regFile)

    # remove the region tag so next file output doesn't include it
    dataElem.removeChild(region)

    # EWFIX -- unlink region to save memory ??
    

# output instructions: files to run, next program to run
parallelCommon.nextStep(lamdir,infileList,False)

# the follow-up script named to finalStep depends on whether the input
# had a single replicate or several
if originalReplicateCount == 1:
    followUpScript = "combine_regions.py"
else:
    followUpScript = "combine_replicates.py"
parallelCommon.finalStep(pydir,followUpScript,lamarcfile,lamdir)