File: Setup.py

package info (click to toggle)
pbsuite 15.8.24%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 14,512 kB
  • ctags: 1,951
  • sloc: python: 10,962; sh: 147; xml: 21; makefile: 14
file content (168 lines) | stat: -rwxr-xr-x 6,637 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
#!/usr/bin/python
import sys, re, os, logging, subprocess
from optparse import OptionParser

from pbsuite.utils.setupLogging import *
from pbsuite.utils.FileHandlers import FastaFile, QualFile, Gap, wrap, qwrap
from pbsuite.utils.CommandRunner import exe

USAGE = """%prog <inputScaffolding.fasta> [outputContigs.fasta]

Take the input scaffolding and split it into contigs around all [Nn]+ 
If an <inputScaffolding.qual> exists beside the <inputScaffolding.fasta>,
we'll create and incorporate it into PBJelly. Otherwise, it is not necessary
"""

refParser = re.compile("(.*)\|(ref\d{7})/?(\d+)?$")

class Setup():
    def __init__(self):
        self.parseArgs()
        setupLogging(self.opts.debug)
    
    def parseArgs(self):
        parser = OptionParser(USAGE)
        
        parser.add_option("-g", "--gapOutput", dest="gapOutput", \
            help="Create the table for gapInformation", default=None)
        parser.add_option("-i", "--index", dest="index", action="store_true", default=False, \
            help="Create the .sa index for faster blasr alignments")
        #parser.add_option("--noRename", dest="rename", action="store_true", default=False, \
            #help="Flag to indicate there will be no renaming of scaffolding.")
        parser.add_option("--debug",action="store_true", help="Increases verbosity of logging" )
        parser.add_option("--minGap", dest="minGap", type="int", default=25, \
            help="Minimum number of consecutive Ns to be considered a gap. default=25")
        parser.add_option("--maxGap", dest="maxGap", type="string", default="", \
            help="Maximum number of consecutive Ns to be considered a gap default=Inf")
        
        self.opts, args = parser.parse_args()
        if len(args) < 1:
            parser.error("Error! Incorrect number of arguments")
        if len(args) == 1:
            self.scaffInput = args[0]
            if not self.scaffInput.endswith(".fasta"):
                parser.error("Reference must end in extension .fasta! Please rename it.")
            if not os.path.isfile(self.scaffInput):
                parser.error("Error! Scaffold File is not a file / does not exist")
        else:
            parser.error("Error! Incorrect number of arguments")

        qn = self.scaffInput[:self.scaffInput.rindex('.fasta')]
        qualInputName = qn+".qual"
        if not os.path.isfile(qualInputName):
            self.qualInput = None
        else:
            self.qualInput = qualInputName
   
    def run(self):
        #Fasta Ref Output
        scaffTempName = self.scaffInput+".tempFasta"
        scaffOutput = open(scaffTempName, 'w')
        
        #Qual Ref Output
        if self.qualInput is not None:
            qualTempName= self.qualInput+".tempQual"
            qualOutput = open(qualTempName, 'w')
        
        #Gaps Output
        if self.opts.gapOutput is not None:
            gapTableOut = open(self.opts.gapOutput,'w')
        else:
            gapTableOut = False
        
        logging.info("Creating reference sequence index names and identifying gaps")
        
        refTemplate = "ref%07d"
        refId = 1
        
        #Read References
        reference = FastaFile(self.scaffInput)
        if self.qualInput is not None:
            qualReference = QualFile(self.qualInput)    
        
        for key in reference:
            
            scaffIndex = refTemplate % refId
            scaffName = key.replace(' ','_')
            
            refId += 1
            
            scaffName = scaffName + "|" + scaffIndex
            scaffOutput.write(">"+scaffName+"\n"+wrap(reference[key])+"\n")
            
            if self.qualInput is not None:
                qualOutput.write(">"+scaffName+"\n"+qwrap(qualReference[key])+"\n")
            
            gapCoords = []
            for gap in re.finditer("[^Nn]([Nn]{%d,%s})[^Nn]" % \
                    (self.opts.minGap, self.opts.maxGap), reference[key]):
                gapCoords.append([gap.start() + 1, gap.end() - 1])
            
            if len(gapCoords) == 0:#no Gaps
                gapTableOut.write("\t".join([scaffName, 'na', 'na', scaffIndex+"_0_0", '3'])+'\n')
                logging.debug("Scaffold %s is empty" % scaffName)
                continue
            
            #Consolidate gaps that are too close -- indicating LQ regions.
            i = 0
            while i < len(gapCoords)-1:
                if gapCoords[i+1][0] - gapCoords[i][1] < 25:
                    gapCoords[i+1][0] = gapCoords[i][0]
                    del(gapCoords[i])
                else:
                    i += 1
            
            prevEnd = 0#Contig Start Tracking
            idx = 0
            #Make the first gap
            prevEnd = gapCoords[0][1]
            gapCoords[0][1]-gapCoords[0][0]
            
            flag = Gap.BEGIN
            if len(gapCoords) == 1:
                flag += Gap.END
            if gapTableOut:
                gapTableOut.write("%s\t%i\t%i\t%s_%i_%i\t%d\n" \
                        % (scaffName, gapCoords[0][0], gapCoords[0][1], scaffIndex, idx, idx+1, flag))

            #Now Go Through the rest of the gaps
            for i in range(1, len(gapCoords)):
                idx += 1
                prevEnd = gapCoords[i][1]
                gapCoords[i][1]-gapCoords[i][0]
            
                if gapTableOut:
                    if i == len(gapCoords)-1:
                        flag = Gap.END
                    else:
                        flag = 0
                    gapTableOut.write("%s\t%i\t%i\t%s_%i_%i\t%d\n" \
                        % (scaffName, gapCoords[i][0], gapCoords[i][1], scaffIndex, idx, idx+1, flag))
            
        #Close shop
        scaffOutput.close()
        os.rename(self.scaffInput, self.scaffInput+".original")
        os.rename(scaffTempName, self.scaffInput)
        
        if self.qualInput is not None:
            qualOutput.close()
            os.rename(self.qualInput, self.qualInput+".original")
            os.rename(qualTempName, self.qualInput)
        
        if gapTableOut:
            gapTableOut.close()
        
        if self.opts.index:
            logging.info("Creating .sa indexes for references")
            r, o, e = exe("sawriter %s.sa %s" % (self.scaffInput, self.scaffInput))
            if r != 0:
                logging.error("sawriter returned %d" % r)
                logging.error("Ensure it's in your path")
                exit(1)
            logging.debug(str(o) + ' ' + str(e))
        
        logging.info("Finished!")

if __name__ == '__main__':
    me = Setup()
    me.run()