File: readSeq.py

package info (click to toggle)
bbmap 39.01%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 21,760 kB
  • sloc: java: 267,418; sh: 15,163; python: 5,247; ansic: 2,074; perl: 96; xml: 38; makefile: 38
file content (157 lines) | stat: -rwxr-xr-x 5,019 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
#!/usr/bin/env python

import sys
import math
import random
import fileinput
import re
#import os
import copy

class readSeq:

    def __init__(self, fileName, paired = False, fileType = 'fastq', compression = 'auto', sampleRate = None):
        self.fileName = fileName
        self.fileType = fileType
        self.fileType = self.__detectFileFormat()
        self.compression = compression
        self.__readObj = self.readFile()
        self.paired = paired
        self.sampleRate = sampleRate
        
    def __iter__(self):
        return self

    def next(self):
        record = self.__readObj.next()
        self.header = record['header']
        self.seq = record['seq']
        self.seqNum = record['seqNum']
        if self.fileType == 'fastq':
            self.qual = record['qual']

        if self.paired is True:
            self2 = copy.copy(self)
            record = self.__readObj.next()
            self2.header = record['header']
            self2.seq = record['seq']
            self2.seqNum = record['seqNum']
            if self.fileType == 'fastq':
                self2.qual = record['qual']
            return self, self2
        else:
            return self

    def __detectFileFormat(self):
        if self.fileType != 'auto':
            return self.fileType
         
        if ( re.search('fasta$', self.fileName) or 
             re.search('fa$', self.fileName)    or
             re.search('fna$', self.fileName) ):
            self.fileType = 'fasta'

        elif ( re.search('fastq$', self.fileName) or
               re.search('fq$', self.fileName) ) :
            self.fileType = 'fastq'

        return self.fileType

    def readFile(self):

        record = dict()       
 
        # allow user to specify if compression should be auto detected using file names
        # specified by fileinput documentation
        if self.compression == 'auto':
            inFH = fileinput.FileInput(self.fileName, openhook=fileinput.hook_compressed)    
        else:
            inFH = fileinput.FileInput(self.fileName)
  
        #TODO RAISE exception if file doesn't exist
  
        # read fastq
        if self.fileType == 'fastq':
            self.seqNum = 0
            for record['header'] in inFH:

                record['seqNum'] = int(math.ceil(inFH.lineno()/8.0))
                record['seq'] = inFH.readline().strip()
                record['r1Plus'] = inFH.readline()
                record['qual'] = inFH.readline().strip()
                     
                if not re.search('^@', record['header'], flags=0):
                    pass
                    #TODO add exception for not being a fastq
                          
                record['header'] = re.sub('^@', '', record['header'].strip())
                yield record

        elif self.fileType == 'fasta':
            record['header'] = None
            record['seq'] = None
            record['seqNum'] = 0
            
            for line in inFH:
 
                line = line.strip()
                if re.search('^>', line):
                    line = re.sub('^>', '', line)
                    if record['header'] is None:
                        record['header'] = line
                        if not re.search('>', record['header']):
                            pass
                            # TODO add exception for not being fasta   
 
                    if record['seq'] is not None:
                        record['seqNum'] = record['seqNum'] + 1
                        
                        if self.sampleRate:
                        #subsampling of reads is desired
                            if random.random() < self.sampleRate:
                                yield record
                        else:
                            yield record
                        record['seq'] = None

                    record['header'] = line                
                else:
                    if record['seq'] is not None:
                        record['seq'] += line
                    else:
                        record['seq'] = line
                        
            record['seqNum'] = record['seqNum'] + 1
            
            if self.sampleRate:
                #subsampling of reads is desired
                if random.random() < self.sampleRate:
                    yield record
            else:
                yield record

        try:
            inFH = fileinput.close()
        except:
            pass

    #except:
        sys.exc_info()[0]

    def printRecord(self):
        if self.fileType == 'fastq':
            print("@%s" % self.header)
            print(self.seq)
            print("+") 
            print(self.qual)
        elif self.fileType == 'fasta':
            print(">%s" % self.header)
            print(self.seq)
    
if __name__ == '__main__':

    for inputfile in sys.argv[1:]:
        #get a pair of reads
        for r1, r2 in readSeq(inputfile, paired=True):
            print r1.header
            print r2.header