File: fastq.py

package info (click to toggle)
python-cogent 1.5.3-2
  • links: PTS, VCS
  • area: main
  • in suites: jessie, jessie-kfreebsd
  • size: 16,424 kB
  • ctags: 24,343
  • sloc: python: 134,200; makefile: 100; ansic: 17; sh: 10
file content (45 lines) | stat: -rw-r--r-- 1,400 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
# Module-level authorship, licensing and release metadata.
__author__ = "Gavin Huttley, Anuj Pahwa"
__copyright__ = "Copyright 2007-2012, The Cogent Project"
__credits__ = ["Gavin Huttley", "Anuj Pahwa"]
__license__ = "GPL"
__version__ = "1.5.3"
__maintainer__ = "Gavin Huttley"
__email__ = "Gavin.Huttley@anu.edu.au"
__status__ = "Development"

def MinimalFastqParser(data, strict=True):
    """yields name, seq, qual from fastq file

    The input is read as consecutive blocks of 4 lines: sequence label,
    sequence, quality label, quality string. Every line is stripped of
    surrounding whitespace and the first character of the sequence label
    (normally '@') is dropped to form the name.

    Arguments:
        - data: either a file path (str), which is opened here and always
          closed again (on completion, on error, or when the consumer stops
          iterating early), or any iterable of lines such as an open file or
          a list of strings. A caller supplied iterable is never closed here.
        - strict: checks the quality and sequence labels are the same

    Yields:
        (name, seq, qual) tuples, one per 4 line block. A trailing block
        whose first line is empty (e.g. a blank line at end of file) is
        ignored.

    Raises:
        AssertionError: if strict is True and the two label lines of a
        record differ once their first character is removed, or if the
        data ends part way through a record.
    """
    # only close what was opened here; the caller owns anything else
    opened_here = isinstance(data, str)
    if opened_here:
        data = open(data)

    try:
        # fastq format is very simple, defined by blocks of 4 lines
        record = []
        for line in data:
            if len(record) == 4:
                # explicit raise rather than assert, so the check is not
                # removed when python runs with -O
                if strict and record[0][1:] != record[2][1:]:
                    # labels are compared without their first character, so
                    # a bare '+' quality label only matches an empty name
                    raise AssertionError('Invalid format: %s -- %s' %
                                         (record[0][1:], record[2][1:]))
                yield record[0][1:], record[1], record[3]
                record = []

            record.append(line.strip())

        # whatever is left over is the final record; an empty first line
        # means it is just blank line(s) at eof
        if record and record[0]:
            if len(record) != 4:
                raise AssertionError(
                    'Invalid format: incomplete record for %s' % record[0][1:])
            if strict and record[0][1:] != record[2][1:]:
                raise AssertionError('Invalid format')
            yield record[0][1:], record[1], record[3]
    finally:
        if opened_here:
            data.close()