File: tab.pyx

package info (click to toggle)
obitools 3.0.1~b26%2Bdfsg-5
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 26,788 kB
  • sloc: ansic: 24,299; python: 657; sh: 27; makefile: 20
file content (108 lines) | stat: -rwxr-xr-x 2,898 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
#cython: language_level=3

'''
Created on feb 20th 2018

@author: cmercier
'''

import types
from obitools3.utils cimport __etag__
from obitools3.utils cimport str2bytes

def tabIterator(lineiterator, 
                bint header = False,
                bytes sep = None,
                bytes dec = b".",          # TODO don't know how to use this to parse
                bint stripwhite=True,
                bint blanklineskip=True,
                bytes commentchar=b"#",
                bytes nastring=b"NA",
                int skip=0,
                only=None,
                firstline=None,
                int buffersize=100000000
                ):
    '''
    Generator parsing a character-separated table, one dict per data line.

    @param lineiterator: source of lines. Accepted values are a str/bytes
                         (handed to uopen, presumably a file name or URL),
                         a LineBuffer, an object with a "readlines" attribute
                         (wrapped into a LineBuffer) or any object
                         implementing __next__.
    @param header: if True, the first parsed line provides the column names;
                   otherwise columns are named b"0", b"1", ... after the
                   number of fields found on the first parsed line.
    @param sep: field separator given to bytes.split (None splits on runs
                of whitespace).
    @param dec: decimal separator, currently unused.
    @param stripwhite: if True, every field is stripped of surrounding
                       whitespace.
    @param blanklineskip: if True, lines containing only whitespace are
                          ignored.
    @param commentchar: lines starting with this character are ignored.
    @param nastring: string passed to __etag__ as its nastring argument.
    @param skip: number of lines to discard once the column names are known.
    @param only: maximum number of records to yield (None for no limit).
    @param firstline: a line already read from the source, parsed before
                      anything is pulled from lineiterator.
    @param buffersize: buffer size passed to LineBuffer when lineiterator
                       has to be wrapped.

    @return: yields one dict per data line, mapping each column name to the
             value returned by __etag__ for the matching field.

    @raise Exception: if lineiterator is of none of the supported kinds.
    @raise IndexError: if a data line holds more fields than known columns.
    '''

    cdef int        ionly, read, skipped
    cdef list       data
    cdef dict       view_line
    cdef list       keys
    cdef list       key_types

    keys = []
    key_types = []      # TODO never filled yet: column types are not read
    skipped = 0
    read = 0

    if only is None:
        ionly = -1
    else:
        ionly = int(only)

    if isinstance(lineiterator, (str, bytes)):
        lineiterator = uopen(lineiterator)
    if isinstance(lineiterator, LineBuffer):
        iterator = iter(lineiterator)
    elif hasattr(lineiterator, "readlines"):
        iterator = iter(LineBuffer(lineiterator, buffersize))
    elif hasattr(lineiterator, '__next__'):
        iterator = lineiterator
    else:
        raise Exception("Invalid line iterator")

    # Every next() below is guarded: a StopIteration escaping from a
    # generator body is turned into a RuntimeError (PEP 479), whereas an
    # exhausted input must simply end the iteration.
    if firstline is None:
        try:
            line = next(iterator)
        except StopIteration:
            return
    else:
        line = firstline

    while True:

        # Checked first so that no extra line is pulled from the source
        # once the requested number of records has been produced.
        if ionly >= 0 and read >= ionly:
            break

        # Skip *all* consecutive blank and comment lines, not only one.
        try:
            while (blanklineskip and not line.strip()) or line[:1] == commentchar:
                line = next(iterator)
        except StopIteration:
            return

        if not keys:
            if header:
                # TODO read types eventually
                keys = [x.strip() for x in line.split(sep)]
                try:
                    line = next(iterator)
                except StopIteration:
                    return
                continue
            else:
                # TODO ??? default column names? like R?
                keys = [str2bytes(str(i)) for i in range(len(line.split(sep)))]

        if skipped < skip:
            try:
                while skipped < skip:
                    line = next(iterator)
                    skipped += 1
            except StopIteration:
                return
            # The line reached after skipping may itself be blank or a
            # comment: go through the filters above again.
            continue

        # Parse
        data = line.split(sep)

        if stripwhite or key_types:
            data = [x.strip() for x in data]

        view_line = {}
        for i, value in enumerate(data):
            if key_types:  # TODO handle None when key types are actually read
                view_line[keys[i]] = key_types[i](value)
            else:
                view_line[keys[i]] = __etag__(value, nastring=nastring)

        yield view_line

        read += 1

        try:
            line = next(iterator)
        except StopIteration:
            return