1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81
|
#cython: language_level=3
'''
Created on March 8th 2018
@author: cmercier
'''
from .tab import tabIterator
import types
def ngsfilterIterator(lineiterator,
                      bytes sep = None,
                      bytes dec = b".",
                      bint stripwhite=True,
                      bint blanklineskip=True,
                      bytes commentchar=b"#",
                      bytes nastring=b"NA",
                      int skip=0,
                      only=None,
                      firstline=None,
                      int buffersize=100000000
                      ):
    '''
    Rewrite ngsfilter lines as a tab-separated table and parse it with
    tabIterator.

    Each non-blank input line is split on whitespace. Its third field, the
    tag field written as ``forward[:reverse]``, is expanded into two columns
    (forward tag, reverse tag). The first six resulting columns are joined
    with tabs and placed under a fixed header line. All input lines are read
    into memory before the rewritten lines are handed to tabIterator.

    Lines are expected to be bytes: they are split and joined with bytes
    separators.

    @param lineiterator: a file name (str or bytes, opened with uopen), a
                         LineBuffer, an object with a ``readlines`` attribute
                         (wrapped in a LineBuffer), or an object with
                         ``__next__`` yielding lines
    @param sep: accepted but never read by this function; input is split on
                whitespace and the output is always tab-separated
    @param dec: forwarded unchanged to tabIterator
    @param stripwhite: forwarded unchanged to tabIterator
    @param blanklineskip: forwarded unchanged to tabIterator
    @param commentchar: forwarded unchanged to tabIterator
    @param nastring: replacement for a tag written as ``-``, ``None`` or an
                     empty string; also forwarded to tabIterator
    @param skip: forwarded unchanged to tabIterator
    @param only: forwarded unchanged to tabIterator
    @param firstline: if not None, a line processed before those read from
                      lineiterator
    @param buffersize: buffer size given to LineBuffer when lineiterator has
                       a ``readlines`` attribute

    @return: the value returned by tabIterator for the rewritten lines

    @raise Exception: if lineiterator is none of the supported kinds
    '''

    cdef list all_lines
    cdef bytes header
    cdef bytes out_sep

    out_sep = b"\t"

    if isinstance(lineiterator, (str, bytes)):
        lineiterator = uopen(lineiterator)

    if isinstance(lineiterator, LineBuffer):
        iterator = iter(lineiterator)
    elif hasattr(lineiterator, "readlines"):
        iterator = iter(LineBuffer(lineiterator, buffersize))
    elif hasattr(lineiterator, '__next__'):
        iterator = lineiterator
    else:
        raise Exception("Invalid line iterator")

    all_lines = list(iterator)
    if firstline is not None:
        all_lines.insert(0, firstline)

    # Header line carrying the column names. Anything after the sixth output
    # column (the additional information part of a line) is dropped below.
    column_names = [b"experiment", b"sample", b"forward_tag", b"reverse_tag", b"forward_primer", b"reverse_primer"]
    header = out_sep.join(column_names)
    new_lines = [header]

    # Spellings of a missing tag that are replaced by nastring.
    missing_tags = (b"-", b"None", b"")

    for line in all_lines:
        fields = line.split(maxsplit=5)

        if not fields:
            # Blank line: nothing to rewrite.
            continue

        if len(fields) < 3:
            # No tag field to expand (e.g. a short comment line). Forward the
            # line tab-joined, exactly like longer lines are, instead of
            # failing with an IndexError on the missing third field.
            # NOTE(review): tabIterator receives commentchar, so it is
            # presumably the one discarding comment lines -- confirm.
            new_lines.append(out_sep.join(fields))
            continue

        tags = [nastring if tag in missing_tags else tag
                for tag in fields[2].split(b":")]
        if len(tags) == 1:
            # Forward and reverse tags are the same
            tags.append(tags[0])

        # Replace the single tag field by the forward and reverse tag columns.
        fields[2:3] = tags[0:2]
        new_lines.append(out_sep.join(fields[0:6]))

    return tabIterator(iter(new_lines),
                       header = True,
                       sep = out_sep,
                       dec = dec,
                       stripwhite = stripwhite,
                       blanklineskip = blanklineskip,
                       commentchar = commentchar,
                       nastring = nastring,
                       skip = skip,
                       only = only,
                       firstline = None)
|