File: parse.py

package info (click to toggle)
hinge 0.5.0-8
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 2,972 kB
  • sloc: cpp: 9,480; ansic: 8,826; python: 5,023; sh: 340; makefile: 10
file content (44 lines) | stat: -rwxr-xr-x 1,284 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
#!/usr/bin/python3

import sys
min_len_aln = 1000


def iter_seq_groups(lines, min_len_aln=min_len_aln, min_cov_aln=1,
                    max_n_read=None, config=None):
    """Group ``<read_id> <sequence>`` records read from *lines*.

    Each input line is stripped and split on whitespace; lines that do not
    yield exactly two fields are ignored.  Every accepted line (marker lines
    included) is echoed to stdout as ``read_id seq``.

    The first field selects the action:

    * ``+`` closes the current group.  If it holds at least *min_cov_aln*
      sequences, a tuple is yielded (see below).  The group is then reset.
    * ``*`` discards the current group without yielding.
    * ``-`` stops processing; any group still open is dropped.
    * anything else is a read.  Reads shorter than *min_len_aln* are skipped.
      The first accepted read of a group becomes the seed and is stored
      twice on purpose: once as the seed and once as an ordinary read.
      A read id already seen in the current group is not added again.

    Args:
        lines: Iterable of text lines (for example an open text stream).
        min_len_aln: Minimum sequence length for a read to be kept.
        min_cov_aln: Minimum number of stored sequences (seed included) a
            group needs in order to be yielded.
        max_n_read: Upper bound on the number of sequences per yielded
            group; ``None`` means no limit.
        config: Opaque value passed through unchanged as the third element
            of every yielded tuple.

    Yields:
        ``(seqs, seed_id, config)`` where ``seqs[0]`` is the seed sequence
        and the remaining sequences are ordered longest first.

    NOTE(review): the original script referenced ``min_cov_aln``,
    ``max_n_read`` and ``config`` without defining them.  The defaults used
    here (1, no limit, ``None``) are assumptions -- confirm the intended
    values.
    """
    seqs = []
    read_ids = set()
    seed_id = None

    for line in lines:
        fields = line.strip().split()
        if len(fields) != 2:
            continue

        read_id, seq = fields

        # Echo happens before dispatch, so marker lines are printed as well.
        print(read_id, seq)

        if read_id == "-":
            break

        if read_id in ("+", "*"):
            if read_id == "+" and len(seqs) >= min_cov_aln:
                # Keep the seed first; order the other reads longest first.
                # sorted() is stable, so equal-length reads keep input order.
                ranked = seqs[:1] + sorted(seqs[1:], key=len, reverse=True)
                yield (ranked[:max_n_read], seed_id, config)
            seqs = []
            read_ids = set()
            seed_id = None
            continue

        if len(seq) < min_len_aln:
            continue

        if not seqs:
            seqs.append(seq)  # the "seed"
            seed_id = read_id
        # Avoid using the same read twice.  The seed is deliberately added
        # again here because its id is not yet in read_ids.
        if read_id not in read_ids:
            seqs.append(seq)
            read_ids.add(read_id)


def main():
    """Parse stdin; the visible output is the per-line echo to stdout."""
    with sys.stdin as stream:
        # The grouped tuples have no consumer in this script, so the
        # generator is simply drained to drive the echo.
        for _ in iter_seq_groups(stream):
            pass


if __name__ == "__main__":
    main()