File: moleculo_filter_contigs.py

package info (click to toggle)
spades 3.13.1+dfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: bullseye, sid
  • size: 22,172 kB
  • sloc: cpp: 136,213; ansic: 48,218; python: 16,809; perl: 4,252; sh: 2,115; java: 890; makefile: 507; pascal: 348; xml: 303
file content (44 lines) | stat: -rw-r--r-- 1,546 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
#!/usr/bin/python3

############################################################################
# Copyright (c) 2015 Saint Petersburg State University
# All Rights Reserved
# See file LICENSE for details.
############################################################################


import sam_parser

import SeqIO

import sys

class PatternContigFilter:
    """Flag contigs that received an alignment from a pattern-bearing pair.

    Records from ``sam`` are consumed two at a time.  When the upper-cased
    sequence of either record in a pair contains ``pattern`` or
    ``rc_pattern``, the contig index (``tid``) of every mapped record of
    that pair is flagged.
    NOTE(review): presumably two consecutive records are the mates of one
    paired read -- confirm against the producer of ``sam``.
    """

    def __init__(self, contigs, sam, pattern, rc_pattern):
        """Scan ``sam`` once and build the per-contig flag table.

        contigs    -- sized collection; only its length is used, to size
                      the flag table.
        sam        -- iterable of records exposing ``seq``, ``is_unmapped``
                      and ``tid``; it is also kept for the ``gettid``
                      lookups done by ``Filter``.
        pattern    -- substring searched for in the upper-cased sequences.
        rc_pattern -- second substring searched for in the same sequences
                      (judging by the name, the reverse complement of
                      ``pattern`` -- TODO confirm with the caller).
        """
        self.sam = sam
        self.filter = [False] * len(contigs)
        needles = (pattern, rc_pattern)

        # Zipping one iterator with itself yields non-overlapping pairs of
        # consecutive records; a trailing unpaired record is dropped.
        records = iter(sam)
        for pair in zip(records, records):
            sequences = [str(mate.seq.upper()) for mate in pair]
            hit = any(needle in sequence
                      for sequence in sequences
                      for needle in needles)
            if not hit:
                continue
            for mate in pair:
                # Unmapped records are skipped, so their tid is never used
                # as an index.
                if not mate.is_unmapped:
                    self.filter[mate.tid] = True

    def Filter(self, contig):
        """Return the flag stored for ``contig``.

        The contig is located through ``self.sam.gettid(contig.id)``.
        """
        contig_index = self.sam.gettid(contig.id)
        return self.filter[contig_index]

class ContigLengthFilter:
    """Accept contigs whose sequence is at least ``min_length`` long."""

    def __init__(self, min_length):
        """Store the inclusive lower bound on contig sequence length."""
        self.min_length = min_length

    def Filter(self, contig):
        """Return True when ``len(contig.seq)`` is at least the minimum."""
        contig_length = len(contig.seq)
        return contig_length >= self.min_length

# def Filter(contigs, left_reads, right_reads, length_threshold, pattern):
#     dataset_data = pyyaml.load(yaml_file, 'r')