File: lastz_parser.py

package info (click to toggle)
ragout 2.3-5
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 72,976 kB
  • sloc: python: 4,624; cpp: 2,314; makefile: 83; sh: 43
file content (99 lines) | stat: -rw-r--r-- 3,273 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99

#(c) 2013-2014 by Authors
#This file is a part of Ragout program.
#Released under the BSD license (see LICENSE file)

"""
Functions for lastz input handling
"""

from __future__ import absolute_import
from __future__ import print_function
import subprocess
import os
from itertools import combinations

from .common import AlignmentColumn, AlignmentRow
from six.moves import filter

def parse_lastz_maf(filename):
    alignments = []
    with open(filename, "r") as f:
        while True:
            line = f.readline()
            if not line:
                break

            if not line.startswith("a"):
                state = 1
                continue

            #read two next lines
            ref_line = f.readline().strip()
            qry_line = f.readline().strip()
            assert ref_line.startswith("s") and qry_line.startswith("s")

            def parse_column(string):
                seq_id, start, aln_len, strand, seq_len = string.split()[1:6]
                aln_len, seq_len = int(aln_len), int(seq_len)
                strand = 1 if strand == "+" else -1
                if strand > 0:
                    start, end = int(start), int(start) + aln_len
                else:
                    end = seq_len - 1 - int(start)
                    start = end - aln_len

                assert end >= start
                return AlignmentRow(start, end, strand, seq_len, seq_id)

            alignments.append(AlignmentColumn(parse_column(ref_line),
                                              parse_column(qry_line)))

    return alignments


def filter_intersecting(alignments):
    to_filter = set()

    def filter_by_rows(rows):
        for row_1, row_2 in combinations(rows, 2):
            if row_1.seq_id != row_2.seq_id:
                continue

            if row_1.start <= row_2.start <= row_1.end:
                to_filter.add(row_2)
                if not (row_1.start <= row_2.end <= row_1.end):
                    to_filter.add(row_1)
            continue

            #if row_2.start <= row_1.start <= row_2.end:
            #    to_filter.add(row_1)
            #    if not (row_2.start <= row_1.end <= row_2.end):
            #        to_filter.add(row_2)

    filter_by_rows([ap.ref for ap in alignments])
    filter_by_rows([ap.qry for ap in alignments])

    return [a for a in alignments if a.ref not in to_filter and
                                     a.qry not in to_filter]


def filter_by_length(alignments, min_len):
    func = (lambda a: abs(a.ref.start - a.ref.end) > min_len and
                      abs(a.qry.start - a.qry.end) > min_len)
    return list(filter(func, alignments))


LASTZ_BIN = "lastz"
def run_lastz(reference, target, out_file):
    print("Running lastz")
    reference = os.path.abspath(reference)
    target = os.path.abspath(target)
    out_file = os.path.abspath(out_file)
    cmdline = [LASTZ_BIN, reference + "[multiple,nameparse=darkspace]",
               target + "[nameparse=darkspace]","--notransition",
               "--step=20", "--chain", "--gapped", "--gfextend",
               "--ambiguous=iupac", "--format=maf", "--output=" + out_file,
               "--rdotplot=dotplot.txt"]
    devnull = os.devnull
    subprocess.check_call(cmdline, stderr=open(devnull, "w"))