File: compare-assemblies.py

package info (click to toggle)
ragout 2.3-5
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 72,976 kB
  • sloc: python: 4,624; cpp: 2,314; makefile: 83; sh: 43
file content (110 lines) | stat: -rwxr-xr-x 3,714 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/usr/bin/env python2.7

#(c) 2013-2014 by Authors
#This file is a part of Ragout program.
#Released under the BSD license (see LICENSE file)

"""
This script counts the number of rearrangements between two assemblies,
thus evaluating the 'agreement level' between them
"""

from __future__ import print_function
from __future__ import absolute_import
from collections import defaultdict
import sys
import os
import argparse

import networkx as nx

from utils.lastz_parser import (parse_lastz_maf, run_lastz,
                                filter_intersecting, filter_by_length)
from six.moves import map
from six.moves import zip


def get_alignment(reference, target, overwrite):
    """
    Return the parsed lastz alignment of `reference` against `target`.

    The MAF file name is built from the base names of both inputs, so it has
    no directory component. lastz is invoked only when that file is missing
    or when `overwrite` is true; otherwise the existing file is reused.
    In both cases the file is then parsed with parse_lastz_maf.
    """
    maf_name = "_".join((os.path.basename(reference),
                         os.path.basename(target))) + ".maf"

    must_align = overwrite or not os.path.isfile(maf_name)
    if must_align:
        run_lastz(reference, target, maf_name)
    else:
        print("Alignment file already exists, lastz run skipped")

    return parse_lastz_maf(maf_name)


def get_blocks(reference, target, overwrite, min_alignmtnt):
    """
    Turn the alignment of two assemblies into signed block permutations.

    The alignment is obtained with get_alignment, then passed through
    filter_by_length (with `min_alignmtnt` as the threshold) and
    filter_intersecting. Each remaining alignment pair gets the id
    (position in the filtered alignment + 1), shared by its reference and
    query rows.

    Returns a (ref_blocks, qry_blocks) tuple. Each element maps a sequence id
    to the list of its block ids ordered by row start, every id multiplied
    by the row's strand (presumably +1/-1 -- confirm against lastz_parser).
    """
    pairs = get_alignment(reference, target, overwrite)
    pairs = filter_by_length(pairs, min_alignmtnt)
    pairs = filter_intersecting(pairs)
    # (a join_collinear pass over the alignment is commented out upstream
    # and is not applied here either)

    def signed_blocks_by_seq(rows):
        # Group rows per sequence, remembering each row's global index.
        grouped = defaultdict(list)
        for index, row in enumerate(rows):
            grouped[row.seq_id].append((index, row))

        # Order every group along its sequence and convert to signed ids.
        signed = defaultdict(list)
        for seq_id, members in grouped.items():
            ordered = sorted(members, key=lambda member: member[1].start)
            signed[seq_id] = [(index + 1) * row.strand
                              for index, row in ordered]
        return signed

    # The two row lists must stay in the same order: the shared index is
    # what ties a reference block to its query counterpart.
    ref_rows = [pair.ref for pair in pairs]
    qry_rows = [pair.qry for pair in pairs]

    return signed_blocks_by_seq(ref_rows), signed_blocks_by_seq(qry_rows)


def output_blocks(blocks):
    """
    Print block permutations to stdout, two lines per sequence.

    For every (sequence id, block list) entry of `blocks` a ">id" header
    line is printed, followed by a line with the space-separated block ids,
    each rendered with an explicit sign.
    """
    for seq_name, signed_ids in blocks.items():
        header = ">{0}".format(seq_name)
        body = " ".join(format(block_id, "+d") for block_id in signed_ids)
        print(header)
        print(body)


def count_discord_adj(ref_blocks, qry_blocks):
    """
    Count breakpoint-graph nodes that have more than one distinct neighbour.

    Both arguments map a sequence id to its list of signed block ids.
    For every pair of consecutive blocks (a, b) of a sequence an edge between
    the nodes -a and b is added to a multigraph; edges coming from
    `ref_blocks` are coloured "blue", those from `qry_blocks` "green".

    Returns the number of nodes adjacent to at least two different nodes.
    """
    #building breakpoint graph
    graph = nx.MultiGraph()
    colored_sources = (("blue", ref_blocks), ("green", qry_blocks))
    for color, seq_blocks in colored_sources:
        for seq, blocks in seq_blocks.items():
            for block_1, block_2 in zip(blocks[:-1], blocks[1:]):
                graph.add_edge(-block_1, block_2, name=seq, color=color)

    # Graph.neighbors() returns an iterator since networkx 2.0, so calling
    # len() on it raises TypeError. The adjacency view graph[node] holds one
    # entry per distinct neighbour and supports len(); iterating the graph
    # itself yields its nodes. Both forms work on old and new networkx.
    return sum(1 for node in graph if len(graph[node]) > 1)


def main():
    """
    Command-line entry point.

    Parses two assembly paths plus the optional --overwrite and -b/--block
    arguments, builds the block permutations of both assemblies with
    get_blocks and prints the value returned by count_discord_adj.
    """
    parser = argparse.ArgumentParser(description="Compare two assemblies")
    parser.add_argument("assembly_1", metavar="assembly_1",
                        help="path to first assembly")
    parser.add_argument("assembly_2", metavar="assembly_2",
                        help="path to second assembly")
    parser.add_argument("--overwrite", action="store_true",
                        dest="overwrite", default=False,
                        help="overwrite existing lastz alignment")
    # type=int lets argparse reject a non-numeric value with a usage error
    # instead of failing later with a ValueError traceback.
    parser.add_argument("-b", "--block", dest="block_size",
                        type=int, default=5000,
                        help="minimum synteny block size")
    args = parser.parse_args()

    ref_blocks, qry_blocks = get_blocks(args.assembly_1, args.assembly_2,
                                        args.overwrite, args.block_size)
    # Uncomment to dump the block permutations for debugging:
    #output_blocks(ref_blocks)
    #output_blocks(qry_blocks)
    print(count_discord_adj(ref_blocks, qry_blocks))


# Run the comparison only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()