File: _insert.pyx

package info (click to toggle)
python-iow 1.0.8-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 660 kB
  • sloc: python: 2,322; makefile: 24; sh: 12
file content (131 lines) | stat: -rw-r--r-- 4,571 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
# encoding: utf-8
# cython: profile=False, boundscheck=False, wraparound=False

from ._bp cimport BP
from . import to_skbio_treenode
import pandas as pd
import json
import skbio
cimport cython


# see the comment in _insert_setup. Avoid the use of invalidate_caches as it
# is very expensive for tree mutation operations
class TreeNodeNoValidate(skbio.TreeNode):
    def invalidate_caches(self):
        pass


# our noop used when monkey patching invalidate_caches
def noop(*arg, **kwargs):
    pass


# pandas apply functions for preallocation of objects in bulk
def _preallocate_fragment(r):
    return TreeNodeNoValidate(name=r['fragment'], length=r['pendant_length'])


def _preallocate_empty(r):
    return TreeNodeNoValidate()


def _insert_setup(placements, bptree, insert_type):
    # insertion setup addresses:
    # * treenode caching
    # * placement ordering
    # * preallocation of objects where "easy"

    sktree = to_skbio_treenode(bptree)
    node_lookup = {n.edge_num: n for n in sktree.traverse(include_self=True)}

    # mutation operations against TreeNode is expensive as every append or
    # remove triggers a call to invalidate caches, which requires a traversal
    # to find the root (and do other stuff). so let's monkey patch the method
    # to force a noop
    for node in sktree.traverse(include_self=True):
        node.invalidate_caches = noop

    # we are only setup to handle a single placement per fragment, so pull 
    # deduplicated following guidance from Prof. Siavash Mirarab. We sort so
    # "better" has a smaller index value
    # fragment -> group the rows by the fragment, fragment order doesn't matter
    # like_weight_ratio -> our first selection criteria, higher is better
    # pendant_length -> our second selection criteria, lower is better
    placements = placements.sort_values(['fragment', 'like_weight_ratio', 
                                         'pendant_length'],
                                        ascending=[True, False, True])

    # take the first non-duplicated row per fragment. because of the sort, this
    # is assured to be the highest weight ratio, and the smallest pendant 
    # length. Ties are handled arbitrarily. 
    placements = placements[~placements['fragment'].duplicated()]

    if insert_type == 'multifurcating':
        placements = placements.sort_values(['edge_num', 'pendant_length'])
    elif insert_type == 'fully_resolved':
        placements = placements.sort_values(['edge_num', 'distal_length'],
                                            ascending=[True, False])
    else:
        raise ValueError()


    placements['node'] = placements.apply(_preallocate_fragment, axis=1)

    if insert_type == 'fully_resolved':
        placements['parent'] = placements.apply(_preallocate_empty, axis=1)

    return placements, sktree, node_lookup


# pd.DataFrame is not a resolved type so we cannot use it here for cython
def insert_fully_resolved(object placements, BP bptree):
    """Update the backbone, fully resolving edges with multiple queries

    Parameters
    ----------
    placements : pd.DataFrame
        jplace data represented as a DataFrame
    bptree : bp.BP
        An instance of a BP tree, this is expected to contain edge numbers
        and correspond to the backbone for the jplace data

    Returns
    -------
    skbio.TreeNode
        A tree with the fragments placed
    """
    # TODO: profile, type and re-profile
    placements, sktree, node_lookup = \
        _insert_setup(placements, bptree, 'fully_resolved')

    for edge, edge_grp in placements.groupby('edge_num'):
        existing_node = node_lookup[edge]
        current_parent = existing_node.parent
        
        # break the edge
        current_parent.remove(existing_node)
        existing_node.parent = None
        existing_length = existing_node.length

        for _, fragment in edge_grp.iterrows():
            distal_length = fragment['distal_length']
            fragment_node = fragment['node']
            fragment_parent = fragment['parent']

            # update branch lengths
            fragment_parent.length = existing_length - distal_length
            existing_length = distal_length

            # attach the nodes
            fragment_parent.append(fragment_node)
            current_parent.append(fragment_parent)

            # update
            current_parent = fragment_parent
        
        existing_node.length = existing_length
        current_parent.append(existing_node)
        existing_node.length = distal_length

    return sktree