File: link_manager.py

package info (click to toggle)
glueviz 0.9.1%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 17,180 kB
  • ctags: 6,728
  • sloc: python: 37,111; makefile: 134; sh: 60
file content (229 lines) | stat: -rw-r--r-- 7,032 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
"""
The LinkManager class is responsible for maintaining the conistency
of the "web of links" in a DataCollection. It discovers how to
combine ComponentLinks together to discover all of the ComponentIDs
that a Data object can derive,

As a trivial example, imagine a chain of 2 ComponentLinks linking
ComponentIDs across 3 datasets:

Data:        D1        D2       D3
ComponentID: x         y        z
Link:        <---x2y---><--y2z-->

The LinkManager autocreates a link from D1.id['x'] to D3.id['z']
by chaining x2y and y2z.
"""

from __future__ import absolute_import, division, print_function

import logging

from glue.external import six
from glue.core.contracts import contract
from glue.core.link_helpers import LinkCollection
from glue.core.component_link import ComponentLink
from glue.core.data import Data, DerivedComponent


def accessible_links(cids, links):
    """ Calculate all ComponentLink objects in a list
    that can be calculated from a collection of componentIds

    :param cids: Collection of ComponentID objects
    :param links: Iterable of ComponentLink objects

    :rtype: list
    A list of all links that can be evaluated
    given the input ComponentIDs
    """
    cids = set(cids)
    return [l for l in links if
            set(l.get_from_ids()) <= cids]


def discover_links(data, links):
    """ Discover all links to components that can be derived
    based on the current components known to a dataset, and a set
    of ComponentLinks.

    :param Data: Data object to discover new components for
    :param links: Set of ComponentLinks to use

    :rtype: dict
    A dict of componentID -> componentLink
    The ComponentLink that data can use to generate the componentID.
    """

    # TODO: try to add shortest paths first -- should
    # prevent lots of repeated checking

    cids = set(data.primary_components)
    cid_links = {}
    depth = {}
    for cid in cids:
        depth[cid] = 0

    while True:
        for link in accessible_links(cids, links):
            from_ = set(link.get_from_ids())
            to_ = link.get_to_id()
            cost = max([depth[f] for f in from_]) + 1
            if to_ in cids and cost >= depth[to_]:
                continue
            depth[to_] = cost
            cids.add(to_)
            cid_links[to_] = link
            break
        else:
            # no more links to add
            break
    return cid_links


def find_dependents(data, link):
    """ Determine which `DerivedComponents` in a data set
    depend (either directly or implicitly) on a given
    `ComponentLink`.

    :param data: The data object to consider
    :param link: The `ComponentLink` object to consider

    :rtype: set
    A `set` of `glue.core.component.DerivedComponent` IDs that cannot be
    calculated without the input `Link`
    """
    dependents = set()
    visited = set()
    while True:
        for derived in data.derived_components:
            derived = data.get_component(derived)
            if derived in visited:
                continue
            to_, from_ = derived.link.get_to_id(), derived.link.get_from_ids()
            if derived.link is link:
                dependents.add(to_)
                visited.add(derived)
                break
            if any(f in dependents for f in from_):
                dependents.add(to_)
                visited.add(derived)
                break
        else:
            break  # nothing more to remove
    return dependents


class LinkManager(object):

    """A helper class to generate and store ComponentLinks,
    and compute which components are accesible from which data sets
    """

    def __init__(self):
        self._links = set()
        self._duplicated_ids = []

    def add_link(self, link):
        """
        Ingest one or more ComponentLinks to the manager

        Parameters
        ----------
        link : ComponentLink, LinkCollection, or list thereof
           The link(s) to ingest
        """
        if isinstance(link, (LinkCollection, list)):
            for l in link:
                self.add_link(l)
        else:
            self._links.add(link)
            if link.identity:
                self._add_duplicated_id(link)
        self._reassign_mergers()

    def _add_duplicated_id(self, link):
        frm = link.get_from_ids()
        assert len(frm) == 1
        frm = frm[0]
        to = link.get_to_id()
        if (frm, to) in self._duplicated_ids:
            return
        if (to, frm) in self._duplicated_ids:
            return
        self._duplicated_ids.append((frm, to))

    def _reassign_mergers(self):
        """Update all links such that any reference to a duplicate
        componentID is replaced with the original"""
        for l in self._links:
            for o, d in self._duplicated_ids:
                l.replace_ids(d, o)

    def _merge_duplicate_ids(self, data):
        for o, d in self._duplicated_ids:
            if d in data.components:
                data.update_id(d, o)

    @contract(link=ComponentLink)
    def remove_link(self, link):
        logging.getLogger(__name__).debug('removing link %s', link)
        self._links.remove(link)

    @contract(data=Data)
    def update_data_components(self, data):
        """Update all the DerivedComponents in a data object, based on
        all the Components deriveable based on the links in self.

        This overrides any ComponentLinks stored in the
        DerivedComponents of the data itself -- any components which
        depend on a link not tracked by the LinkManager will be
        deleted.

        Parameters
        -----------
        data : Data object

        Behavior
        --------
        DerivedComponents will be replaced / added into
        the data object
        """
        self._merge_duplicate_ids(data)
        self._remove_underiveable_components(data)
        self._add_deriveable_components(data)

    def _remove_underiveable_components(self, data):
        """ Find and remove any DerivedComponent in the data
        which requires a ComponentLink not tracked by this LinkManager
        """
        data_links = set(data.get_component(dc).link
                         for dc in data.derived_components)
        missing_links = data_links - self._links
        to_remove = []
        for m in missing_links:
            to_remove.extend(find_dependents(data, m))

        for r in to_remove:
            data.remove_component(r)

    def _add_deriveable_components(self, data):
        """Find and add any DerivedComponents that a data object can
        calculate given the ComponentLinks tracked by this
        LinkManager

        """
        links = discover_links(data, self._links)
        for cid, link in six.iteritems(links):
            d = DerivedComponent(data, link)
            data.add_component(d, cid)

    @property
    def links(self):
        return list(self._links)

    def clear(self):
        self._links.clear()

    def __contains__(self, item):
        return item in self._links