File: cluster.pyx

package info (click to toggle)
python-bx 0.13.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 5,000 kB
  • sloc: python: 17,136; ansic: 2,326; makefile: 24; sh: 8
file content (122 lines) | stat: -rw-r--r-- 3,731 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
"""
Kanwei Li, 2009
Inspired by previous ClusterTree

Provides a ClusterTree data structure that supports efficient finding of 
clusters of intervals that are within a certain distance apart.

This clustering algorithm uses a binary tree structure. Nodes correspond to 
non-overlapping intervals, where overlapping means that the distance between
two intervals is less or equal to the max separation.

The tree self-balances using rotations based on the binomial sequence. Merges
among nodes are performed whenever a node is changed/added that will cause other
nodes to form a new cluster.

C source code is in src/cluster.c
"""

cdef extern from "cluster.h":
    
    cdef struct struct_interval:
        int start
        int end
        int id
        struct_interval * next

    ctypedef struct_interval interval
    
    cdef struct struct_clusternode:
        int start
        int end
        struct_interval *interval_head
        struct_interval *interval_tail
    
    ctypedef struct_clusternode clusternode
    
    cdef struct struct_clustertree:
        int max_dist
        int min_intervals

        struct_clusternode *root
    
    ctypedef struct_clustertree clustertree
    
    cdef struct struct_treeitr:
        struct_treeitr *next
        struct_clusternode *node
    
    ctypedef struct_treeitr treeitr
    
    clusternode* clusternode_insert(clustertree *tree, clusternode *node, int start, int end, int id)
    clustertree* create_clustertree(int max_dist, int min_intervals)
    treeitr* clusteritr(clustertree *tree)
    void freeclusteritr(treeitr *itr)
    void free_tree(clustertree *tree)

cdef class ClusterTree:
    cdef clustertree *tree
    cdef int mincols
    cdef int minregions
    
    def __cinit__(self, mincols, minregions):
        self.tree = create_clustertree(mincols, minregions)
        self.mincols = mincols
        self.minregions = minregions
        
    def __dealloc__(self):
        free_tree(self.tree)
    
    def insert(self, s, e, id):
        ''' Insert an interval with start, end, id as parameters'''
        if s > e: raise ValueError("Interval start must be before end")
        self.tree.root = clusternode_insert(self.tree, self.tree.root, s, e, id)
                
    def getregions(self):
        ''' Returns a list clusters in ascending order of starting position.
            Each cluster is a tuple of (start, end, [sorted ids of intervals in cluster])
        
        tree = ClusterTree(0, 0)
        Insert (6, 7, 1), (1, 2, 3), (9, 10, 2), (3, 4, 0), (3, 8, 4)
        tree.getregions() returns [(1, 2, [3]), (3, 8, [0, 1, 4]), (9, 10, [2])]
        '''
        cdef treeitr *itr
        cdef interval *ival
        
        regions = []
        itr = clusteritr(self.tree)
        
        while (itr):
            ids = []
            ival = itr.node.interval_head
            while (ival):
                ids.append(ival.id)
                ival = ival.next

            regions.append( (itr.node.start, itr.node.end, sorted(ids)) )
            itr = itr.next
        freeclusteritr(itr)
        return regions
        
    def getlines(self):
        ''' Similar to getregions except it just returns a list of ids of intervals
            The above example would return [3, 0, 1, 4, 2]
         '''
        cdef treeitr *itr
        cdef interval *ival
        
        lines = []
        itr = clusteritr(self.tree)
        
        while (itr):
            ids = []
            ival = itr.node.interval_head
            while (ival):
                ids.append(ival.id)
                ival = ival.next
            
            lines.extend(sorted(ids))
            itr = itr.next
        freeclusteritr(itr)
        return lines