| 12
 3
 4
 5
 6
 7
 8
 9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
 100
 101
 102
 103
 104
 105
 106
 107
 108
 109
 110
 111
 112
 113
 114
 115
 116
 117
 118
 119
 120
 121
 122
 
 | """
Kanwei Li, 2009
Inspired by previous ClusterTree
Provides a ClusterTree data structure that supports efficient finding of 
clusters of intervals that are within a certain distance apart.
This clustering algorithm uses a binary tree structure. Nodes correspond to 
non-overlapping intervals, where overlapping means that the distance between
two intervals is less or equal to the max separation.
The tree self-balances using rotations based on the binomial sequence. Merges
among nodes are performed whenever a node is changed/added that will cause other
nodes to form a new cluster.
C source code is in src/cluster.c
"""
cdef extern from "cluster.h":
    
    cdef struct struct_interval:
        int start
        int end
        int id
        struct_interval * next
    ctypedef struct_interval interval
    
    cdef struct struct_clusternode:
        int start
        int end
        struct_interval *interval_head
        struct_interval *interval_tail
    
    ctypedef struct_clusternode clusternode
    
    cdef struct struct_clustertree:
        int max_dist
        int min_intervals
        struct_clusternode *root
    
    ctypedef struct_clustertree clustertree
    
    cdef struct struct_treeitr:
        struct_treeitr *next
        struct_clusternode *node
    
    ctypedef struct_treeitr treeitr
    
    clusternode* clusternode_insert(clustertree *tree, clusternode *node, int start, int end, int id)
    clustertree* create_clustertree(int max_dist, int min_intervals)
    treeitr* clusteritr(clustertree *tree)
    void freeclusteritr(treeitr *itr)
    void free_tree(clustertree *tree)
cdef class ClusterTree:
    cdef clustertree *tree
    cdef int mincols
    cdef int minregions
    
    def __cinit__(self, mincols, minregions):
        self.tree = create_clustertree(mincols, minregions)
        self.mincols = mincols
        self.minregions = minregions
        
    def __dealloc__(self):
        free_tree(self.tree)
    
    def insert(self, s, e, id):
        ''' Insert an interval with start, end, id as parameters'''
        if s > e: raise ValueError("Interval start must be before end")
        self.tree.root = clusternode_insert(self.tree, self.tree.root, s, e, id)
                
    def getregions(self):
        ''' Returns a list clusters in ascending order of starting position.
            Each cluster is a tuple of (start, end, [sorted ids of intervals in cluster])
        
        tree = ClusterTree(0, 0)
        Insert (6, 7, 1), (1, 2, 3), (9, 10, 2), (3, 4, 0), (3, 8, 4)
        tree.getregions() returns [(1, 2, [3]), (3, 8, [0, 1, 4]), (9, 10, [2])]
        '''
        cdef treeitr *itr
        cdef interval *ival
        
        regions = []
        itr = clusteritr(self.tree)
        
        while (itr):
            ids = []
            ival = itr.node.interval_head
            while (ival):
                ids.append(ival.id)
                ival = ival.next
            regions.append( (itr.node.start, itr.node.end, sorted(ids)) )
            itr = itr.next
        freeclusteritr(itr)
        return regions
        
    def getlines(self):
        ''' Similar to getregions except it just returns a list of ids of intervals
            The above example would return [3, 0, 1, 4, 2]
         '''
        cdef treeitr *itr
        cdef interval *ival
        
        lines = []
        itr = clusteritr(self.tree)
        
        while (itr):
            ids = []
            ival = itr.node.interval_head
            while (ival):
                ids.append(ival.id)
                ival = ival.next
            
            lines.extend(sorted(ids))
            itr = itr.next
        freeclusteritr(itr)
        return lines
        
 |