File: Region.pyx

package info (click to toggle)
macs 3.0.2-3
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 378,728 kB
  • sloc: ansic: 5,879; python: 4,342; sh: 451; makefile: 83
file content (429 lines) | stat: -rw-r--r-- 13,615 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
# cython: language_level=3
# cython: profile=True
# Time-stamp: <2024-02-12 15:26:24 Tao Liu>

"""Module for Region classe.

This code is free software; you can redistribute it and/or modify it
under the terms of the BSD License (see the file LICENSE included with
the distribution).
"""

# ------------------------------------
# python modules
# ------------------------------------
#import random
#import re
#import sys

# ------------------------------------
# MACS3 modules
# ------------------------------------

from MACS3.Utilities.Constants import *

# ------------------------------------
# Other modules
# ------------------------------------

from cpython cimport bool

# ------------------------------------
# constants
# ------------------------------------
__version__ = "Region $Revision$"
__author__ = "Tao Liu <vladimir.liu@gmail.com>"
__doc__ = "Region class"

# ------------------------------------
# Misc functions
# ------------------------------------

# ------------------------------------
# Classes
# ------------------------------------
cdef class Regions:
    """For plain region of chrom, start and end
    """
    cdef:
        public dict regions
        public int total
        bool __sorted
        bool __merged

    def __init__ (self):
        self.regions= {}
        self.__sorted = False
        self.__merged =  False
        self.total = 0

    def __getitem__ (self, chrom):
        return self.regions[ chrom ]
        
    cpdef pop ( self, int n ):
        # when called, pop the first n regions in Regions class. Self will be modified.
        cdef:
            list clist # for chromosomes
            int tmp_l
            bytes chrom
            int n_taken # remember the number of regions prepared
            object ret # returned Regions

        if self.total == 0:
            raise Exception("None left")
            
        clist = sorted(list(self.regions.keys()))
        n_taken = n
        ret = Regions()
        for chrom in sorted(clist):
            ret.regions[chrom] = self.regions[chrom][:n_taken]
            self.regions[chrom] = self.regions[chrom][n_taken:]
            if not self.regions[chrom]:
                # remove this chromosome if there is none left
                self.regions.pop( chrom )
            tmp_l = len( ret.regions[chrom] )
            ret.total +=  tmp_l
            # calculate remained
            self.total -= tmp_l
            #print( ret.total, self.total )
            n_taken -= tmp_l
            if not n_taken:
                # when there is no need, quit loop
                break
        return ret
    
    cpdef init_from_PeakIO ( self, object peaks ):
        """Initialize the object with a PeakIO object.

        Note: I intentionally forgot to check if peakio is actually a
        PeakIO...
        """
        cdef:
            bytes chrom
            list ps
            object p
            int i

        peaks.sort()
        self.total = 0
        for chrom in sorted(peaks.get_chr_names()):
            ps = peaks.get_data_from_chrom( chrom )
            self.regions[chrom] = []
            for i in range( len( ps ) ):
                p = ps[ i ]
                self.regions[chrom].append( ( p['start'], p['end'] ) )
                self.total += 1
        self.sort()

    cpdef add_loc ( self, bytes chrom, int start, int end ):
        if self.regions.has_key(chrom):
            self.regions[chrom].append( (start,end) )
        else:
            self.regions[chrom] = [ (start,end), ]
        self.total += 1
        self.__sorted = False
        self.__merged = False
        return

    cpdef sort (self):
        cdef:
            bytes chrom

        if self.__sorted:
            return
        for chrom in sorted(self.regions.keys()):
            self.regions[chrom].sort()
        self.__sorted = True

    cpdef long total_length ( self ):
        """ Return the total length of the Regions object.
        """
        cdef:
            bytes chrom
            list ps
            int i
            int tl, s, e
        self.merge_overlap()
        tl = 0
        for chrom in sorted(self.regions.keys()):
            ps = self.regions[chrom]
            for i in range( len(ps) ):
                s, e = ps[i]
                tl +=  e - s
        return tl

    cpdef set get_chr_names (self):
        return set( sorted(self.regions.keys()) )

    cpdef expand ( self, int flanking ):
        """ Expand regions to both directions with 'flanking' bps.
        """
        cdef:
            bytes chrom
            list ps
            int i

        self.sort()
        for chrom in sorted(self.regions.keys()):
            ps = self.regions[ chrom ]
            for i in range( len( ps ) ):
                ps[i] = ( max(0, ps[i][0] - flanking), ps[i][1] + flanking )
            ps.sort()
            self.regions[chrom] = ps
        self.__merged = False
    
    cpdef merge_overlap ( self ):
        """
        Merge overlapping regions of itself.
        """
        cdef:
            bytes chrom
            int s_new_region, e_new_region, i, j
            dict regions, new_regions
            list chrs
            list regions_chr
            tuple prev_region

        if self.__merged:
            return
        self.sort()
        regions = self.regions
        new_regions = {}
        
        chrs = list(regions.keys())
        chrs.sort()
        self.total = 0
        for i in range(len(chrs)):
            chrom = chrs[i]
            new_regions[chrom]=[]
            n_append = new_regions[chrom].append
            prev_region = ()
            regions_chr = regions[chrom]
            for i in range( len(regions_chr) ):
                if not prev_region:
                    prev_region = regions_chr[i]
                    continue
                else:
                    if regions_chr[i][0] <= prev_region[1]:
                        s_new_region = prev_region[0]
                        e_new_region = regions_chr[i][1]
                        prev_region = (s_new_region,e_new_region)
                    else:
                        n_append(prev_region)
                        prev_region = regions_chr[i]
            if prev_region:
                n_append(prev_region)
            self.total += len( new_regions[chrom] )
        self.regions = new_regions
        self.sort()
        self.__merged = True
        return True

    cpdef write_to_bed (self, fhd ):
        cdef:
            int i
            bytes chrom
            tuple region

        chrs = list(self.regions.keys())
        chrs.sort()
        for i in range( len(chrs) ):
            chrom = chrs[i]
            for region in self.regions[chrom]:
                fhd.write( "%s\t%d\t%d\n" % (chrom.decode(),region[0],region[1] ) )

    def __str__ ( self ):
        cdef:
            int i
            bytes chrom
            tuple region
            str ret
            
        ret = ""
        chrs = list(self.regions.keys())
        chrs.sort()
        for i in range( len(chrs) ):
            chrom = chrs[i]
            for region in self.regions[chrom]:
                ret += "%s\t%d\t%d\n" % (chrom.decode(),region[0],region[1] )
        return ret
    
    # cpdef object randomly_pick ( self, int n, int seed = 12345 ):
    #     """Shuffle the regions and get n regions out of it. Return a
    #     new Regions object.  
    #     """

    #     cdef:
    #         list all_pc
    #         list chrs
    #         bytes chrom
    #         object ret_peakio, p
    #     assert n > 0
    #     chrs = sorted(list(self.peaks.keys()))
    #     all_pc = []
    #     for chrom in chrs:
    #         all_pc.extend(self.peaks[chrom])
    #     random.seed( seed )
    #     all_pc = random.shuffle( all_pc )[:n]
    #     ret_peakio = PeakIO()
    #     for p in all_pc:
    #         ret_peakio.add_PeakContent ( p["chrom"], p )
    #     return ret_peakio


    cpdef object intersect (self, object regions_object2):
        """ Get the only intersecting regions comparing with
        regions_object2, another Regions object. Then return a new
        Regions object.
        
        """
        cdef:
            object ret_regions_object
            dict regions1, regions2
            list chrs1, chrs2, four_coords
            bytes k
            dict ret_regions
            bool overlap_found
            tuple r1, r2
            long n_rl1, n_rl2

        self.sort()
        regions1 = self.regions
        self.total = 0
        assert isinstance(regions_object2, Regions)
        
        regions_object2.sort()
        regions2 = regions_object2.regions

        ret_regions_object = Regions()
        ret_regions = dict()
        chrs1 = list(regions1.keys())
        chrs2 = list(regions2.keys())
        for k in chrs1:
            #print(f"chromosome {k}")
            if not chrs2.count(k):
                # no such chromosome in peaks1, then don't touch the peaks in this chromosome
                ret_regions[ k ] = regions1[ k ]
                self.total += len( ret_regions[ k ] )
                continue
            ret_regions[ k ] = []
            n_rl1 = len( regions1[k] )    # number of remaining elements in regions1[k]
            n_rl2 = len( regions2[k] )    # number of remaining elements in regions2[k]
            rl1_k = iter( regions1[k] ).__next__
            rl2_k = iter( regions2[k] ).__next__
            r1 = rl1_k()
            n_rl1 -= 1
            r2 = rl2_k()
            n_rl2 -= 1
            while ( True ):
                # we do this until there is no r1 or r2 left.
                if r2[0] < r1[1] and r1[0] < r2[1]:
                    # We found an overlap, now get the intersecting
                    # region.
                    four_coords = sorted([r1[0], r1[1], r2[0], r2[1]])
                    ret_regions[ k ].append( (four_coords[1], four_coords[2]) )
                if r1[1] < r2[1]:
                    # in this case, we need to move to the next r1,
                    if n_rl1:
                        r1 = rl1_k()
                        n_rl1 -= 1
                    else:
                        # no more r1 left
                        break
                else:
                    # in this case, we need to move the next r2
                    if n_rl2:
                        r2 = rl2_k()
                        n_rl2 -= 1
                    else:
                        # no more r2 left
                        break
            self.total += len( ret_regions[ k ] )

        ret_regions_object.regions = ret_regions
        ret_regions_object.sort()
        return ret_regions_object
    

    cpdef void exclude (self, object regions_object2):
        """ Remove overlapping regions in regions_object2, another Regions
        object.

        """
        cdef:
            dict regions1, regions2
            list chrs1, chrs2
            bytes k
            dict ret_regions
            bool overlap_found
            tuple r1, r2
            long n_rl1, n_rl2

        self.sort()
        regions1 = self.regions
        self.total = 0
        assert isinstance(regions_object2,Regions)
        regions_object2.sort()
        regions2 = regions_object2.regions

        ret_regions = dict()
        chrs1 = list(regions1.keys())
        chrs2 = list(regions2.keys())
        for k in chrs1:
            #print(f"chromosome {k}")
            if not chrs2.count(k):
                # no such chromosome in peaks1, then don't touch the peaks in this chromosome
                ret_regions[ k ] = regions1[ k ]
                self.total += len( ret_regions[ k ] )
                continue
            ret_regions[ k ] = []
            n_rl1 = len( regions1[k] )
            n_rl2 = len( regions2[k] )
            rl1_k = iter( regions1[k] ).__next__
            rl2_k = iter( regions2[k] ).__next__
            overlap_found = False
            r1 = rl1_k()
            n_rl1 -= 1
            r2 = rl2_k()
            n_rl2 -= 1
            while ( True ):
                # we do this until there is no r1 or r2 left.
                if r2[0] < r1[1] and r1[0] < r2[1]:
                    # since we found an overlap, r1 will be skipped/excluded
                    # and move to the next r1
                    overlap_found = True
                    n_rl1 -= 1
                    if n_rl1 >= 0:
                        r1 = rl1_k()
                        overlap_found = False
                        continue
                    else:
                        break
                if r1[1] < r2[1]:
                    # in this case, we need to move to the next r1,
                    # we will check if overlap_found is true, if not, we put r1 in a new dict
                    if not overlap_found:
                        ret_regions[ k ].append( r1 )
                    n_rl1 -= 1
                    if n_rl1 >= 0:
                        r1 = rl1_k()
                        overlap_found = False
                    else:
                        # no more r1 left
                        break
                else:
                    # in this case, we need to move the next r2
                    if n_rl2:
                        r2 = rl2_k()
                        n_rl2 -= 1
                    else:
                        # no more r2 left
                        break
            if n_rl1 >= 0:
                ret_regions[ k ].extend( regions1[ k ][-n_rl1-1:] )
            self.total += len( ret_regions[ k ] )

        self.regions = ret_regions
        self.__sorted = False
        self.sort()
        return