1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429
|
# cython: language_level=3
# cython: profile=True
# Time-stamp: <2024-02-12 15:26:24 Tao Liu>
"""Module for Region classe.
This code is free software; you can redistribute it and/or modify it
under the terms of the BSD License (see the file LICENSE included with
the distribution).
"""
# ------------------------------------
# python modules
# ------------------------------------
#import random
#import re
#import sys
# ------------------------------------
# MACS3 modules
# ------------------------------------
from MACS3.Utilities.Constants import *
# ------------------------------------
# Other modules
# ------------------------------------
from cpython cimport bool
# ------------------------------------
# constants
# ------------------------------------
__version__ = "Region $Revision$"
__author__ = "Tao Liu <vladimir.liu@gmail.com>"
__doc__ = "Region class"
# ------------------------------------
# Misc functions
# ------------------------------------
# ------------------------------------
# Classes
# ------------------------------------
cdef class Regions:
"""For plain region of chrom, start and end
"""
cdef:
public dict regions
public int total
bool __sorted
bool __merged
def __init__ (self):
self.regions= {}
self.__sorted = False
self.__merged = False
self.total = 0
def __getitem__ (self, chrom):
return self.regions[ chrom ]
cpdef pop ( self, int n ):
# when called, pop the first n regions in Regions class. Self will be modified.
cdef:
list clist # for chromosomes
int tmp_l
bytes chrom
int n_taken # remember the number of regions prepared
object ret # returned Regions
if self.total == 0:
raise Exception("None left")
clist = sorted(list(self.regions.keys()))
n_taken = n
ret = Regions()
for chrom in sorted(clist):
ret.regions[chrom] = self.regions[chrom][:n_taken]
self.regions[chrom] = self.regions[chrom][n_taken:]
if not self.regions[chrom]:
# remove this chromosome if there is none left
self.regions.pop( chrom )
tmp_l = len( ret.regions[chrom] )
ret.total += tmp_l
# calculate remained
self.total -= tmp_l
#print( ret.total, self.total )
n_taken -= tmp_l
if not n_taken:
# when there is no need, quit loop
break
return ret
cpdef init_from_PeakIO ( self, object peaks ):
"""Initialize the object with a PeakIO object.
Note: I intentionally forgot to check if peakio is actually a
PeakIO...
"""
cdef:
bytes chrom
list ps
object p
int i
peaks.sort()
self.total = 0
for chrom in sorted(peaks.get_chr_names()):
ps = peaks.get_data_from_chrom( chrom )
self.regions[chrom] = []
for i in range( len( ps ) ):
p = ps[ i ]
self.regions[chrom].append( ( p['start'], p['end'] ) )
self.total += 1
self.sort()
cpdef add_loc ( self, bytes chrom, int start, int end ):
if self.regions.has_key(chrom):
self.regions[chrom].append( (start,end) )
else:
self.regions[chrom] = [ (start,end), ]
self.total += 1
self.__sorted = False
self.__merged = False
return
cpdef sort (self):
cdef:
bytes chrom
if self.__sorted:
return
for chrom in sorted(self.regions.keys()):
self.regions[chrom].sort()
self.__sorted = True
cpdef long total_length ( self ):
""" Return the total length of the Regions object.
"""
cdef:
bytes chrom
list ps
int i
int tl, s, e
self.merge_overlap()
tl = 0
for chrom in sorted(self.regions.keys()):
ps = self.regions[chrom]
for i in range( len(ps) ):
s, e = ps[i]
tl += e - s
return tl
cpdef set get_chr_names (self):
return set( sorted(self.regions.keys()) )
cpdef expand ( self, int flanking ):
""" Expand regions to both directions with 'flanking' bps.
"""
cdef:
bytes chrom
list ps
int i
self.sort()
for chrom in sorted(self.regions.keys()):
ps = self.regions[ chrom ]
for i in range( len( ps ) ):
ps[i] = ( max(0, ps[i][0] - flanking), ps[i][1] + flanking )
ps.sort()
self.regions[chrom] = ps
self.__merged = False
cpdef merge_overlap ( self ):
"""
Merge overlapping regions of itself.
"""
cdef:
bytes chrom
int s_new_region, e_new_region, i, j
dict regions, new_regions
list chrs
list regions_chr
tuple prev_region
if self.__merged:
return
self.sort()
regions = self.regions
new_regions = {}
chrs = list(regions.keys())
chrs.sort()
self.total = 0
for i in range(len(chrs)):
chrom = chrs[i]
new_regions[chrom]=[]
n_append = new_regions[chrom].append
prev_region = ()
regions_chr = regions[chrom]
for i in range( len(regions_chr) ):
if not prev_region:
prev_region = regions_chr[i]
continue
else:
if regions_chr[i][0] <= prev_region[1]:
s_new_region = prev_region[0]
e_new_region = regions_chr[i][1]
prev_region = (s_new_region,e_new_region)
else:
n_append(prev_region)
prev_region = regions_chr[i]
if prev_region:
n_append(prev_region)
self.total += len( new_regions[chrom] )
self.regions = new_regions
self.sort()
self.__merged = True
return True
cpdef write_to_bed (self, fhd ):
cdef:
int i
bytes chrom
tuple region
chrs = list(self.regions.keys())
chrs.sort()
for i in range( len(chrs) ):
chrom = chrs[i]
for region in self.regions[chrom]:
fhd.write( "%s\t%d\t%d\n" % (chrom.decode(),region[0],region[1] ) )
def __str__ ( self ):
cdef:
int i
bytes chrom
tuple region
str ret
ret = ""
chrs = list(self.regions.keys())
chrs.sort()
for i in range( len(chrs) ):
chrom = chrs[i]
for region in self.regions[chrom]:
ret += "%s\t%d\t%d\n" % (chrom.decode(),region[0],region[1] )
return ret
# cpdef object randomly_pick ( self, int n, int seed = 12345 ):
# """Shuffle the regions and get n regions out of it. Return a
# new Regions object.
# """
# cdef:
# list all_pc
# list chrs
# bytes chrom
# object ret_peakio, p
# assert n > 0
# chrs = sorted(list(self.peaks.keys()))
# all_pc = []
# for chrom in chrs:
# all_pc.extend(self.peaks[chrom])
# random.seed( seed )
# all_pc = random.shuffle( all_pc )[:n]
# ret_peakio = PeakIO()
# for p in all_pc:
# ret_peakio.add_PeakContent ( p["chrom"], p )
# return ret_peakio
cpdef object intersect (self, object regions_object2):
""" Get the only intersecting regions comparing with
regions_object2, another Regions object. Then return a new
Regions object.
"""
cdef:
object ret_regions_object
dict regions1, regions2
list chrs1, chrs2, four_coords
bytes k
dict ret_regions
bool overlap_found
tuple r1, r2
long n_rl1, n_rl2
self.sort()
regions1 = self.regions
self.total = 0
assert isinstance(regions_object2, Regions)
regions_object2.sort()
regions2 = regions_object2.regions
ret_regions_object = Regions()
ret_regions = dict()
chrs1 = list(regions1.keys())
chrs2 = list(regions2.keys())
for k in chrs1:
#print(f"chromosome {k}")
if not chrs2.count(k):
# no such chromosome in peaks1, then don't touch the peaks in this chromosome
ret_regions[ k ] = regions1[ k ]
self.total += len( ret_regions[ k ] )
continue
ret_regions[ k ] = []
n_rl1 = len( regions1[k] ) # number of remaining elements in regions1[k]
n_rl2 = len( regions2[k] ) # number of remaining elements in regions2[k]
rl1_k = iter( regions1[k] ).__next__
rl2_k = iter( regions2[k] ).__next__
r1 = rl1_k()
n_rl1 -= 1
r2 = rl2_k()
n_rl2 -= 1
while ( True ):
# we do this until there is no r1 or r2 left.
if r2[0] < r1[1] and r1[0] < r2[1]:
# We found an overlap, now get the intersecting
# region.
four_coords = sorted([r1[0], r1[1], r2[0], r2[1]])
ret_regions[ k ].append( (four_coords[1], four_coords[2]) )
if r1[1] < r2[1]:
# in this case, we need to move to the next r1,
if n_rl1:
r1 = rl1_k()
n_rl1 -= 1
else:
# no more r1 left
break
else:
# in this case, we need to move the next r2
if n_rl2:
r2 = rl2_k()
n_rl2 -= 1
else:
# no more r2 left
break
self.total += len( ret_regions[ k ] )
ret_regions_object.regions = ret_regions
ret_regions_object.sort()
return ret_regions_object
cpdef void exclude (self, object regions_object2):
""" Remove overlapping regions in regions_object2, another Regions
object.
"""
cdef:
dict regions1, regions2
list chrs1, chrs2
bytes k
dict ret_regions
bool overlap_found
tuple r1, r2
long n_rl1, n_rl2
self.sort()
regions1 = self.regions
self.total = 0
assert isinstance(regions_object2,Regions)
regions_object2.sort()
regions2 = regions_object2.regions
ret_regions = dict()
chrs1 = list(regions1.keys())
chrs2 = list(regions2.keys())
for k in chrs1:
#print(f"chromosome {k}")
if not chrs2.count(k):
# no such chromosome in peaks1, then don't touch the peaks in this chromosome
ret_regions[ k ] = regions1[ k ]
self.total += len( ret_regions[ k ] )
continue
ret_regions[ k ] = []
n_rl1 = len( regions1[k] )
n_rl2 = len( regions2[k] )
rl1_k = iter( regions1[k] ).__next__
rl2_k = iter( regions2[k] ).__next__
overlap_found = False
r1 = rl1_k()
n_rl1 -= 1
r2 = rl2_k()
n_rl2 -= 1
while ( True ):
# we do this until there is no r1 or r2 left.
if r2[0] < r1[1] and r1[0] < r2[1]:
# since we found an overlap, r1 will be skipped/excluded
# and move to the next r1
overlap_found = True
n_rl1 -= 1
if n_rl1 >= 0:
r1 = rl1_k()
overlap_found = False
continue
else:
break
if r1[1] < r2[1]:
# in this case, we need to move to the next r1,
# we will check if overlap_found is true, if not, we put r1 in a new dict
if not overlap_found:
ret_regions[ k ].append( r1 )
n_rl1 -= 1
if n_rl1 >= 0:
r1 = rl1_k()
overlap_found = False
else:
# no more r1 left
break
else:
# in this case, we need to move the next r2
if n_rl2:
r2 = rl2_k()
n_rl2 -= 1
else:
# no more r2 left
break
if n_rl1 >= 0:
ret_regions[ k ].extend( regions1[ k ][-n_rl1-1:] )
self.total += len( ret_regions[ k ] )
self.regions = ret_regions
self.__sorted = False
self.sort()
return
|