File: CallPeakUnit.pyx

package info (click to toggle)
macs 3.0.2-3
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 378,728 kB
  • sloc: ansic: 5,879; python: 4,342; sh: 451; makefile: 83
file content (1781 lines) | stat: -rw-r--r-- 76,301 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
# cython: language_level=3
# cython: profile=True
# cython: linetrace=True
# Time-stamp: <2022-09-15 17:06:17 Tao Liu>

"""Module for Calculate Scores.

This code is free software; you can redistribute it and/or modify it
under the terms of the BSD License (see the file LICENSE included with
the distribution).
"""

# ------------------------------------
# python modules
# ------------------------------------

from collections import Counter
from copy import copy
from time import time as ttime
import _pickle as cPickle
from tempfile import mkstemp
import os

import logging
import MACS3.Utilities.Logger

logger = logging.getLogger(__name__)
debug   = logger.debug
info    = logger.info
# ------------------------------------
# Other modules
# ------------------------------------
import numpy as np
cimport numpy as np
from numpy cimport uint8_t, uint16_t, uint32_t, uint64_t, int8_t, int16_t, int32_t, int64_t, float32_t, float64_t
from cpython cimport bool
from cykhash import PyObjectMap, Float32to32Map

# ------------------------------------
# C lib
# ------------------------------------
from libc.stdio cimport *
from libc.math cimport exp,log,log10, M_LN10, log1p, erf, sqrt, floor, ceil

# ------------------------------------
# MACS3 modules
# ------------------------------------
from MACS3.Signal.SignalProcessing import maxima, enforce_valleys, enforce_peakyness
from MACS3.IO.PeakIO import PeakIO, BroadPeakIO, parse_peakname
from MACS3.Signal.FixWidthTrack import FWTrack
from MACS3.Signal.PairedEndTrack import PETrackI
from MACS3.Signal.Prob import poisson_cdf
# --------------------------------------------
# cached pscore function and LR_asym functions
# --------------------------------------------
pscore_dict = PyObjectMap()
logLR_dict = PyObjectMap()

cdef float32_t get_pscore ( tuple t ):
    """t: tuple of ( observation, lambda )

    Return the cached -log10 Poisson p-value of seeing t[0] counts
    given local background rate t[1].

    NOTE(review): the original docstring said "( lambda, observation )",
    but the only visible caller passes ( treat_count, ctrl_lambda ) and
    the value is forwarded as poisson_cdf( t[0], t[1], ... ) -- confirm
    argument order against MACS3.Signal.Prob.poisson_cdf.
    """
    cdef:
        float32_t val
    if t in pscore_dict:
        return pscore_dict[ t ]
    else:
        # calculate and cache in the module-level PyObjectMap
        val = -1.0 * poisson_cdf ( t[0], t[1], False, True )
        pscore_dict[ t ] = val
        return val

cdef float32_t get_logLR_asym ( tuple t ):
    """Calculate log10 Likelihood between H1 ( enriched ) and H0 (
    chromatin bias ). Set minus sign for depletion.
    """
    cdef:
        float32_t val
        float32_t x
        float32_t y
    # serve previously computed values from the module-level cache
    if t in logLR_dict:
        return logLR_dict[ t ]
    x = t[0]
    y = t[1]
    if x == y:
        # equal signals: no evidence either way
        val = 0
    elif x > y:
        # enrichment: positive log-likelihood ratio
        val = (x*(log10(x)-log10(y))+y-x)
    else:
        # depletion: mirrored formula, negative sign
        val = (x*(-log10(x)+log10(y))-y+x)
    # remember the result keyed by the (x, y) pair
    logLR_dict[ t ] = val
    return val

# ------------------------------------
# constants
# ------------------------------------
__version__ = "CallPeakUnit $Revision$"
__author__ = "Tao Liu <vladimir.liu@gmail.com>"
__doc__ = "CallPeakUnit"

LOG10_E = 0.43429448190325176

# ------------------------------------
# Misc functions
# ------------------------------------

cdef void clean_up_ndarray ( np.ndarray x ):
    # Release the memory held by a numpy ndarray in place.
    # Shrinks in two steps -- first to at most 100,000 elements, then to
    # zero -- presumably to avoid one large transient reallocation;
    # refcheck=False skips the reference-count safety check. TODO confirm
    # the two-step rationale.
    cdef:
        int64_t i
    i = x.shape[0] // 2
    x.resize( 100000 if i > 100000 else i, refcheck=False)
    x.resize( 0, refcheck=False)
    return

cdef inline float32_t chi2_k1_cdf ( float32_t x ):
    # CDF of the chi-square distribution with k=1 degree of freedom.
    return erf( sqrt(x/2) )

cdef inline float32_t log10_chi2_k1_cdf ( float32_t x ):
  # log10 of the chi-square k=1 CDF.
  return log10( erf( sqrt(x/2) ) )

cdef inline float32_t chi2_k2_cdf ( float32_t x ):
  # CDF of the chi-square distribution with k=2 degrees of freedom.
  return 1 - exp( -x/2 )

cdef inline float32_t log10_chi2_k2_cdf ( float32_t x ):
  # log10 of the chi-square k=2 CDF; log1p keeps precision when the
  # CDF value is close to 0, and LOG10_E converts ln to log10.
  return log1p( - exp( -x/2 ) ) * LOG10_E

cdef inline float32_t chi2_k4_cdf ( float32_t x ):
  # CDF of the chi-square distribution with k=4 degrees of freedom.
  return 1 - exp( -x/2 ) * ( 1 + x/2 )

cdef inline float32_t log10_chi2_k4_CDF ( float32_t x ):
  # log10 of the chi-square k=4 CDF.
  # NOTE(review): name uses upper-case "CDF" unlike the sibling helpers;
  # renaming would break any external caller, so left as is.
  return log1p( - exp( -x/2 ) * ( 1 + x/2 ) ) * LOG10_E

cdef inline np.ndarray apply_multiple_cutoffs ( list multiple_score_arrays, list multiple_cutoffs ):
    # Return a boolean ndarray marking positions where AT LEAST ONE
    # score track exceeds its corresponding cutoff. Assumes both lists
    # are non-empty and of equal length.
    cdef:
        int32_t i
        np.ndarray ret

    ret = multiple_score_arrays[0] > multiple_cutoffs[0]

    for i in range(1,len(multiple_score_arrays)):
        # '+' on numpy boolean arrays is element-wise logical OR
        ret += multiple_score_arrays[i] > multiple_cutoffs[i]

    return ret

cdef inline list get_from_multiple_scores ( list multiple_score_arrays, int32_t index ):
    # Collect the value at position `index` from every score array.
    cdef:
        list ret = []

    for score_array in multiple_score_arrays:
        ret.append( score_array[ index ] )
    return ret


cdef inline float32_t get_logFE ( float32_t x, float32_t y ):
    """ return log10 fold enrichment of x over y.

    NOTE(review): the original docstring claimed "100* log10 ... with +1
    pseudocount", but the code neither scales by 100 nor adds a
    pseudocount -- callers presumably apply the pseudocount before
    calling; confirm against the score-computation call sites.
    """
    return log10( x/y )

cdef inline float32_t get_subtraction ( float32_t x, float32_t y):
    """ return subtraction, i.e. x minus y.
    """
    return x - y

cdef inline list getitem_then_subtract ( list peakset, int32_t start ):
    # Extract each peak's "start" coordinate, shifted left by `start`.
    cdef:
        list a

    a = [ p["start"] - start for p in peakset ]
    return a

cdef inline int32_t left_sum ( data, int32_t pos, int32_t width ):
    """Sum the values of `data` (a position -> count mapping) over the
    closed window [pos - width, pos].
    """
    return sum([data[x] for x in data if x <= pos and x >= pos - width])

cdef inline int32_t right_sum ( data, int32_t pos, int32_t width ):
    """Sum the values of `data` (a position -> count mapping) over the
    closed window [pos, pos + width].
    """
    return sum([data[x] for x in data if x >= pos and x <= pos + width])

cdef inline int32_t left_forward ( data, int32_t pos, int32_t window_size ):
    # Difference between the count at pos and at pos - window_size;
    # missing positions count as 0.
    return data.get(pos,0) - data.get(pos-window_size, 0)

cdef inline int32_t right_forward ( data, int32_t pos, int32_t window_size ):
    # Difference between the count at pos + window_size and at pos;
    # missing positions count as 0.
    return data.get(pos + window_size, 0) - data.get(pos, 0)

cdef float32_t median_from_value_length ( np.ndarray[np.float32_t, ndim=1] value, list length ):
    """Length-weighted median: sort (value, length) pairs by value and
    return the first value whose cumulative length exceeds half of the
    total length.

    NOTE(review): if `length` sums to 0 (or is empty) the loop never
    returns and the C-level return value is undefined -- callers
    presumably never pass empty regions; confirm.
    """
    cdef:
        list tmp
        int32_t c, tmp_l
        float32_t tmp_v, mid_l

    c = 0
    tmp = sorted(list(zip( value, length )))
    mid_l = sum( length )/2
    for (tmp_v, tmp_l) in tmp:
        c += tmp_l
        if c > mid_l:
            return tmp_v

cdef float32_t mean_from_value_length ( np.ndarray[np.float32_t, ndim=1] value, list length ):
    """take list of values and list of corresponding lengths, calculate the mean.
    An important function for bedGraph type of data.

    NOTE(review): divides by the total length; a zero total would divide
    by zero -- callers presumably guarantee at least one region.
    """
    cdef:
        int32_t i
        int32_t tmp_l, l
        float64_t tmp_v, sum_v, tmp_sum   #try to solve precision issue
        float32_t ret

    sum_v = 0
    l = 0

    # accumulate in float64 to limit rounding error from summing many
    # float32 values
    for i in range( len(length) ):
        tmp_l = length[ i ]
        tmp_v = <float64_t>value[ i ]
        tmp_sum = tmp_v * tmp_l
        sum_v = tmp_sum + sum_v
        l += tmp_l

    ret = <float32_t>(sum_v/l)

    return ret


cdef tuple find_optimal_cutoff( list x, list y ):
    """Return the best cutoff x and y.

    We assume that total peak length increase exponentially while
    decreasing cutoff value. But while cutoff decreases to a point
    that background noises are captured, total length increases much
    faster. So we fit a linear model by taking the first 10 points,
    then look for the largest cutoff that

    NOTE(review): this looks like an unfinished implementation -- the
    fitted slope/intercept and R-squared computed in the loop below are
    discarded and the function unconditionally returns (1.0, 1.0).
    Kept as-is because callers may depend on the constant return.
    """
    cdef:
        np.ndarray npx, npy, npA
        float32_t optimal_x, optimal_y
        int64_t l, i
        float32_t m, c # slop and intercept
        float32_t sst # sum of squared total
        float32_t sse # sum of squared error
        float32_t rsq # R-squared

    l = len(x)
    assert l == len(y)
    npx = np.array( x )
    npy = np.log10( np.array( y ) )
    npA = np.vstack( [npx, np.ones(len(npx))] ).T

    for i in range( 10, l ):
        # at least the largest 10 points
        m, c = np.linalg.lstsq( npA[:i], npy[:i], rcond=None )[ 0 ]
        sst = sum( ( npy[:i] - np.mean( npy[:i] ) ) ** 2 )
        sse = sum( ( npy[:i] - m*npx[:i] - c ) ** 2 )
        rsq = 1 - sse/sst
        #print i, x[i], y[i], m, c, rsq
    return ( 1.0, 1.0 )



# ------------------------------------
# Classes
# ------------------------------------
cdef class CallerFromAlignments:
    """A unit to calculate scores and call peaks from alignments --
    FWTrack or PETrack objects.

    It will compute for each chromosome separately in order to save
    memory usage.
    """
    cdef:
        object treat            # FWTrack or PETrackI object for ChIP
        object ctrl             # FWTrack or PETrackI object for Control

        int32_t  d                           # extension size for ChIP
        list ctrl_d_s                    # extension sizes for Control. Can be multiple values
        float32_t treat_scaling_factor       # scaling factor for ChIP
        list ctrl_scaling_factor_s       # scaling factor for Control, corresponding to each extension size.
        float32_t lambda_bg                  # minimum local bias to fill missing values
        list chromosomes                 # name of common chromosomes in ChIP and Control data
        float64_t pseudocount                # the pseudocount used to calcuate logLR, FE or logFE
        bytes bedGraph_filename_prefix     # prefix will be added to _pileup.bdg for treatment and _lambda.bdg for control

        int32_t  end_shift                   # shift of cutting ends before extension
        bool trackline                   # whether trackline should be saved in bedGraph
        bool save_bedGraph               # whether to save pileup and local bias in bedGraph files
        bool save_SPMR                   # whether to save pileup normalized by sequencing depth in million reads
        bool no_lambda_flag              # whether ignore local bias, and to use global bias instead
        bool PE_mode                     # whether it's in PE mode, will be detected during initiation

        # temporary data buffer
        list chr_pos_treat_ctrl          # temporary [position, treat_pileup, ctrl_pileup] for a given chromosome
        bytes bedGraph_treat_filename
        bytes bedGraph_control_filename
        FILE * bedGraph_treat_f
        FILE * bedGraph_ctrl_f

        # data needed to be pre-computed before peak calling
        object pqtable          # remember pvalue->qvalue convertion; saved in cykhash Float32to32Map
        bool pvalue_all_done             # whether the pvalue of whole genome is all calculated. If yes, it's OK to calculate q-value.

        dict pvalue_npeaks               # record for each pvalue cutoff, how many peaks can be called
        dict pvalue_length               # record for each pvalue cutoff, the total length of called peaks
        float32_t optimal_p_cutoff           # automatically decide the p-value cutoff ( can be translated into qvalue cutoff ) based
                                         # on p-value to total peak length analysis.
        bytes cutoff_analysis_filename     # file to save the pvalue-npeaks-totallength table

        dict pileup_data_files           # Record the names of temporary files for storing pileup values of each chromosome


    def __init__ (self, treat, ctrl,
                  int32_t d = 200, list ctrl_d_s = [200, 1000, 10000],
                  float32_t treat_scaling_factor = 1.0, list ctrl_scaling_factor_s = [1.0, 0.2, 0.02],
                  bool stderr_on = False,
                  float32_t pseudocount = 1,
                  int32_t end_shift = 0,
                  float32_t lambda_bg = 0,
                  bool save_bedGraph = False,
                  str  bedGraph_filename_prefix = "PREFIX",
                  str bedGraph_treat_filename = "TREAT.bdg",
                  str bedGraph_control_filename = "CTRL.bdg",
                  str cutoff_analysis_filename = "TMP.txt",
                  bool save_SPMR = False ):
        """Initialize.

        A calculator is unique to each comparison of treat and
        control. Treat_depth and ctrl_depth should not be changed
        during calculation.

        treat and ctrl are either FWTrack or PETrackI objects.

        treat_depth and ctrl_depth are effective depth in million:
                                    sequencing depth in million after
                                    duplicates being filtered. If
                                    treatment is scaled down to
                                    control sample size, then this
                                    should be control sample size in
                                    million. And vice versa.

        d, sregion, lregion: d is the fragment size, sregion is the
                             small region size, lregion is the large
                             region size

        pseudocount: a pseudocount used to calculate logLR, FE or
                     logFE. Please note this value will not be changed
                     with normalization method. So if you really want
                     to set pseudocount 1 per million reads, set it
                     after you normalize treat and control by million
                     reads by `change_normalizetion_method(ord('M'))`.

        NOTE(review): ctrl_d_s and ctrl_scaling_factor_s use mutable
        list defaults which are stored directly on self -- all
        instances created with the defaults share the same list
        objects; mutating them in place would affect every instance.
        """
        cdef:
            set chr1, chr2
            int32_t i
            char * tmp
            bytes tmp_bytes
            float32_t p
        # decide PE mode from the type of the treatment track
        if isinstance(treat, FWTrack):
            self.PE_mode = False
        elif isinstance(treat, PETrackI):
            self.PE_mode = True
        else:
            raise Exception("Should be FWTrack or PETrackI object!")
        # decide if there is control; without control, the treatment
        # track serves as its own background
        self.treat = treat
        if ctrl:
            self.ctrl = ctrl
        else:                   # while there is no control
            self.ctrl = treat
        self.trackline = False
        self.d = d              # note, self.d doesn't make sense in PE mode
        self.ctrl_d_s = ctrl_d_s# extension sizes for control pileup; used in both SE and PE mode below
        self.treat_scaling_factor = treat_scaling_factor
        self.ctrl_scaling_factor_s= ctrl_scaling_factor_s
        self.end_shift = end_shift
        self.lambda_bg = lambda_bg
        self.pqtable = Float32to32Map( for_int = False ) # Float32 -> Float32 map
        self.save_bedGraph = save_bedGraph
        self.save_SPMR = save_SPMR
        # filenames are kept as bytes internally
        self.bedGraph_filename_prefix = bedGraph_filename_prefix.encode()
        self.bedGraph_treat_filename = bedGraph_treat_filename.encode()
        self.bedGraph_control_filename = bedGraph_control_filename.encode()
        # an empty extension-size or scaling-factor list disables the
        # local lambda and falls back to the global background
        if not self.ctrl_d_s or not self.ctrl_scaling_factor_s:
            self.no_lambda_flag = True
        else:
            self.no_lambda_flag = False
        self.pseudocount = pseudocount
        # get the common chromosome names from both treatment and control
        chr1 = set(self.treat.get_chr_names())
        chr2 = set(self.ctrl.get_chr_names())
        self.chromosomes = sorted(list(chr1.intersection(chr2)))

        self.pileup_data_files = {}
        self.pvalue_length = {}
        self.pvalue_npeaks = {}
        # NOTE(review): keys are float32 values from np.arange; lookups
        # elsewhere must reproduce the exact same floats to hit them.
        for p in np.arange( 0.3, 10, 0.3 ): # step for optimal cutoff is 0.3 in -log10pvalue, we try from pvalue 1E-10 (-10logp=10) to 0.5 (-10logp=0.3)
            self.pvalue_length[ p ] = 0
            self.pvalue_npeaks[ p ] = 0
        self.optimal_p_cutoff = 0
        self.cutoff_analysis_filename = cutoff_analysis_filename.encode()

    cpdef destroy ( self ):
        """Remove temporary files for pileup values of each chromosome.

        Note: This function MUST be called if the class object won't
        be used anymore.

        """
        cdef:
            bytes f

        # best-effort cleanup: skip entries whose file no longer exists
        for f in self.pileup_data_files.values():
            if not os.path.isfile( f ):
                continue
            os.unlink( f )
        return

    cpdef set_pseudocount( self, float32_t pseudocount ):
        # Override the pseudocount used when computing logLR/FE/logFE scores.
        self.pseudocount = pseudocount

    cpdef enable_trackline( self ):
        """Turn on writing a trackline header in the bedGraph output.
        """
        self.trackline = True

    cdef __pileup_treat_ctrl_a_chromosome ( self, bytes chrom ):
        """After this function is called, self.chr_pos_treat_ctrl will
        be reset and assigned to the pileup values of the given
        chromosome.

        Pileup results are cached per chromosome in a temporary pickle
        file so later passes can reload instead of recomputing.

        Fixes vs. original: the cache-reload failure path stored the
        temp filename as `str` while every other path stores `bytes`
        (and `destroy` declares the values as `bytes`); both paths now
        store bytes. Bare `except:` clauses narrowed to
        `except Exception:` so KeyboardInterrupt/SystemExit propagate.
        """
        cdef:
            list treat_pv, ctrl_pv
            int64_t i
            float32_t t
            object f
            str temp_filename

        assert chrom in self.chromosomes, "chromosome %s is not valid." % chrom

        # check backup file of pileup values. If not exists, create
        # it. Otherwise, load them instead of calculating new pileup
        # values.
        if chrom in self.pileup_data_files:
            try:
                f = open( self.pileup_data_files[ chrom ],"rb" )
                self.chr_pos_treat_ctrl = cPickle.load( f )
                f.close()
                return
            except Exception:
                # cached file missing/corrupt: allocate a fresh temp
                # file and fall through to recompute below
                temp_fd, temp_filename = mkstemp()
                os.close(temp_fd)
                # store as bytes for consistency with the other branch
                # and with `destroy`, which declares the values `bytes`
                self.pileup_data_files[ chrom ] = temp_filename.encode()
        else:
            temp_fd, temp_filename = mkstemp()
            os.close(temp_fd)
            self.pileup_data_files[ chrom ] = temp_filename.encode()

        # reset or clean existing self.chr_pos_treat_ctrl
        if self.chr_pos_treat_ctrl:     # not a beautiful way to clean
            clean_up_ndarray( self.chr_pos_treat_ctrl[0] )
            clean_up_ndarray( self.chr_pos_treat_ctrl[1] )
            clean_up_ndarray( self.chr_pos_treat_ctrl[2] )

        if self.PE_mode:
            # PE fragments carry their own length; no extension needed
            treat_pv = self.treat.pileup_a_chromosome ( chrom, [self.treat_scaling_factor,], baseline_value = 0.0 )
        else:
            treat_pv = self.treat.pileup_a_chromosome( chrom, [self.d,], [self.treat_scaling_factor,], baseline_value = 0.0,
                                                       directional = True,
                                                       end_shift = self.end_shift )

        if not self.no_lambda_flag:
            if self.PE_mode:
                # note, we pileup up PE control as SE control because
                # we assume the bias only can be captured at the
                # surrounding regions of cutting sites from control experiments.
                ctrl_pv = self.ctrl.pileup_a_chromosome_c( chrom, self.ctrl_d_s, self.ctrl_scaling_factor_s, baseline_value = self.lambda_bg )
            else:
                ctrl_pv = self.ctrl.pileup_a_chromosome( chrom, self.ctrl_d_s, self.ctrl_scaling_factor_s,
                                                         baseline_value = self.lambda_bg,
                                                         directional = False )
        else:
            ctrl_pv = [treat_pv[0][-1:], np.array([self.lambda_bg,], dtype="float32")] # set a global lambda

        self.chr_pos_treat_ctrl = self.__chrom_pair_treat_ctrl( treat_pv, ctrl_pv)

        # clean treat_pv and ctrl_pv
        treat_pv = []
        ctrl_pv  = []

        # save data to temporary file
        try:
            f = open(self.pileup_data_files[ chrom ],"wb")
            cPickle.dump( self.chr_pos_treat_ctrl, f , protocol=2 )
            f.close()
        except Exception:
            # fail to write then remove the key in pileup_data_files,
            # so a later call recomputes instead of loading a broken file
            self.pileup_data_files.pop(chrom)
        return

    cdef list __chrom_pair_treat_ctrl ( self, treat_pv, ctrl_pv ):
        """*private* Pair treat and ctrl pileup for each region.

        treat_pv and ctrl_pv are [np.ndarray, np.ndarray].

        return [p, t, c] list, each element is a numpy array.

        Merge-joins the two position-sorted interval lists with raw C
        pointers over the ndarray buffers: at each step emit the
        smaller end position together with the current treat and ctrl
        values, then advance whichever input(s) that position came from.
        """
        cdef:
            list ret
            int64_t pre_p, index_ret, it, ic, lt, lc
            np.ndarray[np.int32_t, ndim=1] t_p, c_p, ret_p
            np.ndarray[np.float32_t, ndim=1] t_v, c_v, ret_t, ret_c

            int32_t * t_p_ptr
            int32_t * c_p_ptr
            int32_t * ret_p_ptr

            float32_t * t_v_ptr
            float32_t * c_v_ptr
            float32_t * ret_t_ptr
            float32_t * ret_c_ptr

        [ t_p, t_v ] = treat_pv
        [ c_p, c_v ] = ctrl_pv

        lt = t_p.shape[0]
        lc = c_p.shape[0]

        # upper bound on the number of merged intervals; trimmed at the end
        chrom_max_len = lt + lc

        ret_p = np.zeros( chrom_max_len, dtype="int32" ) # position
        ret_t = np.zeros( chrom_max_len, dtype="float32" ) # value from treatment
        ret_c = np.zeros( chrom_max_len, dtype="float32" ) # value from control

        t_p_ptr = <int32_t *> t_p.data
        t_v_ptr = <float32_t *> t_v.data
        c_p_ptr = <int32_t *> c_p.data
        c_v_ptr = <float32_t *> c_v.data
        ret_p_ptr = <int32_t *> ret_p.data
        ret_t_ptr = <float32_t *>ret_t.data
        ret_c_ptr = <float32_t *>ret_c.data

        pre_p = 0
        index_ret = 0
        it = 0
        ic = 0

        while it < lt and ic < lc:
            if t_p_ptr[0] < c_p_ptr[0]:
                # clip a region from pre_p to p1, then set pre_p as p1.
                ret_p_ptr[0] = t_p_ptr[0]
                ret_t_ptr[0] = t_v_ptr[0]
                ret_c_ptr[0] = c_v_ptr[0]
                ret_p_ptr += 1
                ret_t_ptr += 1
                ret_c_ptr += 1
                pre_p = t_p_ptr[0]
                index_ret += 1
                # call for the next p1 and v1
                it += 1
                t_p_ptr += 1
                t_v_ptr += 1
            elif t_p_ptr[0] > c_p_ptr[0]:
                # clip a region from pre_p to p2, then set pre_p as p2.
                ret_p_ptr[0] = c_p_ptr[0]
                ret_t_ptr[0] = t_v_ptr[0]
                ret_c_ptr[0] = c_v_ptr[0]
                ret_p_ptr += 1
                ret_t_ptr += 1
                ret_c_ptr += 1
                pre_p = c_p_ptr[0]
                index_ret += 1
                # call for the next p2 and v2
                ic += 1
                c_p_ptr += 1
                c_v_ptr += 1
            else:
                # from pre_p to p1 or p2, then set pre_p as p1 or p2.
                ret_p_ptr[0] = t_p_ptr[0]
                ret_t_ptr[0] = t_v_ptr[0]
                ret_c_ptr[0] = c_v_ptr[0]
                ret_p_ptr += 1
                ret_t_ptr += 1
                ret_c_ptr += 1
                pre_p = t_p_ptr[0]
                index_ret += 1
                # call for the next p1, v1, p2, v2.
                it += 1
                ic += 1
                t_p_ptr += 1
                t_v_ptr += 1
                c_p_ptr += 1
                c_v_ptr += 1

        # trim the over-allocated buffers down to the merged length
        ret_p.resize( index_ret, refcheck=False)
        ret_t.resize( index_ret, refcheck=False)
        ret_c.resize( index_ret, refcheck=False)
        return [ret_p, ret_t, ret_c]

    cdef np.ndarray __cal_score ( self, np.ndarray[np.float32_t, ndim=1] array1, np.ndarray[np.float32_t, ndim=1] array2, cal_func ):
        # Apply cal_func element-wise over the two aligned arrays and
        # return the resulting float32 score array.
        cdef:
            int64_t idx, n
            np.ndarray[np.float32_t, ndim=1] scores
        assert array1.shape[0] == array2.shape[0]
        n = array1.shape[0]
        scores = np.zeros( n, dtype="float32" )
        for idx in range( n ):
            scores[idx] = cal_func( array1[idx], array2[idx] )
        return scores

    cdef void __cal_pvalue_qvalue_table ( self ):
        """After this function is called, self.pqtable is built. All
        chromosomes will be iterated. So it will take some time.

        Pass 1 tallies, for every distinct -log10 p-value across the
        genome, the total number of bases carrying that score. Pass 2
        walks the scores from most to least significant and assigns
        each a Benjamini-Hochberg style -log10 q-value, forced to be
        monotonically non-increasing.
        """
        cdef:
            bytes chrom
            np.ndarray pos_array, treat_array, ctrl_array, score_array
            dict pscore_stat
            int64_t n, pre_p, length, pre_l, l, i, j
            float32_t this_v, pre_v, v, q, pre_q
            int64_t N, k, this_l
            float32_t f
            list unique_values
            int32_t * pos_ptr
            float32_t * treat_value_ptr
            float32_t * ctrl_value_ptr

        debug ( "Start to calculate pvalue stat..." )

        # pass 1: pscore -> total covered length, over all chromosomes
        pscore_stat = {} #dict()
        for i in range( len( self.chromosomes ) ):
            chrom = self.chromosomes[ i ]
            pre_p = 0

            self.__pileup_treat_ctrl_a_chromosome( chrom )
            [pos_array, treat_array, ctrl_array] = self.chr_pos_treat_ctrl

            pos_ptr = <int32_t *> pos_array.data
            treat_value_ptr = <float32_t *> treat_array.data
            ctrl_value_ptr = <float32_t *> ctrl_array.data

            for j in range(pos_array.shape[0]):
                # positions are interval end points; interval length is
                # the gap from the previous end point
                this_v = get_pscore( (<int32_t>(treat_value_ptr[0]), ctrl_value_ptr[0] ) )
                this_l = pos_ptr[0] - pre_p
                if this_v in pscore_stat:
                    pscore_stat[ this_v ] += this_l
                else:
                    pscore_stat[ this_v ] = this_l
                pre_p = pos_ptr[0]
                pos_ptr += 1
                treat_value_ptr += 1
                ctrl_value_ptr += 1

        N = sum(pscore_stat.values()) # total length
        k = 1                         # rank
        f = -log10(N)
        pre_v = -2147483647
        pre_l = 0
        pre_q = 2147483647      # save the previous q-value

        # pass 2: q = p + log10(rank) - log10(N), clamped to stay
        # monotonically non-increasing and non-negative
        self.pqtable = Float32to32Map( for_int = False )
        unique_values = sorted(list(pscore_stat.keys()), reverse=True)
        for i in range(len(unique_values)):
            v = unique_values[i]
            l = pscore_stat[v]
            q = v + (log10(k) + f)
            if q > pre_q:
                q = pre_q
            if q <= 0:
                q = 0
                break
            #q = max(0,min(pre_q,q))           # make q-score monotonic
            self.pqtable[ v ] = q
            pre_q = q
            k += l
        # bottom rank pscores all have qscores 0
        # NOTE(review): if the loop above never breaks, i ends at the
        # last index and this loop overwrites that last entry's q-score
        # with 0 -- confirm this is intended.
        for j in range(i, len(unique_values) ):
            v = unique_values[ j ]
            self.pqtable[ v ] = 0
        return

    cdef void __pre_computes ( self, int32_t max_gap = 50, int32_t min_length = 200 ):
        """Build self.pqtable while also collecting cutoff-analysis
        statistics (self.pvalue_length, self.pvalue_npeaks) and writing
        the cutoff-analysis report to self.cutoff_analysis_filename.

        All chromosomes will be iterated. So it will take some time.
        """
        cdef:
            bytes chrom
            np.ndarray pos_array, treat_array, ctrl_array, score_array
            dict pscore_stat
            int64_t n, pre_p, this_p, length, j, pre_l, l, i
            float32_t q, pre_q, this_t, this_c
            float32_t this_v, pre_v, v, cutoff
            int64_t N, k, this_l
            float32_t f
            list unique_values
            float64_t t0, t1, t

            np.ndarray above_cutoff, above_cutoff_endpos, above_cutoff_startpos
            list peak_content
            int64_t peak_length, total_l, total_p

            list tmplist

            int32_t * acs_ptr   # above cutoff start position pointer
            int32_t * ace_ptr   # above cutoff end position pointer
            int32_t * pos_array_ptr # position array pointer
            float32_t * score_array_ptr # score array pointer

        debug ( "Start to calculate pvalue stat..." )

        # tmplist contains a list of log pvalue cutoffs from 0.3 to 10
        tmplist = [round(x,5) for x in sorted( list(np.arange(0.3, 10.0, 0.3)), reverse = True )]

        pscore_stat = {} #dict()
        #print (list(pscore_stat.keys()))
        #print (list(self.pvalue_length.keys()))
        #print (list(self.pvalue_npeaks.keys()))
        for i in range( len( self.chromosomes ) ):
            chrom = self.chromosomes[ i ]
            self.__pileup_treat_ctrl_a_chromosome( chrom )
            [pos_array, treat_array, ctrl_array] = self.chr_pos_treat_ctrl

            score_array = self.__cal_pscore( treat_array, ctrl_array )

            # for every candidate cutoff, count potential peaks and
            # their total length on this chromosome
            for n in range( len( tmplist ) ):
                cutoff = tmplist[ n ]
                total_l = 0           # total length in potential peak
                total_p = 0           # number of potential peaks

                # get the regions with scores above cutoffs
                above_cutoff = np.nonzero( score_array > cutoff )[0]# this is not an optimized method. It would be better to store score array in a 2-D ndarray?
                above_cutoff_endpos = pos_array[above_cutoff] # end positions of regions where score is above cutoff
                above_cutoff_startpos = pos_array[above_cutoff-1] # start positions of regions where score is above cutoff

                if above_cutoff_endpos.size == 0:
                    continue

                # first bit of region above cutoff
                acs_ptr = <int32_t *> above_cutoff_startpos.data
                ace_ptr = <int32_t *> above_cutoff_endpos.data

                peak_content = [( acs_ptr[ 0 ], ace_ptr[ 0 ] ), ]
                lastp = ace_ptr[ 0 ]
                acs_ptr += 1
                ace_ptr += 1

                # NOTE: `i` is reused here; a Cython/Python range loop
                # keeps its own internal counter, so the chromosome
                # loop above is not disturbed by this reassignment.
                for i in range( 1, above_cutoff_startpos.size ):
                    tl = acs_ptr[ 0 ] - lastp   # gap to previous region
                    if tl <= max_gap:
                        # gap small enough: merge into the current peak
                        peak_content.append( ( acs_ptr[ 0 ], ace_ptr[ 0 ] ) )
                    else:
                        # gap too big: close the current peak, start a new one
                        peak_length = peak_content[ -1 ][ 1 ] - peak_content[ 0 ][ 0 ]
                        if peak_length >= min_length: # if the peak is too small, reject it
                            total_l +=  peak_length
                            total_p += 1
                        peak_content = [ ( acs_ptr[ 0 ], ace_ptr[ 0 ] ), ]
                    lastp = ace_ptr[ 0 ]
                    acs_ptr += 1
                    ace_ptr += 1

                # close the trailing peak, if any
                if peak_content:
                    peak_length = peak_content[ -1 ][ 1 ] - peak_content[ 0 ][ 0 ]
                    if peak_length >= min_length: # if the peak is too small, reject it
                        total_l +=  peak_length
                        total_p += 1
                self.pvalue_length[ cutoff ] = self.pvalue_length.get( cutoff, 0 ) + total_l
                self.pvalue_npeaks[ cutoff ] = self.pvalue_npeaks.get( cutoff, 0 ) + total_p

            # tally, per distinct pscore, the total genome length with
            # that score (input to the q-value ranking below)
            pos_array_ptr = <int32_t *> pos_array.data
            score_array_ptr = <float32_t *> score_array.data

            pre_p = 0
            for i in range(pos_array.shape[0]):
                this_p = pos_array_ptr[ 0 ]
                this_l = this_p - pre_p
                this_v = score_array_ptr[ 0 ]
                if this_v in pscore_stat:
                    pscore_stat[ this_v ] += this_l
                else:
                    pscore_stat[ this_v ] = this_l
                pre_p = this_p #pos_array[ i ]
                pos_array_ptr += 1
                score_array_ptr += 1

        #debug ( "make pscore_stat cost %.5f seconds" % t )

        # add all pvalue cutoffs from cutoff-analysis part. So that we
        # can get the corresponding qvalues for them.
        for cutoff in tmplist:
            if cutoff not in pscore_stat:
                pscore_stat[ cutoff ] = 0

        nhval = 0

        N = sum(pscore_stat.values()) # total length
        k = 1                           # rank
        f = -log10(N)
        pre_v = -2147483647
        pre_l = 0
        pre_q = 2147483647              # save the previous q-value

        # Benjamini-Hochberg style conversion, most significant pscore
        # first: q = pscore + log10(rank) - log10(N), kept monotonically
        # non-increasing and floored at 0.
        self.pqtable = Float32to32Map( for_int = False ) #{}
        unique_values = sorted(list(pscore_stat.keys()), reverse=True) #sorted(unique_values,reverse=True)
        for i in range(len(unique_values)):
            v = unique_values[i]
            l = pscore_stat[v]
            q = v + (log10(k) + f)
            if q > pre_q:
                q = pre_q
            if q <= 0:
                q = 0
                break
            #q = max(0,min(pre_q,q))           # make q-score monotonic
            self.pqtable[ v ] = q
            pre_v = v
            pre_q = q
            k+=l
        # NOTE(review): if the loop above finishes without `break`,
        # this loop starts at the LAST index and overwrites that
        # entry's q-value with 0 -- confirm this is intended.
        for j in range(i, len(unique_values) ):
            v = unique_values[ j ]
            self.pqtable[ v ] = 0

        # write pvalue and total length of predicted peaks
        # this is the output from cutoff-analysis
        fhd = open( self.cutoff_analysis_filename, "w" )
        fhd.write( "pscore\tqscore\tnpeaks\tlpeaks\tavelpeak\n" )
        x = []
        y = []
        for cutoff in tmplist:
            if self.pvalue_npeaks[ cutoff ] > 0:
                fhd.write( "%.2f\t%.2f\t%d\t%d\t%.2f\n" % ( cutoff, self.pqtable[ cutoff ], self.pvalue_npeaks[ cutoff ], self.pvalue_length[ cutoff ], self.pvalue_length[ cutoff ]/self.pvalue_npeaks[ cutoff ] ) )
                x.append( cutoff )
                y.append( self.pvalue_length[ cutoff ] )
        fhd.close()
        info( "#3 Analysis of cutoff vs num of peaks or total length has been saved in %s" % self.cutoff_analysis_filename )
        #info( "#3 Suggest a cutoff..." )
        #optimal_cutoff, optimal_length = find_optimal_cutoff( x, y )
        #info( "#3 -10log10pvalue cutoff %.2f will call approximately %.0f bps regions as significant regions" % ( optimal_cutoff, optimal_length ) )
        #print (list(pqtable.keys()))
        #print (list(self.pvalue_length.keys()))
        #print (list(self.pvalue_npeaks.keys()))
        return

    cpdef call_peaks ( self, list scoring_function_symbols, list score_cutoff_s, int32_t min_length = 200,
                       int32_t max_gap = 50, bool call_summits = False, bool cutoff_analysis = False ):
        """Call peaks for all chromosomes. Return a PeakIO object.

        scoring_function_symbols: symbols of functions to calculate score. 'p' for pscore, 'q' for qscore, 'f' for fold change, 's' for subtraction. for example: ['p', 'q']
        score_cutoff_s    : cutoff values corresponding to scoring functions
        min_length        : minimum length of peak
        max_gap           : maximum gap of 'insignificant' regions within a peak. Note, for PE_mode, max_gap and max_length are both set as fragment length.
        call_summits      : boolean. Whether or not call sub-peaks.
        cutoff_analysis   : boolean. If True, the p/q table pre-computation also runs the cutoff-vs-peaks analysis and writes its report.
        """
        cdef:
            bytes chrom
            bytes tmp_bytes

        peaks = PeakIO()

        # prepare p-q table; computed only once per object
        if len( self.pqtable ) == 0:
            info("#3 Pre-compute pvalue-qvalue table...")
            if cutoff_analysis:
                info("#3 Cutoff vs peaks called will be analyzed!")
                self.__pre_computes( max_gap = max_gap, min_length = min_length )
            else:
                self.__cal_pvalue_qvalue_table()


        # prepare bedGraph file
        if self.save_bedGraph:
            # C-level FILE* handles; closed after all chromosomes below
            self.bedGraph_treat_f = fopen( self.bedGraph_treat_filename, "w" )
            self.bedGraph_ctrl_f = fopen( self.bedGraph_control_filename, "w" )

            info ("#3 In the peak calling step, the following will be performed simultaneously:")
            info ("#3   Write bedGraph files for treatment pileup (after scaling if necessary)... %s" % self.bedGraph_filename_prefix.decode() + "_treat_pileup.bdg")
            info ("#3   Write bedGraph files for control lambda (after scaling if necessary)... %s" % self.bedGraph_filename_prefix.decode() + "_control_lambda.bdg")

            if self.save_SPMR:
                info ( "#3   --SPMR is requested, so pileup will be normalized by sequencing depth in million reads." )
            elif self.treat_scaling_factor == 1:
                info ( "#3   Pileup will be based on sequencing depth in treatment." )
            else:
                info ( "#3   Pileup will be based on sequencing depth in control." )

            if self.trackline:
                # this line is REQUIRED by the wiggle format for UCSC browser
                tmp_bytes = ("track type=bedGraph name=\"treatment pileup\" description=\"treatment pileup after possible scaling for \'%s\'\"\n" % self.bedGraph_filename_prefix).encode()
                fprintf( self.bedGraph_treat_f, tmp_bytes )
                tmp_bytes = ("track type=bedGraph name=\"control lambda\" description=\"control lambda after possible scaling for \'%s\'\"\n" % self.bedGraph_filename_prefix).encode()
                fprintf( self.bedGraph_ctrl_f, tmp_bytes )

        info("#3 Call peaks for each chromosome...")
        for chrom in self.chromosomes:
            # treat/control bedGraph will be saved if requested by user.
            self.__chrom_call_peak_using_certain_criteria ( peaks, chrom, scoring_function_symbols, score_cutoff_s, min_length, max_gap, call_summits, self.save_bedGraph )

        # close bedGraph file
        if self.save_bedGraph:
            fclose(self.bedGraph_treat_f)
            fclose(self.bedGraph_ctrl_f)
            self.save_bedGraph = False  # bedGraph is written only once

        return peaks

    cdef void __chrom_call_peak_using_certain_criteria ( self, peaks, bytes chrom, list scoring_function_s, list score_cutoff_s, int32_t min_length,
                                                   int32_t max_gap, bool call_summits, bool save_bedGraph ):
        """ Call peaks for a chromosome.

        Combination of criteria is allowed here: a region must pass
        ALL cutoffs in score_cutoff_s simultaneously.

        peaks: a PeakIO object, the return value of this function
        scoring_function_s: symbols of functions to calculate score as score=f(x, y) where x is treatment pileup, and y is control pileup ('p' pscore, 'q' qscore, 'f' fold enrichment, 's' subtraction)
        save_bedGraph     : whether or not to save pileup and control into a bedGraph file
        """
        cdef:
            float64_t t0
            int32_t i, n
            str s
            np.ndarray above_cutoff
            np.ndarray[np.int32_t, ndim=1] above_cutoff_endpos, above_cutoff_startpos, pos_array, above_cutoff_index_array

            np.ndarray[np.float32_t, ndim=1] treat_array, ctrl_array
            list score_array_s          # list to keep different types of scores
            list peak_content           #  to store information for a
                                        #  chunk in a peak region, it
                                        #  contains lists of: 1. left
                                        #  position; 2. right
                                        #  position; 3. treatment
                                        #  value; 4. control value;
                                        #  5. list of scores at this
                                        #  chunk
            int64_t tl, lastp, ts, te, ti
            float32_t tp, cp
            int32_t * acs_ptr
            int32_t * ace_ptr
            int32_t * acia_ptr
            float32_t * treat_array_ptr
            float32_t * ctrl_array_ptr


        assert len(scoring_function_s) == len(score_cutoff_s), "number of functions and cutoffs should be the same!"

        peak_content = []           # to store points above cutoff

        # first, build pileup, self.chr_pos_treat_ctrl
        # this step will be speeped up if pqtable is pre-computed.
        self.__pileup_treat_ctrl_a_chromosome( chrom )
        [pos_array, treat_array, ctrl_array] = self.chr_pos_treat_ctrl

        # while save_bedGraph is true, invoke __write_bedGraph_for_a_chromosome
        if save_bedGraph:
            self.__write_bedGraph_for_a_chromosome ( chrom )

        # keep all types of scores needed
        #t0 = ttime()
        score_array_s = []
        for i in range(len(scoring_function_s)):
            s = scoring_function_s[i]
            if s == 'p':
                score_array_s.append( self.__cal_pscore( treat_array, ctrl_array ) )
            elif s == 'q':
                score_array_s.append( self.__cal_qscore( treat_array, ctrl_array ) )
            elif s == 'f':
                score_array_s.append( self.__cal_FE( treat_array, ctrl_array ) )
            elif s == 's':
                score_array_s.append( self.__cal_subtraction( treat_array, ctrl_array ) )

        # get the regions with scores above cutoffs
        # (indexing with an empty above_cutoff array below is safe)
        above_cutoff = np.nonzero( apply_multiple_cutoffs(score_array_s,score_cutoff_s) )[0] # this is not an optimized method. It would be better to store score array in a 2-D ndarray?
        above_cutoff_index_array = np.arange(pos_array.shape[0],dtype="int32")[above_cutoff] # indices
        above_cutoff_endpos = pos_array[above_cutoff] # end positions of regions where score is above cutoff
        above_cutoff_startpos = pos_array[above_cutoff-1] # start positions of regions where score is above cutoff

        if above_cutoff.size == 0:
            # nothing above cutoff
            return

        if above_cutoff[0] == 0:
            # first element > cutoff, fix the first point as 0. otherwise it would be the last item in data[chrom]['pos']
            above_cutoff_startpos[0] = 0

        #print "apply cutoff -- chrom:",chrom,"  time:", ttime() - t0
        # start to build peak regions
        #t0 = ttime()

        # first bit of region above cutoff
        acs_ptr = <int32_t *>above_cutoff_startpos.data
        ace_ptr = <int32_t *>above_cutoff_endpos.data
        acia_ptr= <int32_t *>above_cutoff_index_array.data
        treat_array_ptr = <float32_t *> treat_array.data
        ctrl_array_ptr = <float32_t *> ctrl_array.data

        ts = acs_ptr[ 0 ]
        te = ace_ptr[ 0 ]
        ti = acia_ptr[ 0 ]
        tp = treat_array_ptr[ ti ]
        cp = ctrl_array_ptr[ ti ]

        peak_content.append( ( ts, te, tp, cp, ti ) )
        lastp = te
        acs_ptr += 1
        ace_ptr += 1
        acia_ptr+= 1

        for i in range( 1, above_cutoff_startpos.shape[0] ):
            ts = acs_ptr[ 0 ]
            te = ace_ptr[ 0 ]
            ti = acia_ptr[ 0 ]
            acs_ptr += 1
            ace_ptr += 1
            acia_ptr+= 1
            tp = treat_array_ptr[ ti ]
            cp = ctrl_array_ptr[ ti ]
            tl = ts - lastp             # gap since previous region's end
            if tl <= max_gap:
                # append.
                peak_content.append( ( ts, te, tp, cp, ti ) )
                lastp = te #above_cutoff_endpos[i]
            else:
                # close
                if call_summits:
                    self.__close_peak_with_subpeaks (peak_content, peaks, min_length, chrom, min_length, score_array_s, score_cutoff_s = score_cutoff_s ) # smooth length is min_length, i.e. fragment size 'd'
                else:
                    self.__close_peak_wo_subpeaks   (peak_content, peaks, min_length, chrom, min_length, score_array_s, score_cutoff_s = score_cutoff_s ) # smooth length is min_length, i.e. fragment size 'd'
                peak_content = [ ( ts, te, tp, cp, ti ), ]
                lastp = te #above_cutoff_endpos[i]
        # save the last peak
        if not peak_content:
            return
        else:
            if call_summits:
                self.__close_peak_with_subpeaks (peak_content, peaks, min_length, chrom, min_length, score_array_s, score_cutoff_s = score_cutoff_s ) # smooth length is min_length, i.e. fragment size 'd'
            else:
                self.__close_peak_wo_subpeaks   (peak_content, peaks, min_length, chrom, min_length, score_array_s, score_cutoff_s = score_cutoff_s ) # smooth length is min_length, i.e. fragment size 'd'

        #print "close peaks -- chrom:",chrom,"  time:", ttime() - t0
        return

    cdef bool __close_peak_wo_subpeaks (self, list peak_content, peaks, int32_t min_length,
                                          bytes chrom, int32_t smoothlen, list score_array_s, list score_cutoff_s=[]):
        """Close the peak region, output peak boundaries, peak summit
        and scores, then add the peak to peakIO object.

        peak_content contains tuples of
        (start, end, treat_pileup, ctrl_pileup, index_in_score_array)

        peaks: a PeakIO object

        Returns True when a peak is added; False when the candidate is
        rejected, either because it is shorter than min_length or
        because its summit fails one of the cutoffs in score_cutoff_s.
        (Fix: the function previously fell off the end for too-short
        peaks, implicitly returning None from a bool-typed function.)
        """
        cdef:
            int32_t summit_pos, tstart, tend, tmpindex, summit_index, i, midindex
            float64_t treat_v, ctrl_v, tsummitvalue, ttreat_p, tctrl_p, tscore, summit_treat, summit_ctrl, summit_p_score, summit_q_score
            int32_t tlist_scores_p

        peak_length = peak_content[ -1 ][ 1 ] - peak_content[ 0 ][ 0 ]
        if peak_length >= min_length: # if the peak is too small, reject it
            tsummit = []
            summit_pos   = 0
            summit_value = 0
            for i in range(len(peak_content)):
                (tstart, tend, ttreat_p, tctrl_p, tlist_scores_p) = peak_content[i]
                tscore = ttreat_p # use treatment pileup as general score to find summit
                if not summit_value or summit_value < tscore:
                    # new maximum: restart the candidate-summit lists
                    tsummit = [(tend + tstart) // 2, ]
                    tsummit_index = [ i, ]
                    summit_value = tscore
                elif summit_value == tscore:
                    # remember continuous summit values
                    tsummit.append((tend + tstart) // 2)
                    tsummit_index.append( i )
            # the middle of all highest points in peak region is defined as summit
            midindex = (len(tsummit) + 1) // 2 - 1
            summit_pos    = tsummit[ midindex ]
            summit_index  = tsummit_index[ midindex ]

            summit_treat = peak_content[ summit_index ][ 2 ]
            summit_ctrl = peak_content[ summit_index ][ 3 ]

            # this is a double-check to see if the summit can pass cutoff values.
            for i in range(len(score_cutoff_s)):
                if score_cutoff_s[i] > score_array_s[ i ][ peak_content[ summit_index ][ 4 ] ]:
                    return False # not passed, then discard this peak.

            summit_p_score = pscore_dict[ ( <int32_t>(summit_treat), summit_ctrl ) ] #get_pscore(( <int32_t>(summit_treat), summit_ctrl ) )
            summit_q_score = self.pqtable[ summit_p_score ]

            peaks.add( chrom,           # chromosome
                       peak_content[0][0], # start
                       peak_content[-1][1], # end
                       summit      = summit_pos, # summit position
                       peak_score  = summit_q_score, # score at summit
                       pileup      = summit_treat, # pileup
                       pscore      = summit_p_score, # pvalue
                       fold_change = ( summit_treat + self.pseudocount ) / ( summit_ctrl + self.pseudocount ), # fold change
                       qscore      = summit_q_score # qvalue
                       )
            # start a new peak
            return True
        # peak too short: report the rejection explicitly instead of
        # implicitly returning None from a bool-typed function
        return False

    cdef bool __close_peak_with_subpeaks (self, list peak_content, peaks, int32_t min_length,
                                         bytes chrom, int32_t smoothlen, list score_array_s, list score_cutoff_s=[],
                                         float32_t min_valley = 0.9 ):
        """Algorithm implemented by Ben, to profile the pileup signals
        within a peak region then find subpeak summits. This method is
        highly recommended for TFBS or DNAase I sites.

        Returns True when sub-peaks are added, False when the candidate
        is rejected (too short, or a summit failed a cutoff); falls back
        to __close_peak_wo_subpeaks when no local maxima are found.
        (Fix: the too-short guard previously used a bare `return`,
        implicitly returning None from a bool-typed function.)
        """
        cdef:
            int32_t summit_pos, tstart, tend, tmpindex, summit_index, summit_offset
            int32_t start, end, i, j, start_boundary, m, n, l
            float64_t summit_value, tvalue, tsummitvalue, ttreat_p, tctrl_p, tscore, summit_treat, summit_ctrl, summit_p_score, summit_q_score
            np.ndarray[np.float32_t, ndim=1] peakdata
            np.ndarray[np.int32_t, ndim=1] peakindices, summit_offsets
            int32_t tlist_scores_p

        peak_length = peak_content[ -1 ][ 1 ] - peak_content[ 0 ][ 0 ]

        if peak_length < min_length:
            # region too small: reject it, returning an explicit bool
            return False

        # Add 10 bp padding to peak region so that we can get true minima
        end = peak_content[ -1 ][ 1 ] + 10
        start = peak_content[ 0 ][ 0 ] - 10
        if start < 0:
            start_boundary = 10 + start # this is the offset of original peak boundary in peakdata list.
            start = 0
        else:
            start_boundary = 10 # this is the offset of original peak boundary in peakdata list.

        peakdata = np.zeros(end - start, dtype='float32') # save the scores (qscore) for each position in this region
        peakindices = np.zeros(end - start, dtype='int32') # save the indices for each position in this region
        for i in range(len(peak_content)):
            (tstart, tend, ttreat_p, tctrl_p, tlist_scores_p) = peak_content[i]
            tscore = ttreat_p # use pileup as general score to find summit
            m = tstart - start + start_boundary
            n = tend - start + start_boundary
            peakdata[m:n] = tscore
            peakindices[m:n] = i

        summit_offsets = maxima(peakdata, smoothlen) # offsets are the indices for summits in peakdata/peakindices array.

        if summit_offsets.shape[0] == 0:
            # **failsafe** if no summits, fall back on old approach #
            return self.__close_peak_wo_subpeaks(peak_content, peaks, min_length, chrom, smoothlen, score_array_s, score_cutoff_s)
        else:
            # remove maxima that occurred in padding
            m = np.searchsorted(summit_offsets, start_boundary)
            n = np.searchsorted(summit_offsets, peak_length + start_boundary, 'right')
            summit_offsets = summit_offsets[m:n]

        summit_offsets = enforce_peakyness(peakdata, summit_offsets)

        #print "enforced:",summit_offsets
        if summit_offsets.shape[0] == 0:
            # **failsafe** if no summits, fall back on old approach #
            return self.__close_peak_wo_subpeaks(peak_content, peaks, min_length, chrom, smoothlen, score_array_s, score_cutoff_s)

        summit_indices = peakindices[summit_offsets] # indices are those point to peak_content
        summit_offsets -= start_boundary

        for summit_offset, summit_index in list(zip(summit_offsets, summit_indices)):

            summit_treat = peak_content[ summit_index ][ 2 ]
            summit_ctrl = peak_content[ summit_index ][ 3 ]

            summit_p_score = pscore_dict[ ( <int32_t>(summit_treat), summit_ctrl ) ] # get_pscore(( <int32_t>(summit_treat), summit_ctrl ) )
            summit_q_score = self.pqtable[ summit_p_score ]

            for i in range(len(score_cutoff_s)):
                if score_cutoff_s[i] > score_array_s[ i ][ peak_content[ summit_index ][ 4 ] ]:
                    return False # not passed, then discard this summit.

            peaks.add( chrom,
                       peak_content[ 0 ][ 0 ],
                       peak_content[ -1 ][ 1 ],
                       summit      = start + summit_offset,
                       peak_score  = summit_q_score,
                       pileup      = summit_treat,
                       pscore      = summit_p_score,
                       fold_change = (summit_treat + self.pseudocount ) / ( summit_ctrl + self.pseudocount ), # fold change
                       qscore      = summit_q_score
                       )
        # start a new peak
        return True

    cdef np.ndarray __cal_pscore ( self, np.ndarray[np.float32_t, ndim=1] array1, np.ndarray[np.float32_t, ndim=1] array2 ):
        """Compute -log10 Poisson p-value scores elementwise.

        array1: treatment pileup values (truncated to integer counts).
        array2: control (lambda) values.
        Returns a new float32 array of the same length.
        """
        cdef:
            int64_t idx, n
            np.ndarray[np.float32_t, ndim=1] scores

        assert array1.shape[0] == array2.shape[0]
        n = array1.shape[0]
        scores = np.zeros( n, dtype="float32" )

        # typed-buffer indexing compiles to direct C element access, so
        # an explicit pointer walk is not needed
        for idx in range( n ):
            scores[ idx ] = get_pscore( ( <int32_t>( array1[ idx ] ), array2[ idx ] ) )
        return scores

    cdef np.ndarray __cal_qscore ( self, np.ndarray[np.float32_t, ndim=1] array1, np.ndarray[np.float32_t, ndim=1] array2 ):
        """Compute q-scores elementwise by looking each p-score up in
        self.pqtable (which must already be built).

        array1: treatment pileup values (truncated to integer counts).
        array2: control (lambda) values.
        Returns a new float32 array of the same length.
        """
        cdef:
            int64_t idx, n
            np.ndarray[np.float32_t, ndim=1] scores

        assert array1.shape[0] == array2.shape[0]
        n = array1.shape[0]
        scores = np.zeros( n, dtype="float32" )

        # typed-buffer indexing replaces the original raw-pointer walk
        for idx in range( n ):
            scores[ idx ] = self.pqtable[ get_pscore( ( <int32_t>( array1[ idx ] ), array2[ idx ] ) ) ]
        return scores

    cdef np.ndarray __cal_logLR ( self, np.ndarray[np.float32_t, ndim=1] array1, np.ndarray[np.float32_t, ndim=1] array2 ):
        """Compute asymmetric log likelihood-ratio scores elementwise,
        with self.pseudocount added to both treatment and control
        values before the ratio is taken.

        Returns a new float32 array of the same length as the inputs.
        """
        cdef:
            int64_t idx, n
            np.ndarray[np.float32_t, ndim=1] scores

        assert array1.shape[0] == array2.shape[0]
        n = array1.shape[0]
        scores = np.zeros( n, dtype="float32" )

        # typed-buffer indexing replaces the original raw-pointer walk
        for idx in range( n ):
            scores[ idx ] = get_logLR_asym( ( array1[ idx ] + self.pseudocount, array2[ idx ] + self.pseudocount ) )
        return scores

    cdef np.ndarray __cal_logFE ( self, np.ndarray[np.float32_t, ndim=1] array1, np.ndarray[np.float32_t, ndim=1] array2 ):
        """Compute log fold-enrichment scores elementwise, with
        self.pseudocount added to both treatment and control values
        before the ratio is taken.

        Returns a new float32 array of the same length as the inputs.
        """
        cdef:
            int64_t idx, n
            np.ndarray[np.float32_t, ndim=1] scores

        assert array1.shape[0] == array2.shape[0]
        n = array1.shape[0]
        scores = np.zeros( n, dtype="float32" )

        # typed-buffer indexing replaces the original raw-pointer walk
        for idx in range( n ):
            scores[ idx ] = get_logFE( array1[ idx ] + self.pseudocount, array2[ idx ] + self.pseudocount )
        return scores

    cdef np.ndarray __cal_FE ( self, np.ndarray[np.float32_t, ndim=1] array1, np.ndarray[np.float32_t, ndim=1] array2 ):
        """Compute fold-enrichment scores elementwise:
        (treatment + pseudocount) / (control + pseudocount).

        Returns a new float32 array of the same length as the inputs.
        """
        cdef:
            int64_t idx, n
            np.ndarray[np.float32_t, ndim=1] scores

        assert array1.shape[0] == array2.shape[0]
        n = array1.shape[0]
        scores = np.zeros( n, dtype="float32" )

        # typed-buffer indexing replaces the original raw-pointer walk
        for idx in range( n ):
            scores[ idx ] = ( array1[ idx ] + self.pseudocount ) / ( array2[ idx ] + self.pseudocount )
        return scores

    cdef np.ndarray __cal_subtraction ( self, np.ndarray[np.float32_t, ndim=1] array1, np.ndarray[np.float32_t, ndim=1] array2 ):
        """Compute subtraction scores elementwise: treatment - control.

        Returns a new float32 array of the same length as the inputs.
        """
        cdef:
            int64_t idx, n
            np.ndarray[np.float32_t, ndim=1] scores

        assert array1.shape[0] == array2.shape[0]
        n = array1.shape[0]
        scores = np.zeros( n, dtype="float32" )

        # typed-buffer indexing replaces the original raw-pointer walk
        for idx in range( n ):
            scores[ idx ] = array1[ idx ] - array2[ idx ]
        return scores


    cdef bool __write_bedGraph_for_a_chromosome ( self, bytes chrom ):
        """Write treat/control values for a certain chromosome into the
        already-open C FILE handles (self.bedGraph_treat_f and
        self.bedGraph_ctrl_f), merging adjacent segments whose values
        are equal within 5-digit precision.

        Reads the cached pileup from self.chr_pos_treat_ctrl; returns
        False if it is empty, True otherwise.
        """
        cdef:
            np.ndarray[np.int32_t, ndim=1] pos_array
            np.ndarray[np.float32_t, ndim=1] treat_array, ctrl_array
            int32_t * pos_array_ptr
            float32_t * treat_array_ptr
            float32_t * ctrl_array_ptr
            int32_t l, i
            int32_t p, pre_p_t, pre_p_c # current position, previous position for treat, previous position for control
            float32_t pre_v_t, pre_v_c, v_t, v_c # previous value for treat, for control, current value for treat, for control
            float32_t denominator # 1 if save_SPMR is false, or depth in million if save_SPMR is true. Note, while piling up and calling peaks, treatment and control have been scaled to the same depth, so we need to find what this 'depth' is.
            FILE * ft
            FILE * fc
            basestring tmp_bytes

        [pos_array, treat_array, ctrl_array] = self.chr_pos_treat_ctrl
        pos_array_ptr = <int32_t *> pos_array.data
        treat_array_ptr = <float32_t *> treat_array.data
        ctrl_array_ptr = <float32_t *> ctrl_array.data

        if self.save_SPMR:
            if self.treat_scaling_factor == 1:
                # in this case, control has been asked to be scaled to depth of treatment
                denominator  = self.treat.total/1e6
            else:
                # in this case, treatment has been asked to be scaled to depth of control
                denominator  = self.ctrl.total/1e6
        else:
            denominator = 1.0

        l = pos_array.shape[ 0 ]

        if l == 0:              # if there is no data, return
            return False

        ft = self.bedGraph_treat_f
        fc = self.bedGraph_ctrl_f
        #t_write_func = self.bedGraph_treat.write
        #c_write_func = self.bedGraph_ctrl.write

        pre_p_t = 0
        pre_p_c = 0
        # value pointers are advanced one element ahead of the position
        # pointer, so iteration i below pairs value[i] with the end
        # position of segment i-1 (the point where the value changes)
        pre_v_t = treat_array_ptr[ 0 ]/denominator
        pre_v_c = ctrl_array_ptr [ 0 ]/denominator
        treat_array_ptr += 1
        ctrl_array_ptr += 1

        for i in range( 1, l ):
            v_t = treat_array_ptr[ 0 ]/denominator
            v_c = ctrl_array_ptr [ 0 ]/denominator
            p   = pos_array_ptr  [ 0 ]
            pos_array_ptr += 1
            treat_array_ptr += 1
            ctrl_array_ptr += 1

            # only emit a line when the value changes, merging runs of
            # (near-)equal values into a single bedGraph interval
            if abs(pre_v_t - v_t) > 1e-5: # precision is 5 digits
                fprintf( ft, b"%s\t%d\t%d\t%.5f\n", chrom, pre_p_t, p, pre_v_t )
                pre_v_t = v_t
                pre_p_t = p

            if abs(pre_v_c - v_c) > 1e-5: # precision is 5 digits
                fprintf( fc, b"%s\t%d\t%d\t%.5f\n", chrom, pre_p_c, p, pre_v_c )
                pre_v_c = v_c
                pre_p_c = p

        # after the loop the position pointer rests on the last element
        p = pos_array_ptr[ 0 ]
        # last one
        fprintf( ft, b"%s\t%d\t%d\t%.5f\n", chrom, pre_p_t, p, pre_v_t )
        fprintf( fc, b"%s\t%d\t%d\t%.5f\n", chrom, pre_p_c, p, pre_v_c )

        return True

    cpdef call_broadpeaks (self, list scoring_function_symbols, list lvl1_cutoff_s, list lvl2_cutoff_s, int32_t min_length=200, int32_t lvl1_max_gap=50, int32_t lvl2_max_gap=400, bool cutoff_analysis = False):
        """This function try to find enriched regions within which,
        scores are continuously higher than a given cutoff for level
        1, and link them using the gap above level 2 cutoff with a
        maximum length of lvl2_max_gap.

        scoring_function_s: symbols of functions to calculate score. 'p' for pscore, 'q' for qscore, 'f' for fold change, 's' for subtraction. for example: ['p', 'q']

        lvl1_cutoff_s:  list of cutoffs at highly enriched regions, corresponding to scoring functions.
        lvl2_cutoff_s:  list of cutoffs at less enriched regions, corresponding to scoring functions.
        min_length :    minimum peak length, default 200.
        lvl1_max_gap   :  maximum gap to merge nearby enriched peaks, default 50.
        lvl2_max_gap   :  maximum length of linkage regions, default 400.

        Return both general PeakIO object for highly enriched regions
        and gapped broad regions in BroadPeakIO.
        """
        cdef:
            int32_t i, j
            bytes chrom
            object lvl1peaks, lvl1peakschrom, lvl1
            object lvl2peaks, lvl2peakschrom, lvl2
            object broadpeaks
            set chrs
            list tmppeakset

        lvl1peaks = PeakIO()
        lvl2peaks = PeakIO()

        # prepare p-q table
        if len( self.pqtable ) == 0:
            info("#3 Pre-compute pvalue-qvalue table...")
            if cutoff_analysis:
                info("#3 Cutoff value vs broad region calls will be analyzed!")
                self.__pre_computes( max_gap = lvl2_max_gap, min_length = min_length )
            else:
                self.__cal_pvalue_qvalue_table()

        # prepare bedGraph file
        if self.save_bedGraph:

            self.bedGraph_treat_f = fopen( self.bedGraph_treat_filename, "w" )
            self.bedGraph_ctrl_f = fopen( self.bedGraph_control_filename, "w" )
            info ("#3 In the peak calling step, the following will be performed simultaneously:")
            info ("#3   Write bedGraph files for treatment pileup (after scaling if necessary)... %s" % self.bedGraph_filename_prefix.decode() + "_treat_pileup.bdg")
            info ("#3   Write bedGraph files for control lambda (after scaling if necessary)... %s" % self.bedGraph_filename_prefix.decode() + "_control_lambda.bdg")

            if self.trackline:
                # this line is REQUIRED by the wiggle format for UCSC browser
                # BUGFIX: decode the bytes prefix before %-formatting into a
                # str; otherwise "%s" embeds the b'...' repr in the track
                # name (the info() calls above decode for the same reason)
                tmp_bytes = ("track type=bedGraph name=\"treatment pileup\" description=\"treatment pileup after possible scaling for \'%s\'\"\n" % self.bedGraph_filename_prefix.decode()).encode()
                fprintf( self.bedGraph_treat_f, tmp_bytes )
                tmp_bytes = ("track type=bedGraph name=\"control lambda\" description=\"control lambda after possible scaling for \'%s\'\"\n" % self.bedGraph_filename_prefix.decode()).encode()
                fprintf( self.bedGraph_ctrl_f, tmp_bytes )


        info("#3 Call peaks for each chromosome...")
        for chrom in self.chromosomes:
            self.__chrom_call_broadpeak_using_certain_criteria ( lvl1peaks, lvl2peaks, chrom, scoring_function_symbols, lvl1_cutoff_s, lvl2_cutoff_s, min_length, lvl1_max_gap, lvl2_max_gap, self.save_bedGraph )

        # close bedGraph file
        if self.save_bedGraph:
            fclose( self.bedGraph_treat_f )
            fclose( self.bedGraph_ctrl_f )
            self.save_bedGraph = False

        # now combine lvl1 and lvl2 peaks
        chrs = lvl1peaks.get_chr_names()
        broadpeaks = BroadPeakIO()
        # use lvl2_peaks as linking regions between lvl1_peaks
        for chrom in sorted(chrs):
            lvl1peakschrom = lvl1peaks.get_data_from_chrom(chrom)
            lvl2peakschrom = lvl2peaks.get_data_from_chrom(chrom)
            lvl1peakschrom_next = iter(lvl1peakschrom).__next__
            tmppeakset = []             # to temporarily store lvl1 region inside a lvl2 region
            # our assumption is lvl1 regions should be included in lvl2 regions
            try:
                lvl1 = lvl1peakschrom_next()
                for i in range( len(lvl2peakschrom) ):
                    # for each lvl2 peak, find all lvl1 peaks inside
                    # I assume lvl1 peaks can be ALL covered by lvl2 peaks.
                    lvl2 = lvl2peakschrom[i]

                    while True:
                        if lvl2["start"] <= lvl1["start"]  and lvl1["end"] <= lvl2["end"]:
                            tmppeakset.append(lvl1)
                            lvl1 = lvl1peakschrom_next()
                        else:
                            # make a hierarchical broad peak
                            self.__add_broadpeak ( broadpeaks, chrom, lvl2, tmppeakset)
                            tmppeakset = []
                            break
            except StopIteration:
                # no more strong (aka lvl1) peaks left
                # NOTE(review): if the very first lvl1peakschrom_next() call
                # raised (a chromosome present in chrs but with an empty lvl1
                # list), `i` and `lvl2` would be unbound here — confirm that
                # chrs (derived from lvl1peaks) guarantees at least one lvl1
                # peak per chromosome.
                self.__add_broadpeak ( broadpeaks, chrom, lvl2, tmppeakset)
                tmppeakset = []
                # add the rest lvl2 peaks
                for j in range( i+1, len(lvl2peakschrom) ):
                    self.__add_broadpeak( broadpeaks, chrom, lvl2peakschrom[j], tmppeakset )

        return broadpeaks

    cdef void  __chrom_call_broadpeak_using_certain_criteria ( self, lvl1peaks, lvl2peaks, bytes chrom, list scoring_function_s, list lvl1_cutoff_s, list lvl2_cutoff_s,
                                                         int32_t min_length, int32_t lvl1_max_gap, int32_t lvl2_max_gap, bool save_bedGraph):
        """ Call broad peaks for a chromosome.

        Combination of criteria is allowed here.

        Runs two passes over the same per-chromosome score arrays:
        first the strict lvl1 cutoffs (strong peaks, merged across gaps
        up to lvl1_max_gap, appended to `lvl1peaks`), then the looser
        lvl2 cutoffs (weak/linking regions, merged across gaps up to
        lvl2_max_gap, appended to `lvl2peaks`).

        peaks: a PeakIO object
        scoring_function_s: symbols of functions to calculate score as score=f(x, y) where x is treatment pileup, and y is control pileup
        save_bedGraph     : whether or not to save pileup and control into a bedGraph file
        """
        cdef:
            int32_t i
            str s
            np.ndarray above_cutoff, above_cutoff_endpos, above_cutoff_startpos
            np.ndarray pos_array, treat_array, ctrl_array
            np.ndarray above_cutoff_index_array
            list score_array_s          # list to keep different types of scores
            list peak_content
            int32_t * acs_ptr
            int32_t * ace_ptr
            int32_t * acia_ptr
            float32_t * treat_array_ptr
            float32_t * ctrl_array_ptr

        assert len(scoring_function_s) == len(lvl1_cutoff_s), "number of functions and cutoffs should be the same!"
        assert len(scoring_function_s) == len(lvl2_cutoff_s), "number of functions and cutoffs should be the same!"

        # first, build pileup, self.chr_pos_treat_ctrl
        self.__pileup_treat_ctrl_a_chromosome( chrom )
        [pos_array, treat_array, ctrl_array] = self.chr_pos_treat_ctrl

        # while save_bedGraph is true, invoke __write_bedGraph_for_a_chromosome
        if save_bedGraph:
            self.__write_bedGraph_for_a_chromosome ( chrom )

        # keep all types of scores needed
        # one score array per requested scoring function, same order as
        # scoring_function_s / the cutoff lists
        score_array_s = []
        for i in range(len(scoring_function_s)):
            s = scoring_function_s[i]
            if s == 'p':
                score_array_s.append( self.__cal_pscore( treat_array, ctrl_array ) )
            elif s == 'q':
                score_array_s.append( self.__cal_qscore( treat_array, ctrl_array ) )
            elif s == 'f':
                score_array_s.append( self.__cal_FE( treat_array, ctrl_array ) )
            elif s == 's':
                score_array_s.append( self.__cal_subtraction( treat_array, ctrl_array ) )

        # lvl1 : strong peaks
        peak_content = []           # to store points above cutoff

        # get the regions with scores above cutoffs
        above_cutoff = np.nonzero( apply_multiple_cutoffs(score_array_s,lvl1_cutoff_s) )[0] # this is not an optimized method. It would be better to store score array in a 2-D ndarray?
        above_cutoff_index_array = np.arange(pos_array.shape[0],dtype="int32")[above_cutoff] # indices
        above_cutoff_endpos = pos_array[above_cutoff] # end positions of regions where score is above cutoff
        # pos_array holds interval *end* positions, so the previous end is
        # this interval's start; index -1 (when above_cutoff[0]==0) wraps to
        # the last element and is corrected below
        above_cutoff_startpos = pos_array[above_cutoff-1] # start positions of regions where score is above cutoff

        if above_cutoff.size == 0:
            # nothing above cutoff
            # NOTE(review): this also skips the lvl2 (weak) pass below even
            # though the looser lvl2 cutoffs might still be satisfied —
            # confirm this is the intended behavior
            return

        if above_cutoff[0] == 0:
            # first element > cutoff, fix the first point as 0. otherwise it would be the last item in data[chrom]['pos']
            above_cutoff_startpos[0] = 0

        # first bit of region above cutoff
        acs_ptr = <int32_t *>above_cutoff_startpos.data
        ace_ptr = <int32_t *>above_cutoff_endpos.data
        acia_ptr= <int32_t *>above_cutoff_index_array.data
        treat_array_ptr = <float32_t *> treat_array.data
        ctrl_array_ptr = <float32_t *> ctrl_array.data

        # ts/te/ti/tp/cp/tl/lastp are deliberately left untyped (Python
        # objects); ti indexes back into the full treat/ctrl arrays
        ts = acs_ptr[ 0 ]
        te = ace_ptr[ 0 ]
        ti = acia_ptr[ 0 ]
        tp = treat_array_ptr[ ti ]
        cp = ctrl_array_ptr[ ti ]

        peak_content.append( ( ts, te, tp, cp, ti ) )
        acs_ptr += 1 # move ptr
        ace_ptr += 1
        acia_ptr+= 1
        lastp = te

        # merge consecutive above-cutoff intervals separated by at most
        # lvl1_max_gap; close and emit a peak whenever the gap is larger
        for i in range( 1, above_cutoff_startpos.size ):
            ts = acs_ptr[ 0 ]
            te = ace_ptr[ 0 ]
            ti = acia_ptr[ 0 ]
            acs_ptr += 1
            ace_ptr += 1
            acia_ptr+= 1
            tp = treat_array_ptr[ ti ]
            cp = ctrl_array_ptr[ ti ]
            tl = ts - lastp
            if tl <= lvl1_max_gap:
                # append
                peak_content.append( ( ts, te, tp, cp, ti ) )
                lastp = te
            else:
                # close
                self.__close_peak_for_broad_region (peak_content, lvl1peaks, min_length, chrom, lvl1_max_gap//2, score_array_s )
                peak_content = [ ( ts, te, tp, cp, ti ), ]
                lastp = te #above_cutoff_endpos[i]

        # save the last peak
        if peak_content:
            self.__close_peak_for_broad_region (peak_content, lvl1peaks, min_length, chrom, lvl1_max_gap//2, score_array_s )

        # lvl2 : weak peaks
        # same algorithm as the lvl1 pass above, but with the looser
        # lvl2 cutoffs and the larger lvl2_max_gap merge distance
        peak_content = []           # to store points above cutoff

        # get the regions with scores above cutoffs
        above_cutoff = np.nonzero( apply_multiple_cutoffs(score_array_s,lvl2_cutoff_s) )[0] # this is not an optimized method. It would be better to store score array in a 2-D ndarray?
        above_cutoff_index_array = np.arange(pos_array.shape[0],dtype="int32")[above_cutoff] # indices
        above_cutoff_endpos = pos_array[above_cutoff] # end positions of regions where score is above cutoff
        above_cutoff_startpos = pos_array[above_cutoff-1] # start positions of regions where score is above cutoff

        if above_cutoff.size == 0:
            # nothing above cutoff
            return

        if above_cutoff[0] == 0:
            # first element > cutoff, fix the first point as 0. otherwise it would be the last item in data[chrom]['pos']
            above_cutoff_startpos[0] = 0

        # first bit of region above cutoff
        acs_ptr = <int32_t *>above_cutoff_startpos.data
        ace_ptr = <int32_t *>above_cutoff_endpos.data
        acia_ptr= <int32_t *>above_cutoff_index_array.data
        treat_array_ptr = <float32_t *> treat_array.data
        ctrl_array_ptr = <float32_t *> ctrl_array.data

        ts = acs_ptr[ 0 ]
        te = ace_ptr[ 0 ]
        ti = acia_ptr[ 0 ]
        tp = treat_array_ptr[ ti ]
        cp = ctrl_array_ptr[ ti ]
        peak_content.append( ( ts, te, tp, cp, ti ) )
        acs_ptr += 1 # move ptr
        ace_ptr += 1
        acia_ptr+= 1

        lastp = te
        for i in range( 1, above_cutoff_startpos.size ):
            # for everything above cutoff
            ts = acs_ptr[ 0 ] # get the start
            te = ace_ptr[ 0 ] # get the end
            ti = acia_ptr[ 0 ]# get the index

            acs_ptr += 1 # move ptr
            ace_ptr += 1
            acia_ptr+= 1
            tp = treat_array_ptr[ ti ] # get the treatment pileup
            cp = ctrl_array_ptr[ ti ]  # get the control pileup
            tl = ts - lastp # get the distance from the current point to last position of existing peak_content

            if tl <= lvl2_max_gap:
                # append
                peak_content.append( ( ts, te, tp, cp, ti ) )
                lastp = te
            else:
                # close
                self.__close_peak_for_broad_region (peak_content, lvl2peaks, min_length, chrom, lvl2_max_gap//2, score_array_s )

                peak_content = [ ( ts, te, tp, cp, ti ), ]
                lastp = te

        # save the last peak
        if peak_content:
            self.__close_peak_for_broad_region (peak_content, lvl2peaks, min_length, chrom, lvl2_max_gap//2, score_array_s )

        return

    cdef bool __close_peak_for_broad_region (self, list peak_content, peaks, int32_t min_length,
                                             bytes chrom, int32_t smoothlen, list score_array_s, list score_cutoff_s=[]):
        """Close the broad peak region, compute length-weighted mean
        scores, then add the peak to the peakIO object.

        peak_content contains (start, end, treat_p, ctrl_p, index)
        tuples as built by
        __chrom_call_broadpeak_using_certain_criteria; the 5th element
        is an index into the score arrays (unpacked below but unused).

        peaks: a BroadPeakIO object

        smoothlen and score_cutoff_s are accepted for interface
        compatibility but not used here.

        Returns True if a peak was added, False if the candidate region
        is shorter than min_length and therefore rejected.
        """
        cdef:
            int32_t summit_pos, tstart, tend, tmpindex, summit_index, i, midindex
            float64_t treat_v, ctrl_v, tsummitvalue, ttreat_p, tctrl_p, tscore, summit_treat, summit_ctrl, summit_p_score, summit_q_score
            list tlist_pileup, tlist_control, tlist_length
            int32_t tlist_scores_p
            np.ndarray tarray_pileup, tarray_control, tarray_pscore, tarray_qscore, tarray_fc

        peak_length = peak_content[ -1 ][ 1 ] - peak_content[ 0 ][ 0 ]
        if peak_length >= min_length: # if the peak is too small, reject it
            # collect per-interval pileup values and interval lengths so
            # every mean below can be weighted by interval length
            tlist_pileup = []
            tlist_control= []
            tlist_length = []
            for i in range(len(peak_content)): # each position in broad peak
                (tstart, tend, ttreat_p, tctrl_p, tlist_scores_p) = peak_content[i]
                tlist_pileup.append( ttreat_p )
                tlist_control.append( tctrl_p )
                tlist_length.append( tend - tstart )

            tarray_pileup = np.array( tlist_pileup, dtype="float32")
            tarray_control = np.array( tlist_control, dtype="float32")
            tarray_pscore = self.__cal_pscore( tarray_pileup, tarray_control )
            tarray_qscore = self.__cal_qscore( tarray_pileup, tarray_control )
            tarray_fc     = self.__cal_FE    ( tarray_pileup, tarray_control )

            peaks.add( chrom,           # chromosome
                       peak_content[0][0], # start
                       peak_content[-1][1], # end
                       summit = 0,      # no single summit for a broad region
                       peak_score  = mean_from_value_length( tarray_qscore, tlist_length ),
                       pileup      = mean_from_value_length( tarray_pileup, tlist_length ),
                       pscore      = mean_from_value_length( tarray_pscore, tlist_length ),
                       fold_change = mean_from_value_length( tarray_fc, tlist_length ),
                       qscore      = mean_from_value_length( tarray_qscore, tlist_length ),
                       )
            return True
        # BUGFIX: previously fell off the end on the reject path, relying on
        # Cython's implicit zero return for a cdef bool; make it explicit.
        return False
    cdef __add_broadpeak (self, bpeaks, bytes chrom, object lvl2peak, list lvl1peakset):
        """Append one broad peak to `bpeaks`, built from a lvl2 (weak)
        region plus the lvl1 (strong) regions nested inside it.

        *Note* lvl1peakset/strong_regions might be empty
        """

        cdef:
            int32_t n_block, region_s, region_e
            bytes sizes_txt, starts_txt, thick_s, thick_e

        region_s = lvl2peak["start"]
        region_e = lvl2peak["end"]

        if not lvl1peakset:
            # no strong region inside: complement with two 1bp blocks at the
            # edges, as required by the gappedPeak block fields.
            # may change in the future if gappedPeak format was improved.
            bpeaks.add(chrom, region_s, region_e, score=lvl2peak["score"],
                       thickStart=(b"%d" % region_s), thickEnd=(b"%d" % region_e),
                       blockNum = 2, blockSizes = b"1,1",
                       blockStarts = (b"0,%d" % (region_e-region_s-1)),
                       pileup = lvl2peak["pileup"],
                       pscore = lvl2peak["pscore"], fold_change = lvl2peak["fc"],
                       qscore = lvl2peak["qscore"] )
            return bpeaks

        # thick region spans the first to the last strong sub-peak
        thick_s = b"%d" % (lvl1peakset[0]["start"])
        thick_e = b"%d" % (lvl1peakset[-1]["end"])
        n_block = len(lvl1peakset)
        sizes_txt  = b",".join([b"%d" % p["length"] for p in lvl1peakset])
        starts_txt = b",".join([b"%d" % off for off in getitem_then_subtract(lvl1peakset, region_s)])

        # pad with a 1bp block on either side when the strong region does not
        # reach the broad region's boundary
        if int(thick_s) != region_s:
            thick_s = b"%d" % region_s
            n_block += 1
            sizes_txt  = b"1," + sizes_txt
            starts_txt = b"0," + starts_txt
        if int(thick_e) != region_e:
            thick_e = b"%d" % region_e
            n_block += 1
            sizes_txt  = sizes_txt + b",1"
            starts_txt = starts_txt + b"," + (b"%d" % (region_e-region_s-1))

        bpeaks.add(chrom, region_s, region_e, score=lvl2peak["score"],
                   thickStart=thick_s, thickEnd=thick_e,
                   blockNum = n_block, blockSizes = sizes_txt, blockStarts = starts_txt,
                   pileup = lvl2peak["pileup"],
                   pscore = lvl2peak["pscore"], fold_change = lvl2peak["fc"],
                   qscore = lvl2peak["qscore"] )
        return bpeaks