File: test_api.py

package info (click to toggle)
python-cooler 0.10.3-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 32,600 kB
  • sloc: python: 11,033; makefile: 173; sh: 31
file content (224 lines) | stat: -rw-r--r-- 8,153 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
import os.path as op

import h5py
import numpy as np
import pandas as pd
import pytest
import scipy.sparse as sps

from cooler import api

# Directory containing this test module, with symlinks resolved by realpath.
testdir = op.realpath(op.dirname(__file__))
# Sample files opened by the tests in this module live under <testdir>/data.
datadir = op.join(testdir, "data")


def test_info(mock_cooler):
    """``api.info`` on the mock cooler fixture yields a ``dict`` instance."""
    assert isinstance(api.info(mock_cooler), dict)


def test_chromtable(mock_cooler):
    """Lengths reported by ``api.chroms`` equal the fixture's stored lengths."""
    chrom_table = api.chroms(mock_cooler)
    stored_lengths = mock_cooler["chroms"]["length"]
    assert np.all(chrom_table["length"] == stored_lengths)


def test_bintable(mock_cooler):
    """``api.bins`` over a row range agrees with the fixture's bin columns."""
    lo, hi = 2, 10
    # Translates the chromosome labels in the returned table into the values
    # they are compared against in the fixture's bins/chrom column.
    name_to_id = pd.Series({"chr1": 0, "chr2": 1})

    # All fields requested.
    full = api.bins(mock_cooler, lo, hi)
    assert np.all(name_to_id[full["chrom"]] == mock_cooler["bins"]["chrom"][lo:hi])
    for col in ("start", "end"):
        assert np.all(full[col] == mock_cooler["bins"][col][lo:hi])

    # Only a subset of fields requested.
    subset = api.bins(mock_cooler, lo, hi, fields=["start", "end"])
    for col in ("start", "end"):
        assert np.all(subset[col] == mock_cooler["bins"][col][lo:hi])


def test_bintable_many_contigs():
    """The ``chrom`` column is categorical for every selection style used here."""
    # In a file with many contigs, bins/chrom does not have an ENUM header,
    # so chromosome names are taken from the chroms/name
    clr = api.Cooler(op.join(datadir, "manycontigs.1.cool"))

    # Row slice of the whole bin table.
    head = clr.bins()[:10]
    assert isinstance(head["chrom"].dtype, pd.CategoricalDtype)

    # Column-list selection followed by a row slice.
    head_two_cols = clr.bins()[["chrom", "start"]][:10]
    assert isinstance(head_two_cols["chrom"].dtype, pd.CategoricalDtype)

    # Single-column selection; the "start" fetch has no assertion attached and
    # is only run to make sure it does not raise.
    chrom_col = clr.bins()["chrom"][:10]
    clr.bins()["start"][:10]
    assert isinstance(chrom_col.dtype, pd.CategoricalDtype)


def test_pixeltable(mock_cooler):
    """``api.pixels`` returns the requested rows, with and without bin joins."""
    lo, hi = 2, 10

    # Without joining, the bin ID columns mirror the fixture's stored values.
    unjoined = api.pixels(mock_cooler, lo, hi, join=False)
    for key in ("bin1_id", "bin2_id"):
        assert np.all(unjoined[key] == mock_cooler["pixels"][key][lo:hi])

    # With joining, the table is expected to be four columns wider than the
    # number of entries in the fixture's pixel group.
    joined = api.pixels(mock_cooler, lo, hi, join=True)
    n_rows = hi - lo
    n_cols = len(mock_cooler["pixels"]) + 4
    assert joined.shape == (n_rows, n_cols)


def test_annotate(mock_cooler):
    """``api.annotate`` gives the same result for each way of passing bins."""
    clr = api.Cooler(mock_cooler)
    pixels = clr.matrix(as_pixels=True, balance=False).fetch("chr1")

    # works with full bin table / view or only required bins
    from_materialized = api.annotate(pixels, clr.bins()[:])
    from_selector = api.annotate(pixels, clr.bins())
    from_region = api.annotate(pixels, clr.bins().fetch("chr1"))
    for other in (from_selector, from_region):
        assert np.all(from_materialized == other)

    # works on empty dataframe
    from_empty = api.annotate(pixels[0:0], clr.bins()[:])
    assert np.all(from_empty.columns == from_region.columns)
    assert len(from_empty) == 0


def test_annotate_with_partial_bins():
    """Annotating with a partial bin table longer than the pixels gives no NaNs."""
    # Addresses a bug where partial bin-table dataframes were sliced incorrectly,
    # specifically when the pixel dataframe happened to be shorter than it. This
    # led to incorrect NaNs in the join output.
    #
    # This is different from the case where there are pixels from bins that do
    # not appear in the provided partial bin-table dataframe. This will lead to
    # NaNs in the join but the result will be correct because those bins were
    # missing from the input. However, we may want to raise an error in such
    # cases, or disallow partial bin-table inputs entirely.
    cool_path = op.join(datadir, "hg19.GM12878-MboI.matrix.2000kb.cool")
    clr = api.Cooler(cool_path)

    # Keep only the first 50 chr2 pixels so the pixel frame is the shorter one.
    chr2_pixels = clr.matrix(as_pixels=True, balance=False).fetch("chr2")
    pix = chr2_pixels.iloc[:50]

    bins_chr2 = clr.bins().fetch("chr2")
    assert len(bins_chr2) > len(pix)

    annotated = api.annotate(pix, bins_chr2)

    joined_columns = ["chrom1", "start1", "end1", "chrom2", "start2", "end2"]
    for name in joined_columns:
        assert annotated[name].notnull().all()


def test_annotate_with_partial_bins_and_partial_pixels():
    """Region-restricted pixels annotate identically with partial or full bins."""
    # Addresses a bug where annotating only certain pixels with a partial
    # bin-table dataframe would lead to IndexError

    cool_path = op.join(datadir, "hg19.GM12878-MboI.matrix.2000kb.cool")
    clr = api.Cooler(cool_path)
    region = "chr2:0-1000000"

    pix = clr.matrix(as_pixels=True, balance=False).fetch(region)

    # Use partial bin table
    partial_bins = clr.bins().fetch(region)
    out = api.annotate(pix, partial_bins)
    joined_columns = ["chrom1", "start1", "end1", "chrom2", "start2", "end2"]
    for name in joined_columns:
        assert out[name].notnull().all()

    # Compare with full bin table as selector
    via_selector = api.annotate(pix, clr.bins())
    pd.testing.assert_frame_equal(out, via_selector)

    # Compare with full bin table materialized
    via_materialized = api.annotate(pix, clr.bins()[:])
    pd.testing.assert_frame_equal(out, via_materialized)


def test_matrix():
    """Smoke-test ``Cooler.matrix(...).fetch`` across output formats and options.

    No returned values are inspected: each call on the first file only has to
    complete without raising, and on the second file every request that asks
    for balancing has to raise ``ValueError``.
    """
    clr = api.Cooler(op.join(datadir, "yeast.10kb.cool"))
    region = ("chrI:100345-220254", "chrII:200789-813183")
    # numpy array
    clr.matrix(balance=False).fetch(*region)
    clr.matrix(balance=True).fetch(*region)
    clr.matrix(balance="weight").fetch(*region)
    clr.matrix(balance="weight", divisive_weights=True).fetch(*region)
    # sparse coo_matrix
    clr.matrix(sparse=True, balance=False).fetch(*region)
    clr.matrix(sparse=True, balance=True).fetch(*region)
    clr.matrix(sparse=True, balance="weight").fetch(*region)
    clr.matrix(sparse=True, balance="weight", divisive_weights=True).fetch(*region)
    # dataframe: cover every join x balance combination. The joined/unbalanced
    # case used to be missing because the joined/balanced call appeared twice.
    clr.matrix(as_pixels=True, join=False, balance=False).fetch(*region)
    clr.matrix(as_pixels=True, join=False, balance=True).fetch(*region)
    clr.matrix(as_pixels=True, join=True, balance=False).fetch(*region)
    clr.matrix(as_pixels=True, join=True, balance=True).fetch(*region)
    clr.matrix(as_pixels=True, join=True, balance="weight").fetch(*region)
    clr.matrix(
        as_pixels=True, join=True, balance="weight", divisive_weights=True
    ).fetch(*region)

    # Unbalanced and asymmetric cooler
    clr = api.Cooler(op.join(datadir, "toy.asymm.2.cool"))
    region = ("chr2", "chr1:2-24")
    # numpy array
    clr.matrix(balance=False).fetch(*region)
    with pytest.raises(ValueError):
        clr.matrix(balance=True).fetch(*region)
    # sparse coo_matrix
    clr.matrix(sparse=True, balance=False).fetch(*region)
    with pytest.raises(ValueError):
        clr.matrix(sparse=True, balance=True).fetch(*region)
    # dataframe
    clr.matrix(as_pixels=True, join=False, balance=False).fetch(*region)
    with pytest.raises(ValueError):
        clr.matrix(
            as_pixels=True, join=True, balance="weight", divisive_weights=True
        ).fetch(*region)


def test_cooler_class(mock_cooler):
    """Exercise the ``Cooler`` table selectors, offsets and matrix queries."""
    clr = api.Cooler(mock_cooler)
    assert clr.shape == (20, 20)

    # chrom table
    chrom_table = clr.chroms()[:]
    stored_chroms = mock_cooler["chroms"]
    assert chrom_table["name"].tolist() == stored_chroms["name"].astype("U").tolist()
    assert np.all(chrom_table["length"] == stored_chroms["length"])

    # bin table
    chr1_bins = clr.bins().fetch("chr1")
    for col in ("start", "end"):
        assert np.all(chr1_bins[col] == mock_cooler["bins"][col][0:10])

    # pixel table (result is not inspected; the query only has to succeed)
    clr.pixels().fetch("chr1")

    # offsets
    assert clr.offset("chr1") == 0
    assert clr.extent("chr1") == (0, 10)

    # 2D range queries as rectangular or triangular
    dense_upper = np.triu(clr.matrix(balance=False).fetch("chr2"))
    pix = clr.matrix(as_pixels=True, join=False, balance=False).fetch("chr2")
    # Shift the bin IDs by chr2's offset so they index into the chr2 block.
    chr2_start = clr.offset("chr2")
    rows = pix["bin1_id"] - chr2_start
    cols = pix["bin2_id"] - chr2_start
    rebuilt = sps.coo_matrix((pix["count"], (rows, cols)), (dense_upper.shape))
    rebuilt_upper = np.triu(rebuilt.toarray())
    assert np.all(dense_upper == rebuilt_upper)


def test_cooler_class2():
    """Check ``Cooler`` construction from a handle and a path, plus attributes."""
    path = op.join(datadir, "toy.symm.upper.2.cool")

    # Built from an already-open h5py file object.
    with h5py.File(path, "r") as h5:
        from_handle = api.Cooler(h5)
        repr(from_handle)
        assert from_handle.root == "/"
        assert from_handle.filename == path
        assert isinstance(from_handle.store, h5py.File)
        with from_handle.open("r"):
            pass

    # A URI pointing at a group that is not in the file is rejected.
    missing_uri = path + "::/does/not/exist"
    with pytest.raises(KeyError):
        api.Cooler(missing_uri)

    # Built from a file path; the private loaders are called without checking
    # what they return.
    clr = api.Cooler(path)
    for dset_name in ("indexes/chrom_offset", "indexes/bin1_offset"):
        clr._load_dset(dset_name)
    clr._load_attrs("bins/chrom")

    with clr.open("r"):
        pass

    assert clr.storage_mode == "symmetric-upper"
    assert clr.binsize == 2
    assert len(clr.chromsizes) == 2
    assert clr.info["nchroms"] == 2
    assert clr.chromnames == ["chr1", "chr2"]
    expected_repr = '<Cooler "{}::{}">'.format("toy.symm.upper.2.cool", "/")
    assert repr(clr) == expected_repr