File: test_harmony.py

package info (click to toggle)
harmonypy 0.0.10-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 2,092 kB
  • sloc: python: 393; makefile: 5
file content (79 lines) | stat: -rw-r--r-- 2,562 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# vprof -c p tests/test_harmony.py
from time import time

import numpy as np
import pandas as pd
from scipy.cluster.vq import kmeans2
from scipy.stats import pearsonr
import sys

import harmonypy as hm


def test_run_harmony():

    meta_data = pd.read_csv("data/pbmc_3500_meta.tsv.gz", sep="\t")
    data_mat = pd.read_csv("data/pbmc_3500_pcs.tsv.gz", sep="\t")

    start = time()
    ho = hm.run_harmony(data_mat, meta_data, ['donor'])
    end = time()
    print("{:.2f} seconds elapsed".format(end - start))

    res = pd.DataFrame(ho.Z_corr).T
    res.columns = ['PC{}'.format(i + 1) for i in range(res.shape[1])]
    # res.to_csv("data/pbmc_3500_pcs_harmonized_python.tsv.gz", sep = "\t", index = False)

    harm = pd.read_csv("data/pbmc_3500_pcs_harmonized.tsv.gz", sep="\t")

    cors = []
    for i in range(res.shape[1]):
        cors.append(pearsonr(res.iloc[:, i].values, harm.iloc[:, i].values))
    print([np.round(x[0], 3) for x in cors])

    # Correlation between test PCs and observed PCs is high
    assert np.all(np.array([x[0] for x in cors]) >= 0.9)


def test_random_seed():
    meta_data = pd.read_csv("data/pbmc_3500_meta.tsv.gz", sep="\t")
    data_mat = pd.read_csv("data/pbmc_3500_pcs.tsv.gz", sep="\t")

    def run(random_state):
        ho = hm.run_harmony(data_mat,
                            meta_data, ['donor'],
                            max_iter_harmony=2,
                            max_iter_kmeans=2,
                            random_state=random_state)
        return ho.Z_corr

    # Assert same results when random_state is set.
    np.testing.assert_allclose(run(42), run(42))

    # Assert different values when random_state is None. Absolute differences
    # in multiple runs are usually > 2000
    assert np.abs(run(None) - run(None)).sum() > 1000


def test_cluster_fn():
    meta_data = pd.read_csv("data/pbmc_3500_meta.tsv.gz", sep="\t")
    data_mat = pd.read_csv("data/pbmc_3500_pcs.tsv.gz", sep="\t")

    if sys.version_info.major == 3 and sys.version_info.minor == 6:
        return

    def cluster_fn(data, K):
        centroid, label = kmeans2(data, K, minit='++', seed=0)
        return centroid

    def run(cluster_fn):
        ho = hm.run_harmony(data_mat,
                            meta_data, ['donor'],
                            max_iter_harmony=2,
                            max_iter_kmeans=2,
                            cluster_fn=cluster_fn)
        return ho.Z_corr

    # Assert same results when random_state is set.
    np.testing.assert_equal(run(cluster_fn), run(cluster_fn))