### File: benchmark_DelayedMatrix-mult.R

### ===========================================================================
### EXAMPLE 1
### On the importance of acknowledging the geometry of physical chunks

library(DelayedArray)
library(HDF5Array)

super_BLOCK_mult <- DelayedArray:::.super_BLOCK_mult
BLOCK_mult_Lgrid <- DelayedArray:::BLOCK_mult_Lgrid
BLOCK_mult_Rgrid <- DelayedArray:::BLOCK_mult_Rgrid

set.seed(2009)
m1 <- matrix(runif(15e6), ncol=1000)   # 15000 x 1000
m2 <- matrix(runif(4000), nrow=1000)   # 1000 x 4
if (!file.exists("M1.h5"))
    writeTENxMatrix(m1, "M1.h5", group="M1")   # create M1.h5 on first run
M1 <- TENxMatrix("M1.h5", group="M1")  # column-oriented!
M2 <- writeTENxMatrix(m2)

m0 <- m1 %*% m2

setAutoBlockSize(1e6)

## m1 %*% m2
res1 <- super_BLOCK_mult(m1, m2, MULT=`%*%`)
stopifnot(identical(as.matrix(res1), m0))
res2 <- BLOCK_mult_Lgrid(m1, m2)
stopifnot(all.equal(res2, m0))

## M1 %*% m2
res3 <- super_BLOCK_mult(M1, m2, MULT=`%*%`)         # 120 s
stopifnot(identical(as.matrix(res3), m0))
res4 <- BLOCK_mult_Lgrid(M1, m2)  # 1.58 s
stopifnot(all.equal(res4, m0))
### The difference lies in how super_BLOCK_mult() and
### BLOCK_mult_Lgrid() walk over M1. The former walks over
### blocks made of full rows while the latter walks over blocks made of
### full columns. For a column-oriented TENxMatrix object like M1,
### loading a block of full rows is bad because it actually loads the
### entire dataset into memory. In other words, super_BLOCK_mult() loads
### the entire dataset into memory for each block that it processes!
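
### To see why, compare the cost of loading a single block from each kind
### of grid on M1 (an illustrative check, not part of the original
### benchmark; rowAutoGrid(), colAutoGrid(), and read_block() are standard
### DelayedArray helpers, and the timings will vary from machine to machine):
full_row_grid <- rowAutoGrid(M1)  # blocks made of full rows
full_col_grid <- colAutoGrid(M1)  # blocks made of full cols
system.time(read_block(M1, full_row_grid[[1L]]))  # slow: touches every column
system.time(read_block(M1, full_col_grid[[1L]]))  # fast: touches a few columns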

## m1 %*% M2
res5 <- super_BLOCK_mult(m1, M2, MULT=`%*%`)
stopifnot(identical(as.matrix(res5), m0))
res6 <- BLOCK_mult_Rgrid(m1, M2)
stopifnot(all.equal(res6, m0))


### ===========================================================================
### EXAMPLE 2

### With a subset of the 1.3 Million Brain Cell Dataset
library(HDF5Array)
DelayedArray:::set_verbose_block_processing(TRUE)
library(ExperimentHub)
hub <- ExperimentHub()
tenx <- TENxMatrix(hub[["EH1039"]], group="mm10")

### 12.5k cols:
M <- tenx[ , 1:12500]
set.seed(2009)
m <- cbind(runif(ncol(M)), runif(ncol(M)))
system.time(res <- M %*% m)  # 4.3s / 1.1Gb (was 32s / 2.1Gb)

### 25k cols:
M <- tenx[ , 1:25000]
set.seed(2009)
m <- cbind(runif(ncol(M)), runif(ncol(M)))
system.time(res <- M %*% m)  # 7.6s / 1.1Gb (was 110s / 3.1Gb)

### 50k cols:
M <- tenx[ , 1:50000]
set.seed(2009)
m <- cbind(runif(ncol(M)), runif(ncol(M)))
system.time(res <- M %*% m)  # 13.4s / 1.1Gb  (was 495s / 5.6Gb)

### 100k cols:
M <- tenx[ , 1:100000]
set.seed(2009)
m <- cbind(runif(ncol(M)), runif(ncol(M)))
system.time(res <- M %*% m)  # 24s / 1.2Gb  (was 2409s / 9.1Gb)
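
### The same sweep expressed as a loop (an equivalent, illustrative rewrite
### of the four blocks above, left commented out to avoid doubling the
### runtime of the benchmark):
#for (n in c(12500, 25000, 50000, 100000)) {
#    M <- tenx[ , seq_len(n)]
#    set.seed(2009)
#    m <- cbind(runif(ncol(M)), runif(ncol(M)))
#    print(system.time(res <- M %*% m))
#}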


### ===========================================================================
### EXAMPLE 3

library(HDF5Array)
library(ExperimentHub)
hub <- ExperimentHub()
M <- TENxMatrix(hub[["EH1039"]], group="mm10") # 1.3 Million Brain Cell Dataset

### Find singular values with RSpectra::svds() (note that one would
### typically log-normalize the data before doing this, but we skip that
### step here to keep things as simple as possible):
library(RSpectra)
#DelayedArray:::set_verbose_block_processing(TRUE)
system.time(row_means <- rowMeans(M))  # 368s / 1.2Gb
### Ax and Atx multiply by the implicitly row-centered matrix M - row_means
### (and by its transpose) without ever materializing the centered matrix:
Ax <- function(x, args) {M %*% x - row_means * sum(x)}
Atx <- function(x, args) {x %*% M - as.vector(row_means %*% x)}
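### Quick sanity check of the implicit centering on a small dense subset
### (illustrative only; 'Msub', 'rm_sub', and 'x0' are new names):
Msub <- as.matrix(M[ , 1:50])
rm_sub <- rowMeans(Msub)
x0 <- runif(ncol(Msub))
stopifnot(all.equal(Msub %*% x0 - rm_sub * sum(x0), (Msub - rm_sub) %*% x0))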
### Will take a while, but memory usage will stay under 2Gb!
svd <- RSpectra::svds(Ax, Atrans=Atx, k=2, dim=dim(M))
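### svd is a list in RSpectra's usual svds() format; for example:
#svd$d  # the k=2 leading singular values (svds() also returns $u and $v)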


### ===========================================================================
### EXAMPLE 4

library(DelayedArray)
library(HDF5Array)
super_BLOCK_mult <- DelayedArray:::.super_BLOCK_mult
BLOCK_mult_Lgrid <- DelayedArray:::BLOCK_mult_Lgrid
DelayedArray:::set_verbose_block_processing(TRUE)
#setAutoBlockSize(1e6)
setAutoBlockSize(1e7)
setAutoBPPARAM(BiocParallel::SnowParam(2))

set.seed(2009)
m3 <- matrix(runif(21e6), ncol=600) # 35000 x 600
set.seed(1972)
m4 <- matrix(runif(9e6), nrow=600)  # 600 x 15000
#M3 <- writeHDF5Array(m3, "M3.h5", name="M3", chunkdim=c(50, 50))
#M3 <- HDF5Array("M3.h5", name="M3")

m34 <- m3 %*% m4  # 168s / 4.5g

## m3 %*% m4
res1 <- super_BLOCK_mult(m3, m4, MULT=`%*%`)
# block size = 1e6 -> 113s / 8.5g
# block size = 1e7 -> 107s / 8.4g
# block size = 1e7; SnowParam(2) -> 73s / 8.5g
# block size = 1e7; SnowParam(4) -> 62s / 8.5g (4 workers used 11g!)
stopifnot(identical(as.matrix(res1), m34))
res2 <- BLOCK_mult_Lgrid(m3, m4)
# block size = 1e6 -> 118s / 8.7g
# block size = 1e7 -> 112s / 8.5g
# block size = 1e7; SnowParam(2) -> 76s / 8.5g
# block size = 1e7; SnowParam(4) -> 64s / 8.5g (4 workers used 16g!)
stopifnot(all.equal(res2, m34))

## m3 %*% m4 but now writing the result to disk
setAutoRealizationBackend("HDF5Array")
res3 <- super_BLOCK_mult(m3, m4, MULT=`%*%`)
# block size = 1e6 -> 206s / 9.2g
# block size = 1e7 -> 201s / 9.2g
stopifnot(identical(as.matrix(res3), m34))
res4 <- BLOCK_mult_Lgrid(m3, m4)
# block size = 1e6 -> 213s / 1g
# block size = 1e7 -> 202s / 1.9g
res4b <- realize(res4)
# block size = 1e6 -> 287s / 1.2g
# block size = 1e7 -> 141s / 1.5g
stopifnot(all.equal(res4b, m34))

## And what if we also parallelize?
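### A possible follow-up (untested sketch, timings unknown): combine the
### on-disk realization backend with a parallel backend and rerun the same
### two products ('res5' and 'res6' are new names):
#setAutoRealizationBackend("HDF5Array")
#setAutoBPPARAM(BiocParallel::SnowParam(2))
#res5 <- super_BLOCK_mult(m3, m4, MULT=`%*%`)
#stopifnot(identical(as.matrix(res5), m34))
#res6 <- realize(BLOCK_mult_Lgrid(m3, m4))
#stopifnot(all.equal(res6, m34))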