File: 03-DelayedArray_HDF5Array_update.Rnw

package info (click to toggle)
r-bioc-delayedarray 0.24.0%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 1,480 kB
  • sloc: ansic: 727; makefile: 2
file content (216 lines) | stat: -rw-r--r-- 6,085 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
%\VignetteEngine{knitr::knitr}
%\VignetteIndexEntry{DelayedArray / HDF5Array update}
%\VignetteDepends{knitr,Matrix,DelayedArray,HDF5Array,lobstr}

% 2019-12-22: A temporary fix to avoid the following pdflatex error caused by
% an issue in LaTeX package filehook-scrlfile (used by beamer):
%   ! Package filehook Error: Detected unknown definition of \InputIfFileExists.
%   Use the 'force' option of 'filehook' to overwrite it..
% The error appeared on tokay2 in Dec 2019 after reinstalling MiKTeX 2.9.
% See comment by Phelype Oleinik here for the fix:
%   https://tex.stackexchange.com/questions/512189/problem-with-chemmacros-beamer-and-filehook-scrlfile-sty
\PassOptionsToPackage{force}{filehook}

\documentclass[8pt]{beamer}

\mode<presentation> {
\usetheme{Madrid}
\usecolortheme{crane}
}

\usepackage{slides}
\renewcommand\Rclass[1]{{\texttt{#1}\index{#1 (class)}}}

\AtBeginSection[]
{
  \begin{frame}<beamer>
    \tableofcontents[currentsection]
  \end{frame}
}

\title{DelayedArray / HDF5Array update}

\author{Herv\'e Pag\`es}

\institute{Fred Hutch, Seattle}

\date{April 2021}

\begin{document}

<<setup, include=FALSE>>=
# Hidden setup chunk (include=FALSE): configures knitr and the R session
# before any slide content is evaluated.

# Attach knitr so that opts_chunk can be referenced directly below.
library(knitr)
# Set the default 'size' chunk option for every chunk in this document.
opts_chunk$set(size="scriptsize")
# Line width used by R when printing output.
options(width=80)
# Attach the remaining packages listed in \VignetteDepends.
library(Matrix)
library(DelayedArray)
library(HDF5Array)
library(lobstr)
@

\maketitle

\frame{\tableofcontents}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Recent additions to package DelayedArray}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}[fragile]
  \frametitle{ConstantArray objects (by Aaron)}

  \begin{block}{}
  This would ordinarily take up 8 TB of memory:
<<ConstantArray>>=
library(DelayedArray)
CM <- ConstantArray(c(1e6, 1e6), value=NA_real_)
CM
lobstr::obj_size(CM)
@
  \end{block}
\end{frame}

\begin{frame}[fragile]
  \frametitle{sinkApply()}

  \begin{block}{}
    \Rcode{sinkApply()}: a convenience function for walking on a
    \Rcode{RealizationSink} derivative for the purpose of filling
    it with blocks of data
  \end{block}

  \bigskip

  \begin{block}{}
    Example: Fill a 1e6 x 1e6 on-disk matrix with random data
    \begin{knitrout}\scriptsize
    \definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}
    \color{fgcolor}
    \begin{kframe}
\begin{verbatim}
sink <- HDF5RealizationSink(c(1e6L, 1e6L))  # or TileDBRealizationSink

sink_grid <- defaultSinkAutoGrid(sink)

FUN <- function(sink, viewport) {
    block <- array(runif(length(viewport)), dim=dim(viewport))
    write_block(sink, viewport, block)
}

sink <- sinkApply(sink, FUN, grid=sink_grid)

close(sink)
M <- as(sink, "DelayedArray")
\end{verbatim}
    \end{kframe}
    \end{knitrout}
  \end{block}
\end{frame}

\begin{frame}[fragile]
  \frametitle{rbind(), cbind(), and sparsity}

  \begin{block}{}
    \Rcode{rbind()} and \Rcode{cbind()} of \Rcode{DelayedArray} objects
    now propagate sparsity
  \end{block}

  \bigskip

  \begin{block}{}
    \begin{knitrout}\scriptsize
    \definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}
    \color{fgcolor}
    \begin{kframe}
\begin{verbatim}
tenx1 <- HDF5Array::TENxMatrix("tenx1.h5")  # is_sparse(tenx1) is TRUE
tenx2 <- HDF5Array::TENxMatrix("tenx2.h5")  # is_sparse(tenx2) is TRUE

bigtenx <- cbind(tenx1, tenx2)
is_sparse(bigtenx)  # TRUE

blockApply(bigtenx, FUN, ...)   # will take advantage of sparsity
\end{verbatim}
    \end{kframe}
    \end{knitrout}
  \end{block}
\end{frame}



%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Recent additions to package HDF5Array}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}[fragile]
  \frametitle{Recent additions to package HDF5Array}

  \begin{block}{}
    \Rcode{HDF5Array()}: can now take a URL to a file on Amazon S3
    (kind of slow!)
  \end{block}

  \bigskip

  \begin{block}{}
    \Rcode{H5SparseMatrix}: a \Rcode{DelayedMatrix} subclass for
    representing and operating on an HDF5 sparse matrix stored
    in CSR/CSC/Yale format (e.g. 10x Genomics and h5ad formats)
  \end{block}
\end{frame}



%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Work in progress and future work}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}[fragile]
  \frametitle{Work in progress and future work}

  Work in progress:

  \begin{block}{}
    \Rcode{h5summarize(..., op="sum")}: Optimized summarization of
    an HDF5 dataset or subset:
    \begin{itemize}
      \item Implemented in C (direct calls to HDF5 C lib in \Biocpkg{Rhdf5lib})
      \item Operates at the level of the physical chunks
      \item More efficient than \Rcode{blockApply()}
      \item Integration with \Biocpkg{DelayedArray}/\Biocpkg{DelayedMatrixStats}:
            \Rcode{h5summarize()} will be used behind the scenes by things
            like \Rcode{rowVars()}
    \end{itemize}
  \end{block}

  \bigskip

  Future work:

  \begin{block}{}
    \Rcode{SparseArray} objects: In-memory sparse representation of
    arrays of arbitrary dimensions
    \begin{itemize}
      \item Already used internally by block processing of sparse
            \Rcode{DelayedArray} objects (current name is
            \Rcode{SparseArraySeed})
      \item Will go to their own package (currently in \Rcode{DelayedArray})
      \item Implement fast native operations: arithmetic, \Rcode{Math}
            group (e.g. \Rcode{log}), summarization, etc.
            This will benefit block processing of sparse \Rcode{DelayedArray}
            objects
    \end{itemize}
  \end{block}
\end{frame}



\end{document}