%\VignetteEngine{knitr::knitr}
%\VignetteIndexEntry{DelayedArray / HDF5Array update}
%\VignetteDepends{knitr,Matrix,DelayedArray,HDF5Array,lobstr}
% 2019-12-22: A temporary fix to avoid the following pdflatex error caused by
% an issue in LaTeX package filehook-scrlfile (used by beamer):
% ! Package filehook Error: Detected unknown definition of \InputIfFileExists.
% Use the 'force' option of 'filehook' to overwrite it..
% The error appeared on tokay2 in Dec 2019 after reinstalling MiKTeX 2.9.
% See comment by Phelype Oleinik here for the fix:
% https://tex.stackexchange.com/questions/512189/problem-with-chemmacros-beamer-and-filehook-scrlfile-sty
\PassOptionsToPackage{force}{filehook}
\documentclass[8pt]{beamer}
\mode<presentation> {
\usetheme{Madrid}
\usecolortheme{crane}
}
\usepackage{slides}
\renewcommand\Rclass[1]{{\texttt{#1}\index{#1 (class)}}}
\AtBeginSection[]
{
\begin{frame}<beamer>
\tableofcontents[currentsection]
\end{frame}
}
\title{DelayedArray / HDF5Array update}
\author{Herv\'e Pag\`es}
\institute{Fred Hutch, Seattle}
\date{April 2021}
\begin{document}
<<setup, include=FALSE>>=
library(knitr)
opts_chunk$set(size="scriptsize")
options(width=80)
library(Matrix)
library(DelayedArray)
library(HDF5Array)
library(lobstr)
@
\maketitle
\frame{\tableofcontents}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Recent additions to package DelayedArray}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]
\frametitle{ConstantArray objects (by Aaron)}
\begin{block}{}
This would ordinarily take up 8 TB of memory:
<<ConstantArray>>=
library(DelayedArray)
CM <- ConstantArray(c(1e6, 1e6), value=NA_real_)
CM
lobstr::obj_size(CM)
@
\end{block}
\end{frame}
\begin{frame}[fragile]
\frametitle{sinkApply()}
\begin{block}{}
\Rcode{sinkApply()}: a convenience function for walking on a
\Rcode{RealizationSink} derivative for the purpose of filling
it with blocks of data
\end{block}
\bigskip
\begin{block}{}
Example: Fill a 1e6 x 1e6 on-disk matrix with random data
\begin{knitrout}\scriptsize
\definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}
\color{fgcolor}
\begin{kframe}
\begin{verbatim}
sink <- HDF5RealizationSink(c(1e6L, 1e6L)) # or TileDBRealizationSink
sink_grid <- defaultSinkAutoGrid(sink)
FUN <- function(sink, viewport) {
block <- array(runif(length(viewport)), dim=dim(viewport))
write_block(sink, viewport, block)
}
sink <- sinkApply(sink, FUN, grid=sink_grid)
close(sink)
M <- as(sink, "DelayedArray")
\end{verbatim}
\end{kframe}
\end{knitrout}
\end{block}
\end{frame}
\begin{frame}[fragile]
\frametitle{rbind(), cbind(), and sparsity}
\begin{block}{}
\Rcode{rbind()} and \Rcode{cbind()} of \Rcode{DelayedArray} objects
now propagate sparsity
\end{block}
\bigskip
\begin{block}{}
\begin{knitrout}\scriptsize
\definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}
\color{fgcolor}
\begin{kframe}
\begin{verbatim}
tenx1 <- HDF5Array::TENxMatrix("tenx1.h5") # is_sparse(tenx1) is TRUE
tenx2 <- HDF5Array::TENxMatrix("tenx2.h5") # is_sparse(tenx2) is TRUE
bigtenx <- cbind(tenx1, tenx2)
is_sparse(bigtenx) # TRUE
blockApply(bigtenx, FUN, ...) # will take advantage of sparsity
\end{verbatim}
\end{kframe}
\end{knitrout}
\end{block}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Recent additions to package HDF5Array}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]
\frametitle{Recent additions to package HDF5Array}
\begin{block}{}
\Rcode{HDF5Array()}: can now take a URL to a file on Amazon S3
(kind of slow!)
\end{block}
\bigskip
\begin{block}{}
\Rcode{H5SparseMatrix}: a \Rcode{DelayedMatrix} subclass for
representing and operating on an HDF5 sparse matrix stored
in CSR/CSC/Yale format (e.g.\ 10x Genomics and h5ad formats)
\end{block}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Work in progress and future work}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]
\frametitle{Work in progress and future work}
Work in progress:
\begin{block}{}
\Rcode{h5summarize(..., op="sum")}: Optimized summarization of
an HDF5 dataset or subset:
\begin{itemize}
\item Implemented in C (direct calls to HDF5 C lib in \Biocpkg{Rhdf5lib})
\item Operates at the level of the physical chunks
\item More efficient than \Rcode{blockApply()}
\item Integration with \Biocpkg{DelayedArray}/\Biocpkg{DelayedMatrixStats}:
\Rcode{h5summarize()} will be used behind the scenes by things
like \Rcode{rowVars()}
\end{itemize}
\end{block}
\bigskip
Future work:
\begin{block}{}
\Rcode{SparseArray} objects: In-memory sparse representation of
arrays of arbitrary dimensions
\begin{itemize}
\item Already used internally by block processing of sparse
\Rcode{DelayedArray} objects (current name is
\Rcode{SparseArraySeed})
\item Will move to their own package (currently in \Biocpkg{DelayedArray})
\item Implement fast native operations: arithmetic, \Rcode{Math}
group (e.g.\ \Rcode{log}), summarization, etc.
This will benefit block processing of sparse \Rcode{DelayedArray}
objects
\end{itemize}
\end{block}
\end{frame}
\end{document}