File: writeHDF5Array.R

package info (click to toggle)
r-bioc-hdf5array 1.34.0-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 8,736 kB
  • sloc: ansic: 5,815; makefile: 4
file content (226 lines) | stat: -rw-r--r-- 8,715 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
### =========================================================================
### writeHDF5Array()
### -------------------------------------------------------------------------
###


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### HDF5RealizationSink objects
###
### The HDF5RealizationSink class is a concrete RealizationSink subclass that
### implements an HDF5Array realization sink.
###

setClass("HDF5RealizationSink",
    contains="RealizationSink",
    slots=c(
        ## Slots backing the RealizationSink constructor contract.
        dim="integer",              # Calling this slot "dim" is what makes
                                    # dim() work with no extra method.
        dimnames="list",
        type="character",           # A single string.
        as_sparse="logical",        # Either TRUE or FALSE.

        ## Slots specific to the HDF5 backend.
        filepath="character",       # A single string.
        name="character",           # Name of the dataset.
        chunkdim="integer_OR_NULL"  # Either NULL or an integer vector that
                                    # is parallel to the 'dim' slot.
    )
)

setMethod("dimnames", "HDF5RealizationSink",
    function(x)
    {
        ## The 'dimnames' slot is always a list. Return it only if at least
        ## one of its elements is not NULL; otherwise return NULL.
        dn <- x@dimnames
        if (!all(S4Vectors:::sapply_isNULL(dn)))
            return(dn)
        NULL
    }
)

setMethod("type", "HDF5RealizationSink",
    function(x)
    {
        ## Return the content of the 'type' slot as-is.
        x@type
    }
)

setMethod("chunkdim", "HDF5RealizationSink",
    function(x)
    {
        ## Return the content of the 'chunkdim' slot as-is (an integer
        ## vector or NULL, as declared in the class definition).
        x@chunkdim
    }
)

setMethod("is_sparse", "HDF5RealizationSink",
    function(x)
    {
        ## The sparsity flag is whatever was stored in the 'as_sparse' slot.
        x@as_sparse
    }
)

### Validate and normalize a user-supplied 'chunkdim' against 'dim', the
### dimensions of the object to write.
###
### Arguments:
###   chunkdim: a numeric vector, or a logical vector made of NAs only.
###             NA values mean "use the full extent of the corresponding
###             dimension".
###   dim:      the dimensions of the object to write.
###
### Returns 'chunkdim' as an integer vector parallel to 'dim' where the NAs
### have been replaced with the corresponding values in 'dim'.
###
### Raises an error if 'chunkdim' has the wrong type or length, contains
### negative values, exceeds 'dim', contains zeros that don't correspond to
### zero dimensions, or describes chunks with more than .Machine$integer.max
### elements.
.normarg_chunkdim <- function(chunkdim, dim)
{
    if (!(is.numeric(chunkdim) || is.logical(chunkdim) && all(is.na(chunkdim))))
        stop(wmsg("'chunkdim' must be NULL or an integer vector"))
    if (!is.integer(chunkdim))
        chunkdim <- as.integer(chunkdim)
    if (length(chunkdim) != length(dim))
        stop(wmsg("'chunkdim' must be an integer vector of length ",
                  "the number of dimensions of the object to write"))
    ## Negative values would otherwise go through all the checks below
    ## (they are always <= 'dim', are nonzero, and the product of an even
    ## number of them is positive), so reject them explicitly.
    if (any(chunkdim < 0L, na.rm=TRUE))
        stop(wmsg("'chunkdim' cannot contain negative values"))
    if (!all(chunkdim <= dim, na.rm=TRUE))
        stop(wmsg("the chunk dimensions specified in 'chunkdim' exceed ",
                  "the dimensions of the object to write"))
    if (any(chunkdim == 0L & dim != 0L, na.rm=TRUE))
        stop(wmsg("'chunkdim' must contain nonzero values unless ",
                  "the zero values correspond to dimensions in the ",
                  "object to write that are also zero"))
    ## Replace NAs with the full extent of the corresponding dimension.
    na_idx <- which(is.na(chunkdim))
    chunkdim[na_idx] <- dim[na_idx]
    ## prod() returns a double so the comparison below cannot overflow.
    if (prod(chunkdim) > .Machine$integer.max)
        stop(wmsg("The chunk dimensions in 'chunkdim' are too big. The ",
                  "product of the chunk dimensions should always be <= ",
                  ".Machine$integer.max"))
    chunkdim
}

### Note that the supplied 'as.sparse' value is stored in the 'as_sparse'
### slot of the returned object, and that's all. It doesn't change how the
### data will be laid out to the HDF5 file in any way (HDF5 doesn't support
### sparse storage at the moment). The only reason we store the supplied
### 'as.sparse' value in the object is so that we can propagate it later
### when we coerce the object to HDF5ArraySeed.
### Unlike with rhdf5::h5createDataset(), if 'chunkdim' is NULL then an
### automatic chunk geometry will be used. To write "unchunked data" (a.k.a.
### contiguous data), 'chunkdim' must be set to 0.
HDF5RealizationSink <- function(dim, dimnames=NULL, type="double",
                                as.sparse=FALSE,
                                filepath=NULL, name=NULL,
                                H5type=NULL, size=NULL,
                                chunkdim=NULL, level=NULL)
{
    if (!isTRUEorFALSE(as.sparse))
        stop(wmsg("'as.sparse' must be TRUE or FALSE"))

    ## Destination file: fall back on the dump file when none is supplied.
    filepath <- if (is.null(filepath)) {
        getHDF5DumpFile()
    } else {
        normalize_dump_filepath(filepath)
    }

    ## Dataset name: fall back on the dump name when none is supplied.
    name <- if (is.null(name)) {
        getHDF5DumpName(for.use=TRUE)
    } else {
        normalize_dump_name(name)
    }

    ## Chunk geometry. Three cases: automatic (NULL), no chunking (0), or
    ## user-supplied chunk dimensions that need to be validated.
    if (!is.null(chunkdim)) {
        if (isSingleNumber(chunkdim) && chunkdim == 0) {
            chunkdim <- NULL  # no chunking
        } else {
            chunkdim <- .normarg_chunkdim(chunkdim, dim)
        }
    } else {
        ## TODO: Pass 'x' instead of 'dim' to getHDF5DumpChunkDim() and modify
        ## getHDF5DumpChunkDim() to return 'chunkdim(x)' if it's not NULL.
        ## See TODO comment in dump-management.R
        chunkdim <- getHDF5DumpChunkDim(dim)
    }

    ## Compression level. When not supplied, level 0 is used if 'chunkdim'
    ## ended up being NULL, and the dump compression level otherwise.
    if (!is.null(level)) {
        level <- normalize_compression_level(level)
    } else if (is.null(chunkdim)) {
        level <- 0L
    } else {
        level <- getHDF5DumpCompressionLevel()
    }

    ## The dataset must be created before the dimnames get written.
    create_and_log_HDF5_dataset(filepath, name, dim,
                                type=type, H5type=H5type, size=size,
                                chunkdim=chunkdim, level=level)
    if (!is.null(dimnames)) {
        h5writeDimnames(dimnames, filepath, name)
    } else {
        ## The 'dimnames' slot is a list so represent "no dimnames" as a
        ## list of NULLs with one element per dimension.
        dimnames <- vector("list", length(dim))
    }

    new2("HDF5RealizationSink", dim=dim, dimnames=dimnames, type=type,
                                as_sparse=as.sparse,
                                filepath=filepath, name=name,
                                chunkdim=chunkdim)
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Writing data to an HDF5RealizationSink object
###

setMethod("write_block", "HDF5RealizationSink",
    function(sink, viewport, block)
    {
        ## Blocks that are not ordinary arrays are turned into one first.
        data <- if (is.array(block)) block else as.array(block)
        ## Write the block to the region of the dataset delimited by the
        ## viewport, then return the sink itself.
        h5write(data, sink@filepath, sink@name,
                start=start(viewport),
                count=width(viewport))
        sink
    }
)


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Coercing an HDF5RealizationSink object
###

setAs(from="HDF5RealizationSink", to="HDF5ArraySeed",
    def=function(from)
    {
        ## Build a seed on the sink's dataset and propagate the sparsity
        ## flag stored in the sink.
        HDF5ArraySeed(from@filepath, from@name, as.sparse=from@as_sparse)
    }
)

setAs(from="HDF5RealizationSink", to="HDF5Array",
    def=function(from)
    {
        ## Go through the HDF5ArraySeed coercion, then wrap the seed.
        seed <- as(from, "HDF5ArraySeed")
        DelayedArray(seed)
    }
)

setAs(from="HDF5RealizationSink", to="DelayedArray",
    def=function(from)
    {
        ## Go through the HDF5ArraySeed coercion, then wrap the seed.
        seed <- as(from, "HDF5ArraySeed")
        DelayedArray(seed)
    }
)


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### writeHDF5Array()
###

### If 'filepath' and 'name' are NULL (the default), write the dataset to
### the current dump.
### If 'chunkdim' is NULL, an automatic chunk geometry will be used.
### To write "unchunked data" (a.k.a. contiguous data), 'chunkdim' must be
### set to 0.
### Return an HDF5Array object pointing to the newly written HDF5 dataset
### on disk.
writeHDF5Array <- function(x, filepath=NULL, name=NULL,
                              H5type=NULL, chunkdim=NULL, level=NULL,
                              as.sparse=NA,
                              with.dimnames=TRUE, verbose=NA)
{
    ## Check the arguments that are not passed down to the sink
    ## constructor as-is.
    if (!is.logical(as.sparse) || length(as.sparse) != 1L)
        stop(wmsg("'as.sparse' must be NA, TRUE or FALSE"))
    if (!isTRUEorFALSE(with.dimnames))
        stop("'with.dimnames' must be TRUE or FALSE")
    verbose <- DelayedArray:::normarg_verbose(verbose)

    ## An NA 'as.sparse' means "use the sparsity of 'x'".
    if (is.na(as.sparse))
        as.sparse <- is_sparse(x)

    if (with.dimnames) {
        sink_dimnames <- dimnames(x)
    } else {
        sink_dimnames <- NULL
    }

    ## compute_max_string_size() will trigger block processing if 'x' is a
    ## DelayedArray object of type "character", so it could take a while.
    max_size <- compute_max_string_size(x)

    ## Create the sink, fill it with the content of 'x', then turn it into
    ## an HDF5Array object.
    empty_sink <- HDF5RealizationSink(dim(x), sink_dimnames, type(x),
                                      as.sparse,
                                      filepath=filepath, name=name,
                                      H5type=H5type, size=max_size,
                                      chunkdim=chunkdim, level=level)
    filled_sink <- BLOCK_write_to_sink(empty_sink, x, verbose=verbose)
    as(filled_sink, "HDF5Array")
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Coercion to HDF5Array
###
### The methods below write the object to disk. Note that coercion from
### HDF5RealizationSink to HDF5Array is already taken care of by the specific
### method above and doesn't write anything to disk. So coercing to HDF5Array
### in general writes the object to disk *except* when the object to coerce is
### an HDF5RealizationSink object.
###

### Write to current dump.
.as_HDF5Array <- function(from)
{
    ## Call writeHDF5Array() with all its optional arguments left at their
    ## default values.
    writeHDF5Array(from)
}

setAs(from="ANY", to="HDF5Array", def=.as_HDF5Array)

### Automatic coercion methods from DelayedArray to HDF5Array and from
### DelayedMatrix to HDF5Matrix silently return broken objects (unfortunately
### these dummy automatic coercion methods don't bother to validate the object
### they return). So we overwrite them.
setAs(from="DelayedArray", to="HDF5Array", def=.as_HDF5Array)
setAs(from="DelayedMatrix", to="HDF5Matrix", def=.as_HDF5Array)