File: cram_external.c

package info (click to toggle)
htslib 1.16%2Bds-3
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 9,804 kB
  • sloc: ansic: 59,004; perl: 1,727; makefile: 732; sh: 359
file content (412 lines) | stat: -rw-r--r-- 13,923 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
/*
Copyright (c) 2015, 2018-2020, 2022 Genome Research Ltd.
Author: James Bonfield <jkb@sanger.ac.uk>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

   1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.

   2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.

   3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
Institute nor the names of its contributors may be used to endorse or promote
products derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/*! \file
 * External CRAM interface.
 *
 * Internally we're happy to use macros and to grub around in the cram
 * structures.  This isn't very sustainable for an externally usable
 * ABI though, so we have anonymous structs and accessor functions too
 * to permit software such as samtools reheader to manipulate cram
 * containers and blocks in a robust manner.
 */

#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
#include <config.h>

#include "../htslib/hfile.h"
#include "cram.h"

/*
 *-----------------------------------------------------------------------------
 * cram_fd
 */
sam_hdr_t *cram_fd_get_header(cram_fd *fd) { return fd->header; }
void cram_fd_set_header(cram_fd *fd, sam_hdr_t *hdr) { fd->header = hdr; }

int cram_fd_get_version(cram_fd *fd) { return fd->version; }
void cram_fd_set_version(cram_fd *fd, int vers) { fd->version = vers; }

int cram_major_vers(cram_fd *fd) { return CRAM_MAJOR_VERS(fd->version); }
int cram_minor_vers(cram_fd *fd) { return CRAM_MINOR_VERS(fd->version); }

hFILE *cram_fd_get_fp(cram_fd *fd) { return fd->fp; }
void cram_fd_set_fp(cram_fd *fd, hFILE *fp) { fd->fp = fp; }


/*
 *-----------------------------------------------------------------------------
 * cram_container
 */
int32_t cram_container_get_length(cram_container *c) {
    return c->length;
}

void cram_container_set_length(cram_container *c, int32_t length) {
    c->length = length;
}


int32_t cram_container_get_num_blocks(cram_container *c) {
    return c->num_blocks;
}

void cram_container_set_num_blocks(cram_container *c, int32_t num_blocks) {
    c->num_blocks = num_blocks;
}


/* Returns the landmarks[] array and the number of elements
 * in num_landmarks.
 */
int32_t *cram_container_get_landmarks(cram_container *c, int32_t *num_landmarks) {
    *num_landmarks = c->num_landmarks;
    return c->landmark;
}

/* Sets the landmarks[] array (pointer copy, not a memory dup) and
 * num_landmarks value.
 */
void cram_container_set_landmarks(cram_container *c, int32_t num_landmarks,
                                  int32_t *landmarks) {
    c->num_landmarks = num_landmarks;
    c->landmark = landmarks;
}


/* Returns true if the container is empty (EOF marker) */
int cram_container_is_empty(cram_fd *fd) {
    return fd->empty_container;
}


/*
 *-----------------------------------------------------------------------------
 * cram_block_compression_hdr
 */

/*
 * Utility function to edit an RG id.
 * This is only possible if there is one single RG value used and it
 * is in the container compression header using HUFFMAN or BETA
 * codec.  In this case it is essentially hard coded and needs no
 * editing of external (or worse, CORE) blocks.
 *
 * Returns 0 on success
 *        -1 on failure
 */
// Or arbitrary set compression header constant?

static int cram_block_compression_hdr_set_DS(cram_block_compression_hdr *ch,
                                             int ds, int new_rg) {
    if (!ch || !ch->codecs[ds])
        return -1;

    switch (ch->codecs[ds]->codec) {
    case E_HUFFMAN:
        if (ch->codecs[ds]->u.huffman.ncodes != 1)
            return -1;
        ch->codecs[ds]->u.huffman.codes[0].symbol = new_rg;
        return 0;

    case E_BETA:
        if (ch->codecs[ds]->u.beta.nbits != 0)
            return -1;
        ch->codecs[ds]->u.beta.offset = -new_rg;
        return 0;

    default:
        break;
    }

    return -1;
}

int cram_block_compression_hdr_set_rg(cram_block_compression_hdr *ch, int new_rg) {
    return cram_block_compression_hdr_set_DS(ch, DS_RG, new_rg);
}

/*
 * Converts a cram_block_compression_hdr struct used for decoding to
 * one used for encoding.  Maybe this should be a transparent
 * operation applied on-demand.
 *
 * Returns 0 on success
 *        -1 on failure
 */
int cram_block_compression_hdr_decoder2encoder(cram_fd *fd,
                                               cram_block_compression_hdr *ch) {
    int i;

    if (!ch)
        return -1;

    for (i = 0; i < DS_END; i++) {
        cram_codec *co = ch->codecs[i];
        if (!co)
            continue;

        if (-1 == cram_codec_decoder2encoder(fd, co))
            return -1;
    }

    return 0;
}

/*
 *-----------------------------------------------------------------------------
 * cram_slice
 */
int32_t cram_slice_hdr_get_num_blocks(cram_block_slice_hdr *hdr) {
    return hdr->num_blocks;
}

int cram_slice_hdr_get_embed_ref_id(cram_block_slice_hdr *h) {
    return h->ref_base_id;
}

void cram_slice_hdr_get_coords(cram_block_slice_hdr *h,
                               int *refid, hts_pos_t *start, hts_pos_t *span) {
    if (refid)
        *refid = h->ref_seq_id;
    if (start)
        *start = h->ref_seq_start;
    if (span)
        *span  = h->ref_seq_span;
}

/*
 *-----------------------------------------------------------------------------
 * cram_block
 */
int32_t cram_block_get_content_id(cram_block *b)  { return b->content_id; }
int32_t cram_block_get_comp_size(cram_block *b)   { return b->comp_size; }
int32_t cram_block_get_uncomp_size(cram_block *b) { return b->uncomp_size; }
int32_t cram_block_get_crc32(cram_block *b)       { return b->crc32; }
void *  cram_block_get_data(cram_block *b)        { return BLOCK_DATA(b); }
int32_t cram_block_get_size(cram_block *b)        { return BLOCK_SIZE(b); }
enum cram_content_type cram_block_get_content_type(cram_block *b) {
    return b->content_type;
}

void cram_block_set_content_id(cram_block *b, int32_t id) { b->content_id = id; }
void cram_block_set_comp_size(cram_block *b, int32_t size) { b->comp_size = size; }
void cram_block_set_uncomp_size(cram_block *b, int32_t size) { b->uncomp_size = size; }
void cram_block_set_crc32(cram_block *b, int32_t crc) { b->crc32 = crc; }
void cram_block_set_data(cram_block *b, void *data) { BLOCK_DATA(b) = data; }
void cram_block_set_size(cram_block *b, int32_t size) { BLOCK_SIZE(b) = size; }

int cram_block_append(cram_block *b, const void *data, int size) {
    BLOCK_APPEND(b, data, size);
    return 0;

 block_err:
    return -1;
}
void cram_block_update_size(cram_block *b) { BLOCK_UPLEN(b); }

// Offset is known as "size" internally, but it can be confusing.
size_t cram_block_get_offset(cram_block *b) { return BLOCK_SIZE(b); }
void cram_block_set_offset(cram_block *b, size_t offset) { BLOCK_SIZE(b) = offset; }


/*
 * Copies the blocks representing the next num_slice slices from a
 * container from 'in' to 'out'.  It is expected that the file pointer
 * is just after the read of the cram_container and cram compression
 * header.
 *
 * Returns 0 on success
 *        -1 on failure
 */
int cram_copy_slice(cram_fd *in, cram_fd *out, int32_t num_slice) {
    int32_t i, j;

    for (i = 0; i < num_slice; i++) {
        cram_block *blk;
        cram_block_slice_hdr *hdr;

        if (!(blk = cram_read_block(in)))
            return -1;
        if (!(hdr = cram_decode_slice_header(in, blk))) {
            cram_free_block(blk);
            return -1;
        }
        if (cram_write_block(out, blk) != 0) {
            cram_free_block(blk);
            return -1;
        }
        cram_free_block(blk);

        int num_blocks = cram_slice_hdr_get_num_blocks(hdr);
        for (j = 0; j < num_blocks; j++) {
            blk = cram_read_block(in);
            if (!blk || cram_write_block(out, blk) != 0) {
                if (blk) cram_free_block(blk);
                return -1;
            }
            cram_free_block(blk);
        }
        cram_free_slice_header(hdr);
    }

    return 0;
}

/*
 * Renumbers RG numbers in a cram compression header.
 *
 * CRAM stores RG as the Nth number in the header, rather than a
 * string holding the ID: tag.  This is smaller in space, but means
 * "samtools cat" to join files together that contain single but
 * different RG lines needs a way of renumbering them.
 *
 * The file descriptor is expected to be immediately after the
 * cram_container structure (ie before the cram compression header).
 * Due to the nature of the CRAM format, this needs to read and write
 * the blocks itself.  Note that there may be multiple slices within
 * the container, meaning multiple compression headers to manipulate.
 * Changing RG may change the size of the compression header and
 * therefore the length field in the container.  Hence we rewrite all
 * blocks just in case and also emit the adjusted container.
 *
 * The current implementation can only cope with renumbering a single
 * RG (and only then if it is using HUFFMAN or BETA codecs).  In
 * theory it *may* be possible to renumber multiple RGs if they use
 * HUFFMAN to the CORE block or use an external block unshared by any
 * other data series.  So we have an API that can be upgraded to
 * support this, but do not implement it for now.  An example
 * implementation of RG as an EXTERNAL block would be to find that
 * block and rewrite it, returning the number of blocks consumed.
 *
 * Returns 0 on success;
 *        -1 if unable to edit;
 *        -2 on other errors (eg I/O).
 */
int cram_transcode_rg(cram_fd *in, cram_fd *out,
                      cram_container *c,
                      int nrg, int *in_rg, int *out_rg) {
    int new_rg = *out_rg, old_size, new_size;
    cram_block *o_blk, *n_blk;
    cram_block_compression_hdr *ch;

    if (nrg != 1) {
        hts_log_error("CRAM transcode supports only a single RG");
        return -2;
    }

    // Produce a new block holding the updated compression header,
    // with RG transcoded to a new value. (Single only supported.)
    o_blk = cram_read_block(in);
    old_size = cram_block_size(o_blk);
    ch = cram_decode_compression_header(in, o_blk);
    if (cram_block_compression_hdr_set_rg(ch, new_rg) != 0)
        return -1;
    if (cram_block_compression_hdr_decoder2encoder(in, ch) != 0)
        return -1;
    n_blk = cram_encode_compression_header(in, c, ch, in->embed_ref);
    cram_free_compression_header(ch);

    /*
     * Warning: this has internal knowledge of the cram compression
     * header format.
     *
     * The decoder doesn't set c->tags_used, so the encoder puts a two
     * byte blank segment.  This means n_blk is too short.  We skip
     * through the decoded old block (o_blk) and copy from there.
     */
    char *cp = cram_block_get_data(o_blk);
    char *op = cp;
    char *endp = cp + cram_block_get_uncomp_size(o_blk);
    //fprintf(stderr, "sz = %d\n", (int)(endp-cp));
    int32_t i32, err = 0;

    i32 = in->vv.varint_get32(&cp, endp, &err);
    cp += i32;
    i32 = in->vv.varint_get32(&cp, endp, &err);
    cp += i32;
    op = cp;
    i32 = in->vv.varint_get32(&cp, endp, &err);
    i32 += (cp-op);
    if (err)
        return -2;

    //fprintf(stderr, "remaining %d bytes\n", i32);
    cram_block_set_size(n_blk, cram_block_get_size(n_blk)-2);
    cram_block_append(n_blk, op, i32);
    cram_block_update_size(n_blk);

    new_size = cram_block_size(n_blk);

    //fprintf(stderr, "size %d -> %d\n", old_size, new_size);

    // Now we've constructedthe updated compression header,
    // amend the container too (it may have changed size).
    int32_t *landmarks, num_landmarks;
    landmarks = cram_container_get_landmarks(c, &num_landmarks);

    if (old_size != new_size) {
        int diff = new_size - old_size, j;

        for (j = 0; j < num_landmarks; j++)
            landmarks[j] += diff;
        //cram_container_set_landmarks(c, num_landmarks, landmarks);
        cram_container_set_length(c, cram_container_get_length(c) + diff);
    }

    // Finally write it all out; container, compression header,
    // and then all the remaining slice blocks.
    if (cram_write_container(out, c) != 0)
        return -2;

    cram_write_block(out, n_blk);
    cram_free_block(o_blk);
    cram_free_block(n_blk);

    // Container num_blocks can be invalid, due to a bug.
    // Instead we iterate in slice context instead.
    return cram_copy_slice(in, out, num_landmarks);
}


/*!
 * Returns the refs_t structure used by a cram file handle.
 *
 * This may be used in conjunction with option CRAM_OPT_SHARED_REF to
 * share reference memory between multiple file handles.
 *
 * @return
 * Returns NULL if none exists or the file handle is not a CRAM file.
 */
refs_t *cram_get_refs(htsFile *fd) {
    return fd->format.format == cram
        ? fd->fp.cram->refs
        : NULL;
}