1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563
|
/*
New style BAM reader. This file is part of Sambamba.
Copyright (C) 2017 Pjotr Prins <pjotr.prins@thebird.nl>
Sambamba is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published
by the Free Software Foundation; either version 2 of the License,
or (at your option) any later version.
Sambamba is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA
*/
// This is a complete rewrite of Artem Tarasov's original reader.
module bio.std.experimental.hts.bam.reader;
import std.conv;
import core.stdc.stdio: fopen, fread, fclose;
import std.exception;
import std.file;
import std.format : format;
import std.stdio;
import std.string;
import std.typecons;
import std.bitmanip;
//TODO remove these dependecies
import bio.std.hts.bam.cigar;
import bio.std.hts.bam.constants;
import bio.core.utils.exception;
import bio.std.experimental.hts.bgzf;
import bio.std.experimental.hts.constants;
import bio.std.experimental.hts.bam.header;
template ReadFlags(alias flag) {
// 0x01: template having multiple segments in sequencing
@property bool is_paired() nothrow { return cast(bool)(flag & 0x1); }
// 0x2: Each segment properly aligned according to the aligner
@property bool is_proper_pair() nothrow { return cast(bool)(flag & 0x2); }
// 0x4: Segment unmapped
@property bool is_unmapped_raw() nothrow { return cast(bool)(flag & 0x4); }
@property bool is_mapped_raw() nothrow { return cast(bool)(!(flag & 0x4)); }
// 0x8: Next segment in template unmapped
@property bool mate_is_unmapped() nothrow { return cast(bool)(flag & 0x8); }
// 0x10: SEQ being reverse complimented
@property bool is_reverse_strand() nothrow { return cast(bool)(flag & 0x10); }
// 0x20: SEQ of the next segment in the template being reverse complemented
@property bool mate_is_reverse_strand() nothrow { return cast(bool)(flag & 0x20); }
// 0x40: The first segment in the template
@property bool is_first_of_pair() nothrow { return cast(bool)(flag & 0x40); }
// 0x80: The last segment in the template
@property bool is_second_of_pair() nothrow { return cast(bool)(flag & 0x80); }
// 0x100: Secondary segment
@property bool is_secondary_alignment() nothrow { return cast(bool)(flag & 0x100); }
// 0x200: Not passing filters, such as platform/vendor quality controls
@property bool is_qc_fail() {
assert(is_mapped_raw,to!string(this));
return cast(bool)(flag & 0x200); }
alias is_qc_fail failed_quality_control;
/// 0x400: PCR or optical duplicate
@property bool is_duplicate() nothrow { return cast(bool)(flag & 0x400); }
/// 0x800: Supplementary alignment
@property bool is_supplementary() nothrow { return cast(bool)(flag & 0x800); }
@property string show_flags() {
string res = format("b%b-%d",flag,flag);
if (is_paired) res ~= " pair";
if (is_proper_pair) res ~= " proper";
if (is_mapped_raw) res ~= " mapped";
if (is_unmapped_raw) res ~= " unmapped";
if (mate_is_unmapped) res ~= " mate_unmapped";
if (is_reverse_strand) res ~= " rev_strand";
if (mate_is_reverse_strand) res ~= " mate_rev_strand";
if (is_first_of_pair) res ~= " first_of_pair";
if (is_second_of_pair) res ~= " second_of_pair";
if (is_secondary_alignment) res ~= " secondary_aln";
if (is_mapped_raw && is_qc_fail) res ~= " qc_fail";
if (is_duplicate) res ~= " duplicate";
if (is_supplementary) res ~= " suppl";
return res;
}
}
template CheckMapped(alias refid) {
@property nothrow bool is_unmapped() {
return is_unmapped_raw;
}
@property bool is_mapped() {
debug {
if (is_mapped_raw) {
assert(refid != -1, "ref_id can not be -1 for mapped read"); // BAM spec
}
}
return !is_unmapped_raw;
}
}
enum Offset {
bin_mq_nl=0, flag_nc=4, flag=6, l_seq=8, next_refID=12, next_pos=16, tlen=20, read_name=24
};
/**
Raw Read buffer containing unparsed data. It should be considered
read-only.
Current implementation is a cluct (class-struct hybrid). The _data
pointer is shared when ReadBlob is assigned to another variable
(i.e., there is a remote dependency). The advantage is that for
each Read data gets allocated on the heap only once.
All offsets are indexed on init (except for tags). When using
fields beyond refid,pos use ProcessReadBlob instead because it
caches values.
*/
struct ReadBlob {
RefId refid; // -1 is invalid (BAM Spec)
GenomePos pos; // 0 coordinate based (BAM spec)
private ubyte[] _data;
uint offset_cigar=int.max, offset_seq=int.max, offset_qual=int.max;
mixin ReadFlags!(_flag);
mixin CheckMapped!(refid);
/*
this(RefId ref_id, GenomePos read_pos, ubyte[] buf) {
refid = ref_id;
pos = read_pos;
_data = buf;
}
*/
@property void cleanup() {
destroy(_data);
_data = null;
}
// Turn ReadBlob into class-struct hybrid or a cluct ;)
// @disable this(this); // disable copy semantics;
@property @trusted nothrow private const T fetch(T)(uint raw_offset) {
ubyte[] buf = cast(ubyte[])_data[raw_offset..raw_offset+T.sizeof];
return cast(const(T))buf.read!(T,Endian.littleEndian)();
}
@property @trusted nothrow private const
uint _bin_mq_nl() { return fetch!uint(Offset.bin_mq_nl); }
@property @trusted nothrow private const
uint _flag_nc() { return fetch!uint(Offset.flag_nc); }
@property @trusted nothrow private const
int sequence_length() { return fetch!int(Offset.l_seq); }
@property @trusted nothrow private const
int _next_refID() { return fetch!int(Offset.next_refID); }
@property @trusted nothrow private const
int _next_pos() { return fetch!int(Offset.next_pos); }
@property @trusted nothrow private const
int _tlen() { return fetch!int(Offset.tlen); } // avoid using TLEN
@property @trusted nothrow private const
ushort _bin() { return _bin_mq_nl >> 16; }
@property @trusted nothrow private const
ubyte _mapq() { return (_bin_mq_nl >> 8) & 0xFF; }
@property @trusted nothrow private const
ubyte _l_read_name() { return _bin_mq_nl & 0xFF; }
@property @trusted nothrow private const
ushort _flag() { return fetch!ushort(Offset.flag); }
@property @trusted nothrow private const
ushort _n_cigar_op() { return _flag_nc & 0xFFFF; }
@property @trusted nothrow private const
uint _read_name_offset() { return Offset.read_name; }
@property @trusted nothrow private
uint _cigar_offset() {
if (offset_cigar == int.max)
offset_cigar = Offset.read_name + cast(uint)(_l_read_name * char.sizeof);
return offset_cigar;
}
@property @trusted nothrow private
uint _seq_offset() {
if (offset_seq == int.max)
offset_seq = _cigar_offset + cast(uint)(_n_cigar_op * uint.sizeof);
return offset_seq;
}
@property @trusted nothrow private
uint _qual_offset() {
if (offset_qual == int.max)
offset_qual = _seq_offset + (sequence_length + 1)/2;
return offset_qual;
}
@property @trusted nothrow private
uint _tags_offset() { return _qual_offset + sequence_length; }
@property @trusted nothrow private
ubyte[] read_name() { return _data[_read_name_offset.._cigar_offset]; }
@property @trusted nothrow private
ubyte[] raw_cigar() { return _data[_cigar_offset.._seq_offset]; }
@property @trusted nothrow private
ubyte[] raw_sequence() { return _data[_seq_offset.._qual_offset]; }
alias sequence_length _l_seq;
alias _mapq mapping_quality; // MAPQ
string toString() {
return "<** " ~ ReadBlob.stringof ~ " (data size " ~ to!string(_data.length) ~ ") " ~ to!string(refid) ~ ":" ~ to!string(pos) ~ " length " ~ to!string(sequence_length) ~ " flags " ~ show_flags() ~ ">";
}
}
/**
ProcessReadBlob provides a caching mechanism for ReadBlob fields. Use
this when you need to access field/elements multiple times. Note
that ProcessReadBlob becomes invalid when ReadBlob goes out of scope.
*/
struct ProcessReadBlob {
private Nullable!ReadBlob _read2;
Nullable!int sequence_length2;
private Nullable!string sequence2, read_name2;
private Nullable!CigarOperations cigar2;
private Nullable!GenomePos consumed_reference_bases2;
mixin ReadFlags!(_flag);
mixin CheckMapped!(ref_id);
this(Nullable!ReadBlob _r) {
_read2 = _r;
}
@property nothrow ReadBlob read2() {
return _read2.get;
}
@property void cleanup() {
read2.cleanup;
}
@property nothrow bool isNull() {
return _read2.isNull;
}
@property RefId ref_id() {
enforce(_read2.get.is_mapped,"Trying to get ref_id an unmapped read " ~ to!string(_read2));
return read2.refid;
}
@property RefId raw_ref_id() {
return read2.refid;
}
@property nothrow uint _flag_nc() {
return read2._flag_nc;
}
@property nothrow ushort _flag() {
return read2._flag;
}
alias ref_id refid;
/// Get the start position on the reference sequence (better use
/// start_loc), i.e., the first base that gets consumed in the
/// CIGAR.
@property GenomePos start_pos() {
assert(read2.is_mapped,"Trying to get pos on an unmapped read"); // BAM spec
asserte(read2.pos < GenomePos.max);
return cast(GenomePos)read2.pos;
}
@property GenomePos raw_start_pos() {
return cast(GenomePos)read2.pos;
}
/// Get the end position on the reference sequence (better use end_loc)
@property GenomePos end_pos() {
assert(sequence_length > 0, "Trying to get end_pos on an empty read sequence");
assert(!consumed_reference_bases.isNull);
return start_pos + consumed_reference_bases.get;
}
@property GenomePos raw_end_pos() {
return raw_start_pos + consumed_reference_bases.get;
}
@property GenomeLocation start_loc() {
return GenomeLocation(ref_id,start_pos);
}
@property GenomeLocation end_loc() {
return GenomeLocation(ref_id,end_pos);
}
@property @trusted MappingQuality mapping_quality() { // MAPQ
assert(read2.is_mapped,"Trying to get MAPQ on an unmapped read"); // BAM spec
return MappingQuality(read2.mapping_quality);
}
@property @trusted int tlen() { // do not use
return read2._tlen;
}
@property @trusted GenomePos sequence_length() {
if (sequence_length2.isNull)
sequence_length2 = read2.sequence_length;
return sequence_length2.get;
}
/// Count and caches consumed reference bases. Uses raw_cigar to
/// avoid a heap allocation.
@property @trusted Nullable!GenomePos consumed_reference_bases() {
if (consumed_reference_bases2.isNull) {
assert(read2.is_mapped,"Trying to get consumed bases on an unmapped read"); // BAM spec
assert(!read_name.isNull,"Trying to get CIGAR on RNAME is '*'"); // BAM spec
auto raw = cast(uint[]) read2.raw_cigar();
if (raw.length==1 && raw[0] == '*')
return consumed_reference_bases2; // null
else {
GenomePos bases = 0;
for (size_t i = 0; i < raw.length; i++) {
auto cigarop = CigarOperation(raw[i]);
if (cigarop.is_reference_consuming)
bases += cigarop.length;
}
consumed_reference_bases2 = bases;
}
}
return consumed_reference_bases2;
}
/// Count query consumed bases. Uses raw_cigar to avoid a heap
/// allocation.
@property @trusted GenomePos consumed_query_bases() {
assert(read2.is_mapped,"Trying to get consumed bases on an unmapped read"); // BAM spec
assert(!read_name.isNull,"Trying to get CIGAR on RNAME is '*'"); // BAM spec
auto raw = cast(uint[]) read2.raw_cigar();
if (raw.length==1 && raw[0] == '*')
return consumed_reference_bases2.get; // null
else {
GenomePos bases = 0;
for (size_t i = 0; i < raw.length; i++) {
auto cigarop = CigarOperation(raw[i]);
if (cigarop.is_query_consuming)
bases += cigarop.length;
}
return bases;
}
}
/// Return read name as a string. If unavailable returns
/// null. Caches name.
@property Nullable!string read_name() {
if (read_name2.isNull) {
assert(read2.is_mapped,"Trying to get RNAME on an unmapped read"); // BAM spec
auto raw = read2.read_name;
if (raw.length == 0 || (raw.length ==1 && raw[0] == '*'))
return read_name2; // null
assert(raw.length < 255); // BAM spec
if (raw[raw.length-1] == 0) // strip trailing C zero
raw.length -= 1;
read_name2 = Nullable!string(cast(string)raw);
}
return read_name2;
}
/// Returns Cigar as an array of operations. Returns null if no
/// operations. Caches Cigar when there are operations.
@property Nullable!CigarOperations cigar() {
if (cigar2.isNull) {
assert(read2.is_mapped,"Trying to get CIGAR on an unmapped read"); // BAM spec
assert(!read_name.isNull,"Trying to get CIGAR on RNAME is '*'"); // BAM spec
auto raw = cast(uint[]) read2.raw_cigar();
if (raw.length==0 || (raw.length==1 && raw[0] == '*'))
return cigar2; // null
else {
auto s = new CigarOperation[raw.length]; // Heap alloc
s.length = 0;
for (size_t i = 0; i < raw.length; i++) {
s ~= CigarOperation(raw[i]);
}
cigar2 = s;
}
}
return cigar2;
}
/// Return human readable sequence fragment - null if
/// undefined. Caches sequence.
@property Nullable!string sequence() {
if (sequence2.isNull) { // is it cached in sequence2?
auto raw = read2.raw_sequence();
if (raw[0] == '*') {
assert(raw.length == 1);
return sequence2; // null
}
auto raw_length = (sequence_length + 1) / 2;
char[16] convert = "=ACMGRSVTWYHKDBN";
string s;
s.reserve(sequence_length); // Heap alloc
for (size_t i = 0; i < sequence_length; i++) {
auto is_odd = i % 2;
auto nuc = (is_odd ? raw[i/2] & 0b00001111 : (raw[i/2] & 0b11110000) >> 4);
s ~= convert[nuc];
}
sequence2 = s;
}
return sequence2;
}
@property ubyte[] toBlob() {
return read2._data;
}
@property string posString() {
return (is_mapped ? to!string(ref_id) ~ ":" ~ to!string(start_pos) : "unmapped");
}
string toString() {
// return "<** " ~ ProcessReadBlob.stringof ~ ") " ~ to!string(read2.refid) ~ ":" ~ to!string(read2.pos) ~ " length " ~ to!string(sequence_length) ~ ">";
return _read2.get.toString();
}
}
/**
BamReader2 is used for foreach loops
*/
struct BamReadBlobs {
BgzfStream stream;
BamHeader header;
this(string fn) {
stream = BgzfStream(fn);
}
int opApply(scope int delegate(ref ReadBlob) dg) {
fetch_bam_header(header, stream);
// parse the reads
while (!stream.eof()) {
immutable block_size = stream.read!int();
immutable refid = stream.read!int();
immutable pos = stream.read!int();
ubyte[] data = new ubyte[block_size-2*int.sizeof]; // Heap alloc FIXME
auto read = ReadBlob(refid,pos,stream.read(data));
dg(read);
}
return 0;
}
}
/**
Read streamer - use on single thread only
*/
// import core.memory : pureMalloc;
struct BamReadBlobStream {
BgzfStream stream;
BamHeader header;
Nullable!ReadBlob readbuf; // points to current read
ubyte[] data; // in sync with readbuf
this(string fn) {
stream = BgzfStream(fn);
fetch_bam_header(header, stream);
if (!empty)
popFront(); // preload front
}
bool empty() @property {
return stream.eof();
}
// Returns first read available. If past eof returns null.
Nullable!ReadBlob front() {
return readbuf;
}
void popFront() {
asserte(!empty); // should have been checked for
immutable block_size = stream.read!int();
immutable refid = stream.read!int();
immutable pos = stream.read!int();
// void *p = pureMalloc(block_size-2*int.sizeof); // test for GC effectiveness
data = new ubyte[block_size-2*int.sizeof];
readbuf = ReadBlob(refid,pos,stream.read(data));
assert(readbuf.get._data.ptr == data.ptr);
}
}
/**
Reader - use on single thread only
This one provides peek support. Peek looks one read ahead in the read stream.
*/
struct BamBlobReader {
BgzfStream stream;
BamHeader header;
Nullable!ReadBlob peekbuf; // points to current read
// ubyte[] data; // in sync with peekbuf
this(string fn) {
stream = BgzfStream(fn);
fetch_bam_header(header, stream);
}
bool empty() @property {
return peekbuf.isNull && stream.eof();
}
Nullable!ReadBlob peek() {
if (peekbuf.isNull && !empty)
fetch();
return peekbuf;
}
/// Fetches the next read. If the peekbuf is not empty return that
/// first and reset peekbuf.
Nullable!ReadBlob fetch() {
if (!peekbuf.isNull) {
auto readbuf = peekbuf;
peekbuf = Nullable!ReadBlob();
return readbuf;
}
asserte(!empty); // should have been checked for
immutable block_size = stream.read!int();
immutable refid = stream.read!int();
immutable pos = stream.read!int();
// void *p = pureMalloc(block_size-2*int.sizeof); // test for GC effectiveness
auto data = new ubyte[block_size-2*int.sizeof];
peekbuf = ReadBlob(refid,pos,stream.read(data));
return peekbuf;
}
/// Returns the next matching read. Otherwise null
///
/// Example:
///
/// auto readbuf = ProcessReadBlob(stream.read_if!ProcessReadBlob((r) => !remove && r.is_mapped));
/*
Nullable!ReadBlob read_if(R)(bool delegate(R r) is_match) {
while(!empty()) {
auto readbuf = read();
if (is_match(R(readbuf)))
return readbuf;
else
return Nullable!ReadBlob();
}
return Nullable!ReadBlob();
}
*/
}
|