File: BamReader.cpp

package info (click to toggle)
pbbam 2.4.0%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: trixie
  • size: 14,144 kB
  • sloc: cpp: 60,214; xml: 2,908; ansic: 660; sh: 275; python: 203; makefile: 187
file content (189 lines) | stat: -rw-r--r-- 5,757 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
#include "PbbamInternalConfig.h"

#include <pbbam/BamReader.h>

#include <pbbam/BamRecord.h>
#include <pbbam/Deleters.h>
#include <pbbam/Validator.h>
#include "Autovalidate.h"
#include "MemoryUtils.h"

#include <htslib/bgzf.h>
#include <htslib/hfile.h>
#include <htslib/hts.h>
#include <htslib/thread_pool.h>

#include <optional>
#include <sstream>
#include <stdexcept>
#include <string>

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstdlib>

namespace PacBio {
namespace BAM {

struct SamFileHandle
{
    htsThreadPool ThreadPool = {NULL, 0};
    samFile* File = nullptr;

    ~SamFileHandle()
    {
        if (File) {
            sam_close(File);
            File = nullptr;
        }
        if (ThreadPool.pool) {
            hts_tpool_destroy(ThreadPool.pool);
            ThreadPool.pool = nullptr;
        }
    }
};

class BamReader::BamReaderPrivate
{
public:
    explicit BamReaderPrivate(std::string fn) : filename_{std::move(fn)}
    {
        auto displayFilename = [&]() {
            if (filename_ == "-") {
                return std::string{" stdin"};
            } else {
                return "\n  file: " + filename_;
            }
        };

        handle_.File = sam_open(filename_.c_str(), "rb");
        if (!handle_.File || !handle_.File->fp.bgzf) {
            std::ostringstream s;
            s << "[pbbam] BAM reader ERROR: could not open for reading:" << displayFilename();
            throw std::runtime_error{s.str()};
        }

        // Use environment variable as number of BamReader threads
        if (const char* envCStr = std::getenv("PB_BAMREADER_THREADS")) {
            try {
                const int32_t numThreads = std::stoi(envCStr);
                if (numThreads <= 0) {
                    std::ostringstream s;
                    s << "[pbbam] BAM reader ERROR: environment variable PB_BAMREADER_THREADS is "
                         "not a positive, non-negative number:"
                      << envCStr;
                    throw std::runtime_error{s.str()};
                }
                handle_.ThreadPool.pool = hts_tpool_init(numThreads);
                hts_set_opt(handle_.File, HTS_OPT_THREAD_POOL, &handle_.ThreadPool);
            } catch (const std::exception&) {
                std::ostringstream s;
                s << "[pbbam] BAM reader ERROR: environment variable PB_BAMREADER_THREADS is not a "
                     "number:"
                  << envCStr;
                throw std::runtime_error{s.str()};
            }
        }

        const auto bgzfPos = bgzf_tell(handle_.File->fp.bgzf);
        if (bgzfPos != 0) {
            std::ostringstream s;
            s << "[pbbam] BAM reader ERROR: could not read from empty input:" << displayFilename();
            throw std::runtime_error{s.str()};
        }

        const std::unique_ptr<bam_hdr_t, HtslibHeaderDeleter> hdr(sam_hdr_read(handle_.File));
        if (!hdr) {
            std::ostringstream s;
            s << "[pbbam] BAM reader ERROR: could not read header from:" << displayFilename();
            throw std::runtime_error{s.str()};
        }

        header_ = BamHeaderMemory::FromRawData(hdr.get());
    }

    std::string filename_;
    SamFileHandle handle_;
    BamHeader header_;
};

BamReader::BamReader() : internal::IQuery{}, d_{std::make_unique<BamReaderPrivate>("-")} {}

BamReader::BamReader(std::string fn)
    : internal::IQuery{}, d_{std::make_unique<BamReaderPrivate>(std::move(fn))}
{}

BamReader::BamReader(BamFile bamFile) : BamReader{bamFile.Filename()} {}

BamReader::BamReader(BamReader&&) noexcept = default;

BamReader& BamReader::operator=(BamReader&&) noexcept = default;

BamReader::~BamReader() = default;

BGZF* BamReader::Bgzf() const { return d_->handle_.File->fp.bgzf; }

const std::string& BamReader::Filename() const { return d_->filename_; }

const BamHeader& BamReader::Header() const { return d_->header_; }

bool BamReader::GetNext(BamRecord& record)
{
    assert(BamRecordMemory::GetRawData(record).get());

    const auto result = ReadRawData(d_->handle_.File, BamRecordMemory::GetRawData(record).get());

    // success
    if (result >= 0) {
        BamRecordMemory::UpdateRecordTags(record);
        record.header_ = d_->header_;
        record.ResetCachedPositions();

#if PBBAM_AUTOVALIDATE
        Validator::Validate(record);
#endif
        return true;
    }

    // EOF or end-of-data range (not an error)
    else if (result == -1) {
        return false;

        // error corrupted file
    } else {
        std::ostringstream msg;
        msg << "[pbbam] BAM reader ERROR: cannot read from corrupted file:\n"
            << "  file: " << Filename() << '\n'
            << "  reason: ";
        if (result == -2) {
            msg << "probably truncated";
        } else if (result == -3) {
            msg << "could not read BAM record's' core data";
        } else if (result == -4) {
            msg << "could not read BAM record's' variable-length data";
        } else {
            msg << "unknown reason (status code = " << result << ") (" << Filename() << ')';
        }
        throw std::runtime_error{msg.str()};
    }
}

int BamReader::ReadRawData(samFile* file, bam1_t* b) { return bam_read1(file->fp.bgzf, b); }

void BamReader::VirtualSeek(int64_t virtualOffset)
{
    const auto result = bgzf_seek(Bgzf(), virtualOffset, SEEK_SET);
    if (result != 0) {
        std::ostringstream msg;
        msg << "[pbbam] BAM reader ERROR: failed to seek:\n"
            << "  file: " << Filename() << '\n'
            << "  vOffset: " << virtualOffset;
        throw std::runtime_error{msg.str()};
    }
}

int64_t BamReader::VirtualTell() const { return bgzf_tell(Bgzf()); }

}  // namespace BAM
}  // namespace PacBio