File: ReaderAgglomerate.hpp

package info (click to toggle)
pbseqlib 5.3.5%2Bdfsg-4
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 7,020 kB
  • sloc: cpp: 77,250; python: 331; sh: 103; makefile: 41
file content (158 lines) | stat: -rw-r--r-- 4,744 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
#ifndef _BLASR_READER_AGGLOMERATE_HPP_
#define _BLASR_READER_AGGLOMERATE_HPP_

#include <cstdlib>

#include <pbdata/Enumerations.h>
#include <alignment/files/BaseSequenceIO.hpp>
#include <hdf/HDFBasReader.hpp>
#include <hdf/HDFCCSReader.hpp>
#include <pbdata/CCSSequence.hpp>
#include <pbdata/FASTAReader.hpp>
#include <pbdata/FASTQReader.hpp>
#include <pbdata/SMRTSequence.hpp>
#include <pbdata/StringUtils.hpp>
#include <pbdata/reads/ReadType.hpp>

#ifdef USE_PBBAM

#include <pbbam/BamRecord.h>
#include <pbbam/DataSet.h>
#include <pbbam/EntireFileQuery.h>
#include <pbbam/PbiFilter.h>
#include <pbbam/PbiFilterQuery.h>
// The following was added to support polymerase reads in unrolled mode.
#include <pbbam/virtual/ZmwReadStitcher.h>  // new interface

#include <alignment/query/PbiFilterZmwGroupQuery.h>
#include <alignment/query/SequentialZmwGroupQuery.h>

#endif

/// Reader front-end derived from BaseSequenceIO that holds FASTA, FASTQ and
/// HDF (bas / CCS) reader objects and, when USE_PBBAM is defined, a set of
/// PacBio BAM query handles, and exposes GetNext() overloads for several
/// sequence types.
///
/// Only declarations are visible in this header; the member definitions are
/// not shown here (presumably in ReaderAgglomerateImpl.hpp, which is included
/// at the end of this header, and/or a separate translation unit — confirm).
///
/// NOTE(review): the single-argument constructors are not `explicit`, so
/// implicit conversions from float/int to ReaderAgglomerate are possible.
/// NOTE(review): this class uses std::string and std::vector; the include
/// list above does not name <string>/<vector> directly, so they are
/// presumably pulled in transitively — confirm.
class ReaderAgglomerate : public BaseSequenceIO
{
    // Text-format readers.
    FASTAReader fastaReader;
    FASTQReader fastqReader;
    int readQuality;  // presumably affected by SkipReadQuality() — TODO confirm
    int stride;       // presumably set from the _stride constructor argument — TODO confirm
    int start;        // presumably set from the _start constructor argument — TODO confirm
    float subsample;  // presumably set from the _subsample constructor argument — TODO confirm
    bool useRegionTable;
    bool ignoreCCS;  // presumably toggled by IgnoreCCS() / UseCCS() — TODO confirm
    ReadType::ReadTypeEnum readType;  // see SetReadType() / GetReadType()
    bool unrolled;  // Whether unrolled mode is active; needed because GetNext() must know about the mode.
    bool polymerase;  // presumably set from Initialize(..., polymerase_mode) — TODO confirm
    std::string scrapsFileName;  // Needed to initialize unrolled mode when reading via PBBAM.

public:
    //
    // Interfaces for reading HDF files, plus read buffers.
    //
    T_HDFBasReader<SMRTSequence> hdfBasReader;
    HDFCCSReader<CCSSequence> hdfCcsReader;
    std::vector<SMRTSequence> readBuffer;
    std::vector<CCSSequence> ccsBuffer;
    std::string readGroupId;

public:
    void SetToUpper();

    void InitializeParameters();
    ReaderAgglomerate();

    // NOTE(review): not explicit — allows implicit float -> ReaderAgglomerate conversion.
    ReaderAgglomerate(float _subsample);

    // NOTE(review): not explicit — allows implicit int -> ReaderAgglomerate conversion.
    ReaderAgglomerate(int _stride);

    ReaderAgglomerate(int _start, int _stride);

    /// \param [out] movieName - receives the movie name (passed by non-const
    /// reference; the source it is read from is not visible in this header).
    void GetMovieName(std::string &movieName);

    /// Get BindingKit, SequencingKit and Base Caller Version from h5.
    ///
    /// \param [out] bindingKit - BindingKit from
    /// /ScanData/RunInfo/BindingKit.
    ///
    /// \param [out] sequencingKit - sequencingKit from
    /// /ScanData/RunInfo/SequencingKit.
    ///
    /// \param [out] baseCallerVersion - Base Caller Version
    /// from /PulseData/BaseCalls/ChangeListID.
    ///
    void GetChemistryTriple(std::string &bindingKit, std::string &sequencingKit,
                            std::string &baseCallerVersion);

    bool FileHasZMWInformation();

    void SkipReadQuality();

    void IgnoreCCS();

    void UseCCS();

    // Initialize overloads. The meaning of the int return value is not
    // visible in this header — TODO confirm against the implementation.
    int Initialize(std::string &pFileName);

    bool SetReadFileName(std::string &pFileName);

    void SetScrapsFileName(std::string &pFileName);  // needed for unrolled mode

    int Initialize(FileType &pFileType, std::string &pFileName);

    bool HasRegionTable();

    // Unrolled mode indicates that the VP/VPC reader must be initialized.
    // Polymerase mode will only work with BAM records.
    int Initialize(bool unrolled_mode = false, bool polymerase_mode = false);

    // NOTE(review): takes a non-const reference, so const objects and
    // temporaries cannot be assigned from.
    ReaderAgglomerate &operator=(ReaderAgglomerate &rhs);

    bool Subsample(float rate);

    // Set read type to SUBREAD, CCS, or UNKNOWN.
    void SetReadType(const ReadType::ReadTypeEnum &readType_);

    // Returns read type: SUBREAD, CCS, or UNKNOWN.
    ReadType::ReadTypeEnum GetReadType();

public:
    // GetNext overloads, one per destination sequence type (plus a vector of
    // SMRTSequence). The int return convention is not visible in this
    // header — TODO confirm against the implementation.
    int GetNext(FASTASequence &seq);
    int GetNext(FASTQSequence &seq);
    int GetNext(SMRTSequence &seq);
    int GetNext(CCSSequence &seq);
    int GetNext(std::vector<SMRTSequence> &reads);

    // Templated variant taking an additional int reference `randNum`;
    // declaration only — the definition is not visible in this header.
    template <typename T_Sequence>
    int GetNext(T_Sequence &seq, int &randNum);

    int GetNextBases(SMRTSequence &seq, bool readQVs);

    int Advance(int nSteps);

    void Close();

#ifdef USE_PBBAM
public:
    // Readers and iterators used to fetch sequences from BAM.
    // NOTE(review): these are raw pointers; ownership and cleanup are not
    // visible in this header (presumably handled in Close() — confirm).
    PacBio::BAM::DataSet *dataSetPtr;
    PacBio::BAM::EntireFileQuery *entireFileQueryPtr;
    PacBio::BAM::EntireFileQuery::iterator entireFileIterator;
    PacBio::BAM::PbiFilterQuery *pbiFilterQueryPtr;
    PacBio::BAM::PbiFilterQuery::iterator pbiFilterIterator;
    PacBio::BAM::SequentialZmwGroupQuery *sequentialZmwQueryPtr;
    PacBio::BAM::SequentialZmwGroupQuery::iterator sequentialZmwIterator;
    PacBio::BAM::PbiFilterZmwGroupQuery *pbiFilterZmwQueryPtr;
    PacBio::BAM::PbiFilterZmwGroupQuery::iterator pbiFilterZmwIterator;
    // The following was added to support ZMW reads in unrolled mode.
    PacBio::BAM::ZmwReadStitcher *VPReader;  // new interface
#endif
};

/// Declaration only; the definition is not visible here (presumably provided
/// by ReaderAgglomerateImpl.hpp, included below — confirm).
///
/// Judging by the name and parameters, presumably fills `reads` from `reader`
/// with at most `maxNReads` sequences. The meaning of the int return value
/// is not visible in this header — TODO confirm.
template <typename T_Sequence>
int ReadChunkByNReads(ReaderAgglomerate &reader, std::vector<T_Sequence> &reads, int maxNReads);

/// Declaration only; the definition is not visible here (presumably provided
/// by ReaderAgglomerateImpl.hpp, included below — confirm).
///
/// Judging by the name and parameters, presumably fills `reads` from `reader`
/// until a budget of `maxMemorySize` is reached. The unit of `maxMemorySize`
/// and the meaning of the int return value are not visible in this header —
/// TODO confirm.
template <typename T_Sequence>
int ReadChunkBySize(ReaderAgglomerate &reader, std::vector<T_Sequence> &reads, int maxMemorySize);

#include "ReaderAgglomerateImpl.hpp"

#endif