File: SMRTSequence.hpp

package info (click to toggle)
pbseqlib 0~20161219-1
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 5,924 kB
  • ctags: 5,123
  • sloc: cpp: 82,727; makefile: 305; python: 239; sh: 8
file content (194 lines) | stat: -rw-r--r-- 6,238 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
// Author: Mark Chaisson

#ifndef _BLASR_SMRT_SEQUENCE_HPP_
#define _BLASR_SMRT_SEQUENCE_HPP_

#include <cassert>
#include <iostream>
#include <sstream>

#include "Types.h"
#include "Enumerations.h"
#include "NucConversion.hpp"
#include "FASTQSequence.hpp"
#include "reads/RegionTable.hpp"
#include "reads/ZMWGroupEntry.hpp"


class SMRTSequence : public FASTQSequence {
friend class HDFZMWReader;
friend class HDFZMWWriter;
friend class HDFZMWMetricsWriter;

private:
    enum SnrIndex4Base {A=0, C=1, G=2, T=3};
    float hqRegionSnr_[4]; // Always saved as 'ACGT'

    DNALength subreadStart_;
    DNALength subreadEnd_;

    // read group id associated with each SMRTSequence
    std::string readGroupId_;

public:
    ZMWGroupEntry zmwData;

    DNALength lowQualityPrefix, lowQualitySuffix;
    int highQualityRegionScore; // High quality region score in region table.
    float readScore;

    // Whether or not this is originally copied from a BamRecord.
    bool copiedFromBam;

    HalfWord *preBaseFrames;
    HalfWord *widthInFrames;
    //
    // The following are fields that are read in from the pulse file.
    // Because they are not standard in bas.h5 files, these fields
    // should not be preallocated when resizing a SMRTSequence, and
    // memory should be managed separately.  For now, these fields all
    // have the same length as the number of bases, but this could
    // change so that all pulse values are stored in a SMRTSequence.
    //
    HalfWord *meanSignal, *maxSignal, *midSignal;
    float *classifierQV;
    unsigned int *startFrame;
    int *pulseIndex;

public:
    SMRTSequence();

    inline ~SMRTSequence();

    /// \name Sets and gets attributes.
    /// \{
    /// Set HoleNumber.
    /// \returns this SMRTSequence
    SMRTSequence & HoleNumber(UInt holeNumber);

    /// \reutrns HoleNumber
    UInt HoleNumber(void) const;

    /// Set HoleXY
    SMRTSequence & HoleXY(const int x, const int y);

    /// \returns HoleX
    UInt HoleX(void) const;

    /// \returns HoleY
    UInt HoleY(void) const;

    /// Set HoleStatus
    SMRTSequence & HoleStatus(const unsigned char);

    /// \returns HoleStatus
    unsigned char HoleStatus(void) const;

    /// \returns movie name parsed from sequence title
    std::string MovieName(void) const;

    /// \returns start pos of this sequence in coordinate of zmw polymerase sequence
    DNALength SubreadStart(void) const;

    /// Sets subreadStart.
    SMRTSequence & SubreadStart(const DNALength start);

    /// \returns subread end pos of this sequence in coordinate of zmw polymerase sequence
    DNALength SubreadEnd(void) const;

    /// Set subread end pos in coordinate of polymerase sequence.
    SMRTSequence & SubreadEnd(const DNALength end);

    /// A SMRTSequence's this->seq may point to sequence of a whole
    /// polymerase read, but only represents a subread [subreadStart_,  subreadEnd_).
    /// \returns subread length (SubreadEnd() - SubreadStart())
    DNALength SubreadLength(void) const;

    /// \returns read group id for this sequence.
    std::string ReadGroupId(void) const;

    /// Set readGroup Id for this sequence.
    SMRTSequence & ReadGroupId(const std::string & rid);

    /// Access to HQRegion SNRs must be done via public API.
    float HQRegionSnr(const char base) const;

    /// Set HQRegion SNR of base as v.
    SMRTSequence & HQRegionSnr(const char base, float v);

    /// \}

public:
    /// \name Clip subread
    /// \{
    SMRTSequence & Clip(const DNALength subreadStart, const DNALength subreadEnd);
    /// \}
 
    /// \name Allocate
    /// \{
    /// Allocate space for all possible QVs.
    void Allocate(DNALength length); 

    /// Compact allocate space for QVs needed in order to compute alignment score.
    /// \param[in] hasInsertionDeletionQVTag: insertionQV, deletionQV and deletionTag exist
    /// \param[in] hasSubstitutionQVTag: substitutionQV and substitutionTag exist
    void CompactAllocate(const DNALength length,
                         const bool hasInsertionDeletionQVTag,
                         const bool hasSubstitutionQVTag);
    /// \}

    /// Reconstruct this as a polymerase read of a zmw given subreads of the zmw.
    /// Copy only QVs necessary for computing alignment score, ignore irrelavent QVs.
    /// Length of this polymerase read is max {subread.SubreadEnd()}.
    /// Pad sequences and tags of scraps regions with 'N' and QVs of scraps regions with 0s.
    /// \param[in] subreads: subreads of a zmw
    void MadeFromSubreadsAsPolymerase(const std::vector<SMRTSequence> & subreads);

    void SetSubreadTitle(SMRTSequence &subread, DNALength subreadStart, 
        DNALength subreadEnd); 

    void SetSubreadBoundaries(SMRTSequence &subread, DNALength subreadStart, 
        DNALength subreadEnd); 

    void MakeSubreadAsMasked(SMRTSequence &subread, DNALength subreadStart = 0, 
        int subreadEnd = -1); 

    void MakeSubreadAsReference(SMRTSequence &subread, DNALength subreadStart = 0, 
        int subreadEnd = -1); 

    void Copy(const SMRTSequence &rhs); 

    void Copy(const SMRTSequence &rhs, DNALength rhsPos, DNALength rhsLength); 

    void Print(std::ostream &out) const;

    SMRTSequence& operator=(const SMRTSequence &rhs); 

    void Free(); 
    
#ifdef USE_PBBAM
public:
    /// \returns if record is a valid bam record.
    /// FIXME: remove this function when pbbam.BamRecord.IsValid is available. 
    static bool IsValid(const PacBio::BAM::BamRecord & record);

    // Copy read sequence, title, holeNumber, readGroupId, and QVs
    // (iq, dq, sq, mq, st, dt) from BamRecord to this SMRTSequence.
    // If copyAllQVs is false, also copy all QVs.
    void Copy(const PacBio::BAM::BamRecord & record, 
              bool copyAllQVs = false);

    // Keep track of BamRecord from which this SMRTSequence is 
    // originally copied. However, one should NOT assume
    // that this SMRTSequence has the same sequence, title, QVs as 
    // the BamRecord, because this SMRTSequence may be created by
    // MakeSubreadAsMasked(...) or MakeRC(...).
    PacBio::BAM::BamRecord bamRecord;
#endif 
};

inline SMRTSequence::~SMRTSequence(){
    SMRTSequence::Free();
}

#endif  // _BLASR_SMRT_SEQUENCE_HPP_