1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
|
#ifndef REFSEQ
#define REFSEQ
#include<cassert>
#include<fstream>
#include<string>
#include<vector>
#include "utils.h"
//Each Object can only be used once
class RefSeq {
public:
RefSeq() {
fullLen = totLen = 0;
name = ""; seq = "";
fmasks.clear();
}
//Constructor , seq : the forward strand of the reference
//tag does not contain ">"
//polyALen : length of polyA tail we add
RefSeq(const std::string& name, const std::string& seq, int polyALen) {
fullLen = seq.length();
totLen = fullLen + polyALen;
this->name = name;
this->seq = seq;
this->seq.append(polyALen, 'A');
assert(fullLen > 0 && totLen >= fullLen);
int len = (fullLen - 1) / NBITS + 1;
fmasks.assign(len, 0);
// set mask if poly(A) tail is added
if (polyALen > 0) {
for (int i = std::max(fullLen - OLEN + 1, 0); i < fullLen; i++) setMask(i);
}
}
RefSeq(const RefSeq& o) {
fullLen = o.fullLen;
totLen = o.totLen;
name = o.name;
seq = o.seq;
fmasks = o.fmasks;
}
RefSeq& operator= (const RefSeq &rhs) {
if (this != &rhs) {
fullLen = rhs.fullLen;
totLen = rhs.totLen;
name = rhs.name;
seq = rhs.seq;
fmasks = rhs.fmasks;
}
return *this;
}
~RefSeq() {}
bool read(std::ifstream&, int = 0);
void write(std::ofstream&);
int getFullLen() const { return fullLen; }
int getTotLen() const { return totLen; }
const std::string& getName() const { return name; }
std::string getSeq() const { return seq; }
std::string getRSeq() const {
std::string rseq = "";
for (int i = totLen - 1; i >= 0; i--) rseq.push_back(getCharacter(get_rbase_id(seq[i])));
return rseq;
}
//get the sequence dir 0 : + 1 : -
std::string getSeq(int dir) const {
return (dir == 0 ? getSeq() : getRSeq());
}
int get_id(int pos, int dir) const {
assert(pos >= 0 && pos < totLen);
return (dir == 0 ? get_base_id(seq[pos]) : get_rbase_id(seq[totLen - pos - 1]));
}
bool getMask(int seedPos) const {
assert(seedPos >= 0 && seedPos < totLen);
return fmasks[seedPos / NBITS] & mask_codes[seedPos % NBITS];
}
void setMask(int seedPos) {
assert(seedPos >= 0 && seedPos < totLen);
fmasks[seedPos / NBITS] |= mask_codes[seedPos % NBITS];
}
private:
int fullLen; // fullLen : the original length of an isoform
int totLen; // totLen : the total length, included polyA tails, if any
std::string name; // the tag
std::string seq; // the raw sequence, in forward strand
std::vector<unsigned int> fmasks; // record masks for forward strand, each position occupies 1 bit
};
//internal read; option 0 : read all 1 : do not read seqences
bool RefSeq::read(std::ifstream& fin, int option) {
std::string line;
if (!(fin>>fullLen>>totLen)) return false;
assert(fullLen > 0 && totLen >= fullLen);
getline(fin, line);
if (!getline(fin, name)) return false;
if (!getline(fin, seq)) return false;
int len = (fullLen - 1) / NBITS + 1; // assume each cell contains NBITS bits
fmasks.assign(len, 0);
for (int i = 0; i < len; i++)
if (!(fin>>fmasks[i])) return false;
getline(fin, line);
assert(option == 0 || option == 1);
if (option == 1) { seq = ""; }
return true;
}
//write to file in "internal" format
void RefSeq::write(std::ofstream& fout) {
fout<<fullLen<<" "<<totLen<<std::endl;
fout<<name<<std::endl;
fout<<seq<<std::endl;
int len = fmasks.size();
for (int i = 0; i < len - 1; i++) fout<<fmasks[i]<<" ";
fout<<fmasks[len - 1]<<std::endl;
}
#endif
|