File: Refs.h

package info (click to toggle)
rsem 1.3.3%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 37,664 kB
  • sloc: cpp: 19,230; perl: 1,326; python: 1,245; ansic: 547; makefile: 186; sh: 154
file content (159 lines) | stat: -rw-r--r-- 4,176 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
#ifndef REFS
#define REFS

#include<cstdio>
#include<cstring>
#include<cstdlib>
#include<cassert>
#include<string>
#include<fstream>
#include<vector>

#include "utils.h"
#include "RefSeq.h"
#include "RefSeqPolicy.h"
#include "PolyARules.h"


class Refs {
 public:
  Refs() {
    M = 0;
    seqs.clear();
    has_polyA = false;
  }

  ~Refs() {}

  void makeRefs(char*, RefSeqPolicy&, PolyARules&);
  void loadRefs(char*, int = 0);
  void saveRefs(char*);

  int getM() { return M; } // get number of isoforms

  //int getNS() { return M + 1; } // get number of parameters, I do not think we need it here.

  RefSeq& getRef(int sid) { return seqs[sid]; } // get a particular reference

  std::vector<RefSeq>& getRefs() { return seqs; } // may be slow, for copying the whole thing

  bool hasPolyA() { return has_polyA; } // if any of sequence has poly(A) tail added

  //lim : >=0 If mismatch > lim , return; -1 find all mismatches
  int countMismatch(const std::string& seq, int pos, const std::string& readseq, int LEN, int lim = -1) {
    int nMis = 0; // number of mismatches

    for (int i = 0; i < LEN; i++) {
      char rc = toupper(readseq[i]);
      if (seq[i + pos] == 'N' || rc == 'N' || seq[i + pos] != rc) nMis++;

      // a speed up tech
      if (lim >= 0 && nMis > lim) return nMis;
    }

    return nMis;
  }

  bool isValid(int sid, int dir, int pos, const std::string& readseq, int LEN, int C) {
    if (sid <= 0 || sid > M || (dir != 0 && dir != 1) ||  pos < 0 || pos + LEN > seqs[sid].getTotLen() || LEN > (int)readseq.length()) return false;
    const std::string& seq = seqs[sid].getSeq(dir);
    return countMismatch(seq, pos, readseq, LEN, C) <= C;
  }

  // get segment from refs
  std::string getSegment(int sid, int dir, int pos, int LEN) {
    if (pos < 0 || pos + LEN > seqs[sid].getTotLen()) return "fail";

    const std::string& seq = seqs[sid].getSeq(dir);
    std::string seg = "";

    for (int i = 0; i < LEN; i++)
      seg.append(1,  seq[pos + i]);

    return seg;
  }

 private:
  int M; // # of isoforms, id starts from 1
  std::vector<RefSeq> seqs;  // reference sequences, starts from 1; 0 is for noise gene
  bool has_polyA; // if at least one sequence has polyA added, the value is true; otherwise, the value is false
};

//inpF in fasta format
void Refs::makeRefs(char *inpF, RefSeqPolicy& policy, PolyARules& rules) {
  //read standard fasta format here
  std::ifstream fin;
  std::string tag, line, rawseq;

  seqs.clear();
  seqs.push_back(RefSeq()); // noise isoform

  M = 0;
  has_polyA = false;

  fin.open(inpF);
  if (!fin.is_open()) { fprintf(stderr, "Cannot open %s! It may not exist.\n", inpF); exit(-1); }
  getline(fin, line);
  while ((fin) && (line[0] == '>')) {
    tag = line.substr(1);
    rawseq = "";
    while((getline(fin, line)) && (line[0] != '>')) {
      rawseq += line;
    }
    if (rawseq.size() <= 0) {
      fprintf(stderr, "Warning: Fasta entry %s has an empty sequence! It is omitted!\n", tag.c_str()); 
      continue;
    }
    ++M;
    seqs.push_back(RefSeq(tag, policy.convert(rawseq), rules.getLenAt(tag)));
    has_polyA = has_polyA || seqs[M].getFullLen() < seqs[M].getTotLen();
  }
  fin.close();

  if (verbose) { printf("Refs.makeRefs finished!\n"); }
}

//inpF in fasta format, with sequence all in one line together
//option 0 read all, 1 do not read sequences
void Refs::loadRefs(char *inpF, int option) {
  std::ifstream fin;
  RefSeq seq;

  fin.open(inpF);
  if (!fin.is_open()) { fprintf(stderr, "Cannot open %s! It may not exist.\n", inpF); exit(-1); }
  seqs.clear();
  seqs.push_back(RefSeq());

  M = 0;
  has_polyA = false;

  bool success;
  do {
    success = seq.read(fin, option);
    if (success) {
    	seqs.push_back(seq);
        ++M;
    	has_polyA = has_polyA || seq.getFullLen() < seq.getTotLen();
    }
  } while (success);

  fin.close();

  assert(M + 1 == (int)seqs.size());

  if (verbose) { printf("Refs.loadRefs finished!\n"); }
}

void Refs::saveRefs(char* outF) {
  std::ofstream fout;

  fout.open(outF);
  for (int i = 1; i <= M; i++) {
    seqs[i].write(fout);
  }
  fout.close();

  if (verbose) { printf("Refs.saveRefs finished!\n"); }
}

#endif