File: TantanMasker.hh

package info (click to toggle)
last-align 1651-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 14,688 kB
  • sloc: cpp: 44,419; python: 5,217; ansic: 1,938; sh: 710; makefile: 457
file content (55 lines) | stat: -rw-r--r-- 1,463 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
// Copyright 2015 Martin C. Frith

// This class masks simple repeats (e.g. acacacacacac) in sequences,
// in one of 3 modes: normal mode, protein mode, or AT-rich DNA mode.

// The masking is insensitive to uppercase/lowercase of the input
// sequences.

// "isATrichDna" takes precedence over "isProtein".

// The "alphabet" (e.g. ACGT) is used only in normal mode.

// It is assumed that the sequences will be supplied after mapping
// letters to small integers (e.g. ACGT -> 0123).  "letterToIndex"
// specifies this mapping.  It is assumed that the small integers are
// less than scoreMatrixRowSize.

// "maskTable" defines how to do the masking: it maps small integers
// to masked integers.

#ifndef TANTAN_MASKER_HH
#define TANTAN_MASKER_HH

#include "ScoreMatrixRow.hh"
#include "tantan.hh"
#include <string>

namespace cbrc {

// Byte type for sequence letters, letter-to-index tables, and mask
// tables, as used in the TantanMasker interface.
typedef unsigned char uchar;

class TantanMasker {
public:
  void init(bool isProtein,
	    bool isATrichDna,
	    bool isMaskWeakRepeats,
	    int maxRepeatUnitLength,
	    const std::string &alphabet,
	    const uchar *letterToIndex);

  void mask(uchar *seqBeg, uchar *seqEnd, const uchar *maskTable) const {
    tantan::maskSequences(seqBeg, seqEnd, maxRepeatOffset, probMatrixPointers,
			  repeatProb, 0.05, 0.9, 0, 0, 0.5, maskTable);
  }

private:
  int maxRepeatOffset;
  double repeatProb;
  double probMatrix[scoreMatrixRowSize][scoreMatrixRowSize];
  double *probMatrixPointers[scoreMatrixRowSize];
};

}

#endif