File: Alphabet.hh

package info (click to toggle)
last-align 963-2
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 3,380 kB
  • sloc: cpp: 41,136; python: 2,744; ansic: 1,240; makefile: 383; sh: 255
file content (65 lines) | stat: -rw-r--r-- 2,347 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
// Copyright 2008, 2009, 2010, 2012, 2013, 2014 Martin C. Frith

// This struct maps characters to codes (small integers) and back.

// We allow for both "proper" letters (e.g. ACGT for DNA), which get
// the lowest codes, and "improper" letters.  This is because real
// sequence data includes additional letters (e.g. ambiguous bases),
// so we have to handle them.  In addition, the space character
// represents a special delimiter.

#ifndef ALPHABET_HH
#define ALPHABET_HH

#include <string>
#include <iosfwd>

namespace cbrc{

// Byte type used for sequence letters and for the codes they map to.
typedef unsigned char uchar;

// Two-way mapping between sequence characters and small integer codes,
// plus lookup tables for case conversion and complementing.
struct Alphabet{
  // Integer type used for the letter tallies filled in by count().
  typedef unsigned long long countT;

  // Predefined alphabets.  They are only declared here; their values
  // are defined outside this header.  "protein" is the standard protein
  // alphabet (see isProtein); "dna" is presumably the DNA counterpart.
  static const char* dna;
  static const char* protein;

  // Length of every lookup table in this struct.
  static const unsigned capacity = 256;

  // Returns true when "letters" begins with the standard protein
  // alphabet, false otherwise.
  bool isProtein() const{
    // A reverse search anchored at position 0 can only match at the
    // very start of the string, so this is a pure prefix test.
    return letters.rfind( protein, 0 ) == 0;
  }

  // Build this Alphabet from alphString, which holds the "proper"
  // letters.
  void fromString( const std::string& alphString );

  // Tally the "proper" letters found in [beg, end), adding the tallies
  // to "counts".  Lowercase letters are counted as well.
  void count( const uchar* beg, const uchar* end, countT* counts ) const;

  // Encode the letters in [beg, end) as numbers, overwriting them in
  // place.
  // NOTE(review): isKeepLowercase presumably decides whether lowercase
  // letters keep their own codes - confirm against the implementation.
  void tr( uchar* beg, uchar* end, bool isKeepLowercase=true ) const;

  // Decode the numbers in [beg, end) back to letters, writing them to
  // dest.  Returns the position just past the last one written in dest.
  char* rtCopy( const uchar* beg, const uchar* end, char* dest ) const;

  // The "proper" letters, e.g. ACGT for DNA.
  std::string letters;

  // Equal to letters.size(), so delimiters are not included.
  unsigned size;

  // ASCII letter -> code (small integer).
  uchar encode[capacity];

  // Code -> ASCII letter.
  uchar decode[capacity];

  // Code -> uppercase code.
  uchar numbersToUppercase[capacity];

  // Code -> lowercase code.
  uchar numbersToLowercase[capacity];

  // Letter -> uppercase code.
  uchar lettersToUppercase[capacity];

  // DNA code -> code of its complement.
  uchar complement[capacity];

  // Set-up helpers.  Their definitions are not part of this header;
  // presumably they fill in the tables above - verify in the
  // implementation file.
  void init();
  void addLetters( const std::string& lettersToAdd, unsigned& code );
  void initCaseConversions( unsigned codeEnd );
  void makeComplement();
};

// Stream insertion and extraction for an Alphabet.
std::ostream& operator<<( std::ostream& s, const Alphabet& a );
std::istream& operator>>( std::istream& s, Alphabet& a );

}  // namespace cbrc

#endif