File: sequence.h

package info (click to toggle)
wham-align 0.1.5-8
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, bullseye, forky, sid, trixie
  • size: 892 kB
  • sloc: cpp: 8,769; sh: 76; makefile: 52
file content (116 lines) | stat: -rw-r--r-- 3,311 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
#ifndef _SEQUENCE_H_
#define _SEQUENCE_H_

/**
 *    WHAM - high-throughput sequence aligner
 *    Copyright (C) 2011  WHAM Group, University of Wisconsin
 *
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*	$Id: sequence.h 157 2012-07-25 05:58:09Z yinan $ */

#include "lib.h"
#include <stdio.h>
#include "interval.h"

/*
 * Numeric codes for the nucleotide symbols A, C, G and T.
 * NOTE(review): BASE_N is presumably the code for an unknown/ambiguous
 * base -- confirm against the encoder in the implementation file.
 */
#define BASE_A	0
#define BASE_C	1
#define BASE_G	2
#define BASE_T	3
#define BASE_N	4

/*
 * Word-count helpers. Both rely on BITS_LONGWORD_SHIFT, and the second on
 * BITS_PER_BASE; neither is defined in this header (presumably they come
 * from lib.h -- confirm).
 *
 * NUM_LONGWORD(x)      expands to ((x - 1) >> BITS_LONGWORD_SHIFT) + 1.
 * NUM_LONGWORD_BASE(x) multiplies the shifted value by BITS_PER_BASE and
 *                      then adds 1.
 *
 * NOTE(review): both subtract 1 before shifting, so an unsigned argument
 * of 0 wraps around to a huge value -- confirm callers never pass 0.
 */
#define NUM_LONGWORD(x)	((((x) - 1) >> BITS_LONGWORD_SHIFT) + 1)
#define NUM_LONGWORD_BASE(x)	(((((x) - 1) >> BITS_LONGWORD_SHIFT) * BITS_PER_BASE) + 1)

/*
 * Single-bit helpers for an integer bitmap x and a zero-based bit index y.
 *
 * BITMAP_IS(x, y)    evaluates to 1 if bit y of x is set, 0 otherwise.
 * BITMAP_SET(x, y)   sets bit y of x in place (x must be an lvalue).
 * BITMAP_CLEAR(x, y) clears bit y of x in place (x must be an lvalue).
 *
 * The mask is built from an unsigned 64-bit one (1ULL) rather than a plain
 * int: shifting an int by 31 or more positions is undefined, which made the
 * macros unusable on the upper bits of 64-bit words.
 * Each argument is evaluated exactly once; y must be less than 64.
 */
#define BITMAP_IS(x, y)		(((x) >> (y)) & 0x1)
#define BITMAP_SET(x, y)	((x) |= (1ULL << (y)))
#define BITMAP_CLEAR(x, y)	((x) &= ~(1ULL << (y)))

/* 2^32 - 1, i.e. the largest value a 32-bit unsigned counter can hold. */
#define MAX_NUM_CHAR 4294967295LLU

/* NOTE(review): presumably the number of header words in the saved
 * sequence file -- confirm against save()/load(). */
#define SEQUENCE_HEAD_WORDS	6
/* NOTE(review): presumably the size of path buffers -- confirm at use sites. */
#define MAX_LENGTH_PATH 256

/* Forward declaration: only a pointer to Aligner is needed by filter(). */
class Aligner;

/**
 * Holds a compactly encoded sequence together with the per-sequence
 * names and lengths and an interval tree for offset translation.
 *
 * Only the inline accessors are defined in this header; the constructors
 * and the build/filter/save/load/alignment routines are defined in the
 * implementation file, so their exact behaviour and return-code
 * conventions are not visible here.
 *
 * NOTE(review): the class holds raw pointers and declares no destructor or
 * copy operations in this header -- confirm ownership before copying
 * instances.
 */
class CompactSequence {
private:
  uint32 numChar; /* the number of characters in the compact sequence */
  uint32 numNSegment; /* the number of N segments */
  uint32 size; /* the size of the compact sequence (3bit/character) */
  int64 * sequence; /* the sequence (has an offset to the beginning of the pool array) */
  int64 * pool; /* the space for storing the compact sequence */

  char ** seqNames; /* file names; indexed by sequence id (see getSeqName) */
  char * seqNamepool; /* presumably the backing storage for seqNames -- confirm */
  uint32 * seqLens; /* per-sequence lengths; indexed by sequence id (see getSeqLen) */
  int nSeq; /* the number of sequences */

  int len;
  int nError;
  bool skipMask; /* treat the masks (lowercase characters) as unknown characters */

  uint32 numRead;
  int64 * keys;
  uint32 * offsets;

public:
  IntervalTree * itree; /* the interval tree used for translation between original sequence offset and compact sequence offset. */

public:
  CompactSequence();
  CompactSequence(bool skip);
  int build(char ** fname, int numSeq, int length, int numError);
  int filter(Aligner * aligner, char ** fname, int numFile, char * path);
  int save(char * path);
  int load(char * path);
  int alignment(int lenKey, int nMismatch);
  int valid(int length, int numError);
  static void compose(char * str, int length, int64 * words);
  static void decompose(char * str, int length, int64 * words);

private:
  int preProcess(char * fname, uint32 numError, int64 & num,
      int64 & numNSegment, int64 & numFileSeq);
  int skipLine(FILE * file);
  int getSeqName(FILE * file, char * str);
  void extractFileName(char * dest, char * src);

public:
  /*
   * Read-only accessors. They are const-qualified so they can be called
   * through a const reference or pointer; none of them modifies the object.
   */

  /** @return the number of characters in the compact sequence. */
  unsigned int getNum() const {
    return numChar;
  }

  /** @return the number of sequences. */
  int getNumSeq() const {
    return nSeq;
  }

  /** @return the pointer to the encoded sequence words (not a copy). */
  int64 * getSequence() const {
    return sequence;
  }

  /**
   * @param sid sequence id; not range-checked here, so the caller must
   *            pass a valid index into the name table.
   * @return the stored name pointer for sequence sid (not a copy).
   */
  char * getSeqName(int sid) const {
    return seqNames[sid];
  }

  /**
   * @param sid sequence id; not range-checked here, so the caller must
   *            pass a valid index into the length table.
   * @return the stored length for sequence sid.
   */
  uint32 getSeqLen(int sid) const {
    return seqLens[sid];
  }
};

//CompactSequence * sequenceLoad(char * fname, int length);

#endif