File: msa.h

package info (click to toggle)
pilercr 1.06+dfsg-4
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 804 kB
  • sloc: cpp: 14,339; makefile: 67; sh: 3
file content (148 lines) | stat: -rwxr-xr-x 5,133 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#ifndef	MSA_H
#define MSA_H

// Integer constant with value 63.
// NOTE(review): presumably a bound on sequence-name length; whether it counts
// a terminating NUL is not visible in this header -- confirm at its use sites.
const int MAX_SEQ_NAME = 63;

// Forward declarations of types defined elsewhere. In this header TextFile,
// Seq, NodeCounts and DataBuffer appear only as reference parameters;
// PathEdge and ClusterNode are not referenced by the declarations below.
//
// This header contains no #include, yet the declarations below also use the
// names FCOUNT and SeqVect, which are not declared here: whatever includes
// this file must make both visible first.
struct PathEdge;
class TextFile;
class Seq;
class ClusterNode;
class NodeCounts;
class DataBuffer;

// MSA -- presumably a "multiple sequence alignment" container (reading of the
// name and of the seq/column accessors; confirm against the implementation).
//
// Only Clear(), GetSeqCount() and GetColCount() are defined in this header;
// every other member is declared here and defined elsewhere.
//
// NOTE(review): the class holds raw pointer members (m_szSeqs, m_szNames,
// m_IdToSeqIndex, m_SeqIndexToId) and declares a destructor, but declares no
// copy constructor or copy-assignment operator, so the compiler-generated
// ones copy those pointers member-wise. Whether that is safe depends on what
// ~MSA()/Free() do, which is not visible here -- confirm, and consider using
// Copy() when a duplicate is needed.
class MSA
	{
public:
	MSA();
// Virtual destructor: the class may be deleted through a base pointer.
	virtual ~MSA();

public:
// Ways to create an MSA
	void FromFile(TextFile &File);
	void FromFASTAFile(TextFile &File);
	void FromSeq(const Seq &s);

// Output routines; all are declared const. ToMSFFile's ptrComment is
// optional and defaults to a null pointer (0).
	void ToFile(TextFile &File) const;
	void ToFASTAFile(TextFile &File) const;
	void ToMSFFile(TextFile &File, const char *ptrComment = 0) const;
	void ToAlnFile(TextFile &File) const;
	void ToHTMLFile(TextFile &File) const;
	void ToPhySequentialFile(TextFile &File) const;
	void ToPhyInterleavedFile(TextFile &File) const;

// Sizing and queries. Indexes are passed as unsigned; any range checking
// lives in the definitions, which are not visible in this header.
	void SetSize(unsigned uSeqCount, unsigned uColCount);
	void SetSeqCount(unsigned uSeqCount);
	char GetChar(unsigned uSeqIndex, unsigned uIndex) const;
	unsigned GetLetter(unsigned uSeqIndex, unsigned uIndex) const;
	unsigned GetLetterEx(unsigned uSeqIndex, unsigned uIndex) const;
	const char *GetSeqName(unsigned uSeqIndex) const;
	unsigned GetSeqId(unsigned uSeqIndex) const;
// GetSeqIndex is overloaded three ways: by id returning unsigned, by id
// returning bool with an unsigned* parameter, and (further below) by name
// returning bool with an unsigned* parameter.
// NOTE(review): presumably the bool forms report "found" and write the index
// through the pointer -- confirm in the definitions.
	unsigned GetSeqIndex(unsigned uId) const;
	bool GetSeqIndex(unsigned uId, unsigned *ptruIndex) const;
	double GetOcc(unsigned uColIndex) const;
// FCOUNT is not declared in this header. All FCOUNT array/pointer parameters
// are non-const, i.e. the callee is allowed to write through them.
// NOTE(review): confirm which of them are actually outputs.
	void GetFractionalWeightedCounts(unsigned uColIndex, bool bNormalize,
	  FCOUNT fcCounts[], FCOUNT *ptrfcGapStart, FCOUNT *ptrfcGapEnd,
	  FCOUNT *fcGapExtend, FCOUNT *ptrfOcc,
	  FCOUNT *fcLL, FCOUNT *fcLG, FCOUNT *fcGL, FCOUNT *fcGG) const;
	bool IsGap(unsigned uSeqIndex, unsigned uColIndex) const;
	bool IsWildcard(unsigned uSeqIndex, unsigned uColIndex) const;
	bool IsGapColumn(unsigned uColIndex) const;
	bool ColumnHasGap(unsigned uColIndex) const;
	bool IsGapSeq(unsigned uSeqIndex) const;

// Mutators (non-const) interleaved with further const queries.
	void SetChar(unsigned uSeqIndex, unsigned uColIndex, char c);
	void SetSeqName(unsigned uSeqIndex, const char szName[]);
	void SetSeqId(unsigned uSeqIndex, unsigned uId);
	bool HasGap() const;
	bool IsLegalLetter(unsigned uLetter) const;
	void GetSeq(unsigned uSeqIndex, Seq &seq) const;
	void Copy(const MSA &msa);
	double GetCons(unsigned uColIndex) const;
	double GetAvgCons() const;
	double GetPctIdentityPair(unsigned uSeqIndex1, unsigned uSeqIndex2) const;
	bool GetSeqIndex(const char *ptrSeqName, unsigned *ptruSeqIndex) const;
	void DeleteCol(unsigned uColIndex);
	void DeleteColumns(unsigned uColIndex, unsigned uColCount);
	void CopySeq(unsigned uToSeqIndex, const MSA &msaFrom, unsigned uFromSeqIndex);
	void DeleteSeq(unsigned uSeqIndex);
//	void DeleteEmptyCols(bool bProgress = false);
	bool IsEmptyCol(unsigned uColIndex) const;

	unsigned GetGCGCheckSum(unsigned uSeqIndex) const;

	unsigned UniqueResidueTypes(unsigned uColIndex) const;

	void GetNodeCounts(unsigned uAlignedColIndex, NodeCounts &Counts) const;
	void ValidateBreakMatrices() const;
	unsigned GetCharCount(unsigned uSeqIndex, unsigned uColIndex) const;
	const char *GetSeqBuffer(unsigned uSeqIndex) const;
	unsigned AlignedColIndexToColIndex(unsigned uAlignedColIndex) const;
	unsigned GetSeqLength(unsigned uSeqIndex) const;
	void GetPWID(unsigned uSeqIndex1, unsigned uSeqIndex2, double *ptrdPWID,
	  unsigned *ptruPosCount) const;

	void GetPairMap(unsigned uSeqIndex1, unsigned uSeqIndex2, int iMap1[],
	  int iMap2[]) const;

	void LogMe() const;

	void GapInfoToDataBuffer(DataBuffer &Buffer) const;
	void GapInfoFromDataBuffer(const DataBuffer &Buffer);
	double GetPctGroupIdentityPair(unsigned uSeqIndex1, unsigned uSeqIndex2) const;

// Defined inline. Clear() only forwards to the private Free().
	void Clear()
		{
		Free();
		}
// Returns the stored sequence count (m_uSeqCount).
	unsigned GetSeqCount() const
		{
		return m_uSeqCount;
		}
// Returns the stored column count (m_uColCount).
	unsigned GetColCount() const
		{
		return m_uColCount;
		}

// Static members: callable without an MSA instance.
	static bool SeqsEq(const MSA &a1, unsigned uSeqIndex1, const MSA &a2,
	  unsigned uSeqIndex2);

	static void SetIdCount(unsigned uIdCount);

// Internal helpers; not accessible outside the class.
private:
	void Free();
	void AppendSeq(char *ptrSeq, unsigned uSeqLength, char *ptrLabel);
	void ExpandCache(unsigned uSeqCount, unsigned uColCount);
	void GetNameFromFASTAAnnotationLine(const char szLine[],
	  char szName[], unsigned uBytes);
	void CopyCol(unsigned uFromCol, unsigned uToCol);

// Instance state. m_uSeqCount / m_uColCount are the values reported by
// GetSeqCount() / GetColCount().
// NOTE(review): the m_uCache* fields together with ExpandCache() suggest that
// allocated capacity is tracked separately from the logical size, and that
// m_szSeqs / m_szNames are per-sequence arrays of character buffers --
// confirm in the implementation.
private:
	unsigned m_uSeqCount;
	unsigned m_uColCount;
	unsigned m_uCacheSeqLength;
	unsigned m_uCacheSeqCount;
	char **m_szSeqs;
	char **m_szNames;

// Static: a single value shared by every MSA object.
// NOTE(review): presumably assigned by SetIdCount() -- confirm.
	static unsigned m_uIdCount;

// NOTE(review): presumably id <-> sequence-index lookup tables; their
// allocation and sizing are not visible in this header.
	unsigned *m_IdToSeqIndex;
	unsigned *m_SeqIndexToId;
	};

// Free functions that take MSA objects; declared here, defined elsewhere.
// Parameters passed as const MSA & cannot be modified through that reference;
// parameters passed as non-const references (v, msa, msaOut, msaCat, msa1 of
// MSAAppend) may be modified by the callee.
// NOTE(review): the non-const reference parameters named msaOut / msaCat / v
// are presumably pure outputs -- confirm whether their prior contents matter.
//
// SeqVect is not declared in this header; it must be visible before this
// file is included.
void SeqVectFromMSA(const MSA &msa, SeqVect &v);
void DeleteGappedCols(MSA &msa);
void MSAFromColRange(const MSA &msaIn, unsigned uFromColIndex, unsigned uColCount,
  MSA &msaOut);
void MSACat(const MSA &msa1, const MSA &msa2, MSA &msaCat);
void MSAAppend(MSA &msa1, const MSA &msa2);
void MSAFromSeqSubset(const MSA &msaIn, const unsigned uSeqIndexes[], unsigned uSeqCount,
  MSA &msaOut);
// NOTE(review): the two Assert* functions return void; how a mismatch is
// reported (abort, log, exception) is not visible here.
void AssertMSAEq(const MSA &msa1, const MSA &msa2);
void AssertMSAEqIgnoreCaseAndGaps(const MSA &msa1, const MSA &msa2);
void MSASubsetByIds(const MSA &msaIn, const unsigned Ids[], unsigned uIdCount,
  MSA &msaOut);
// The three weight setters take the MSA by non-const reference.
void SetMSAWeightsMuscle(MSA &msa);
void SetClustalWWeightsMuscle(MSA &msa);
void SetThreeWayWeightsMuscle(MSA &msa);

#endif	// MSA_H