1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245
|
#include "pilercr.h"
#include "multaln.h"
#define TRACE 0
static const char Letter[5] = { 'A', 'C', 'G', 'T', '-'};
static void GetCounts(const MSA &Aln, int ColIndex, int Counts[5])
{
memset(Counts, 0, 5*sizeof(unsigned));
const int SeqCount = Aln.GetSeqCount();
for (int SeqIndex = 0; SeqIndex < SeqCount; ++SeqIndex)
{
char c = Aln.GetChar(SeqIndex, ColIndex);
c = toupper(c);
switch (c)
{
case 'a':
case 'A':
++(Counts[0]);
break;
case 'c':
case 'C':
++(Counts[1]);
break;
case 'g':
case 'G':
++(Counts[2]);
break;
case 't':
case 'T':
++(Counts[3]);
break;
case '-':
++(Counts[4]);
break;
}
}
}
static double GetCons(const MSA &Aln, int Col, char *ptrc)
{
int Counts[5];
GetCounts(Aln, Col, Counts);
int MaxCount = -1;
char c = '?';
for (int i = 0; i < 5; ++i)
{
if (Counts[i] > MaxCount)
{
MaxCount = Counts[i];
c = Letter[i];
}
}
*ptrc = c;
return (double) MaxCount / (double) Aln.GetSeqCount();;
}
#if TRACE
static void LogCol(const MSA &Aln, int Col)
{
int SeqCount = Aln.GetSeqCount();
for (int i = 0; i < SeqCount; ++i)
Log("%c", Aln.GetChar(i, Col));
}
#endif
// Column is conserved if the two preceding and two following columns
// are conserved.
static void GetSmoothedCons(const MSA &Aln, double MinCons, BoolVec &ColCons)
{
int ColCount = (int) Aln.GetColCount();
ColCons.resize(ColCount, false);
for (int Col = 0; Col < ColCount; ++Col)
{
char c;
ColCons[Col] = (GetCons(Aln, Col, &c) >= MinCons);
}
for (int Col = 0; Col < ColCount; ++Col)
{
if (ColCons[Col])
continue;
if (!ColCons[Col] &&
(Col > 0 && ColCons[Col-1]) &&
(Col > 1 && ColCons[Col-2]) &&
(Col < ColCount - 1 && ColCons[Col+1]) &&
(Col < ColCount - 2 && ColCons[Col+2]))
ColCons[Col] = true;
}
}
static void FindStartEnd(const MSA &Aln, double MinCons, int *ptrStart, int *ptrEnd)
{
BoolVec ColCons;
GetSmoothedCons(Aln, MinCons, ColCons);
int BestStart = 0;
int BestEnd = 0;
int Length = 0;
int Start = 0;
int ColCount = (int) Aln.GetColCount();
for (int Col = 0; Col <= ColCount; ++Col)
{
#if TRACE
if (Col != ColCount)
LogCol(Aln, Col);
#endif
bool ConsOk = (Col != ColCount && ColCons[Col]);
bool ContinueBlock = (Col != ColCount && ConsOk);
#if TRACE
if (ContinueBlock)
Log(" cont\n");
else
Log(" ** END BLOCK **\n");
#endif
if (ContinueBlock)
{
if (Start == -1)
{
Start = Col;
Length = 1;
}
else
++Length;
}
else
{
if (Length > BestEnd - BestStart + 1)
{
BestStart = Start;
BestEnd = Col - 1;
if (BestEnd - BestStart + 1 != Length)
Quit("BestEnd");
}
Length = -1;
Start = -1;
}
}
*ptrStart = BestStart;
*ptrEnd = BestEnd;
}
static char ConsNextChar(const MSA &Aln, int EndCol)
{
const unsigned SeqCount = Aln.GetSeqCount();
const unsigned ColCount = Aln.GetColCount();
char Cons = '-';
for (unsigned SeqIndex = 0; SeqIndex < SeqCount; ++SeqIndex)
{
for (unsigned Col = EndCol + 1; Col < ColCount; ++Col)
{
char c = Aln.GetChar(SeqIndex, Col);
if (c == '-')
continue;
else
{
if (Col == EndCol + 1)
Cons = c;
else if (Cons != c)
return '-';
break;
}
}
}
return Cons;
}
void GetConsSeq(const MSA &Aln, double MinCons, int *ptrStartCol,
int *ptrEndCol, Seq &ConsSeq)
{
int SeqCount = Aln.GetSeqCount();
if (SeqCount == 0)
{
ConsSeq.clear();
*ptrStartCol = -1;
*ptrEndCol = -1;
return;
}
int ColCount = Aln.GetColCount();
int StartCol = 0;
int EndCol = ColCount - 1;
FindStartEnd(Aln, MinCons, &StartCol, &EndCol);
for (int Col = StartCol; Col <= EndCol; ++Col)
{
char c;
GetCons(Aln, Col, &c);
if (c != '-')
ConsSeq.push_back(c);
}
// Eggregious hack to compensate for missing the end
// of the concensus sequence sometimes. Check the
// following letter to see if it is conserved.
char Nextc = ConsNextChar(Aln, EndCol);
if (Nextc != '-')
ConsSeq.push_back(Nextc);
*ptrStartCol = StartCol;
*ptrEndCol = EndCol;
}
char GetConsSymbol(const MSA &Aln, unsigned Col)
{
int Counts[5];
GetCounts(Aln, (int) Col, Counts);
const int SeqCount = (int) Aln.GetSeqCount();
int BigCount = 0;
int BigLetter = 0;
for (int i = 0; i < 5; ++i)
{
if (Counts[i] > BigCount)
{
BigCount = Counts[i];
BigLetter = i;
}
}
if (BigLetter == 4)
return ' ';
if (BigCount == SeqCount)
return '*';
if (((double) BigCount / (double) SeqCount) >= g_ColonThresh)
return ':';
if (Counts[BigLetter] == SeqCount - 1 && SeqCount > 5)
return ':';
return ' ';
}
void GetConsSymbols(const MSA &Aln, Seq &ConsSymbols)
{
const unsigned ColCount = Aln.GetColCount();
for (unsigned Col = 0; Col < ColCount; ++Col)
{
char c = GetConsSymbol(Aln, Col);
ConsSymbols.push_back(c);
}
}
|