1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296
|
#include "default.h"
#include "seq_util.h"
/** randomizes seq[] by shuffling, and places the result in randseq[];
if randseq[] and seq[] are distinct, seq[] is left unchanged */
void shuffle_seq(int len,
char seq[],
char randseq[])
{
int i,j;
char c;
for (i=0;i<len;i++) {
j=rand()%len; /* CHOOSE RANDOM POSITION j */
c=seq[i]; /* SWAP POSITIONS i AND j */
randseq[i]=seq[j];
randseq[j]=c;
}
return;
}
/**@memo TRANSLATE FROM ASCII LETTERS TO COMPACTED NUMBERICAL INDEX:
index_symbols(seq[i].length,seq[i].sequence,seq[i].sequence,
m->nsymbol,m->symbol);
*/
/** converts characters in seq[] to the INDEX of the matching character in
symbols[], and returns the result in out[] */
void index_symbols(int nseq,char seq[],char out[],
int nsymbs,char symbols[])
{
int i,j,k;
LOOP (i,nseq) {
k=nsymbs-1; /* DEFAULT: UNMATCHABLE SYMBOL */
LOOP (j,nsymbs) { /* FIND MATCHING SYMBOL */
if (symbols[j]==seq[i]) { /* FOUND IT! */
k=j;
break;
}
}
out[i]=k; /* SAVE THE TRANSLATED CODE */
}
return;
}
int *Score_matrix_row=NULL;
int best_match_qsort_cmp(const void *void_a,const void *void_b)
{
int *a=(int *)void_a,*b=(int *)void_b;
if (Score_matrix_row[*a]>Score_matrix_row[*b])
return -1;
else if (Score_matrix_row[*a]<Score_matrix_row[*b])
return 1;
else /* EQUAL */
return 0;
}
#ifdef SOURCE_EXCLUDED
char DNA_symbols[1024];
float DNA_rescale_score;
#endif
/** reads an alignment scoring matrix in the pam format */
int read_score_matrix(char filename[],ResidueScoreMatrix_T *m)
{
int i,j,k,nsymb=0,found_symbol_line=0,isymb;
char line[1024],dna_codes[256];
FILE *ifile;
/* GAP PENALTY DEFAULTS */
m->gap_penalty_set[0][0]=m->gap_penalty_set[1][0]=12; /*SAVE PENALTIES*/
m->gap_penalty_set[0][1]=m->gap_penalty_set[1][1]=2;
m->gap_penalty_set[0][2]=m->gap_penalty_set[1][2]=0;
m->trunc_gap_length = TRUNCATE_GAP_LENGTH;
m->decay_gap_length = DECAY_GAP_LENGTH;
ifile=fopen(filename,"r");
if (!ifile) {
WARN_MSG(USERR,(ERRTXT,"Can't open alignment matrix from %s\n",filename),"$Revision: 1.2.2.2 $");
return -2; /* FAILED TO FIND FILE TO READ */
}
while (fgets(line,1023,ifile)) {
if ('#'==line[0] || '\n'==line[0]) /* SKIP COMMENT OR BLANK LINES */
continue;
else if (1==sscanf(line,"GAP-TRUNCATION-LENGTH=%d",&i)) {
m->trunc_gap_length = i;
}
else if (1==sscanf(line,"GAP-DECAY-LENGTH=%d",&i)) {
m->decay_gap_length = i;
}
else if (3==sscanf(line,"GAP-PENALTIES=%d %d %d",&i,&j,&k)) {
m->gap_penalty_set[0][0]=m->gap_penalty_set[1][0]=i; /*SAVE PENALTIES*/
m->gap_penalty_set[0][1]=m->gap_penalty_set[1][1]=j;
m->gap_penalty_set[0][2]=m->gap_penalty_set[1][2]=k;
}
else if (3==sscanf(line,"GAP-PENALTIES-X=%d %d %d",&i,&j,&k)) {
m->gap_penalty_set[1][0]=i; /*SAVE PENALTIES ONLY FOR X DIRECTION*/
m->gap_penalty_set[1][1]=j;
m->gap_penalty_set[1][2]=k;
}
#ifdef SOURCE_EXCLUDED
else if (1==sscanf(line,"DNACODES=%99s",dna_codes)) { /* READ DNACODES*/
strcpy(DNA_symbols,dna_codes);/*SYMBOLS COUNTED AS DNA FOR AUTORECOG*/
}
else if (1==sscanf(line,"DNASCALE=%f",&DNA_rescale_score))
continue;
#endif
else if (!found_symbol_line) { /* READ THIS LINE AS LIST OF SEQ SYMBOLS*/
for (i=0;'\0'!=line[i];i++)
if (!isspace(line[i])) /* IGNORE WHITESPACE */
m->symbol[nsymb++]=line[i]; /* SAVE TO LIST OF SYMBOLS */
found_symbol_line=1; /* SET FLAG SO WE NOW READ MATRIX SCORE VALUES */
}
else { /* READ SCORING MATRIX LINES */
found_symbol_line=0; /* DEFAULT: FAILED TO FIND MATCHING SYMBOL IN LIST*/
LOOP (isymb,nsymb) /* FIND MATCH TO THIS SYMBOL */
if (m->symbol[isymb]==line[0]) {
found_symbol_line=1; /* SIGNAL THAT WE SUCCESFULLY FOUND MATCH */
j=1; /* SKIP FIRST CHARACTER: OUR SEQUENCE SYMBOL */
LOOPF (i,nsymb) { /* READ ALL THE SCORE VALUES ON THIS LINE */
if (1==sscanf(line+j,"%d%n",&(m->score[isymb][i]),&k))
j+=k; /* ADVANCE THE READING POSITION */
else { /* MISSING SCORE DATA: ERROR! */
IF_GUARD(1,5.23,(ERRTXT,"Missing score value for pair %c:%c",
m->symbol[isymb],m->symbol[i]),TRAP)
;
fclose(ifile); /* CLOSE OUR STREAM */
return -1;
}
}
break;
}
IF_GUARD(!found_symbol_line,1.5,(ERRTXT,"Missing or unknown sequence symbol: %c",line[0]),TRAP) { /* ERROR: AN INVALID SYMBOL, NOT IN LIST */
fclose(ifile); /* CLOSE OUR STREAM */
return -1;
}
}
}
fclose(ifile);
/* CONSTRUCT GAP PENALTY ARRAYS FROM GAP PARAMETERS: */
m->max_gap_length = m->trunc_gap_length + m->decay_gap_length;
CALLOC (m->gap_penalty_x, m->max_gap_length+2, LPOScore_T);
CALLOC (m->gap_penalty_y, m->max_gap_length+2, LPOScore_T);
/*** GAP OPENING PENALTY @ L=0->1 */
m->gap_penalty_x[0] = m->gap_penalty_set[0][0];
m->gap_penalty_y[0] = m->gap_penalty_set[1][0];
/*** 1st AFFINE EXTENSION PENALTY (A1) @ L=1->2,2->3,...T-1->T */
for (i=1;i<m->trunc_gap_length;i++) {
m->gap_penalty_x[i] = m->gap_penalty_set[0][1];
m->gap_penalty_y[i] = m->gap_penalty_set[1][1];
}
/*** DECAYING EXTENSION PENALTY (A1-->A2; skipped if D=0) @ L=T->T+1,...T+D-1->T+D */
for (i=0;i<m->decay_gap_length;i++) {
double dec_x = (m->gap_penalty_set[0][1] - m->gap_penalty_set[0][2]) / ((double)(m->decay_gap_length + 1));
double dec_y = (m->gap_penalty_set[1][1] - m->gap_penalty_set[1][2]) / ((double)(m->decay_gap_length + 1));
m->gap_penalty_x[i+m->trunc_gap_length] = m->gap_penalty_set[0][1] - (i+1) * dec_x;
m->gap_penalty_y[i+m->trunc_gap_length] = m->gap_penalty_set[1][1] - (i+1) * dec_y;
}
/*** 2nd AFFINE EXTENSION PENALTY (A2) @ L>=T+D */
m->gap_penalty_x[m->max_gap_length] = m->gap_penalty_set[0][2];
m->gap_penalty_y[m->max_gap_length] = m->gap_penalty_set[1][2];
m->gap_penalty_x[m->max_gap_length+1] = 0; /* DON'T REMOVE THIS!... SPECIAL STATE USED IN align_lpo. */
m->gap_penalty_y[m->max_gap_length+1] = 0; /* DON'T REMOVE THIS!... SPECIAL STATE USED IN align_lpo. */
LOOPF (i,nsymb) {
Score_matrix_row= m->score[i]; /* ROW TO USE FOR SORTING best_match */
LOOP (j,nsymb)
m->best_match[i][j] = j;
qsort(m->best_match[i],nsymb,sizeof(m->best_match[0][0]),
best_match_qsort_cmp);
#ifdef SOURCE_EXCLUDED
printf("%c SORT",m->symbol[i]); /* TEST: PRINT OUT SORTED TABLE */
LOOPF (j,nsymb)
printf("\t%c:%d",m->symbol[m->best_match[i][j]],
m->score[i][m->best_match[i][j]]);
printf("\n");
#endif
}
m->symbol[nsymb]='\0'; /* TERMINATE THE SYMBOL STRING */
m->nsymbol=nsymb;
return nsymb;
}
/** prints a scoring matrix, only including those symbols in subset[] */
void print_score_matrix(FILE *ifile,ResidueScoreMatrix_T *m,char subset[])
{
int i,i_m,j,j_m,nsubset;
nsubset=strlen(subset);
printf(" ");
LOOPF (i,nsubset)
printf(" %c",subset[i]);
printf("\n");
LOOPF (i,nsubset) {
LOOP (i_m,m->nsymbol)
if (m->symbol[i_m]==subset[i])
break;
printf("%c",subset[i]);
LOOPF (j,nsubset) {
LOOP (j_m,m->nsymbol)
if (m->symbol[j_m]==subset[j])
break;
printf("%3d",m->score[i_m][j_m]);
}
printf("\n");
}
return;
}
/** restricts seq[] to the set of allowed characters given by symbol[];
other characters will be replaced by the default symbol[0] */
int limit_residues(char seq[],char symbol[])
{
int i,len,nreplace=0;
len=strlen(seq);
for (i=strspn(seq,symbol);i<len;i=i+1+strspn(seq+i+1,symbol)) {
seq[i]=symbol[0]; /* FORCE IT TO BE A VALID SYMBOL */
nreplace++; /* COUNT TOTAL REPLACEMENTS */
}
return nreplace;
}
/** RETURNS THE COMPLEMENTARY BASE *IN LOWER CASE* */
char complementary_base(char base)
{
switch (base) {
case 'A': case 'a': return 't';
case 'T': case 't': case 'U': case 'u': return 'a';
case 'G': case 'g': return 'c';
case 'C': case 'c': return 'g';
default: return base;
}
}
/** REVERSE COMPLEMENTS seq[] IN PLACE, AND RETURNS POINTER TO seq
---------------------------------------------------------------
-----------------------------------------------------------
*/
char *reverse_complement(char seq[])
{
int i,j;
char c;
for ((i=0),(j=strlen(seq)-1);i<=j;i++,j--) {/* SWAP FROM ENDS TO CENTER*/
c=complementary_base(seq[i]); /* SWAP ENDS AND COMPLEMENT */
seq[i]=complementary_base(seq[j]);
seq[j]=c;
}
return seq;
}
|