File: edit_distance.cpp

package info (click to toggle)
wham-align 0.1.5-8
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, bullseye, forky, sid, trixie
  • size: 892 kB
  • sloc: cpp: 8,769; sh: 76; makefile: 52
file content (374 lines) | stat: -rw-r--r-- 8,918 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
#include <string>

#include "edit_distance.h"

using namespace std;

int ** F;
char ** traceback;

/* Description of edit_distance algo
 * 	given q_seq and db_seq it return alignment between these 2 sequence
 *
 *   we have qlen*dlen edit_distance matrix.
 *   0th row is intitalized with 0
 *   0th columng is initialized with their row number
 *
 *   [i,j] = min ( [i-1][j]+1 , [i][j-1]+1 , [i-1][j-1] + {0 if db[i] == query[j] , 1 otherwise }
 *   this is very similar to Needlman+Wunsch global alignment alog.
 *
 *	 traceBack matrix of same size is also created. depending upon which of three are minimum traceback is initialized
 *
 *	 once we have traceback. we look at last row and start building alignment from lowest column in last row.
 *
 *	 for example if last row is     7,6,6,5,4,3,2,4,5. then we start building alignment from 3.
 *	  reason we do this is because dblen is more then qlen ( to adjust gaps) so if we have better score some where in
 *	  last row means last few character of dbseq can be ignored
 *
 *	  once we have starting location we follow traceback matrix and start building the alignment.
 *
 *	  this way alignment is built in reverse order need to modify the order.
 *
 *
 *
 */

void edit_distance_init() {
  int i;
  int L1 = 99;
  int L2 = 99;

  // Dynamic programming matrix
  F = (int **) malloc(sizeof(int *) * (L2 + 1));
  for (i = 0; i <= L2; i++)
    F[i] = (int *) malloc(sizeof(int) * (L1 + 1));

  // Traceback matrix
  traceback = (char **) malloc(sizeof(char *) * (L2 + 1));
  for (i = 0; i <= L2; i++)
    traceback[i] = (char *) malloc(sizeof(char) * (L1 + 1));

}

scoreinfo edit_distance(char *q_seq, char *db_seq, int q_len, int db_len,
    char *align_query, char *align_db, int prm) {

  int gap_penalty = obj_score_mat.gap_penalty; /* gap penalty */

  // can not assume that q_seq and db_seq are null terminated
  int L1 = db_len; //strlen(db_seq);
  int L2 = q_len; //strlen(q_seq);
  int i;
#if EDIT_DEBUG
  printf("db_len : %d , query_len : %d \n",L1,L2);
#endif
  //printf("inside nw() %s %s ",db_seq,q_seq);

  // Initialize traceback and F matrix (fill in first row and column)
  matrix_init(F, traceback, L1, L2, gap_penalty);

  // Create alignment
  scoreinfo ret = edit_distance_align(F, traceback, q_seq, db_seq, q_len,
      db_len, align_query, align_db, gap_penalty);

  //print_score(ret);
#if EDIT_DEBUG
  cout << "Length after alignment: " << ret.align_len << endl;
#endif

  if (prm) {
    printf("\nEdit Distance matrix:\n\n");
    print_matrix(F, db_seq, q_seq, db_len, q_len);

    printf("\nTraceback matrix: \n\n");
    print_traceback(traceback, db_seq, q_seq, db_len, q_len);

    printf("\n");
  }
  //for( int i = 0; i <= L2; i++ )  delete F[ i ];
  //delete []  F;
  //for( int i = 0; i <= L2; i++ )  delete traceback[ i ];
  //delete [] traceback;

  return ret;
}

void matrix_init(int ** F, char ** traceback, int L1, int L2, int d) {
  F[0][0] = 0;
  traceback[0][0] = 'n';

  int i = 0, j = 0;

  // initialize 1st row to 0
  for (j = 1; j <= L1; j++) {
    F[0][j] = 0;
    traceback[0][j] = '-';
  }
  for (i = 1; i <= L2; i++) {
    F[i][0] = i;
    traceback[i][0] = '|';
  }
}

/*
 * Logic of edit_distance Alignment.
 *  given two string q_seq and db_seq. we perform edit_distance based alignment.
 *
 *  matrix F is populated using dynamic programming appraoch.
 *
 *
 *
 *
 */

scoreinfo edit_distance_align(
    // Needleman-Wunsch algorithm
    int ** F, char ** traceback, char * q_seq, char * db_seq, int q_len,
    int db_len, char *query_align, char *db_align, int gap_penalty // Gap penalty
    ) {
  //char * db_align = tmp_seq_1_al;
  //char * query_align = tmp_seq_2_al;

  int k = 0, x = 0, y = 0;
  int fU, fD, fL;
  char ptr; //, nuc ;
  int i = 0, j = 0;
  scoreinfo ret = { 0, 0, 0, 0, 0, 0, 0 };

  int L1 = db_len;
  int L2 = q_len;
  int s;
  for (i = 1; i <= L2; i++) {
    for (j = 1; j <= L1; j++) {
      //nuc = seq_1[ j-1 ] ;
      if (db_seq[j - 1] == q_seq[i - 1]) {
        s = 0;
      } else {
        s = 1;
      }

      fU = F[i - 1][j] + 1;
      fD = F[i - 1][j - 1] + s;
      fL = F[i][j - 1] + 1;

      F[i][j] = min(fU, fD, fL, &ptr);

      traceback[i][j] = ptr;
    }
  }
  i--;
  j--;

  // instead of starting look back from [qlen][dlen] start from minimum in [qlen] row.
  int loc_j = j;
  ret.score = F[i][j];
  for (int jj = j; jj >= 0; jj--) {
    if (ret.score > F[i][jj]) { //TODO : we need to check if we should move when min score is equal to score on left . moving when equal might put more gap.
      ret.score = F[i][jj];
      loc_j = jj;
    }
  }

#ifdef EDIT_DEBUG
  printf("Best score %d found at [%d][%d]",ret.score,i,loc_j);
#endif

  j = loc_j;
  //print_traceback(traceback,seq_1,seq_2);
  int kk = 0;
  while (i > 0 || j > 0) {
    //int tmp_db = strlen(db_align),tmp_q = strlen(query_align),tmp_i=i,tmp_j = j;
    char tmp_trace = traceback[i][j];
    switch (traceback[i][j]) {
    case '|':
      db_align[k] = '-';
      query_align[k] = q_seq[i - 1]; // db_align is initialized to 0 so direct assignment can be made here
      i--;
      ret.dgap++;
      break;

    case '\\':
      db_align[k] = db_seq[j - 1];
      query_align[k] = q_seq[i - 1];

      if (db_seq[j - 1] != q_seq[i - 1])
        ret.mm++;
      else
        ret.n_iden++;

      i--;
      j--;
      break;

    case '-':
      db_align[k] = db_seq[j - 1];
      query_align[k] = '-';
      ret.qgap++;
      j--;
    }
    k++;
  }
  ret.align_len = k;

  //print_score(ret);

  // check if gap occur at any end of query_align ( qstring ) and if it occurs then correct the score and qgap and accordingly
  while (query_align[0] == '-') { // gap at beginning ( or at the end of actual alignment currently its reversed )
    ret.qgap--;

    int l1 = ret.align_len;

    // move alignment to right its equivalent to saying query_align = query_align+1;
    for (int i = 0; i < l1; i++) {
      query_align[i] = query_align[i + 1];
      db_align[i] = db_align[i + 1];
    }

    ret.align_len--;

  }
  //print_score(ret);

  while (query_align[ret.align_len - 1] == '-') {
    ret.pos++; // string is reversed righnow to pos should increase when its at the end
    ret.qgap--;
    //ret.score -= obj_score_mat.gap_penalty;
    query_align[ret.align_len - 1] = '\0';
    db_align[ret.align_len - 1] = '\0';

    ret.align_len--;
  }

  //print_score(ret);

  reverse_str(db_align, ret.align_len);
  reverse_str(query_align, ret.align_len);

  return ret;
}

int min(int fu, int fd, int fl, char * ptr) {
  int min = 0;

  if (fd <= fu && fd <= fl) {
    min = fd;
    *ptr = '\\';
  } else if (fu < fl) {
    min = fu;
    *ptr = '|';
  } else {
    min = fl;
    *ptr = '-';
  }

  return min;
}

void print_matrix(int ** F, char * seq_1, char * seq_2, int L1, int L2) {
  int i, j;
  printf("        ");
  for (j = 0; j < L1; j++) {
    printf("%c  ", seq_1[j]);
  }
  printf("\n");

  for (i = 0; i <= L2; i++) {
    if (i > 0) {
      printf("%c  ", seq_2[i - 1]);
    }
    for (j = 0; j <= L1; j++) {
      //cout.width( 3 );
      printf("%d ", F[i][j]);
    }
    printf("\n");
  }
}

void print_traceback(char ** traceback, char * seq_1, char * seq_2, int L1,
    int L2) {

  char line[100] = "";
  int i, j;
  printf("    ");
  for (j = 0; j < L1; j++) {
    char tmp[4] = "";
    sprintf(tmp, "%c ", seq_1[j]);
    strcat(line, tmp);
  }
  printf("  %s \n", line);
  line[0] = '\0';
  for (i = 0; i <= L2; i++) {
    char tmp[4] = "";
    if (i > 0) {
      sprintf(tmp, "%c ", seq_2[i - 1]);
      strcat(line, tmp);
    }
    for (j = 0; j <= L1; j++) {
      sprintf(tmp, "%c ", traceback[i][j]);
      strcat(line, tmp);
    }

    printf("%s \n", line);
    line[0] = '\0';
  }
}

void print_al(char * db_align, char * query_align) {
  printf("DB:\t%s\n", db_align);
  printf("Q:\t%s\n", query_align);

}

void initialize_str(char *str, int len) {
  int i;
  for (i = 0; i < len; i++) {
    str[i] = '\0';
  }
}

void reverse_str(char *str, int len) {
  char tmp;
  //int len= strlen(str),i;
  for (int i = 0; i < len / 2; i++) {
    tmp = str[i];
    str[i] = str[len - 1 - i];
    str[len - 1 - i] = tmp;
  }
}

void print_score(scoreinfo s) {
  printf(
      " Score: %d, n_iden: %d,Pos: %d Mismatch: %d, dgap: %d, s.qgap: %d, align_len %d\n",
      s.score, s.n_iden, s.pos, s.mm, s.dgap, s.qgap, s.align_len);
}

/*
 *
 */
#ifdef EXE
int main(int argc,char *argv[])
{
  char *query = argv[1];
  char *db = argv[2];

  if(argc != 4 ) {
    printf("%d , usage : ./a.out query db prm",argc);
    return -1;
  }
  int prm = atoi(argv[3]);

  int q_len = strlen(query);
  int db_len = strlen(db);

  char query_align[MAX],db_align[MAX];

  initialize_str(query_align,MAX);
  initialize_str(db_align,MAX);

  scoreinfo score = edit_distance(query,db,q_len,db_len,query_align,db_align,prm);

  print_score(score);

  print_al(db_align,query_align);
}
#endif