File: protomat.h

package info (click to toggle)
blimps 3.9%2Bds-1
  • links: PTS, VCS
  • area: non-free
  • in suites: bookworm, bullseye, buster
  • size: 6,812 kB
  • sloc: ansic: 43,271; csh: 553; perl: 116; makefile: 99; cs: 27; cobol: 23
file content (264 lines) | stat: -rw-r--r-- 11,669 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
/*------------------------------------------------------------------------*/
/*(C) Copyright 1991-2006, Fred Hutchinson Cancer Research Center         */
/*      motifj.h  Header file for PROTOMAT programs                       */
/* NOTE for Silicon Graphics users:  The type of scores in
       struct score should be changed from char to int to get correct
       processing (but not for SUN!)                                      */
/*------------------------------------------------------------------------*/
/*  6/29/90 J. Henikoff
    1/28/99 Increased SNAMELEN from 11 to 18; IDLEN from 10 to 12
    2/21/00 Added id->full_entry to struct db_id
    8/20/01 Increased MAXSEQS and MAXFREQ from 400 to 600
    1/ 2/06 Increase MAXSEQS and MAXFREQ from 600 to 1000
   12/23/06 Increased SNAMELEN from 18 to 20
--------------------------------------------------------------------------*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

/* General-purpose constants: program version, boolean flag values and the
   ASCII codes of a few control characters. */
#define VERSION		   8		/*  motifj version number */
#define YES                1		/*  flag value: true */
#define NO                 0		/*  flag value: false */
#define ESC               27		/*  ASCII escape */
#define CR                13		/*  ASCII carriage return */
#define LF                10		/*  ASCII line feed */
/* UMIN / UMAX: the smaller / larger of two values.
   Every argument is parenthesized so that expressions containing
   low-precedence operators (e.g. UMIN(a | 1, b)) expand correctly.
   Each argument may still be evaluated twice, so do not pass
   expressions with side effects (i++, function calls that mutate). */
#define UMIN(x, y)	  ( ((x) < (y)) ? (x) : (y) )   /* UNIX min macro */
#define UMAX(x, y)	  ( ((x) > (y)) ? (x) : (y) )   /* UNIX max macro */
/*   INDEXCOL & INDEXCOLROW compute the sequential indices for the lower half
     of an nxn symmetric matrix given row & column coordinates.  Lower half
     has n(n-1)/2 entries; col=0,n-2 and row=col+1,n-1; col has n-col-1 rows.
     Index runs from 0 to n(n-1)/2 - 1 down columns with
     (index=0)==(col=0,row=1) and (index=n(n-1)/2-1)==(col=n-2,row=n-1).
     INDEXCOL(n, col) is the index of the first entry of column col, i.e.
     INDEXCOL(n, col) == INDEXCOLROW(n, col, col+1).
     Arguments are parenthesized so that compound expressions such as
     INDEXCOL(n, c+1) expand correctly; each argument may be evaluated more
     than once, so avoid side effects.  */
#define INDEXCOL(n, col)    ( (col)*(n) - ((col)*((col)+1))/2 )
#define INDEXCOLROW(n, col, row)  ( (col)*(n) - ((col)*((col)+3))/2 - 1 + (row) )

/* randomize(): seed rand() from the current calendar time.
   Expands to calls of srand() (<stdlib.h>) and time() (<time.h>), so both
   headers must be in scope wherever the macro is used. */
#define randomize()       srand((unsigned int)time(NULL))  /* Seed rand() */

/* Compile-time limits and defaults.  Several of these size fixed arrays
   inside the structures declared later in this header (MAX_DISTANCE,
   MAXSEQS, MAXFREQ, MAX_DOMAIN_WIDTH, MAX_MERGE_WIDTH, RELEVANT_MOTIFS),
   so changing them changes the size of those structures. */
#define MAX_DISTANCE  	  24	/* Max spacing between aminos of motif */
#define MIN_DISTANCE      1     /* Min distance specification */
#define MAXSEQS	  	  1000  /* Max number of sequences to be analyzed */
#define MINSEQS           2     /* Min number of sequences to be analyzed */
#define MAXFREQ           1000  /* Max occurrences of motif in all seqs */
#define MAX_LENGTH	  5500  /* Max length of each sequence */
#define MIN_DOMAIN_WIDTH  10	/* Minimum width */
#define MAX_DOMAIN_WIDTH  55	/* Maximum width */
#define MAX_MERGE_WIDTH   55	/* Max. width of merged blocks */
#define RELEVANT_MOTIFS   50	/* Only top scoring motifs are retained */
#define MAX_MOTIFS	  100	/* Buffer motifs before discarding */
#define MINSCORE          1     /* Min block trimming column score (0-2500)*/
#define CLTHRES            80   /* Clustering identity percentage (0-100)*/
#define DROPSCORE         -10   /* Default std devs *10 for dropping block */
#define MOTAUTO4            3   /* max. # motifs for run type 4  */
#define MOTAUTO3	    6   /* min. # motifs for run type 3 */
#define MAXBLK             15   /* max # blocks for shotgun assembly */
#define MAXTITLE	   75   /* max sequence title length */

/* File-name conventions and buffer sizes.  SNAMELEN and FNAMELEN size the
   character arrays of struct db_id; MATSIZE sizes the matrix of
   struct score. */
#define PROTEIN_SUBDIRECTORY "pros/"  /* Subdirectory containing proteins */
#define PROTEIN_EXTENSION    ".pro"    /* Extension for all protein files */
#define READ                 "r"       /* Code to read disk files */
				       /* NOTE(review): presumably an fopen() mode string -- confirm */
#define SNAMELEN              20       /* Max length of sequence name */
#define IDLEN                 12       /* Max length of db id */
#define FNAMELEN             160       /* Max length of file name */
#define MAXLINE               480      /* Max line length for ASCII file */
#define MATSIZE               21       /* Scoring matrix dimension */
#define HIGHPASS              4	       /* Default high pass filter value */

/* round(x): round x to the nearest int, with halves rounded away from zero
   (2.5 -> 3, -2.5 -> -3).  The result is an int, not a double.
   The argument is parenthesized so that compound expressions (for example
   a conditional expression) expand correctly; it is still evaluated twice,
   so avoid arguments with side effects.
   NOTE: this macro has the same name as the C99 library function round();
   including <math.h> after this header may fail to compile because the
   library declaration would be macro-expanded. */
#define round(x) (((x) >= 0.0) ? (int) ((x) + 0.5) : (int) ((x) - 0.5))

/* Declare new data types */
/* aa_type: a 20 x 20 x MAX_DISTANCE table of pointers to unsigned char.
   NOTE(review): presumably indexed by [first amino acid][second amino acid]
   [spacing between them] -- confirm against the code that fills it. */
typedef unsigned char *aa_type[20][20][MAX_DISTANCE];

/* Information kept for each motif.
   NOTE: the small integer members are declared unsigned char to save space,
	 so the values stored in them must not exceed 255. */
struct motif_struct {
  unsigned char aa1;
  unsigned char aa2;
  unsigned char aa3;
  unsigned char distance1;
  unsigned char distance2;
  int freq;			/* number of sequences with this motif */
  int dups;
  /* Occurrence tables, freq entries each: pos[x] is the offset of the
     motif within sequence number seq_no[x] -- NOT within sequence
     number x. */
  int seq_no[MAXFREQ];		/* sequence numbers that have the motif */
  int pos[MAXFREQ];		/* offset of the motif in seq_no[x] */
  int score;
  int scores[MAX_DOMAIN_WIDTH];
  int domain;
  int mots;
  char group;
  char sub_group;
  };

/* Group information used by the motif map routine. */
struct group_struct {
  int group_no;
  int sub_no;
  int position;
  };

/*-------------------------------------------------------------------*/
/*  A merged_motif entry holds one or more motifs merged together.   */
/*    It can be thought of as a block of sequences aligned around    */
/*    all of those motifs.  Entries are kept in an array.            */
/*  Each entry embeds three int[MAXSEQS] arrays (leftpos, cluster,   */
/*    position), so it is large; pass it by pointer.                 */
/*-------------------------------------------------------------------*/
struct merged_motif {
	int dropped;			/* YES if dropped */
	char aa[3];			/* Amino acid motif */
	int nmotif;			/* number of motifs, >= 0  */
					/* 0 => merged (inactive) */
	int nident;			/* number of identities */
					/* ..occurs >=SIGNIF in a col. */
	int max_score;			/* max. score of merged motifs */
	int domain;			/* displacement of merged motif blocks */
					/* block width = domain+1 */
	int distance;			/* width of motifs within block */
	int dups;			/* total # dups in all seqs */
	int loffset;			/* 1st position of motif within block */
	int leftpos[MAXSEQS];		/* leftmost position of motif   */
					/*... for each sequence */
	int cluster[MAXSEQS];		/* cluster number for each seq */
	int scores[MAX_MERGE_WIDTH];	/* column scores */
	int t_loffset;			/* 1st position of motif within */
					/*  trimmed block */
	int t_domain;			/* trimmed block width = t_domain+1 */
	int t_score;			/* score over trimmed block */
	int position[MAXSEQS];		/* position of block within each seq */
	int maxpos;			/* max position in any seq */
	int minpos;			/* min position in any seq */
	int in_degree;			/* in-degree of block in DAG */
	int out_degree;			/* out-degree of block in DAG */
};

/*------------------------------------------------------------------------*/
/*   sequences contains all information about the sequences.              */
/*   All four pointer members refer to storage that is not allocated by   */
/*   this header; who allocates and frees it is not visible here.         */
/*------------------------------------------------------------------------*/
struct sequences {
	int num;		/* number of sequences */
	int totlen;		/* total length of all sequences */
	int *len;		/* lengths of each sequence */
	int *offlen;		/* offset to start of each sequence */
	char *name;		/* name of each sequence */
				/* NOTE(review): this comment used to say "10 char */
				/* name"; SNAMELEN is now 20 -- confirm the stride */
	char *seq;		/* sequence bases */
};

/*------------------------------------------------------------------------*/
/*   aux_seq is a list of blocks ordered left to right within a sequence. */
/*   Each sequence has the same number of blocks, but the blocks may be   */
/*   arranged in different orders in each sequence.                       */
/*   The list holds at most RELEVANT_MOTIFS entries.                      */
/*------------------------------------------------------------------------*/
struct aux_seq {
	int block[RELEVANT_MOTIFS];	/* index of block in each position */
};

/* Scratch record used while sorting integer values. */
struct temp {			/* temporary structure for sorting */
	int value;		/* NOTE(review): presumably the sort key -- confirm */
	int index;		/* NOTE(review): presumably the original position -- confirm */
	int flag;
};
/* Scratch record used while sorting; same idea as struct temp but with a
   double value and no flag member. */
struct dtemp {			/* temporary structure for sorting */
	double value;		/* NOTE(review): presumably the sort key -- confirm */
	int index;		/* NOTE(review): presumably the original position -- confirm */
};

/*-----------------------------------------------------------------------*/
/*    Structure for pairs of sequences.                                  */
/*     pair should be allocated as an array; the numbers of the two      */
/*     sequences forming a pair are not stored but are inferred from     */
/*     the array index.                                                  */
/*     NOTE(review): the index is presumably computed with INDEXCOLROW   */
/*     -- confirm against the callers.                                   */
/*-----------------------------------------------------------------------*/
struct pair {
	int score;		/* # of identities within trimmed block */
	int cluster;		/* cluster # for this pair */
};

/*-----------------------------------------------------------------------*/
/*  block_list is a list of blocks in an order that is consistent among
    all sequences;  it implies a partial multiple alignment.
    Two lists are threaded through the same nodes:
      - next_block links all blocks, some of which may overlap;
      - next_best/prev_best form a doubly linked list over the subset
	consisting of the best non-overlapping blocks.                   */
/*-----------------------------------------------------------------------*/
struct block_list {
	int b;			/* index of block (merged_motif) */
	int minprev;		/* min. distance from previous block */
	int maxprev;		/* max. distance from previous block */
	struct block_list *next_block;  /* all blocks in the list  */
	struct block_list *next_best;   /* best blocks in the list */
	struct block_list *prev_best;   /* backward link of the best-block list */
};

/*------------------------------------------------------------------------*/
/*  path is a list of all possible paths through the blocks in all seqs.
     The first_block list includes all blocks, including those that
     possibly overlap.  The first_best list includes the best sub-path
     of non-overlapping blocks.  Paths are chained through next_path.     */
/*------------------------------------------------------------------------*/
struct path {
	int nblocks;		/* # of blocks in path */
	int nbest;		/* # of blocks in best sub-path */
	int naas;		/* # of AAs in best sub-path */
	unsigned long totscore;	/* sum of scores of blocks in best sub-path */
	int totmotif;		/* # motifs in best sub-path */
	int totident;		/* # conserved residues in best sub-path */
	int nseqs;		/* # seqs in best sub-path */
	int seqs[MAXSEQS];	/* YES if path holds for sequence */
	struct block_list *first_block;  /* first block in path */
	struct block_list *first_best;	 /* first block in best sub-path */

	struct path *next_path;	/* next path in the list of paths */
};

/*----------------------------------------------------------------------*/
/*  Adjacency matrix representation of distances between blocks in
    all sequences.  struct matrix describes one cell of that matrix.    */
/*----------------------------------------------------------------------*/
struct matrix {
	int npos;		/* # of seqs with positive diff in cell */
	int maxdiff;		/* maximum positive difference in cell  */
/*	int dist[MAXSEQS];	 distance from row to col for seq s (member disabled) */
	int mark;		/* all-purpose flag */
};

/* A path together with a RELEVANT_MOTIFS x RELEVANT_MOTIFS table of marks.
   NOTE(review): presumably the working state used while following paths
   between pairs of blocks -- confirm against the code that uses it. */
struct follow_data {
	struct path *path;
	int mark[RELEVANT_MOTIFS][RELEVANT_MOTIFS];
};

/* -------- Scoring matrix structure ----------------------------------*/
/* NOTE: the scores are stored as plain char, whose signedness is
   implementation-defined.  Negative scores are only representable on
   platforms where char is signed; see the platform note in the banner at
   the top of this file about changing the element type. */
struct score {
	char scores[MATSIZE][MATSIZE];	/* an 8-bit signed char holds -128 to +127 */
	int highpass;			/* high pass filter value */
};

/* Lengths recorded when a file name of the form <directory>\<name>.ext
   is split up into its parts. */
struct split_name {
	int dir_len;
	int file_len;
	int name_len;
};
/*------ Structure to hold the contents of a .lis or .lst file ------*/
/* One record per sequence id; records are linked to each other through
   the next and prior pointers. */
struct db_id {
   char entry[SNAMELEN+1];	/* sequence name */
   char full_entry[2*SNAMELEN];	/* enhanced sequence name */
   char ps[2];			/* PS type=T, F or P */
   char info[FNAMELEN];		/* additional text info */
   int len;			/* sequence length */
   int frag;			/* YES if seq is a fragment */
   int lst;			/* seq in .lst file */
   int found;			/* seq found in database */
   int block;			/* seq found in block */
   int search;			/* used by excluded.c => use seq for search */
   int rank;			/* used by matodat.c, fastodat.c */
   int score;			/* used by matodat.c, fastodat.c */
   double pvalue;		/* P-value */
   struct db_id *next;		/* link to another record (presumably the following one) */
   struct db_id *prior;		/* link to another record (presumably the preceding one) */
};

/* Declarations of routines shared by the PROTOMAT programs.
   They are written with empty parameter lists (pre-ANSI style), so the
   compiler does not check the arguments at call sites; under C23 an empty
   list means "takes no parameters".
   NOTE(review): the definitions are not visible from this header --
   replace these with full prototypes once the signatures are confirmed. */

/* Routines returning or working with struct db_id records. */
struct db_id *makedbid();
struct db_id *check_entry();
int get_ids();

/* Returns a pointer to a struct split_name. */
struct split_name *split_names();

char *dir_unix();
int kr_atoi();
void kr_itoa();

void getscore();

/* NOTE(review): judging by the names, conversions between numeric codes
   and amino acid characters -- confirm against the definitions. */
char *num_to_aachar();
int aachar_to_num();
void pr_num_to_aa();
void pr_num_to_aa_space();