File: sequences.h

package info (click to toggle)
sift 6.2.1-2
  • links: PTS, VCS
  • area: non-free
  • in suites: sid
  • size: 4,784 kB
  • sloc: ansic: 18,272; perl: 219; csh: 164; makefile: 152
file content (280 lines) | stat: -rwxr-xr-x 9,657 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
/* (C) Copyright 1993, Fred Hutchinson Cancer Research Center */
/* Use, modification or distribution of these programs is subject to */
/* the terms of the non-commercial licensing agreement in license.h. */

/* sequences.h: sequence manipulation functions */
/* Written by: Bill Alford */
/* Change log information is at the end of the file. */

#ifndef SEQUENCES_H_
#define SEQUENCES_H_

#include "strutil.h"

/*
 * Exported variables and data structures
 */
#define AA_SEQ 0	/* Amino acid sequence designator */
#define NA_SEQ 1	/* Nucleic acid sequence designator */
#define UNKNOWN_SEQ 2	/* Unknown sequence designator */

#define NA_FREQ 0.85	/* the percent of 'A', 'C', 'G'. & 'T' in */
			/* the sequence to consider it a NA_SEQ */

/* the Residue type */
typedef unsigned char Residue;

struct sequence_struct {
  char name[SMALL_BUFF_LENGTH];	   /* the name of the sequence */
  char info[SMALL_BUFF_LENGTH];	   /* the info line of the sequence */
  int position;			   /* the position of the subsequence (if */
				   /* is one) in the sequence */
  int length;			   /* the length of the sequence */
  int max_length;		   /* the maximum length of memory allocated */
  int type;			   /* type of sequence: AA_SEQ or NA_SEQ */
  double weight;		   /* the weight of the sequence if the */
				   /* sequence is in a block */
  double undefined;		   /* for future use */
  double undefined_dbl;		   /* for future use */
  void *undefined_ptr;		   /* for future use */
  Residue *sequence;		   /* the sequence of either protein or DNA */
};
typedef struct sequence_struct Sequence;




/* Sequence Database types */
/*--------Sequence database format structure -------------------------*/
#define MAXDB   10		/* Max. # database formats */
#define GB       0		/* GenBank type */
#define PIR      1		/* PIR type */
#define EMBL     2		/* EMBL type.  SwissProt is the same. */
#define GCG      3		/* plain GCG type */
#define GCG_GB   4		/* GCG with GenBank header */
#define GCG_PIR  5		/* GCG with PIR header */
#define GCG_EMBL 6		/* GCG with EMBL/SwissProt header */
#define FASTA    7		/* FASTA type */
#define UNI      8  		/* UNIVERSAL type */
#define FLAT     9		/* FLAT, no title, just sequence */
/*#define VMS 3*/ 		/* PIR/VMS type */
/* NOTE: VMS must be the the last type and UNI must be just before it.  See */
/*       the code in type_dbs for the reason */
/* NOTE: Fasta and Universa (and VMS) formats are very similar.  The */
/*       only difference between Fasta and Universa is that there is a */
/*       '*' at the end of the Universa format.  The reason for the */
/*       Universa format is that there was a time when patmat needed */
/*       the '*' end-of-sequence delimiter.  There is no longer a */
/*       need with blimps, but the universa format handling is being */
/*       kept around for backward compatibility. */
/* NOTE: The GCG formats consists of a header file of the format of the */
/*       originating database and the sequence protion in the GCG */
/*       format.  This obviously can lead to confusion, especially */
/*       since the characters in the second line of the GCG sequence */
/*       format will signal that the sequence is an amino acid */
/*       sequence (bad when the GCG sequence is from a DNA database). */
/* NOTE: The plain GCG format is a kludge.  There is code to handle */
/*       it, but since it is one of the formats that does not have an */
/*       end delimiter there is the possibility that there will be */
/*       junk added to the sequence if there is more than one */
/*       sequence in the file. */

struct db_info {		/* Sequence Database format info */
   char *type;
   char *start;
   char *desc;
   char *seq;
   char *end;
   int title_offset;
   int seq_offset;
};

extern struct db_info DbInfo[MAXDB];



/*
 * Exported functions
 */

/*
 * read_a_sequence
 *   reads a sequence from the data base and returns a pointer to the new
 *   Sequence data structure
 *   Parameters:
 *     FILE *sfp: the sequence file pointer
 *     int db:    the database type of the file
 *     int type:  the type of sequences in the file
 *   Error codes: Returns NULL if there was no sequence to read.
 */

extern Sequence *read_a_sequence();

/*
 * read_sequence
 *   read_sequence reads from the string the sequence of the given type into
 *   the passed in Sequence data structure
 *   Parameters:
 *     Sequence *seq: the Sequence data structure to put the data in
 *     int seq_type:  the type of sequence (amino acid or nucleic acid)
 *     int start_pos: the position in the sequence to start entering the data
 *     char *str:     the string that has the sequence (NULL terminated)
 *   Note: The entire string up to the NULL character is considered the
 *         sequence.
 *   Preconditions: seq.length must have a the value of the size allocated to
 *                  the residues
 *   Postconditions: the residues from start_pos to either seq.length or
 *                   return value + start_pos in the seq.sequence have been
 *                   updated to the residues passed in by the string
 *                   seq.type is set to seq_type
 *   Return codes: returns the number of residues added to the sequence
 *   Error codes: returns the negative value of the number of residues left
 *                over after the sequence has been filled.
 *   Note: A return value of zero is means that no residues were
 *         added to the sequence.
 */

extern int read_sequence();

/*
 * sequence_type
 *   tries to guess the type of the sequence passed in the string.
 *   Note: if the sequence can not be matched to an NA_SEQ, it falls through
 *         to being a AA_SEQ.
 *   Parameters:
 *     char *str: the sequence string, no more than SMALL_BUFF_LENGTH long
 *   Return Codes: Returns either AA_SEQ or NA_SEQ depending on
 *                 if the sequence is an Amino Acid or Nucleic Acid.
 *   Error Codes: none
 */

extern int sequence_type();


/*
 * sequence_comparison
 *   Compares two sequences.   It compares by the value in the name
 *   field if it exists.
 *   Parameters:
 *     SequenceListEntry a, b: the entries to compare
 *   Return codes:  a return value < 0 if a < b, a return value = 0 if a == b,
 *                  and a return value > 0 if a > b
 *   Error codes: none
 */

extern int sequence_comparison();

/*
 * free_sequence
 *   Deletes the sequence and the sub elements.
 *   Parameters:
 *     Sequence *sequence: the sequence to free
 *   Return code: none
 *   Error code: none
 */

extern void free_sequence();



/*
 * translate_sequence
 *   Given a frame (1, 2, 3, -1, -2, or -3), translates the DNA sequence
 *   (NA_SEQ) into an amino acid sequence.
 *   Parameters:
 *     Sequence *seq: the DNA sequence
 *     int frame:     the frame to read
 *     unsigned char gcode[64]:    the genetic code (forward).
 *                                   for frames 1, 2, 3
 *     unsigned char revgcode[64]: the genetic code (reverse complement)
 *                                   for frames -1, -2, -3
 *   Return code: a pointer to the new translated sequence
 *   Error codes: returns the original sequence if the sequence is not a
 *                NA_SEQ or the translation frame is incorrect.
 */

extern Sequence *translate_sequence();

/*  untranslate an amino acid sequence to a degenerate DNA sequence  */
extern Sequence *untranslate_sequence();


/* NOTE: These should only be used in config.c when determining the */
/*       database types.  These should be used to set the values in the */
/*       file lists. */

/*
 * type_dbs
 *   Figures out the type of the database to be read.
 *   This is from motomisc.c used in the universa conversion program.
 *   Parameters:
 *     FILE *fin:            the database file
 *     struct db_info dbs[]: the information of the structure of the db's
 *   Return codes: the type of database the file is.
 *   Error code: the database number -1 is returned if the type of database
 *               cannot be figured out
 */
extern int type_dbs();


/*
 * seq_type_dbs
 *   Finds the type or the database given.
 *   Parameters:
 *     FILE *fin:            the database file
 *     struct db_info dbs[]: the information of the structure of the db's
 *     int db:               the number of this database (indexes into dbs)
 *   Return codes: Returns the type of sequence of the first entry in the
 *                 database.  This assumes that the entire database is of this
 *                 type.
 *   Error codes: none
 */

extern int seq_type_dbs();

/*
 * print_sequence
 *   Prints a Sequence data structure.  Primarily for debugging purposes.
 *   Parameters:
 *     Sequence *seq:  the sequence to print
 *   Error Codes: none
 */

extern void print_sequence();

/*
 * output_sequence
 *   Outputs a Sequence data structure to the given file.
 *   Parameters:
 *     Sequence *seq:  the sequence to print
 *     FILE *osfp:     the output sequence file pointer
 *   Error Codes: none
 */

extern void output_sequence();


/*
 * resize_sequence
 *   Adds more memory for the storage of the residues of a sequence.
 *   Parameter:
 *     Sequence *seq: the sequence to resize
 *   Error codes: none
 */

extern void resize_sequence();

#endif /*  SEQUENCES_H_ */

/* Change log information follows. */
/*
   Changes since version 3.3.2:
   12/15/99 Added double undefined_dbl field.
   Changes since version 3.2:
   11/24/97 Made resize_sequence() external.
   Changes since version 3.1:
   1/20/97  Added untranslate_sequence()
   Changes since version 2.x:
   3/20/96  Lowered NA_FREQ from 0.90 to 0.85 after user complaints. JGH
 *
 */