File: parse_ft.h

package info (click to toggle)
staden 2.0.0%2Bb11-4
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, bullseye, buster
  • size: 21,556 kB
  • sloc: ansic: 240,603; tcl: 65,360; cpp: 12,854; makefile: 11,201; sh: 2,952; fortran: 2,033; perl: 63; awk: 46
file content (137 lines) | stat: -rw-r--r-- 5,085 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#ifndef _PARSE_FT_H
#define _PARSE_FT_H

#include <tcl.h>

/*
 * Parses the EMBL FT lines.
 * A feature table consists of a type, a location line, and qualifiers.
 * For example:
 *
 * FT   CDS             join(complement(115903..116071),complement(95491..95621),
 * FT                   complement(93634..93714),complement(92556..92682),
 * FT                   complement(87329..87423),complement(82749..82887),
 * FT                   complement(79690..79802))
 * FT                   /db_xref="SPTREMBL:O43728"
 * FT                   /evidence=NOT_EXPERIMENTAL
 * FT                   /note="match: proteins: Tr:O43728 Tr:O88872 Sw:P52840
 * FT                   Tr:O75897 Sw:P50224 Sw:P50225 Sw:P17988 Sw:P50226 Sw:P52846
 * FT                   Tr:O43704 Tr:O46503 Tr:O00338 Tr:O70262 Tr:O35403 Tr:O46640
 * FT                   Sw:P19217 Sw:P50237"
 * FT                   /gene="dJ388M5.3"
 * FT                   /product="dJ388M5.3 (novel Sulfotransferase (sulfokinase,
 * FT                   EC 2.8.2.1) like protein)"
 * FT                   /protein_id="CAB09788.1"
 * FT                   /translation="MAESEAETPSTPGEFESKYFEFHGVRLPPFCRGKMEEIANFPVRP
 * FT                   SDVWIVTYPKSGTSLLQEVVYLVSQGADPDEIGLMNIDEQLPVLEYPQPGLDIIKELTS
 * FT                   PRLIKSHLPYRFLPSDLHNGDSKVIYMARNPKDLVVSYYQFHRSLRTMSYRGTFQEFCR
 * FT                   RFMNDKLGYGSWFEHVQEFWEHRMDSNVLFLKYEDMHRDLVTMVEQLARFLGVSCDKAQ
 * FT                   LEALTEHCHQLVDQCCNAEALPVGRGRVGLWKDIFTVSMNEKFDLVYKQKMGKCDLTFD
 * FT                   FYL"
 *
 * Ranges consist of one or more range joined together.
 * For example join(55798..55943,56245..57257). Each range has a left and
 * a right location. Each location is typically expressed exactly, but may
 * itself be a range if the exact location is unknown. Eg: <10, 10^20, 10.20.
 * So a single range may consist of "<100..(150.160)".
 *
 * Qualifiers are stored as /qual, /qual=value or /qual="value".
 * /qual may occur more than once in a single FT entry, as it typical with
 * /dbxref qualifiers.
 *
 * The main data structure for holding this is ft_entry.
 * Within this the type holds "CDS". The location and qualifier fields hold
 * textual information while range and qual_hash contain processed data
 * structures.
 */

/* ---------------------------------------------------------------------- */
/* Data types */


/*
 * A single location type:
 * exact:			"10"
 * single base in a range:	"10.20"
 * site within a range:		"10^20"
 */
typedef enum {
    EXACT,
    BASE,
    SITE
} ft_location_type;

/*
 * A location (half of a range).
 * Locations are either exact or ranges. They may also have qualifiers to
 * indicate whether they are before or after a certain point (eg "<10").
 */
typedef struct ft_location_t {
    int min, max; 		/* minimum and maximum values in "10.20" or
				   "10^20" type */
    int min_lt;			/* -1 = <, 1 = >, 0 = none */
    int max_lt;			/* -1 = <, 1 = >, 0 = none */
    ft_location_type type;	/* Distinguishes "10", "10.20" or "10^20" */
} ft_location;

/*
 * A range: consists of two locations separated by "..".
 * Eg "10..20" or "(<10.20)..(30.40)".
 */
typedef struct ft_range_t {
    /*
     * TODO: Add order(), join() and complement() information. This
     * may require changing this to a hierarchial structure.
     */
    ft_location *left;		/* Two locations for loc..loc. eg
				   "10..(30.40)" => 10 && (30.40) */
    ft_location *right;		/* Right is 2nd location if present or 
				   NULL if not. */
    int complemented;		/* 0 == no, 1 == yes */
    struct ft_range_t *next;	/* Linked list; to next range */
} ft_range;

/*
 * A single feature entry: consists of a feature type, a linked list of
 * ranges (feature locations), and zero or more qualifiers.
 */
typedef struct ft_t {
    char type[20];		/* feature type in string form */
    char *location;		/* textual version of the location string */
    ft_range *range;		/* A linked list of ranges */
    char *qualifiers;		/* Qualifiers as a single block of
				   text. TODO: split this into a
				   suitable data structure. */
    int qual_hash_init;		/* 1 if qual_hash is in use */
    Tcl_HashTable qual_hash;	/* Qualifiers as a Tcl hash table */
} ft_entry;

/*
 * A value item in the qualifier hash table. These are a linked list as the
 * qualifiers may be duplicated.
 */
struct ft_value_element_t;
typedef struct ft_value_element_t {
    char *value;
    struct ft_value_element_t *next;
} ft_value_element;

/* ---------------------------------------------------------------------- */
/* Function prototypes */

ft_location *new_ft_location(void);
void del_ft_location(ft_location *l);
ft_range *new_ft_range(void);
void del_ft_range(ft_range *r);
ft_entry *new_ft_entry(void);
void del_ft_entry(ft_entry *e);
ft_entry *parse_ft_entry(char *str);
char *get_ft_qualifier(char *str, char *qualifier, int *value_len);

void init_ft_qual_hash(ft_entry *e, char *str);
void del_ft_qual_hash(ft_entry *e);
ft_value_element *search_ft_qual_hash(ft_entry *e, char *qual);

void print_entry(ft_entry *e);

#endif /* _PARSE_FT_H */