File: blxGff3Parser.hpp

package info (click to toggle)
seqtools 4.44.1%2Bdfsg-6
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 28,492 kB
  • sloc: cpp: 53,636; sh: 12,232; makefile: 387
file content (147 lines) | stat: -rw-r--r-- 6,695 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
/*  File: blxGff3parser.h
 *  Author: Gemma Barson
 *  Copyright (c) 2010 - 2012 Genome Research Ltd
 * ---------------------------------------------------------------------------
 * SeqTools is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 3
 * of the License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 * or see the on-line version at http://www.gnu.org/copyleft/gpl.txt
 * ---------------------------------------------------------------------------
 * This file is part of the SeqTools sequence analysis package, 
 * written by
 *      Gemma Barson      (Sanger Institute, UK)  <gb10@sanger.ac.uk>
 * 
 * based on original code by
 *      Erik Sonnhammer   (SBC, Sweden)           <Erik.Sonnhammer@sbc.su.se>
 * 
 * and utilizing code taken from the AceDB and ZMap packages, written by
 *      Richard Durbin    (Sanger Institute, UK)  <rd@sanger.ac.uk>
 *      Jean Thierry-Mieg (CRBM du CNRS, France)  <mieg@kaa.crbm.cnrs-mop.fr>
 *      Ed Griffiths      (Sanger Institute, UK)  <edgrif@sanger.ac.uk>
 *      Roy Storey        (Sanger Institute, UK)  <rds@sanger.ac.uk>
 *      Malcolm Hinsley   (Sanger Institute, UK)  <mh17@sanger.ac.uk>
 *
 * Description: GFF v3 file format parser.
 *----------------------------------------------------------------------------
 */

#ifndef BLX_GFF_P_H
#define BLX_GFF_P_H

#include <gtk/gtk.h>
#include <seqtoolsUtils/blxmsp.hpp>

/* We follow glib convention in error domain naming:
 *          "The error domain is called <NAMESPACE>_<MODULE>_ERROR" */
#define BLX_GFF_ERROR "BLX_GFF_ERROR"


/* This struct contains info about a GFF type: its name and SO id, and the blixem
 * type that it corresponds to */
typedef struct _BlxGffType
  {
    char *name;         /* the type name as given in the GFF file, e.g. nucleotide_match */
    char *soId;         /* the type's SO Id, as given in the GFF file, e.g. SO:0000347 */
    BlxMspType blxType; /* the Blixem object type that the GFF type corresponds to */
  } BlxGffType;



/* This enum is to record the type of data currently being parsed by the parser. An input file can 
 * contain multiple types of data. The start of a new section of data is indicated by a header 
 * line beginning with a hash and a known type name, e.g. "# exblx_x" or "##gff-version   3"
 *
 * FS and SFS (Feature Series) data type names have "FS" or "SFS" followed by a specific format 
 * type, e.g. "# SFS type=SEG". 
 *
 * For some data types, additional data is included in the header line(s) as well as in the 'body' 
 * section below it. For these, there are two data types in the enum, one postfixed with _HEADER 
 * and one with _BODY. When the header line is detected the initial type is set to the _HEADER enum, 
 * and when we are finished processing the header information it is set to _BODY, so that we go on
 * to process the body of the data on the next loop through. For types with no information in the 
 * header, there is only a _BODY enum.
 */
typedef enum
  {
    PARSER_START,                  /* indicates that we haven't started processing yet */
    PARSER_ERROR,                  /* indiates an error state */
    
    GFF_3_HEADER,                  /* GFF 3 header */
    GFF_3_BODY,                    /* GFF 3 data */
    
    FASTA_SEQ_HEADER,              /* FASTA sequence header */
    FASTA_SEQ_BODY,                /* Sequence data in FASTA format */
    FASTA_SEQ_IGNORE,              /* A FASTA sequence we're not interested in */
    
    EXBLX_BODY,                    /* Old style sequence entries. */
    SEQBL_BODY,                    /* Old style sequence entries. */
    EXBLX_X_BODY,                  /* New style sequence entries with gaps and match strand. (_X stands for eXtended.) */ 
    SEQBL_X_BODY,                  /* New style sequence entries with gaps and match strand. (_X stands for eXtended.) */
    
    FS_HSP_BODY,                   /* feature-series HSP data */
    
    FS_GSP_HEADER,                 /* feature-series GSP data header */
    FS_GSP_BODY,                   /* feature-series GSP data */
    
    FS_GFF_BODY,                   /* feature-series GFF data */
    
    FS_SEG_BODY,                   /* feature-series segment data */
    
    FS_XY_HEADER,                  /* feature-series XY data header */
    FS_XY_BODY,                    /* feature-series XY data */
    
    FS_SEQ_HEADER,                 /* feature-series sequence data header */
    FS_SEQ_BODY                    /* feature-series sequence data */
  } BlxParserState ;



/* External functions */
void parseGff3Header(const int lineNum,
                     MSP **lastMsp, 
                     MSP **mspList, 
                     BlxParserState *parserState, 
                     GString *line_string, 
                     GList **seqList,
                     char *refSeqName,
                     IntRange *refSeqRange,
                     GError **error);

void parseGff3Body(const int lineNum,
                   GArray* featureLists[],
                   MSP **lastMsp, 
		   MSP **mspList, 
		   BlxParserState *parserState, 
		   GString *line_string, 
		   GList **seqList,
                   GList *columnList,
                   GSList *supportedTypes,
                   GSList *styles,
                   const int resFactor,
                   GKeyFile *keyFile,
                   const IntRange* const refSeqRange,
                   GHashTable *lookupTable,
                   GHashTable *fetchMethods);

void parseFastaSeqHeader(char *line, const int lineNum,
                         char **refSeq, char *refSeqName, IntRange *refSeqRange,
                         char ***readSeq, int *readSeqLen, int *readSeqMaxLen,
                         BlxParserState *parserState);
                         
                         
GSList*                            blxCreateSupportedGffTypeList(const BlxSeqType seqType);
void                               blxDestroyGffTypeList(GSList **supportedTypes);
BlxDataType*                       getBlxDataType(GQuark dataType, const char *source, GKeyFile *keyFile, GError **error);


#endif /* BLX_GFF_P_H */