File: randseqaccess.hh

package info (click to toggle)
augustus 3.2.3%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 289,676 kB
  • sloc: cpp: 48,711; perl: 13,339; ansic: 1,251; makefile: 859; sh: 58
file content (169 lines) | stat: -rw-r--r-- 6,326 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
/**********************************************************************
 * file:    randseqaccess.hh
 * licence: Artistic Licence, see file LICENCE.TXT or 
 *          http://www.opensource.org/licenses/artistic-license.php
 * descr.:  random access to sequence data, e.g. get me chr1:1000-2000 from species 'human'
 * authors: Mario Stanke, Stefanie Koenig, yuquilin
 *
 *********************************************************************/

#ifndef _RANDSEQACCESS
#define _RANDSEQACCESS

// project includes
#include "gene.hh"
#include "types.hh"
#include "extrinsicinfo.hh"

#include <map>
#include <vector>
#include <cstring>

#ifdef AMYSQL
#include <mysql++.h>
#endif

#ifdef SQLITE
#include "sqliteDB.hh"
#endif

/*
 * SpeciesCollection holds all extrinsic evidence given for the species.
 * It consists of a set of group specific FeatureCollections and
 * a default FeatureCollection. Species for which no extrinsic evidence
 * is given use the default collection (identical to ab initio
 * gene prediction, no bonus/malus).
 * Subsets of the species with the same extrinsic config, i.e. same feature table
 * in the extrinsicCfgFile, share one group specific FeatureCollection.
 */
class SpeciesCollection{
public:
    // FeatureCollection to be used for the given species
    FeatureCollection* getFeatureCollection(string speciesname);
    // group number of the given species
    int getGroupID(string speciesname);
    // NOTE(review): presumably records that species 'skey' belongs to group 'groupID' -- defined outside this header
    void addSpeciesToGroup(string skey, int groupID);
    // a species counts as having evidence exactly when its group number is positive
    bool withEvidence(string speciesname){
        const int groupID = getGroupID(speciesname);
        return groupID > 0;
    }
    // input: the hints file (GFF) and the extrinsicCfgFile
    void readGFFFile(const char* filename);
    void readExtrinsicCFGFile(vector<string> &speciesNames);
private:
    map<int,FeatureCollection> speciesColl; // group number -> FeatureCollection of that group
    map<string,int> groupIDs;               // species name -> group number
    FeatureCollection defaultColl;          // default FeatureCollection
    static int groupCount;                  // number of groups
};

/*
 * abstract class for quick access to an arbitrary sequence segment in genomes
 * needed for comparative gene finding
 */
class RandSeqAccess {
public:
    // number of species; 0 as long as nothing else has been stored in numSpecies
    int getNumSpecies() {return numSpecies;}
    // NOTE(review): setLength/getChrLen presumably store/look up the length of
    // chromosome 'chrName' of the species with index 'idx' in chrLen -- confirm
    // against the definitions, which are not part of this header
    void setLength(int idx, string chrName, int len);
    int getChrLen(int idx, string chrName);
    void setSpeciesNames(vector<string> speciesNames);
    // name of the species with index idx; idx is not range-checked
    string getSname(size_t idx) {return speciesNames[idx];}
    int getMaxSnameLen(); // for neat indentation into right column
    int getIdx(string speciesname);
    void printStats();
    // delegates to the SpeciesCollection that holds the hints
    bool withEvidence(string speciesname) {return extrinsicFeatures.withEvidence(speciesname);}
    // to be implemented by the subclasses: the sequence segment chrName:start-end of a species
    virtual AnnoSequence* getSeq(string speciesname, string chrName, int start, int end, Strand strand) =  0;
    // convenience overload: the species is given by its index instead of its name
    AnnoSequence* getSeq(size_t speciesIdx, string chrName, int start, int end, Strand strand) {
	return getSeq(getSname(speciesIdx), chrName, start, end, strand);
    }
    // to be implemented by the subclasses: the features for the segment chrName:start-end of a species
    virtual SequenceFeatureCollection* getFeatures(string speciesname, string chrName, int start, int end, Strand strand) = 0;
    virtual ~RandSeqAccess() {}
protected:
    // only subclasses can be constructed; numSpecies starts at 0 so that
    // getNumSpecies() never returns an indeterminate value
    RandSeqAccess() : numSpecies(0) {}
    int numSpecies;
    vector<map<string,int> > chrLen; // NOTE(review): presumably one chrName -> length map per species index
    vector<string> speciesNames;
    map<string, size_t> speciesIndex; // to quickly access the index for a given species name
    SpeciesCollection extrinsicFeatures; // all hints
};

/*
 * Achieve random access by simply storing all genomes in memory and then retrieving the required
 * substrings when desired. This may need a lot of RAM.
 */
class MemSeqAccess : public RandSeqAccess {
public:
    MemSeqAccess(vector<string> s);
    ~MemSeqAccess(){} // TODO: delete DNA sequences from 'sequences' map
    AnnoSequence* getSeq(string speciesname, string chrName, int start, int end, Strand strand);
    SequenceFeatureCollection* getFeatures(string speciesname, string chrName, int start, int end, Strand strand);
    void open(){}
private:
    map<string,string> filenames;
    map<string,char*> sequences;  //keys: speciesname:chrName values: dna sequence
};


/*
 * Read a list file 'listfile' with lines of the format
 * human        <TAB> /dir/to/genome/genome.fa
 * Mus musculus <TAB> /dir/to/genome/mouse.fa
 * into a map.
 * NOTE(review): presumably the species name is the key and the path of the
 * genome file the value -- confirm against the definition, which is not part
 * of this header.
 */
map<string,string> getFileNames (string listfile);


/*
 * Random access to sequence segments through a database.
 * The sequences must be stored in a database.
 */
class DbSeqAccess : public RandSeqAccess {
public:
    virtual AnnoSequence* getSeq(string speciesname, string chrName, int start, int end, Strand strand)=0;
    virtual SequenceFeatureCollection* getFeatures(string speciesname, string chrName, int start, int end, Strand strand)=0;  
    virtual ~DbSeqAccess() {}

protected:
    DbSeqAccess(vector<string> s = vector<string>());
    string dbaccess;

};

#ifdef AMYSQL
class MysqlAccess : public DbSeqAccess {
public:
    MysqlAccess(vector<string> s = vector<string>()) : DbSeqAccess(s){
	open();
    }
    ~MysqlAccess() {}
    AnnoSequence* getSeq(string speciesname, string chrName, int start, int end, Strand strand);
    // the following function is for the BGI-style database
    AnnoSequence* getSeq2(string speciesname, string chrName, int start, int end, Strand strand);
    SequenceFeatureCollection* getFeatures(string speciesname, string chrName, int start, int end, Strand strand);  
    void open();
    int split_dbaccess();
    void connect_db(ostream& out=cout);
    template<class T>  
    AnnoSequence* getNextDBSequence(string charName, int start, int end, vector<T>& asm_query_region);
    // template<class T>
    // AnnoSequence* getDBSequenceList(string charName,int start,int end,vector<T>& asm_query_region);
    template<class T>
    int get_region_coord(int seq_region_id, int start, int end, vector<T>& asm_query_region);

private:
    mysqlpp::Connection con;
    vector<string> db_information;
};
#endif // AMYSQL

#ifdef SQLITE
class SQLiteAccess : public DbSeqAccess {
public:
    SQLiteAccess(const char* f, vector<string> s = vector<string>()) : DbSeqAccess(s), db(f) {
	filenames = getFileNames (Constant::speciesfilenames);
    }
    ~SQLiteAccess() {}
    AnnoSequence* getSeq(string speciesname, string chrName, int start, int end, Strand strand);
    SequenceFeatureCollection* getFeatures(string speciesname, string chrName, int start, int end, Strand strand);
private:
    SQLiteDB db;
    map<string,string> filenames;
};
#endif // SQLITE

#endif  // _RANDSEQACCESS