1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169
|
/**********************************************************************
* file: randseqaccess.hh
* licence: Artistic Licence, see file LICENCE.TXT or
* http://www.opensource.org/licenses/artistic-license.php
* descr.: random access to sequence data, e.g. get me chr1:1000-2000 from species 'human'
* authors: Mario Stanke, Stefanie Koenig, yuquilin
*
*********************************************************************/
#ifndef _RANDSEQACCESS
#define _RANDSEQACCESS
// project includes
#include "gene.hh"
#include "types.hh"
#include "extrinsicinfo.hh"
#include <map>
#include <vector>
#include <cstring>
#ifdef AMYSQL
#include <mysql++.h>
#endif
#ifdef SQLITE
#include "sqliteDB.hh"
#endif
/*
* SpeciesCollection holds all extrinsic evidence given for the species.
* It consists of a set of group specific FeatureCollections and
* a default FeatureCollection. Species for which no extrinsic evidence
* is given make use of the default collection (identical to ab initio
* gene prediction, no bonus/malus).
* Subsets of the species with the same extrinsic config, i.e. same feature table
* in the extrinsicCfgFile, share one group specific FeatureCollection.
*/
class SpeciesCollection {
public:
    // Look up the FeatureCollection to use for the given species.
    // NOTE(review): presumably the group-specific collection, falling back to
    // the default collection for species without evidence -- confirm in the
    // implementation file.
    FeatureCollection* getFeatureCollection(string speciesname);

    // Group number of the given species (see 'groupIDs').
    // NOTE(review): the value returned for an unknown species is not visible
    // from this header -- confirm in the implementation file.
    int getGroupID(string speciesname);

    // NOTE(review): presumably records that species 'skey' belongs to group
    // 'groupID' -- confirm in the implementation file.
    void addSpeciesToGroup(string skey, int groupID);

    // A species counts as having extrinsic evidence exactly when its group
    // number is strictly positive.
    bool withEvidence(string speciesname) {
        const int groupID = getGroupID(speciesname);
        return groupID > 0;
    }

    // reading in the extrinsicCfgFile and hintsFile
    void readGFFFile(const char* filename);
    void readExtrinsicCFGFile(vector<string> &speciesNames);

private:
    map<int, FeatureCollection> speciesColl; // maps the group number to a FeatureCollection
    map<string, int> groupIDs;               // maps the species name to the group number
    FeatureCollection defaultColl;           // default FeatureCollection
    static int groupCount;                   // number of groups
};
/*
* abstract class for quick access to an arbitrary sequence segment in genomes
* needed for comparative gene finding
*/
class RandSeqAccess {
public:
    // Number of species. Reads 'numSpecies', which starts at 0 (see the
    // constructor below) until it is assigned elsewhere.
    int getNumSpecies() {return numSpecies;}

    // Store / query the length of chromosome 'chrName' for the species with index 'idx'.
    // NOTE(review): presumably backed by 'chrLen' -- confirm in the implementation file.
    void setLength(int idx, string chrName, int len);
    int getChrLen(int idx, string chrName);

    void setSpeciesNames(vector<string> speciesNames);

    // Name of the species with index 'idx'. Uses unchecked vector indexing, so
    // the caller must guarantee idx < speciesNames.size().
    string getSname(size_t idx) {return speciesNames[idx];}

    int getMaxSnameLen(); // for neat indentation into right column
    int getIdx(string speciesname);
    void printStats();

    // True if extrinsic evidence (hints) is available for the species;
    // delegates to the SpeciesCollection member.
    bool withEvidence(string speciesname) {return extrinsicFeatures.withEvidence(speciesname);}

    // Retrieve the segment chrName:start-end on the given strand for a species.
    // Implemented by the concrete access classes.
    virtual AnnoSequence* getSeq(string speciesname, string chrName, int start, int end, Strand strand) = 0;

    // Convenience overload: translates the species index to its name and
    // forwards to the virtual getSeq above.
    // Note: a derived class that declares its own getSeq hides this overload
    // unless it adds a using-declaration; call it through a RandSeqAccess
    // pointer or reference.
    AnnoSequence* getSeq(size_t speciesIdx, string chrName, int start, int end, Strand strand) {
        return getSeq(getSname(speciesIdx), chrName, start, end, strand);
    }

    virtual SequenceFeatureCollection* getFeatures(string speciesname, string chrName, int start, int end, Strand strand) = 0;
    virtual ~RandSeqAccess() {}

protected:
    // Only derived classes construct the base. 'numSpecies' is initialised
    // explicitly: a plain int member is otherwise left with an indeterminate
    // value, and getNumSpecies() would read it if called before it is assigned.
    RandSeqAccess() : numSpecies(0) {}

    int numSpecies;
    vector<map<string,int> > chrLen;
    vector<string> speciesNames;
    map<string, size_t> speciesIndex; // to quickly access the index for a given species name
    SpeciesCollection extrinsicFeatures; // all hints
};
/*
* Achieve random access by simply storing all genomes in memory and then retrieving the required
* substrings when desired. This may need a lot of RAM.
*/
class MemSeqAccess : public RandSeqAccess {
public:
    // NOTE(review): 's' is presumably the list of species names -- confirm in
    // the implementation file.
    MemSeqAccess(vector<string> s);

    // TODO: delete DNA sequences from 'sequences' map
    virtual ~MemSeqAccess() {}

    // Implementations of the RandSeqAccess interface.
    virtual AnnoSequence* getSeq(string speciesname, string chrName, int start, int end, Strand strand);
    virtual SequenceFeatureCollection* getFeatures(string speciesname, string chrName, int start, int end, Strand strand);

    // Deliberately a no-op.
    void open() {}

private:
    map<string, string> filenames;
    map<string, char*> sequences; // keys: speciesname:chrName values: dna sequence
};
/*
* read an input file of format:
* human <TAB> /dir/to/genome/genome.fa
* Mus musculus <TAB> /dir/to/genome/mouse.fa
* to a map
*/
// NOTE(review): presumably keyed by species name with the genome file path as value -- confirm.
map<string, string> getFileNames(string listfile);
/*
* Random access to sequence segments through a database.
* The sequences must be stored in a database.
*/
class DbSeqAccess : public RandSeqAccess {
public:
    // Both accessors stay pure virtual at this level; the concrete database
    // classes provide them.
    virtual AnnoSequence* getSeq(string speciesname, string chrName, int start, int end, Strand strand) = 0;
    virtual SequenceFeatureCollection* getFeatures(string speciesname, string chrName, int start, int end, Strand strand) = 0;

    virtual ~DbSeqAccess() {}

protected:
    // Protected, so only derived classes can construct a DbSeqAccess.
    // 's' defaults to an empty vector.
    DbSeqAccess(vector<string> s = vector<string>());

    // NOTE(review): presumably the database access/connection setting -- confirm
    // in the implementation file.
    string dbaccess;
};
#ifdef AMYSQL
/*
 * Database access through a MySQL connection (mysql++).
 * Only compiled when AMYSQL is defined.
 */
class MysqlAccess : public DbSeqAccess {
public:
    // Passes 's' on to DbSeqAccess and then calls open().
    MysqlAccess(vector<string> s = vector<string>())
        : DbSeqAccess(s)
    {
        open();
    }

    virtual ~MysqlAccess() {}

    virtual AnnoSequence* getSeq(string speciesname, string chrName, int start, int end, Strand strand);

    // the following function is for the BGI-style database
    AnnoSequence* getSeq2(string speciesname, string chrName, int start, int end, Strand strand);

    virtual SequenceFeatureCollection* getFeatures(string speciesname, string chrName, int start, int end, Strand strand);

    void open();
    int split_dbaccess();

    // 'out' defaults to cout.
    void connect_db(ostream& out = cout);

    template<class T>
    AnnoSequence* getNextDBSequence(string charName, int start, int end, vector<T>& asm_query_region);

    // template<class T>
    // AnnoSequence* getDBSequenceList(string charName,int start,int end,vector<T>& asm_query_region);

    template<class T>
    int get_region_coord(int seq_region_id, int start, int end, vector<T>& asm_query_region);

private:
    mysqlpp::Connection con;
    vector<string> db_information;
};
#endif // AMYSQL
#ifdef SQLITE
/*
 * Database access through an SQLiteDB object.
 * Only compiled when SQLITE is defined.
 */
class SQLiteAccess : public DbSeqAccess {
public:
    // 'f' is handed to the SQLiteDB member and 's' to DbSeqAccess; afterwards
    // the file name map is filled from the list file named by
    // Constant::speciesfilenames.
    SQLiteAccess(const char* f, vector<string> s = vector<string>())
        : DbSeqAccess(s), db(f)
    {
        filenames = getFileNames(Constant::speciesfilenames);
    }

    virtual ~SQLiteAccess() {}

    virtual AnnoSequence* getSeq(string speciesname, string chrName, int start, int end, Strand strand);
    virtual SequenceFeatureCollection* getFeatures(string speciesname, string chrName, int start, int end, Strand strand);

private:
    SQLiteDB db;
    map<string, string> filenames;
};
#endif // SQLITE
#endif // _RANDSEQACCESS
|