1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177
|
///////////////////////////////////////////////////////////////////////////////////////////////////
// File : SeqList.h
// Purpose : Handles the sequence file functions for ParsInsert
//
// Developer : David Knox (david.knox@colorado.edu) Jan 2011
// Copyright : Copyright (C) 2007-2011 David Knox
//
// Web site : http://parsinsert.sourceforge.net/
//
///////////////////////////////////////////////////////////////////////////////////////////////////
// This file is part of ParsInsert.
//
// ParsInsert is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// ParsInsert is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with ParsInsert. If not, see <http://www.gnu.org/licenses/>.
///////////////////////////////////////////////////////////////////////////////////////////////////
#if !defined(__SEQLIST_H__)
#define __SEQLIST_H__
#if _MSC_VER > 1000
#pragma once
#endif // _MSC_VER > 1000
#include "Knox_Stddef.h"
#include <vector>
#include <string>
#include <map>
using namespace std;
// 64 * 1024 = 65536. Used in this header as the default value of the `size`
// argument of the CSequenceFile constructor.
// NOTE(review): presumably an upper bound on sequence length in bytes - confirm
// against the implementation file.
#define MAX_SEQ_SIZE (64*1024)
// Forward declaration only: this header refers to CPNode exclusively through
// pointers, so the full class definition is not needed here.
class CPNode;
///////////////////////////////////////////////////////////////////////////////////////////////////
class CBestLocationEntry
{
public:
CPNode *node; // pointer to node with best match
int score; // score of match at this position
string tax; // taxonomy assignd to this position
int levels; // number of ranks in taxonomy
public:
CBestLocationEntry(int s, CPNode *n, LPCSTR t)
{
score = s;
node = n;
tax = t;
levels = 0;
if (!tax.empty())
{
for (int i=0; i < tax.length() ; ++i)
if (tax[i] == ';')
++levels;
}
}
};
typedef vector<CBestLocationEntry> CBestList;
///////////////////////////////////////////////////////////////////////////////////////////////////
// Holds a list of best matches together with the number of matches to keep.
class CBestLocation
{
public:
    static int  default_len;    // length used when the constructor is not given a positive one
    CBestList   list;           // list of best matches
    int         len;            // number of matches to keep

public:
    // A positive _len is taken as-is; zero, a negative value or an omitted
    // argument selects the class-wide default_len instead.
    CBestLocation(int _len=-1)
        : len((_len > 0) ? _len : default_len)
    {
    }

    // Both members below are implemented outside this header.
    // NOTE(review): judging by the names, Add() offers a new candidate to the
    // list and WorstScore() reports the lowest-ranked score currently held -
    // confirm against the implementation.
    void Add(int score, CPNode *node, LPCSTR t);
    int  WorstScore();
};
///////////////////////////////////////////////////////////////////////////////////////////////////
// One sequence entry of a FASTA sequence file: its id, where the entry starts
// in the file, the length of the sequence data, and buffers for the sequence
// string and the header line.
//
// NOTE(review): the class declares a destructor and holds raw LPSTR buffers
// (data, hdr) but declares no copy constructor or copy assignment. Who owns
// the buffers is not visible in this header - confirm that instances are
// never copied by value (the containers declared below store pointers only).
class CSequenceItem
{
public:
long offset; // offset in sequence file where sequence entry begins
string name; // id from the fasta file
int len; // length of the sequence data (sequence only)
LPSTR data; // sequence string
LPSTR hdr; // header line from fasta file
public:
// All member functions are implemented outside this header; the grouping
// comments below are based on names and signatures only.
CSequenceItem();
// _data is optional and defaults to NULL.
CSequenceItem(LPCSTR _name, long _offset, int _len, LPCSTR _data=NULL);
~CSequenceItem();
// Access to / management of the `data` buffer.
// NOTE(review): presumably Allocate/Release pair up on `data` - confirm.
LPCSTR GetSeqData();
BOOL AllocateSeqData();
BOOL ReleaseSeqData();
// Management of the `hdr` buffer.
// NOTE(review): `size` is presumably the buffer size in bytes - confirm.
BOOL AllocateSeqHeader(int size);
BOOL ReleaseSeqHeader();
// File I/O on an already opened FILE handle supplied by the caller.
LPCSTR ReadSeqHeader(FILE *f);
LPCSTR ReadSeqData(FILE *f);
BOOL WriteSeqData(FILE *f);
};
// Lookup table from a string key to a sequence item pointer
// (CSequenceFile::seqlist uses it as a map of name to sequence).
typedef map<string,CSequenceItem*> CSequenceList;
// Iterator over a CSequenceList.
typedef CSequenceList::iterator CSequenceListIter;
///////////////////////////////////////////////////////////////////////////////////////////////////
// A sequence file together with the sequence items read from it. Items are
// reachable both through an array (seqArray) and through a name-keyed map
// (seqlist).
//
// NOTE(review): the class holds a raw FILE* and a raw array pointer and declares
// a destructor but no copy constructor / copy assignment. Ownership is not
// visible in this header - confirm that instances are never copied by value.
class CSequenceFile
{
public:
    FILE               *f;              // file to read data
    string              fname;          // name of file opened
    int                 seqCount;       // number of sequences used in array
    int                 seqN;           // number of items allocated in array
    CSequenceItem*     *seqArray;       // array of pointers to sequence items
    CSequenceList       seqlist;        // map of name to sequence
    CSequenceListIter   seqIter;        // iterator to step thru map
    int                 posArray;       // iterator position in array
    static int          progressCount;  // number of sequences between progress reports
    static int          verbose;        // class-wide verbosity setting (values are interpreted by the implementation)

public:
    // All arguments are optional: _fname defaults to NULL, size to MAX_SEQ_SIZE
    // and hashsize to 10*1024.
    // NOTE(review): how size and hashsize are used is decided in the
    // implementation file - confirm there.
    CSequenceFile(LPCSTR _fname=NULL, int size=MAX_SEQ_SIZE, int hashsize=10*1024);
    ~CSequenceFile();

    // Lookups by name; implemented outside this header.
    CSequenceItem * GetSequence(LPCSTR name);
    CSequenceItem * GetSequenceHeader(LPCSTR name);

    // Opening / closing the underlying file. mode defaults to "r".
    BOOL            Open(LPCSTR _fname, LPCSTR mode="r");
    void            Close();

    // Returns seqCount. Const-qualified because it only reads a member, so it
    // can also be called through a const reference or pointer.
    int             GetCount() const
    {
        return seqCount;
    }

    // Loading; implemented outside this header.
    // NOTE(review): the meaning of `type` and of the int return values is not
    // visible here - confirm against the implementation.
    int             ReadSequenceIndexFile(LPCSTR filename);
    int             ReadSequenceFile(int type);

    CSequenceItem  *GetSeq(LPCSTR name);
    BOOL            WriteSequence(CSequenceItem *seq);

    // Stepping through the stored sequences; implemented outside this header.
    void            ResetSeqIterator();
    CSequenceItem  *GetNextSeq();

    int             ReadTaxonomyFile();
};
#endif // __SEQLIST_H__
|