File: SeqList.h

package info (click to toggle)
parsinsert 1.04-4
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 105,512 kB
  • sloc: cpp: 6,519; makefile: 95; sh: 15
file content (177 lines) | stat: -rw-r--r-- 4,983 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
/////////////////////////////////////////////////////////////////////////////////////////////////// 
//  File      : SeqList.h
//  Purpose   : Handles the sequence file functions for ParsInsert
//
//  Developer : David Knox (david.knox@colorado.edu) Jan 2011
//  Copyright : Copyright (C) 2007-2011 David Knox
//
//  Web site  : http://parsinsert.sourceforge.net/
//
///////////////////////////////////////////////////////////////////////////////////////////////////
//	This file is part of ParsInsert.
//
//    ParsInsert is free software: you can redistribute it and/or modify
//    it under the terms of the GNU General Public License as published by
//    the Free Software Foundation, either version 3 of the License, or
//    (at your option) any later version.
//
//    ParsInsert is distributed in the hope that it will be useful,
//    but WITHOUT ANY WARRANTY; without even the implied warranty of
//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//    GNU General Public License for more details.
//
//    You should have received a copy of the GNU General Public License
//    along with ParsInsert.  If not, see <http://www.gnu.org/licenses/>.
///////////////////////////////////////////////////////////////////////////////////////////////////
#if !defined(__SEQLIST_H__)
#define __SEQLIST_H__

#if _MSC_VER > 1000
#pragma once
#endif // _MSC_VER > 1000

#include "Knox_Stddef.h"
#include <vector>
#include <string>
#include <map>

using namespace std;

// 64*1024 (= 65536). Within this header it is only used as the default value of
// the `size` argument of the CSequenceFile constructor.
// NOTE(review): presumably the maximum sequence/buffer size in bytes - confirm
// against the implementation file.
#define MAX_SEQ_SIZE (64*1024)

// Forward declaration: this header only refers to CPNode through pointers,
// so the full class definition is not required here.
class CPNode;

///////////////////////////////////////////////////////////////////////////////////////////////////

class CBestLocationEntry
	{
	public:

		CPNode		*node;			// pointer to node with best match
		int			score;			// score of match at this position
		string		tax;			// taxonomy assignd to this position
		int			levels;			// number of ranks in taxonomy

	public:
		CBestLocationEntry(int s, CPNode *n, LPCSTR t)
			{
			score  = s;
			node   = n;
			tax    = t;
			levels = 0;

			if (!tax.empty())
				{
				for (int i=0; i < tax.length() ; ++i)
					if (tax[i] == ';')
						++levels;
				}
			}
	};

// Sequence of candidate locations, stored by value; used as the `list` member of CBestLocation.
typedef vector<CBestLocationEntry> CBestList;

///////////////////////////////////////////////////////////////////////////////////////////////////

// Holds a list of candidate locations together with the number of entries
// that should be retained.
class CBestLocation
	{
	public:
		static int			default_len;	// capacity used when the constructor gets no positive length

		CBestList			list;		// list of best matches
		int					len;		// number of matches to keep
	public:
		// _len : requested number of matches to keep; any value <= 0
		//        (including the default of -1) selects default_len instead.
		CBestLocation(int _len=-1)
			: len((_len > 0) ? _len : default_len)
			{
			}

		// Declared here, defined in the implementation file.
		void Add(int score, CPNode *node, LPCSTR t);

		// Declared here, defined in the implementation file.
		int  WorstScore();
	};

///////////////////////////////////////////////////////////////////////////////////////////////////

// A single sequence entry: its name, where it starts in the sequence file,
// its length, and pointers to the sequence data and header text.
// All member functions are only declared here; their behaviour is defined in
// the implementation file and is not visible from this header.
//
// NOTE(review): the class holds raw pointers (data, hdr) and declares a
// destructor but no copy constructor or assignment operator.  If the destructor
// frees these buffers, copying an item would lead to a double free - confirm
// against the implementation before copying CSequenceItem objects by value.
class CSequenceItem	
	{
	public:
		long		offset;		// offset in sequence file where sequence entry begins
		string		name;		// id from the fasta file
		int			len;		// length of the sequence data (sequence only)
		LPSTR		data;		// sequence string
		LPSTR		hdr;		// header line from fasta file

	public:
		CSequenceItem();
		// _data is optional and defaults to NULL.
		CSequenceItem(LPCSTR _name, long _offset, int _len, LPCSTR _data=NULL);
		~CSequenceItem();

		// Accessor for the sequence text (presumably returns `data` - confirm).
		LPCSTR		GetSeqData();

		// Allocate / release pair for the sequence buffer (presumably `data` - confirm).
		BOOL		AllocateSeqData();
		BOOL		ReleaseSeqData();

		// Allocate / release pair for the header buffer (presumably `hdr` - confirm).
		BOOL		AllocateSeqHeader(int size);
		BOOL		ReleaseSeqHeader();

		// File I/O on an already opened FILE handle supplied by the caller.
		LPCSTR		ReadSeqHeader(FILE *f);
		LPCSTR		ReadSeqData(FILE *f);
		BOOL		WriteSeqData(FILE *f);

	};

// Ordered map from a string key to a CSequenceItem pointer, plus its iterator type.
// CSequenceFile uses it for its `seqlist` member (name -> sequence).
// NOTE(review): the map stores raw pointers; who owns and deletes the items is
// not visible in this header - confirm in the implementation.
typedef map<string,CSequenceItem*> 	CSequenceList;
typedef CSequenceList::iterator		CSequenceListIter;

///////////////////////////////////////////////////////////////////////////////////////////////////

// A sequence file together with an index of the sequences it contains.
// Sequences are reachable both through an array of item pointers (seqArray)
// and through a name-keyed map (seqlist).  Apart from GetCount(), every member
// function is only declared here; behaviour lives in the implementation file.
class CSequenceFile
	{
	public:
		FILE				*f;					// file to read data
		string				fname;				// name of file opened

		int					seqCount;			// number of sequences used in array
		int					seqN;				// number of items allocated in array
		
		CSequenceItem*		*seqArray;			// array of pointers to sequence items
		CSequenceList		seqlist;			// map of name to sequence

		CSequenceListIter	seqIter;			// iterator to step thru map
		int					posArray;			// iterator position in array

        static int          progressCount;		// number of sequences between progress reports
        static int			verbose;			// shared by all instances; NOTE(review): presumably a verbosity level/flag - confirm
        
	public:
		// All arguments are optional: _fname defaults to NULL, size to MAX_SEQ_SIZE
		// and hashsize to 10*1024.  How size and hashsize are used is not visible here.
		CSequenceFile(LPCSTR _fname=NULL, int size=MAX_SEQ_SIZE, int hashsize=10*1024);
		~CSequenceFile();

		// Look up an item by name (semantics of "header" vs. full sequence are
		// defined in the implementation file).
		CSequenceItem * GetSequence(LPCSTR name);
		CSequenceItem * GetSequenceHeader(LPCSTR name);

		// mode defaults to "r".
		BOOL			Open(LPCSTR _fname, LPCSTR mode="r");
		void			Close();

		// Returns seqCount, the number of sequences used in the array.
		int				GetCount()
							{
							return seqCount;
							}

		int				ReadSequenceIndexFile(LPCSTR filename);
		// NOTE(review): the meaning of `type` is not documented in this header.
		int				ReadSequenceFile(int type);
		CSequenceItem	*GetSeq(LPCSTR name);
		BOOL			WriteSequence(CSequenceItem *seq);

		// Iteration interface (presumably driven by seqIter / posArray - confirm).
		void			ResetSeqIterator();
		CSequenceItem	*GetNextSeq();

		int				ReadTaxonomyFile();

	};

#endif // __SEQLIST_H__