File: MemHash.cpp

package info (click to toggle)
libmems 1.6.0%2B4725-9
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, bullseye
  • size: 2,120 kB
  • sloc: cpp: 21,579; ansic: 4,312; xml: 115; makefile: 103; sh: 26
file content (330 lines) | stat: -rw-r--r-- 9,865 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
/*******************************************************************************
 * $Id: MemHash.cpp,v 1.32 2004/03/01 02:40:08 darling Exp $
 * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
 * Please see the file called COPYING for licensing, copying, and modification
 * Please see the file called COPYING for licensing details.
 * **************
 ******************************************************************************/

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include "libMems/MemHash.h"
#include "libGenome/gnFilter.h"
#include <list>
#include <map>
#include <sstream>

using namespace std;
using namespace genome;
namespace mems {

	MemHash::MemHash() : MatchFinder(), allocator( SlotAllocator<MatchHashEntry>::GetSlotAllocator() )

{
	table_size = DEFAULT_MEM_TABLE_SIZE;
	seq_count = 0;
	m_mem_count = 0;
	m_collision_count = 0;
	m_repeat_tolerance = DEFAULT_REPEAT_TOLERANCE;
	m_enumeration_tolerance = DEFAULT_ENUMERATION_TOLERANCE;
	//allocate the hash table
	mem_table.resize(table_size);
	mem_table_count.reserve( table_size );
	for(uint32 i=0; i < table_size; ++i)
		mem_table_count.push_back(0);
	match_log = NULL;
}

//make sure this calls the destructor on each element
MemHash::~MemHash(){
//	allocator.Free(allocated);
}

MemHash::MemHash(const MemHash& mh) : MatchFinder(mh), allocator( SlotAllocator<MatchHashEntry>::GetSlotAllocator() )
{
	*this = mh;
}

MemHash& MemHash::operator=( const MemHash& mh ){
	table_size = mh.table_size;
	mer_size = mh.mer_size;
	seq_count = mh.seq_count;
	m_mem_count = mh.m_mem_count;
	m_collision_count = mh.m_collision_count;
	m_repeat_tolerance = mh.m_repeat_tolerance;
	m_enumeration_tolerance = mh.m_enumeration_tolerance;
	mem_table.resize(table_size);
	for(uint32 i=0; i < table_size; ++i){
		mem_table_count.push_back(mh.mem_table_count[i]);
		mem_table[i] = mh.mem_table[i];
	}
	match_log = mh.match_log;
	return *this;
}

MemHash* MemHash::Clone() const{
	return new MemHash(*this);
}

void MemHash::ClearSequences()
{
	MatchFinder::Clear();
}

void MemHash::Clear()
{
	MatchFinder::Clear();
	m_mem_count = 0;
	m_collision_count = 0;
	m_repeat_tolerance = DEFAULT_REPEAT_TOLERANCE;
	m_enumeration_tolerance = DEFAULT_ENUMERATION_TOLERANCE;
	//clear the hash table
	for(uint32 listI = 0; listI < table_size; ++listI){
		mem_table[listI].clear();
		mem_table_count[ listI ] = 0;
	}
	match_log = NULL;

	allocator.Free(allocated);
	// WARNING! WARNING! WARNING! this will destroy ALL objects since the allocator has static lifetime!!
//	allocator.Purge();
}

void MemHash::SetTableSize(uint32 new_table_size){
	//allocate the hash table
	table_size = new_table_size;
	mem_table.clear();
	mem_table.resize(table_size);
	mem_table_count.clear();
	mem_table_count.resize(table_size,0);
}

boolean MemHash::CreateMatches(){
	MatchFinder::FindMatchSeeds();
	return true;
}

void MemHash::FindMatches( MatchList& ml ) {
	vector<gnSeqI> start_points;
	for( uint32 seqI = 0; seqI < ml.seq_table.size(); ++seqI ){
		start_points.push_back( 0 );
	}
	FindMatchesFromPosition( ml, start_points );
}

void MemHash::FindMatchesFromPosition( MatchList& ml, const vector<gnSeqI>& start_points ){
	for( uint32 seqI = 0; seqI < ml.seq_table.size(); ++seqI ){
		if( !AddSequence( ml.sml_table[ seqI ], ml.seq_table[ seqI ] ) ){
			ErrorMsg( "Error adding " + ml.seq_filename[seqI] + "\n");
			return;
		}
	}
	MatchFinder::FindMatchSeeds( start_points );

	GetMatchList( ml );
}

MatchList MemHash::GetMatchList() const{
	MatchList ml;
	GetMatchList( ml );
	ml.seq_table = seq_table;
	ml.sml_table = sar_table;

	return ml;
}

// an attempt to do this without sorting, which appears to be very slow...
boolean MemHash::EnumerateMatches( IdmerList& match_list )
{
	vector< uint > enum_tally(seq_count, 0);
	IdmerList::iterator iter = match_list.begin();
	IdmerList hash_list;
	for(; iter != match_list.end(); ++iter)
	{
		if( enum_tally[iter->id] < m_enumeration_tolerance )
		{
			hash_list.push_back(*iter);
		}
		if(enum_tally[iter->id] > m_repeat_tolerance)
			return true;
		++enum_tally[iter->id];
	}

	if(hash_list.size() > 1){
		if(m_enumeration_tolerance == 1)
			return HashMatch(hash_list);
		else
			return MatchFinder::EnumerateMatches( hash_list );
	}
	return true;
}

//why have separate hash tables? dunno.  no reason.  what was i thinking
// at that coffeehouse in portland when i wrote this crappy code?
// MemHashEntries use GENETICIST coordinates.  They start at 1, not 0.
boolean MemHash::HashMatch(IdmerList& match_list){
	//check that there is at least one forward component
//	match_list.sort(&idmer_id_lessthan);
	// initialize the hash entry
	MatchHashEntry mhe = MatchHashEntry(seq_count, GetSar(0)->SeedLength());
	mhe.SetLength( GetSar(0)->SeedLength() );
	
	//Fill in the new Match and set direction parity if needed.
	IdmerList::iterator iter = match_list.begin();
	for(; iter != match_list.end(); ++iter)
		mhe.SetStart(iter->id, iter->position + 1);
	SetDirection(mhe);
	mhe.CalculateOffset();
	if(mhe.Multiplicity() < 2){
		cout << "red flag " << mhe << "\n";
		cout << "match_list.size(): " << match_list.size() << endl;
	}else 
		AddHashEntry(mhe);

	return true;
}

void MemHash::SetDirection(MatchHashEntry& mhe){
	//get the reference direction
	boolean ref_forward = false;
	uint32 seqI=0;
	for(; seqI < mhe.SeqCount(); ++seqI)
		if(mhe[seqI] != NO_MATCH){
			ref_forward = !(GetSar(seqI)->GetDnaSeedMer(mhe[seqI] - 1) & 0x1);
			break;
		}
	//set directional parity for the rest
	for(++seqI; seqI < mhe.SeqCount(); ++seqI)
		if(mhe[seqI] != NO_MATCH)
			if(ref_forward == (GetSar(seqI)->GetDnaSeedMer(mhe[seqI] - 1) & 0x1))
				mhe.SetStart(seqI, -mhe[seqI]);
}

// Tries to add a new mem to the mem hash table
// If the mem already exists in the table, a pointer to it
// is returned.  Otherwise mhe is added and a pointer to
// it is returned.
MatchHashEntry* MemHash::AddHashEntry(MatchHashEntry& mhe){
	//first compute which hash table bucket this is going into
	int64 offset = mhe.Offset();

	uint32 bucketI = ((offset % table_size) + table_size) % table_size;
	vector<MatchHashEntry*>::iterator insert_he;
	insert_he = std::lower_bound(mem_table[bucketI].begin(), mem_table[bucketI].end(), &mhe, mhecomp);
//	insert_he = mem_table[bucketI].find(&mhe);
	if( insert_he != mem_table[bucketI].end() && (!mhecomp(*insert_he, &mhe) && !mhecomp(&mhe, *insert_he)) ){
		++m_collision_count;
		return *insert_he;
	}
	
	//if we made it this far there were no collisions
	//extend the mem into the surrounding region.
	vector<MatchHashEntry> subset_matches;
	if( !mhe.Extended() )
		ExtendMatch(mhe, subset_matches);

	MatchHashEntry* new_mhe = allocator.Allocate();
	new_mhe = new(new_mhe) MatchHashEntry(mhe); 
//	*new_mhe = mhe;
	allocated.push_back(new_mhe);
	
	// can't insert until after the extend!!
	insert_he = std::lower_bound(mem_table[bucketI].begin(), mem_table[bucketI].end(), new_mhe, mhecomp);
	mem_table[bucketI].insert(insert_he, new_mhe);

	// log it.
	if( match_log != NULL ){
		(*match_log) << *new_mhe << endl;
		match_log->flush();
	}
	
	// link up the subset matches
	for(uint32 subsetI = 0; subsetI < subset_matches.size(); ++subsetI){
		MatchHashEntry* submem = AddHashEntry( subset_matches[ subsetI ] );
	}
	
	++mem_table_count[bucketI];
	++m_mem_count;
	return new_mhe;
}

void MemHash::PrintDistribution(ostream& os) const{
    vector<MatchHashEntry*>::const_iterator mem_iter;
	gnSeqI base_count;
	for(uint32 i=0; i < mem_table_count.size(); ++i){
		mem_iter = mem_table[i].begin();
		base_count = 0;
		for(; mem_iter != mem_table[i].end(); ++mem_iter){
			base_count += (*mem_iter)->Length();
		}
		os << i << '\t' << mem_table_count[i] << '\t' << base_count << '\n';
	}
}

void MemHash::LoadFile(istream& mem_file){
	string tag;
	gnSeqI len;
	int64 start;
	MatchHashEntry mhe;
	getline( mem_file, tag );
	stringstream first_mum( tag );
	seq_count = 0;
	first_mum >> len;
	while( first_mum >> start ){
		seq_count++;
	}
	mhe = MatchHashEntry(seq_count, mer_size, MatchHashEntry::seed);
	first_mum.str( tag );
	first_mum.clear();
	for(uint32 seqI = 0; seqI < seq_count; seqI++){
		first_mum >> start;
		mhe.SetStart(seqI, start);
	}
	mhe.SetLength( len );
	mhe.CalculateOffset();
	AddHashEntry(mhe);
	
	while(mem_file.good()){
		mem_file >> len;
		if(!mem_file.good())
			break;
		mhe.SetLength(len);
		for(uint32 seqI = 0; seqI < seq_count; seqI++){
			mem_file >> start;
			mhe.SetStart(seqI, start);
		}
		//break if the stream ended
		if(!mem_file.good())
			break;
		mhe.CalculateOffset();
		AddHashEntry(mhe);
	}
}

void MemHash::WriteFile(ostream& mem_file) const{
	mem_file << "FormatVersion" << '\t' << 1 << "\n";
	mem_file << "SequenceCount" << '\t' << sar_table.size() << "\n";
	for(unsigned int seqI = 0; seqI < seq_count; seqI++){
		mem_file << "Sequence" << seqI << "File";
		gnGenomeSpec* specker = seq_table[seqI]->GetSpec();
		string sourcename = specker->GetName();
		if( sourcename == "" )
			sourcename = "null";
		mem_file << '\t' << sourcename << "\n";
		mem_file << "Sequence" << seqI << "Length";
		mem_file << '\t' << seq_table[seqI]->length() << "\n";
	}
	mem_file << "MatchCount" << '\t' << m_mem_count << endl;
	//get all the mems out of the hash table and write them out
    vector<MatchHashEntry*>::const_iterator mem_table_iter;
	for(uint32 i=0; i < table_size; i++){
		mem_table_iter = mem_table[i].begin();
		for(; mem_table_iter != mem_table[i].end(); mem_table_iter++)
			mem_file << **mem_table_iter << "\n";
	}
}


} // namespace mems