File: Aligner.h

package info (click to toggle)
libmems 1.6.0%2B4725-4
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 2,092 kB
  • ctags: 2,761
  • sloc: cpp: 21,579; ansic: 4,312; xml: 115; makefile: 101; sh: 26
file content (307 lines) | stat: -rw-r--r-- 11,670 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
/*******************************************************************************
 * $Id: Aligner.h,v 1.23 2004/04/19 23:10:13 darling Exp $
 * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
 * This file is licensed under the GPL.
 * Please see the file called COPYING for licensing details.
 * **************
 ******************************************************************************/

#ifndef _Aligner_h_
#define _Aligner_h_

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include "libGenome/gnSequence.h"
#include "libMems/DNAMemorySML.h"
#include "libMems/GappedAligner.h"
#include "libMems/MatchList.h"
#include "libMems/Interval.h"
#include "libMems/IntervalList.h"
#include "libMems/MemHash.h"
#include "libMems/MaskedMemHash.h"
#include <map>
#include "libMems/NumericMatrix.h"
#include "libMems/GreedyBreakpointElimination.h"
#include <list>
#include "libMems/LCB.h"
#include "libMUSCLE/threadstorage.h"

namespace mems {

/**
 * A mem labeled with a number.
 * Used by LCB construction algorithm 
 */
class LabeledMem{
public:
	Match* mem;
	uint label;
};

/**
 * Compares Matches labeled with a number.
 * Used by LCB construction algorithm 
 */
class LabeledMemComparator {
public:
	LabeledMemComparator( uint seq ){
		m_seq = seq;
	}
	LabeledMemComparator( const LabeledMemComparator& lmc ){
		m_seq = lmc.m_seq;
	}
	boolean operator()(const LabeledMem& a, const LabeledMem& b) const{
		
		int64 a_start = a.mem->Start( m_seq ), b_start = b.mem->Start( m_seq );
		if( a_start == NO_MATCH || b_start == NO_MATCH ){
			if( b_start != NO_MATCH )
				return true;
			return false;
		}
		if(a_start < 0)
			a_start = -a_start;
//			a_start = -a_start + a.mem->Length();
		if(b_start < 0)
			b_start = -b_start;
//			b_start = -b_start + b.mem->Length();
		int64 diff = a_start - b_start;
		return diff < 0;
	}
protected:
	uint m_seq;
private:
	LabeledMemComparator();
};

/**
 * A match with an associated list iterator.
 * Used by LCB construction algorithm 
 */
class PlacementMatch{
public:
	Match* mem;
	std::list< LabeledMem >::iterator iter;
};

/**
 * Compares Matches.
 * Used by LCB construction algorithm 
 */
class PlacementMatchComparator {
public:
	PlacementMatchComparator( uint seq ){
		m_seq = seq;
	}
	PlacementMatchComparator( PlacementMatchComparator& lmc ){
		m_seq = lmc.m_seq;
	}
	boolean operator()(const PlacementMatch& a, const PlacementMatch& b) const{
		
		int64 a_start = a.mem->Start( m_seq ), b_start = b.mem->Start( m_seq );
		if( a_start == NO_MATCH || b_start == NO_MATCH ){
			if( b_start != NO_MATCH )
				return true;
			return false;
		}
		if(a_start < 0)
			a_start = -a_start;
//			a_start = -a_start + a.mem->Length();
		if(b_start < 0)
			b_start = -b_start;
//			b_start = -b_start + b.mem->Length();

		int64 diff = a_start - b_start;
		return diff < 0;
	}
protected:
	uint m_seq;
private:
	PlacementMatchComparator();
};


/** a cache type to remember which intervals have already been searched */
typedef std::pair< mems::Match*, mems::Match* > search_cache_t;


/**
 * Used to find locally colinear blocks (LCBs) and do recursive
 * alignments on the blocks
 * To create an alignment one need only use the align method.
 * LCB lists are typically stored using the IntervalList class.  They can be
 * read and written in interval format using that class.  For input and output
 * of gapped alignments in other formats, see the gnAlignedSequences class.
 * Other methods in this class are available for experimentation.
 */
class Aligner {
public:
	/** 
	 * Constructs an aligner for the specified number of sequences.
	 * @param seq_count 	The number of sequences that will be aligned with this Aligner
	 */
	Aligner( uint seq_count );
	Aligner( const Aligner& al );
	Aligner& operator=( const Aligner& al );

	/**
	 * Performs an alignment.  Takes a MatchList as input and outputs a list of LCBs as an IntervalList.
	 * Several of the options can be used to filter out unlikely LCBs.  If the recursive option is
	 * specified, the regions between matches in each LCB are searched for further homology and a full
	 * gapped alignment is produced.
	 * @param mlist					The MatchList to use as input for the alignment process
	 * @param interval_list			The IntervalList that is created by the alignment process
	 * @param LCB_minimum_density	The minimum density that an LCB may have to be considered a valid block
	 *   						 	This should be a number between 0 and 1.
	 * @param LCB_minimum_range		A misnomer: really it's the minimum number of matching base pairs an LCB 
	 *								must contain to be considered an LCB. Coverage is defined as 
	 *								(length of match) * (# of matching sequences)
	 * @param recursive 			Option for performing a recursive alignment.  If this is set to
	 * 								true, all regions which have gaps will be searched for exact matches.
	 * @param extend_lcbs		If true, attempt to extend the boundaries of LCBs by searching for 
	 *                          additional matches between LCBs
	 * @param tree_filename		The name of the output file to write the phylogenetic guide tree into.  If
	 *                          an empty string is specified then a temporary file is created.  
	 * @throws AlignerError 	may be thrown if an error occurs
	 */
	void align( MatchList& mlist, IntervalList& interval_list, double LCB_minimum_density, double LCB_minimum_range, boolean recursive, boolean extend_lcbs, boolean gapped_alignment, std::string tree_filename = "" );
	
	void Recursion( MatchList& r_list, Match* r_begin, Match* r_end, boolean nway_only = false );
	void GetBestLCB( MatchList& r_list, MatchList& best_lcb );
	void DoSomethingCool( MatchList& mlist, Interval& iv );
	
	/**
	 * Set the minimum size of intervening region between two anchor matches that will
	 * be considered for recursive anchor determination.  When the gaps between two anchors
	 * are less than this cutoff value the region is handed off to the dynamic programming aligner
	 * e.g. ClustalW 
	 */
	void SetMinRecursionGapLength( gnSeqI min_r_gap );

	void SetMaxExtensionIterations( uint ext_iters ){ this->max_extension_iters = ext_iters; }

	void SearchWithinLCB( MatchList& mlist, std::vector< search_cache_t >& new_cache, bool leftmost = false, bool rightmost = false );
	void RecursiveAnchorSearch( MatchList& mlist, gnSeqI minimum_weight, std::vector< MatchList >& LCB_list, boolean entire_genome, std::ostream* status_out = NULL );

	void AlignLCB( MatchList& mlist, Interval& iv );
	void SetGappedAligner( GappedAligner& gal );
	/** forwards the request to whatever gapped aligner is being used */
	void SetMaxGappedAlignmentLength( gnSeqI len );

	/** Set output parameters for permutation matrices */
	void SetPermutationOutput( std::string& permutation_filename, int64 permutation_weight );
	void WritePermutation( std::vector< LCB >& adjacencies, std::string out_filename );

	void SetRecursive( bool value ){ this->recursive = value; }
protected:
	TLS<MemHash> gap_mh;			/**< Used during recursive alignment */
	MaskedMemHash nway_mh;	/**< Used during recursive alignment to find nway matches only */
	uint32 seq_count;		/**< The number of sequences this aligner is working with */
	boolean debug;			/**< Flag for debugging output */
	
	double LCB_minimum_density;
	double LCB_minimum_range;

	uint max_extension_iters;	/**< maximum number of attempts at LCB extension */
	
	int64 cur_min_coverage;	/**< Tracks the minimum weight of the least weight LCB */
	
	gnSeqI min_recursive_gap_length;	/**< Minimum size of gap regions that will be recursed on */

	void consistencyCheck( uint lcb_count, std::vector< LCB >& adjacencies, std::vector< MatchList >& lcb_list, std::vector< int64 >& weights );
	
	boolean recursive;		/**< Set to true if a recursive anchor search/gapped alignment should be performed */
	boolean extend_lcbs;	/**< Set to true if LCB extension should be attempted */
	boolean gapped_alignment;	/**< Set to true to complete a gapped alignment */
	boolean currently_recursing;	/**< True when the recursive search has begun */
	boolean collinear_genomes;	/**< Set to true if all genomes are assumed to be collinear */
	
	GappedAligner* gal;

	std::string permutation_filename;
	int64 permutation_weight;

	std::vector< search_cache_t > search_cache; /**< a list of recursive searches that have already been done */
};

/**
 * Thrown if some error occurs during alignment
 */
CREATE_EXCEPTION( AlignerError );

void transposeMatches( MatchList& mlist, uint seqI, const std::vector< int64 >& seq_regions );

/**
 * Deletes overlapping regions in a set of matches.  Always removes matching base pairs from the
 * match covering fewer bases.  Coverage is defined as (length of match) * (# of matching sequences)
 */
void EliminateOverlaps( MatchList& ml );

/**
 * Function to determine the breakpoints in a set of matches. 
 * Sorts the matches in mlist and returns the indices of breakpoints.
 * This function attempts (sometimes unsuccessfully) to determine subset LCBs.  If a set of
 * matches containing subset LCBs has been passed to it, the resulting breakpoint set may
 * be incorrect.  You have been warned.
 * @param mlist 		A list of matches to search for LCBs.
 * @param breakpoints 	The indices of matches in the sorted match list that are at LCB boundaries
 */
void AaronsLCB( MatchList& mlist, std::set<uint>& breakpoints );


void ComputeLCBs( MatchList& meml, std::set<uint>& breakpoints, std::vector<MatchList>& lcb_list, std::vector<int64>& weights );
void computeLCBAdjacencies_v2( std::vector<MatchList>& lcb_list, std::vector< int64 >& weights, std::vector< LCB >& adjacencies );
void computeLCBAdjacencies_v2( IntervalList& iv_list, std::vector< int64 >& weights, std::vector< LCB >& adjacencies );
void scanLeft( int& left_recurseI, std::vector< LCB >& adjacencies, int min_weight, int seqI );
void scanRight( int& right_recurseI, std::vector< LCB >& adjacencies, int min_weight, int seqI );
void GetLCBCoverage( MatchList& lcb, uint64& coverage );

int64 greedyBreakpointElimination( gnSeqI minimum_weight, std::vector< LCB >& adjacencies, std::vector< int64 >& weights, std::ostream* status_out = NULL );
void filterMatches( std::vector< LCB >& adjacencies, std::vector< MatchList >& lcb_list, std::vector< int64 >& weights );

void CreateGapSearchList( std::vector< LCB >& adjacencies, const std::vector< genome::gnSequence* >& seq_table, std::vector< std::vector< int64 > >& iv_regions, boolean entire_genome );
void SearchLCBGaps( MatchList& new_matches, const std::vector< std::vector< int64 > >& iv_regions, MaskedMemHash& nway_mh );

static const uint MIN_ANCHOR_LENGTH = 9;


/** used for search cache lookups */
class SearchCacheComparator
{
public:
	SearchCacheComparator() : msc(0){};
	bool operator()( const search_cache_t& a, const search_cache_t& b ) const
	{
		bool lt = true;
		if( a.first == NULL )
		{
			if( b.first == NULL )
				lt = false;
		}else if( b.first == NULL )
		{
			lt = false;
		}else if( !msc( a.first, b.first ) )
			lt = false;
		else if( a.second == NULL )
		{
			if( b.second == NULL )
				lt = false;
		}else if( b.second == NULL )
		{
			lt = false;
		}else if( !msc( a.second, b.second ) )
			lt = false;

		return lt;		
	}
protected:
	mems::MatchStartComparator<mems::Match> msc;
};

static SearchCacheComparator cache_comparator;


}

#endif // _Aligner_h_