File: prefix_tree.cpp

package info (click to toggle)
patman 1.2.2%2Bdfsg-8
links: PTS, VCS
area: main
in suites: bookworm, sid, trixie
size: 220 kB
sloc: cpp: 783; makefile: 92; sh: 12
file content (668 lines) | stat: -rw-r--r-- 20,355 bytes
parent folder | download | duplicates (3)
// PatMaN DNA pattern matcher
// (C) 2007 Kay Pruefer, Udo Stenzel
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or (at
// your option) any later version.  See the LICENSE file for details.


#include "prefix_tree.h" 

#include <cassert>
#include <iostream>
#include <queue>
#include <cstdlib>

using namespace std ;

// Why is the node store a deque?  Allocating many small, equally sized
// objects is faster through it, and tearing down the whole deque is
// much faster than freeing the objects one by one, especially with our
// extremely simple allocation pattern.  This is probably the most
// sensible way to plug this memory leak.
//
// Construction leaves the tree holding exactly one node, the root,
// which the rest of the file addresses as nodes[0].
prefix_tree::prefix_tree()
	: nodes()
{
	node root = node() ;
	nodes.push_back( root ) ;
}

#ifndef NDEBUG
void prefix_tree::debug( std::ostream& s, const node& n ) const
{
	if( debug_flags & debug_trie )
	{
		for( int i = 0 ; i != 5 ; ++i )
			s << n.label << " -" << "ACGTN"[i] << "> " << n.childs[i]->label << '/' << (int)n.drop[i] << std::endl ;
		if( n.suffix ) s << n.label << " ==> " << n.suffix->label << '/' << n.drop_suffix << std::endl ;

		for( int i = 0 ; i != 5 ; ++i )
			if( !n.drop[i] ) debug( s, *n.childs[i] ) ;
	}
}
#endif


// Head of a singly linked free list of discarded mismatch_ptr records,
// chained through their 'next' members.
static mismatch_ptr *junk_yard = 0 ;

// Hands out one mismatch_ptr record: the head of the free list if it is
// non-empty, a freshly allocated record otherwise.  The record's fields
// are not reset here; the caller has to fill them in.
static mismatch_ptr *new_node()
{
	if( !junk_yard ) return new mismatch_ptr ;

	mismatch_ptr *recycled = junk_yard ;
	junk_yard = recycled->next ;
	return recycled ;
}

// Returns a one-element list whose single mismatch pointer sits on the
// root node (nodes[0]) with every counter and flag cleared.
mismatch_ptr *prefix_tree::seed() const 
{
	mismatch_ptr *fresh = new_node() ;

	// position: the root, not chained to anything
	fresh->ptr = &nodes[0] ;
	fresh->next = 0 ;

	// bookkeeping starts from zero
	fresh->matched = 0 ;
	fresh->first = 0 ;
	fresh->mismatch = 0 ;
	fresh->gaps = 0 ;
	fresh->isgap = 0 ;

	return fresh ;
}

// Discards the list 'p' (may be null) by prepending all of it to the
// free list, then returns a newly seeded list.
mismatch_ptr *prefix_tree::init( mismatch_ptr *p ) const 
{
	if( p ) {
		// find the last element so the whole chain can be recycled at once
		mismatch_ptr *tail = p ;
		for( ; tail->next ; tail = tail->next ) {}

		tail->next = junk_yard ;
		junk_yard = p ;
	}
	return seed() ;
}

// Returns the child of 'n' in slot 'pos'.  If that slot is still empty,
// a new node one level deeper than 'n' is appended to 'nodes', hooked
// into the slot and returned.  When trie debugging is on, the new node
// is labelled with the parent's label plus the letter for 'pos'.
node* prefix_tree::create_go_node( node* n, int pos ) {
	if ( n->childs[pos] ) return n->childs[pos] ;

	node fresh ;
	fresh.depth = n->depth + 1 ;
	nodes.push_back( fresh ) ;

	// 'n' and the new address are both expected to stay valid across
	// later push_backs -- presumably the reason 'nodes' is a deque.
	node *made = &nodes.back() ;
	n->childs[pos] = made ;
#ifndef NDEBUG
	if( debug_flags & debug_trie )
		made->label = n->label + "ACGTN"[pos] ;
#endif
	return made ;
}

// Inserts seq[i..] into the trie below 'ptr', one character per level,
// and attaches a probe record (name, strand, length) to the node reached
// once the whole sequence has been consumed.
//
//   seq    - the sequence to insert (case-insensitive)
//   name   - probe name stored on the final node
//   ptr    - node corresponding to the prefix seq[0..i)
//   i      - index of the next character to consume
//   strand - strand tag stored on the probe
//
// A/C/G/T(U) go to child slots 0..3; N, X and every unrecognised
// character go to slot 4.  For an unrecognised character a warning is
// written to clog unless debug_notquiet is already cleared, and the
// flag is cleared afterwards, so there is at most one warning.
//
// NOTE(review): a '-' ends the insertion without registering a probe,
// i.e. a sequence containing '-' is not added at all -- confirm that
// this is intended.
void prefix_tree::add_recursion( const string& seq, const string& name, node* ptr, size_t i, char strand ) 
{
	// Whole sequence consumed: this node is the leaf for the probe.
	if ( i == seq.size() ) {
		probe_s *probe = new probe_s() ;
		probe->name = name ;
		probe->strand = strand ;
		probe->length = seq.size() ;
		probe->next = ptr->probes ;
		ptr->probes = probe ;
		return ;
	}

	int pos = 4 ;
	bool unexpected = false ;

	// toupper() is only defined for values representable as unsigned
	// char (or EOF), so a plain char must not be passed directly.
	switch ( toupper( static_cast<unsigned char>( seq[i] ) ) ) {
		case 'A': pos = 0 ; break ;
		case 'C': pos = 1 ; break ;
		case 'G': pos = 2 ; break ;
		case 'T':
		case 'U': pos = 3 ; break ;
		case 'N':
		case 'X': pos = 4 ; break ;
		case '-': return ;
		default:  pos = 4 ; unexpected = true ; break ;
	}

	add_recursion( seq, name, create_go_node( ptr, pos ), i+1, strand ) ;

	// Warn after the recursion, as before.  'i' is left untouched by the
	// call above, so seq[i] really is the offending character.
	if( unexpected ) {
		if( debug_flags & debug_notquiet )
			clog << "Warning: Sequence with name " << name 
				<< " has character '" << seq[i]
				<< "' other than ACGTUXN." << endl ;
		debug_flags &= ~debug_notquiet ;
	}
}

// Maps an upper-cased nucleotide code to the child slots it stands for,
// written as a string of digits ('0'=A, '1'=C, '2'=G, '3'=T) in the
// order in which the slots are to be visited.  Returns a null pointer
// for any code that is not listed.
static const char *ambiguity_slots( int code )
{
	switch ( code ) {
		case 'A': return "0" ;
		case 'C': return "1" ;
		case 'G': return "2" ;
		case 'T':
		case 'U': return "3" ;
		case 'N': return "0123" ;
		case 'R': return "20" ;
		case 'Y': return "13" ;
		case 'M': return "01" ;
		case 'K': return "23" ;
		case 'S': return "12" ;
		case 'W': return "03" ;
		case 'B': return "123" ;
		case 'D': return "023" ;
		case 'H': return "013" ;
		case 'V': return "012" ;
		default:  return 0 ;
	}
}

/***
 * Adding the sequence taking into account the ambiguity codes. 
 * i.e. Sequence with "N" will be added to A, C, G and T node
 *
 *   seq    - the sequence to insert (case-insensitive)
 *   name   - probe name stored on every final node
 *   ptr    - node corresponding to the prefix consumed so far
 *   i      - index of the next character to consume
 *   strand - strand tag stored on the probe
 *
 * Every ambiguity code fans out into one recursive insertion per base
 * it stands for, so a probe record is attached to each expanded path.
 * A character that is neither a known code nor '-' goes to child slot
 * 4 and triggers at most one warning on clog (debug_notquiet is
 * cleared afterwards).
 *
 * NOTE(review): a '-' ends the insertion without registering a probe.
 * NOTE(review): the warning text does not mention V although V is
 * handled, and it lists X although X takes the fallback path.
 */
void prefix_tree::add_recursion_ambiguity( const string& seq, const string& name, node* ptr, size_t i, char strand ) 
{
	// Whole sequence consumed: this node is a leaf for the probe.
	if ( i == seq.size() ) {
		probe_s *probe = new probe_s() ;
		probe->name = name ;
		probe->strand = strand ;
		probe->length = seq.size() ;
		probe->next = ptr->probes ;
		ptr->probes = probe ;
		return ;
	}

	// toupper() is only defined for values representable as unsigned
	// char (or EOF), so a plain char must not be passed directly.
	const int code = toupper( static_cast<unsigned char>( seq[i] ) ) ;

	if ( code == '-' ) return ;

	if ( const char *slots = ambiguity_slots( code ) ) {
		for ( ; *slots ; ++slots )
			add_recursion_ambiguity( seq, name, create_go_node( ptr, *slots - '0' ), i+1, strand ) ;
		return ;
	}

	// Unknown character: file it under slot 4 and warn once.
	add_recursion_ambiguity( seq, name, create_go_node( ptr, 4 ), i+1, strand ) ;
	if( debug_flags & debug_notquiet )
		clog << "Warning: Sequence with name " << name 
			<< " has character '" << seq[i]
			<< "' other than ACGTU, BDH, SWKMRY, or XN." << endl ;
	debug_flags &= ~debug_notquiet ;
}

/* Ambiguity codes:
 *                           R = G or A (puRine)
 *                           Y = C or T (pYrimidine)
 *                           M = A or C (aMino)
 *                           K = G or T (Ketone)
 *                           S = G or C (Strong coupling)
 *                           W = A or T (Weak coupling)
 *                           B = not A (C or G or T)
 *                           D = not C (A or G or T)
 *                           H = not G (A or C or T)
 *                           V = not U (A or C or G)
 *                           N = A or C or G or T (Nucleotide)
 */

/*
 * Reverse complement of a DNA sequence, including correct ambiguity
 * mapping.
 *
 * The input is read back to front.  A/C/G/T(U) and the ambiguity codes
 * R/Y, M/K, B/V, D/H are replaced by their upper-case complement.  The
 * self-complementary symbols S, W, N, X and '-' are copied through
 * unchanged, keeping their original case.  X is accepted here because
 * add_recursion() treats it as a regular N-like base; rejecting it
 * would abort the program for input the trie builder accepts.
 *
 * Any other character is reported on clog and terminates the program
 * with exit status 1.
 */
inline static string reverse_complement( const string& s ) {
	string ret ;
	ret.reserve( s.size() ) ;

	for ( string::const_reverse_iterator it = s.rbegin() ; it != s.rend() ; ++it ) {
		const char c = *it ;
		// toupper() is only defined for values representable as
		// unsigned char (or EOF).
		switch ( toupper( static_cast<unsigned char>( c ) ) ) {
			case 'A':
				ret += 'T' ; break ;
			case 'C':
				ret += 'G' ; break ;
			case 'G':
				ret += 'C' ; break ;
			case 'T':
			case 'U':
				ret += 'A' ; break ;
			case 'R':
				ret += 'Y' ; break ;
			case 'Y':
				ret += 'R' ; break ;
			case 'M':
				ret += 'K' ; break ;
			case 'K':
				ret += 'M' ; break ;
			case 'B':
				ret += 'V' ; break ;
			case 'V':
				ret += 'B' ; break ;
			case 'D':
				ret += 'H' ; break ;
			case 'H':
				ret += 'D' ; break ;
			case 'S':
			case 'W':
			case 'N':
			case 'X':
			case '-':
				ret += c ;
				break ; 
			default:
				clog << "Sequence has invalid character '"
					 << c << "'." << endl ;
				exit( 1 ) ;
		}
	}
	return ret ;
}

void prefix_tree::add_seq( const string& seq, const string& name, bool ambi_codes ) 
{
	node *ptr = &nodes[0] ;
	int i = 0 ;
	if ( ! ambi_codes ) {
		add_recursion( seq, name, ptr, i, '+' ) ;
		if ( only_plus_strand != 1 ) {
			string rev_seq = reverse_complement( seq ) ;
			add_recursion( rev_seq, name, ptr, i, '-' ) ;
		}
	} else {
		add_recursion_ambiguity( seq, name, ptr, i, '+' ) ;
		if ( only_plus_strand != 1 ) {
			string rev_seq = reverse_complement( seq ) ;
			add_recursion_ambiguity( rev_seq, name, ptr, i, '-' ) ;
		}
	}
}

// Suffix links.  These are only needed as an intermediate step.  Each
// node gets a suffix link, which points to the node that is the longest
// existing true suffix.  Given a node that already has a suffix, we
// find the suffix of child "A" by following the "A" link from our
// suffix and adding the 'drop' values for the "suffix" and the "A"
// link.  We can then fill in the missing children for this node in the
// same fashion.  To ensure that any node we visit already has a suffix,
// we do a breadth-first search.
//
// The root never has a suffix.  To start the fill-in, each missing
// child of the root becomes the root itself with a "drop" of one and
// the existing ones get the root as suffix with a drop of one.
//
// Also, after we added the missing children, we no longer need the
// suffix link for anything but accessing query sequences that happen to
// be substrings of other queries.  So we can optimize these.

void prefix_tree::add_suffix_links()
{
	std::queue< node* > q ;
	node& root = nodes[0] ;
	for( int i = 0 ; i != 5 ; ++i )
	{
		if( root.childs[i] )
		{
			root.childs[i]->suffix = &root ;
			root.childs[i]->drop_suffix = 1 ;
			q.push( root.childs[i] ) ;
		}
		else
		{
			root.childs[i] = &root ;
			root.drop[i] = 1 ;
		}
	}

	while( !q.empty() ) 
	{
		node* n = q.front() ;
		q.pop() ;
		for( int i = 0 ; i != 5 ; ++i )
		{
			if( n->childs[i] )
			{
				n->childs[i]->suffix = n->suffix->childs[i] ;
				n->childs[i]->drop_suffix = n->drop_suffix + n->suffix->drop[i] ;
				q.push( n->childs[i] ) ;
			}
			else
			{
				n->childs[i] = n->suffix->childs[i] ;
				n->drop[i] = n->drop_suffix + n->suffix->drop[i] ;
			}
		}
		if( !n->suffix->probes )
		{
			if( n->suffix->suffix )
			{
				n->drop_suffix += n->suffix->drop_suffix ;
				n->suffix = n->suffix->suffix ;
			}
			else
			{
				n->suffix = 0 ;
			}
		}
	}
}

inline static void out( int mm, const node* n, int mlen )
{
	for( const probe_s *probe = n->probes ; probe ; probe = probe->next )
	{
		( output ? *output : cout )
		    << genome_name <<  '\t' << probe->name << '\t'
			<< position_genome - mlen << '\t'
			<< position_genome - 1 << '\t'
			<< probe->strand << '\t' << mm << '\n' ;
	}
}

// Why does this work?  Because when we follow suffix links (to print
// matches that are completely inside other matches), we know that we
// only drop perfectly matching stuff from the beginning.  This allows
// us to update "mlen".
 
inline static void out_rec( int mm, const node* n, int first, int mlen )
{
	if( n->probes ) out( mm, n, mlen ) ;

	const node* nn = n ;
	while( nn->suffix && nn->drop_suffix <= first )
	{
		first -= nn->drop_suffix ;
		mlen -= nn->drop_suffix ;
		nn = nn->suffix ;
		if( nn->probes ) out( mm, nn, mlen ) ;
	}
}


/* Advancing the mismatch pointers.
 * - We receive a (singly linked) list of mps.  There has to be exactly
 *   one mp that didn't encounter any mismatches in its history (keeping
 *   this one ensures that the algorithm reduces to Aho-Corasick if
 *   mismatches aren't allowed).  To avoid too much special treatment,
 *   this special pointer will always be the first in the list.  We
 *   process it first, initialize the result list with it or re-seed the
 *   result list, then go into the loop over the rest of the list.  This
 *   implies an if-block in the middle of the loop and it also implies
 *   that some pointers are only initialized in the loop.
 *
 * - We will keep multiple pointers into the two involved lists: one to
 *   the end of the result list; this is where we splice in pointers to
 *   be queued for the next round.  This list is at all times properly
 *   terminated and disconnected from the stuff we're processing this
 *   round.
 *
 * - One to the next node to be processed.  This one is easy, it is
 *   advanced once per loop, it's the head of the incoming list and
 *   we're done if it becomes null.
 *
 * - One several places ahead of the current one.  We prefetch the node
 *   pointed to by this one and its next link.  Keeping another yet more
 *   advanced pointer to prefetch mismatch pointers is useless because
 *   of the need to follow the chain of next pointers.  This one has to
 *   be valid at all times (see next point), so if it would become null,
 *   we don't advance it.
 *
 * - When gapping, we need to splice new mps into the queue being
 *   processed.  We do this after the node we prefetched last.  That
 *   makes sense, because we already have that mp in cache, so we can
 *   splice without penalty.  It works fine, as long as there is such a
 *   pointer, and to ensure that, we first handle gaps (in this
 *   direction) while the current node is still in the queue, then
 *   cache the contents of the current node, handle the node itself,
 *   thereby removing and/or splicing it, then handle mismatches and/or
 *   gaps.
 */

// Obtains a mismatch_ptr record (from the free list if possible, else
// newly allocated), links it in directly behind 'mp' and returns it.
// Only the record's 'next' member is set here; the caller fills in the
// rest.
inline static mismatch_ptr *splice_fresh_after( mismatch_ptr *mp )
{
	mismatch_ptr *fresh ;
	if( junk_yard )
	{
		fresh = junk_yard ;
		junk_yard = fresh->next ;
	}
	else
	{
		fresh = new mismatch_ptr ;
	}

	fresh->next = mp->next ;
	mp->next = fresh ;
	return fresh ;
}

// Advances every mismatch pointer in the list 'ptrs' by one genome
// character and returns the list to be used for the next character.
//
//   c    - the genome character.  '-' is skipped: 'ptrs' is returned
//          unchanged.  A/C/G/T/U (either case) select child slots 0..3.
//          Anything else sets pos to -1: the first pointer is then
//          recycled and replaced by a fresh seed, while all other
//          pointers follow child slot 4.
//   ptrs - head of the incoming list; must not be null (it is
//          dereferenced unconditionally).  Its first element is treated
//          as the pointer without mismatches.
//
// Returns the head of the outgoing list, which always starts with the
// mismatch-free pointer.
//
// Side effects: counts every processed pointer in total_nodes, reports
// matches through out_rec(), moves discarded records to the junk_yard
// free list, and prints the length of the outgoing list to clog when
// debug_numnodes is set.
mismatch_ptr *prefix_tree::compare( char c, mismatch_ptr *ptrs ) const
{
	// Translate the genome character into a child slot.
	int pos ;
	switch( c )
	{
		case '-': return ptrs ;
		case 'A': case 'a': pos = 0 ; break ;
		case 'C': case 'c': pos = 1 ; break ;
		case 'G': case 'g': pos = 2 ; break ;
		case 'T': case 't': pos = 3 ; break ;
		case 'U': case 'u': pos = 3 ; break ;
		default: pos = -1 ; break ; 
	}

	bool is_first = true ;
	// new_head/last_done: head and tail of the outgoing list.
	mismatch_ptr *new_head = 0, *last_done = 0 ;
	mismatch_ptr *next_in_queue = ptrs ;
	// Start the prefetch cursor up to do_prefetch elements ahead, but
	// never move it past the last element.
	mismatch_ptr *next_to_prefetch = ptrs ;
	for( int i = 0 ; i != do_prefetch && next_to_prefetch->next ; ++i )
		next_to_prefetch = next_to_prefetch->next ;

	while( next_in_queue ) 
	{
		++total_nodes ;
		if( next_to_prefetch && next_to_prefetch->ptr  ) __builtin_prefetch( next_to_prefetch->ptr  ) ;
		if( next_to_prefetch && next_to_prefetch->next ) __builtin_prefetch( next_to_prefetch->next ) ;

		// Cache the current record's contents; the record itself gets
		// modified, moved or recycled further down.
		const node* n = next_in_queue->ptr ;
		size_t mm = next_in_queue->mismatch ;
		size_t gg = next_in_queue->gaps ;
		int ff = next_in_queue->first ;
		int nn = next_in_queue->matched ;

		// Report matches ending here, except for pointers that were just
		// created by a gap.  The mismatch-free pointer passes 255 as its
		// 'first' value.
		if( !next_in_queue->isgap ) out_rec( mm, n, is_first ? 255 : ff, nn ) ;

		// If gaps are allowed, also add gapped nodes for
		// processing _in_this_iteration_.  If
		// discount_adenine is set, don't gap an A (makes
		// bookkeeping difficult).
				
		if( gg < allow_gaps && mm < cutoff && n->depth > 0 ) {
			for ( int i = discount_adenine ? 1 : 0 ; i != 5 ; i++ ) {
				int ff_ = is_first ? n->depth-1 : ff ;
				if( ff_ >= n->drop[i] ) {
					assert( next_to_prefetch ) ;
					mismatch_ptr *mp2 = splice_fresh_after( next_to_prefetch ) ;
					mp2->ptr = n->childs[i] ;
					mp2->mismatch = mm + 1 ;
					mp2->gaps = gg + 1 ;
					mp2->first = ff_ - n->drop[i] ;
					mp2->isgap = 1 ;
					mp2->matched = nn ;
				}
			}
		}

		// Are we gapping As?  then queue that up, no matter how many
		// mismatches we've already accumulated.  (Note how this isn't
		// even considered a gap, note that gapping the root actually
		// makes a certain kind of sense.)

		if( discount_adenine ) {
			int ff_ = is_first ? n->depth-1 : ff ;
			if( ff_ >= n->drop[0] && n->drop[0] ) {
				assert( next_to_prefetch ) ;
				mismatch_ptr *mp2 = splice_fresh_after( next_to_prefetch ) ;
				mp2->ptr = n->childs[0] ;
				mp2->mismatch = mm ;
				mp2->gaps = gg ;
				mp2->first = ff_ - n->drop[0] ;
				mp2->isgap = 0 ;
				mp2->matched = nn ;
			}
		}

		// No longer is anything enqueued into the old queue, so we no
		// longer need the next_to_prefetch pointer and can move it
		// ahead.
		if( next_to_prefetch->next ) next_to_prefetch = next_to_prefetch->next ;

		// At this point, we no longer run into difficulties if the
		// incoming queue gets emptied.  So now we move the pointer to its
		// matching child and throw it out if dropping the prefix kills
		// a mismatch.  Also, we can move the next_to_prefetch pointer,
		// since it's not needed anymore in this iteration and might
		// point to nowhere now.  Moreover, if we're at the first node
		// (the one without any mismatches), this is the place to
		// initialize the new list.

		// Slot actually followed by the non-first pointers: an
		// unrecognised genome character (pos == -1) uses slot 4.
		int pos_ = pos == -1 ? 4 : pos ;
		if( is_first ) {
			next_in_queue = ptrs->next ;
			if( pos == -1 ) {
				// Unrecognised character: restart from the root and
				// recycle the old head.
				new_head = seed() ;
				ptrs->next = junk_yard ;
				junk_yard = ptrs ;
			} else {
				// Reuse the old head as the head of the outgoing list.
				new_head = ptrs ;
				new_head->matched += 1 - ptrs->ptr->drop[pos] ;
				new_head->ptr = ptrs->ptr->childs[pos] ;
				new_head->next = 0 ;
			}
			last_done = new_head ;
		} else if( ff >= n->drop[pos_] ) {
			// The pointer survives: advance it and move the record from
			// the incoming list to the tail of the outgoing list.
			next_in_queue->ptr = n->childs[pos_] ;
			next_in_queue->first -= n->drop[pos_] ;
			next_in_queue->matched ++ ;
			next_in_queue->matched -= n->drop[pos_] ;
			next_in_queue->isgap = 0 ;
			assert( !last_done->next ) ;
			last_done->next = next_in_queue ;
			last_done = next_in_queue ;
			next_in_queue = next_in_queue->next ;
			last_done->next = 0 ;
		} else {
			// The pointer dies: unlink the record and recycle it.
			mismatch_ptr *p = next_in_queue ;
			next_in_queue = next_in_queue->next ;
			p->next = junk_yard ;
			junk_yard = p ;
		}

		// The following always produces new mismatches, so skip it
		// completely if we're already at the limit.

		if ( mm < cutoff ) {
			// if gaps are allowed, always keep the node (but don't
			// uselessly gap the root node)
			if( gg < allow_gaps && n->depth > 0 )
			{
				assert( last_done ) ;
				mismatch_ptr *p = splice_fresh_after( last_done ) ;
				p->ptr = n ;
				p->mismatch = mm + 1 ;
				p->gaps = gg + 1 ;
				p->isgap = 1 ;
				p->first = is_first ? n->depth-1 : ff ;
				p->matched = nn + 1 ;
				last_done = p ;
			}

			// Take care of the children, but not the one that matches.
			// (That one has already been handled.)

			for ( int i = 0 ; i != 5 ; i++ ) {
				if( i != pos ) {
					// The child always exists, but we follow it only
					// iff the first mismatch would not be forgotten by
					// following a suffix link.
					assert( n->childs[i] ) ;
					int ff_ = is_first ? n->depth : ff ;
					if( ff_ >= n->drop[i] )
					{
						mismatch_ptr *p = splice_fresh_after( last_done ) ;
						p->ptr = n->childs[i] ;
						p->first = ff_ - n->drop[i] ;
						p->mismatch = mm + 1 ;
						p->gaps = gg ;
						p->isgap = 0 ;
						p->matched = nn + 1 - n->drop[i] ;
						assert( !p->next ) ;
						assert( last_done->next == p ) ;
						last_done = p ;
					}
				}
			}
		}
		is_first = false ;
		assert( !last_done->next ) ;
	} 

	// Optional statistics: length of the outgoing list.
	if( debug_flags & debug_numnodes )
	{
		int n = 0 ;
		for( mismatch_ptr *p = new_head ; p ; p=p->next ) ++n ;
		std::clog << '\r' << n << std::endl;
	}
	return new_head ;
}