File: binary_sa_search.h

package info (click to toggle)
hisat2 2.1.0-2
  • links: PTS, VCS
  • area: main
  • in suites: bullseye, buster, sid
  • size: 13,756 kB
  • sloc: cpp: 86,309; python: 12,230; sh: 2,171; perl: 936; makefile: 375
file content (102 lines) | stat: -rw-r--r-- 3,535 bytes parent folder | download | duplicates (7)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
/*
 * Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
 *
 * This file is part of Bowtie 2.
 *
 * Bowtie 2 is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Bowtie 2 is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
 */

#ifndef BINARY_SA_SEARCH_H_
#define BINARY_SA_SEARCH_H_

#include <stdint.h>
#include <iostream>
#include <limits>
#include "alphabet.h"
#include "assert_helpers.h"
#include "ds.h"
#include "btypes.h"

/**
 * Binary-search 'sa', an already lexicographically sorted list of
 * suffix offsets into 'host' (all suffixes of host or just a subset),
 * for the suffix of 'host' that begins at offset 'qry'.
 *
 * Ordering convention used by the comparison below: when two suffixes
 * agree up to the point where one of them runs off the end of 'host',
 * the one that ran off the end is treated as the GREATER of the two.
 *
 * We use the Manber and Myers optimization of maintaining a pair of
 * counters for the longest lcp observed so far on the left- and right-
 * hand sides and using the min of the two as a way of skipping over
 * characters at the beginning of a new round.
 *
 * @param host text whose suffixes are being compared
 * @param qry  offset into host of the query suffix
 * @param sa   sorted list of suffix offsets into host
 * @return the index in sa of the smallest suffix that is larger than
 *         the query suffix, or sa.size() if every suffix in sa is
 *         smaller than the query; the maximum TIndexOffU value if the
 *         query offset itself is an element of sa.
 */
template<typename TStr, typename TSufElt> inline
TIndexOffU binarySASearch(
	const TStr& host,
	TIndexOffU qry,
	const EList<TSufElt>& sa)
{
	// Sentinel returned when qry is itself one of the elements of sa
	const TIndexOffU matched = std::numeric_limits<TIndexOffU>::max();
	TIndexOffU lLcp = 0, rLcp = 0; // greatest observed LCPs on left and right
	// The window uses 1-based positions: position p refers to sa[p-1].
	// l == 0 stands for "before the first element" and r == size+1 for
	// "after the last element"; both bounds are exclusive.
	TIndexOffU l = 0, r = (TIndexOffU)sa.size()+1; // binary-search window
	const TIndexOffU hostLen = (TIndexOffU)host.length();
	while(true) {
		assert_gt(r, l);
		// Midpoint computed as l + (r-l)/2 rather than (l+r)/2: the sum
		// l+r can wrap around in the unsigned index type when sa is
		// large, which would yield a midpoint outside of [l, r).
		TIndexOffU m = l + ((r - l) >> 1);
		if(m == l) {
			// Binary-search window has closed: we have an answer
			if(m > 0 && sa[m-1] == qry) {
				return matched; // qry matches
			}
			assert_leq(m, sa.size());
			return m; // Return index of right-hand suffix
		}
		assert_gt(m, 0);
		TIndexOffU suf = sa[m-1];
		if(suf == qry) {
			return matched; // query matches an elt of sa
		}
		// Both window bounds already agree with the query on at least
		// min(lLcp, rLcp) characters, so the probed suffix does too and
		// the comparison can resume from that depth.
		TIndexOffU lcp = min(lLcp, rLcp);
#ifndef NDEBUG
		// Sanity check: the skipped prefix really is shared
		if(sstr_suf_upto_neq(host, qry, host, suf, lcp)) {
			assert(0);
		}
#endif
		// Keep advancing lcp, but stop when query mismatches host or
		// when the counter falls off either the query or the suffix
		while(suf+lcp < hostLen && qry+lcp < hostLen && host[suf+lcp] == host[qry+lcp]) {
			lcp++;
		}
		// Fell off the end of either the query or the sa elt?
		const bool qryEnded = (qry+lcp == hostLen);
		const bool sufEnded = (suf+lcp == hostLen);
		const bool fell = (qryEnded || sufEnded);
		// host[...] is only read when neither suffix ran off the end
		if(qryEnded || (!fell && host[suf+lcp] < host[qry+lcp])) {
			// Query is greater than sa elt
			l = m;                 // update left bound
			lLcp = max(lLcp, lcp); // update left lcp
		}
		else if(sufEnded || (!fell && host[suf+lcp] > host[qry+lcp])) {
			// Query is less than sa elt
			r = m;                 // update right bound
			rLcp = max(rLcp, lcp); // update right lcp
		} else {
			assert(false); // Must be one or the other!
		}
	}
	// Shouldn't get here
	assert(false);
	return matched;
}

#endif /*BINARY_SA_SEARCH_H_*/