File: suffix_tree_algorithm.hpp

package info (click to toggle)
seqan3 3.0.2%2Bds-9
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 16,052 kB
  • sloc: cpp: 144,641; makefile: 1,288; ansic: 294; sh: 228; xml: 217; javascript: 50; python: 27; php: 25
file content (259 lines) | stat: -rw-r--r-- 8,984 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
// Copyright (c) 2016, the SDSL Project Authors.  All rights reserved.
// Please see the AUTHORS file for details.  Use of this source code is governed
// by a BSD license that can be found in the LICENSE file.
/*! \file suffix_tree_algorithm.hpp
    \brief suffix_tree_algorithm.hpp contains algorithms on CSTs
    \author Simon Gog
*/
#ifndef INCLUDED_SDSL_SUFFIX_TREE_ALGORITHM
#define INCLUDED_SDSL_SUFFIX_TREE_ALGORITHM

#include <iterator>
#include "suffix_array_algorithm.hpp"

namespace sdsl {

//! Forward search for a character c on the path on depth \f$d\f$ to node \f$v\f$.
/*!
 * \param cst      The CST object
 * \param v        The node at the endpoint of the current edge.
 * \param d        The current depth of the path starting with 0.
 * \param c        The character c which should be matched with pathlabel_{root()..v}[d]
 * \param char_pos T[char_pos-d+1..char_pos] matches with the already matched pattern P[0..d-1] or
 *                 d=0 => char_pos=0.
 * \return The number of the matching substrings in T.
 *
 *    \par Time complexity
 *        \f$ \Order{ t_{\Psi} } \f$ or \f$ \Order{t_{cst.child}} \f$
 */
template <class t_cst>
typename t_cst::size_type
forward_search(const t_cst&					   cst,
			   typename t_cst::node_type&	  v,
			   const typename t_cst::size_type d,
			   const typename t_cst::char_type c,
			   typename t_cst::size_type&	  char_pos,
			   SDSL_UNUSED
			   typename std::enable_if<std::is_same<cst_tag, typename t_cst::index_category>::value,
									   cst_tag>::type x = cst_tag())
{
	auto cc = cst.csa.char2comp[c]; // check if c occurs in the text of the csa
	if (cc == 0 and cc != c)		//   "    " "    "     "  "    "   "  "   "
		return 0;
	typename t_cst::size_type depth_node = cst.depth(v);
	if (d < depth_node) { // in an edge, no  branching
		char_pos = cst.csa.psi[char_pos];
		if (char_pos < cst.csa.C[cc] or char_pos >= cst.csa.C[cc + 1]) return 0;
		return cst.size(v);
	} else if (d == depth_node) { // at a node,  branching
		v = cst.child(v, c, char_pos);
		if (v == cst.root())
			return 0;
		else
			return cst.size(v);
	} else {
		return 0;
	}
}

//! Forward search for a pattern pat on the path on depth \f$d\f$ to node \f$v\f$.
/*!
 *    \param cst      The compressed suffix tree.
 *    \param v        The node at the endpoint of the current edge.
 *    \param d        The current depth of the path. 0 = first character on each edge of the root node.
 *    \param pat      The character c which should be matched at the path on depth \f$d\f$ to node \f$v\f$.
 *    \param char_pos One position in the text, which corresponds to the text that is already matched. If v=cst.root() and d=0 => char_pos=0.
 *
 *    \par Time complexity
 *        \f$ \Order{ t_{\Psi} } \f$ or \f$ \Order{t_{cst.child}} \f$
 */
template <class t_cst, class t_pat_iter>
typename t_cst::size_type
forward_search(const t_cst&				  cst,
			   typename t_cst::node_type& v,
			   typename t_cst::size_type  d,
			   t_pat_iter				  begin,
			   t_pat_iter				  end,
			   typename t_cst::size_type& char_pos,
			   SDSL_UNUSED
			   typename std::enable_if<std::is_same<cst_tag, typename t_cst::index_category>::value,
									   cst_tag>::type x = cst_tag())
{
	if (begin == end) return cst.size(v);
	typename t_cst::size_type size = 0;
	t_pat_iter				  it   = begin;
	while (it != end and (size = forward_search(cst, v, d, *it, char_pos))) {
		++d;
		++it;
	}
	return size;
}

//! Counts the number of occurrences of a pattern in a CST.
/*!
 * \tparam t_cst      CST type.
 * \tparam t_pat_iter Pattern iterator type.
 *
 * \param cst   The CST object.
 * \param begin Iterator to the begin of the pattern (inclusive).
 * \param end   Iterator to the end of the pattern (exclusive).
 * \return The number of occurrences of the pattern in the CSA.
 *
 * \par Time complexity
 *        \f$ \Order{ t_{backward\_search} } \f$
 */
template <class t_cst, class t_pat_iter>
typename t_cst::size_type count(const t_cst& cst, t_pat_iter begin, t_pat_iter end, cst_tag)
{
	return count(cst.csa, begin, end);
}


//! Calculates all occurrences of a pattern pat in a CST.
/*!
 * \tparam t_cst      CST type.
 * \tparam t_pat_iter Pattern iterator type.
 * \tparam t_rac      Resizeable random access container.
 *
 * \param cst   The CST object.
 * \param begin Iterator to the begin of the pattern (inclusive).
 * \param end   Iterator to the end of the pattern (exclusive).
 * \return A vector containing the occurrences of the pattern  in the CST.
 *
 * \par Time complexity
 *        \f$ \Order{ t_{backward\_search} + z \cdot t_{SA} } \f$, where \f$z\f$ is the number of
 *         occurrences of pattern in the CST.
 */
template <class t_cst, class t_pat_iter, class t_rac = int_vector<64>>
t_rac locate(const t_cst& cst,
			 t_pat_iter   begin,
			 t_pat_iter   end,
			 SDSL_UNUSED
			 typename std::enable_if<std::is_same<cst_tag, typename t_cst::index_category>::value,
									 cst_tag>::type x = cst_tag())
{
	return locate(cst.csa, begin, end);
}

//! Calculate the concatenation of edge labels from the root to the node v of a CST.
/*!
 * \tparam t_cst       CST type.
 * \tparam t_text_iter Random access iterator type.
 *
 * \param cst   The CST object.
 * \param v     The node where the concatenation of the edge label ends.
 * \param text  Random access iterator pointing to the start of an container, which can hold at least (end-begin+1) character.
 * \returns The length of the extracted edge label.
 * \pre text has to be initialized with enough memory (\f$ cst.depth(v)+1\f$ bytes) to hold the extracted text.
 */
template <class t_cst, class t_text_iter>
typename t_cst::size_type
extract(const t_cst&					 cst,
		const typename t_cst::node_type& v,
		t_text_iter						 text,
		SDSL_UNUSED
		typename std::enable_if<std::is_same<cst_tag, typename t_cst::index_category>::value,
								cst_tag>::type x = cst_tag())
{
	if (v == cst.root()) {
		text[0] = 0;
		return 0;
	}
	// first get the suffix array entry of the leftmost leaf in the subtree rooted at v
	typename t_cst::size_type begin = cst.csa[cst.lb(v)];
	// then call the extract method on the compressed suffix array
	extract(cst.csa, begin, begin + cst.depth(v) - 1, text);
}

//! Calculate the concatenation of edge labels from the root to the node v of of c CST.
/*!
 * \tparam t_rac Random access container which should hold the result.
 * \tparam t_cst CSA type.

 * \param cst The CST object.
 * \return A t_rac object holding the extracted edge label.
 * \return The string of the concatenated edge labels from the root to the node v.
 */
template <class t_cst>
typename t_cst::csa_type::string_type
extract(const t_cst&					 cst,
		const typename t_cst::node_type& v,
		SDSL_UNUSED
		typename std::enable_if<std::is_same<cst_tag, typename t_cst::index_category>::value,
								cst_tag>::type x = cst_tag())
{
	typedef typename t_cst::csa_type::string_type t_rac;
	if (v == cst.root()) {
		return t_rac(0);
	}
	// first get the suffix array entry of the leftmost leaf in the subtree rooted at v
	typename t_cst::size_type begin = cst.csa[cst.lb(v)];
	// then call the extract method on the compressed suffix array
	return extract(cst.csa, begin, begin + cst.depth(v) - 1);
}


//! Calculate the zeroth order entropy of the text that follows a certain substring s
/*!
 * \param v     A suffix tree node v. The label of the path from the root to v is s.
 * \param cst   The suffix tree of v.
 * \return      The zeroth order entropy of the concatenation of all characters that follow
                s in the original text.
 */
template <class t_cst>
double H0(const typename t_cst::node_type& v, const t_cst& cst)
{
	if (cst.is_leaf(v)) {
		return 0;
	} else {
		double h0 = 0;
		auto   n  = cst.size(v);
		for (const auto& child : cst.children(v)) {
			double p = ((double)cst.size(child)) / n;
			h0 -= p * log2(p);
		}
		return h0;
	}
}

//! Calculate the k-th order entropy of a text
/*!
 * \param cst The suffix tree.
 * \param k   Parameter k for which H_k should be calculated.
 * \return    H_k and the number of contexts.
 */
template <class t_cst>
std::pair<double, size_t> Hk(const t_cst& cst, typename t_cst::size_type k)
{
	double								hk		= 0;
	size_t								context = 0;
	std::set<typename t_cst::size_type> leafs_with_d_smaller_k;
	for (typename t_cst::size_type d = 1; d < k; ++d) {
		leafs_with_d_smaller_k.insert(cst.csa.isa[cst.csa.size() - d]);
	}
	for (typename t_cst::const_iterator it = cst.begin(), end = cst.end(); it != end; ++it) {
		if (it.visit() == 1) {
			if (!cst.is_leaf(*it)) {
				typename t_cst::size_type d = cst.depth(*it);
				if (d >= k) {
					if (d == k) {
						hk += cst.size(*it) * H0(*it, cst);
					}
					++context;
					it.skip_subtree();
				}
			} else {
				// if d of leaf is >= k, add context
				if (leafs_with_d_smaller_k.find(cst.lb(*it)) == leafs_with_d_smaller_k.end()) {
					++context;
				}
			}
		}
	}
	hk /= cst.size();
	return {hk, context};
}


} // end namespace
#endif