File: termgenerator_internal.cc

package info (click to toggle)
xapian-core 1.4.3-2+deb9u3
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 21,412 kB
  • sloc: cpp: 113,868; ansic: 8,723; sh: 4,433; perl: 836; makefile: 566; tcl: 317; python: 40
file content (862 lines) | stat: -rw-r--r-- 23,535 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
/** @file termgenerator_internal.cc
 * @brief TermGenerator class internals
 */
/* Copyright (C) 2007,2010,2011,2012,2015,2016,2017 Olly Betts
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
 */

#include <config.h>

#include "termgenerator_internal.h"

#include "api/omenquireinternal.h"
#include "api/queryinternal.h"

#include <xapian/document.h>
#include <xapian/queryparser.h>
#include <xapian/stem.h>
#include <xapian/unicode.h>

#include "stringutils.h"

#include <algorithm>
#include <cmath>
#include <deque>
#include <limits>
#include <list>
#include <string>
#include <unordered_map>
#include <vector>

#include "cjk-tokenizer.h"

using namespace std;

namespace Xapian {

inline bool
U_isupper(unsigned ch) {
    // ASCII-only uppercase test (used when spotting acronyms like U.N.C.L.E).
    if (ch >= 128) return false;
    return C_isupper(static_cast<unsigned char>(ch));
}

inline unsigned check_wordchar(unsigned ch) {
    // Case-fold word characters; anything else signals a term boundary.
    if (!Unicode::is_wordchar(ch)) return 0;
    return Unicode::tolower(ch);
}

inline bool
should_stem(const std::string & term)
{
    // Only stem terms whose first codepoint is in one of these letter
    // categories (notably this excludes digits and UPPERCASE_LETTER).
    Utf8Iterator u(term);
    switch (Unicode::get_category(*u)) {
	case Unicode::LOWERCASE_LETTER:
	case Unicode::TITLECASE_LETTER:
	case Unicode::MODIFIER_LETTER:
	case Unicode::OTHER_LETTER:
	    return true;
	default:
	    return false;
    }
}

/** Value representing "ignore this" when returned by check_infix() or
 *  check_infix_digit().
 *
 *  Used for zero-width characters (ZWSP, ZWNJ, ZWJ, word joiner, BOM) which
 *  shouldn't end the current term, but also shouldn't add a character to it.
 */
const unsigned UNICODE_IGNORE = numeric_limits<unsigned>::max();

inline unsigned check_infix(unsigned ch) {
    switch (ch) {
	case '\'':
	case '&':
	case 0xb7:
	case 0x5f4:
	case 0x2027:
	    // Unicode includes all these except '&' in its word boundary
	    // rules, as well as 0x2019 (which we handle below) and ':' (for
	    // Swedish apparently, but we ignore this for now as it's
	    // problematic in real world cases).
	    return ch;
	case 0x2019: // Unicode apostrophe and single closing quote.
	case 0x201b: // Unicode single opening quote with the tail rising.
	    return '\'';
    }
    // Zero-width characters which should be skipped entirely.
    if (ch >= 0x200b && (ch <= 0x200d || ch == 0x2060 || ch == 0xfeff))
	return UNICODE_IGNORE;
    return 0;
}

inline unsigned check_infix_digit(unsigned ch) {
    // Separators allowed between two digits; this list of characters comes
    // from Unicode's word identifying algorithm.
    switch (ch) {
	case ',': case '.': case ';':
	case 0x037e: // GREEK QUESTION MARK
	case 0x0589: // ARMENIAN FULL STOP
	case 0x060D: // ARABIC DATE SEPARATOR
	case 0x07F8: // NKO COMMA
	case 0x2044: // FRACTION SLASH
	case 0xFE10: // PRESENTATION FORM FOR VERTICAL COMMA
	case 0xFE13: // PRESENTATION FORM FOR VERTICAL COLON
	case 0xFE14: // PRESENTATION FORM FOR VERTICAL SEMICOLON
	    return ch;
	default:
	    break;
    }
    // Zero-width characters which should be skipped entirely.
    if (ch >= 0x200b && (ch <= 0x200d || ch == 0x2060 || ch == 0xfeff))
	return UNICODE_IGNORE;
    return 0;
}

inline bool
is_digit(unsigned ch) {
    // Decimal digits are exactly Unicode general category Nd.
    const auto category = Unicode::get_category(ch);
    return category == Unicode::DECIMAL_DIGIT_NUMBER;
}

inline unsigned check_suffix(unsigned ch) {
    // Characters allowed to end a term (so e.g. "c++" and "c#" survive).
    // FIXME: what about '-'?
    return (ch == '+' || ch == '#') ? ch : 0;
}

/** Templated framework for processing terms.
 *
 *  Calls action(term, positional, itor) for each term to add, where term is a
 *  std::string holding the term, positional is a bool indicating
 *  if this term carries positional information, and itor is the input
 *  iterator positioned just after the term.  Parsing stops early if
 *  action() returns false.
 */
template<typename ACTION> void
parse_terms(Utf8Iterator itor, bool cjk_ngram, bool with_positions, ACTION action)
{
    while (true) {
	// Advance to the start of the next term.
	unsigned ch;
	while (true) {
	    if (itor == Utf8Iterator()) return;
	    ch = check_wordchar(*itor);
	    if (ch) break;
	    ++itor;
	}

	string term;
	// Look for initials separated by '.' (e.g. P.T.O., U.N.C.L.E).
	// Don't worry if there's a trailing '.' or not.
	if (U_isupper(*itor)) {
	    const Utf8Iterator end;
	    Utf8Iterator p = itor;
	    do {
		Unicode::append_utf8(term, Unicode::tolower(*p++));
	    } while (p != end && *p == '.' && ++p != end && U_isupper(*p));
	    // One letter does not make an acronym!  If we handled a single
	    // uppercase letter here, we wouldn't catch M&S below.
	    if (term.size() > 1) {
		// Check there's not a (lower case) letter or digit
		// immediately after it.
		if (p == end || !Unicode::is_wordchar(*p)) {
		    itor = p;
		    goto endofterm;
		}
	    }
	    // Not an acronym - restart the term from scratch.
	    term.resize(0);
	}

	while (true) {
	    if (cjk_ngram &&
		CJK::codepoint_is_cjk(*itor) &&
		Unicode::is_wordchar(*itor)) {
		// CJK runs are tokenised via CJKTokenIterator; only tokens
		// of length 1 get positional information.
		const string & cjk = CJK::get_cjk(itor);
		for (CJKTokenIterator tk(cjk); tk != CJKTokenIterator(); ++tk) {
		    const string & cjk_token = *tk;
		    if (!action(cjk_token, with_positions && tk.get_length() == 1, itor))
			return;
		}
		// Advance to the start of the next term.
		while (true) {
		    if (itor == Utf8Iterator()) return;
		    ch = check_wordchar(*itor);
		    if (ch) break;
		    ++itor;
		}
		continue;
	    }
	    // Accumulate a run of word characters into the term.
	    unsigned prevch;
	    do {
		Unicode::append_utf8(term, ch);
		prevch = ch;
		if (++itor == Utf8Iterator() ||
		    (cjk_ngram && CJK::codepoint_is_cjk(*itor)))
		    goto endofterm;
		ch = check_wordchar(*itor);
	    } while (ch);

	    // See if the non-word character is a valid "infix" character
	    // (one allowed within a term) followed by another word character.
	    Utf8Iterator next(itor);
	    ++next;
	    if (next == Utf8Iterator()) break;
	    unsigned nextch = check_wordchar(*next);
	    if (!nextch) break;
	    unsigned infix_ch = *itor;
	    if (is_digit(prevch) && is_digit(*next)) {
		infix_ch = check_infix_digit(infix_ch);
	    } else {
		// Handle things like '&' in AT&T, apostrophes, etc.
		infix_ch = check_infix(infix_ch);
	    }
	    if (!infix_ch) break;
	    if (infix_ch != UNICODE_IGNORE)
		Unicode::append_utf8(term, infix_ch);
	    ch = nextch;
	    itor = next;
	}

	{
	    // Allow a short suffix of check_suffix() characters (at most 3),
	    // e.g. "c++" or "c#".
	    size_t len = term.size();
	    unsigned count = 0;
	    while ((ch = check_suffix(*itor))) {
		if (++count > 3) {
		    term.resize(len);
		    break;
		}
		Unicode::append_utf8(term, ch);
		if (++itor == Utf8Iterator()) goto endofterm;
	    }
	    // Don't index fish+chips as fish+ chips.
	    if (Unicode::is_wordchar(*itor))
		term.resize(len);
	}

endofterm:
	if (!action(term, with_positions, itor))
	    return;
    }
}

/** Tokenise text and add the resulting terms to the document being built.
 *
 *  @param itor		  Iterator over the UTF-8 text to index.
 *  @param wdf_inc	  wdf increment for each term occurrence.
 *  @param prefix	  Term prefix to prepend to each generated term.
 *  @param with_positions Whether to add positional information.
 */
void
TermGenerator::Internal::index_text(Utf8Iterator itor, termcount wdf_inc,
				    const string & prefix, bool with_positions)
{
    bool cjk_ngram = (flags & FLAG_CJK_NGRAM) || CJK::is_cjk_enabled();

    // With no stopper set, stopping is a no-op whatever stop_mode says.
    stop_strategy current_stop_mode;
    if (!stopper.get()) {
	current_stop_mode = TermGenerator::STOP_NONE;
    } else {
	current_stop_mode = stop_mode;
    }

    // Note: [=] captures `this` too, so doc, db and termpos below refer to
    // (and update) this object's members.
    parse_terms(itor, cjk_ngram, with_positions,
	[=](const string & term, bool positional, const Utf8Iterator &) {
	    // Skip terms which are too long to sensibly index.
	    if (term.size() > max_word_length) return true;

	    if (current_stop_mode == TermGenerator::STOP_ALL && (*stopper)(term))
		return true;

	    // For STEM_SOME and STEM_NONE, add the unstemmed form.
	    if (strategy == TermGenerator::STEM_SOME ||
		strategy == TermGenerator::STEM_NONE) {
		if (positional) {
		    doc.add_posting(prefix + term, ++termpos, wdf_inc);
		} else {
		    doc.add_term(prefix + term, wdf_inc);
		}
	    }

	    // Spelling data is only gathered for unprefixed terms.
	    if ((flags & FLAG_SPELLING) && prefix.empty())
		db.add_spelling(term);

	    if (strategy == TermGenerator::STEM_NONE ||
		!stemmer.internal.get()) return true;

	    if (strategy == TermGenerator::STEM_SOME) {
		if (current_stop_mode == TermGenerator::STOP_STEMMED &&
		    (*stopper)(term))
		    return true;

		// Note, this uses the lowercased term, but that's OK as we
		// only want to avoid stemming terms starting with a digit.
		if (!should_stem(term)) return true;
	    }

	    // Add the stemmed form; positional information is only added for
	    // strategies other than STEM_SOME.
	    const string& stem = stemmer(term);
	    if (rare(stem.empty())) return true;
	    string stemmed_term;
	    if (strategy != TermGenerator::STEM_ALL) {
		// Stemmed forms are distinguished by a "Z" prefix.
		stemmed_term += "Z";
	    }
	    stemmed_term += prefix;
	    stemmed_term += stem;
	    if (strategy != TermGenerator::STEM_SOME && with_positions) {
		doc.add_posting(stemmed_term, ++termpos, wdf_inc);
	    } else {
		doc.add_term(stemmed_term, wdf_inc);
	    }
	    return true;
	});
}

/// One word of the text being considered for inclusion in a snippet.
struct Sniplet {
    // Relevance weight for this word (NULL if it carries no weight).
    double* relevance;

    // Byte offset of the end of this word in the text being processed.
    size_t term_end;

    // Number of words to highlight starting here (0 = none, >1 = a phrase).
    size_t highlight;

    Sniplet(double* r, size_t t, size_t h)
	: relevance(r), term_end(t), highlight(h) { }
};

/// Sliding-window pipeline which picks the best snippet from a word stream.
class SnipPipe {
    // Words currently in the sliding window.
    deque<Sniplet> pipe;
    // Words which have left the window but are part of the best window so far.
    deque<Sniplet> best_pipe;

    // Requested length for snippet.
    size_t length;

    // Position in text of start of current pipe contents.
    size_t begin = 0;

    // Rolling sum of the current pipe contents.
    double sum = 0;

    // Number of words still to highlight from the current phrase match.
    size_t phrase_len = 0;

  public:
    // Byte offset in the text of the start of the best window found so far.
    size_t best_begin = 0;

    // Byte offset in the text of the end of the best window found so far.
    size_t best_end = 0;

    // Rolling sum for the best window found so far.
    double best_sum = 0;

    // Add one to length to allow for inter-word space.
    // FIXME: We ought to correctly allow for multiple spaces.
    explicit SnipPipe(size_t length_) : length(length_ + 1) { }

    // Feed one word in; returns false when processing can stop early.
    bool pump(double* r, size_t t, size_t h, unsigned flags);

    // Signal the end of input and trim the window to the best snippet found.
    void done();

    // Emit the next piece of the chosen snippet into output; returns false
    // once everything (including any trailing punctuation/ellipsis) is out.
    bool drain(const string & input,
	       const string & hi_start,
	       const string & hi_end,
	       const string & omit,
	       string & output);
};

// Decay factor applied to a term's relevance each time it is counted, so
// that repeat occurrences of the same term contribute progressively less.
#define DECAY 2.0

inline bool
SnipPipe::pump(double* r, size_t t, size_t h, unsigned flags)
{
    if (h > 1) {
	// This is the final term of a multi-term phrase.
	if (pipe.size() >= h - 1) {
	    // The final term of a phrase is entering the window.  Peg the
	    // phrase's relevance onto the first term of the phrase, so it'll
	    // be removed from `sum` when the phrase starts to leave the
	    // window.
	    auto & phrase_start = pipe[pipe.size() - (h - 1)];
	    if (phrase_start.relevance) {
		// Undo the decay applied when the first term's own relevance
		// was added, then remove its contribution from the sum.
		*phrase_start.relevance *= DECAY;
		sum -= *phrase_start.relevance;
	    }
	    sum += *r;
	    phrase_start.relevance = r;
	    phrase_start.highlight = h;
	    *r /= DECAY;
	}
	// The phrase's weight is now carried by its first term.
	r = NULL;
	h = 0;
    }
    pipe.emplace_back(r, t, h);
    if (r) {
	sum += *r;
	*r /= DECAY;
    }

    // If necessary, discard words from the start of the pipe until it has the
    // desired length.
    // FIXME: Also shrink the window past words with relevance < 0?
    while (t - begin > length /* || pipe.front().relevance < 0.0 */) {
	const Sniplet& word = pipe.front();
	if (word.relevance) {
	    // Undo the decay applied when this word was counted, then remove
	    // its contribution from the rolling sum.
	    *word.relevance *= DECAY;
	    sum -= *word.relevance;
	}
	begin = word.term_end;
	// A word leaving the window may still be part of the current best
	// snippet, so keep it in best_pipe if so.
	if (best_end >= begin)
	    best_pipe.push_back(word);
	pipe.pop_front();
	// E.g. can happen if the current term is longer than the requested
	// length!
	if (rare(pipe.empty())) break;
    }

    // Using > here doesn't work well, as we don't extend a snippet over terms
    // with 0 weight.
    if (sum >= best_sum) {
	// New best window - discard any part of `best_pipe` which is before
	// `begin`.
	if (begin >= best_end) {
	    best_pipe.clear();
	} else {
	    while (!best_pipe.empty() &&
		   best_pipe.front().term_end <= begin) {
		best_pipe.pop_front();
	    }
	}
	best_sum = sum;
	best_begin = begin;
	best_end = t;
    } else if ((flags & Xapian::MSet::SNIPPET_EXHAUSTIVE) == 0) {
	if (best_sum > 0 && best_end < begin) {
	    // We found something, and we aren't still looking near it.
	    // FIXME: Benchmark this and adjust if necessary.
	    return false;
	}
    }
    return true;
}

inline void
SnipPipe::done()
{
    // Discard any part of `pipe` which is after `best_end`.
    if (begin >= best_end) {
	pipe.clear();
    } else {
	// We should never empty the pipe (as that case should be handled
	// above).
	// NOTE(review): rare(!pipe.empty()) hints that the non-empty case is
	// unlikely, which looks inverted given the comment above - confirm
	// against upstream; as a branch-prediction hint it can't change
	// behaviour either way.
	while (rare(!pipe.empty()) &&
	       pipe.back().term_end > best_end) {
	    pipe.pop_back();
	}
    }
}

/** Append the bytes [p, end) to output, escaping '&', '<' and '>' for XML. */
inline void
append_escaping_xml(const char* p, const char* end, string& output)
{
    for (; p != end; ++p) {
	const char ch = *p;
	switch (ch) {
	    case '&':
		output += "&amp;";
		break;
	    case '<':
		output += "&lt;";
		break;
	    case '>':
		output += "&gt;";
		break;
	    default:
		output += ch;
	}
    }
}

/** Emit the next piece of the chosen snippet into output.
 *
 *  Called repeatedly; returns false once the snippet is complete.
 */
inline bool
SnipPipe::drain(const string & input,
		const string & hi_start,
		const string & hi_end,
		const string & omit,
		string & output)
{
    if (best_pipe.empty() && !pipe.empty()) {
	swap(best_pipe, pipe);
    }

    if (best_pipe.empty()) {
	// All the words are out - deal with the tail of the text.
	size_t tail_len = input.size() - best_end;
	if (tail_len == 0) return false;

	// See if this is the end of a sentence.
	// FIXME: This is quite simplistic - look at the Unicode rules:
	// http://unicode.org/reports/tr29/#Sentence_Boundaries
	bool punc = false;
	Utf8Iterator i(input.data() + best_end, tail_len);
	while (i != Utf8Iterator()) {
	    unsigned ch = *i;
	    if (punc && Unicode::is_whitespace(ch)) break;

	    // Allow "...", "!!", "!?!", etc...
	    punc = (ch == '.' || ch == '?' || ch == '!');

	    if (Unicode::is_wordchar(ch)) break;
	    ++i;
	}

	if (punc) {
	    // Include end of sentence punctuation.
	    append_escaping_xml(input.data() + best_end, i.raw(), output);
	} else {
	    // Append "..." or equivalent if this doesn't seem to be the end
	    // of a sentence.
	    output += omit;
	}

	return false;
    }

    const Sniplet & word = best_pipe.front();

    if (output.empty()) {
	// Start of the snippet.
	enum { NO, PUNC, YES } sentence_boundary = (best_begin == 0) ? YES : NO;

	// Scan the inter-word characters before the first word, tracking
	// whether they look like a sentence boundary.
	Utf8Iterator i(input.data() + best_begin, word.term_end - best_begin);
	while (i != Utf8Iterator()) {
	    unsigned ch = *i;
	    switch (sentence_boundary) {
		case NO:
		    if (ch == '.' || ch == '?' || ch == '!') {
			sentence_boundary = PUNC;
		    }
		    break;
		case PUNC:
		    if (Unicode::is_whitespace(ch)) {
			sentence_boundary = YES;
		    } else if (ch == '.' || ch == '?' || ch == '!') {
			// Allow "...", "!!", "!?!", etc...
		    } else {
			sentence_boundary = NO;
		    }
		    break;
		case YES:
		    break;
	    }
	    if (Unicode::is_wordchar(ch)) {
		// Start the snippet at the start of the first word.
		best_begin = i.raw() - input.data();
		break;
	    }
	    ++i;
	}

	// Add "..." or equivalent if this doesn't seem to be the start of a
	// sentence.
	if (sentence_boundary != YES) {
	    output += omit;
	}
    }

    if (word.highlight) {
	// Don't include inter-word characters in the highlight.
	Utf8Iterator i(input.data() + best_begin, input.size() - best_begin);
	while (i != Utf8Iterator()) {
	    unsigned ch = *i;
	    if (Unicode::is_wordchar(ch)) {
		append_escaping_xml(input.data() + best_begin, i.raw(), output);
		best_begin = i.raw() - input.data();
		break;
	    }
	    ++i;
	}
    }

    if (!phrase_len) {
	// Start a new highlight run; phrase_len counts the words left in it.
	phrase_len = word.highlight;
	if (phrase_len) output += hi_start;
    }

    const char* p = input.data();
    append_escaping_xml(p + best_begin, p + word.term_end, output);
    best_begin = word.term_end;

    if (phrase_len && --phrase_len == 0) output += hi_end;

    best_pipe.pop_front();
    return true;
}

/** Recursively scan a query, collecting the pieces we can highlight.
 *
 *  @param query	  Query (sub)tree to scan.
 *  @param exact_phrases  Filled with the term lists of "tight" phrases
 *			  (window == number of subqueries, all terms).
 *  @param loose_terms	  Filled with leaf terms, mapped to 0 (relevance is
 *			  filled in lazily later by check_term()).
 *  @param wildcards	  Filled with wildcard patterns.
 *  @param longest_phrase Updated to the length of the longest exact phrase.
 */
static void
check_query(const Xapian::Query & query,
	    list<vector<string>> & exact_phrases,
	    unordered_map<string, double> & loose_terms,
	    list<string> & wildcards,
	    size_t & longest_phrase)
{
    // FIXME: OP_NEAR, non-tight OP_PHRASE, OP_PHRASE with non-term subqueries
    size_t n_subqs = query.get_num_subqueries();
    Xapian::Query::op op = query.get_type();
    if (op == query.LEAF_TERM) {
	const Xapian::Internal::QueryTerm & qt =
	    *static_cast<const Xapian::Internal::QueryTerm *>(query.internal.get());
	loose_terms.insert(make_pair(qt.get_term(), 0));
    } else if (op == query.OP_WILDCARD) {
	const Xapian::Internal::QueryWildcard & qw =
	    *static_cast<const Xapian::Internal::QueryWildcard *>(query.internal.get());
	wildcards.push_back(qw.get_pattern());
    } else if (op == query.OP_PHRASE) {
	const Xapian::Internal::QueryPhrase & phrase =
	    *static_cast<const Xapian::Internal::QueryPhrase *>(query.internal.get());
	if (phrase.get_window() == n_subqs) {
	    // Tight phrase.
	    for (size_t i = 0; i != n_subqs; ++i) {
		if (query.get_subquery(i).get_type() != query.LEAF_TERM)
		    goto non_term_subquery;
	    }

	    // Tight phrase of terms.
	    exact_phrases.push_back(vector<string>());
	    vector<string> & terms = exact_phrases.back();
	    terms.reserve(n_subqs);
	    for (size_t i = 0; i != n_subqs; ++i) {
		Xapian::Query q = query.get_subquery(i);
		const Xapian::Internal::QueryTerm & qt =
		    *static_cast<const Xapian::Internal::QueryTerm *>(q.internal.get());
		terms.push_back(qt.get_term());
	    }
	    if (n_subqs > longest_phrase) longest_phrase = n_subqs;
	    return;
	}
    }
non_term_subquery:
    // Recurse into any subqueries we didn't handle above.
    for (size_t i = 0; i != n_subqs; ++i)
	check_query(query.get_subquery(i), exact_phrases, loose_terms,
		    wildcards, longest_phrase);
}

/** Look up term in loose_terms, lazily computing its relevance.
 *
 *  Returns a pointer to the cached relevance, or NULL if term isn't a
 *  loose query term (or has no termweight available, in which case it's
 *  dropped from loose_terms).
 */
static double*
check_term(unordered_map<string, double> & loose_terms,
	   const Xapian::Weight::Internal * stats,
	   const string & term,
	   double max_tw)
{
    auto it = loose_terms.find(term);
    if (it == loose_terms.end()) return NULL;

    double & relevance = it->second;
    if (relevance == 0.0) {
	// Zero means "not computed yet" - fetch the termweight now.
	double termweight;
	if (!stats->get_termweight(term, termweight)) {
	    // FIXME: Assert?
	    loose_terms.erase(it);
	    return NULL;
	}
	// Bias by max_tw so even zero-weight query terms get highlighted.
	relevance = termweight + max_tw;
    }
    return &relevance;
}

/** Generate a snippet of text, highlighting query matches.
 *
 *  @param text	    The text to generate a snippet from.
 *  @param length   Requested snippet length in bytes.
 *  @param stemmer  Stemmer (should match the one used at index time).
 *  @param flags    SNIPPET_* flags controlling behaviour.
 *  @param hi_start String inserted before each highlighted match.
 *  @param hi_end   String inserted after each highlighted match.
 *  @param omit	    String used to mark omitted text.
 */
string
MSet::Internal::snippet(const string & text,
			size_t length,
			const Xapian::Stem & stemmer,
			unsigned flags,
			const string & hi_start,
			const string & hi_end,
			const string & omit) const
{
    if (hi_start.empty() && hi_end.empty() && text.size() <= length) {
	// Too easy!
	return text;
    }

    bool cjk_ngram = CJK::is_cjk_enabled();

    size_t term_start = 0;
    double min_tw = 0, max_tw = 0;
    if (stats) stats->get_max_termweight(min_tw, max_tw);
    if (max_tw == 0.0) {
	max_tw = 1.0;
    } else {
	// Scale up by (1 + 1/64) so that highlighting works better for terms
	// with termweight 0 (which happens for terms not in the database, and
	// also with some weighting schemes for terms which occur in almost all
	// documents).
	max_tw *= 1.015625;
    }

    SnipPipe snip(length);

    // Pull out the pieces of the query we know how to highlight.
    list<vector<string>> exact_phrases;
    unordered_map<string, double> loose_terms;
    list<string> wildcards;
    size_t longest_phrase = 0;
    check_query(enquire->get_query(), exact_phrases, loose_terms,
		wildcards, longest_phrase);

    vector<double> exact_phrases_relevance;
    exact_phrases_relevance.reserve(exact_phrases.size());
    for (auto&& terms : exact_phrases) {
	// FIXME: What relevance to use?
	exact_phrases_relevance.push_back(max_tw * terms.size());
    }

    vector<double> wildcards_relevance;
    // Reserve one slot per wildcard (previously this reserved
    // exact_phrases.size() - a copy-and-paste slip; harmless but wrong).
    wildcards_relevance.reserve(wildcards.size());
    for (auto&& pattern : wildcards) {
	// FIXME: What relevance to use?
	(void)pattern;
	wildcards_relevance.push_back(max_tw + min_tw);
    }

    // Background relevance is the same for a given MSet, so cache it
    // between calls to MSet::snippet() on the same object.
    unordered_map<string, double>& background = snippet_bg_relevance;

    // Ring buffer of the previous (longest_phrase - 1) terms, used to spot
    // exact phrase matches as each term arrives.
    vector<string> phrase;
    if (longest_phrase) phrase.resize(longest_phrase - 1);
    size_t phrase_next = 0;
    bool matchfound = false;
    parse_terms(Utf8Iterator(text), cjk_ngram, true,
	[&](const string & term, bool positional, const Utf8Iterator & it) {
	    // FIXME: Don't hardcode this here.
	    const size_t max_word_length = 64;

	    if (!positional) return true;
	    if (term.size() > max_word_length) return true;

	    // We get segments with any "inter-word" characters in front of
	    // each word, e.g.:
	    // [The][ cat][ sat][ on][ the][ mat]
	    size_t term_end = text.size() - it.left();

	    double* relevance = 0;
	    size_t highlight = 0;
	    if (stats) {
		// Check for an exact phrase ending with this term.
		size_t i = 0;
		for (auto&& terms : exact_phrases) {
		    if (term == terms.back()) {
			size_t n = terms.size() - 1;
			bool match = true;
			while (n--) {
			    if (terms[n] != phrase[(n + phrase_next) % (longest_phrase - 1)]) {
				match = false;
				break;
			    }
			}
			if (match) {
			    // FIXME: Sort phrases, highest score first!
			    relevance = &exact_phrases_relevance[i];
			    highlight = terms.size();
			    goto relevance_done;
			}
		    }
		    ++i;
		}

		relevance = check_term(loose_terms, stats, term, max_tw);
		if (relevance) {
		    // Matched unstemmed term.
		    highlight = 1;
		    goto relevance_done;
		}

		string stem = "Z";
		stem += stemmer(term);
		relevance = check_term(loose_terms, stats, stem, max_tw);
		if (relevance) {
		    // Matched stemmed term.
		    highlight = 1;
		    goto relevance_done;
		}

		// Check wildcards.
		// FIXME: Sort wildcards, shortest pattern first or something?
		i = 0;
		for (auto&& pattern : wildcards) {
		    if (startswith(term, pattern)) {
			relevance = &wildcards_relevance[i];
			highlight = 1;
			goto relevance_done;
		    }
		    ++i;
		}

		if (flags & Xapian::MSet::SNIPPET_BACKGROUND_MODEL) {
		    // Background document model - rarer terms in the
		    // collection get more relevance.
		    auto bgit = background.find(term);
		    if (bgit == background.end()) bgit = background.find(stem);
		    if (bgit == background.end()) {
			Xapian::doccount tf = enquire->db.get_termfreq(term);
			if (!tf) {
			    tf = enquire->db.get_termfreq(stem);
			} else {
			    stem = term;
			}
			double r = 0.0;
			if (tf) {
			    // Add one to avoid log(0) when a term indexes all
			    // documents.
			    Xapian::doccount num_docs = stats->collection_size + 1;
			    r = max_tw * log((num_docs - tf) / double(tf));
			    r /= (length + 1) * log(double(num_docs));
#if 0
			    if (r <= 0) {
				Utf8Iterator i(text.data() + term_start, text.data() + term_end);
				while (i != Utf8Iterator()) {
				    if (Unicode::get_category(*i++) == Unicode::UPPERCASE_LETTER) {
					r = max_tw * 0.05;
				    }
				}
			    }
#endif
			}
			bgit = background.emplace(make_pair(stem, r)).first;
		    }
		    relevance = &bgit->second;
		}
	    } else {
#if 0
		// In the absence of weight information, assume longer terms
		// are more relevant, and that unstemmed matches are a bit more
		// relevant than stemmed matches.
		if (queryterms.find(term) != queryterms.end()) {
		    relevance = term.size() * 3;
		} else {
		    string stem = "Z";
		    stem += stemmer(term);
		    if (queryterms.find(stem) != queryterms.end()) {
			relevance = term.size() * 2;
		    }
		}
#endif
	    }

	    // FIXME: Allow Enquire without a DB set or an empty MSet() to be
	    // used if you don't want the collection model?

#if 0
	    // FIXME: Punctuation should somehow be included in the model, but this
	    // approach is problematic - we don't want the first word of a sentence
	    // to be favoured when it's at the end of the window.

	    // Give first word in each sentence a relevance boost.
	    if (term_start == 0) {
		relevance += 10;
	    } else {
		for (size_t i = term_start; i + term.size() < term_end; ++i) {
		    if (text[i] == '.' && Unicode::is_whitespace(text[i + 1])) {
			relevance += 10;
			break;
		    }
		}
	    }
#endif

relevance_done:
	    // Remember this term for later phrase matching.  Guard with > 1
	    // (rather than just non-zero) so the modulus below can never be
	    // zero; a one-term "phrase" needs no history anyway.
	    if (longest_phrase > 1) {
		phrase[phrase_next] = term;
		phrase_next = (phrase_next + 1) % (longest_phrase - 1);
	    }

	    if (highlight) matchfound = true;

	    if (!snip.pump(relevance, term_end, highlight, flags)) return false;

	    term_start = term_end;
	    return true;
	});

    snip.done();

    // Put together the snippet.
    string result;
    if (matchfound || (flags & SNIPPET_EMPTY_WITHOUT_MATCH) == 0) {
	while (snip.drain(text, hi_start, hi_end, omit, result)) { }
    }

    return result;
}

}