File: queryparser.lemony

package info (click to toggle)
xapian-core 1.4.3-2%2Bdeb9u3
links: PTS, VCS
area: main
in suites: stretch
size: 21,412 kB
sloc: cpp: 113,868; ansic: 8,723; sh: 4,433; perl: 836; makefile: 566; tcl: 317; python: 40
file content (2251 lines) | stat: -rw-r--r-- 63,004 bytes
parent folder | download | duplicates (2)
%include {
/* queryparser.lemony: build a Xapian::Query object from a user query string.
 *
 * Copyright (C) 2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2015,2016 Olly Betts
 * Copyright (C) 2007,2008,2009 Lemur Consulting Ltd
 * Copyright (C) 2010 Adam Sjøgren
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */

#include <config.h>

#include "queryparser_internal.h"

#include "api/queryinternal.h"
#include "omassert.h"
#include "str.h"
#include "stringutils.h"
#include "xapian/error.h"
#include "xapian/unicode.h"

// Include the list of token values lemon generates.
#include "queryparser_token.h"

#include "cjk-tokenizer.h"

#include <algorithm>
#include <cstring>
#include <limits>
#include <list>
#include <string>
#include <vector>

using namespace std;

using namespace Xapian;

/// Test if @a ch is an ASCII upper case letter (never true for ch >= 128).
inline bool
U_isupper(unsigned ch) {
    if (ch >= 128) return false;
    return C_isupper(static_cast<unsigned char>(ch));
}

/// Test if @a ch is an ASCII digit (never true for ch >= 128).
inline bool
U_isdigit(unsigned ch) {
    if (ch >= 128) return false;
    return C_isdigit(static_cast<unsigned char>(ch));
}

/// Test if @a ch is an ASCII letter (never true for ch >= 128).
inline bool
U_isalpha(unsigned ch) {
    if (ch >= 128) return false;
    return C_isalpha(static_cast<unsigned char>(ch));
}

using Xapian::Unicode::is_whitespace;

/// Negation of is_whitespace(), usable as a predicate with find_if().
inline bool
is_not_whitespace(unsigned ch) {
    if (is_whitespace(ch)) return false;
    return true;
}

using Xapian::Unicode::is_wordchar;

/// Negation of is_wordchar(), usable as a predicate with find_if().
inline bool
is_not_wordchar(unsigned ch) {
    if (is_wordchar(ch)) return false;
    return true;
}

/// Test if @a ch is in the Unicode DECIMAL_DIGIT_NUMBER category.
inline bool
is_digit(unsigned ch) {
    const auto cat = Unicode::get_category(ch);
    return cat == Unicode::DECIMAL_DIGIT_NUMBER;
}

// FIXME: we used to keep trailing "-" (e.g. Cl-) but it's of dubious utility
// and there's the risk of hyphens getting stuck onto the end of terms...

/// Test if @a ch is a character which may be kept on the end of a term.
inline bool
is_suffix(unsigned ch) {
    switch (ch) {
	case '+':
	case '#':
	    return true;
	default:
	    return false;
    }
}

/** Test if @a ch is a double quote character.
 *
 *  We simply treat all double quotes as equivalent, which is a bit crude,
 *  but it isn't clear that it would actually be better to require them to
 *  match up exactly.
 */
inline bool
is_double_quote(unsigned ch) {
    switch (ch) {
	case '"':
	case 0x201c: // Unicode opening double quote.
	case 0x201d: // Unicode closing double quote.
	    return true;
	default:
	    return false;
    }
}

inline bool
prefix_needs_colon(const string & prefix, unsigned ch)
{
    if (!U_isupper(ch)) return false;
    string::size_type len = prefix.length();
    return (len > 1 && prefix[len - 1] != ':');
}

using Unicode::is_currency;

/// Test if @a op is one of the positional operators (OP_PHRASE or OP_NEAR).
inline bool
is_positional(Xapian::Query::op op)
{
    switch (op) {
	case Xapian::Query::OP_PHRASE:
	case Xapian::Query::OP_NEAR:
	    return true;
	default:
	    return false;
    }
}

class Terms;

/** Class used to pass information about a token from lexer to parser.
 *
 *  Generally an instance of this class carries term information, but it can be
 *  used for a range query, and with some operators (e.g. the distance in
 *  NEAR/3 or ADJ/3, etc).
 */
class Term {
    State * state;

  public:
    string name;
    const FieldInfo * field_info;
    string unstemmed;
    QueryParser::stem_strategy stem;
    termpos pos;
    Query query;

    Term(const string &name_, termpos pos_)
	: name(name_), stem(QueryParser::STEM_NONE), pos(pos_) { }
    explicit Term(const string &name_)
	: name(name_), stem(QueryParser::STEM_NONE), pos(0) { }
    Term(const string &name_, const FieldInfo * field_info_)
	: name(name_), field_info(field_info_),
	  stem(QueryParser::STEM_NONE), pos(0) { }
    explicit Term(termpos pos_) : stem(QueryParser::STEM_NONE), pos(pos_) { }
    Term(State * state_, const string &name_, const FieldInfo * field_info_,
	 const string &unstemmed_,
	 QueryParser::stem_strategy stem_ = QueryParser::STEM_NONE,
	 termpos pos_ = 0)
	: state(state_), name(name_), field_info(field_info_),
	  unstemmed(unstemmed_), stem(stem_), pos(pos_) { }
    // For RANGE tokens.
    Term(const Xapian::Query & q, const string & grouping)
	: name(grouping), query(q) { }

    string make_term(const string & prefix) const;

    void need_positions() {
	if (stem == QueryParser::STEM_SOME) stem = QueryParser::STEM_NONE;
    }

    termpos get_termpos() const { return pos; }

    string get_grouping() const {
	return field_info->grouping;
    }

    Query * as_wildcarded_query(State * state) const;

    /** Build a query for a term at the very end of the query string when
     *  FLAG_PARTIAL is in use.
     *
     *  This query should match documents containing any terms which start with
     *  the characters specified, but should give a higher score to exact
     *  matches (since the user might have finished typing - we simply don't
     *  know).
     */
    Query * as_partial_query(State * state_) const;

    /** Build a query for a string of CJK characters. */
    Query * as_cjk_query() const;

    /** Handle a CJK character string in a positional context. */
    void as_positional_cjk_term(Terms * terms) const;

    /// Range query.
    Query as_range_query() const;

    Query get_query() const;

    Query get_query_with_synonyms() const;

    Query get_query_with_auto_synonyms() const;
};

/** Parser State shared between the lexer and the parser.
 *
 *  Mostly a thin facade over QueryParser::Internal, plus the error and flags
 *  for the parse in progress.
 */
class State {
    /// The QueryParser internals which the accessors below delegate to.
    QueryParser::Internal * qpi;

  public:
    /// Query object (not assigned anywhere in this class).
    Query query;

    /// Error message, or NULL if no error has been flagged.
    const char * error;

    /// The flags value passed to the constructor.
    unsigned flags;

    State(QueryParser::Internal * qpi_, unsigned flags_)
	: qpi(qpi_), error(NULL), flags(flags_) { }

    /// Apply the configured stemmer to @a term.
    string stem_term(const string &term) {
	return qpi->stemmer(term);
    }

    /// Append the name of @a term to the stoplist.
    void add_to_stoplist(const Term * term) {
	qpi->stoplist.push_back(term->name);
    }

    /// Record that @a term was generated from @a unstemmed.
    void add_to_unstem(const string & term, const string & unstemmed) {
	qpi->unstem.insert(make_pair(term, unstemmed));
    }

    /** Offer the range @a a .. @a b to each registered range processor.
     *
     *  The first processor which returns a query whose type isn't OP_INVALID
     *  wins.  The grouping of the resulting Term is:
     *
     *  - the value slot number (as a string) for a value range query from a
     *    processor flagged as using the default grouping;
     *  - the processor's own grouping for other value range queries and for
     *    a LEAF_TERM query;
     *  - empty for any other type of query.
     *
     *  @return A Term allocated with new, or NULL if no processor accepted
     *	    the range.
     */
    Term * range(const string &a, const string &b) {
	// Iterate by reference - there's no need to copy each entry.
	for (const auto & i : qpi->rangeprocs) {
	    Xapian::Query range_query = (i.proc)->check_range(a, b);
	    Xapian::Query::op op = range_query.get_type();
	    switch (op) {
		case Xapian::Query::OP_INVALID:
		    // Not handled by this processor - try the next one.
		    break;
		case Xapian::Query::OP_VALUE_RANGE:
		case Xapian::Query::OP_VALUE_GE:
		case Xapian::Query::OP_VALUE_LE:
		    if (i.default_grouping) {
			Xapian::Internal::QueryValueBase * base =
			    static_cast<Xapian::Internal::QueryValueBase*>(
				range_query.internal.get());
			Xapian::valueno slot = base->get_slot();
			return new Term(range_query, str(slot));
		    }
		    // FALLTHRU
		case Xapian::Query::LEAF_TERM:
		    return new Term(range_query, i.grouping);
		default:
		    return new Term(range_query, string());
	    }
	}
	return NULL;
    }

    Query::op default_op() const { return qpi->default_op; }

    /// Test @a term against the stopper (false if no stopper is set).
    bool is_stopword(const Term *term) const {
	return qpi->stopper.get() && (*qpi->stopper)(term->name);
    }

    Database get_database() const {
	return qpi->db;
    }

    const Stopper * get_stopper() const {
	return qpi->stopper.get();
    }

    size_t stoplist_size() const {
	return qpi->stoplist.size();
    }

    void stoplist_resize(size_t s) {
	qpi->stoplist.resize(s);
    }

    Xapian::termcount get_max_wildcard_expansion() const {
	return qpi->max_wildcard_expansion;
    }

    int get_max_wildcard_type() const {
	return qpi->max_wildcard_type;
    }

    Xapian::termcount get_max_partial_expansion() const {
	return qpi->max_partial_expansion;
    }

    int get_max_partial_type() const {
	return qpi->max_partial_type;
    }
};

/** Build the term name for this token under @a prefix.
 *
 *  The result is an optional 'Z' marker (for STEM_SOME and STEM_ALL_Z), then
 *  the prefix (followed by ':' if prefix_needs_colon() says so), then the
 *  name - stemmed unless the strategy is STEM_NONE.  If an unstemmed form is
 *  set, the mapping from the result to it is recorded with the parser state.
 */
string
Term::make_term(const string & prefix) const
{
    const bool z_marked = (stem == QueryParser::STEM_SOME ||
			   stem == QueryParser::STEM_ALL_Z);
    string result;
    if (z_marked) result.push_back('Z');

    if (!prefix.empty()) {
	result.append(prefix);
	if (prefix_needs_colon(prefix, name[0])) result.push_back(':');
    }

    if (stem == QueryParser::STEM_NONE) {
	result.append(name);
    } else {
	result.append(state->stem_term(name));
    }

    if (!unstemmed.empty()) {
	state->add_to_unstem(result, unstemmed);
    }
    return result;
}

/** Iterator shim to allow building a synonym query from a TermIterator pair.
 *
 *  Dereferencing gives a Query for the current term.  If a @a first query is
 *  supplied, that is yielded once before any terms from the TermIterator.
 */
class SynonymIterator {
    Xapian::TermIterator i;

    Xapian::termpos pos;

    /// Query to yield ahead of the terms (NULL once yielded, or if none).
    const Xapian::Query * first;

  public:
    typedef std::input_iterator_tag iterator_category;
    typedef Xapian::Query value_type;
    typedef Xapian::termcount_diff difference_type;
    typedef Xapian::Query * pointer;
    typedef Xapian::Query & reference;

    SynonymIterator(const Xapian::TermIterator & i_,
		    Xapian::termpos pos_ = 0,
		    const Xapian::Query * first_ = NULL)
	: i(i_), pos(pos_), first(first_) { }

    SynonymIterator & operator++() {
	if (!first) {
	    ++i;
	} else {
	    // Just step past the leading query.
	    first = NULL;
	}
	return *this;
    }

    const Xapian::Query operator*() const {
	return first ? *first : Xapian::Query(*i, 1, pos);
    }

    bool operator==(const SynonymIterator & o) const {
	if (first != o.first) return false;
	return i == o.i;
    }

    bool operator!=(const SynonymIterator & o) const {
	return !operator==(o);
    }
};

/** Build the query for this term, expanded with single-word synonyms.
 *
 *  For each prefix, the synonyms of the unstemmed term are looked up; if there
 *  are none and stemming is enabled, the synonyms of the stemmed ('Z') form
 *  are used instead.  The result of each lookup is combined with the query so
 *  far using OP_SYNONYM.
 *
 *  If the field has no prefixes, the first field processor is applied to the
 *  term name and its result is returned as is.
 */
Query
Term::get_query_with_synonyms() const
{
    // Handle single-word synonyms with each prefix.
    const list<string> & prefixes = field_info->prefixes;
    if (prefixes.empty()) {
	// FIXME: handle multiple here
	Assert(!field_info->procs.empty());
	return (**field_info->procs.begin())(name);
    }

    Query q = get_query();

    // The database doesn't change while we loop, so only fetch it once.
    Xapian::Database db = state->get_database();

    for (const string & prefix : prefixes) {
	// The prefix as it appears in terms - i.e. with ':' appended if
	// needed.  Shared by the unstemmed and stemmed forms.
	string term_prefix;
	if (!prefix.empty()) {
	    term_prefix = prefix;
	    if (prefix_needs_colon(prefix, name[0])) term_prefix += ':';
	}

	// First try the unstemmed term:
	string term = term_prefix;
	term += name;

	Xapian::TermIterator syn = db.synonyms_begin(term);
	Xapian::TermIterator end = db.synonyms_end(term);
	if (syn == end && stem != QueryParser::STEM_NONE) {
	    // If that has no synonyms, try the stemmed form:
	    term = 'Z';
	    term += term_prefix;
	    term += state->stem_term(name);
	    syn = db.synonyms_begin(term);
	    end = db.synonyms_end(term);
	}
	// The existing query is yielded first, followed by the synonyms.
	q = Query(q.OP_SYNONYM,
		  SynonymIterator(syn, pos, &q),
		  SynonymIterator(end));
    }
    return q;
}

/** Build the query for this term, expanding synonyms only if either
 *  FLAG_AUTO_SYNONYMS or FLAG_AUTO_MULTIWORD_SYNONYMS is set.
 */
Query
Term::get_query_with_auto_synonyms() const
{
    const unsigned auto_synonym_flags =
	QueryParser::FLAG_AUTO_SYNONYMS |
	QueryParser::FLAG_AUTO_MULTIWORD_SYNONYMS;
    const bool expand = (state->flags & auto_synonym_flags) != 0;
    return expand ? get_query_with_synonyms() : get_query();
}

/** Combine @a term into @a q using @a op, taking ownership of @a term.
 *
 *  If @a q is NULL, @a term simply becomes @a q.  Otherwise @a q is replaced
 *  by the combination and @a term is deleted.
 */
static void
add_to_query(Query *& q, Query::op op, Query * term)
{
    Assert(term);
    if (!q) {
	q = term;
	return;
    }
    *q = Query(op, *q, *term);
    delete term;
}

/** Combine @a term into @a q using @a op.
 *
 *  If @a q is NULL, it is set to a newly allocated copy of @a term.
 */
static void
add_to_query(Query *& q, Query::op op, const Query & term)
{
    if (!q) {
	q = new Query(term);
	return;
    }
    *q = Query(op, *q, term);
}

/** Build the query for this term.
 *
 *  With string prefixes, this is the term under each prefix, combined with
 *  OP_OR.  With no prefixes, the first field processor is applied to the
 *  term name.
 */
Query
Term::get_query() const
{
    const list<string> & prefixes = field_info->prefixes;
    if (prefixes.empty()) {
	// FIXME: handle multiple here
	Assert(!field_info->procs.empty());
	return (**field_info->procs.begin())(name);
    }

    list<string>::const_iterator p = prefixes.begin();
    Query result(make_term(*p), 1, pos);
    for (++p; p != prefixes.end(); ++p) {
	Query alternative(make_term(*p), 1, pos);
	result = Query(Query::OP_OR, result, alternative);
    }
    return result;
}

/** Build a wildcard query from this term.
 *
 *  One OP_WILDCARD subquery is built per prefix, using the wildcard expansion
 *  limits from @a state_, and these are combined with OP_SYNONYM.
 *
 *  This object deletes itself, so must not be used after this call.
 *
 *  @return A Query allocated with new.
 */
Query *
Term::as_wildcarded_query(State * state_) const
{
    const Xapian::termcount expansion_limit =
	state_->get_max_wildcard_expansion();
    const int limit_type = state_->get_max_wildcard_type();

    const list<string> & prefixes = field_info->prefixes;
    vector<Query> wildcards;
    wildcards.reserve(prefixes.size());
    for (const string & prefix : prefixes) {
	// Combine with OP_OR, and apply OP_SYNONYM afterwards.
	wildcards.push_back(Query(Query::OP_WILDCARD, prefix + name,
				  expansion_limit, limit_type,
				  Query::OP_OR));
    }
    Query * result = new Query(Query::OP_SYNONYM,
			       wildcards.begin(), wildcards.end());
    delete this;
    return result;
}

/** Build the query for a partially entered final term.
 *
 *  For each prefix we build both an OP_WILDCARD subquery (using the partial
 *  expansion limits from @a state_) and the term as it would normally be
 *  handled.  The result is the OR of a synonym of all the wildcards and a
 *  synonym of all the full terms.
 *
 *  This object deletes itself, so must not be used after this call.
 *
 *  @return A Query allocated with new.
 */
Query *
Term::as_partial_query(State * state_) const
{
    const Xapian::termcount expansion_limit =
	state_->get_max_partial_expansion();
    const int limit_type = state_->get_max_partial_type();

    vector<Query> wildcards; // One wildcard query per prefix.
    vector<Query> exact; // One full term query per prefix.
    for (const string & prefix : field_info->prefixes) {
	// Combine with OP_OR, and apply OP_SYNONYM afterwards.
	wildcards.push_back(Query(Query::OP_WILDCARD, prefix + name,
				  expansion_limit, limit_type,
				  Query::OP_OR));
	// Add the term, as it would normally be handled, as an alternative.
	exact.push_back(Query(make_term(prefix), 1, pos));
    }

    Query any_partial(Query::OP_SYNONYM, wildcards.begin(), wildcards.end());
    Query any_full(Query::OP_SYNONYM, exact.begin(), exact.end());
    Query * result = new Query(Query::OP_OR, any_partial, any_full);
    delete this;
    return result;
}

/** Build a query for a string of CJK characters.
 *
 *  For each prefix, the tokens CJKTokenIterator produces from the name are
 *  combined with OP_AND; the per-prefix queries are then combined with OP_OR.
 *
 *  This object deletes itself, so must not be used after this call.
 *
 *  @return A Query allocated with new.
 */
Query *
Term::as_cjk_query() const
{
    vector<Query> per_prefix;
    for (const string & prefix : field_info->prefixes) {
	vector<Query> tokens;
	for (CJKTokenIterator tk(name); tk != CJKTokenIterator(); ++tk) {
	    tokens.push_back(Query(prefix + *tk, 1, pos));
	}
	per_prefix.push_back(Query(Query::OP_AND,
				   tokens.begin(), tokens.end()));
    }
    Query * result = new Query(Query::OP_OR,
			       per_prefix.begin(), per_prefix.end());
    delete this;
    return result;
}

/** Return the range query this token carries.
 *
 *  This object deletes itself, so must not be used after this call.
 */
Query
Term::as_range_query() const
{
    // Take a copy before we destroy ourselves.
    Query range_query(query);
    delete this;
    return range_query;
}

/// Test if @a ch is one of the ASCII characters which generate a phrase search.
inline bool
is_phrase_generator(unsigned ch)
{
    // NUL would match strchr()'s terminator and anything >= 128 isn't ASCII.
    if (ch == 0 || ch >= 128) return false;
    // Ordered mostly by frequency of calls to this function done when
    // running the testcases in api_queryparser.cc.
    const char * const generators = ".-/:\\@";
    return strchr(generators, ch) != NULL;
}

/// Test if @a ch is one of the ASCII characters in the stem preventer set.
inline bool
is_stem_preventer(unsigned ch)
{
    // NUL would match strchr()'s terminator and anything >= 128 isn't ASCII.
    if (ch == 0 || ch >= 128) return false;
    const char * const preventers = "(/\\@<>=*[{\"";
    return strchr(preventers, ch) != NULL;
}

inline bool
should_stem(const string & term)
{
    const unsigned int SHOULD_STEM_MASK =
	(1 << Unicode::LOWERCASE_LETTER) |
	(1 << Unicode::TITLECASE_LETTER) |
	(1 << Unicode::MODIFIER_LETTER) |
	(1 << Unicode::OTHER_LETTER);
    Utf8Iterator u(term);
    return ((SHOULD_STEM_MASK >> Unicode::get_category(*u)) & 1);
}

/** Value representing "ignore this" when returned by check_infix() or
 *  check_infix_digit().
 */
const unsigned UNICODE_IGNORE = numeric_limits<unsigned>::max();

/** Classify a non-word character found between two word characters.
 *
 *  @return The character to add to the term (apostrophe-like characters are
 *	    normalised to ASCII apostrophe), UNICODE_IGNORE if the character
 *	    should be skipped, or 0 if it can't be part of a term.
 */
inline unsigned check_infix(unsigned ch) {
    switch (ch) {
	// Unicode includes all these except '&' in its word boundary rules,
	// as well as 0x2019 (which we handle below) and ':' (for Swedish
	// apparently, but we ignore this for now as it's problematic in
	// real world cases).
	case '\'':
	case '&':
	case 0xb7:
	case 0x5f4:
	case 0x2027:
	    return ch;

	case 0x2019: // Unicode apostrophe and single closing quote.
	case 0x201b: // Unicode single opening quote with the tail rising.
	    return '\'';

	case 0x200b:
	case 0x200c:
	case 0x200d:
	case 0x2060:
	case 0xfeff:
	    return UNICODE_IGNORE;

	default:
	    return 0;
    }
}

/** Classify a non-word character found between two digits.
 *
 *  @return The character itself if it may be kept between digits,
 *	    UNICODE_IGNORE if it should be skipped, or 0 if it can't be
 *	    part of a term.
 */
inline unsigned check_infix_digit(unsigned ch) {
    switch (ch) {
	// This list of characters comes from Unicode's word identifying
	// algorithm.
	case ',':
	case '.':
	case ';':
	case 0x037e: // GREEK QUESTION MARK
	case 0x0589: // ARMENIAN FULL STOP
	case 0x060D: // ARABIC DATE SEPARATOR
	case 0x07F8: // NKO COMMA
	case 0x2044: // FRACTION SLASH
	case 0xFE10: // PRESENTATION FORM FOR VERTICAL COMMA
	case 0xFE13: // PRESENTATION FORM FOR VERTICAL COLON
	case 0xFE14: // PRESENTATION FORM FOR VERTICAL SEMICOLON
	    return ch;

	case 0x200b:
	case 0x200c:
	case 0x200d:
	case 0x2060:
	case 0xfeff:
	    return UNICODE_IGNORE;

	default:
	    return 0;
    }
}

// Forward declaration of the parser type used by the functions below.
struct yyParser;

// Prototype the functions lemon generates.
//
// ParseAlloc() and ParseFree() create and release a parser object.  Parse()
// is called by the lexer with a token code, the Term carrying the token's
// value (NULL for tokens without one) and the shared parser State.
static yyParser *ParseAlloc();
static void ParseFree(yyParser *);
static void Parse(yyParser *, int, Term *, State *);
static void yy_parse_failed(yyParser *);

void
QueryParser::Internal::add_prefix(const string &field, const string &prefix)
{
    map<string, FieldInfo>::iterator p = field_map.find(field);
    if (p == field_map.end()) {
	field_map.insert(make_pair(field, FieldInfo(NON_BOOLEAN, prefix)));
    } else {
	// Check that this is the same type of filter as the existing one(s).
	if (p->second.type != NON_BOOLEAN) {
	    throw Xapian::InvalidOperationError("Can't use add_prefix() and add_boolean_prefix() on the same field name, or add_boolean_prefix() with different values of the 'exclusive' parameter");
	}
	if (!p->second.procs.empty())
	    throw Xapian::FeatureUnavailableError("Mixing FieldProcessor objects and string prefixes currently not supported");
	p->second.prefixes.push_back(prefix);
   }
}

void
QueryParser::Internal::add_prefix(const string &field, FieldProcessor *proc)
{
    map<string, FieldInfo>::iterator p = field_map.find(field);
    if (p == field_map.end()) {
	field_map.insert(make_pair(field, FieldInfo(NON_BOOLEAN, proc)));
    } else {
	// Check that this is the same type of filter as the existing one(s).
	if (p->second.type != NON_BOOLEAN) {
	    throw Xapian::InvalidOperationError("Can't use add_prefix() and add_boolean_prefix() on the same field name, or add_boolean_prefix() with different values of the 'exclusive' parameter");
	}
	if (!p->second.prefixes.empty())
	    throw Xapian::FeatureUnavailableError("Mixing FieldProcessor objects and string prefixes currently not supported");
	throw Xapian::FeatureUnavailableError("Multiple FieldProcessor objects for the same prefix currently not supported");
	// p->second.procs.push_back(proc);
   }
}

void
QueryParser::Internal::add_boolean_prefix(const string &field,
					  const string &prefix,
					  const string* grouping)
{
    // Don't allow the empty prefix to be set as boolean as it doesn't
    // really make sense.
    if (field.empty())
	throw Xapian::UnimplementedError("Can't set the empty prefix to be a boolean filter");
    if (!grouping) grouping = &field;
    filter_type type = grouping->empty() ? BOOLEAN : BOOLEAN_EXCLUSIVE;
    map<string, FieldInfo>::iterator p = field_map.find(field);
    if (p == field_map.end()) {
	field_map.insert(make_pair(field, FieldInfo(type, prefix, *grouping)));
    } else {
	// Check that this is the same type of filter as the existing one(s).
	if (p->second.type != type) {
	    throw Xapian::InvalidOperationError("Can't use add_prefix() and add_boolean_prefix() on the same field name, or add_boolean_prefix() with different values of the 'exclusive' parameter"); // FIXME
	}
	if (!p->second.procs.empty())
	    throw Xapian::FeatureUnavailableError("Mixing FieldProcessor objects and string prefixes currently not supported");
	p->second.prefixes.push_back(prefix); // FIXME grouping
   }
}

void
QueryParser::Internal::add_boolean_prefix(const string &field,
					  FieldProcessor *proc,
					  const string* grouping)
{
    // Don't allow the empty prefix to be set as boolean as it doesn't
    // really make sense.
    if (field.empty())
	throw Xapian::UnimplementedError("Can't set the empty prefix to be a boolean filter");
    if (!grouping) grouping = &field;
    filter_type type = grouping->empty() ? BOOLEAN : BOOLEAN_EXCLUSIVE;
    map<string, FieldInfo>::iterator p = field_map.find(field);
    if (p == field_map.end()) {
	field_map.insert(make_pair(field, FieldInfo(type, proc, *grouping)));
    } else {
	// Check that this is the same type of filter as the existing one(s).
	if (p->second.type != type) {
	    throw Xapian::InvalidOperationError("Can't use add_prefix() and add_boolean_prefix() on the same field name, or add_boolean_prefix() with different values of the 'exclusive' parameter"); // FIXME
	}
	if (!p->second.prefixes.empty())
	    throw Xapian::FeatureUnavailableError("Mixing FieldProcessor objects and string prefixes currently not supported");
	throw Xapian::FeatureUnavailableError("Multiple FieldProcessor objects for the same prefix currently not supported");
	// p->second.procs.push_back(proc);
   }
}

/** Lex a single term starting at @a it.
 *
 *  Three forms are tried in turn:
 *
 *  - an acronym: two or more ASCII upper case letters separated by '.',
 *    not immediately followed by a word character;
 *  - a run of CJK characters (only if @a cjk_ngram is set);
 *  - otherwise a run of word characters, which may contain certain single
 *    embedded infix characters (see check_infix() and check_infix_digit())
 *    and may keep up to three trailing '+' or '#' characters.
 *
 *  NOTE(review): @a it is dereferenced without first being compared to
 *  @a end, so the caller presumably guarantees it isn't at the end.
 *
 *  @param it		Current position; advanced past what is consumed.
 *  @param end		End of the query string.
 *  @param cjk_ngram	Whether runs of CJK characters get lexed separately.
 *  @param is_cjk_term	Set to true if a CJK run was lexed (left unchanged
 *			otherwise).
 *  @param was_acronym	Set to true if an acronym was lexed, false otherwise.
 *
 *  @return The text of the term.
 */
string
QueryParser::Internal::parse_term(Utf8Iterator &it, const Utf8Iterator &end,
				  bool cjk_ngram, bool & is_cjk_term,
				  bool &was_acronym)
{
    string term;
    // Look for initials separated by '.' (e.g. P.T.O., U.N.C.L.E).
    // Don't worry if there's a trailing '.' or not.
    if (U_isupper(*it)) {
	string t;
	Utf8Iterator p = it;
	// Each pass appends one letter; the condition then steps over a
	// following '.' and checks for another upper case letter after it.
	do {
	    Unicode::append_utf8(t, *p++);
	} while (p != end && *p == '.' && ++p != end && U_isupper(*p));
	// One letter does not make an acronym!  If we handled a single
	// uppercase letter here, we wouldn't catch M&S below.
	if (t.length() > 1) {
	    // Check there's not a (lower case) letter or digit
	    // immediately after it.
	    // FIXME: should I.B.M..P.T.O be a range search?
	    if (p == end || !is_wordchar(*p)) {
		it = p;
		swap(term, t);
	    }
	}
    }
    was_acronym = !term.empty();

    if (cjk_ngram && term.empty() && CJK::codepoint_is_cjk(*it)) {
	term = CJK::get_cjk(it);
	is_cjk_term = true;
    }

    if (term.empty()) {
	// Ordinary term: the first character is always taken.
	unsigned prevch = *it;
	Unicode::append_utf8(term, prevch);
	while (++it != end) {
	    // A CJK character ends the term when CJK handling is enabled.
	    if (cjk_ngram && CJK::codepoint_is_cjk(*it)) break;
	    unsigned ch = *it;
	    if (!is_wordchar(ch)) {
		// Treat a single embedded '&' or "'" or similar as a word
		// character (e.g. AT&T, Fred's).  Also, normalise
		// apostrophes to ASCII apostrophe.
		Utf8Iterator p = it;
		++p;
		// Only "embedded" if a word character follows.
		if (p == end || !is_wordchar(*p)) break;
		unsigned nextch = *p;
		// Between two digits a different set of infix characters
		// is allowed.
		if (is_digit(prevch) && is_digit(nextch)) {
		    ch = check_infix_digit(ch);
		} else {
		    ch = check_infix(ch);
		}
		if (!ch) break;
		// Skip this character without adding it to the term (and
		// without updating prevch).
		if (ch == UNICODE_IGNORE)
		    continue;
	    }
	    Unicode::append_utf8(term, ch);
	    prevch = ch;
	}
	if (it != end && is_suffix(*it)) {
	    string suff_term = term;
	    Utf8Iterator p = it;
	    // Keep trailing + (e.g. C++, Na+) or # (e.g. C#).
	    do {
		// Give up on the suffixed form if there are more than three
		// suffix characters.
		if (suff_term.size() - term.size() == 3) {
		    suff_term.resize(0);
		    break;
		}
		suff_term += *p;
	    } while (is_suffix(*++p));
	    if (!suff_term.empty() && (p == end || !is_wordchar(*p))) {
		// If the suffixed term doesn't exist, check that the
		// non-suffixed term does.  This also takes care of
		// the case when QueryParser::set_database() hasn't
		// been called.
		bool use_suff_term = false;
		string lc = Unicode::tolower(suff_term);
		if (db.term_exists(lc)) {
		    use_suff_term = true;
		} else {
		    // Neither form exists, so keep the suffix.
		    lc = Unicode::tolower(term);
		    if (!db.term_exists(lc)) use_suff_term = true;
		}
		if (use_suff_term) {
		    term = suff_term;
		    it = p;
		}
	    }
	}
    }
    return term;
}

/** RAII wrapper which frees the lemon-generated parser on destruction.
 *
 *  The handler is the sole owner of the parser it wraps, so copying is
 *  disabled - a copy would result in ParseFree() being called twice on the
 *  same pointer.
 */
class ParserHandler {
    yyParser * parser;

  public:
    explicit ParserHandler(yyParser * parser_) : parser(parser_) { }

    // Not copyable: the destructor frees the parser.
    ParserHandler(const ParserHandler &) = delete;
    ParserHandler & operator=(const ParserHandler &) = delete;

    /// Allow the handler to be passed wherever a yyParser* is expected.
    operator yyParser*() const { return parser; }

    ~ParserHandler() { ParseFree(parser); }
};

Query
QueryParser::Internal::parse_query(const string &qs, unsigned flags,
				   const string &default_prefix)
{
    bool cjk_ngram = (flags & FLAG_CJK_NGRAM) || CJK::is_cjk_enabled();

    // Set ranges if we may have to handle ranges in the query.
    bool ranges = !rangeprocs.empty() && (qs.find("..") != string::npos);

    termpos term_pos = 1;
    Utf8Iterator it(qs), end;

    State state(this, flags);

    // To successfully apply more than one spelling correction to a query
    // string, we must keep track of the offset due to previous corrections.
    int correction_offset = 0;
    corrected_query.resize(0);

    // Stack of prefixes, used for phrases and subexpressions.
    list<const FieldInfo *> prefix_stack;

    // If default_prefix is specified, use it.  Otherwise, use any list
    // that has been set for the empty prefix.
    const FieldInfo def_pfx(NON_BOOLEAN, default_prefix);
    {
	const FieldInfo * default_field_info = &def_pfx;
	if (default_prefix.empty()) {
	    auto f = field_map.find(string());
	    if (f != field_map.end()) default_field_info = &(f->second);
	}

	// We always have the current prefix on the top of the stack.
	prefix_stack.push_back(default_field_info);
    }

    ParserHandler pParser(ParseAlloc());

    unsigned newprev = ' ';
main_lex_loop:
    enum {
	DEFAULT, IN_QUOTES, IN_PREFIXED_QUOTES, IN_PHRASED_TERM, IN_GROUP,
	IN_GROUP2, EXPLICIT_SYNONYM
    } mode = DEFAULT;
    while (it != end && !state.error) {
	bool last_was_operator = false;
	bool last_was_operator_needing_term = false;
	if (mode == EXPLICIT_SYNONYM) mode = DEFAULT;
	if (false) {
just_had_operator:
	    if (it == end) break;
	    mode = DEFAULT;
	    last_was_operator_needing_term = false;
	    last_was_operator = true;
	}
	if (false) {
just_had_operator_needing_term:
	    last_was_operator_needing_term = true;
	    last_was_operator = true;
	}
	if (mode == IN_PHRASED_TERM) mode = DEFAULT;
	if (is_whitespace(*it)) {
	    newprev = ' ';
	    ++it;
	    it = find_if(it, end, is_not_whitespace);
	    if (it == end) break;
	}

	if (ranges &&
	    (mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2)) {
	    // Scan forward to see if this could be the "start of range"
	    // token.  Sadly this has O(n^2) tendencies, though at least
	    // "n" is the number of words in a query which is likely to
	    // remain fairly small.  FIXME: can we tokenise more elegantly?
	    Utf8Iterator it_initial = it;
	    Utf8Iterator p = it;
	    unsigned ch = 0;
	    while (p != end) {
		if (ch == '.' && *p == '.') {
		    string a;
		    while (it != p) {
			Unicode::append_utf8(a, *it++);
		    }
		    // Trim off the trailing ".".
		    a.resize(a.size() - 1);
		    ++p;
		    // Either end of the range can be empty (for an open-ended
		    // range) but both can't be empty.
		    if (!a.empty() || (p != end && *p > ' ' && *p != ')')) {
			string b;
			// Allow any character except whitespace and ')' in the
			// upper bound.
			while (p != end && *p > ' ' && *p != ')') {
			    Unicode::append_utf8(b, *p++);
			}
			Term * range = state.range(a, b);
			if (!range) {
			    state.error = "Unknown range operation";
			    if (a.find(':', 1) == string::npos) {
				goto done;
			    }
			    // Might be a boolean filter with ".." in.  Leave
			    // state.error in case it isn't.
			    it = it_initial;
			    break;
			}
			Parse(pParser, RANGE, range, &state);
		    }
		    it = p;
		    goto main_lex_loop;
		}
		ch = *p;
		// Allow any character except whitespace and '(' in the lower
		// bound.
		if (ch <= ' ' || ch == '(') break;
		++p;
	    }
	}

	if (!is_wordchar(*it)) {
	    unsigned prev = newprev;
	    unsigned ch = *it++;
	    newprev = ch;
	    // Drop out of IN_GROUP mode.
	    if (mode == IN_GROUP || mode == IN_GROUP2)
		mode = DEFAULT;
	    switch (ch) {
	      case '"':
	      case 0x201c: // Left curly double quote.
	      case 0x201d: // Right curly double quote.
		// Quoted phrase.
		if (mode == DEFAULT) {
		    // Skip whitespace.
		    it = find_if(it, end, is_not_whitespace);
		    if (it == end) {
			// Ignore an unmatched " at the end of the query to
			// avoid generating an empty pair of QUOTEs which will
			// cause a parse error.
			goto done;
		    }
		    if (is_double_quote(*it)) {
			// Ignore empty "" (but only if we're not already
			// IN_QUOTES as we don't merge two adjacent quoted
			// phrases!)
			newprev = *it++;
			break;
		    }
		}
		if (flags & QueryParser::FLAG_PHRASE) {
		    Parse(pParser, QUOTE, NULL, &state);
		    if (mode == DEFAULT) {
			mode = IN_QUOTES;
		    } else {
			// Remove the prefix we pushed for this phrase.
			if (mode == IN_PREFIXED_QUOTES)
			    prefix_stack.pop_back();
			mode = DEFAULT;
		    }
		}
		break;

	      case '+': case '-': // Loved or hated term/phrase/subexpression.
		// Ignore + or - at the end of the query string.
		if (it == end) goto done;
		if (prev > ' ' && prev != '(') {
		    // Or if not after whitespace or an open bracket.
		    break;
		}
		if (is_whitespace(*it) || *it == '+' || *it == '-') {
		    // Ignore + or - followed by a space, or further + or -.
		    // Postfix + (such as in C++ and H+) is handled as part of
		    // the term lexing code in parse_term().
		    newprev = *it++;
		    break;
		}
		if (mode == DEFAULT && (flags & FLAG_LOVEHATE)) {
		    int token;
		    if (ch == '+') {
			token = LOVE;
		    } else if (last_was_operator) {
			token = HATE_AFTER_AND;
		    } else {
			token = HATE;
		    }
		    Parse(pParser, token, NULL, &state);
		    goto just_had_operator_needing_term;
		}
		// Need to prevent the term after a LOVE or HATE starting a
		// term group...
		break;

	      case '(': // Bracketed subexpression.
		// Skip whitespace.
		it = find_if(it, end, is_not_whitespace);
		// Ignore ( at the end of the query string.
		if (it == end) goto done;
		if (prev > ' ' && strchr("()+-", prev) == NULL) {
		    // Or if not after whitespace or a bracket or '+' or '-'.
		    break;
		}
		if (*it == ')') {
		    // Ignore empty ().
		    newprev = *it++;
		    break;
		}
		if (mode == DEFAULT && (flags & FLAG_BOOLEAN)) {
		    prefix_stack.push_back(prefix_stack.back());
		    Parse(pParser, BRA, NULL, &state);
		}
		break;

	      case ')': // End of bracketed subexpression.
		if (mode == DEFAULT && (flags & FLAG_BOOLEAN)) {
		    // Remove the prefix we pushed for the corresponding BRA.
		    // If brackets are unmatched, it's a syntax error, but
		    // that's no excuse to SEGV!
		    if (prefix_stack.size() > 1) prefix_stack.pop_back();
		    Parse(pParser, KET, NULL, &state);
		}
		break;

	      case '~': // Synonym expansion.
		// Ignore at the end of the query string.
		if (it == end) goto done;
		if (mode == DEFAULT && (flags & FLAG_SYNONYM)) {
		    if (prev > ' ' && strchr("+-(", prev) == NULL) {
			// Or if not after whitespace, +, -, or an open bracket.
			break;
		    }
		    if (!is_wordchar(*it)) {
			// Ignore if not followed by a word character.
			break;
		    }
		    Parse(pParser, SYNONYM, NULL, &state);
		    mode = EXPLICIT_SYNONYM;
		    goto just_had_operator_needing_term;
		}
		break;
	    }
	    // Skip any other characters.
	    continue;
	}

	Assert(is_wordchar(*it));

	size_t term_start_index = it.raw() - qs.data();

	newprev = 'A'; // Any letter will do...

	// A term, a prefix, or a boolean operator.
	const FieldInfo * field_info = NULL;
	if ((mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2 || mode == EXPLICIT_SYNONYM) &&
	    !field_map.empty()) {
	    // Check for a fieldname prefix (e.g. title:historical).
	    Utf8Iterator p = find_if(it, end, is_not_wordchar);
	    if (p != end && *p == ':' && ++p != end && *p > ' ' && *p != ')') {
		string field;
		p = it;
		while (*p != ':')
		    Unicode::append_utf8(field, *p++);
		map<string, FieldInfo>::const_iterator f;
		f = field_map.find(field);
		if (f != field_map.end()) {
		    // Special handling for prefixed fields, depending on the
		    // type of the prefix.
		    unsigned ch = *++p;
		    field_info = &(f->second);

		    if (field_info->type != NON_BOOLEAN) {
			// Drop out of IN_GROUP if we're in it.
			if (mode == IN_GROUP || mode == IN_GROUP2)
			    mode = DEFAULT;
			it = p;
			string name;
			if (it != end && is_double_quote(*it)) {
			    // Quoted boolean term (can contain any character).
			    bool fancy = (*it != '"');
			    ++it;
			    while (it != end) {
				if (*it == '"') {
				    // Interpret "" as an escaped ".
				    if (++it == end || *it != '"')
					break;
				} else if (fancy && is_double_quote(*it)) {
				    // If the opening quote was ASCII, then the
				    // closing one must be too - otherwise
				    // the user can't protect non-ASCII double
				    // quote characters by quoting or escaping.
				    ++it;
				    break;
				}
				Unicode::append_utf8(name, *it++);
			    }
			} else {
			    // Can't boolean filter prefix a subexpression, so
			    // just use anything following the prefix until the
			    // next space or ')' as part of the boolean filter
			    // term.
			    while (it != end && *it > ' ' && *it != ')')
				Unicode::append_utf8(name, *it++);
			}
			// Build the unstemmed form in field.
			field += ':';
			field += name;
			// Clear any pending range error.
			state.error = NULL;
			Term * token = new Term(&state, name, field_info, field);
			Parse(pParser, BOOLEAN_FILTER, token, &state);
			continue;
		    }

		    if ((flags & FLAG_PHRASE) && is_double_quote(ch)) {
			// Prefixed phrase, e.g.: subject:"space flight"
			mode = IN_PREFIXED_QUOTES;
			Parse(pParser, QUOTE, NULL, &state);
			it = p;
			newprev = ch;
			++it;
			prefix_stack.push_back(field_info);
			continue;
		    }

		    if (ch == '(' && (flags & FLAG_BOOLEAN)) {
			// Prefixed subexpression, e.g.: title:(fast NEAR food)
			mode = DEFAULT;
			Parse(pParser, BRA, NULL, &state);
			it = p;
			newprev = ch;
			++it;
			prefix_stack.push_back(field_info);
			continue;
		    }

		    if (ch != ':') {
			// Allow 'path:/usr/local' but not 'foo::bar::baz'.
			while (is_phrase_generator(ch)) {
			    if (++p == end)
				goto not_prefix;
			    ch = *p;
			}
		    }

		    if (is_wordchar(ch)) {
			// Prefixed term.
			it = p;
		    } else {
not_prefix:
			// It looks like a prefix but isn't, so parse it as
			// text instead.
			field_info = NULL;
		    }
		}
	    }
	}

phrased_term:
	bool was_acronym;
	bool is_cjk_term = false;
	string term = parse_term(it, end, cjk_ngram, is_cjk_term, was_acronym);

	// Boolean operators.
	if ((mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2) &&
	    (flags & FLAG_BOOLEAN) &&
	    // Don't want to interpret A.N.D. as an AND operator.
	    !was_acronym &&
	    !field_info &&
	    term.size() >= 2 && term.size() <= 4 && U_isalpha(term[0])) {

	    string op = term;
	    if (flags & FLAG_BOOLEAN_ANY_CASE) {
		for (string::iterator i = op.begin(); i != op.end(); ++i) {
		    *i = C_toupper(*i);
		}
	    }
	    if (op.size() == 3) {
		if (op == "AND") {
		    Parse(pParser, AND, NULL, &state);
		    goto just_had_operator;
		}
		if (op == "NOT") {
		    Parse(pParser, NOT, NULL, &state);
		    goto just_had_operator;
		}
		if (op == "XOR") {
		    Parse(pParser, XOR, NULL, &state);
		    goto just_had_operator;
		}
		if (op == "ADJ") {
		    if (it != end && *it == '/') {
			size_t width = 0;
			Utf8Iterator p = it;
			while (++p != end && U_isdigit(*p)) {
			    width = (width * 10) + (*p - '0');
			}
			if (width && (p == end || is_whitespace(*p))) {
			    it = p;
			    Parse(pParser, ADJ, new Term(width), &state);
			    goto just_had_operator;
			}
		    } else {
			Parse(pParser, ADJ, NULL, &state);
			goto just_had_operator;
		    }
		}
	    } else if (op.size() == 2) {
		if (op == "OR") {
		    Parse(pParser, OR, NULL, &state);
		    goto just_had_operator;
		}
	    } else if (op.size() == 4) {
		if (op == "NEAR") {
		    if (it != end && *it == '/') {
			size_t width = 0;
			Utf8Iterator p = it;
			while (++p != end && U_isdigit(*p)) {
			    width = (width * 10) + (*p - '0');
			}
			if (width && (p == end || is_whitespace(*p))) {
			    it = p;
			    Parse(pParser, NEAR, new Term(width), &state);
			    goto just_had_operator;
			}
		    } else {
			Parse(pParser, NEAR, NULL, &state);
			goto just_had_operator;
		    }
		}
	    }
	}

	// If no prefix is set, use the default one.
	if (!field_info) field_info = prefix_stack.back();

	Assert(field_info->type == NON_BOOLEAN);

	{
	    string unstemmed_term(term);
	    term = Unicode::tolower(term);

	    // Reuse stem_strategy - STEM_SOME here means "stem terms except
	    // when used with positional operators".
	    stem_strategy stem_term = stem_action;
	    if (stem_term != STEM_NONE) {
		if (!stemmer.internal.get()) {
		    // No stemmer is set.
		    stem_term = STEM_NONE;
		} else if (stem_term == STEM_SOME) {
		    if (!should_stem(unstemmed_term) ||
			(it != end && is_stem_preventer(*it))) {
			// Don't stem this particular term.
			stem_term = STEM_NONE;
		    }
		}
	    }

	    Term * term_obj = new Term(&state, term, field_info,
				       unstemmed_term, stem_term, term_pos++);

	    if (is_cjk_term) {
		Parse(pParser, CJKTERM, term_obj, &state);
		if (it == end) break;
		continue;
	    }

	    if (mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2) {
		if (it != end) {
		    if ((flags & FLAG_WILDCARD) && *it == '*') {
			Utf8Iterator p(it);
			++p;
			if (p == end || !is_wordchar(*p)) {
			    it = p;
			    if (mode == IN_GROUP || mode == IN_GROUP2) {
				// Drop out of IN_GROUP and flag that the group
				// can be empty if all members are stopwords.
				if (mode == IN_GROUP2)
				    Parse(pParser, EMPTY_GROUP_OK, NULL, &state);
				mode = DEFAULT;
			    }
			    // Wildcard at end of term (also known as
			    // "right truncation").
			    Parse(pParser, WILD_TERM, term_obj, &state);
			    continue;
			}
		    }
		} else {
		    if (flags & FLAG_PARTIAL) {
			if (mode == IN_GROUP || mode == IN_GROUP2) {
			    // Drop out of IN_GROUP and flag that the group
			    // can be empty if all members are stopwords.
			    if (mode == IN_GROUP2)
				Parse(pParser, EMPTY_GROUP_OK, NULL, &state);
			    mode = DEFAULT;
			}
			// Final term of a partial match query, with no
			// following characters - treat as a wildcard.
			Parse(pParser, PARTIAL_TERM, term_obj, &state);
			continue;
		    }
		}
	    }

	    // Check spelling, if we're a normal term, and any of the prefixes
	    // are empty.
	    if ((flags & FLAG_SPELLING_CORRECTION) && !was_acronym) {
		const list<string> & pfxes = field_info->prefixes;
		list<string>::const_iterator pfx_it;
		for (pfx_it = pfxes.begin(); pfx_it != pfxes.end(); ++pfx_it) {
		    if (!pfx_it->empty())
			continue;
		    const string & suggest = db.get_spelling_suggestion(term);
		    if (!suggest.empty()) {
			if (corrected_query.empty()) corrected_query = qs;
			size_t term_end_index = it.raw() - qs.data();
			size_t n = term_end_index - term_start_index;
			size_t pos = term_start_index + correction_offset;
			corrected_query.replace(pos, n, suggest);
			correction_offset += suggest.size();
			correction_offset -= n;
		    }
		    break;
		}
	    }

	    if (mode == IN_PHRASED_TERM) {
		Parse(pParser, PHR_TERM, term_obj, &state);
	    } else {
		// See if the next token will be PHR_TERM - if so, this one
		// needs to be TERM not GROUP_TERM.
		if ((mode == IN_GROUP || mode == IN_GROUP2) &&
		    is_phrase_generator(*it)) {
		    // FIXME: can we clean this up?
		    Utf8Iterator p = it;
		    do {
			++p;
		    } while (p != end && is_phrase_generator(*p));
		    // Don't generate a phrase unless the phrase generators are
		    // immediately followed by another term.
		    if (p != end && is_wordchar(*p)) {
			mode = DEFAULT;
		    }
		}

		int token = TERM;
		if (mode == IN_GROUP || mode == IN_GROUP2) {
		    mode = IN_GROUP2;
		    token = GROUP_TERM;
		}
		Parse(pParser, token, term_obj, &state);
		if (token == TERM && mode != DEFAULT)
		    continue;
	    }
	}

	if (it == end) break;

	if (is_phrase_generator(*it)) {
	    // Skip multiple phrase generators.
	    do {
		++it;
	    } while (it != end && is_phrase_generator(*it));
	    // Don't generate a phrase unless the phrase generators are
	    // immediately followed by another term.
	    if (it != end && is_wordchar(*it)) {
		mode = IN_PHRASED_TERM;
		term_start_index = it.raw() - qs.data();
		goto phrased_term;
	    }
	} else if (mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2) {
	    int old_mode = mode;
	    mode = DEFAULT;
	    if (!last_was_operator_needing_term && is_whitespace(*it)) {
		newprev = ' ';
		// Skip multiple whitespace.
		do {
		    ++it;
		} while (it != end && is_whitespace(*it));
		// Don't generate a group unless the terms are only separated
		// by whitespace.
		if (it != end && is_wordchar(*it)) {
		    if (old_mode == IN_GROUP || old_mode == IN_GROUP2) {
			mode = IN_GROUP2;
		    } else {
			mode = IN_GROUP;
		    }
		}
	    }
	}
    }
done:
    if (!state.error) {
	// Implicitly close any unclosed quotes.
	if (mode == IN_QUOTES || mode == IN_PREFIXED_QUOTES)
	    Parse(pParser, QUOTE, NULL, &state);

	// Implicitly close all unclosed brackets.
	while (prefix_stack.size() > 1) {
	    Parse(pParser, KET, NULL, &state);
	    prefix_stack.pop_back();
	}
	Parse(pParser, 0, NULL, &state);
    }

    errmsg = state.error;
    return state.query;
}

/** Collects the parts of a probabilistic sub-expression as it is parsed.
 *
 *  query, love and hate are owned by this object: whatever they point to when
 *  the object is destroyed gets deleted, so code which takes one of them over
 *  must set the member to NULL afterwards.
 */
struct ProbQuery {
    /// The main query built so far (NULL if there isn't one).
    Query * query;
    /// The combined "+" terms (NULL if there aren't any).
    Query * love;
    /// The combined "-" terms and filters (NULL if there aren't any).
    Query * hate;
    // filter is a map from grouping to a query for that grouping.  Filters
    // added with append_filter() are combined with OR if the grouping is
    // non-empty and with AND if it is empty; ranges added with
    // append_filter_range() are always combined with OR.  merge_filters()
    // combines the per-grouping queries with AND to get the full filter.
    map<string, Query> filter;

    ProbQuery() : query(0), love(0), hate(0) { }

    // Not copyable: a copy would share the owned Query pointers and they
    // would then be deleted twice.
    ProbQuery(const ProbQuery&) = delete;
    ProbQuery& operator=(const ProbQuery&) = delete;

    ~ProbQuery() {
	delete query;
	delete love;
	delete hate;
    }

    /// Set the filter for @a grouping to @a q, replacing any existing entry.
    void add_filter(const string& grouping, const Query & q) {
	filter[grouping] = q;
    }

    /** Combine @a qnew with any filter already present for @a grouping.
     *
     *  If there's no entry for @a grouping yet, @a qnew simply becomes the
     *  entry.
     */
    void append_filter(const string& grouping, const Query & qnew) {
	auto it = filter.find(grouping);
	if (it == filter.end()) {
	    filter.insert(make_pair(grouping, qnew));
	} else {
	    Query & q = it->second;
	    // We OR multiple filters with the same prefix if they're
	    // exclusive, otherwise we AND them.
	    bool exclusive = !grouping.empty();
	    Query::op op = exclusive ? Query::OP_OR : Query::OP_AND;
	    q = Query(op, q, qnew);
	}
    }

    /// Set the filter for @a grouping to @a range, replacing any existing
    /// entry.
    void add_filter_range(const string& grouping, const Query & range) {
	filter[grouping] = range;
    }

    /** OR @a range into the filter for @a grouping.
     *
     *  If there's no entry yet, operator[] creates a default-constructed
     *  Query which is then ORed with @a range.
     */
    void append_filter_range(const string& grouping, const Query & range) {
	Query & q = filter[grouping];
	q = Query(Query::OP_OR, q, range);
    }

    /** AND together the filters for all groupings.
     *
     *  Must only be called when at least one filter has been added.
     */
    Query merge_filters() const {
	auto i = filter.begin();
	Assert(i != filter.end());
	Query q = i->second;
	while (++i != filter.end()) {
	    q = Query(Query::OP_AND, q, i->second);
	}
	return q;
    }
};

/** A group of terms separated only by whitespace.
 *
 *  The group owns the Term objects added to it (the destructor deletes them).
 *  Instances can only be created on the heap via create(), and as_group()
 *  deletes the object it is called on.
 */
class TermGroup {
    vector<Term *> terms;

    /** Controls how to handle a group where all terms are stopwords.
     *
     *  If true, then as_group() returns NULL.  If false, then the
     *  stopword status of the terms is ignored.
     */
    bool empty_ok;

    TermGroup(Term* t1, Term* t2) : empty_ok(false) {
	add_term(t1);
	add_term(t2);
    }

    // Not copyable: a copy would own the same Term objects (so they'd be
    // deleted twice) and would side-step the heap-only factory.
    TermGroup(const TermGroup&) = delete;
    TermGroup& operator=(const TermGroup&) = delete;

  public:
    /// Factory function - ensures heap allocation.
    static TermGroup* create(Term* t1, Term* t2) {
	return new TermGroup(t1, t2);
    }

    ~TermGroup() {
	for (auto&& t : terms) {
	    delete t;
	}
    }

    /// Add a Term object to this TermGroup object (which takes ownership).
    void add_term(Term * term) {
	terms.push_back(term);
    }

    /// Set the empty_ok flag.
    void set_empty_ok() { empty_ok = true; }

    /// Convert to a Xapian::Query * using default_op.
    Query * as_group(State *state) const;
};

/** Build the query for this group of whitespace-separated terms.
 *
 *  Terms which the stopper rejects are added to the stoplist in @a state
 *  instead of to the query.  If FLAG_AUTO_MULTIWORD_SYNONYMS is set, runs of
 *  consecutive terms which together form a synonym key are replaced by an
 *  OP_SYNONYM of the original terms and the synonyms.
 *
 *  If every term turns out to be a stopword and empty_ok isn't set, the
 *  stoplist additions are rolled back and the group is processed again with
 *  the stopper disabled.
 *
 *  This object is deleted before returning.
 *
 *  @param state  The parser state (supplies flags, stopper, database and
 *		  default operator).
 *
 *  @return  A newly allocated Query, or NULL if no sub-queries were produced.
 */
Query *
TermGroup::as_group(State *state) const
{
    const Xapian::Stopper * stopper = state->get_stopper();
    // Remember the stoplist size so we can roll back if we need to reprocess.
    size_t stoplist_size = state->stoplist_size();
    bool default_op_is_positional = is_positional(state->default_op());
reprocess:
    Query::op default_op = state->default_op();
    vector<Query> subqs;
    subqs.reserve(terms.size());
    if (state->flags & QueryParser::FLAG_AUTO_MULTIWORD_SYNONYMS) {
	// Check for multi-word synonyms.
	Database db = state->get_database();

	string key;
	// [begin, i) is the run of terms currently being considered.
	vector<Term*>::const_iterator begin = terms.begin();
	vector<Term*>::const_iterator i = begin;
	while (i != terms.end()) {
	    TermIterator synkey(db.synonym_keys_begin((*i)->name));
	    TermIterator synend(db.synonym_keys_end((*i)->name));
	    if (synkey == synend) {
		// No multi-synonym matches.
		if (stopper && (*stopper)((*i)->name)) {
		    state->add_to_stoplist(*i);
		} else {
		    if (default_op_is_positional)
			(*i)->need_positions();
		    subqs.push_back((*i)->get_query_with_auto_synonyms());
		}
		begin = ++i;
		continue;
	    }
	    // Extend the key a word at a time while some synonym key still
	    // starts with it.
	    key.resize(0);
	    while (i != terms.end()) {
		if (!key.empty()) key += ' ';
		key += (*i)->name;
		++i;
		synkey.skip_to(key);
		if (synkey == synend || !startswith(*synkey, key)) break;
	    }
	    // Greedily try to match as many consecutive words as possible.
	    TermIterator syn, end;
	    while (true) {
		syn = db.synonyms_begin(key);
		end = db.synonyms_end(key);
		if (syn != end) break;
		if (--i == begin) break;
		// Drop the last word (and the space before it) from the key.
		key.resize(key.size() - (*i)->name.size() - 1);
	    }
	    if (i == begin) {
		// No multi-synonym matches.
		if (stopper && (*stopper)((*i)->name)) {
		    state->add_to_stoplist(*i);
		} else {
		    if (default_op_is_positional)
			(*i)->need_positions();
		    subqs.push_back((*i)->get_query_with_auto_synonyms());
		}
		begin = ++i;
		continue;
	    }

	    // Build the query for the original terms in [begin, i).
	    vector<Query> subqs2;
	    vector<Term*>::const_iterator j;
	    for (j = begin; j != i; ++j) {
		if (stopper && (*stopper)((*j)->name)) {
		    state->add_to_stoplist(*j);
		} else {
		    // Note: this must use j, not i - i is one past the end of
		    // the matched run and may be terms.end().
		    if (default_op_is_positional)
			(*j)->need_positions();
		    subqs2.push_back((*j)->get_query());
		}
	    }
	    Query q_original_terms;
	    if (default_op_is_positional) {
		// Window of 11 for two terms, growing by one per extra term.
		q_original_terms = Query(default_op,
					 subqs2.begin(), subqs2.end(),
					 subqs2.size() + 9);
	    } else {
		q_original_terms = Query(default_op,
					 subqs2.begin(), subqs2.end());
	    }
	    subqs2.clear();

	    // Use the position of the first term for the synonyms.
	    Query q(Query::OP_SYNONYM,
		    SynonymIterator(syn, (*begin)->pos, &q_original_terms),
		    SynonymIterator(end));
	    subqs.push_back(q);

	    begin = i;
	}
    } else {
	vector<Term*>::const_iterator i;
	for (i = terms.begin(); i != terms.end(); ++i) {
	    if (stopper && (*stopper)((*i)->name)) {
		state->add_to_stoplist(*i);
	    } else {
		if (default_op_is_positional)
		    (*i)->need_positions();
		subqs.push_back((*i)->get_query_with_auto_synonyms());
	    }
	}
    }

    if (!empty_ok && stopper && subqs.empty() &&
	stoplist_size < state->stoplist_size()) {
	// This group is all stopwords, so roll-back, disable stopper
	// temporarily, and reprocess this group.
	state->stoplist_resize(stoplist_size);
	stopper = NULL;
	goto reprocess;
    }

    Query * q = NULL;
    if (!subqs.empty()) {
	if (default_op_is_positional) {
	    // Window of 11 for two terms, growing by one per extra term.
	    q = new Query(default_op, subqs.begin(), subqs.end(),
			     subqs.size() + 9);
	} else {
	    q = new Query(default_op, subqs.begin(), subqs.end());
	}
    }
    delete this;
    return q;
}

/** Some terms which form a positional sub-query.
 *
 *  The object owns the Term objects added to it (the destructor deletes
 *  them).  Instances can only be created on the heap via create(), and each
 *  of the as_*_query() methods deletes the object it is called on.
 */
class Terms {
    vector<Term *> terms;

    /// Largest window requested via adjust_window() (0 if none was).
    size_t window;

    /** Keep track of whether the terms added all have the same list of
     *  prefixes.  If so, we'll build a set of phrases, one using each prefix.
     *  This works around the limitation that a phrase cannot have multiple
     *  components which are "OR" combinations of terms, but is also probably
     *  what users expect: i.e., if a user specifies a phrase in a field, and
     *  that field maps to multiple prefixes, the user probably wants a phrase
     *  returned with all terms having one of those prefixes, rather than a
     *  phrase comprised of terms with differing prefixes.
     */
    bool uniform_prefixes;

    /** The list of prefixes of the terms added.
     *  This will be NULL if the terms have different prefixes.
     */
    const list<string> * prefixes;

    /** Convert to a query using the given operator and window size.
     *
     *  @param op	The positional operator to use.
     *  @param w_delta	Added to the number of terms to give the window size.
     *
     *  @return  A newly allocated Query, or NULL if no terms were added.
     *
     *  Deletes this object before returning.
     */
    Query * as_opwindow_query(Query::op op, Xapian::termcount w_delta) const {
	Query * q = NULL;
	size_t n_terms = terms.size();
	Xapian::termcount w = w_delta + terms.size();
	if (uniform_prefixes) {
	    if (prefixes) {
		// Build one positional query per prefix and OR them together.
		for (const string& prefix : *prefixes) {
		    vector<Query> subqs;
		    subqs.reserve(n_terms);
		    for (Term * t : terms) {
			subqs.push_back(Query(t->make_term(prefix), 1, t->pos));
		    }
		    add_to_query(q, Query::OP_OR,
				 Query(op, subqs.begin(), subqs.end(), w));
		}
	    }
	} else {
	    // Prefixes differ between terms, so let each term build its own
	    // sub-query.
	    vector<Query> subqs;
	    subqs.reserve(n_terms);
	    for (Term * t : terms) {
		subqs.push_back(t->get_query());
	    }
	    q = new Query(op, subqs.begin(), subqs.end(), w);
	}

	delete this;
	return q;
    }

    Terms() : window(0), uniform_prefixes(true), prefixes(NULL) { }

    // Not copyable: a copy would own the same Term objects (so they'd be
    // deleted twice) and would side-step the heap-only factory.
    Terms(const Terms&) = delete;
    Terms& operator=(const Terms&) = delete;

  public:
    /// Factory function - ensures heap allocation.
    static Terms* create() {
	return new Terms();
    }

    ~Terms() {
	for (auto&& t : terms) {
	    delete t;
	}
    }

    /// Add an unstemmed Term object to this Terms object.
    void add_positional_term(Term * term) {
	const list<string> & term_prefixes = term->field_info->prefixes;
	if (terms.empty()) {
	    prefixes = &term_prefixes;
	} else if (uniform_prefixes && prefixes != &term_prefixes) {
	    // Different list objects - compare their contents.
	    if (*prefixes != term_prefixes)  {
		prefixes = NULL;
		uniform_prefixes = false;
	    }
	}
	term->need_positions();
	terms.push_back(term);
    }

    /// Increase the window to @a alternative_window if that's larger.
    void adjust_window(size_t alternative_window) {
	if (alternative_window > window) window = alternative_window;
    }

    /// Convert to a Xapian::Query * using adjacent OP_PHRASE.
    Query * as_phrase_query() const {
	return as_opwindow_query(Query::OP_PHRASE, 0);
    }

    /// Convert to a Xapian::Query * using OP_NEAR.
    Query * as_near_query() const {
	// The common meaning of 'a NEAR b' is "a within 10 terms of b", which
	// means a window size of 11.  For more than 2 terms, we just add one
	// to the window size for each extra term.
	size_t w = window;
	if (w == 0) w = 10;
	return as_opwindow_query(Query::OP_NEAR, w - 1);
    }

    /// Convert to a Xapian::Query * using OP_PHRASE to implement ADJ.
    Query * as_adj_query() const {
	// The common meaning of 'a ADJ b' is "a at most 10 terms before b",
	// which means a window size of 11.  For more than 2 terms, we just add
	// one to the window size for each extra term.
	size_t w = window;
	if (w == 0) w = 10;
	return as_opwindow_query(Query::OP_PHRASE, w - 1);
    }
};

/** Split this CJK term into single characters and add each of them to
 *  @a terms as a positional term.
 *
 *  Every new Term shares this term's field, unstemmed form, stemming strategy
 *  and position.  This object is deleted before returning.
 */
void
Term::as_positional_cjk_term(Terms * terms) const
{
    // Add each individual CJK character to the phrase.
    const Utf8Iterator name_end;
    for (Utf8Iterator ch(name); ch != name_end; ++ch) {
	string one_char;
	Unicode::append_utf8(one_char, *ch);
	terms->add_positional_term(
	    new Term(state, one_char, field_info, unstemmed, stem, pos));
    }

    // FIXME: we want to add the n-grams as filters too for efficiency.

    delete this;
}

// Helper macro for converting a boolean operation into a Xapian::Query.
//
// E is assigned a newly allocated Query combining *A and *B with operator OP;
// A and B are then deleted.  OP_TXT is the operator's name as a string
// literal, used to build the error message.
//
// If either A or B is NULL, an error message is stored in state->error,
// yy_parse_failed() is called and the macro returns from the enclosing
// function without assigning E.  It therefore expects `state` and `yypParser`
// to be in scope where it is used.
#define BOOL_OP_TO_QUERY(E, A, OP, B, OP_TXT) \
    do {\
	if (!A || !B) {\
	    state->error = "Syntax: <expression> " OP_TXT " <expression>";\
	    yy_parse_failed(yypParser);\
	    return;\
	}\
	E = new Query(OP, *A, *B);\
	delete A;\
	delete B;\
    } while (0)

}

// Every terminal symbol carries a Term * as its value (the tokeniser passes
// NULL for tokens which have no associated data).
%token_type {Term *}
// Default clean-up for the value of a terminal symbol.
%token_destructor {delete $$;}

// Extra argument to Parse(), available as `state` in the actions below.
%extra_argument {State * state}

%parse_failure {
    // If we've not already set an error message, set a default one.
    if (!state->error) state->error = "parse error";
}

// Any syntax error is turned straight into a parse failure.
%syntax_error {
    yy_parse_failed(yypParser);
}

// Operators, grouped in order of increasing precedence:
%nonassoc ERROR.
%left OR.
%left XOR.
%left AND NOT.
%left NEAR ADJ.
%left LOVE HATE HATE_AFTER_AND SYNONYM.

// Destructors for terminal symbols:

// TERM is a query term, including prefix (if any).
%destructor TERM {delete $$;}

// GROUP_TERM is a query term which follows a TERM or another GROUP_TERM and
// is only separated by whitespace characters.
%destructor GROUP_TERM {delete $$;}

// PHR_TERM is a query term which follows a TERM or another PHR_TERM and is
// separated only by one or more phrase generator characters (hyphen and
// apostrophe are common examples - see is_phrase_generator() for the list
// of all punctuation which does this).
%destructor PHR_TERM {delete $$;}

// WILD_TERM is like a TERM, but has a trailing wildcard which needs to be
// expanded.
%destructor WILD_TERM {delete $$;}

// PARTIAL_TERM is like a TERM, but it's at the end of the query string and
// we're doing "search as you type".  It expands to something like WILD_TERM
// OR stemmed_form.
%destructor PARTIAL_TERM {delete $$;}

// BOOLEAN_FILTER is a query term with a prefix registered using
// add_boolean_prefix().  It's added to the query using an OP_FILTER operator,
// (or OP_AND_NOT if it's negated) e.g. site:xapian.org or -site:xapian.org
%destructor BOOLEAN_FILTER {delete $$;}

// Grammar rules:

// query - The start symbol: either a single expr or an empty query string.

// The query non-terminal has no useful value, so it's just given a dummy
// type.
%type query {int}

query ::= expr(E). {
    // Store the result in the State object so that it can be returned once
    // parsing has finished.  A NULL expr gives an empty query.
    if (!E) {
	state->query = Query();
    } else {
	state->query = *E;
	delete E;
    }
}

query ::= . {
    // The query string contained no terms at all.
    state->query = Query();
}

// expr - A query expression.
//
// The boolean operator rules all go through BOOL_OP_TO_QUERY, which reports
// an error if either operand is missing (NULL).

%type expr {Query *}
%destructor expr {delete $$;}

expr(E) ::= prob_expr(P).
	{ E = P; }

expr(E) ::= bool_arg(A) AND bool_arg(B).
	{ BOOL_OP_TO_QUERY(E, A, Query::OP_AND, B, "AND"); }

expr(E) ::= bool_arg(A) NOT bool_arg(B). {
    // 'NOT foo' -> '<alldocuments> NOT foo'
    // (only if FLAG_PURE_NOT is set - otherwise the missing left operand is
    // reported as an error by BOOL_OP_TO_QUERY).
    if (!A && (state->flags & QueryParser::FLAG_PURE_NOT)) {
	A = new Query("", 1, 0);
    }
    BOOL_OP_TO_QUERY(E, A, Query::OP_AND_NOT, B, "NOT");
}

// 'a AND NOT b' is handled the same as 'a NOT b'.
expr(E) ::= bool_arg(A) AND NOT bool_arg(B). [NOT]
	{ BOOL_OP_TO_QUERY(E, A, Query::OP_AND_NOT, B, "AND NOT"); }

// 'a AND -b' also maps to OP_AND_NOT (the error message refers to AND).
expr(E) ::= bool_arg(A) AND HATE_AFTER_AND bool_arg(B). [AND]
	{ BOOL_OP_TO_QUERY(E, A, Query::OP_AND_NOT, B, "AND"); }

expr(E) ::= bool_arg(A) OR bool_arg(B).
	{ BOOL_OP_TO_QUERY(E, A, Query::OP_OR, B, "OR"); }

expr(E) ::= bool_arg(A) XOR bool_arg(B).
	{ BOOL_OP_TO_QUERY(E, A, Query::OP_XOR, B, "XOR"); }

// bool_arg - an argument to a boolean operator such as AND or OR.
//
// The value is NULL when the argument is missing.

%type bool_arg {Query *}
%destructor bool_arg {delete $$;}

bool_arg(A) ::= expr(E). { A = E; }

bool_arg(A) ::= . [ERROR] {
    // Set the argument to NULL, which enables the bool_arg-using rules in
    // expr above to report uses of AND, OR, etc which don't have two
    // arguments.
    A = NULL;
}

// prob_expr - a single compound term, or a prob.

%type prob_expr {Query *}
%destructor prob_expr {delete $$;}

// Combine the parts collected in a ProbQuery (main query, "+" terms, boolean
// filters and "-" terms) into a single Query.
prob_expr(E) ::= prob(P). {
    // Take over the main query so that deleting P doesn't delete it.
    E = P->query;
    P->query = NULL;
    // Handle any "+ terms".
    if (P->love) {
	if (P->love->empty()) {
	    // +<nothing>.
	    delete E;
	    E = P->love;
	} else if (E) {
	    // Swap so that the loved terms are what the rest of the query is
	    // added to with AND_MAYBE.
	    // NOTE(review): presumably add_to_query() takes ownership of its
	    // last argument - confirm against its definition.
	    swap(E, P->love);
	    add_to_query(E, Query::OP_AND_MAYBE, P->love);
	} else {
	    E = P->love;
	}
	// Whatever love pointed to has been taken over above.
	P->love = NULL;
    }
    // Handle any boolean filters.
    if (!P->filter.empty()) {
	if (E) {
	    add_to_query(E, Query::OP_FILTER, P->merge_filters());
	} else {
	    // Make the query a boolean one.
	    E = new Query(Query::OP_SCALE_WEIGHT, P->merge_filters(), 0.0);
	}
    }
    // Handle any "- terms".
    if (P->hate && !P->hate->empty()) {
	if (!E) {
	    // Can't just hate!
	    // NOTE(review): P isn't deleted on this path - presumably
	    // yy_parse_failed() releases it when it unwinds the parser stack;
	    // confirm against the generated parser.
	    yy_parse_failed(yypParser);
	    return;
	}
	*E = Query(Query::OP_AND_NOT, *E, *P->hate);
    }
    // This also deletes P->hate (query and love are NULL by now).
    delete P;
}

prob_expr(E) ::= term(T). {
    E = T;
}

// prob - a probabilistic sub-expression consisting of stop_terms, "+" terms,
// "-" terms, boolean filters, and/or ranges.
//
// Note: stop_term can also be several other things other than a simple term!

%type prob {ProbQuery *}
%destructor prob {delete $$;}

// A range on its own starts a new ProbQuery with the range as a filter.
prob(P) ::= RANGE(R). {
    // NOTE(review): the name is copied before calling as_range_query() and R
    // isn't deleted here - presumably as_range_query() disposes of R; confirm
    // against its definition.
    string grouping = R->name;
    const Query & range = R->as_range_query();
    P = new ProbQuery;
    P->add_filter_range(grouping, range);
}

// A range following other parts is ORed with any existing filter which has
// the same grouping.
prob(P) ::= stop_prob(Q) RANGE(R). {
    string grouping = R->name;
    const Query & range = R->as_range_query();
    P = Q;
    P->append_filter_range(grouping, range);
}

// Two adjacent stop_terms start a new ProbQuery.  Either may be NULL if it
// was a stopword.
prob(P) ::= stop_term(T) stop_term(U). {
    P = new ProbQuery;
    // P takes ownership of T.
    P->query = T;
    if (U) {
	Query::op op = state->default_op();
	if (P->query && is_positional(op)) {
	    // If default_op is OP_NEAR or OP_PHRASE, set the window size to
	    // 11 for the first pair of terms and it will automatically grow
	    // by one for each subsequent term.
	    Query * subqs[2] = { P->query, U };
	    *(P->query) = Query(op, subqs, subqs + 2, 11);
	    // U has been copied into the new query, so release it.
	    delete U;
	} else {
	    // NOTE(review): presumably add_to_query() takes ownership of U
	    // (it isn't deleted here) - confirm against its definition.
	    add_to_query(P->query, op, U);
	}
    }
}

// A further stop_term is combined with the query so far using the default
// operator.
prob(P) ::= prob(Q) stop_term(T). {
    P = Q;
    // If T is a stopword, there's nothing to do here.
    if (T) add_to_query(P->query, state->default_op(), T);
}

// "+term" at the start of a prob.  With a default operator of AND the term
// just goes into the main query; otherwise it's kept separately in love.
prob(P) ::= LOVE term(T). {
    P = new ProbQuery;
    if (state->default_op() == Query::OP_AND) {
	P->query = T;
    } else {
	P->love = T;
    }
}

// "+term" after other parts - multiple loved terms are combined with AND.
prob(P) ::= stop_prob(Q) LOVE term(T). {
    P = Q;
    if (state->default_op() == Query::OP_AND) {
	/* The default op is AND, so we just put loved terms into the query
	 * (in this case the only effect of love is to ignore the stopword
	 * list). */
	add_to_query(P->query, Query::OP_AND, T);
    } else {
	add_to_query(P->love, Query::OP_AND, T);
    }
}

// "-term" at the start of a prob.
prob(P) ::= HATE term(T). {
    P = new ProbQuery;
    P->hate = T;
}

// "-term" after other parts - multiple hated terms are combined with OR.
prob(P) ::= stop_prob(Q) HATE term(T). {
    P = Q;
    add_to_query(P->hate, Query::OP_OR, T);
}

// "-filter" at the start of a prob - a negated boolean filter is kept with
// the hated terms rather than in the filter map.
prob(P) ::= HATE BOOLEAN_FILTER(T). {
    P = new ProbQuery;
    P->hate = new Query(T->get_query());
    delete T;
}

// "-filter" after other parts.
prob(P) ::= stop_prob(Q) HATE BOOLEAN_FILTER(T). {
    P = Q;
    add_to_query(P->hate, Query::OP_OR, T->get_query());
    delete T;
}

// A boolean filter at the start of a prob becomes the first entry in the
// filter map, keyed by the filter's grouping.
prob(P) ::= BOOLEAN_FILTER(T). {
    P = new ProbQuery;
    P->add_filter(T->get_grouping(), T->get_query());
    delete T;
}

// A boolean filter after other parts is combined with any existing filter
// which has the same grouping (see ProbQuery::append_filter() for how).
prob(P) ::= stop_prob(Q) BOOLEAN_FILTER(T). {
    P = Q;
    P->append_filter(T->get_grouping(), T->get_query());
    delete T;
}

// "+filter" at the start of a prob.
prob(P) ::= LOVE BOOLEAN_FILTER(T). {
    // LOVE BOOLEAN_FILTER(T) is just the same as BOOLEAN_FILTER
    P = new ProbQuery;
    P->add_filter(T->get_grouping(), T->get_query());
    delete T;
}

// "+filter" after other parts.
prob(P) ::= stop_prob(Q) LOVE BOOLEAN_FILTER(T). {
    // LOVE BOOLEAN_FILTER(T) is just the same as BOOLEAN_FILTER, so combine
    // it exactly as an unadorned filter would be: append_filter() ORs filters
    // which share a non-empty grouping and ANDs those with an empty grouping.
    P = Q;
    P->append_filter(T->get_grouping(), T->get_query());
    delete T;
}

// stop_prob - A prob or a stop_term.

%type stop_prob {ProbQuery *}
%destructor stop_prob {delete $$;}

stop_prob(P) ::= prob(Q).
    { P = Q; }

// Wrap a lone stop_term in a new ProbQuery, which takes ownership of it.
// T (and hence P->query) is NULL if the term was a stopword.
stop_prob(P) ::= stop_term(T). {
    P = new ProbQuery;
    P->query = T;
}

// stop_term - Either a plain term which gets checked against the stopword
// list, or a compound_term.
//
// Terms which are loved, hated, or inside a phrase must not be checked
// against the stopword list, so those contexts use term rather than
// stop_term.

%type stop_term {Query *}
%destructor stop_term {delete $$;}

stop_term(T) ::= TERM(U). {
    if (!state->is_stopword(U)) {
	// An ordinary term - build its query.
	T = new Query(U->get_query_with_auto_synonyms());
    } else {
	// A stopword - record it in the stoplist and produce no query.
	state->add_to_stoplist(U);
	T = NULL;
    }
    delete U;
}

stop_term(T) ::= compound_term(U). {
    T = U;
}

// term - A term or a compound_term.
//
// Unlike stop_term, a TERM here is never checked against the stopword list.

%type term {Query *}
%destructor term {delete $$;}

term(T) ::= TERM(U). {
    T = new Query(U->get_query_with_auto_synonyms());
    delete U;
}

term(T) ::= compound_term(U). {
    T = U;
}

// compound_term - A WILD_TERM, a quoted phrase (with or without prefix), a
// phrased_term, group, near_expr, adj_expr, or a bracketed subexpression (with
// or without prefix).
//
// The Terms and TermGroup conversion methods used below (as_phrase_query(),
// as_near_query(), as_adj_query() and as_group()) delete the object they are
// called on, so those actions don't delete it themselves.
// NOTE(review): presumably as_wildcarded_query(), as_partial_query() and
// as_cjk_query() do likewise for their Term - confirm against their
// definitions.

%type compound_term {Query *}
%destructor compound_term {delete $$;}

compound_term(T) ::= WILD_TERM(U).
	{ T = U->as_wildcarded_query(state); }

compound_term(T) ::= PARTIAL_TERM(U).
	{ T = U->as_partial_query(state); }

compound_term(T) ::= QUOTE phrase(P) QUOTE.
	{ T = P->as_phrase_query(); }

compound_term(T) ::= phrased_term(P).
	{ T = P->as_phrase_query(); }

// This can give NULL if the whole group was dropped as stopwords.
compound_term(T) ::= group(P).
	{ T = P->as_group(state); }

compound_term(T) ::= near_expr(P).
	{ T = P->as_near_query(); }

compound_term(T) ::= adj_expr(P).
	{ T = P->as_adj_query(); }

compound_term(T) ::= BRA expr(E) KET.
	{ T = E; }

// "~term" - explicit synonym expansion.
compound_term(T) ::= SYNONYM TERM(U). {
    T = new Query(U->get_query_with_synonyms());
    delete U;
}

compound_term(T) ::= CJKTERM(U). {
    { T = U->as_cjk_query(); }
}

// phrase - The "inside the quotes" part of a double-quoted phrase.
//
// The Terms object takes ownership of each TERM added to it.  A CJKTERM is
// instead split into one positional term per character by
// as_positional_cjk_term(), which also deletes the CJKTERM.

%type phrase {Terms *}

%destructor phrase {delete $$;}

phrase(P) ::= TERM(T). {
    P = Terms::create();
    P->add_positional_term(T);
}

phrase(P) ::= CJKTERM(T). {
    P = Terms::create();
    T->as_positional_cjk_term(P);
}

phrase(P) ::= phrase(Q) TERM(T). {
    P = Q;
    P->add_positional_term(T);
}

phrase(P) ::= phrase(Q) CJKTERM(T). {
    P = Q;
    T->as_positional_cjk_term(P);
}

// phrased_term - A phrased term works like a single term, but is actually
// 2 or more terms linked together into a phrase by punctuation.  There must be
// at least 2 terms in order to be able to have punctuation between the terms!
//
// The Terms object takes ownership of the terms added to it.

%type phrased_term {Terms *}
%destructor phrased_term {delete $$;}

phrased_term(P) ::= TERM(T) PHR_TERM(U). {
    P = Terms::create();
    P->add_positional_term(T);
    P->add_positional_term(U);
}

phrased_term(P) ::= phrased_term(Q) PHR_TERM(T). {
    P = Q;
    P->add_positional_term(T);
}

// group - A group of terms separated only by whitespace - candidates for
// multi-term synonyms.
//
// The TermGroup object takes ownership of the terms added to it.

%type group {TermGroup *}
%destructor group {delete $$;}

group(P) ::= TERM(T) GROUP_TERM(U). {
    P = TermGroup::create(T, U);
}

group(P) ::= group(Q) GROUP_TERM(T). {
    P = Q;
    P->add_term(T);
}

// EMPTY_GROUP_OK is a marker token from the tokeniser: it allows the group to
// produce no query at all if every member turns out to be a stopword.
group(P) ::= group(Q) EMPTY_GROUP_OK. {
    P = Q;
    P->set_empty_ok();
}

// near_expr - Two or more terms joined by NEAR operators (at least 2 terms
// are needed for there to be a NEAR between them).
//
// The NEAR token's value is NULL unless the tokeniser attached a Term to it
// (which it does for the NEAR/<width> form); when present, it's used to
// adjust the window and then deleted.

%type near_expr {Terms *}
%destructor near_expr {delete $$;}

near_expr(P) ::= TERM(T) NEAR(N) TERM(U). {
    Terms * near_terms = Terms::create();
    near_terms->add_positional_term(T);
    near_terms->add_positional_term(U);
    if (N != NULL) {
	near_terms->adjust_window(N->get_termpos());
	delete N;
    }
    P = near_terms;
}

near_expr(P) ::= near_expr(Q) NEAR(N) TERM(T). {
    Q->add_positional_term(T);
    if (N != NULL) {
	Q->adjust_window(N->get_termpos());
	delete N;
    }
    P = Q;
}

// adj_expr - Two or more terms joined by ADJ operators (at least 2 terms are
// needed for there to be an ADJ between them).
//
// The ADJ token's value is NULL unless the tokeniser attached a Term to it
// (which it does for the ADJ/<width> form); when present, it's used to adjust
// the window and then deleted.

%type adj_expr {Terms *}
%destructor adj_expr {delete $$;}

adj_expr(P) ::= TERM(T) ADJ(N) TERM(U). {
    Terms * adj_terms = Terms::create();
    adj_terms->add_positional_term(T);
    adj_terms->add_positional_term(U);
    if (N != NULL) {
	adj_terms->adjust_window(N->get_termpos());
	delete N;
    }
    P = adj_terms;
}

adj_expr(P) ::= adj_expr(Q) ADJ(N) TERM(T). {
    Q->add_positional_term(T);
    if (N != NULL) {
	Q->adjust_window(N->get_termpos());
	delete N;
    }
    P = Q;
}

// Select yacc syntax highlighting in vim editor: vim: syntax=yacc
// (lemon syntax colouring isn't supplied by default; yacc does an OK job).