/** @file * @brief Xapian::Query internals */ /* Copyright (C) 2007-2026 Olly Betts * Copyright (C) 2008,2009 Lemur Consulting Ltd * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, see * . */ #include #include "queryinternal.h" #include "xapian/error.h" #include "xapian/postingsource.h" #include "xapian/query.h" #include "xapian/unicode.h" #include "api/editdistance.h" #include "backends/postlist.h" #include "heap.h" #include "matcher/andmaybepostlist.h" #include "matcher/andnotpostlist.h" #include "matcher/andpostlist.h" #include "matcher/boolorpostlist.h" #include "matcher/exactphrasepostlist.h" #include "matcher/externalpostlist.h" #include "matcher/maxpostlist.h" #include "matcher/nearpostlist.h" #include "matcher/orpospostlist.h" #include "matcher/orpostlist.h" #include "matcher/phrasepostlist.h" #include "matcher/queryoptimiser.h" #include "matcher/valuerangepostlist.h" #include "matcher/valuegepostlist.h" #include "matcher/xorpostlist.h" #include "pack.h" #include "serialise-double.h" #include "stringutils.h" #include "termlist.h" #include "debuglog.h" #include "omassert.h" #include "str.h" #include "stringutils.h" #include "unicode/description_append.h" #include #include #include #include #include #include #include #include using namespace std; static constexpr unsigned MAX_UTF_8_CHARACTER_LENGTH = 4; using Xapian::Internal::AndContext; using Xapian::Internal::OrContext; using Xapian::Internal::XorContext; namespace Xapian { namespace Internal { /** Class providing an operator which sorts postlists to select max or terms. * This returns true if a has a (strictly) greater termweight than b. * * Using this comparator will tend to result in multiple calls to * recalc_maxweight() for each of the subqueries (we use it with nth_element * which should be O(n)) - perhaps it'd be better to call recalc_maxweight() * once and then sort on that. */ struct CmpMaxOrTerms { /** Return true if and only if a has a strictly greater termweight than b. */ bool operator()(PostList* a, PostList* b) { #if (defined(__i386__) && !defined(__SSE_MATH__)) || \ defined(__mc68000__) || defined(__mc68010__) || \ defined(__mc68020__) || defined(__mc68030__) // On some architectures, most common of which is x86, floating point // values are calculated and stored in registers with excess precision. // If the two recalc_maxweight() calls below return identical values in a // register, the excess precision may be dropped for one of them but // not the other (e.g. because the compiler saves the first calculated // weight to memory while calculating the second, then reloads it to // compare). This leads to both a > b and b > a being true, which // violates the antisymmetry property of the strict weak ordering // required by nth_element(). This can have serious consequences (e.g. // segfaults). // // Note that m68k only has excess precision in earlier models - 68040 // and later are OK: // https://gcc.gnu.org/ml/gcc-patches/2008-11/msg00105.html // // To avoid this, we store each result in a volatile double prior to // comparing them. This means that the result of this test should // match that on other architectures with the same double format (which // is desirable), and actually has less overhead than rounding both // results to float (which is another approach which works). volatile double a_max_wt = a->recalc_maxweight(); volatile double b_max_wt = b->recalc_maxweight(); return a_max_wt > b_max_wt; #else return a->recalc_maxweight() > b->recalc_maxweight(); #endif } }; /// Comparison functor which orders by descending termfreq. struct ComparePostListTermFreqAscending { /// Order PostList* by descending get_termfreq(). bool operator()(const PostList* a, const PostList* b) const { return a->get_termfreq() > b->get_termfreq(); } }; class Context { protected: QueryOptimiser* qopt; vector pls; VecUniquePtr estimates; vector termfreqs_list; /// Lower bound on matching docid range. Xapian::docid first = Xapian::docid(-1); /// Upper bound on matching docid range. Xapian::docid last = 0; public: Context(QueryOptimiser* qopt_, size_t reserve) : qopt(qopt_), estimates(reserve) { pls.reserve(reserve); } ~Context() { shrink(0); } Xapian::docid get_first() const { return first; } Xapian::docid get_last() const { return last; } void add_termfreqs(TermFreqs* termfreqs) { if (termfreqs) termfreqs_list.emplace_back(*termfreqs); } void add_postlist(PostList* pl, EstimateOp* estimate, TermFreqs* termfreqs) { add_termfreqs(termfreqs); if (pl) { pls.emplace_back(pl); // estimate can be NULL e.g. under the RHS of OP_AND_MAYBE. if (estimate) estimates.push_back(estimate); // Take the union of the docid ranges, which is suitable for // OrContext and XorContext. AndContext() implements its own // version of add_postlist() which takes the intersection. Xapian::docid f = 1; Xapian::docid l = Xapian::docid(-1); pl->get_docid_range(f, l); first = std::min(first, f); last = std::max(last, l); } else { Assert(!estimate); } } void add_postlist(PostListAndEstimate p, TermFreqs* termfreqs) { add_postlist(p.pl, p.est.release(), termfreqs); } bool empty() const { return pls.empty(); } Xapian::termcount size() const { return Xapian::termcount(pls.size()); } void shrink(size_t new_size) { AssertRel(new_size, <=, pls.size()); if (new_size >= pls.size()) return; for (auto&& i = pls.begin() + new_size; i != pls.end(); ++i) { qopt->destroy_postlist(*i); } pls.resize(new_size); estimates.erase(estimates.begin() + new_size, estimates.end()); } /** Expand a wildcard query. */ void expand_wildcard(const QueryWildcard* query, double factor, TermFreqs* termfreqs); /** Expand an edit distance query. */ void expand_edit_distance(const QueryEditDistance* query, double factor, TermFreqs* termfreqs); }; inline void Context::expand_wildcard(const QueryWildcard* query, double factor, TermFreqs* termfreqs) { unique_ptr t(qopt->db.open_allterms(query->get_fixed_prefix())); bool skip_ucase = query->get_fixed_prefix().empty(); auto max_type = query->get_max_type(); Xapian::termcount expansions_left = query->get_max_expansion(); // If there's no expansion limit, set expansions_left to the maximum // value it can hold. if (expansions_left == 0) expansions_left = numeric_limits::max(); while (true) { TermList* ret = t->next(); done_skip_to: if (ret) { // Pruning shouldn't be possible, as this is iterating allterms for // a single shard. Assert(ret == t.get()); // End of entries. break; } const string & term = t->get_termname(); if (skip_ucase && term[0] >= 'A') { // If there's a leading wildcard then skip terms that start // with A-Z, as we don't want the expansion to include prefixed // terms. // // This assumes things about the structure of terms which the // Query class otherwise doesn't need to care about, but it // seems hard to avoid here. skip_ucase = false; if (term[0] <= 'Z') { static_assert('Z' + 1 == '[', "'Z' + 1 == '['"); ret = t->skip_to("["); goto done_skip_to; } } if (!query->test_prefix_known(term)) continue; if (max_type < Xapian::Query::WILDCARD_LIMIT_MOST_FREQUENT) { if (expansions_left == 0) { if (max_type == Xapian::Query::WILDCARD_LIMIT_FIRST) break; string msg("Wildcard "); msg += query->get_pattern(); if (query->get_just_flags() == 0) msg += '*'; msg += " expands to more than "; msg += str(query->get_max_expansion()); msg += " terms"; throw Xapian::WildcardError(msg); } --expansions_left; } add_postlist(qopt->open_lazy_post_list(term, 1, factor), NULL); // Generate a single EstimateOp to avoid overhead for wildcards // which expand to a lot of terms? (FIXME) } if (max_type == Xapian::Query::WILDCARD_LIMIT_MOST_FREQUENT) { auto set_size = query->get_max_expansion(); if (size() > set_size) { auto begin = pls.begin(); nth_element(begin, begin + set_size - 1, pls.end(), ComparePostListTermFreqAscending()); shrink(set_size); } } // Now register the postlists we're actually using for stats. for (auto pl : pls) { // FIXME: LocalSubMatch::open_lazy_post_list() above returns a // PostList* that actually points to a LeafPostList. It would be // better to find a way to be more type-safe here and avoid need to // cast back. qopt->register_lazy_postlist_for_stats(static_cast(pl), termfreqs); add_termfreqs(termfreqs); } } inline void Context::expand_edit_distance(const QueryEditDistance* query, double factor, TermFreqs* termfreqs) { string pfx(query->get_pattern(), 0, query->get_fixed_prefix_len()); unique_ptr t(qopt->db.open_allterms(pfx)); bool skip_ucase = pfx.empty(); auto max_type = query->get_max_type(); Xapian::termcount expansions_left = query->get_max_expansion(); // If there's no expansion limit, set expansions_left to the maximum // value it can hold. if (expansions_left == 0) expansions_left = numeric_limits::max(); while (true) { TermList* res = t->next(); done_skip_to: if (res) { // Pruning shouldn't be possible, as this is iterating allterms for // a single shard. Assert(res == t.get()); // Out of entries. break; } const string& term = t->get_termname(); if (!startswith(term, pfx)) break; if (skip_ucase && term[0] >= 'A') { // Skip terms that start with A-Z, as we don't want the expansion // to include prefixed terms. // // This assumes things about the structure of terms which the // Query class otherwise doesn't need to care about, but it // seems hard to avoid here. skip_ucase = false; if (term[0] <= 'Z') { static_assert('Z' + 1 == '[', "'Z' + 1 == '['"); res = t->skip_to("["); goto done_skip_to; } } if (!query->test(term)) continue; if (max_type < Xapian::Query::WILDCARD_LIMIT_MOST_FREQUENT) { if (expansions_left == 0) { if (max_type == Xapian::Query::WILDCARD_LIMIT_FIRST) break; string msg("Edit distance "); msg += query->get_pattern(); msg += '~'; msg += str(query->get_threshold()); msg += " expands to more than "; msg += str(query->get_max_expansion()); msg += " terms"; throw Xapian::WildcardError(msg); } --expansions_left; } add_postlist(qopt->open_lazy_post_list(term, 1, factor), NULL); } if (max_type == Xapian::Query::WILDCARD_LIMIT_MOST_FREQUENT) { auto set_size = query->get_max_expansion(); if (size() > set_size) { auto begin = pls.begin(); nth_element(begin, begin + set_size - 1, pls.end(), ComparePostListTermFreqAscending()); shrink(set_size); } } // Now register the postlists we're actually using for stats. for (auto pl : pls) { // FIXME: Be more typesafe? qopt->register_lazy_postlist_for_stats(static_cast(pl), termfreqs); add_termfreqs(termfreqs); } } class OrContext : public Context { public: OrContext(QueryOptimiser* qopt_, size_t reserve) : Context(qopt_, reserve) { } void estimate_termfreqs(TermFreqs* termfreqs); /// Select the best set_size postlists from the last out_of added. void select_elite_set(size_t set_size, size_t out_of); PostListAndEstimate postlist(TermFreqs* termfreqs, bool bool_or = false); PostListAndEstimate postlist_max(); }; void OrContext::estimate_termfreqs(TermFreqs* termfreqs) { Assert(termfreqs); Assert(!termfreqs_list.empty()); // We calculate the estimate assuming independence. The simplest // way to calculate this seems to be a series of (n - 1) pairwise // calculations, which gives the same answer regardless of the order. const TermFreqs& freqs = termfreqs_list[0]; auto& stats = *qopt->get_stats(); // Our caller should have ensured this. Assert(stats.collection_size); double scale = 1.0 / stats.collection_size; double P_est = freqs.termfreq * scale; double rtf_scale = 0.0; if (stats.rset_size != 0) { rtf_scale = 1.0 / stats.rset_size; } double Pr_est = freqs.reltermfreq * rtf_scale; // If total_length is 0, cf must always be 0 so cf_scale is irrelevant. double cf_scale = 0.0; if (usual(stats.total_length != 0)) { cf_scale = 1.0 / stats.total_length; } double Pc_est = freqs.collfreq * cf_scale; for (size_t i = 1; i < termfreqs_list.size(); ++i) { const TermFreqs& f = termfreqs_list[i]; double P_i = f.termfreq * scale; P_est += P_i - P_est * P_i; double Pc_i = f.collfreq * cf_scale; Pc_est += Pc_i - Pc_est * Pc_i; // If the rset is empty, Pr_est should be 0 already, so leave // it alone. if (stats.rset_size != 0) { double Pr_i = f.reltermfreq * rtf_scale; Pr_est += Pr_i - Pr_est * Pr_i; } } *termfreqs = TermFreqs(Xapian::doccount(P_est * stats.collection_size + 0.5), Xapian::doccount(Pr_est * stats.rset_size + 0.5), Xapian::termcount(Pc_est * stats.total_length + 0.5)); } void OrContext::select_elite_set(size_t set_size, size_t out_of) { auto begin = pls.begin() + pls.size() - out_of; nth_element(begin, begin + set_size - 1, pls.end(), CmpMaxOrTerms()); shrink(pls.size() - out_of + set_size); } PostListAndEstimate OrContext::postlist(TermFreqs* termfreqs, bool bool_or) { if (!termfreqs_list.empty()) estimate_termfreqs(termfreqs); switch (pls.size()) { case 0: return {}; case 1: { PostList* pl = pls[0]; pls.clear(); // If no_estimates was set then estimates will be empty. return {pl, estimates.empty() ? nullptr : estimates.release_at(0)}; } } unique_ptr est; if (!qopt->get_no_estimates()) { est.reset(new EstimateOp(EstimateOp::OR, first, last, std::move(estimates))); } if (bool_or) { auto pl = new BoolOrPostList(pls.begin(), pls.end(), qopt->db_size); // Empty pls so our destructor doesn't delete them all! pls.clear(); return {pl, std::move(est)}; } // Make postlists into a heap so that the postlist with the greatest term // frequency is at the top of the heap. Heap::make(pls.begin(), pls.end(), ComparePostListTermFreqAscending()); // Now build a tree of binary OrPostList objects. // // The algorithm used to build the tree is like that used to build an // optimal Huffman coding tree. If we called next() repeatedly, this // arrangement would minimise the number of method calls. Generally we // don't actually do that, but this arrangement is still likely to be a // good one, and it does minimise the work in the worst case. while (true) { // We build the tree such that at each branch: // // l.get_termfreq() >= r.get_termfreq() // // We do this so that the OrPostList class can be optimised assuming // that this is the case. PostList* r = pls.front(); Heap::pop(pls.begin(), pls.end(), ComparePostListTermFreqAscending()); pls.pop_back(); auto pl = new OrPostList(pls.front(), r, qopt->matcher); if (pls.size() == 1) { pls.clear(); return {pl, std::move(est)}; } pls[0] = pl; Heap::replace(pls.begin(), pls.end(), ComparePostListTermFreqAscending()); } } PostListAndEstimate OrContext::postlist_max() { switch (pls.size()) { case 0: return {}; case 1: { PostList* pl = pls[0]; pls.clear(); return {pl, estimates.release_at(0)}; } } // Sort the postlists so that the postlist with the greatest term frequency // is first. sort(pls.begin(), pls.end(), ComparePostListTermFreqAscending()); PostList* pl = new MaxPostList(pls.begin(), pls.end(), qopt->matcher, qopt->db_size); unique_ptr est; if (!qopt->get_no_estimates()) { // Same as OR for number of matches. est.reset(new EstimateOp(EstimateOp::OR, first, last, std::move(estimates))); } pls.clear(); return {pl, std::move(est)}; } class XorContext : public Context { public: XorContext(QueryOptimiser* qopt_, size_t reserve) : Context(qopt_, reserve) { } PostListAndEstimate postlist(TermFreqs* termfreqs); }; PostListAndEstimate XorContext::postlist(TermFreqs* termfreqs) { if (pls.empty()) return {}; if (termfreqs) { Assert(!termfreqs_list.empty()); // We calculate the estimate assuming independence. The simplest // way to calculate this seems to be a series of (n - 1) pairwise // calculations, which gives the same answer regardless of the order. auto& stats = *qopt->get_stats(); const TermFreqs& freqs = termfreqs_list[0]; // Our caller should have ensured this. Assert(stats.collection_size); double scale = 1.0 / stats.collection_size; double P_est = freqs.termfreq * scale; double rtf_scale = 0.0; if (stats.rset_size != 0) { rtf_scale = 1.0 / stats.rset_size; } double Pr_est = freqs.reltermfreq * rtf_scale; // If total_length is 0, cf must always be 0 so cf_scale is irrelevant. double cf_scale = 0.0; if (usual(stats.total_length != 0)) { cf_scale = 1.0 / stats.total_length; } double Pc_est = freqs.collfreq * cf_scale; for (size_t i = 1; i < termfreqs_list.size(); ++i) { const TermFreqs& f = termfreqs_list[i]; double P_i = f.termfreq * scale; P_est += P_i - 2.0 * P_est * P_i; double Pc_i = f.collfreq * cf_scale; Pc_est += Pc_i - 2.0 * Pc_est * Pc_i; // If the rset is empty, Pr_est should be 0 already, so leave // it alone. if (stats.rset_size != 0) { double Pr_i = f.reltermfreq * rtf_scale; Pr_est += Pr_i - 2.0 * Pr_est * Pr_i; } } *termfreqs = TermFreqs(Xapian::doccount(P_est * stats.collection_size + 0.5), Xapian::doccount(Pr_est * stats.rset_size + 0.5), Xapian::termcount(Pc_est * stats.total_length + 0.5)); } unique_ptr est; if (!qopt->get_no_estimates()) { est.reset(new EstimateOp(EstimateOp::XOR, first, last, std::move(estimates))); } auto pl = new XorPostList(pls.begin(), pls.end(), qopt->matcher, qopt->db_size); // Empty pls so our destructor doesn't delete them all! pls.clear(); return {pl, std::move(est)}; } class PosFilter { Xapian::Query::op op_; /// Start and end indices for the PostLists this positional filter uses. size_t begin, end; Xapian::termcount window; public: PosFilter(Xapian::Query::op op__, size_t begin_, size_t end_, Xapian::termcount window_) : op_(op__), begin(begin_), end(end_), window(window_) { } PostListAndEstimate postlist(PostList* pl, EstimateOp* est, const vector& pls, PostListTree* pltree, TermFreqs* termfreqs) const try { auto terms_begin = pls.begin() + begin; auto terms_end = pls.begin() + end; if (op_ == Xapian::Query::OP_NEAR) { if (termfreqs) *termfreqs /= 2; if (est) { est = new EstimateOp(EstimateOp::NEAR, est); } pl = new NearPostList(pl, est, window, terms_begin, terms_end, pltree); } else if (window != end - begin) { AssertEq(op_, Xapian::Query::OP_PHRASE); if (termfreqs) *termfreqs /= 3; if (est) { est = new EstimateOp(EstimateOp::PHRASE, est); } pl = new PhrasePostList(pl, est, window, terms_begin, terms_end, pltree); } else { AssertEq(op_, Xapian::Query::OP_PHRASE); if (termfreqs) *termfreqs /= 4; if (est) { est = new EstimateOp(EstimateOp::EXACT_PHRASE, est); } pl = new ExactPhrasePostList(pl, est, terms_begin, terms_end, pltree); } return {pl, est}; } catch (...) { delete pl; delete est; throw; } }; class AndContext : public Context { list pos_filters; unique_ptr not_ctx; unique_ptr maybe_ctx; /** True if this AndContext has seen a no-op MatchAll. * * If it has and it ends up empty then the resulting postlist should be * MatchAll not MatchNothing. */ bool match_all = false; public: AndContext(QueryOptimiser* qopt_, size_t reserve) : Context(qopt_, reserve) { first = 1; last = Xapian::docid(-1); } bool add_postlist(PostList* pl, unique_ptr&& estimate, TermFreqs* termfreqs) { add_termfreqs(termfreqs); if (pl) { if (pls.empty() && termfreqs_list.size() > 1) { qopt->destroy_postlist(pl); return true; } pls.emplace_back(pl); estimates.push_back(estimate.release()); Xapian::docid pl_first = first, pl_last = last; pl->get_docid_range(pl_first, pl_last); first = std::max(first, pl_first); last = std::min(last, pl_last); if (first <= last) { return true; } } shrink(0); match_all = false; return termfreqs != NULL; } bool add_postlist(PostListAndEstimate p, TermFreqs* termfreqs) { return add_postlist(p.pl, std::move(p.est), termfreqs); } void set_match_all() { match_all = true; } void add_pos_filter(Query::op op_, size_t n_subqs, Xapian::termcount window); OrContext& get_not_ctx(size_t reserve) { if (!not_ctx) { not_ctx.reset(new OrContext(qopt, reserve)); } return *not_ctx; } OrContext& get_maybe_ctx(size_t reserve) { if (!maybe_ctx) { maybe_ctx.reset(new OrContext(qopt, reserve)); } return *maybe_ctx; } PostListAndEstimate postlist(TermFreqs* termfreqs); }; void AndContext::add_pos_filter(Query::op op_, size_t n_subqs, Xapian::termcount window) { Assert(n_subqs > 1); size_t end = pls.size(); size_t begin = end - n_subqs; pos_filters.push_back(PosFilter(op_, begin, end, window)); } template inline static T estimate_and_not(T l, T r, U n) { // We calculate the estimates assuming independence. With this assumption, // the estimate is the product of the estimates for the sub-postlists // (with the right side this is inverted by subtracting from the total // size), divided by the total size. return static_cast((l * double(n - r)) / n + 0.5); } PostListAndEstimate AndContext::postlist(TermFreqs* termfreqs) { auto matcher = qopt->matcher; auto db_size = qopt->db_size; if (termfreqs) { Assert(!termfreqs_list.empty()); // We calculate the estimate assuming independence. With this // assumption, the estimate is the product of the estimates for the // sub-postlists divided by db_size (n - 1) times. const TermFreqs& freqs = termfreqs_list[0]; double freqest = double(freqs.termfreq); double relfreqest = double(freqs.reltermfreq); double collfreqest = double(freqs.collfreq); auto& stats = *qopt->get_stats(); // Our caller should have ensured this. Assert(stats.collection_size); for (size_t i = 1; i < termfreqs_list.size(); ++i) { const TermFreqs& f = termfreqs_list[i]; // If the collection is empty, freqest should be 0 already, so // leave it alone. freqest *= f.termfreq / stats.collection_size; if (usual(stats.total_length != 0)) { collfreqest *= f.collfreq / stats.total_length; } // If the rset is empty, relfreqest should be 0 already, so leave // it alone. if (stats.rset_size != 0) relfreqest *= f.reltermfreq / stats.rset_size; } *termfreqs = TermFreqs(static_cast(freqest + 0.5), static_cast(relfreqest + 0.5), static_cast(collfreqest + 0.5)); } unique_ptr pl; unique_ptr est; switch (pls.size()) { case 0: { if (!match_all) { // The "and" part doesn't match anything, so any "not" part or // positional filters are irrelevant. return {}; } auto [new_pl, new_est] = qopt->open_post_list({}, 0, 0.0, nullptr); pl.reset(new_pl); est = std::move(new_est); break; } case 1: pl.reset(pls[0]); est.reset(estimates.release_at(0)); break; default: pl.reset(new AndPostList(pls.begin(), pls.end(), matcher)); if (!qopt->get_no_estimates()) { est.reset(new EstimateOp(EstimateOp::AND, first, last, std::move(estimates))); } break; } if (not_ctx && !not_ctx->empty()) { if (not_ctx->get_last() < first || not_ctx->get_first() > last) { // The ranges don't overlap so the right side has no effect. // The call to not_ctx.reset() below will clean up the estimate // stack. } else { TermFreqs r_freqs; auto [rhs, rhs_est] = not_ctx->postlist(termfreqs ? &r_freqs : NULL, true); if (termfreqs) { TermFreqs& freqs = *termfreqs; auto& stats = *qopt->get_stats(); // Our caller should have ensured this. Assert(stats.collection_size); freqs.termfreq = estimate_and_not(freqs.termfreq, r_freqs.termfreq, stats.collection_size); // If total_length is 0 then collfreq should always be 0 (since // total_length is the sum of all collfreq values) so nothing // to do. if (stats.total_length != 0) { freqs.collfreq = estimate_and_not(freqs.collfreq, r_freqs.collfreq, stats.total_length); } // If the rset is empty then relfreqest should always be 0 so // nothing to do. if (stats.rset_size != 0) { freqs.reltermfreq = estimate_and_not(freqs.reltermfreq, r_freqs.reltermfreq, stats.rset_size); } } pl.reset(new AndNotPostList(pl.release(), rhs, db_size)); if (!qopt->get_no_estimates()) { // The bounds are the same as those for the left side. est.reset(new EstimateOp(EstimateOp::AND_NOT, first, last, std::move(est), std::move(rhs_est))); } } not_ctx.reset(); } // Sort the positional filters to try to apply them in an efficient order. // FIXME: We need to figure out what that is! Try applying lowest cf/tf // first? // Apply any positional filters. for (const PosFilter& filter : pos_filters) { auto [new_pl, new_est] = filter.postlist(pl.release(), est.release(), pls, matcher, termfreqs); pl.reset(new_pl); est = std::move(new_est); } // Empty pls so our destructor doesn't delete them all! pls.clear(); if (maybe_ctx && !maybe_ctx->empty()) { if (maybe_ctx->get_last() < first || maybe_ctx->get_first() > last) { // The ranges don't overlap so the right side has no effect. // The call to maybe_ctx.reset() below will clean up the estimate // stack. } else { // For OP_AND_MAYBE only the LHS determines which documents match // (the RHS only adds weight) so the estimate is just that for the // LHS. It would be useless extra work to generate estimates for // anything on the RHS, and create a problem with the lifetime of // the EstimateOp object which would need to live until after the // match for any PostList that can report stats. bool save_no_estimates = qopt->get_no_estimates(); qopt->set_no_estimates(true); auto [rhs, rhs_est] = maybe_ctx->postlist(termfreqs); qopt->set_no_estimates(save_no_estimates); // If this assertion fails, there's probably a missing check for // qopt->get_no_estimates() somewhere. Assert(!rhs_est); // A NULL PostList from OrContext::postlist() can only mean that // maybe_ctx is empty, but in that case get_last() returns zero // which means we would have taken the branch above. Assert(rhs); pl.reset(new AndMaybePostList(pl.release(), rhs, matcher)); } maybe_ctx.reset(); } return {pl.release(), est.release()}; } } Query::Internal::~Internal() { } size_t Query::Internal::get_num_subqueries() const noexcept { return 0; } const Query Query::Internal::get_subquery(size_t) const { throw Xapian::InvalidArgumentError("get_subquery() not meaningful for this Query object"); } Xapian::termcount Query::Internal::get_wqf() const { throw Xapian::InvalidArgumentError("get_wqf() not meaningful for this Query object"); } Xapian::termpos Query::Internal::get_pos() const { throw Xapian::InvalidArgumentError("get_pos() not meaningful for this Query object"); } void Query::Internal::gather_terms(void *) const { } Xapian::termcount Query::Internal::get_length() const noexcept { return 0; } Query::Internal * Query::Internal::unserialise(const char ** p, const char * end, const Registry & reg) { if (*p == end) return NULL; unsigned char ch = *(*p)++; switch (ch >> 5) { case 4: case 5: case 6: case 7: { // Multi-way branch // // 1ccccnnn where: // nnn -> n_subqs (0 means encoded value follows) // cccc -> code (which OP_XXX) size_t n_subqs = ch & 0x07; if (n_subqs == 0) { if (!unpack_uint(p, end, &n_subqs)) { unpack_throw_serialisation_error(*p); } n_subqs += 8; } unsigned char code = (ch >> 3) & 0x0f; Xapian::termcount parameter = 0; if (code >= 13) { if (!unpack_uint(p, end, ¶meter)) { unpack_throw_serialisation_error(*p); } } Xapian::Internal::QueryBranch * result; switch (code) { case 0: // OP_AND result = new Xapian::Internal::QueryAnd(n_subqs); break; case 1: // OP_OR result = new Xapian::Internal::QueryOr(n_subqs); break; case 2: // OP_AND_NOT result = new Xapian::Internal::QueryAndNot(n_subqs); break; case 3: // OP_XOR result = new Xapian::Internal::QueryXor(n_subqs); break; case 4: // OP_AND_MAYBE result = new Xapian::Internal::QueryAndMaybe(n_subqs); break; case 5: // OP_FILTER result = new Xapian::Internal::QueryFilter(n_subqs); break; case 6: // OP_SYNONYM result = new Xapian::Internal::QuerySynonym(n_subqs); break; case 7: // OP_MAX result = new Xapian::Internal::QueryMax(n_subqs); break; case 13: // OP_ELITE_SET result = new Xapian::Internal::QueryEliteSet(n_subqs, parameter); break; case 14: // OP_NEAR result = new Xapian::Internal::QueryNear(n_subqs, parameter); break; case 15: // OP_PHRASE result = new Xapian::Internal::QueryPhrase(n_subqs, parameter); break; default: // 8 to 12 are currently unused. throw SerialisationError("Unknown multi-way branch Query operator"); } do { result->add_subquery(Xapian::Query(unserialise(p, end, reg))); } while (--n_subqs); result->done(); return result; } case 2: case 3: { // Term // Term // // 01ccLLLL where: // LLLL -> length (0 means encoded value follows) // cc -> code: // 0: wqf = 0; pos = 0 // 1: wqf = 1; pos = 0 // 2: wqf = 1; pos -> encoded value follows // 3: wqf -> encoded value follows; pos -> encoded value follows size_t len = ch & 0x0f; if (len == 0) { if (!unpack_uint(p, end, &len)) { unpack_throw_serialisation_error(*p); } len += 16; } if (size_t(end - *p) < len) throw SerialisationError("Not enough data"); string term(*p, len); *p += len; int code = ((ch >> 4) & 0x03); Xapian::termcount wqf = static_cast(code > 0); if (code == 3) { if (!unpack_uint(p, end, &wqf)) { unpack_throw_serialisation_error(*p); } } Xapian::termpos pos = 0; if (code >= 2) { if (!unpack_uint(p, end, &pos)) { unpack_throw_serialisation_error(*p); } } return new Xapian::Internal::QueryTerm(term, wqf, pos); } case 1: { // OP_VALUE_RANGE or OP_VALUE_GE or OP_VALUE_LE // // 001tssss where: // ssss -> slot number (15 means encoded value follows) // t -> op: // 0: OP_VALUE_RANGE (or OP_VALUE_LE if begin empty) // 1: OP_VALUE_GE Xapian::valueno slot = ch & 15; if (slot == 15) { if (!unpack_uint(p, end, &slot)) { unpack_throw_serialisation_error(*p); } slot += 15; } string begin; if (!unpack_string(p, end, begin)) { unpack_throw_serialisation_error(*p); } if (ch & 0x10) { // OP_VALUE_GE return new Xapian::Internal::QueryValueGE(slot, begin); } // OP_VALUE_RANGE string end_; if (!unpack_string(p, end, end_)) { unpack_throw_serialisation_error(*p); } if (begin.empty()) // FIXME: is this right? return new Xapian::Internal::QueryValueLE(slot, end_); return new Xapian::Internal::QueryValueRange(slot, begin, end_); } case 0: { // Other operators // // 000ttttt where: // ttttt -> encodes which OP_XXX switch (ch & 0x1f) { case 0x00: // OP_INVALID return new Xapian::Internal::QueryInvalid(); case 0x0a: { // Edit distance Xapian::termcount max_expansion; if (!unpack_uint(p, end, &max_expansion) || end - *p < 2) { throw SerialisationError("not enough data"); } int flags = static_cast(*(*p)++); op combiner = static_cast(*(*p)++); unsigned edit_distance; size_t fixed_prefix_len; string pattern; if (!unpack_uint(p, end, &edit_distance) || !unpack_uint(p, end, &fixed_prefix_len) || !unpack_string(p, end, pattern)) { throw SerialisationError("not enough data"); } using Xapian::Internal::QueryEditDistance; return new QueryEditDistance(pattern, max_expansion, flags, combiner, edit_distance, fixed_prefix_len); } case 0x0b: { // Wildcard Xapian::termcount max_expansion; if (!unpack_uint(p, end, &max_expansion) || end - *p < 2) { throw SerialisationError("not enough data"); } int flags = static_cast(*(*p)++); op combiner = static_cast(*(*p)++); string pattern; if (!unpack_string(p, end, pattern)) { throw SerialisationError("not enough data"); } return new Xapian::Internal::QueryWildcard(pattern, max_expansion, flags, combiner); } case 0x0c: { // PostingSource string name; if (!unpack_string(p, end, name)) { throw SerialisationError("not enough data"); } const PostingSource * reg_source = reg.get_posting_source(name); if (!reg_source) { string m = "PostingSource "; m += name; m += " not registered"; throw SerialisationError(m); } string serialised_source; if (!unpack_string(p, end, serialised_source)) { throw SerialisationError("not enough data"); } PostingSource* source = reg_source->unserialise_with_registry(serialised_source, reg); return new Xapian::Internal::QueryPostingSource(source->release()); } case 0x0d: { using Xapian::Internal::QueryScaleWeight; double scale_factor = unserialise_double(p, end); return new QueryScaleWeight(scale_factor, Query(unserialise(p, end, reg))); } case 0x0e: { Xapian::termcount wqf; Xapian::termpos pos; if (!unpack_uint(p, end, &wqf) || !unpack_uint(p, end, &pos)) { throw SerialisationError("not enough data"); } return new Xapian::Internal::QueryTerm({}, wqf, pos); } case 0x0f: return new Xapian::Internal::QueryTerm(); default: // Others currently unused. break; } break; } } string msg = "Unknown Query serialisation: "; msg += str(ch); throw SerialisationError(msg); } bool Query::Internal::postlist_sub_and_like(AndContext& ctx, QueryOptimiser * qopt, double factor, TermFreqs* termfreqs) const { return ctx.add_postlist(postlist(qopt, factor, termfreqs), termfreqs); } void Query::Internal::postlist_sub_or_like(OrContext& ctx, QueryOptimiser* qopt, double factor, TermFreqs* termfreqs, bool keep_zero_weight) const { Xapian::termcount save_total_subqs = qopt->get_total_subqs(); auto [pl_, est] = postlist(qopt, factor, termfreqs); unique_ptr pl{pl_}; if (!keep_zero_weight && pl && pl->recalc_maxweight() == 0.0) { // This subquery can't contribute any weight, so can be discarded. // // Restore the value of total_subqs so that percentages don't get // messed up if we increased total_subqs in the call to postlist() // above. qopt->set_total_subqs(save_total_subqs); qopt->destroy_postlist(pl.release()); return; } ctx.add_postlist(pl.release(), est.release(), termfreqs); } void Query::Internal::postlist_sub_bool_or_like(OrContext& ctx, QueryOptimiser* qopt, TermFreqs* termfreqs) const { ctx.add_postlist(postlist(qopt, 0.0, termfreqs), termfreqs); } void Query::Internal::postlist_sub_xor(XorContext& ctx, QueryOptimiser* qopt, double factor, TermFreqs* termfreqs) const { ctx.add_postlist(postlist(qopt, factor, termfreqs), termfreqs); } namespace Internal { Query::op QueryTerm::get_type() const noexcept { return term.empty() ? Query::LEAF_MATCH_ALL : Query::LEAF_TERM; } string QueryTerm::get_description() const { string desc; if (term.empty()) { desc = ""; } else { description_append(desc, term); } if (wqf != 1) { desc += '#'; desc += str(wqf); } if (pos) { desc += '@'; desc += str(pos); } return desc; } QueryPostingSource::QueryPostingSource(PostingSource * source_) : source(source_) { if (!source_) throw Xapian::InvalidArgumentError("source parameter can't be NULL"); if (source->_refs == 0) { // source_ isn't reference counted, so try to clone it. If clone() // isn't implemented, just use the object provided and it's the // caller's responsibility to ensure it stays valid while in use. PostingSource * cloned_source = source->clone(); if (cloned_source) source = cloned_source->release(); } } Query::op QueryPostingSource::get_type() const noexcept { return Query::LEAF_POSTING_SOURCE; } string QueryPostingSource::get_description() const { string desc = "PostingSource("; desc += source->get_description(); desc += ')'; return desc; } QueryScaleWeight::QueryScaleWeight(double factor, const Query & subquery_) : scale_factor(factor), subquery(subquery_) { if (rare(scale_factor < 0.0)) throw Xapian::InvalidArgumentError("OP_SCALE_WEIGHT requires factor >= 0"); } Query::op QueryScaleWeight::get_type() const noexcept { return Query::OP_SCALE_WEIGHT; } size_t QueryScaleWeight::get_num_subqueries() const noexcept { return 1; } const Query QueryScaleWeight::get_subquery(size_t) const { return subquery; } string QueryScaleWeight::get_description() const { Assert(subquery.internal); string desc = str(scale_factor); desc += " * "; desc += subquery.internal->get_description(); return desc; } PostListAndEstimate QueryTerm::postlist(QueryOptimiser* qopt, double factor, TermFreqs* termfreqs) const { LOGCALL(QUERY, PostListAndEstimate, "QueryTerm::postlist", qopt | factor | termfreqs); if (factor != 0.0) qopt->inc_total_subqs(); RETURN(qopt->open_post_list(term, wqf, factor, termfreqs)); } bool QueryTerm::postlist_sub_and_like(AndContext& ctx, QueryOptimiser* qopt, double factor, TermFreqs* termfreqs) const { if (term.empty() && !qopt->need_positions && factor == 0.0 && !termfreqs) { // No-op MatchAll. ctx.set_match_all(); return true; } return ctx.add_postlist(postlist(qopt, factor, termfreqs), termfreqs); } PostListAndEstimate QueryPostingSource::postlist(QueryOptimiser* qopt, double factor, TermFreqs* termfreqs) const { LOGCALL(QUERY, PostListAndEstimate, "QueryPostingSource::postlist", qopt | factor | termfreqs); Assert(source); if (factor != 0.0) qopt->inc_total_subqs(); unique_ptr est; if (!qopt->get_no_estimates()) { est.reset(new EstimateOp()); } // Casting away const on the Database::Internal here is OK, as we wrap // them in a const Xapian::Database so non-const methods can't actually // be called on the Database::Internal object. const Xapian::Database wrappeddb( const_cast(&(qopt->db))); auto pl = new ExternalPostList(wrappeddb, source.get(), est.get(), factor, qopt->matcher->get_max_weight_cached_flag_ptr(), qopt->shard_index); if (termfreqs) { auto& stats = *qopt->get_stats(); auto db_size = qopt->db_size; // Scale proportionately from this shard to the whole collection. Not // as good as summing over all shards, but that's harder to do. double termfreq = pl->get_termfreq(); auto tf = termfreq * stats.collection_size / db_size; auto rtf = termfreq * stats.rset_size / db_size; auto cf = termfreq * stats.total_length / db_size; *termfreqs = TermFreqs(static_cast(tf + 0.5), static_cast(rtf + 0.5), static_cast(cf + 0.5)); } RETURN({pl, std::move(est)}); } PostListAndEstimate QueryScaleWeight::postlist(QueryOptimiser* qopt, double factor, TermFreqs* termfreqs) const { LOGCALL(QUERY, PostListAndEstimate, "QueryScaleWeight::postlist", qopt | factor | termfreqs); RETURN(subquery.internal->postlist(qopt, factor * scale_factor, termfreqs)); } bool QueryScaleWeight::postlist_sub_and_like(AndContext& ctx, QueryOptimiser* qopt, double factor, TermFreqs* termfreqs) const { return subquery.internal->postlist_sub_and_like(ctx, qopt, factor * scale_factor, termfreqs); } void QueryTerm::gather_terms(void * void_terms) const { // Skip Xapian::Query::MatchAll (aka Xapian::Query("")). if (!term.empty()) { vector> &terms = *static_cast>*>(void_terms); terms.push_back(make_pair(pos, term)); } } static double string_frac(const string& s, size_t prefix) { double r = 0; double f = 1.0; for (size_t i = prefix; i != s.size(); ++i) { f /= 256.0; r += static_cast(s[i]) * f; } return r; } static Xapian::doccount estimate_range_freq(const string& lo, const string& hi, const string& begin, const string* end, Xapian::doccount value_freq) { // Assume the values are evenly spread out between lo and hi. // FIXME: Perhaps we should store some sort of binned distribution? AssertRel(lo, <=, hi); size_t common_prefix_len = size_t(-1); do { UNSIGNED_OVERFLOW_OK(++common_prefix_len); // lo <= hi so while we're in the common prefix hi can't run out // before lo. if (common_prefix_len == lo.size()) { if (common_prefix_len != hi.size()) break; // All values in the slot are the same. We should have optimised // to NULL if that singular value is outside the range, and if it's // inside the range then we know that the frequency is exactly the // value frequency. Assert(begin <= lo && (!end || hi <= *end)); return value_freq; } AssertRel(common_prefix_len, !=, hi.size()); } while (lo[common_prefix_len] == hi[common_prefix_len]); double l = string_frac(lo, common_prefix_len); double h = string_frac(hi, common_prefix_len); double denom = h - l; if (rare(denom == 0.0)) { // Weird corner case - hi != lo (because that's handled inside the loop // above) but they give the same string_frac value. Because we only // calculate the fraction starting from the first difference, this // should only happen if hi is lo + one or more trailing zero bytes. // The case where all set values lie within the range should be handled // at a higher level and we shouldn't get called. Assert(!(begin <= lo && (!end || hi <= *end))); // There must be partial overlap as the cases where the range // dominates the bounds and where the range is entirely outside the // bounds are both handled at a higher level. return value_freq / 2; } double b = l; if (begin > lo) { b = string_frac(begin, common_prefix_len); } double e = h; if (end && *end < hi) { // end is NULL for a ValueGePostList e = string_frac(*end, common_prefix_len); } double est = (e - b) / denom * value_freq; return Xapian::doccount(est + 0.5); } PostListAndEstimate QueryValueRange::postlist(QueryOptimiser* qopt, double factor, TermFreqs* termfreqs) const { LOGCALL(QUERY, PostListAndEstimate, "QueryValueRange::postlist", qopt | factor | termfreqs); if (factor != 0.0) qopt->inc_total_subqs(); const Xapian::Database::Internal & db = qopt->db; const auto db_size = qopt->db_size; const string & lb = db.get_value_lower_bound(slot); if (lb.empty()) { // This should only happen if there are no values in this slot (which // could be because the backend just doesn't support values at all). // If there were values in the slot, the backend should have a // non-empty lower bound, even if it isn't a tight one. AssertEq(db.get_value_freq(slot), 0); if (termfreqs) *termfreqs = TermFreqs(); RETURN({}); } if (end < lb) { if (termfreqs) *termfreqs = TermFreqs(); RETURN({}); } const string & ub = db.get_value_upper_bound(slot); if (begin > ub) { if (termfreqs) *termfreqs = TermFreqs(); RETURN({}); } if (termfreqs) { // We scale these below. auto& stats = *qopt->get_stats(); *termfreqs = TermFreqs(stats.collection_size, stats.rset_size, stats.total_length); } auto value_freq = db.get_value_freq(slot); if (end >= ub) { if (begin <= lb) { // The known bounds for the slot both fall within the range so we // know the range matches whenever the value is set, which is // exactly value_freq times. unique_ptr est; if (!qopt->get_no_estimates()) est.reset(new EstimateOp(value_freq)); if (value_freq == db_size) { // This value is set for all documents in the current shard, so // we can replace it with a MatchAll postlist, which is // especially efficient if there are no gaps in the docids. RETURN({db.open_post_list({}), std::move(est)}); } // We need to check which documents have a value set in this slot // but don't need to worry about the range bounds so we can use // ValueGePostList with an empty string as the lower bound which // means the range test just becomes a cheap `>= string()` test. if (termfreqs) *termfreqs *= double(value_freq) / db_size; auto pl = new ValueGePostList(&db, est.get(), value_freq, slot, string()); RETURN({pl, std::move(est)}); } auto tf_est = estimate_range_freq(lb, ub, begin, NULL, value_freq); unique_ptr est; if (!qopt->get_no_estimates()) est.reset(new EstimateOp(Estimates{0, tf_est, value_freq})); if (termfreqs) *termfreqs *= double(tf_est) / db_size; auto pl = new ValueGePostList(&db, est.get(), tf_est, slot, begin); RETURN({pl, std::move(est)}); } auto tf_est = estimate_range_freq(lb, ub, begin, &end, value_freq); unique_ptr est; if (!qopt->get_no_estimates()) est.reset(new EstimateOp(Estimates{0, tf_est, value_freq})); if (termfreqs) *termfreqs *= double(tf_est) / db_size; auto pl = new ValueRangePostList(&db, est.get(), tf_est, slot, begin, end); RETURN({pl, std::move(est)}); } void QueryValueRange::serialise(string & result) const { if (slot < 15) { result += static_cast(0x20 | slot); } else { result += static_cast(0x20 | 15); pack_uint(result, slot - 15); } pack_string(result, begin); pack_string(result, end); } Query::op QueryValueRange::get_type() const noexcept { return Query::OP_VALUE_RANGE; } string QueryValueRange::get_description() const { string desc = "VALUE_RANGE "; desc += str(slot); desc += ' '; description_append(desc, begin); desc += ' '; description_append(desc, end); return desc; } PostListAndEstimate QueryValueLE::postlist(QueryOptimiser* qopt, double factor, TermFreqs* termfreqs) const { LOGCALL(QUERY, PostListAndEstimate, "QueryValueLE::postlist", qopt | factor | termfreqs); if (factor != 0.0) qopt->inc_total_subqs(); const Xapian::Database::Internal & db = qopt->db; const auto db_size = qopt->db_size; const string & lb = db.get_value_lower_bound(slot); if (lb.empty()) { // This should only happen if there are no values in this slot (which // could be because the backend just doesn't support values at all). // If there were values in the slot, the backend should have a // non-empty lower bound, even if it isn't a tight one. AssertEq(db.get_value_freq(slot), 0); if (termfreqs) *termfreqs = TermFreqs(); RETURN({}); } if (limit < lb) { if (termfreqs) *termfreqs = TermFreqs(); RETURN({}); } if (termfreqs) { // We scale these below. auto& stats = *qopt->get_stats(); *termfreqs = TermFreqs(stats.collection_size, stats.rset_size, stats.total_length); } auto value_freq = db.get_value_freq(slot); const string& ub = db.get_value_upper_bound(slot); if (limit >= ub) { // The known bounds for the slot both fall within the range so we // know the range matches whenever the value is set, which is // exactly value_freq times. unique_ptr est; if (!qopt->get_no_estimates()) est.reset(new EstimateOp(value_freq)); if (value_freq == db_size) { // This value is set for all documents in the current shard, so // we can replace it with a MatchAll postlist, which is // especially efficient if there are no gaps in the docids. RETURN({db.open_post_list({}), std::move(est)}); } // We need to check which documents have a value set in this slot // but don't need to worry about the range bounds so we can use // ValueGePostList with an empty string as the lower bound which // means the range test just becomes a cheap `>= string()` test. if (termfreqs) *termfreqs *= double(value_freq) / db_size; auto pl = new ValueGePostList(&db, est.get(), value_freq, slot, string()); RETURN({pl, std::move(est)}); } auto tf_est = estimate_range_freq(lb, ub, string(), &limit, value_freq); unique_ptr est; if (!qopt->get_no_estimates()) est.reset(new EstimateOp(Estimates{0, tf_est, value_freq})); if (termfreqs) *termfreqs *= double(tf_est) / db_size; auto pl = new ValueRangePostList(&db, est.get(), tf_est, slot, string(), limit); RETURN({pl, std::move(est)}); } void QueryValueLE::serialise(string & result) const { // Encode as a range with an empty start (which only takes a single byte to // encode). if (slot < 15) { result += static_cast(0x20 | slot); } else { result += static_cast(0x20 | 15); pack_uint(result, slot - 15); } pack_string_empty(result); pack_string(result, limit); } Query::op QueryValueLE::get_type() const noexcept { return Query::OP_VALUE_LE; } string QueryValueLE::get_description() const { string desc = "VALUE_LE "; desc += str(slot); desc += ' '; description_append(desc, limit); return desc; } PostListAndEstimate QueryValueGE::postlist(QueryOptimiser* qopt, double factor, TermFreqs* termfreqs) const { LOGCALL(QUERY, PostListAndEstimate, "QueryValueGE::postlist", qopt | factor | termfreqs); if (factor != 0.0) qopt->inc_total_subqs(); const Xapian::Database::Internal & db = qopt->db; const auto db_size = qopt->db_size; const string & lb = db.get_value_lower_bound(slot); if (lb.empty()) { // This should only happen if there are no values in this slot (which // could be because the backend just doesn't support values at all). // If there were values in the slot, the backend should have a // non-empty lower bound, even if it isn't a tight one. AssertEq(db.get_value_freq(slot), 0); if (termfreqs) *termfreqs = TermFreqs(); RETURN({}); } const string& ub = db.get_value_upper_bound(slot); if (limit > ub) { if (termfreqs) *termfreqs = TermFreqs(); RETURN({}); } if (termfreqs) { // We scale these below. auto& stats = *qopt->get_stats(); *termfreqs = TermFreqs(stats.collection_size, stats.rset_size, stats.total_length); } auto value_freq = db.get_value_freq(slot); if (limit <= lb) { // The known bounds for the slot both fall within the range so we // know the range matches whenever the value is set, which is // exactly value_freq times. unique_ptr est; if (!qopt->get_no_estimates()) est.reset(new EstimateOp(value_freq)); if (value_freq == db_size) { // This value is set for all documents in the current shard, so // we can replace it with a MatchAll postlist, which is // especially efficient if there are no gaps in the docids. RETURN({db.open_post_list({}), std::move(est)}); } // We need to check which documents have a value set in this slot // but don't need to worry about the range bounds so we can use // ValueGePostList with an empty string as the lower bound which // means the range test just becomes a cheap `>= string()` test. auto pl = new ValueGePostList(&db, est.get(), value_freq, slot, string()); RETURN({pl, std::move(est)}); } auto tf_est = estimate_range_freq(lb, ub, limit, NULL, value_freq); unique_ptr est; if (!qopt->get_no_estimates()) est.reset(new EstimateOp(Estimates{0, tf_est, value_freq})); if (termfreqs) *termfreqs *= double(tf_est) / db_size; auto pl = new ValueGePostList(&db, est.get(), tf_est, slot, limit); RETURN({pl, std::move(est)}); } void QueryValueGE::serialise(string & result) const { if (slot < 15) { result += static_cast(0x20 | 0x10 | slot); } else { result += static_cast(0x20 | 0x10 | 15); pack_uint(result, slot - 15); } pack_string(result, limit); } Query::op QueryValueGE::get_type() const noexcept { return Query::OP_VALUE_GE; } string QueryValueGE::get_description() const { string desc = "VALUE_GE "; desc += str(slot); desc += ' '; description_append(desc, limit); return desc; } QueryWildcard::QueryWildcard(std::string_view pattern_, Xapian::termcount max_expansion_, int flags_, Query::op combiner_) : pattern(pattern_), max_expansion(max_expansion_), flags(flags_), combiner(combiner_) { if ((flags & ~Query::WILDCARD_LIMIT_MASK_) == 0) { head = min_len = pattern.size(); max_len = numeric_limits::max(); prefix = pattern; return; } size_t i = 0; while (i != pattern.size()) { // Check for characters with special meaning. switch (pattern[i]) { case '*': if (flags & Query::WILDCARD_PATTERN_MULTI) goto found_special; break; case '?': if (flags & Query::WILDCARD_PATTERN_SINGLE) goto found_special; break; } prefix += pattern[i]; ++i; head = i; } found_special: min_len = max_len = prefix.size(); tail = i; size_t qm_count = 0; bool had_star = false; while (i != pattern.size()) { switch (pattern[i]) { default: default_case: suffix += pattern[i]; ++min_len; ++max_len; break; case '*': if (!(flags & Query::WILDCARD_PATTERN_MULTI)) goto default_case; // Matches zero or more characters. had_star = true; tail = i + 1; if (!suffix.empty()) { min_check_len = 0; suffix.clear(); } break; case '?': if (!(flags & Query::WILDCARD_PATTERN_SINGLE)) goto default_case; // Matches exactly one character. tail = i + 1; if (!suffix.empty()) { min_check_len = 0; suffix.clear(); } ++qm_count; ++min_len; max_len += MAX_UTF_8_CHARACTER_LENGTH; break; } ++i; } if (had_star) { max_len = numeric_limits::max(); } else if (qm_count > 1) { // `?` matches one Unicode character, which is 1-4 bytes in UTF-8, so // we have to actually check the pattern if there's more than one `?` // in it. min_check_len = 0; } else if (qm_count == 1) { // If the pattern contains exactly one `?` wildcard we need to check it // unless the candidate is exactly min_len bytes long. Note that we // know it can't match if it's < min_len long. min_check_len = min_len + 1; } } bool QueryWildcard::test_wildcard_(const string& candidate, size_t o, size_t p, size_t i) const { // FIXME: Optimisation potential here. We could compile the pattern to a // regex, or other tricks like calculating the min length needed after each // position that we test with this method - e.g. for foo*bar*x*baz there // must be at least 7 bytes after a position or there's no point testing if // "bar" matches there. for ( ; i != tail; ++i) { if ((flags & Query::WILDCARD_PATTERN_MULTI) && pattern[i] == '*') { if (++i == tail) { // '*' at end of variable part is easy! return true; } for (size_t test_o = o; test_o <= p; ++test_o) { if (test_wildcard_(candidate, test_o, p, i)) return true; } return false; } if (o == p) return false; if ((flags & Query::WILDCARD_PATTERN_SINGLE) && pattern[i] == '?') { unsigned char b = candidate[o]; if (b < 0xc0) { ++o; continue; } unsigned seqlen; if (b < 0xe0) { seqlen = 2; } else if (b < 0xf0) { seqlen = 3; } else { seqlen = 4; } if (rare(p - o < seqlen)) return false; o += seqlen; continue; } if (pattern[i] != candidate[o]) return false; ++o; } return (o == p); } bool QueryWildcard::test_prefix_known(const string& candidate) const { if (candidate.size() < min_len) return false; if (candidate.size() > max_len) return false; if (!endswith(candidate, suffix)) return false; if (candidate.size() < min_check_len) return true; return test_wildcard_(candidate, prefix.size(), candidate.size() - suffix.size(), head); } PostListAndEstimate QueryWildcard::postlist(QueryOptimiser* qopt, double factor, TermFreqs* termfreqs) const { LOGCALL(QUERY, PostListAndEstimate, "QueryWildcard::postlist", qopt | factor | termfreqs); OrContext ctx(qopt, 0); Query::op op = combiner; if (factor == 0.0 || op == Query::OP_SYNONYM) { if (factor == 0.0) { // If we have a factor of 0, we don't care about the weights, so // we're just like a normal OR query. op = Query::OP_OR; } bool old_compound_weight = qopt->compound_weight; if (!old_compound_weight) { qopt->compound_weight = (op == Query::OP_SYNONYM); } TermFreqs synonym_freqs; if (op == Query::OP_SYNONYM) { qopt->inc_total_subqs(); ctx.expand_wildcard(this, 0.0, &synonym_freqs); } else { ctx.expand_wildcard(this, 0.0, termfreqs); } qopt->compound_weight = old_compound_weight; if (ctx.empty()) RETURN({}); if (op != Query::OP_SYNONYM) RETURN(ctx.postlist(termfreqs, true)); // We build an OP_OR tree for OP_SYNONYM and then wrap it in a // SynonymPostList, which supplies the weights. RETURN(qopt->make_synonym_postlist(ctx.postlist(&synonym_freqs, true), factor, synonym_freqs)); } ctx.expand_wildcard(this, factor, termfreqs); qopt->set_total_subqs(qopt->get_total_subqs() + ctx.size()); if (ctx.empty()) RETURN({}); if (op == Query::OP_MAX) RETURN(ctx.postlist_max()); RETURN(ctx.postlist(termfreqs)); } termcount QueryWildcard::get_length() const noexcept { // We currently assume wqf is 1 for calculating the synonym's weight // since conceptually the synonym is one "virtual" term. If we were // to combine multiple occurrences of the same synonym expansion into // a single instance with wqf set, we would want to track the wqf. return 1; } void QueryWildcard::serialise(string & result) const { result += static_cast(0x0b); pack_uint(result, max_expansion); result += static_cast(flags); result += static_cast(combiner); pack_string(result, pattern); } Query::op QueryWildcard::get_type() const noexcept { return Query::OP_WILDCARD; } string QueryWildcard::get_description() const { string desc = "WILDCARD "; switch (combiner) { case Query::OP_SYNONYM: desc += "SYNONYM "; break; case Query::OP_MAX: desc += "MAX "; break; case Query::OP_OR: desc += "OR "; break; default: desc += "BAD "; break; } description_append(desc, pattern); return desc; } int QueryEditDistance::test(const string& candidate) const { int threshold = get_threshold(); int edist = edcalc(candidate, threshold); return edist <= threshold ? edist + 1 : 0; } PostListAndEstimate QueryEditDistance::postlist(QueryOptimiser* qopt, double factor, TermFreqs* termfreqs) const { LOGCALL(QUERY, PostListAndEstimate, "QueryEditDistance::postlist", qopt | factor | termfreqs); OrContext ctx(qopt, 0); Query::op op = combiner; if (factor == 0.0 || op == Query::OP_SYNONYM) { if (factor == 0.0) { // If we have a factor of 0, we don't care about the weights, so // we're just like a normal OR query. op = Query::OP_OR; } bool old_compound_weight = qopt->compound_weight; if (!old_compound_weight) { qopt->compound_weight = (op == Query::OP_SYNONYM); } TermFreqs synonym_freqs; if (op == Query::OP_SYNONYM) { qopt->inc_total_subqs(); ctx.expand_edit_distance(this, 0.0, &synonym_freqs); } else { ctx.expand_edit_distance(this, 0.0, termfreqs); } qopt->compound_weight = old_compound_weight; if (ctx.empty()) RETURN({}); if (op != Query::OP_SYNONYM) RETURN(ctx.postlist(termfreqs, true)); // We build an OP_OR tree for OP_SYNONYM and then wrap it in a // SynonymPostList, which supplies the weights. RETURN(qopt->make_synonym_postlist(ctx.postlist(&synonym_freqs, true), factor, synonym_freqs)); } ctx.expand_edit_distance(this, factor, termfreqs); qopt->set_total_subqs(qopt->get_total_subqs() + ctx.size()); if (ctx.empty()) RETURN({}); if (op == Query::OP_MAX) RETURN(ctx.postlist_max()); RETURN(ctx.postlist(termfreqs)); } termcount QueryEditDistance::get_length() const noexcept { // We currently assume wqf is 1 for calculating the synonym's weight // since conceptually the synonym is one "virtual" term. If we were // to combine multiple occurrences of the same synonym expansion into // a single instance with wqf set, we would want to track the wqf. return 1; } void QueryEditDistance::serialise(string & result) const { result += static_cast(0x0a); pack_uint(result, max_expansion); result += static_cast(flags); result += static_cast(combiner); pack_uint(result, edit_distance); pack_uint(result, fixed_prefix_len); pack_string(result, pattern); } Query::op QueryEditDistance::get_type() const noexcept { return Query::OP_EDIT_DISTANCE; } string QueryEditDistance::get_description() const { string desc = "EDIT_DISTANCE "; switch (combiner) { case Query::OP_SYNONYM: desc += "SYNONYM "; break; case Query::OP_MAX: desc += "MAX "; break; case Query::OP_OR: desc += "OR "; break; default: desc += "BAD "; break; } description_append(desc, pattern); desc += '~'; desc += str(edit_distance); if (fixed_prefix_len) { desc += " fixed_prefix_len="; desc += str(fixed_prefix_len); } return desc; } Xapian::termcount QueryBranch::get_length() const noexcept { // Sum results from all subqueries. Xapian::termcount result = 0; QueryVector::const_iterator i; for (i = subqueries.begin(); i != subqueries.end(); ++i) { // MatchNothing subqueries should have been removed by done(), but we // can't use Assert in a noexcept function. But we'll get a // segfault anyway. result += (*i).internal->get_length(); } return result; } #define MULTIWAY(X) static_cast(0x80 | (X) << 3) #define MISC(X) static_cast(X) void QueryBranch::serialise_(string & result, Xapian::termcount parameter) const { static const unsigned char first_byte[] = { MULTIWAY(0), // OP_AND MULTIWAY(1), // OP_OR MULTIWAY(2), // OP_AND_NOT MULTIWAY(3), // OP_XOR MULTIWAY(4), // OP_AND_MAYBE MULTIWAY(5), // OP_FILTER MULTIWAY(14), // OP_NEAR MULTIWAY(15), // OP_PHRASE 0, // OP_VALUE_RANGE MISC(3), // OP_SCALE_WEIGHT MULTIWAY(13), // OP_ELITE_SET 0, // OP_VALUE_GE 0, // OP_VALUE_LE MULTIWAY(6), // OP_SYNONYM MULTIWAY(7) // OP_MAX }; Xapian::Query::op op_ = get_op(); AssertRel(size_t(op_),<,sizeof(first_byte)); unsigned char ch = first_byte[op_]; if (ch & 0x80) { // Multi-way operator. if (subqueries.size() < 8) ch |= subqueries.size(); result += ch; if (subqueries.size() >= 8) pack_uint(result, subqueries.size() - 8); if (ch >= MULTIWAY(13)) pack_uint(result, parameter); } else { result += ch; } QueryVector::const_iterator i; for (i = subqueries.begin(); i != subqueries.end(); ++i) { // MatchNothing subqueries should have been removed by done(). Assert((*i).internal); (*i).internal->serialise(result); } // For OP_NEAR, OP_PHRASE, and OP_ELITE_SET, the window/set size gets // appended next by an overloaded serialise() method in the subclass. } void QueryBranch::serialise(string & result) const { QueryBranch::serialise_(result); } void QueryNear::serialise(string & result) const { // FIXME: window - subqueries.size() ? QueryBranch::serialise_(result, window); } void QueryPhrase::serialise(string & result) const { // FIXME: window - subqueries.size() ? QueryBranch::serialise_(result, window); } void QueryEliteSet::serialise(string & result) const { // FIXME: set_size - subqueries.size() ? QueryBranch::serialise_(result, set_size); } void QueryBranch::gather_terms(void * void_terms) const { // Gather results from all subqueries. QueryVector::const_iterator i; for (i = subqueries.begin(); i != subqueries.end(); ++i) { // MatchNothing subqueries should have been removed by done(). Assert((*i).internal); (*i).internal->gather_terms(void_terms); } } void QueryBranch::do_bool_or_like(OrContext& ctx, QueryOptimiser* qopt, TermFreqs* termfreqs, size_t first) const { LOGCALL_VOID(MATCH, "QueryBranch::do_bool_or_like", ctx | qopt | termfreqs | first); // FIXME: we could optimise by merging OP_ELITE_SET and OP_OR like we do // for AND-like operations. // OP_SYNONYM with a single subquery is only simplified by // QuerySynonym::done() if the single subquery is a term or MatchAll. Assert(subqueries.size() >= 2 || get_op() == Query::OP_SYNONYM); QueryVector::const_iterator q; for (q = subqueries.begin() + first; q != subqueries.end(); ++q) { // MatchNothing subqueries should have been removed by done(). Assert((*q).internal); (*q).internal->postlist_sub_bool_or_like(ctx, qopt, termfreqs); } } void QueryBranch::do_or_like(OrContext& ctx, QueryOptimiser * qopt, double factor, TermFreqs* termfreqs, Xapian::termcount elite_set_size, size_t first, bool keep_zero_weight) const { LOGCALL_VOID(MATCH, "QueryBranch::do_or_like", ctx | qopt | factor | termfreqs | elite_set_size | first | keep_zero_weight); // FIXME: we could optimise by merging OP_ELITE_SET and OP_OR like we do // for AND-like operations. // OP_SYNONYM with a single subquery is only simplified by // QuerySynonym::done() if the single subquery is a term or MatchAll. Assert(subqueries.size() >= 2 || get_op() == Query::OP_SYNONYM); size_t size_before = ctx.size(); QueryVector::const_iterator q; for (q = subqueries.begin() + first; q != subqueries.end(); ++q) { // MatchNothing subqueries should have been removed by done(). Assert((*q).internal); (*q).internal->postlist_sub_or_like(ctx, qopt, factor, termfreqs, keep_zero_weight); } size_t out_of = ctx.size() - size_before; if (elite_set_size && elite_set_size < out_of) { ctx.select_elite_set(elite_set_size, out_of); // FIXME: This isn't quite right as we flatten ORs under the ELITE_SET // and then pick from amongst all the subqueries. Consider: // // Query subqs[] = {q1 | q2, q3 | q4}; // Query q(OP_ELITE_SET, begin(subqs), end(subqs), 1); // // Here q should be either q1 | q2 or q3 | q4, but actually it'll be // just one of q1 or q2 or q3 or q4 (assuming those aren't themselves // OP_OR or OP_OR-like queries). } } PostListAndEstimate QueryBranch::do_synonym(QueryOptimiser* qopt, double factor, TermFreqs* termfreqs) const { LOGCALL(MATCH, PostListAndEstimate, "QueryBranch::do_synonym", qopt | factor | termfreqs); OrContext ctx(qopt, subqueries.size()); if (factor == 0.0) { // If we have a factor of 0, we don't care about the weights, so // we're just like a normal OR query. An OP_SYNONYM sets factor=0 // for its subqueries so this handles an OP_SYNONYM below an // OP_SYNONYM. do_bool_or_like(ctx, qopt, termfreqs); return ctx.postlist(termfreqs, true); } bool old_compound_weight = qopt->compound_weight; Assert(!old_compound_weight); qopt->compound_weight = true; TermFreqs synonym_freqs; do_bool_or_like(ctx, qopt, &synonym_freqs); PostListAndEstimate plest = ctx.postlist(&synonym_freqs, true); qopt->compound_weight = old_compound_weight; if (!plest.pl) return {}; // We currently assume wqf is 1 for calculating the synonym's weight // since conceptually the synonym is one "virtual" term. If we were // to combine multiple occurrences of the same synonym expansion into // a single instance with wqf set, we would want to track the wqf. // We build an OP_OR tree for OP_SYNONYM and then wrap it in a // SynonymPostList, which supplies the weights. RETURN(qopt->make_synonym_postlist(std::move(plest), factor, synonym_freqs)); } PostListAndEstimate QueryBranch::do_max(QueryOptimiser* qopt, double factor, TermFreqs* termfreqs) const { LOGCALL(MATCH, PostListAndEstimate, "QueryBranch::do_max", qopt | factor | termfreqs); OrContext ctx(qopt, subqueries.size()); if (factor == 0.0) { // Without the weights we're just like a normal OR query. do_bool_or_like(ctx, qopt, termfreqs); RETURN(ctx.postlist(termfreqs, true)); } // If termfreqs is set that means we're below a synonym, but in that case // factor should be 0.0 which is handled above. Assert(!termfreqs); do_or_like(ctx, qopt, factor, termfreqs); // We currently assume wqf is 1 for calculating the OP_MAX's weight // since conceptually the OP_MAX is one "virtual" term. If we were // to combine multiple occurrences of the same OP_MAX expansion into // a single instance with wqf set, we would want to track the wqf. RETURN(ctx.postlist_max()); } Xapian::Query::op QueryBranch::get_type() const noexcept { return get_op(); } size_t QueryBranch::get_num_subqueries() const noexcept { return subqueries.size(); } const Query QueryBranch::get_subquery(size_t n) const { return subqueries[n]; } const string QueryBranch::get_description_helper(const char * op, Xapian::termcount parameter) const { string desc = "("; QueryVector::const_iterator i; for (i = subqueries.begin(); i != subqueries.end(); ++i) { if (desc.size() > 1) { desc += op; if (parameter) { desc += str(parameter); desc += ' '; } } Assert((*i).internal); // MatchNothing subqueries should have been removed by done(), and we // shouldn't get called before done() is, since that happens at the // end of the Xapian::Query constructor. desc += (*i).internal->get_description(); } desc += ')'; return desc; } Query::Internal * QueryWindowed::done() { // If window size not specified, default it. if (window == 0) window = subqueries.size(); return QueryAndLike::done(); } void QueryScaleWeight::gather_terms(void * void_terms) const { subquery.internal->gather_terms(void_terms); } void QueryTerm::serialise(string & result) const { size_t len = term.size(); if (len == 0) { if (wqf == 1 && pos == 0) { // Query::MatchAll result += '\x0f'; } else { // Weird mutant versions of MatchAll result += '\x0e'; pack_uint(result, wqf); pack_uint(result, pos); } } else if (wqf == 1) { if (pos == 0) { // Single occurrence free-text term without position set. if (len >= 16) { result += static_cast(0x40 | 0x10); pack_uint(result, term.size() - 16); } else { result += static_cast(0x40 | 0x10 | len); } result += term; } else { // Single occurrence free-text term with position set. if (len >= 16) { result += static_cast(0x40 | 0x20); pack_uint(result, term.size() - 16); } else { result += static_cast(0x40 | 0x20 | len); } result += term; pack_uint(result, pos); } } else if (wqf > 1 || pos > 0) { // General case. if (len >= 16) { result += static_cast(0x40 | 0x30); pack_uint(result, term.size() - 16); } else if (len) { result += static_cast(0x40 | 0x30 | len); } result += term; pack_uint(result, wqf); pack_uint(result, pos); } else { // Typical boolean term. AssertEq(wqf, 0); AssertEq(pos, 0); if (len >= 16) { result += static_cast(0x40); pack_uint(result, term.size() - 16); } else { result += static_cast(0x40 | len); } result += term; } } void QueryPostingSource::serialise(string & result) const { result += static_cast(0x0c); pack_string(result, source->name()); pack_string(result, source->serialise()); } void QueryScaleWeight::serialise(string & result) const { Assert(subquery.internal); result += '\x0d'; result += serialise_double(scale_factor); subquery.internal->serialise(result); } void QueryAndLike::add_subquery(const Xapian::Query & subquery) { // If the AndLike is already MatchNothing, do nothing. if (subqueries.size() == 1 && !subqueries[0].internal) return; // If we're adding MatchNothing, discard any previous subqueries. if (!subquery.internal) subqueries.clear(); subqueries.push_back(subquery); } Query::Internal * QueryAndLike::done() { // Empty AndLike gives MatchNothing. if (subqueries.empty()) return NULL; // We handle any subquery being MatchNothing in add_subquery() by leaving // a single MatchNothing subquery, and so this check results in AndLike // giving MatchNothing. if (subqueries.size() == 1) return subqueries[0].internal.get(); return this; } PostListAndEstimate QueryAndLike::postlist(QueryOptimiser* qopt, double factor, TermFreqs* termfreqs) const { LOGCALL(QUERY, PostListAndEstimate, "QueryAndLike::postlist", qopt | factor | termfreqs); AndContext ctx(qopt, subqueries.size()); if (!postlist_sub_and_like(ctx, qopt, factor, termfreqs)) { RETURN({}); } RETURN(ctx.postlist(termfreqs)); } bool QueryAndLike::postlist_sub_and_like(AndContext& ctx, QueryOptimiser* qopt, double factor, TermFreqs* termfreqs) const { QueryVector::const_iterator i; for (i = subqueries.begin(); i != subqueries.end(); ++i) { // MatchNothing subqueries should have been removed by done(). Assert((*i).internal); if (!(*i).internal->postlist_sub_and_like(ctx, qopt, factor, termfreqs)) return false; } return true; } void QueryOrLike::add_subquery(const Xapian::Query & subquery) { // Drop any subqueries which are MatchNothing. if (subquery.internal) subqueries.push_back(subquery); } Query::Internal * QueryOrLike::done() { // An empty OrLike gives MatchNothing. Note that add_subquery() drops any // subqueries which are MatchNothing. if (subqueries.empty()) return NULL; if (subqueries.size() == 1) return subqueries[0].internal.get(); return this; } void QueryAndNot::add_subquery(const Xapian::Query & subquery) { if (!subqueries.empty()) { // We're adding the 2nd or subsequent subquery, so this subquery is // negated. if (!subqueries[0].internal) { // The left side is already MatchNothing so drop any right side. // // MatchNothing AND_NOT X == MatchNothing return; } if (!subquery.internal) { // Drop MatchNothing on the right of AndNot. // // X AND_NOT MatchNothing == X return; } if (subquery.get_type() == subquery.OP_SCALE_WEIGHT) { // Strip OP_SCALE_WEIGHT wrapping from queries on the right of // AndNot as no weight is taken from them. subqueries.push_back(subquery.get_subquery(0)); // The Query constructor for OP_SCALE_WEIGHT constructor should // eliminate OP_SCALE_WEIGHT applied to MatchNothing. Assert(subquery.get_subquery(0).internal); return; } } subqueries.push_back(subquery); } Query::Internal * QueryAndNot::done() { // Any MatchNothing right subqueries get discarded by add_subquery() - if // that leaves just the left subquery, return that. // // If left subquery is MatchNothing, then add_subquery() discards all right // subqueries, so this check also gives MatchNothing for this case. if (subqueries.size() == 1) return subqueries[0].internal.get(); return this; } void QueryAndMaybe::add_subquery(const Xapian::Query & subquery) { // If the left side of AndMaybe is already MatchNothing, do nothing. if (subqueries.size() == 1 && !subqueries[0].internal) return; // Drop any 2nd or subsequent subqueries which are MatchNothing. if (subquery.internal || subqueries.empty()) subqueries.push_back(subquery); } Query::Internal * QueryAndMaybe::done() { // Any MatchNothing right subqueries get discarded by add_subquery() - if // that leaves just the left subquery, return that. // // If left subquery is MatchNothing, then add_subquery() discards all right // subqueries, so this check also gives MatchNothing for this case. if (subqueries.size() == 1) return subqueries[0].internal.get(); return this; } PostListAndEstimate QueryOr::postlist(QueryOptimiser* qopt, double factor, TermFreqs* termfreqs) const { LOGCALL(QUERY, PostListAndEstimate, "QueryOr::postlist", qopt | factor | termfreqs); OrContext ctx(qopt, subqueries.size()); if (factor == 0.0) { do_bool_or_like(ctx, qopt, termfreqs); RETURN(ctx.postlist(termfreqs, true)); } do_or_like(ctx, qopt, factor, termfreqs); RETURN(ctx.postlist(termfreqs)); } void QueryOr::postlist_sub_or_like(OrContext& ctx, QueryOptimiser* qopt, double factor, TermFreqs* termfreqs, bool keep_zero_weight) const { do_or_like(ctx, qopt, factor, termfreqs, 0, 0, keep_zero_weight); } void QueryOr::postlist_sub_bool_or_like(OrContext& ctx, QueryOptimiser* qopt, TermFreqs* termfreqs) const { do_bool_or_like(ctx, qopt, termfreqs); } PostListAndEstimate QueryAndNot::postlist(QueryOptimiser* qopt, double factor, TermFreqs* termfreqs) const { LOGCALL(QUERY, PostListAndEstimate, "QueryAndNot::postlist", qopt | factor | termfreqs); AndContext ctx(qopt, 1); if (!QueryAndNot::postlist_sub_and_like(ctx, qopt, factor, termfreqs)) { RETURN({}); } RETURN(ctx.postlist(termfreqs)); } bool QueryAndNot::postlist_sub_and_like(AndContext& ctx, QueryOptimiser* qopt, double factor, TermFreqs* termfreqs) const { // This invariant should be established by QueryAndNot::done() with // assistance from QueryAndNot::add_subquery(). Assert(subqueries[0].internal); if (!subqueries[0].internal->postlist_sub_and_like(ctx, qopt, factor, termfreqs)) { return false; } do_bool_or_like(ctx.get_not_ctx(subqueries.size() - 1), qopt, termfreqs, 1); return true; } PostListAndEstimate QueryXor::postlist(QueryOptimiser* qopt, double factor, TermFreqs* termfreqs) const { LOGCALL(QUERY, PostListAndEstimate, "QueryXor::postlist", qopt | factor | termfreqs); XorContext ctx(qopt, subqueries.size()); postlist_sub_xor(ctx, qopt, factor, termfreqs); RETURN(ctx.postlist(termfreqs)); } void QueryXor::postlist_sub_xor(XorContext& ctx, QueryOptimiser* qopt, double factor, TermFreqs* termfreqs) const { QueryVector::const_iterator i; for (i = subqueries.begin(); i != subqueries.end(); ++i) { // MatchNothing subqueries should have been removed by done(). Assert((*i).internal); (*i).internal->postlist_sub_xor(ctx, qopt, factor, termfreqs); } } PostListAndEstimate QueryAndMaybe::postlist(QueryOptimiser* qopt, double factor, TermFreqs* termfreqs) const { LOGCALL(QUERY, PostListAndEstimate, "QueryAndMaybe::postlist", qopt | factor | termfreqs); AndContext ctx(qopt, 1); if (!QueryAndMaybe::postlist_sub_and_like(ctx, qopt, factor, termfreqs)) { RETURN({}); } RETURN(ctx.postlist(termfreqs)); } bool QueryAndMaybe::postlist_sub_and_like(AndContext& ctx, QueryOptimiser* qopt, double factor, TermFreqs* termfreqs) const { // This invariant should be established by QueryAndMaybe::done() with // assistance from QueryAndMaybe::add_subquery(). Assert(subqueries[0].internal); if (!subqueries[0].internal->postlist_sub_and_like(ctx, qopt, factor, termfreqs)) { return false; } // We only need to consider the right branch or branches if we're weighted // - an unweighted OP_AND_MAYBE can be replaced with its left branch. if (factor != 0.0) { // Only keep zero-weight subqueries if we need their wdf because // they're underneath a compound weight. OrContext& maybe_ctx = ctx.get_maybe_ctx(subqueries.size() - 1); bool need_wdf = qopt->need_wdf_for_compound_weight(); bool save_no_estimates = qopt->get_no_estimates(); qopt->set_no_estimates(true); do_or_like(maybe_ctx, qopt, factor, termfreqs, 0, 1, need_wdf); qopt->set_no_estimates(save_no_estimates); } return true; } PostListAndEstimate QueryFilter::postlist(QueryOptimiser* qopt, double factor, TermFreqs* termfreqs) const { LOGCALL(QUERY, PostListAndEstimate, "QueryFilter::postlist", qopt | factor | termfreqs); AndContext ctx(qopt, subqueries.size()); if (!QueryFilter::postlist_sub_and_like(ctx, qopt, factor, termfreqs)) { RETURN({}); } RETURN(ctx.postlist(termfreqs)); } bool QueryFilter::postlist_sub_and_like(AndContext& ctx, QueryOptimiser* qopt, double factor, TermFreqs* termfreqs) const { QueryVector::const_iterator i; for (i = subqueries.begin(); i != subqueries.end(); ++i) { // MatchNothing subqueries should have been removed by done(). Assert((*i).internal); if (!(*i).internal->postlist_sub_and_like(ctx, qopt, factor, termfreqs)) return false; // Second and subsequent subqueries are unweighted. factor = 0.0; } return true; } bool QueryWindowed::postlist_windowed(Query::op op, AndContext& ctx, QueryOptimiser* qopt, double factor, TermFreqs* termfreqs) const { if (!qopt->db.has_positions()) { // No positions in this subdatabase so this matches nothing, which // means the whole andcontext matches nothing. // // Bailing out here means we don't recurse deeper and that means we // don't call QueryOptimiser::inc_total_subqs() for leaf postlists in // the phrase, but at least one shard will count them, and the matcher // takes the highest answer (since 1.4.6). ctx.shrink(0); return false; } bool old_need_positions = qopt->need_positions; qopt->need_positions = true; bool result = true; QueryVector::const_iterator i; for (i = subqueries.begin(); i != subqueries.end(); ++i) { // MatchNothing subqueries should have been removed by done(). Assert((*i).internal); PostListAndEstimate plest = (*i).internal->postlist(qopt, factor, NULL); if (plest.pl && (*i).internal->get_type() != Query::LEAF_TERM) { plest.pl = new OrPosPostList(plest.pl); } result = ctx.add_postlist(std::move(plest), termfreqs); if (!result) { if (factor == 0.0) break; // If we don't complete the iteration, the subquery count may be // wrong, and weighting information may not be filled in. while (i != subqueries.end()) { // MatchNothing subqueries should have been removed by done(). // FIXME: Can we handle this more gracefully? Assert((*i).internal); qopt->destroy_postlist((*i).internal->postlist(qopt, factor, NULL).pl); ++i; } break; } } if (result) { // Record the positional filter to apply higher up the tree. ctx.add_pos_filter(op, subqueries.size(), window); } qopt->need_positions = old_need_positions; return result; } bool QueryPhrase::postlist_sub_and_like(AndContext& ctx, QueryOptimiser* qopt, double factor, TermFreqs* termfreqs) const { constexpr auto OP_PHRASE = Query::OP_PHRASE; return QueryWindowed::postlist_windowed(OP_PHRASE, ctx, qopt, factor, termfreqs); } bool QueryNear::postlist_sub_and_like(AndContext& ctx, QueryOptimiser* qopt, double factor, TermFreqs* termfreqs) const { constexpr auto OP_NEAR = Query::OP_NEAR; return QueryWindowed::postlist_windowed(OP_NEAR, ctx, qopt, factor, termfreqs); } PostListAndEstimate QueryEliteSet::postlist(QueryOptimiser* qopt, double factor, TermFreqs* termfreqs) const { LOGCALL(QUERY, PostListAndEstimate, "QueryEliteSet::postlist", qopt | factor | termfreqs); OrContext ctx(qopt, subqueries.size()); do_or_like(ctx, qopt, factor, termfreqs, set_size); RETURN(ctx.postlist(termfreqs)); } void QueryEliteSet::postlist_sub_or_like(OrContext& ctx, QueryOptimiser* qopt, double factor, TermFreqs* termfreqs, bool keep_zero_weight) const { do_or_like(ctx, qopt, factor, termfreqs, set_size, 0, keep_zero_weight); } PostListAndEstimate QuerySynonym::postlist(QueryOptimiser* qopt, double factor, TermFreqs* termfreqs) const { LOGCALL(QUERY, PostListAndEstimate, "QuerySynonym::postlist", qopt | factor | termfreqs); // Save and restore total_subqs so we only add one for the whole // OP_SYNONYM subquery (or none if we're not weighted). Xapian::termcount save_total_subqs = qopt->get_total_subqs(); if (factor != 0.0) ++save_total_subqs; PostListAndEstimate plest = do_synonym(qopt, factor, termfreqs); qopt->set_total_subqs(save_total_subqs); return plest; } Query::Internal * QuerySynonym::done() { // An empty Synonym gives MatchNothing. Note that add_subquery() drops any // subqueries which are MatchNothing. if (subqueries.empty()) return NULL; if (subqueries.size() == 1) { Query::op sub_type = subqueries[0].get_type(); // Synonym of a single subquery should only be simplified if that // subquery is a term (or MatchAll), or if it's also OP_SYNONYM. Note // that MatchNothing subqueries are dropped, so we'd never get here // with a single MatchNothing subquery. if (sub_type == Query::LEAF_TERM || sub_type == Query::LEAF_MATCH_ALL || sub_type == Query::OP_SYNONYM) { return subqueries[0].internal.get(); } if (sub_type == Query::OP_WILDCARD) { auto q = static_cast(subqueries[0].internal.get()); // SYNONYM over WILDCARD X -> WILDCARD SYNONYM for any combiner X. return q->change_combiner(Query::OP_SYNONYM); } if (sub_type == Query::OP_EDIT_DISTANCE) { auto q = static_cast(subqueries[0].internal.get()); // SYNONYM over EDIT_DISTANCE X -> EDIT_DISTANCE SYNONYM for any // combiner X. return q->change_combiner(Query::OP_SYNONYM); } } return this; } PostListAndEstimate QueryMax::postlist(QueryOptimiser* qopt, double factor, TermFreqs* termfreqs) const { LOGCALL(QUERY, PostListAndEstimate, "QueryMax::postlist", qopt | factor | termfreqs); // Save and restore total_subqs so we only add one for the whole // OP_MAX subquery (or none if we're not weighted). Xapian::termcount save_total_subqs = qopt->get_total_subqs(); if (factor != 0.0) ++save_total_subqs; PostListAndEstimate plest = do_max(qopt, factor, termfreqs); qopt->set_total_subqs(save_total_subqs); return plest; } Xapian::Query::op QueryAnd::get_op() const { return Xapian::Query::OP_AND; } Xapian::Query::op QueryOr::get_op() const { return Xapian::Query::OP_OR; } Xapian::Query::op QueryAndNot::get_op() const { return Xapian::Query::OP_AND_NOT; } Xapian::Query::op QueryXor::get_op() const { return Xapian::Query::OP_XOR; } Xapian::Query::op QueryAndMaybe::get_op() const { return Xapian::Query::OP_AND_MAYBE; } Xapian::Query::op QueryFilter::get_op() const { return Xapian::Query::OP_FILTER; } Xapian::Query::op QueryNear::get_op() const { return Xapian::Query::OP_NEAR; } Xapian::Query::op QueryPhrase::get_op() const { return Xapian::Query::OP_PHRASE; } Xapian::Query::op QueryEliteSet::get_op() const { return Xapian::Query::OP_ELITE_SET; } Xapian::Query::op QuerySynonym::get_op() const { return Xapian::Query::OP_SYNONYM; } Xapian::Query::op QueryMax::get_op() const { return Xapian::Query::OP_MAX; } string QueryAnd::get_description() const { return get_description_helper(" AND "); } string QueryOr::get_description() const { return get_description_helper(" OR "); } string QueryAndNot::get_description() const { return get_description_helper(" AND_NOT "); } string QueryXor::get_description() const { return get_description_helper(" XOR "); } string QueryAndMaybe::get_description() const { return get_description_helper(" AND_MAYBE "); } string QueryFilter::get_description() const { return get_description_helper(" FILTER "); } string QueryNear::get_description() const { return get_description_helper(" NEAR ", window); } string QueryPhrase::get_description() const { return get_description_helper(" PHRASE ", window); } string QueryEliteSet::get_description() const { return get_description_helper(" ELITE_SET ", set_size); } string QuerySynonym::get_description() const { if (subqueries.size() == 1) { string d = "(SYNONYM "; d += subqueries[0].internal->get_description(); d += ")"; return d; } return get_description_helper(" SYNONYM "); } string QueryMax::get_description() const { return get_description_helper(" MAX "); } Xapian::Query::op QueryInvalid::get_type() const noexcept { return Xapian::Query::OP_INVALID; } PostListAndEstimate QueryInvalid::postlist(QueryOptimiser*, double, TermFreqs*) const { throw Xapian::InvalidOperationError("Query is invalid"); } void QueryInvalid::serialise(std::string & result) const { result += static_cast(0x00); } string QueryInvalid::get_description() const { return ""; } } }