1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152
|
// Copyright 2014 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef COMPONENTS_QUERY_PARSER_QUERY_PARSER_H_
#define COMPONENTS_QUERY_PARSER_QUERY_PARSER_H_
#include <stddef.h>
#include <memory>
#include <string>
#include <vector>
#include "components/query_parser/snippet.h"
namespace query_parser {
class QueryNodeList;
// Used by HasMatchIn.
struct QueryWord {
// The word to match against.
std::u16string word;
// The starting position of the word in the original text.
size_t position;
};
enum class MatchingAlgorithm {
// Only words long enough are considered for prefix search. Shorter words are
// considered for exact matches.
DEFAULT,
// All words are considered for a prefix search.
ALWAYS_PREFIX_SEARCH,
kMaxValue = ALWAYS_PREFIX_SEARCH,
};
using QueryWordVector = std::vector<query_parser::QueryWord>;
// `QueryNode` is used by `QueryParser` to represent the elements that
// constitute a query. While `QueryNode` is exposed by way of `ParseQuery`, it
// really isn't meant for external usage.
class QueryNode {
public:
virtual ~QueryNode() = default;
// Serialize ourselves out to a string that can be passed to SQLite. Returns
// the number of words in this node.
virtual int AppendToSQLiteQuery(std::u16string* query) const = 0;
// Return true if this is a `QueryNodeWord`, false if it's a `QueryNodeList`.
virtual bool IsWord() const = 0;
// Returns true if this node matches `word`. If `exact` is true, the string
// must exactly match. Otherwise, this uses a starts-with comparison.
virtual bool Matches(const std::u16string& word, bool exact) const = 0;
// Returns true if this node matches at least one of the words in `words`. An
// entry is added to `match_positions` for all matching words giving the
// matching regions. Uses a starts-with comparison.
virtual bool HasMatchIn(const QueryWordVector& words,
Snippet::MatchPositions* match_positions) const = 0;
// Returns true if this node matches at least one of the words in `words`.
// If `exact` is true, at least one of the words must be an exact match.
virtual bool HasMatchIn(const QueryWordVector& words, bool exact) const = 0;
// Appends the words that make up this node in `words`.
virtual void AppendWords(std::vector<std::u16string>* words) const = 0;
};
using QueryNodeVector = std::vector<std::unique_ptr<query_parser::QueryNode>>;
// This class is used to parse queries entered into the history search into more
// normalized queries that can be passed to the SQLite backend.
class QueryParser {
public:
QueryParser() = delete;
QueryParser(const QueryParser&) = delete;
QueryParser& operator=(const QueryParser&) = delete;
~QueryParser() = delete;
// For CJK ideographs and Korean Hangul, even a single character
// can be useful in prefix matching, but that may give us too many
// false positives. Moreover, the current ICU word breaker gives us
// back every single Chinese character as a word so that there's no
// point doing anything for them and we only adjust the minimum length
// to 2 for Korean Hangul while using 3 for others. This is a temporary
// hack until we have a segmentation support.
static bool IsWordLongEnoughForPrefixSearch(
const std::u16string& word,
MatchingAlgorithm matching_algorithm);
// Parse a query into a SQLite query. The resulting query is placed in
// |sqlite_query| and the number of words is returned.
static int ParseQuery(const std::u16string& query,
MatchingAlgorithm matching_algorithm,
std::u16string* sqlite_query);
// Parses |query|, returning the words that make up it. Any words in quotes
// are put in |words| without the quotes. For example, the query text
// "foo bar" results in two entries being added to words, one for foo and one
// for bar.
static void ParseQueryWords(const std::u16string& query,
MatchingAlgorithm matching_algorithm,
std::vector<std::u16string>* words);
// Parses |query|, returning the nodes that constitute the valid words in the
// query. This is intended for later usage with DoesQueryMatch. Ownership of
// the nodes passes to the caller.
static void ParseQueryNodes(const std::u16string& query,
MatchingAlgorithm matching_algorithm,
QueryNodeVector* nodes);
// Returns true if all of the |find_nodes| are found in |find_in_text|.
// |find_nodes| should have been created by calling |ParseQuery()|. If all
// nodes were successfully found, each of the matching positions in the text
// is added to |match_positions|.
static bool DoesQueryMatch(const std::u16string& find_in_text,
const QueryNodeVector& find_nodes,
Snippet::MatchPositions* match_positions);
// Returns true if all of the |find_nodes| are found in |find_in_words|.
// |find_nodes| should have been created by calling |ParseQuery()|.
// If |exact| is set to true, only exact matches are considered matches.
static bool DoesQueryMatch(const QueryWordVector& find_in_words,
const QueryNodeVector& find_nodes,
bool exact = false);
// Extracts the words from |text|, placing each word into |words|.
// |text| must already be lowercased by the caller, as otherwise the output
// will NEVER match anything.
static void ExtractQueryWords(const std::u16string& text,
QueryWordVector* words);
// Sorts the match positions in |matches| by their first index, then
// coalesces any match positions that intersect each other.
static void SortAndCoalesceMatchPositions(Snippet::MatchPositions* matches);
private:
// Does the work of parsing |query|; creates nodes in |root| as appropriate.
// This is invoked from both of the ParseQuery methods.
static bool ParseQueryImpl(const std::u16string& query,
MatchingAlgorithm matching_algorithm,
QueryNodeList* root);
};
} // namespace query_parser
#endif // COMPONENTS_QUERY_PARSER_QUERY_PARSER_H_
|