1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343
|
/* Copyright (C) 2017-2019 J.F.Dockes
*
* License: GPL 2.1
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#include "autoconfig.h"
#include "hldata.h"
#include <algorithm>
#include <limits.h>
#include "log.h"
#include "smallut.h"
using std::string;
using std::unordered_map;
using std::vector;
using std::pair;
#undef DEBUGGROUPS
#ifdef DEBUGGROUPS
#define LOGRP LOGINF
#else
#define LOGRP LOGDEB1
#endif
// Combined position list for or'd terms
struct OrPList {
void addplist(const string& term, const vector<size_t>* pl) {
terms.push_back(term);
plists.push_back(pl);
indexes.push_back(0);
totalsize += pl->size();
}
// Returns -1 for eof, else the next smallest value in the
// combined lists, according to the current indexes.
int value() {
size_t minval = INT_MAX;
int minidx = -1;
for (unsigned ii = 0; ii < indexes.size(); ii++) {
const vector<size_t>& pl(*plists[ii]);
if (indexes[ii] >= pl.size())
continue; // this list done
if (pl[indexes[ii]] < minval) {
minval = pl[indexes[ii]];
minidx = ii;
}
}
if (minidx != -1) {
LOGRP("OrPList::value() -> " << minval << " for " << terms[minidx] << "\n");
currentidx = minidx;
return static_cast<int>(minval);
} else {
LOGRP("OrPList::value(): EOL for " << stringsToString(terms)<<"\n");
return -1;
}
}
int next() {
if (currentidx != -1) {
indexes[currentidx]++;
}
return value();
}
size_t size() const {
return totalsize;
}
void rewind() {
for (auto& idx : indexes) {
idx = 0;
}
currentidx = -1;
}
vector<const vector<size_t>*> plists;
vector<unsigned int> indexes;
vector<string> terms;
int currentidx{-1};
size_t totalsize{0};
};
static inline void setWinMinMax(int pos, int& sta, int& sto)
{
if (pos < sta) {
sta = pos;
}
if (pos > sto) {
sto = pos;
}
}
/*
* @param window the total width for the "near" area, in positions.
* @param plists the position vectors for the terms. The array is
* sorted shorted first for optimization. The function does a
* recursive call on the next array if the match is still possible
* after dealing with the current one
* @param plist_idx the index for the position list we will work with.
* @param min, max the current minimum and maximum term positions.
* @param[output] sp, ep, the start and end positions of the found match.
* @param minpos Highest end of a found match. While looking for
* further matches, we don't want the search to extend before
* this, because it does not make sense for highlight regions to
* overlap.
* @param isphrase if true, the position lists are in term order, and
* we only look for the next match beyond the current window top.
*/
static bool do_proximity_test(
const int window, vector<OrPList>& plists,
unsigned int plist_idx, int min, int max, int *sp, int *ep, int minpos,
bool isphrase)
{
// Overlap interdiction: possibly adjust window start by input minpos
int actualminpos = isphrase ? max + 1 : max + 1 - window;
if (actualminpos < minpos)
actualminpos = minpos;
LOGRP("do_prox_test: win " << window << " plist_idx " << plist_idx <<
" min " << min << " max " << max << " minpos " << minpos <<
" isphrase " << isphrase << " actualminpos " << actualminpos << "\n");
// Find 1st position bigger than window start. A previous call may
// have advanced the index, so we begin by retrieving the current
// value.
int nextpos = plists[plist_idx].value();
while (nextpos != -1 && nextpos < actualminpos)
nextpos = plists[plist_idx].next();
// Look for position inside window. If not found, no match. If
// found: if this is the last list we're done, else recurse on
// next list after adjusting the window
while (nextpos != -1) {
if (nextpos > min + window - 1) {
return false;
}
if (plist_idx + 1 == plists.size()) {
// We already checked pos > min, now we also have pos <
// max, and we are the last list: done: set return values.
setWinMinMax(nextpos, *sp, *ep);
return true;
}
setWinMinMax(nextpos, min, max);
if (do_proximity_test(window, plists, plist_idx + 1,
min, max, sp, ep, minpos, isphrase)) {
return true;
}
nextpos = plists[plist_idx].next();
}
return false;
}
// Find matches for one group of terms
bool matchGroup(const HighlightData& hldata,
unsigned int grpidx,
const unordered_map<string, vector<size_t>>& inplists,
const unordered_map<size_t, pair<size_t,size_t>>& gpostobytes,
vector<GroupMatchEntry>& tboffs)
{
const auto& tg(hldata.index_term_groups[grpidx]);
bool isphrase = tg.kind == HighlightData::TermGroup::TGK_PHRASE;
#ifdef DEBUGGROUPS
string allplterms;
for (const auto& entry:inplists) {
allplterms += entry.first + " ";
}
LOGRP("matchGroup: isphrase " << isphrase << ". Have plists for [" << allplterms << "]\n");
//LOGRP("matchGroup: hldata: " << hldata.toString() << std::endl);
#endif
int window = int(tg.orgroups.size() + tg.slack);
// The position lists we are going to work with. We extract them from the
// (string->plist) map
vector<OrPList> orplists;
// Find the position list for each term in the group and build the combined lists for the term
// or groups (each group is the result of the exansion of one user term). It is possible that
// this particular group was not actually matched by the search, so that some terms are not
// found, in which case we bail out.
for (const auto& group : tg.orgroups) {
orplists.push_back(OrPList());
for (const auto& term : group) {
const auto pl = inplists.find(term);
if (pl == inplists.end()) {
LOGRP("Matchgroup::matchGroup: term [" << term << "] not found in plists\n");
continue;
}
orplists.back().addplist(pl->first, &(pl->second));
}
if (orplists.back().plists.empty()) {
LOGRP("No positions list found for OR group [" << stringsToString(group) <<
"] : input has no group match, returning false\n");
return false;
} else {
LOGRP("Created OrPList has " << orplists.back().plists.size() << " members\n");
}
}
// I think this can't actually happen, was useful when we used to
// prune the groups, but doesn't hurt.
if (orplists.size() < 2) {
LOGRP("Matchgroup::matchGroup: no actual groups found\n");
return false;
}
if (!isphrase) {
// Sort the positions lists so that the shorter is first
std::sort(orplists.begin(), orplists.end(),
[](const OrPList& a, const OrPList& b) -> bool {
return a.size() < b.size();
}
);
}
// Minpos is the highest end of a found match. While looking for
// further matches, we don't want the search to extend before
// this, because it does not make sense for highlight regions to
// overlap
int minpos = 0;
// Walk the shortest plist and look for matches
int pos;
while ((pos = orplists[0].next()) != -1) {
int sta = INT_MAX, sto = 0;
LOGDEB2("MatchGroup: Testing at pos " << pos << "\n");
if (do_proximity_test(
window, orplists, 1, pos, pos, &sta, &sto, minpos, isphrase)) {
setWinMinMax(pos, sta, sto);
LOGRP("Matchgroup::matchGroup: MATCH termpos [" << sta <<
"," << sto << "]\n");
minpos = sto + 1;
// Translate the position window into a byte offset window
auto i1 = gpostobytes.find(sta);
auto i2 = gpostobytes.find(sto);
if (i1 != gpostobytes.end() && i2 != gpostobytes.end()) {
LOGDEB2("Matchgroup::matchGroup: pushing bpos " <<
i1->second.first << " " << i2->second.second << "\n");
tboffs.push_back(GroupMatchEntry(i1->second.first,
i2->second.second, grpidx));
} else {
LOGDEB0("matchGroup: no bpos found for " << sta << " or " << sto << "\n");
}
} else {
LOGRP("matchGroup: no group match found at this position\n");
}
}
return !tboffs.empty();
}
vector<CharFlags> kindflags {
CHARFLAGENTRY(HighlightData::TermGroup::TGK_TERM),
CHARFLAGENTRY(HighlightData::TermGroup::TGK_NEAR),
CHARFLAGENTRY(HighlightData::TermGroup::TGK_PHRASE),
};
string HighlightData::toString() const
{
string out;
out.append("\nUser terms (orthograph): ");
for (const auto& term : uterms) {
out.append(" [").append(term).append("]");
}
out.append("\nUser terms to Query terms:");
for (const auto& entry: terms) {
out.append("[").append(entry.first).append("]->[");
out.append(entry.second).append("] ");
}
out.append("\nGroups: ");
char cbuf[200];
snprintf(cbuf, sizeof(cbuf), "index_term_groups size %d ugroups size %d",
int(index_term_groups.size()), int(ugroups.size()));
out.append(cbuf);
size_t ugidx = (size_t) - 1;
for (const HighlightData::TermGroup &tg : index_term_groups) {
if (ugidx != tg.grpsugidx) {
ugidx = tg.grpsugidx;
out.append("\n(");
for (unsigned int j = 0; j < ugroups[ugidx].size(); j++) {
out.append("[").append(ugroups[ugidx][j]).append("] ");
}
out.append(") ->");
}
if (tg.kind == HighlightData::TermGroup::TGK_TERM) {
out.append(" <").append(tg.term).append(">");
} else {
out.append(" {");
for (unsigned int j = 0; j < tg.orgroups.size(); j++) {
out.append(" {");
for (unsigned int k = 0; k < tg.orgroups[j].size(); k++) {
out.append("[").append(tg.orgroups[j][k]).append("]");
}
out.append("}");
}
out.append(" ");
out.append(valToString(kindflags,tg.kind)).append("-").append(std::to_string(tg.slack));
out.append(" }");
}
}
out.append("\n");
out.append("\nTerms generated by spelling approximation:");
out.append(stringsToString(spellexpands));
out.append("\n");
return out;
}
void HighlightData::append(const HighlightData& hl)
{
uterms.insert(hl.uterms.begin(), hl.uterms.end());
terms.insert(hl.terms.begin(), hl.terms.end());
size_t ugsz0 = ugroups.size();
ugroups.insert(ugroups.end(), hl.ugroups.begin(), hl.ugroups.end());
size_t itgsize = index_term_groups.size();
index_term_groups.insert(index_term_groups.end(),
hl.index_term_groups.begin(),
hl.index_term_groups.end());
// Adjust the grpsugidx values for the newly inserted entries
for (auto idx = itgsize; idx < index_term_groups.size(); idx++) {
index_term_groups[idx].grpsugidx += ugsz0;
}
spellexpands.insert(spellexpands.end(), hl.spellexpands.begin(), hl.spellexpands.end());
}
|