1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190
|
/** @file
* @brief internal class representing a term in a modified document
*/
/* Copyright 1999,2000,2001 BrightStation PLC
* Copyright 2002 Ananova Ltd
* Copyright 2003,2007,2018 Olly Betts
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/
#ifndef OM_HGUARD_DOCUMENTTERM_H
#define OM_HGUARD_DOCUMENTTERM_H
#include "debuglog.h"
#include <string>
#include <vector>
#include <xapian/types.h>
using std::string;
using std::vector;
/// A term in a document.
class OmDocumentTerm {
public:
/** Make a new term.
*
* @param wdf_ Initial wdf.
*/
explicit OmDocumentTerm(Xapian::termcount wdf_)
: wdf(wdf_)
{
LOGCALL_CTOR(DB, "OmDocumentTerm", wdf_);
}
/** Within document frequency of the term.
* This is the number of occurrences of the term in the document.
*/
Xapian::termcount wdf;
/** Split point in the position range.
*
* To allow more efficient insertion of positions, we support the
* positions being split into two sorted ranges, and if this is the
* case, split will be > 0 and there will be two sorted ranges [0, split)
* and [split, positions.size()).
*
* If split is 0, then [0, positions.size()) form a single sorted range.
*
* If positions.empty(), then split > 0 indicates that the term has been
* deleted (this allows us to delete terms without invalidating existing
* TermIterator objects).
*
* Use type unsigned here to avoid bloating this structure. More than
* 4 billion positions in one document is not sensible (and not possible
* unless termpos is configured to be 64 bit).
*/
mutable unsigned split = 0;
/** Merge sorted ranges before and after @a split. */
void merge() const;
typedef vector<Xapian::termpos> term_positions;
private:
/** Positional information.
*
* This is a list of positions at which the term occurs in the
* document. The list is in strictly increasing order of term
* position.
*
* The positions start at 1.
*
* Note that, even if positional information is present, the WDF might
* not be equal to the length of the position list, since a term might
* occur multiple times at a single position, but will only have one
* entry in the position list for each position.
*/
mutable term_positions positions;
public:
const term_positions* get_vector_termpos() const {
merge();
return &positions;
}
Xapian::termcount positionlist_count() const {
return positions.size();
}
void remove() {
positions.clear();
split = 1;
}
/** Add a position.
*
* If @a termpos is already present, this is a no-op.
*
* @param wdf_inc wdf increment
* @param termpos Position to add
*
* @return true if the term was flagged as deleted before the operation.
*/
bool add_position(Xapian::termcount wdf_inc, Xapian::termpos termpos);
/** Append a position.
*
* The position must be >= the largest currently in the list.
*/
void append_position(Xapian::termpos termpos) {
positions.push_back(termpos);
}
/** Remove an entry from the position list.
*
* This removes an entry from the list of positions.
*
* This does not change the value of the wdf.
*
* @exception Xapian::InvalidArgumentError is thrown if the position does
* not occur in the position list.
*/
void remove_position(Xapian::termpos tpos);
/** Remove a range of positions.
*
* @param termpos_first First position to remove
* @param termpos_last Last position to remove
*
* It's OK if there are no positions in the specified range.
*
* @return the number of positions removed.
*/
Xapian::termpos remove_positions(Xapian::termpos termpos_first,
Xapian::termpos termpos_last);
/** Increase within-document frequency.
*
* @return true if the term was flagged as deleted before the operation.
*/
bool increase_wdf(Xapian::termcount delta) {
if (rare(is_deleted())) {
split = 0;
wdf = delta;
return true;
}
wdf += delta;
return false;
}
/// Decrease within-document frequency.
void decrease_wdf(Xapian::termcount delta) {
// Saturating arithmetic - don't let the wdf go below zero.
if (wdf >= delta) {
wdf -= delta;
} else {
wdf = 0;
}
}
/// Get the wdf
Xapian::termcount get_wdf() const { return wdf; }
/** Has this term been deleted from this document?
*
* We flag entries as deleted instead of actually deleting them to avoid
* invalidating existing TermIterator objects.
*/
bool is_deleted() const { return positions.empty() && split > 0; }
/// Return a string describing this object.
string get_description() const;
};
#endif // OM_HGUARD_DOCUMENTTERM_H
|