File: documentterm.h

package info (click to toggle)
xapian-core 1.4.29-3
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 22,840 kB
  • sloc: cpp: 92,356; ansic: 9,948; sh: 5,026; perl: 850; makefile: 509; javascript: 360; tcl: 319; python: 40
file content (190 lines) | stat: -rw-r--r-- 5,624 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
/** @file
 * @brief internal class representing a term in a modified document
 */
/* Copyright 1999,2000,2001 BrightStation PLC
 * Copyright 2002 Ananova Ltd
 * Copyright 2003,2007,2018 Olly Betts
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */

#ifndef OM_HGUARD_DOCUMENTTERM_H
#define OM_HGUARD_DOCUMENTTERM_H

#include "debuglog.h"

#include <string>
#include <vector>

#include <xapian/types.h>

using std::string;
using std::vector;

/// A term in a document.
class OmDocumentTerm {
    public:
    /** Make a new term.
     *
     *  @param wdf_   Initial wdf.
     */
    explicit OmDocumentTerm(Xapian::termcount wdf_)
	: wdf(wdf_)
    {
	LOGCALL_CTOR(DB, "OmDocumentTerm", wdf_);
    }

    /** Within document frequency of the term.
     *  This is the number of occurrences of the term in the document.
     */
    Xapian::termcount wdf;

    /** Split point in the position range.
     *
     *  To allow more efficient insertion of positions, we support the
     *  positions being split into two sorted ranges, and if this is the
     *  case, split will be > 0 and there will be two sorted ranges [0, split)
     *  and [split, positions.size()).
     *
     *  If split is 0, then [0, positions.size()) form a single sorted range.
     *
     *  If positions.empty(), then split > 0 indicates that the term has been
     *  deleted (this allows us to delete terms without invalidating existing
     *  TermIterator objects).
     *
     *  Use type unsigned here to avoid bloating this structure.  More than
     *  4 billion positions in one document is not sensible (and not possible
     *  unless termpos is configured to be 64 bit).
     */
    mutable unsigned split = 0;

    /** Merge sorted ranges before and after @a split. */
    void merge() const;

    typedef vector<Xapian::termpos> term_positions;

  private:
    /** Positional information.
     *
     *  This is a list of positions at which the term occurs in the
     *  document. The list is in strictly increasing order of term
     *  position.
     *
     *  The positions start at 1.
     *
     *  Note that, even if positional information is present, the WDF might
     *  not be equal to the length of the position list, since a term might
     *  occur multiple times at a single position, but will only have one
     *  entry in the position list for each position.
     */
    mutable term_positions positions;

  public:
    const term_positions* get_vector_termpos() const {
	merge();
	return &positions;
    }

    Xapian::termcount positionlist_count() const {
	return positions.size();
    }

    void remove() {
	positions.clear();
	split = 1;
    }

    /** Add a position.
     *
     *  If @a termpos is already present, this is a no-op.
     *
     *  @param wdf_inc  wdf increment
     *  @param termpos	Position to add
     *
     *  @return true if the term was flagged as deleted before the operation.
     */
    bool add_position(Xapian::termcount wdf_inc, Xapian::termpos termpos);

    /** Append a position.
     *
     *  The position must be >= the largest currently in the list.
     */
    void append_position(Xapian::termpos termpos) {
	positions.push_back(termpos);
    }

    /** Remove an entry from the position list.
     *
     *  This removes an entry from the list of positions.
     *
     *  This does not change the value of the wdf.
     *
     *  @exception Xapian::InvalidArgumentError is thrown if the position does
     *  not occur in the position list.
     */
    void remove_position(Xapian::termpos tpos);

    /** Remove a range of positions.
     *
     *  @param termpos_first	First position to remove
     *  @param termpos_last	Last position to remove
     *
     *  It's OK if there are no positions in the specified range.
     *
     *  @return the number of positions removed.
     */
    Xapian::termpos remove_positions(Xapian::termpos termpos_first,
				     Xapian::termpos termpos_last);

    /** Increase within-document frequency.
     *
     *  @return true if the term was flagged as deleted before the operation.
     */
    bool increase_wdf(Xapian::termcount delta) {
	if (rare(is_deleted())) {
	    split = 0;
	    wdf = delta;
	    return true;
	}
	wdf += delta;
	return false;
    }

    /// Decrease within-document frequency.
    void decrease_wdf(Xapian::termcount delta) {
	// Saturating arithmetic - don't let the wdf go below zero.
	if (wdf >= delta) {
	    wdf -= delta;
	} else {
	    wdf = 0;
	}
    }

    /// Get the wdf
    Xapian::termcount get_wdf() const { return wdf; }

    /** Has this term been deleted from this document?
     *
     *  We flag entries as deleted instead of actually deleting them to avoid
     *  invalidating existing TermIterator objects.
     */
    bool is_deleted() const { return positions.empty() && split > 0; }

    /// Return a string describing this object.
    string get_description() const;
};

#endif // OM_HGUARD_DOCUMENTTERM_H