/** @file inmemory_database.h
 * @brief C++ class definitions for the in-memory database backend
 */
/* Copyright 1999,2000,2001 BrightStation PLC
* Copyright 2002 Ananova Ltd
* Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2014,2015 Olly Betts
* Copyright 2006,2009 Lemur Consulting Ltd
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/
#ifndef OM_HGUARD_INMEMORY_DATABASE_H
#define OM_HGUARD_INMEMORY_DATABASE_H
#include "api/leafpostlist.h"
#include "api/termlist.h"
#include "backends/backends.h"
#include "backends/database.h"
#include "backends/valuestats.h"
#include <map>
#include <vector>
#include <algorithm>
#include <xapian/document.h>
#include "inmemory_positionlist.h"
#include "internaltypes.h"
#include "omassert.h"
#include "noreturn.h"
using namespace std;
// Class representing a posting (a term/doc pair, and
// all the relevant positional information, is a single posting)
class InMemoryPosting {
public:
Xapian::docid did;
bool valid;
vector<Xapian::termpos> positions; // Sorted vector of positions
Xapian::termcount wdf;
// Merge two postings (same term/doc pair, new positional info)
void merge(const InMemoryPosting & post) {
Assert(did == post.did);
positions.insert(positions.end(),
post.positions.begin(),
post.positions.end());
// FIXME - inefficient - use merge (and list<>)?
sort(positions.begin(), positions.end());
}
};
class InMemoryTermEntry {
public:
string tname;
vector<Xapian::termpos> positions; // Sorted vector of positions
Xapian::termcount wdf;
// Merge two postings (same term/doc pair, new positional info)
void merge(const InMemoryTermEntry & post) {
Assert(tname == post.tname);
positions.insert(positions.end(),
post.positions.begin(),
post.positions.end());
// FIXME - inefficient - use merge (and list<>)?
sort(positions.begin(), positions.end());
}
};
/** Ordering predicate for postings: compares by document ID only.
 *
 *  The call operator returns bool, as is conventional for a comparison
 *  predicate; the result is simply that of `<` on the two document IDs.
 */
class InMemoryPostingLessThan {
  public:
    /** @return true if @a p1 has a smaller document ID than @a p2. */
    bool operator()(const InMemoryPosting & p1,
                    const InMemoryPosting & p2) const
    {
        return p1.did < p2.did;
    }
};
/** Ordering predicate for term entries: compares by term name only.
 *
 *  The call operator returns bool, as is conventional for a comparison
 *  predicate; the result is simply that of `<` on the two term names.
 */
class InMemoryTermEntryLessThan {
  public:
    /** @return true if the term name of @a p1 orders before that of @a p2. */
    bool operator()(const InMemoryTermEntry & p1,
                    const InMemoryTermEntry & p2) const
    {
        return p1.tname < p2.tname;
    }
};
// Class representing a term and the documents indexing it
class InMemoryTerm {
public:
// Sorted list of documents indexing this term.
vector<InMemoryPosting> docs;
Xapian::termcount term_freq;
Xapian::termcount collection_freq;
InMemoryTerm() : term_freq(0), collection_freq(0) {}
void add_posting(const InMemoryPosting & post);
};
/// Class representing a document and the terms indexing it.
class InMemoryDoc {
public:
bool is_valid;
// Sorted list of terms indexing this document.
vector<InMemoryTermEntry> terms;
/* Initialise invalid by default, so that resizing the termlist array
* doesn't create valid documents. */
InMemoryDoc() : is_valid(false) {}
// Initialise specifying validity.
explicit InMemoryDoc(bool is_valid_) : is_valid(is_valid_) {}
void add_posting(const InMemoryTermEntry & post);
};
class InMemoryDatabase;
/** A PostList in an inmemory database.
*/
class InMemoryPostList : public LeafPostList {
friend class InMemoryDatabase;
private:
vector<InMemoryPosting>::const_iterator pos;
vector<InMemoryPosting>::const_iterator end;
Xapian::doccount termfreq;
bool started;
/** List of positions of the current term.
* This list is populated when read_position_list() is called.
*/
InMemoryPositionList mypositions;
Xapian::Internal::intrusive_ptr<const InMemoryDatabase> db;
InMemoryPostList(Xapian::Internal::intrusive_ptr<const InMemoryDatabase> db,
const InMemoryTerm & imterm, const std::string & term_);
public:
Xapian::doccount get_termfreq() const;
Xapian::docid get_docid() const; // Gets current docid
Xapian::termcount get_doclength() const; // Length of current document
Xapian::termcount get_unique_terms() const; // number of terms in current document
Xapian::termcount get_wdf() const; // Within Document Frequency
PositionList * read_position_list();
PositionList * open_position_list() const;
PostList *next(double w_min); // Moves to next docid
PostList *skip_to(Xapian::docid did, double w_min); // Moves to next docid >= specified docid
// True if we're off the end of the list.
bool at_end() const;
string get_description() const;
};
/** A PostList over all docs in an inmemory database.
 *
 *  The constructor is private, so instances can only be created by the
 *  friend class InMemoryDatabase.
 */
class InMemoryAllDocsPostList : public LeafPostList {
    friend class InMemoryDatabase;

  private:
    /// A document ID (presumably the current position in the list).
    Xapian::docid did;

    /// Reference-counted pointer to the database this list belongs to.
    Xapian::Internal::intrusive_ptr<const InMemoryDatabase> db;

    /// Private constructor: only InMemoryDatabase (a friend) may call it.
    InMemoryAllDocsPostList(
        Xapian::Internal::intrusive_ptr<const InMemoryDatabase> db);

  public:
    Xapian::doccount get_termfreq() const;

    /// Get the current document ID.
    Xapian::docid get_docid() const;

    /// Get the length of the current document.
    Xapian::termcount get_doclength() const;

    /// Get the number of terms in the current document.
    Xapian::termcount get_unique_terms() const;

    /// Get the within-document frequency.
    Xapian::termcount get_wdf() const;

    PositionList * read_position_list();

    PositionList * open_position_list() const;

    /// Move to the next document ID.
    PostList * next(double w_min);

    /// Move to the next document ID which is >= the specified one.
    PostList * skip_to(Xapian::docid did, double w_min);

    /// Return true if we're off the end of the list.
    bool at_end() const;

    std::string get_description() const;
};
// Term List
class InMemoryTermList : public TermList {
friend class InMemoryDatabase;
private:
vector<InMemoryTermEntry>::const_iterator pos;
vector<InMemoryTermEntry>::const_iterator end;
Xapian::termcount terms;
bool started;
Xapian::Internal::intrusive_ptr<const InMemoryDatabase> db;
Xapian::docid did;
Xapian::termcount document_length;
InMemoryTermList(Xapian::Internal::intrusive_ptr<const InMemoryDatabase> db,
Xapian::docid did,
const InMemoryDoc & doc,
Xapian::termcount len);
public:
Xapian::termcount get_approx_size() const;
/// Collate weighting information for the current term.
void accumulate_stats(Xapian::Internal::ExpandStats & stats) const;
string get_termname() const;
Xapian::termcount get_wdf() const; // Number of occurrences of term in current doc
Xapian::doccount get_termfreq() const; // Number of docs indexed by term
TermList * next();
TermList * skip_to(const std::string & term);
bool at_end() const;
Xapian::termcount positionlist_count() const;
Xapian::PositionIterator positionlist_begin() const;
};
class InMemoryDocument;
/** A database held entirely in memory.
*
* This is a prototype database, mainly used for debugging and testing.
*/
class InMemoryDatabase : public Xapian::Database::Internal {
friend class InMemoryAllDocsPostList;
friend class InMemoryDocument;
map<string, InMemoryTerm> postlists;
vector<InMemoryDoc> termlists;
vector<std::string> doclists;
vector<std::map<Xapian::valueno, string> > valuelists;
std::map<Xapian::valueno, ValueStats> valuestats;
vector<Xapian::termcount> doclengths;
std::map<string, string> metadata;
Xapian::doccount totdocs;
totlen_t totlen;
bool positions_present;
// Flag, true if the db has been closed.
bool closed;
// Stop copy / assignment being allowed
InMemoryDatabase& operator=(const InMemoryDatabase &);
InMemoryDatabase(const InMemoryDatabase &);
void make_term(const string & tname);
bool doc_exists(Xapian::docid did) const;
Xapian::docid make_doc(const string & docdata);
/* The common parts of add_doc and replace_doc */
void finish_add_doc(Xapian::docid did, const Xapian::Document &document);
void add_values(Xapian::docid did, const map<Xapian::valueno, string> &values_);
void make_posting(InMemoryDoc * doc,
const string & tname,
Xapian::docid did,
Xapian::termpos position,
Xapian::termcount wdf,
bool use_position = true);
//@{
/** Implementation of virtual methods: see Database for details.
*/
void commit();
void cancel();
Xapian::docid add_document(const Xapian::Document & document);
// Stop the default implementation of delete_document(term) and
// replace_document(term) from being hidden. This isn't really
// a problem as we only try to call them through the base class
// (where they aren't hidden) but some compilers generate a warning
// about the hiding.
#ifndef _MSC_VER
using Xapian::Database::Internal::delete_document;
using Xapian::Database::Internal::replace_document;
#endif
void delete_document(Xapian::docid did);
void replace_document(Xapian::docid did, const Xapian::Document & document);
//@}
public:
/** Create and open an in-memory database.
*
* @exception Xapian::DatabaseOpeningError thrown if database can't be opened.
*/
InMemoryDatabase();
~InMemoryDatabase();
bool reopen();
void close();
bool is_closed() const { return closed; }
Xapian::doccount get_doccount() const;
Xapian::docid get_lastdocid() const;
totlen_t get_total_length() const;
Xapian::termcount get_doclength(Xapian::docid did) const;
Xapian::termcount get_unique_terms(Xapian::docid did) const;
void get_freqs(const string & term,
Xapian::doccount * termfreq_ptr,
Xapian::termcount * collfreq_ptr) const;
Xapian::doccount get_value_freq(Xapian::valueno slot) const;
std::string get_value_lower_bound(Xapian::valueno slot) const;
std::string get_value_upper_bound(Xapian::valueno slot) const;
bool term_exists(const string & tname) const;
bool has_positions() const;
LeafPostList * open_post_list(const string & tname) const;
TermList * open_term_list(Xapian::docid did) const;
Xapian::Document::Internal * open_document(Xapian::docid did, bool lazy) const;
std::string get_metadata(const std::string & key) const;
TermList * open_metadata_keylist(const std::string &prefix) const;
void set_metadata(const std::string & key, const std::string & value);
Xapian::termcount positionlist_count(Xapian::docid did,
const string & tname) const;
PositionList * open_position_list(Xapian::docid did,
const string & tname) const;
TermList * open_allterms(const string & prefix) const;
XAPIAN_NORETURN(static void throw_database_closed());
int get_backend_info(string * path) const {
if (path) *path = string();
return BACKEND_INMEMORY;
}
};
#endif /* OM_HGUARD_INMEMORY_DATABASE_H */