1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150
|
/** @file flint_termlisttable.cc
* @brief Subclass of FlintTable which holds termlists.
*/
/* Copyright (C) 2007 Olly Betts
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <config.h>
#include <xapian/document.h>
#include <xapian/error.h>
#include <xapian/termiterator.h>
#include "flint_termlisttable.h"
#include "flint_utils.h"
#include "omassert.h"
#include "omdebug.h"
#include "stringutils.h"
#include "utils.h"
#include <string>
using namespace std;
void
FlintTermListTable::set_termlist(Xapian::docid did,
const Xapian::Document & doc,
flint_doclen_t doclen)
{
DEBUGCALL(DB, void, "FlintTermListTable::set_termlist",
did << ", " << doc << ", " << doclen);
string tag = pack_uint(doclen);
Xapian::doccount termlist_size = doc.termlist_count();
if (termlist_size == 0) {
// doclen is sum(wdf) so should be zero if there are no terms.
Assert(doclen == 0);
Assert(doc.termlist_begin() == doc.termlist_end());
add(flint_docid_to_key(did), string());
return;
}
Xapian::TermIterator t = doc.termlist_begin();
if (t != doc.termlist_end()) {
tag += pack_uint(termlist_size);
string prev_term = *t;
// Previous database versions encoded a boolean here, which was
// always false (and pack_bool() encodes false as a '0'). We can
// just omit this and successfully read old and new termlists
// except in the case where the next byte is a '0' - in this case
// we need keep the '0' so that the decoder can just skip any '0'
// it sees in this position (this shouldn't be a common case - 48
// character terms aren't very common, and the first term
// alphabetically is likely to be shorter than average).
// FIXME: If we have an incompatible database version bump we should
// drop this completely.
if (prev_term.size() == '0') tag += '0';
tag += prev_term.size();
tag += prev_term;
tag += pack_uint(t.get_wdf());
--termlist_size;
while (++t != doc.termlist_end()) {
const string & term = *t;
// If there's a shared prefix with the previous term, we don't
// store it explicitly, but just store the length of the shared
// prefix. In general, this is a big win.
size_t reuse = common_prefix_length(prev_term, term);
// reuse must be <= prev_term.size(), and we know that value while
// decoding. So if the wdf is small enough that we can multiply it
// by (prev_term.size() + 1), add reuse and fit the result in a
// byte, then we can pack reuse and the wdf into a single byte and
// save ourselves a byte. We actually need to add one to the wdf
// before multiplying so that a wdf of 0 can be detected by the
// decoder.
size_t packed = 0;
Xapian::termcount wdf = t.get_wdf();
// If wdf >= 128, then we aren't going to be able to pack it in so
// don't even try to avoid the calculation overflowing and making
// us think we can.
if (wdf < 127)
packed = (wdf + 1) * (prev_term.size() + 1) + reuse;
if (packed && packed < 256) {
// We can pack the wdf into the same byte.
tag += char(packed);
tag += char(term.size() - reuse);
tag.append(term.data() + reuse, term.size() - reuse);
} else {
tag += char(reuse);
tag += char(term.size() - reuse);
tag.append(term.data() + reuse, term.size() - reuse);
// FIXME: pack wdf after reuse next time we rejig the format
// incompatibly.
tag += pack_uint(wdf);
}
prev_term = *t;
--termlist_size;
}
}
Assert(termlist_size == 0);
add(flint_docid_to_key(did), tag);
}
flint_doclen_t
FlintTermListTable::get_doclength(Xapian::docid did) const
{
DEBUGCALL(DB, flint_doclen_t, "FlintTermListTable::get_doclength", did);
string tag;
if (!get_exact_entry(flint_docid_to_key(did), tag))
throw Xapian::DocNotFoundError("No termlist found for document " +
om_tostring(did));
if (tag.empty()) RETURN(0);
const char * pos = tag.data();
const char * end = pos + tag.size();
flint_doclen_t doclen;
if (!unpack_uint(&pos, end, &doclen)) {
const char *msg;
if (pos == 0) {
msg = "Too little data for doclen in termlist";
} else {
msg = "Overflowed value for doclen in termlist";
}
throw Xapian::DatabaseCorruptError(msg);
}
RETURN(doclen);
}
|