1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
|
/** @file prefix_compressed_strings.h
* @brief Handle encoding and decoding prefix-compressed lists of strings
*/
/* Copyright (C) 2004,2005,2006,2007,2008,2009,2010 Olly Betts
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/
#ifndef XAPIAN_INCLUDED_PREFIX_COMPRESSED_STRINGS_H
#define XAPIAN_INCLUDED_PREFIX_COMPRESSED_STRINGS_H
#include <xapian/error.h>
#include <algorithm>
#include <string>
// Every length value stored in the encoded data (the reuse count and the
// append length of each entry) is XORed with this constant, both when it is
// written and when it is read back.
//
// We XOR the length values with this so that they are more likely to coincide
// with lower case ASCII letters, which are likely to be common. This means
// that zlib should do a better job of compressing tag values - in tests, this
// gave 5% better compression.
#define MAGIC_XOR_VALUE 96
/** Iterator which decodes a prefix-compressed list of strings.
 *
 *  The encoded data is a sequence of entries.  The first entry, and any entry
 *  which follows an empty string, is a length byte followed by that many bytes
 *  of string data.  Every other entry starts with an extra byte saying how
 *  many leading bytes of the previous string to keep, then a length byte, then
 *  the bytes to append.  Each of these byte values is stored XORed with
 *  MAGIC_XOR_VALUE.
 *
 *  The iterator reads directly from the buffer of the string passed to the
 *  constructor, so that string must outlive the iterator and must not be
 *  modified while the iterator is in use.
 */
class PrefixCompressedStringItor {
    /// Next unread byte of the encoded data; NULL once we've moved past the
    /// last entry.
    const unsigned char * p;

    /// Number of encoded bytes remaining, starting at p.
    size_t left;

    /// The most recently decoded string.
    std::string current;

    /// Build an iterator from previously saved state (used by postfix ++).
    PrefixCompressedStringItor(const unsigned char * p_, size_t left_,
                               const std::string & current_)
        : p(p_), left(left_), current(current_) { }

  public:
    /** Start iterating over the encoded data in @a s.
     *
     *  If @a s is empty the iterator starts out at_end(); otherwise the first
     *  entry is decoded immediately.
     *
     *  @exception Xapian::DatabaseCorruptError if the first entry is invalid.
     */
    explicit PrefixCompressedStringItor(const std::string & s)
        : p(reinterpret_cast<const unsigned char *>(s.data())),
          left(s.size()) {
        if (left) {
            operator++();
        } else {
            p = NULL;
        }
    }

    /// Return the string the iterator is currently positioned on.
    const std::string & operator*() const {
        return current;
    }

    /** Postfix increment: advance, and return an iterator holding the state
     *  from before the advance.
     */
    PrefixCompressedStringItor operator++(int) {
        const unsigned char * old_p = p;
        size_t old_left = left;
        std::string old_current = current;
        operator++();
        return PrefixCompressedStringItor(old_p, old_left, old_current);
    }

    /** Prefix increment: decode the next entry, or mark the iterator as being
     *  at the end if no encoded bytes remain.
     *
     *  @exception Xapian::DatabaseCorruptError if the entry asks to reuse more
     *             of the previous string than exists, or needs more bytes than
     *             remain in the encoded data.
     */
    PrefixCompressedStringItor & operator++() {
        if (left == 0) {
            p = NULL;
        } else {
            // A reuse count is only present when the previous string was
            // non-empty (the writer omits it otherwise).
            if (!current.empty()) {
                size_t keep = *p ^ MAGIC_XOR_VALUE;
                // resize() would silently pad with zero bytes if asked to
                // grow the string, so reject that explicitly.
                if (keep > current.size())
                    throw Xapian::DatabaseCorruptError("Bad spelling data (reuse length too long)");
                current.resize(keep);
                ++p;
                --left;
            }
            // The length byte itself counts towards left, so we need
            // strictly more than add bytes remaining.
            size_t add;
            if (left == 0 || (add = *p ^ MAGIC_XOR_VALUE) >= left)
                throw Xapian::DatabaseCorruptError("Bad spelling data (too little left)");
            current.append(reinterpret_cast<const char *>(p + 1), add);
            p += add + 1;
            left -= add + 1;
        }
        return *this;
    }

    /// Return true once the iterator has been advanced past the last entry.
    bool at_end() const {
        return p == NULL;
    }
};
/** Encoder which appends a prefix-compressed list of strings to a buffer.
 *
 *  Each word after the first is stored as the number of leading bytes it
 *  shares with the previously appended word, followed by the length of the
 *  remainder and the remainder itself.  The first word (and any word appended
 *  after an empty one) is stored as just a length and the word.  All length
 *  values are single bytes XORed with MAGIC_XOR_VALUE.
 *
 *  The writer holds a reference to the output string, which must therefore
 *  outlive the writer.
 */
class PrefixCompressedStringWriter {
    /// The most recently appended word.
    std::string current;

    /// Buffer which the encoded bytes are appended to.
    std::string & out;

  public:
    /// Create a writer which appends its encoded output to @a out_.
    explicit PrefixCompressedStringWriter(std::string & out_) : out(out_) { }

    /** Append the encoded form of @a word to the output buffer.
     *
     *  NOTE(review): the length of the appended part is stored in a single
     *  byte and isn't checked here, so a word needing more than 255 appended
     *  bytes can't be represented - presumably callers keep words shorter
     *  than that; confirm against the callers.
     */
    void append(const std::string & word) {
        // If this isn't the first entry, see how much of the previous one
        // we can reuse.
        if (!current.empty()) {
            // The reuse count has to fit in one byte, so never reuse more
            // than 255 bytes even if more of the prefix matches.
            const size_t max_reuse = 255;
            size_t len = std::min<size_t>(current.size(), word.size());
            len = std::min<size_t>(len, max_reuse);
            size_t i = 0;
            while (i < len && current[i] == word[i]) {
                ++i;
            }
            out += char(i ^ MAGIC_XOR_VALUE);
            out += char((word.size() - i) ^ MAGIC_XOR_VALUE);
            out.append(word.data() + i, word.size() - i);
        } else {
            out += char(word.size() ^ MAGIC_XOR_VALUE);
            out += word;
        }
        current = word;
    }
};
/** Comparison functor for pointers to PrefixCompressedStringItor.
 *
 *  Orders two iterators by the strings they are currently positioned on.
 */
struct PrefixCompressedStringItorGt {
    /// True exactly when the string at *a is strictly greater than that at *b.
    bool operator()(const PrefixCompressedStringItor *a,
                    const PrefixCompressedStringItor *b) const {
        const std::string & lhs = **a;
        const std::string & rhs = **b;
        return lhs > rhs;
    }
};
#endif // XAPIAN_INCLUDED_PREFIX_COMPRESSED_STRINGS_H
|