File: sortable-serialise.cc

package info (click to toggle)
xapian-core 1.4.3-2%2Bdeb9u3
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 21,412 kB
  • sloc: cpp: 113,868; ansic: 8,723; sh: 4,433; perl: 836; makefile: 566; tcl: 317; python: 40
file content (265 lines) | stat: -rw-r--r-- 8,486 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
/** @file sortable-serialise.cc
 * @brief Serialise floating point values to string which sort the same way.
 */
/* Copyright (C) 2007,2009,2015,2016 Olly Betts
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
 */

#include <config.h>

#include <xapian/queryparser.h>

// Disable these assertions when building the library as these functions are
// marked not to throw exceptions in the API headers.  In unittest we define
// UNITTEST_ASSERT_NOTHROW to set a variable to the exception message and
// return, then the harness checks if that variable has been set.
#ifndef XAPIAN_UNITTEST
# define UNITTEST_ASSERT_NOTHROW(COND,RET)
#endif

#include <cfloat>
#include <cmath>
#include <cstring>

#include <string>

using namespace std;

#if FLT_RADIX != 2
# error Code currently assumes FLT_RADIX == 2
#endif

#ifdef _MSC_VER
// Disable warning about negating an unsigned type, which we do deliberately.
# pragma warning(disable:4146)
#endif

size_t
Xapian::sortable_serialise_(double value, char * buf) XAPIAN_NOEXCEPT
{
    double mantissa;
    int exponent;

    // Negative infinity.
    if (value < -DBL_MAX) return 0;

    mantissa = frexp(value, &exponent);

    /* Deal with zero specially.
     *
     * IEEE representation of doubles uses 11 bits for the exponent, with a
     * bias of 1023.  We bias this by subtracting 8, and non-IEEE
     * representations may allow higher exponents, so allow exponents down to
     * -2039 - if smaller exponents are possible anywhere, we underflow such
     *  numbers to 0.
     */
    if (mantissa == 0.0 || exponent < -2039) {
	*buf = '\x80';
	return 1;
    }

    bool negative = (mantissa < 0);
    if (negative) mantissa = -mantissa;

    // Infinity, or extremely large non-IEEE representation.
    if (value > DBL_MAX || exponent > 2055) {
	if (negative) {
	    // This can only happen with a non-IEEE representation, because
	    // we've already tested for value < -DBL_MAX
	    return 0;
	} else {
	    memset(buf, '\xff', 9);
	    return 9;
	}
    }

    // Encoding:
    //
    // [ 7 | 6 | 5 | 4 3 2 1 0]
    //   Sm  Se  Le
    //
    // Sm stores the sign of the mantissa: 1 = positive or zero, 0 = negative.
    // Se stores the sign of the exponent: Sm for positive/zero, !Sm for neg.
    // Le stores the length of the exponent: !Se for 3 bits, Se for 11 bits.
    unsigned char next = (negative ? 0 : 0xe0);

    // Bias the exponent by 8 so that more small integers get short encodings.
    exponent -= 8;
    bool exponent_negative = (exponent < 0);
    if (exponent_negative) {
	exponent = -exponent;
	next ^= 0x60;
    }

    size_t len = 0;

    /* We store the exponent in 3 or 11 bits.  If the number is negative, we
     * flip all the bits of the exponent, since larger negative numbers should
     * sort first.
     *
     * If the exponent is negative, we flip the bits of the exponent, since
     * larger negative exponents should sort first (unless the number is
     * negative, in which case they should sort later).
     */
    UNITTEST_ASSERT_NOTHROW(exponent >= 0, 0);
    if (exponent < 8) {
	next ^= 0x20;
	next |= static_cast<unsigned char>(exponent << 2);
	if (negative ^ exponent_negative) next ^= 0x1c;
    } else {
	UNITTEST_ASSERT_NOTHROW((exponent >> 11) == 0, 0);
	// Put the top 5 bits of the exponent into the lower 5 bits of the
	// first byte:
	next |= static_cast<unsigned char>(exponent >> 6);
	if (negative ^ exponent_negative) next ^= 0x1f;
	buf[len++] = next;
	// And the lower 6 bits of the exponent go into the upper 6 bits
	// of the second byte:
	next = static_cast<unsigned char>(exponent) << 2;
	if (negative ^ exponent_negative) next ^= 0xfc;
    }

    // Convert the 52 (or 53) bits of the mantissa into two 32-bit words.
    mantissa *= 1 << (negative ? 26 : 27);
    unsigned word1 = static_cast<unsigned>(mantissa);
    mantissa -= word1;
    unsigned word2 = static_cast<unsigned>(mantissa * 4294967296.0); // 1<<32
    // If the number is positive, the first bit will always be set because 0.5
    // <= mantissa < 1, unless mantissa is zero, which we handle specially
    // above).  If the number is negative, we negate the mantissa instead of
    // flipping all the bits, so in the case of 0.5, the first bit isn't set
    // so we need to store it explicitly.  But for the cost of one extra
    // leading bit, we can save several trailing 0xff bytes in lots of common
    // cases.
    UNITTEST_ASSERT_NOTHROW(negative || (word1 & (1<<26)), 0);
    if (negative) {
	// We negate the mantissa for negative numbers, so that the sort order
	// is reversed (since larger negative numbers should come first).
	word1 = -word1;
	if (word2 != 0) ++word1;
	word2 = -word2;
    }

    word1 &= 0x03ffffff;
    next |= static_cast<unsigned char>(word1 >> 24);
    buf[len++] = next;
    buf[len++] = char(word1 >> 16);
    buf[len++] = char(word1 >> 8);
    buf[len++] = char(word1);

    buf[len++] = char(word2 >> 24);
    buf[len++] = char(word2 >> 16);
    buf[len++] = char(word2 >> 8);
    buf[len++] = char(word2);

    // Finally, we can chop off any trailing zero bytes.
    while (len > 0 && buf[len - 1] == '\0') {
	--len;
    }

    return len;
}

/// Get a number from the character at a given position in a string, returning
/// 0 if the string isn't long enough.
static inline unsigned char
numfromstr(const std::string & str, std::string::size_type pos)
{
    return (pos < str.size()) ? static_cast<unsigned char>(str[pos]) : '\0';
}

double
Xapian::sortable_unserialise(const std::string & value) XAPIAN_NOEXCEPT
{
    // Zero.
    if (value.size() == 1 && value[0] == '\x80') return 0.0;

    // Positive infinity.
    if (value.size() == 9 &&
	memcmp(value.data(), "\xff\xff\xff\xff\xff\xff\xff\xff\xff", 9) == 0) {
#ifdef INFINITY
	// INFINITY is C99.  Oddly, it's of type "float" so sanity check in
	// case it doesn't cast to double as infinity (apparently some
	// implementations have this problem).
	if (double(INFINITY) > HUGE_VAL) return INFINITY;
#endif
	return HUGE_VAL;
    }

    // Negative infinity.
    if (value.empty()) {
#ifdef INFINITY
	if (double(INFINITY) > HUGE_VAL) return -INFINITY;
#endif
	return -HUGE_VAL;
    }

    unsigned char first = numfromstr(value, 0);
    size_t i = 0;

    first ^= static_cast<unsigned char>(first & 0xc0) >> 1;
    bool negative = !(first & 0x80);
    bool exponent_negative = (first & 0x40);
    bool explen = !(first & 0x20);
    int exponent = first & 0x1f;
    if (!explen) {
	exponent >>= 2;
	if (negative ^ exponent_negative) exponent ^= 0x07;
    } else {
	first = numfromstr(value, ++i);
	exponent <<= 6;
	exponent |= (first >> 2);
	if (negative ^ exponent_negative) exponent ^= 0x07ff;
    }

    unsigned word1;
    word1 = (unsigned(first & 0x03) << 24);
    word1 |= numfromstr(value, ++i) << 16;
    word1 |= numfromstr(value, ++i) << 8;
    word1 |= numfromstr(value, ++i);

    unsigned word2 = 0;
    if (i < value.size()) {
	word2 = numfromstr(value, ++i) << 24;
	word2 |= numfromstr(value, ++i) << 16;
	word2 |= numfromstr(value, ++i) << 8;
	word2 |= numfromstr(value, ++i);
    }

    if (negative) {
	word1 = -word1;
	if (word2 != 0) ++word1;
	word2 = -word2;
	UNITTEST_ASSERT_NOTHROW((word1 & 0xf0000000) != 0, 0);
	word1 &= 0x03ffffff;
    }
    if (!negative) word1 |= 1<<26;

    double mantissa = 0;
    if (word2) mantissa = word2 / 4294967296.0; // 1<<32
    mantissa += word1;
    mantissa /= 1 << (negative ? 26 : 27);

    if (exponent_negative) exponent = -exponent;
    exponent += 8;

    if (negative) mantissa = -mantissa;

    // We use scalbn() since it's equivalent to ldexp() when FLT_RADIX == 2
    // (which we currently assume), except that ldexp() will set errno if the
    // result overflows or underflows, which isn't really desirable here.
    return scalbn(mantissa, exponent);
}