File: index.go

package info (click to toggle)
golang-github-twotwotwo-sorts 0.0~git20160814.bf5c1f2-3
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, forky, sid, trixie
  • size: 172 kB
  • sloc: makefile: 2
file content (341 lines) | stat: -rw-r--r-- 10,052 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
// Copyright 2015 Randall Farmer. All rights reserved.

// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// Package index uses sorted arrays of integers to assist sorting and
// searching, particularly of collections of strings.
package index

import (
	"bytes"
	"sort"
	"strings"

	"github.com/twotwotwo/sorts"
)

type Index struct {
	Keys    []uint64
	Summary []uint64 // implicit B-tree, if Summarize() was called
	Data    sort.Interface
}

// Len returns the length of the data underlying an Index
func (idx *Index) Len() int { return idx.Data.Len() }

// Less compares Index elements by their Keys, falling back to Data.Less for
// equal-keyed items.
func (idx *Index) Less(i, j int) bool {
	return idx.Keys[i] < idx.Keys[j] || (idx.Keys[i] == idx.Keys[j] && idx.Data.Less(i, j))
}

// Swap swaps both the Keys and the inderlying data items at indices i and
// j.
func (idx *Index) Swap(i, j int) {
	idx.Keys[i], idx.Keys[j] = idx.Keys[j], idx.Keys[i]
	idx.Data.Swap(i, j)
}

// Key returns the uint64 key at index i.
func (idx *Index) Key(i int) uint64 { return idx.Keys[i] }

// levelBits and pageSize control the fan-out of Summary, the implicit
// B-tree.  6 won a very informal bake-off.  (Would have guessed 3, matching
// 8-word amd64 cache lines.) More would work better if this were ever on
// block storage, e.g.  levelBits of 9 corresponds to a page size of 4KiB.
const levelBits = 6
const pageSize = 1 << levelBits

// Summarize makes an implicit B-tree to speed lookups, using a few percent
// overhead on top of what's already used for Indices.
func (idx *Index) Summarize() {
	l := idx.Len()
	sl := l>>levelBits + l>>levelBits*2 + l>>levelBits*3 + l>>((levelBits*4)-1)
	summary := make([]uint64, 0, sl)
	summarizing := idx.Keys
	levelNum := 1
	for len(summarizing) > pageSize {
		start := len(summary)
		for i := 0; i < len(summarizing); i += pageSize {
			summary = append(summary, summarizing[i])
		}
		summarizing = summary[start:]
		levelNum++
	}
	idx.Summary = summary
}

// FindUint64 finds the position of the first item >= key in Keys, returning
// one after the end if there is none.  When different values map to the same key,
// you might want to sort.Search within the returned range to narrow your result
// down to the desired values.
func (idx *Index) FindUint64(key uint64) int {
	if idx.Summary != nil {
		return idx.findUint64Summary(key)
	}
	return sort.Search(idx.Len(), func(i int) bool { return idx.Keys[i] >= key })
}

// Compares string a to []byte b, returning -1 if a<b, 0 if a==b, and 1 if a>b.
func CompareStringToBytes(a string, b []byte) int {
	for i := range b {
		if i > len(a) {
			return -1
		}
		if b[i] > a[i] {
			return -1
		}
		if a[i] > b[i] {
			return 1
		}
	}
	if len(a) > len(b) {
		return 1
	}
	return 0
}

// CompareBytesToString is a convenience wrapper for CompareStringToBytes.
func CompareBytesToString(a []byte, b string) int {
	return -CompareStringToBytes(b, a)
}

// FindString finds the first item >= key, returning one after the end if there
// is none. The collection type must implement Key(i) returning string or []byte.
func (idx *Index) FindString(key string) int {
	k := StringKey(key)
	a, b := idx.FindUint64Range(k)
	switch data := idx.Data.(type) {
	case sorts.StringInterface:
		return a + sort.Search(b-a, func(i int) bool {
			return strings.Compare(key, data.Key(a+i)) >= 0
		})
	case sorts.BytesInterface:
		return a + sort.Search(b-a, func(i int) bool {
			return CompareStringToBytes(key, data.Key(a+i)) >= 0
		})
	default:
		panic("to use FindStringKey, Data.Key(i) must return string or []byte")
	}
}

// FindBytes finds the first item >= key, returning one after the end if there
// is none. The collection type must implement Key(i) returning string or []byte.
func (idx *Index) FindBytes(key []byte) int {
	k := BytesKey(key)
	a, b := idx.FindUint64Range(k)
	switch data := idx.Data.(type) {
	case sorts.StringInterface:
		return a + sort.Search(b-a, func(i int) bool {
			return CompareBytesToString(key, data.Key(a+i)) >= 0
		})
	case sorts.BytesInterface:
		offset := sort.Search(b-a, func(i int) bool {
			return bytes.Compare(key, data.Key(a+i)) >= 0
		})
		return a + offset
	default:
		panic("to use FindBytesKey, Data.Key(i) must return string or []byte")
	}
}

// FindUint64Range looks for a range of keys such that all items in idx.Keys[a:b]
// equal key.
// It can return an empty range if the item isn't found; in that case, a and b are both where the item would be inserted (and can be one past the end).
// To find a single key, use FindUint64.
func (idx *Index) FindUint64Range(key uint64) (a, b int) {
	a = idx.FindUint64(key)
	if a == len(idx.Keys) || idx.Keys[a] != key {
		// key not found, needn't search again
		return a, a
	}
	if key == ^uint64(0) { // would overflow
		b = len(idx.Keys)
	} else {
		b = idx.FindUint64(key + 1)
	}
	return
}

// FindStringRange(key) finds the range (a,b] such that Key() returns key for all items in idx.Data[a:b].
// It can return an empty range if the item isn't found; in that case, a and b are both where the item would be inserted (and can be one past the end).
// Data must implement Key(i) returning string or []byte.
// To find a single item, use FindString.
func (idx *Index) FindStringRange(key string) (int, int) {
	k := StringKey(key)
	a, b := idx.FindUint64Range(k)
	switch data := idx.Data.(type) {
	case sorts.StringInterface:
		aa := a + sort.Search(b-a, func(i int) bool {
			return strings.Compare(key, data.Key(a+i)) >= 0
		})
		bb := aa + sort.Search(b-aa, func(i int) bool {
			return strings.Compare(key, data.Key(aa+i)) > 0
		})
		return aa, bb
	case sorts.BytesInterface:
		aa := a + sort.Search(b-a, func(i int) bool {
			return CompareStringToBytes(key, data.Key(a+i)) >= 0
		})
		bb := aa + sort.Search(b-aa, func(i int) bool {
			return CompareStringToBytes(key, data.Key(aa+i)) > 0
		})
		return aa, bb
	default:
		panic("to use FindStringRange, Data.Key(i) must return string or []byte")
	}
}

// FindBytesRange(key) finds the range (a,b] such that Key() returns key for all items in idx.Data[a:b].
// It can return an empty range if the item isn't found; in that case, a is where the item would be inserted (and can be one past the end).
// Data must implement Key(i) returning string or []byte.
// To find a single item, use FindBytes.
func (idx *Index) FindBytesRange(key []byte) (int, int) {
	k := BytesKey(key)
	a, b := idx.FindUint64Range(k)
	switch data := idx.Data.(type) {
	case sorts.StringInterface:
		aa := a + sort.Search(b-a, func(i int) bool {
			return CompareBytesToString(key, data.Key(a+i)) >= 0
		})
		bb := aa + sort.Search(b-aa, func(i int) bool {
			return CompareBytesToString(key, data.Key(aa+i)) > 0
		})
		return aa, bb
	case sorts.BytesInterface:
		aa := a + sort.Search(b-a, func(i int) bool {
			return bytes.Compare(key, data.Key(a+i)) >= 0
		})
		bb := aa + sort.Search(b-aa, func(i int) bool {
			return bytes.Compare(key, data.Key(aa+i)) > 0
		})
		return aa, bb
	default:
		panic("to use FindStringRangeExact, Data.Key(i) must return string or []byte")
	}
}

func (idx *Index) findUint64Summary(key uint64) int {
	summary := idx.Summary
	keys := idx.Keys

	// count how many layers to expect in the "btree"
	levels, l := 0, len(keys)
	for l > 0 {
		levels++
		l >>= levelBits
	}
	levels-- // went one too far

	// keep following largest-strictly-less down the chain
	levelNum := levels
	levelEnd := len(summary)
	offset := 0
	for levelNum > 0 {
		// extract the "level"
		thisLevelBits := uint(levelBits * levelNum)
		levelLen := len(keys) >> thisLevelBits
		if len(keys) > levelLen<<thisLevelBits {
			// an entry for the remainder
			levelLen++
		}
		level := summary[levelEnd-levelLen : levelEnd]

		// extract the page at the given offset
		pageEnd := offset + pageSize
		if pageEnd > len(level) {
			pageEnd = len(level)
		}
		page := level[offset:pageEnd]

		// scan page for an entry >= key
		// binsearch would be fewer operations but less predictable ones
		i := 0
		for i < len(page) && page[i] < key {
			i++
		}
		if i > 0 {
			// i is first one that goes too far
			i--
		}

		// use that to walk down the tree
		// in particular, get next offset and level location
		offset += i
		offset <<= levelBits
		levelEnd -= levelLen
		levelNum--
	}

	// level==0 is the original array
	pageEnd := offset + pageSize
	if pageEnd > len(keys) {
		pageEnd = len(keys)
	}
	page := keys[offset:pageEnd]
	i := 0
	for i < len(page) && page[i] < key {
		i++
	}
	return offset + i
}

// StringKey generates a uint64 key from the first bytes of key.
func StringKey(key string) uint64 {
	k := uint64(0)
	for j := 0; j < 8 && j < len(key); j++ {
		k ^= uint64(key[j]) << uint(56-8*j)
	}
	return k
}

// BytesKey generates a uint64 key from the first bytes of key.
func BytesKey(key []byte) uint64 {
	k := uint64(0)
	for j := 0; j < 8 && j < len(key); j++ {
		k ^= uint64(key[j]) << uint(56-8*j)
	}
	return k
}

// SortWithIndex allocates an Index with space for a uint64 key for each
// item in data, then sorts items by their uint64 keys, using data.Less as a
// tie-breaker for equal-keyed items.  data may implement index.KeySetter or
// any of sorts.StringInterface, BytesInterface, or Uint64Interface.
func SortWithIndex(data sort.Interface) *Index {
	l := data.Len()
	indices := make([]uint64, l)
	idx := &Index{
		Keys: indices,
		Data: data,
	}
	switch data := data.(type) {
	case sorts.StringInterface:
		for i := 0; i < l; i++ {
			key := data.Key(i)
			k := uint64(0)
			for j := 0; j < 8 && j < len(key); j++ {
				k ^= uint64(key[j]) << uint(56-8*j)
			}
			indices[i] = k
		}
	case sorts.BytesInterface:
		for i := 0; i < l; i++ {
			key := data.Key(i)
			k := uint64(0)
			for j := 0; j < 8 && j < len(key); j++ {
				k ^= uint64(key[j]) << uint(56-8*j)
			}
			indices[i] = k
		}
	case sorts.Uint64Interface:
		for i := 0; i < l; i++ {
			indices[i] = data.Key(i)
		}
	default:
		panic("don't know how to extract int keys for data")
	}
	sorts.ByUint64(idx)
	return idx
}