1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168
|
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// This algorithm is based on "Faster Suffix Sorting"
// by N. Jesper Larsson and Kunihiko Sadakane
// paper: http://www.larsson.dogma.net/ssrev-tr.pdf
// code: http://www.larsson.dogma.net/qsufsort.c
// This algorithm computes the suffix array sa by computing its inverse.
// Consecutive groups of suffixes in sa are labeled as sorted groups or
// unsorted groups. For a given pass of the sorter, all suffixes are ordered
// up to their first h characters, and sa is h-ordered. Suffixes in their
// final positions and unambiguously sorted in h-order are in a sorted group.
// Consecutive groups of suffixes with identical first h characters are an
// unsorted group. In each pass of the algorithm, unsorted groups are sorted
// according to the group number of their following suffix.
// In the implementation, if sa[i] is negative, it indicates that i is
// the first element of a sorted group of length -sa[i], and can be skipped.
// An unsorted group sa[i:k] is given the group number of the index of its
// last element, k-1. The group numbers are stored in the inverse slice (inv),
// and when all groups are sorted, this slice is the inverse suffix array.
package suffixarray
import "sort"
func qsufsort(data []byte) []int {
// initial sorting by first byte of suffix
sa := sortedByFirstByte(data)
if len(sa) < 2 {
return sa
}
// initialize the group lookup table
// this becomes the inverse of the suffix array when all groups are sorted
inv := initGroups(sa, data)
// the index starts 1-ordered
sufSortable := &suffixSortable{sa: sa, inv: inv, h: 1}
for sa[0] > -len(sa) { // until all suffixes are one big sorted group
// The suffixes are h-ordered, make them 2*h-ordered
pi := 0 // pi is first position of first group
sl := 0 // sl is negated length of sorted groups
for pi < len(sa) {
if s := sa[pi]; s < 0 { // if pi starts sorted group
pi -= s // skip over sorted group
sl += s // add negated length to sl
} else { // if pi starts unsorted group
if sl != 0 {
sa[pi+sl] = sl // combine sorted groups before pi
sl = 0
}
pk := inv[s] + 1 // pk-1 is last position of unsorted group
sufSortable.sa = sa[pi:pk]
sort.Sort(sufSortable)
sufSortable.updateGroups(pi)
pi = pk // next group
}
}
if sl != 0 { // if the array ends with a sorted group
sa[pi+sl] = sl // combine sorted groups at end of sa
}
sufSortable.h *= 2 // double sorted depth
}
for i := range sa { // reconstruct suffix array from inverse
sa[inv[i]] = i
}
return sa
}
func sortedByFirstByte(data []byte) []int {
// total byte counts
var count [256]int
for _, b := range data {
count[b]++
}
// make count[b] equal index of first occurrence of b in sorted array
sum := 0
for b := range count {
count[b], sum = sum, count[b]+sum
}
// iterate through bytes, placing index into the correct spot in sa
sa := make([]int, len(data))
for i, b := range data {
sa[count[b]] = i
count[b]++
}
return sa
}
func initGroups(sa []int, data []byte) []int {
// label contiguous same-letter groups with the same group number
inv := make([]int, len(data))
prevGroup := len(sa) - 1
groupByte := data[sa[prevGroup]]
for i := len(sa) - 1; i >= 0; i-- {
if b := data[sa[i]]; b < groupByte {
if prevGroup == i+1 {
sa[i+1] = -1
}
groupByte = b
prevGroup = i
}
inv[sa[i]] = prevGroup
if prevGroup == 0 {
sa[0] = -1
}
}
// Separate out the final suffix to the start of its group.
// This is necessary to ensure the suffix "a" is before "aba"
// when using a potentially unstable sort.
lastByte := data[len(data)-1]
s := -1
for i := range sa {
if sa[i] >= 0 {
if data[sa[i]] == lastByte && s == -1 {
s = i
}
if sa[i] == len(sa)-1 {
sa[i], sa[s] = sa[s], sa[i]
inv[sa[s]] = s
sa[s] = -1 // mark it as an isolated sorted group
break
}
}
}
return inv
}
type suffixSortable struct {
sa []int
inv []int
h int
buf []int // common scratch space
}
func (x *suffixSortable) Len() int { return len(x.sa) }
func (x *suffixSortable) Less(i, j int) bool { return x.inv[x.sa[i]+x.h] < x.inv[x.sa[j]+x.h] }
func (x *suffixSortable) Swap(i, j int) { x.sa[i], x.sa[j] = x.sa[j], x.sa[i] }
func (x *suffixSortable) updateGroups(offset int) {
bounds := x.buf[0:0]
group := x.inv[x.sa[0]+x.h]
for i := 1; i < len(x.sa); i++ {
if g := x.inv[x.sa[i]+x.h]; g > group {
bounds = append(bounds, i)
group = g
}
}
bounds = append(bounds, len(x.sa))
x.buf = bounds
// update the group numberings after all new groups are determined
prev := 0
for _, b := range bounds {
for i := prev; i < b; i++ {
x.inv[x.sa[i]] = offset + b - 1
}
if b-prev == 1 {
x.sa[prev] = -1
}
prev = b
}
}
|