File: bwt.go

package info (click to toggle)
golang-github-shenwei356-bwt 0.0~git20200418.ae79c98-2
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 80 kB
  • sloc: makefile: 2
file content (112 lines) | stat: -rw-r--r-- 2,995 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
package bwt

import (
	"errors"
	"index/suffixarray"
	"reflect"
	"sort"

	"github.com/shenwei356/util/byteutil"
)

// CheckEndSymbol is a global variable for checking end symbol before Burrows–Wheeler transform
var CheckEndSymbol = true

// ErrEndSymbolExisted means you should choose another EndSymbol
var ErrEndSymbolExisted = errors.New("bwt: end-symbol existed in string")

// Transform returns Burrows–Wheeler transform of a byte slice.
// See https://en.wikipedia.org/wiki/Burrows%E2%80%93Wheeler_transform
func Transform(s []byte, es byte) ([]byte, error) {
	if CheckEndSymbol {
		for _, c := range s {
			if c == es {
				return nil, ErrEndSymbolExisted
			}
		}
	}
	sa := SuffixArray(s)
	bwt, err := FromSuffixArray(s, sa, es)
	return bwt, err
}

// InverseTransform reverses the bwt to original byte slice. Not optimized yet.
func InverseTransform(t []byte, es byte) []byte {
	n := len(t)
	lines := make([][]byte, n)
	for i := 0; i < n; i++ {
		lines[i] = make([]byte, n)
	}

	for i := 0; i < n; i++ {
		for j := 0; j < n; j++ {
			lines[j][n-1-i] = t[j]
		}
		sort.Sort(byteutil.SliceOfByteSlice(lines))
	}

	s := make([]byte, n-1)
	for _, line := range lines {
		if line[n-1] == es {
			s = line[0 : n-1]
			break
		}
	}
	return s
}

// SuffixArray returns the suffix array of s.
// This function is the performance bottleneck of bwt and bwt/fmi package, with O(nlogn).
func SuffixArray(s []byte) []int {
	// sa := make([]int, len(s)+1)
	// sa[0] = len(s)

	// for i := 0; i < len(s); i++ {
	// 	sa[i+1] = i
	// }
	// sort.Slice(sa[1:], func(i, j int) bool {
	// 	return bytes.Compare(s[sa[i+1]:], s[sa[j+1]:]) < 0
	// })
	// return sa

	// https://github.com/shenwei356/bwt/issues/3 .
	// nearly copy from https://github.com/crazyleg/burrow-wheelers-golang/blob/master/pkg/bwtgolang/suffixarrayBWT.go#L8
	// It's 4X faster!
	//
	// benchmark                 old ns/op     new ns/op     delta
	// BenchmarkTransform-16     1339346       310706        -76.80%
	//
	// benchmark                 old allocs     new allocs     delta
	// BenchmarkTransform-16     4              5              +25.00%
	//
	// benchmark                 old bytes     new bytes     delta
	// BenchmarkTransform-16     55362         79962         +44.43%
	_sa := suffixarray.New(s)
	tmp := reflect.ValueOf(_sa).Elem().FieldByName("sa").FieldByIndex([]int{0})
	var sa []int = make([]int, len(s)+1)
	sa[0] = len(s)
	for i := 0; i < len(s); i++ {
		sa[i+1] = int(tmp.Index(i).Int())
	}
	return sa
}

// ErrInvalidSuffixArray means length of sa is not equal to 1+len(s)
var ErrInvalidSuffixArray = errors.New("bwt: invalid suffix array")

// FromSuffixArray compute BWT from sa
func FromSuffixArray(s []byte, sa []int, es byte) ([]byte, error) {
	if len(s)+1 != len(sa) || sa[0] != len(s) {
		return nil, ErrInvalidSuffixArray
	}
	bwt := make([]byte, len(sa))
	bwt[0] = s[len(s)-1]
	for i := 1; i < len(sa); i++ {
		if sa[i] == 0 {
			bwt[i] = es
		} else {
			bwt[i] = s[sa[i]-1]
		}
	}
	return bwt, nil
}