1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331
|
package uniseg
import "unicode/utf8"
// Graphemes implements an iterator over Unicode grapheme clusters, or
// user-perceived characters. While iterating, it also provides information
// about word boundaries, sentence boundaries, line breaks, and monospace
// character widths.
//
// After constructing the class via [NewGraphemes] for a given string "str",
// [Graphemes.Next] is called for every grapheme cluster in a loop until it
// returns false. Inside the loop, information about the grapheme cluster as
// well as boundary information and character width is available via the various
// methods (see examples below).
//
// This class basically wraps the [StepString] parser and provides a convenient
// interface to it. If you are only interested in some parts of this package's
// functionality, using the specialized functions starting with "First" is
// almost always faster.
type Graphemes struct {
// The original string.
original string
// The remaining string to be parsed.
remaining string
// The current grapheme cluster.
cluster string
// The byte offset of the current grapheme cluster relative to the original
// string.
offset int
// The current boundary information of the [Step] parser.
boundaries int
// The current state of the [Step] parser.
state int
}
// NewGraphemes returns a new grapheme cluster iterator.
func NewGraphemes(str string) *Graphemes {
return &Graphemes{
original: str,
remaining: str,
state: -1,
}
}
// Next advances the iterator by one grapheme cluster and returns false if no
// clusters are left. This function must be called before the first cluster is
// accessed.
func (g *Graphemes) Next() bool {
if len(g.remaining) == 0 {
// We're already past the end.
g.state = -2
g.cluster = ""
return false
}
g.offset += len(g.cluster)
g.cluster, g.remaining, g.boundaries, g.state = StepString(g.remaining, g.state)
return true
}
// Runes returns a slice of runes (code points) which corresponds to the current
// grapheme cluster. If the iterator is already past the end or [Graphemes.Next]
// has not yet been called, nil is returned.
func (g *Graphemes) Runes() []rune {
if g.state < 0 {
return nil
}
return []rune(g.cluster)
}
// Str returns a substring of the original string which corresponds to the
// current grapheme cluster. If the iterator is already past the end or
// [Graphemes.Next] has not yet been called, an empty string is returned.
func (g *Graphemes) Str() string {
return g.cluster
}
// Bytes returns a byte slice which corresponds to the current grapheme cluster.
// If the iterator is already past the end or [Graphemes.Next] has not yet been
// called, nil is returned.
func (g *Graphemes) Bytes() []byte {
if g.state < 0 {
return nil
}
return []byte(g.cluster)
}
// Positions returns the interval of the current grapheme cluster as byte
// positions into the original string. The first returned value "from" indexes
// the first byte and the second returned value "to" indexes the first byte that
// is not included anymore, i.e. str[from:to] is the current grapheme cluster of
// the original string "str". If [Graphemes.Next] has not yet been called, both
// values are 0. If the iterator is already past the end, both values are 1.
func (g *Graphemes) Positions() (int, int) {
if g.state == -1 {
return 0, 0
} else if g.state == -2 {
return 1, 1
}
return g.offset, g.offset + len(g.cluster)
}
// IsWordBoundary returns true if a word ends after the current grapheme
// cluster.
func (g *Graphemes) IsWordBoundary() bool {
if g.state < 0 {
return true
}
return g.boundaries&MaskWord != 0
}
// IsSentenceBoundary returns true if a sentence ends after the current
// grapheme cluster.
func (g *Graphemes) IsSentenceBoundary() bool {
if g.state < 0 {
return true
}
return g.boundaries&MaskSentence != 0
}
// LineBreak returns whether the line can be broken after the current grapheme
// cluster. A value of [LineDontBreak] means the line may not be broken, a value
// of [LineMustBreak] means the line must be broken, and a value of
// [LineCanBreak] means the line may or may not be broken.
func (g *Graphemes) LineBreak() int {
if g.state == -1 {
return LineDontBreak
}
if g.state == -2 {
return LineMustBreak
}
return g.boundaries & MaskLine
}
// Width returns the monospace width of the current grapheme cluster.
func (g *Graphemes) Width() int {
if g.state < 0 {
return 0
}
return g.boundaries >> ShiftWidth
}
// Reset puts the iterator into its initial state such that the next call to
// [Graphemes.Next] sets it to the first grapheme cluster again.
func (g *Graphemes) Reset() {
g.state = -1
g.offset = 0
g.cluster = ""
g.remaining = g.original
}
// GraphemeClusterCount returns the number of user-perceived characters
// (grapheme clusters) for the given string.
func GraphemeClusterCount(s string) (n int) {
state := -1
for len(s) > 0 {
_, s, _, state = FirstGraphemeClusterInString(s, state)
n++
}
return
}
// ReverseString reverses the given string while observing grapheme cluster
// boundaries.
func ReverseString(s string) string {
str := []byte(s)
reversed := make([]byte, len(str))
state := -1
index := len(str)
for len(str) > 0 {
var cluster []byte
cluster, str, _, state = FirstGraphemeCluster(str, state)
index -= len(cluster)
copy(reversed[index:], cluster)
if index <= len(str)/2 {
break
}
}
return string(reversed)
}
// The number of bits the grapheme property must be shifted to make place for
// grapheme states.
const shiftGraphemePropState = 4
// FirstGraphemeCluster returns the first grapheme cluster found in the given
// byte slice according to the rules of [Unicode Standard Annex #29, Grapheme
// Cluster Boundaries]. This function can be called continuously to extract all
// grapheme clusters from a byte slice, as illustrated in the example below.
//
// If you don't know the current state, for example when calling the function
// for the first time, you must pass -1. For consecutive calls, pass the state
// and rest slice returned by the previous call.
//
// The "rest" slice is the sub-slice of the original byte slice "b" starting
// after the last byte of the identified grapheme cluster. If the length of the
// "rest" slice is 0, the entire byte slice "b" has been processed. The
// "cluster" byte slice is the sub-slice of the input slice containing the
// identified grapheme cluster.
//
// The returned width is the width of the grapheme cluster for most monospace
// fonts where a value of 1 represents one character cell.
//
// Given an empty byte slice "b", the function returns nil values.
//
// While slightly less convenient than using the Graphemes class, this function
// has much better performance and makes no allocations. It lends itself well to
// large byte slices.
//
// [Unicode Standard Annex #29, Grapheme Cluster Boundaries]: http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
func FirstGraphemeCluster(b []byte, state int) (cluster, rest []byte, width, newState int) {
// An empty byte slice returns nothing.
if len(b) == 0 {
return
}
// Extract the first rune.
r, length := utf8.DecodeRune(b)
if len(b) <= length { // If we're already past the end, there is nothing else to parse.
var prop int
if state < 0 {
prop = propertyGraphemes(r)
} else {
prop = state >> shiftGraphemePropState
}
return b, nil, runeWidth(r, prop), grAny | (prop << shiftGraphemePropState)
}
// If we don't know the state, determine it now.
var firstProp int
if state < 0 {
state, firstProp, _ = transitionGraphemeState(state, r)
} else {
firstProp = state >> shiftGraphemePropState
}
width += runeWidth(r, firstProp)
// Transition until we find a boundary.
for {
var (
prop int
boundary bool
)
r, l := utf8.DecodeRune(b[length:])
state, prop, boundary = transitionGraphemeState(state&maskGraphemeState, r)
if boundary {
return b[:length], b[length:], width, state | (prop << shiftGraphemePropState)
}
if firstProp == prExtendedPictographic {
if r == vs15 {
width = 1
} else if r == vs16 {
width = 2
}
} else if firstProp != prRegionalIndicator && firstProp != prL {
width += runeWidth(r, prop)
}
length += l
if len(b) <= length {
return b, nil, width, grAny | (prop << shiftGraphemePropState)
}
}
}
// FirstGraphemeClusterInString is like [FirstGraphemeCluster] but its input and
// outputs are strings.
func FirstGraphemeClusterInString(str string, state int) (cluster, rest string, width, newState int) {
// An empty string returns nothing.
if len(str) == 0 {
return
}
// Extract the first rune.
r, length := utf8.DecodeRuneInString(str)
if len(str) <= length { // If we're already past the end, there is nothing else to parse.
var prop int
if state < 0 {
prop = propertyGraphemes(r)
} else {
prop = state >> shiftGraphemePropState
}
return str, "", runeWidth(r, prop), grAny | (prop << shiftGraphemePropState)
}
// If we don't know the state, determine it now.
var firstProp int
if state < 0 {
state, firstProp, _ = transitionGraphemeState(state, r)
} else {
firstProp = state >> shiftGraphemePropState
}
width += runeWidth(r, firstProp)
// Transition until we find a boundary.
for {
var (
prop int
boundary bool
)
r, l := utf8.DecodeRuneInString(str[length:])
state, prop, boundary = transitionGraphemeState(state&maskGraphemeState, r)
if boundary {
return str[:length], str[length:], width, state | (prop << shiftGraphemePropState)
}
if firstProp == prExtendedPictographic {
if r == vs15 {
width = 1
} else if r == vs16 {
width = 2
}
} else if firstProp != prRegionalIndicator && firstProp != prL {
width += runeWidth(r, prop)
}
length += l
if len(str) <= length {
return str, "", width, grAny | (prop << shiftGraphemePropState)
}
}
}
|