1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208
|
package uniseg
// The Unicode properties as used in the various parsers. Only the ones needed
// in the context of this package are included.
const (
prXX = 0 // Same as prAny.
prAny = iota // prAny must be 0.
prPrepend // Grapheme properties must come first, to reduce the number of bits stored in the state vector.
prCR
prLF
prControl
prExtend
prRegionalIndicator
prSpacingMark
prL
prV
prT
prLV
prLVT
prZWJ
prExtendedPictographic
prNewline
prWSegSpace
prDoubleQuote
prSingleQuote
prMidNumLet
prNumeric
prMidLetter
prMidNum
prExtendNumLet
prALetter
prFormat
prHebrewLetter
prKatakana
prSp
prSTerm
prClose
prSContinue
prATerm
prUpper
prLower
prSep
prOLetter
prCM
prBA
prBK
prSP
prEX
prQU
prAL
prPR
prPO
prOP
prCP
prIS
prHY
prSY
prNU
prCL
prNL
prGL
prAI
prBB
prHL
prSA
prJL
prJV
prJT
prNS
prZW
prB2
prIN
prWJ
prID
prEB
prCJ
prH2
prH3
prSG
prCB
prRI
prEM
prN
prNa
prA
prW
prH
prF
prEmojiPresentation
)
// Unicode General Categories. Only the ones needed in the context of this
// package are included.
const (
gcNone = iota // gcNone must be 0.
gcCc
gcZs
gcPo
gcSc
gcPs
gcPe
gcSm
gcPd
gcNd
gcLu
gcSk
gcPc
gcLl
gcSo
gcLo
gcPi
gcCf
gcNo
gcPf
gcLC
gcLm
gcMn
gcMe
gcMc
gcNl
gcZl
gcZp
gcCn
gcCs
gcCo
)
// Special code points.
const (
vs15 = 0xfe0e // Variation Selector-15 (text presentation)
vs16 = 0xfe0f // Variation Selector-16 (emoji presentation)
)
// propertySearch performs a binary search on a property slice and returns the
// entry whose range (start = first array element, end = second array element)
// includes r, or an array of 0's if no such entry was found.
func propertySearch[E interface{ [3]int | [4]int }](dictionary []E, r rune) (result E) {
// Run a binary search.
from := 0
to := len(dictionary)
for to > from {
middle := (from + to) / 2
cpRange := dictionary[middle]
if int(r) < cpRange[0] {
to = middle
continue
}
if int(r) > cpRange[1] {
from = middle + 1
continue
}
return cpRange
}
return
}
// property returns the Unicode property value (see constants above) of the
// given code point.
func property(dictionary [][3]int, r rune) int {
return propertySearch(dictionary, r)[2]
}
// propertyLineBreak returns the Unicode property value and General Category
// (see constants above) of the given code point, as listed in the line break
// code points table, while fast tracking ASCII digits and letters.
func propertyLineBreak(r rune) (property, generalCategory int) {
if r >= 'a' && r <= 'z' {
return prAL, gcLl
}
if r >= 'A' && r <= 'Z' {
return prAL, gcLu
}
if r >= '0' && r <= '9' {
return prNU, gcNd
}
entry := propertySearch(lineBreakCodePoints, r)
return entry[2], entry[3]
}
// propertyGraphemes returns the Unicode grapheme cluster property value of the
// given code point while fast tracking ASCII characters.
func propertyGraphemes(r rune) int {
if r >= 0x20 && r <= 0x7e {
return prAny
}
if r == 0x0a {
return prLF
}
if r == 0x0d {
return prCR
}
if r >= 0 && r <= 0x1f || r == 0x7f {
return prControl
}
return property(graphemeCodePoints, r)
}
// propertyEastAsianWidth returns the Unicode East Asian Width property value of
// the given code point while fast tracking ASCII characters.
func propertyEastAsianWidth(r rune) int {
if r >= 0x20 && r <= 0x7e {
return prNa
}
if r >= 0 && r <= 0x1f || r == 0x7f {
return prN
}
return property(eastAsianWidth, r)
}
|