1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461
|
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE Trustworthy #-}
{-# OPTIONS_HADDOCK not-home #-}
-----------------------------------------------------------------------------
-- |
-- Module : GHC.Unicode
-- Copyright : (c) The University of Glasgow, 2003
-- License : see libraries/base/LICENSE
--
-- Maintainer : cvs-ghc@haskell.org
-- Stability : internal
-- Portability : non-portable (GHC extensions)
--
-- Implementations for the character predicates (isLower, isUpper, etc.)
-- and the conversions (toUpper, toLower). The general-category, case-property
-- and case-mapping lookups are delegated to the GHC.Unicode.Internal.* modules.
--
-----------------------------------------------------------------------------
module GHC.Unicode (
unicodeVersion,
GeneralCategory (..), generalCategory,
isAscii, isLatin1, isControl,
isAsciiUpper, isAsciiLower,
isPrint, isSpace, isUpper, isUpperCase,
isLower, isLowerCase, isAlpha, isDigit,
isOctDigit, isHexDigit, isAlphaNum,
isPunctuation, isSymbol,
toUpper, toLower, toTitle
) where
import GHC.Base
import GHC.Real
import GHC.Enum ( Enum (..), Bounded (..) )
import GHC.Ix ( Ix (..) )
import GHC.Num
import GHC.Unicode.Internal.Version
import qualified GHC.Unicode.Internal.Char.DerivedCoreProperties as DCP
import qualified GHC.Unicode.Internal.Char.UnicodeData.GeneralCategory as GC
import qualified GHC.Unicode.Internal.Char.UnicodeData.SimpleLowerCaseMapping as C
import qualified GHC.Unicode.Internal.Char.UnicodeData.SimpleTitleCaseMapping as C
import qualified GHC.Unicode.Internal.Char.UnicodeData.SimpleUpperCaseMapping as C
-- Data.Char.chr already imports this and we need to define a Show instance
-- for GeneralCategory
import GHC.Show ( Show )
-- $setup
-- >>> import Prelude
-- [NOTE] Keep the constructors of 'GeneralCategory' in exactly the order in
-- which the Unicode Standard lists the categories: some functions
-- (e.g. 'generalCategory') go through the derived 'Enum' instance and
-- therefore depend on that order.

-- | The Unicode General Categories (column 2 of the UnicodeData table),
-- declared in the same order as the Unicode standard (more precisely, the
-- Unicode Character Database) lists them.
--
-- ==== __Examples__
--
-- Basic usage:
--
-- >>> :t OtherLetter
-- OtherLetter :: GeneralCategory
--
-- 'Eq' instance:
--
-- >>> UppercaseLetter == UppercaseLetter
-- True
-- >>> UppercaseLetter == LowercaseLetter
-- False
--
-- 'Ord' instance:
--
-- >>> NonSpacingMark <= MathSymbol
-- True
--
-- 'Enum' instance:
--
-- >>> enumFromTo ModifierLetter SpacingCombiningMark
-- [ModifierLetter,OtherLetter,NonSpacingMark,SpacingCombiningMark]
--
-- 'Text.Read.Read' instance (not part of the deriving clause below):
--
-- >>> read "DashPunctuation" :: GeneralCategory
-- DashPunctuation
-- >>> read "17" :: GeneralCategory
-- *** Exception: Prelude.read: no parse
--
-- 'Show' instance:
--
-- >>> show EnclosingMark
-- "EnclosingMark"
--
-- 'Bounded' instance:
--
-- >>> minBound :: GeneralCategory
-- UppercaseLetter
-- >>> maxBound :: GeneralCategory
-- NotAssigned
--
-- 'Ix' instance:
--
-- >>> import Data.Ix ( index )
-- >>> index (OtherLetter,Control) FinalQuote
-- 12
-- >>> index (OtherLetter,Control) Format
-- *** Exception: Error in array index
--
data GeneralCategory
  = UppercaseLetter       -- ^ Lu: Letter, Uppercase
  | LowercaseLetter       -- ^ Ll: Letter, Lowercase
  | TitlecaseLetter       -- ^ Lt: Letter, Titlecase
  | ModifierLetter        -- ^ Lm: Letter, Modifier
  | OtherLetter           -- ^ Lo: Letter, Other
  | NonSpacingMark        -- ^ Mn: Mark, Non-Spacing
  | SpacingCombiningMark  -- ^ Mc: Mark, Spacing Combining
  | EnclosingMark         -- ^ Me: Mark, Enclosing
  | DecimalNumber         -- ^ Nd: Number, Decimal
  | LetterNumber          -- ^ Nl: Number, Letter
  | OtherNumber           -- ^ No: Number, Other
  | ConnectorPunctuation  -- ^ Pc: Punctuation, Connector
  | DashPunctuation       -- ^ Pd: Punctuation, Dash
  | OpenPunctuation       -- ^ Ps: Punctuation, Open
  | ClosePunctuation      -- ^ Pe: Punctuation, Close
  | InitialQuote          -- ^ Pi: Punctuation, Initial quote
  | FinalQuote            -- ^ Pf: Punctuation, Final quote
  | OtherPunctuation      -- ^ Po: Punctuation, Other
  | MathSymbol            -- ^ Sm: Symbol, Math
  | CurrencySymbol        -- ^ Sc: Symbol, Currency
  | ModifierSymbol        -- ^ Sk: Symbol, Modifier
  | OtherSymbol           -- ^ So: Symbol, Other
  | Space                 -- ^ Zs: Separator, Space
  | LineSeparator         -- ^ Zl: Separator, Line
  | ParagraphSeparator    -- ^ Zp: Separator, Paragraph
  | Control               -- ^ Cc: Other, Control
  | Format                -- ^ Cf: Other, Format
  | Surrogate             -- ^ Cs: Other, Surrogate
  | PrivateUse            -- ^ Co: Other, Private Use
  | NotAssigned           -- ^ Cn: Other, Not Assigned
  deriving
    ( Show     -- ^ @since 2.01
    , Eq       -- ^ @since 2.01
    , Ord      -- ^ @since 2.01
    , Enum     -- ^ @since 2.01
    , Bounded  -- ^ @since 2.01
    , Ix       -- ^ @since 2.01
    )
-- | The Unicode general category of the character.
--
-- The category code obtained from @GC.generalCategory@ is turned into a
-- constructor with 'toEnum', so this function relies on the 'Enum' instance
-- of 'GeneralCategory', which must remain in the same order as the
-- categories are presented in the Unicode standard.
--
-- ==== __Examples__
--
-- Basic usage:
--
-- >>> generalCategory 'a'
-- LowercaseLetter
-- >>> generalCategory 'A'
-- UppercaseLetter
-- >>> generalCategory '0'
-- DecimalNumber
-- >>> generalCategory '%'
-- OtherPunctuation
-- >>> generalCategory '♥'
-- OtherSymbol
-- >>> generalCategory '\31'
-- Control
-- >>> generalCategory ' '
-- Space
--
{-# INLINE generalCategory #-}
generalCategory :: Char -> GeneralCategory
-- NOTE(review): presumably left point-free so that the INLINE pragma also
-- applies to unsaturated uses; confirm before eta-expanding.
generalCategory = toEnum . GC.generalCategory
-- | Selects the ASCII character set, i.e. the first 128 characters
-- (@\'\\x00\'@..@\'\\x7f\'@) of the Unicode character set.
isAscii :: Char -> Bool
isAscii c = c <= '\x7f'
-- | Selects the ISO 8859-1 (Latin-1) character set, i.e. the first 256
-- characters (@\'\\x00\'@..@\'\\xff\'@) of the Unicode character set.
isLatin1 :: Char -> Bool
isLatin1 c = c < '\x100'
-- | Selects the ASCII lower-case letters @\'a\'@..@\'z\'@,
-- i.e. the characters satisfying both 'isAscii' and 'isLower'.
isAsciiLower :: Char -> Bool
isAsciiLower c = 'a' <= c && c <= 'z'
-- | Selects the ASCII upper-case letters @\'A\'@..@\'Z\'@,
-- i.e. the characters satisfying both 'isAscii' and 'isUpper'.
isAsciiUpper :: Char -> Bool
isAsciiUpper c = 'A' <= c && c <= 'Z'
-- | Selects control characters, i.e. the non-printing characters of the
-- Latin-1 subset of Unicode.
isControl :: Char -> Bool
-- A character qualifies exactly when its general category is 'Control'.
-- By definition (https://www.unicode.org/reports/tr44/#General_Category_Values)
-- that category is “a C0 or C1 control code”, i.e. the 0x00-0x1f, 0x7f, and
-- 0x80-0x9f.
isControl c = generalCategory c == Control
-- | Selects printable Unicode characters
-- (letters, numbers, marks, punctuation, symbols and spaces).
--
-- A character is printable unless its 'GeneralCategory' is one of
-- 'LineSeparator', 'ParagraphSeparator', 'Control', 'Format', 'Surrogate',
-- 'PrivateUse' or 'NotAssigned'.
isPrint :: Char -> Bool
isPrint c = not (unprintable (generalCategory c))
  where
    -- The categories that are excluded from the printable characters.
    unprintable LineSeparator      = True
    unprintable ParagraphSeparator = True
    unprintable Control            = True
    unprintable Format             = True
    unprintable Surrogate          = True
    unprintable PrivateUse         = True
    unprintable NotAssigned        = True
    unprintable _                  = False
-- | Returns 'True' for any Unicode space character, and the control
-- characters @\\t@, @\\n@, @\\r@, @\\f@, @\\v@.
isSpace :: Char -> Bool
-- Note that the non-breaking space (0xa0) is included.
--
-- The cut-off 0x377 is not very magical: as of 2014 every code point at or
-- below 0x377 has been assigned, so no new space character should ever show
-- up in that range and the small cases can be decided by hand. Everything
-- above the cut-off is decided by the general category.
--
-- The code point is handled as a 'Word', so @cp - 0x9 <= 4@ wraps around for
-- code points below 0x9 and hence selects exactly 0x9..0xd with a single
-- comparison.
--
-- A branchless || would probably be best here, but the eqLit transformation
-- currently undoes that, so the plain form is kept until there is a way
-- around it.
isSpace c
  | cp > 0x377 = generalCategory c == Space
  | otherwise  = cp == 32 || cp - 0x9 <= 4 || cp == 0xa0
  where
    cp = fromIntegral (ord c) :: Word
-- | Selects alphabetic Unicode characters (letters) that are upper-case or
-- title-case. Title case is used by a small number of letter ligatures like
-- the single-character form of /Lj/.
--
-- __Note:__ only characters whose 'GeneralCategory' is 'UppercaseLetter' or
-- 'TitlecaseLetter' are selected, so this predicate does /not/ work for
-- letter-like characters such as:
-- @\'Ⓐ\'@ (@U+24B6@ circled Latin capital letter A) and
-- @\'Ⅳ\'@ (@U+2163@ Roman numeral four).
--
-- See 'isUpperCase' for a more intuitive predicate. Note that
-- unlike 'isUpperCase', 'isUpper' does select /title-case/ characters such as
-- @\'ǅ\'@ (@U+01C5@ Latin capital letter d with small letter z with caron) or
-- @\'ᾯ\'@ (@U+1FAF@ Greek capital letter omega with dasia and perispomeni and
-- prosgegrammeni).
isUpper :: Char -> Bool
isUpper c = category == UppercaseLetter || category == TitlecaseLetter
  where
    category = generalCategory c
-- | Selects upper-case Unicode letter-like characters.
--
-- __Note:__ this predicate selects characters with the Unicode property
-- @Uppercase@, which includes letter-like characters such as:
-- @\'Ⓐ\'@ (@U+24B6@ circled Latin capital letter A) and
-- @\'Ⅳ\'@ (@U+2163@ Roman numeral four).
--
-- See 'isUpper' for the legacy predicate. Note that
-- unlike 'isUpperCase', 'isUpper' does select /title-case/ characters such as
-- @\'ǅ\'@ (@U+01C5@ Latin capital letter d with small letter z with caron) or
-- @\'ᾯ\'@ (@U+1FAF@ Greek capital letter omega with dasia and perispomeni and
-- prosgegrammeni).
--
-- @since 4.18.0.0
{-# INLINE isUpperCase #-}
isUpperCase :: Char -> Bool
-- Thin alias for the derived-core-property lookup @DCP.isUppercase@.
-- NOTE(review): presumably left point-free so that the INLINE pragma also
-- applies to unsaturated uses; confirm before eta-expanding.
isUpperCase = DCP.isUppercase
-- | Selects alphabetic Unicode characters (letters) that are lower-case.
--
-- __Note:__ only characters whose 'GeneralCategory' is 'LowercaseLetter' are
-- selected, so this predicate does /not/ work for letter-like characters
-- such as:
-- @\'ⓐ\'@ (@U+24D0@ circled Latin small letter a) and
-- @\'ⅳ\'@ (@U+2173@ small Roman numeral four).
--
-- See 'isLowerCase' for a more intuitive predicate.
isLower :: Char -> Bool
isLower c = generalCategory c == LowercaseLetter
-- | Selects lower-case Unicode letter-like characters.
--
-- __Note:__ this predicate selects characters with the Unicode property
-- @Lowercase@, which includes letter-like characters such as:
-- @\'ⓐ\'@ (@U+24D0@ circled Latin small letter a) and
-- @\'ⅳ\'@ (@U+2173@ small Roman numeral four).
--
-- See 'isLower' for the legacy predicate.
--
-- @since 4.18.0.0
{-# INLINE isLowerCase #-}
isLowerCase :: Char -> Bool
-- Thin alias for the derived-core-property lookup @DCP.isLowercase@.
-- NOTE(review): presumably left point-free so that the INLINE pragma also
-- applies to unsaturated uses; confirm before eta-expanding.
isLowerCase = DCP.isLowercase
-- | Selects alphabetic Unicode characters (lower-case, upper-case and
-- title-case letters, plus letters of caseless scripts and modifier letters).
-- This function is equivalent to 'Data.Char.isLetter'.
isAlpha :: Char -> Bool
isAlpha c = letterCategory (generalCategory c)
  where
    -- The five letter categories (Lu, Ll, Lt, Lm, Lo).
    letterCategory UppercaseLetter = True
    letterCategory LowercaseLetter = True
    letterCategory TitlecaseLetter = True
    letterCategory ModifierLetter  = True
    letterCategory OtherLetter     = True
    letterCategory _               = False
-- | Selects alphabetic or numeric Unicode characters.
--
-- Note that this function also selects numeric digits outside the ASCII
-- range as well as numeric characters which aren't digits, none of which are
-- selected by 'isDigit'. Such characters may be part of identifiers but are
-- not used by the printer and reader to represent numbers.
isAlphaNum :: Char -> Bool
isAlphaNum c = letterOrNumber (generalCategory c)
  where
    -- The letter categories (Lu, Ll, Lt, Lm, Lo) ...
    letterOrNumber UppercaseLetter = True
    letterOrNumber LowercaseLetter = True
    letterOrNumber TitlecaseLetter = True
    letterOrNumber ModifierLetter  = True
    letterOrNumber OtherLetter     = True
    -- ... and the number categories (Nd, Nl, No).
    letterOrNumber DecimalNumber   = True
    letterOrNumber LetterNumber    = True
    letterOrNumber OtherNumber     = True
    letterOrNumber _               = False
-- | Selects the ASCII digits @\'0\'@..@\'9\'@.
isDigit :: Char -> Bool
-- One subtraction followed by an unsigned comparison is used instead of two
-- signed comparisons because it's usually faster and puts less strain on
-- branch prediction. It likely also enables some CSE when combined with
-- functions that follow up with an actual conversion.
-- Characters below @\'0\'@ yield a negative offset, which becomes a large
-- 'Word' and therefore fails the comparison as well.
isDigit c = offset <= 9
  where
    offset = fromIntegral (ord c - ord '0') :: Word
-- | Selects the ASCII octal digits @\'0\'@..@\'7\'@.
isOctDigit :: Char -> Bool
-- The offset from @\'0\'@ is compared as a 'Word', so characters below
-- @\'0\'@ wrap around to a large value and are rejected by the same test.
isOctDigit c = offset <= 7
  where
    offset = fromIntegral (ord c - ord '0') :: Word
-- | Selects the ASCII hexadecimal digits,
-- i.e. @\'0\'@..@\'9\'@, @\'a\'@..@\'f\'@, @\'A\'@..@\'F\'@.
isHexDigit :: Char -> Bool
isHexDigit c = isDigit c || sixFrom 'A' || sixFrom 'a'
  where
    -- Is @c@ one of the six characters starting at @start@? The offset is
    -- compared as a 'Word', so characters below @start@ wrap around to a
    -- large value and are rejected by the same test.
    sixFrom start = (fromIntegral (ord c - ord start) :: Word) <= 5
-- | Selects Unicode punctuation characters, including various kinds
-- of connectors, brackets and quotes.
--
-- The result is 'True' exactly when the 'GeneralCategory' of the argument
-- is one of:
--
-- * 'ConnectorPunctuation'
-- * 'DashPunctuation'
-- * 'OpenPunctuation'
-- * 'ClosePunctuation'
-- * 'InitialQuote'
-- * 'FinalQuote'
-- * 'OtherPunctuation'
--
-- These classes are defined in the
-- <http://www.unicode.org/reports/tr44/tr44-14.html#GC_Values_Table Unicode Character Database>,
-- part of the Unicode standard. The same document defines what is
-- and is not a \"Punctuation\".
--
-- ==== __Examples__
--
-- Basic usage:
--
-- >>> isPunctuation 'a'
-- False
-- >>> isPunctuation '7'
-- False
-- >>> isPunctuation '♥'
-- False
-- >>> isPunctuation '"'
-- True
-- >>> isPunctuation '?'
-- True
-- >>> isPunctuation '—'
-- True
--
isPunctuation :: Char -> Bool
isPunctuation c = punctuationCategory (generalCategory c)
  where
    punctuationCategory ConnectorPunctuation = True
    punctuationCategory DashPunctuation      = True
    punctuationCategory OpenPunctuation      = True
    punctuationCategory ClosePunctuation     = True
    punctuationCategory InitialQuote         = True
    punctuationCategory FinalQuote           = True
    punctuationCategory OtherPunctuation     = True
    punctuationCategory _                    = False
-- | Selects Unicode symbol characters, including mathematical and
-- currency symbols.
--
-- The result is 'True' exactly when the 'GeneralCategory' of the argument
-- is one of:
--
-- * 'MathSymbol'
-- * 'CurrencySymbol'
-- * 'ModifierSymbol'
-- * 'OtherSymbol'
--
-- These classes are defined in the
-- <http://www.unicode.org/reports/tr44/tr44-14.html#GC_Values_Table Unicode Character Database>,
-- part of the Unicode standard. The same document defines what is
-- and is not a \"Symbol\".
--
-- ==== __Examples__
--
-- Basic usage:
--
-- >>> isSymbol 'a'
-- False
-- >>> isSymbol '6'
-- False
-- >>> isSymbol '='
-- True
--
-- The definition of \"math symbol\" may be a little
-- counter-intuitive depending on one's background:
--
-- >>> isSymbol '+'
-- True
-- >>> isSymbol '-'
-- False
--
isSymbol :: Char -> Bool
isSymbol c = symbolCategory (generalCategory c)
  where
    symbolCategory MathSymbol     = True
    symbolCategory CurrencySymbol = True
    symbolCategory ModifierSymbol = True
    symbolCategory OtherSymbol    = True
    symbolCategory _              = False
-- | Convert a letter to the corresponding upper-case letter, if any.
-- Any other character is returned unchanged.
--
-- Only the simple upper-case mapping is applied, so the result is always a
-- single 'Char'.
{-# INLINE toUpper #-}
toUpper :: Char -> Char
-- Thin alias for @C.toSimpleUpperCase@.
-- NOTE(review): presumably left point-free so that the INLINE pragma also
-- applies to unsaturated uses such as @map toUpper@; confirm before
-- eta-expanding.
toUpper = C.toSimpleUpperCase
-- | Convert a letter to the corresponding lower-case letter, if any.
-- Any other character is returned unchanged.
--
-- Only the simple lower-case mapping is applied, so the result is always a
-- single 'Char'.
{-# INLINE toLower #-}
toLower :: Char -> Char
-- Thin alias for @C.toSimpleLowerCase@.
-- NOTE(review): presumably left point-free so that the INLINE pragma also
-- applies to unsaturated uses such as @map toLower@; confirm before
-- eta-expanding.
toLower = C.toSimpleLowerCase
-- | Convert a letter to the corresponding title-case or upper-case
-- letter, if any. (Title case differs from upper case only for a small
-- number of ligature letters.)
-- Any other character is returned unchanged.
--
-- Only the simple title-case mapping is applied, so the result is always a
-- single 'Char'.
{-# INLINE toTitle #-}
toTitle :: Char -> Char
-- Thin alias for @C.toSimpleTitleCase@.
-- NOTE(review): presumably left point-free so that the INLINE pragma also
-- applies to unsaturated uses such as @map toTitle@; confirm before
-- eta-expanding.
toTitle = C.toSimpleTitleCase
|