1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369
|
module Data.SCargot.Common ( -- $intro
-- * Identifier Syntaxes
parseR5RSIdent
, parseR6RSIdent
, parseR7RSIdent
, parseXIDIdentStrict
, parseXIDIdentGeneral
, parseHaskellIdent
, parseHaskellVariable
, parseHaskellConstructor
-- * Numeric Literal Parsers
, signed
, prefixedNumber
, signedPrefixedNumber
, binNumber
, signedBinNumber
, octNumber
, signedOctNumber
, decNumber
, signedDecNumber
, dozNumber
, signedDozNumber
, hexNumber
, signedHexNumber
-- ** Numeric Literals for Arbitrary Bases
, commonLispNumberAnyBase
, gnuM4NumberAnyBase
-- ** Source locations
, Location(..), Located(..), located, dLocation
) where
#if !MIN_VERSION_base(4,8,0)
import Control.Applicative hiding ((<|>), many)
#endif
import Control.Monad (guard)
import Data.Char
import Data.Text (Text)
import qualified Data.Text as T
import Text.Parsec
import Text.Parsec.Pos (newPos)
import Text.Parsec.Text (Parser)
-- | Parse an identifier according to the R5RS Scheme standard. This
-- will not normalize case, even though the R5RS standard specifies
-- that all identifiers be normalized to lower case first.
--
-- An R5RS identifier is, broadly speaking, alphabetic or numeric
-- and may include various symbols, but no escapes.
parseR5RSIdent :: Parser Text
parseR5RSIdent =
T.pack <$> ((:) <$> initial <*> many subsequent <|> peculiar)
where initial = letter <|> oneOf "!$%&*/:<=>?^_~"
subsequent = initial <|> digit <|> oneOf "+-.@"
peculiar = string "+" <|> string "-" <|> string "..."
hasCategory :: Char -> [GeneralCategory] -> Bool
hasCategory c cs = generalCategory c `elem` cs
-- | Parse an identifier according to the R6RS Scheme standard. An
-- R6RS identifier may include inline hexadecimal escape sequences
-- so that, for example, @foo@ is equivalent to @f\\x6f;o@, and is
-- more liberal than R5RS as to which Unicode characters it may
-- accept.
parseR6RSIdent :: Parser Text
parseR6RSIdent =
T.pack <$> ((:) <$> initial <*> many subsequent <|> peculiar)
where initial = constituent <|> oneOf "!$%&*/:<=>?^_~" <|> inlineHex
constituent = letter
<|> uniClass (\ c -> isLetter c ||
isSymbol c ||
hasCategory c
[ NonSpacingMark
, LetterNumber
, OtherNumber
, DashPunctuation
, ConnectorPunctuation
, OtherPunctuation
, PrivateUse
])
inlineHex = (chr . fromIntegral) <$> (string "\\x" *> hexNumber <* char ';')
subsequent = initial <|> digit <|> oneOf "+-.@"
<|> uniClass (\ c -> hasCategory c
[ DecimalNumber
, SpacingCombiningMark
, EnclosingMark
])
peculiar = string "+" <|> string "-" <|> string "..." <|>
((++) <$> string "->" <*> many subsequent)
uniClass :: (Char -> Bool) -> Parser Char
uniClass sp = satisfy (\ c -> c > '\x7f' && sp c)
-- | Parse an identifier according to the R7RS Scheme standard. An
-- R7RS identifier, in addition to a typical identifier format,
-- can also be a chunk of text surrounded by vertical bars that
-- can contain spaces and other characters. Unlike R6RS, it does
-- not allow escapes to be included in identifiers unless those
-- identifiers are surrounded by vertical bars.
parseR7RSIdent :: Parser Text
parseR7RSIdent = T.pack <$>
( (:) <$> initial <*> many subsequent
<|> char '|' *> many1 symbolElement <* char '|'
<|> peculiar
)
where initial = letter <|> specInit
specInit = oneOf "!$%&*/:<=>?^_~"
subsequent = initial <|> digit <|> specSubsequent
specSubsequent = expSign <|> oneOf ".@"
expSign = oneOf "+-"
symbolElement = noneOf "\\|"
<|> hexEscape
<|> mnemEscape
<|> ('|' <$ string "\\|")
hexEscape = chr . fromIntegral <$> (string "\\x" *> hexNumber <* char ';')
mnemEscape = '\a' <$ string "\\a"
<|> '\b' <$ string "\\b"
<|> '\t' <$ string "\\t"
<|> '\n' <$ string "\\n"
<|> '\r' <$ string "\\r"
peculiar = (:[]) <$> expSign
<|> cons2 <$> expSign <*> signSub <*> many subsequent
<|> cons3 <$> expSign
<*> char '.'
<*> dotSub
<*> many subsequent
<|> cons2 <$> char '.' <*> dotSub <*> many subsequent
dotSub = signSub <|> char '.'
signSub = initial <|> expSign <|> char '@'
cons2 a b cs = a : b : cs
cons3 a b c ds = a : b : c : ds
-- | Parse a Haskell variable identifier: a sequence of alphanumeric
-- characters, underscores, or single quote that begins with a
-- lower-case letter.
parseHaskellVariable :: Parser Text
parseHaskellVariable =
T.pack <$> ((:) <$> small <*> many (small <|>
large <|>
digit' <|>
char '\'' <|>
char '_'))
where small = satisfy isLower
large = satisfy isUpper
digit' = satisfy isDigit
-- | Parse a Haskell constructor: a sequence of alphanumeric
-- characters, underscores, or single quote that begins with an
-- upper-case letter.
parseHaskellConstructor :: Parser Text
parseHaskellConstructor =
T.pack <$> ((:) <$> large <*> many (small <|>
large <|>
digit' <|>
char '\'' <|>
char '_'))
where small = satisfy isLower
large = satisfy isUpper
digit' = satisfy isDigit
-- | Parse a Haskell identifer: a sequence of alphanumeric
-- characters, underscores, or a single quote. This matches both
-- variable and constructor names.
parseHaskellIdent :: Parser Text
parseHaskellIdent =
T.pack <$> ((:) <$> (large <|> small)
<*> many (small <|>
large <|>
digit' <|>
char '\'' <|>
char '_'))
where small = satisfy isLower
large = satisfy isUpper
digit' = satisfy isDigit
-- Ensure that a given character has the given Unicode category
hasCat :: [GeneralCategory] -> Parser Char
hasCat cats = satisfy (flip hasCategory cats)
xidStart :: [GeneralCategory]
xidStart = [ UppercaseLetter
, LowercaseLetter
, TitlecaseLetter
, ModifierLetter
, OtherLetter
, LetterNumber
]
xidContinue :: [GeneralCategory]
xidContinue = xidStart ++ [ NonSpacingMark
, SpacingCombiningMark
, DecimalNumber
, ConnectorPunctuation
]
-- | Parse an identifier of unicode characters of the form
-- @<XID_Start> <XID_Continue>*@, which corresponds strongly
-- to the identifiers found in most C-like languages. Note that
-- the @XID_Start@ category does not include the underscore,
-- so @__foo@ is not a valid XID identifier. To parse
-- identifiers that may include leading underscores, use
-- 'parseXIDIdentGeneral'.
parseXIDIdentStrict :: Parser Text
parseXIDIdentStrict = T.pack <$> ((:) <$> hasCat xidStart
<*> many (hasCat xidContinue))
-- | Parse an identifier of unicode characters of the form
-- @(<XID_Start> | '_') <XID_Continue>*@, which corresponds
-- strongly to the identifiers found in most C-like languages.
-- Unlike 'parseXIDIdentStrict', this will also accept an
-- underscore as leading character, which corresponds more
-- closely to programming languages like C and Java, but
-- deviates somewhat from the
-- <http://unicode.org/reports/tr31/ Unicode Identifier and
-- Pattern Syntax standard>.
parseXIDIdentGeneral :: Parser Text
parseXIDIdentGeneral = T.pack <$> ((:) <$> (hasCat xidStart <|> char '_')
<*> many (hasCat xidContinue))
-- | A helper function for defining parsers for arbitrary-base integers.
-- The first argument will be the base, and the second will be the
-- parser for the individual digits.
number :: Integer -> Parser Char -> Parser Integer
number base digits = foldl go 0 <$> many1 digits
where go x d = base * x + toInteger (value d)
value c
| c >= 'a' && c <= 'z' = 0xa + (fromEnum c - fromEnum 'a')
| c >= 'A' && c <= 'Z' = 0xa + (fromEnum c - fromEnum 'A')
| c >= '0' && c <= '9' = fromEnum c - fromEnum '0'
| c == '\x218a' = 0xa
| c == '\x218b' = 0xb
| otherwise = error ("Unknown letter in number: " ++ show c)
digitsFor :: Int -> [Char]
digitsFor n
| n <= 10 = take n ['0'..'9']
| n <= 36 = take (n-10) ['A'..'Z'] ++ take (n-10) ['a'..'z'] ++ ['0'..'9']
| otherwise = error ("Invalid base for parser: " ++ show n)
anyBase :: Integer -> Parser Integer
anyBase n = number n (oneOf (digitsFor (fromIntegral n)))
-- | A parser for Common Lisp's arbitrary-base number syntax, of
-- the form @#[base]r[number]@, where the base is given in
-- decimal. Note that this syntax begins with a @#@, which
-- means it might conflict with defined reader macros.
commonLispNumberAnyBase :: Parser Integer
commonLispNumberAnyBase = do
_ <- char '#'
n <- decNumber
guard (n >= 2 && n <= 36)
_ <- char 'r'
signed (anyBase n)
-- | A parser for GNU m4's arbitrary-base number syntax, of
-- the form @0r[base]:[number]@, where the base is given in
-- decimal.
gnuM4NumberAnyBase :: Parser Integer
gnuM4NumberAnyBase = do
_ <- string "0r"
n <- decNumber
guard (n >= 2 && n <= 36)
_ <- char ':'
signed (anyBase n)
sign :: Num a => Parser (a -> a)
sign = (pure id <* char '+')
<|> (pure negate <* char '-')
<|> pure id
-- | Given a parser for some kind of numeric literal, this will attempt to
-- parse a leading @+@ or a leading @-@ followed by the numeric literal,
-- and if a @-@ is found, negate that literal.
signed :: Num a => Parser a -> Parser a
signed p = ($) <$> sign <*> p
-- | Parses a number in the same way as 'prefixedNumber', with an optional
-- leading @+@ or @-@.
signedPrefixedNumber :: Parser Integer
signedPrefixedNumber = signed prefixedNumber
-- | Parses a number, determining which numeric base to use by examining
-- the literal's prefix: @0x@ for a hexadecimal number, @0z@ for a
-- dozenal number, @0o@ for an octal number, and @0b@ for a binary
-- number (as well as the upper-case versions of the same.) If the
-- base is omitted entirely, then it is treated as a decimal number.
prefixedNumber :: Parser Integer
prefixedNumber = (string "0x" <|> string "0X") *> hexNumber
<|> (string "0o" <|> string "0O") *> octNumber
<|> (string "0z" <|> string "0Z") *> dozNumber
<|> (string "0b" <|> string "0B") *> binNumber
<|> decNumber
-- | A parser for non-signed binary numbers
binNumber :: Parser Integer
binNumber = number 2 (char '0' <|> char '1')
-- | A parser for signed binary numbers, with an optional leading @+@ or @-@.
signedBinNumber :: Parser Integer
signedBinNumber = signed binNumber
-- | A parser for non-signed octal numbers
octNumber :: Parser Integer
octNumber = number 8 (oneOf "01234567")
-- | A parser for signed octal numbers, with an optional leading @+@ or @-@.
signedOctNumber :: Parser Integer
signedOctNumber = ($) <$> sign <*> octNumber
-- | A parser for non-signed decimal numbers
decNumber :: Parser Integer
decNumber = number 10 digit
-- | A parser for signed decimal numbers, with an optional leading @+@ or @-@.
signedDecNumber :: Parser Integer
signedDecNumber = ($) <$> sign <*> decNumber
dozDigit :: Parser Char
dozDigit = digit <|> oneOf "AaBb\x218a\x218b"
-- | A parser for non-signed duodecimal (dozenal) numbers. This understands both
-- the ASCII characters @'a'@ and @'b'@ and the Unicode characters @'\x218a'@ (↊)
-- and @'\x218b'@ (↋) as digits with the decimal values @10@ and @11@
-- respectively.
dozNumber :: Parser Integer
dozNumber = number 12 dozDigit
-- | A parser for signed duodecimal (dozenal) numbers, with an optional leading @+@ or @-@.
signedDozNumber :: Parser Integer
signedDozNumber = ($) <$> sign <*> dozNumber
-- | A parser for non-signed hexadecimal numbers
hexNumber :: Parser Integer
hexNumber = number 16 hexDigit
-- | A parser for signed hexadecimal numbers, with an optional leading @+@ or @-@.
signedHexNumber :: Parser Integer
signedHexNumber = ($) <$> sign <*> hexNumber
-- |
data Location = Span !SourcePos !SourcePos
deriving (Eq, Ord, Show)
-- | Add support for source locations while parsing S-expressions, as described in this
-- <https://www.reddit.com/r/haskell/comments/4x22f9/labelling_ast_nodes_with_locations/d6cmdy9/ Reddit>
-- thread.
data Located a = At !Location a
deriving (Eq, Ord, Show)
-- | Adds a source span to a parser.
located :: Parser a -> Parser (Located a)
located parser = do
begin <- getPosition
result <- parser
end <- getPosition
return $ At (Span begin end) result
-- | A default location value
dLocation :: Location
dLocation = Span dPos dPos
where dPos = newPos "" 0 0
{- $intro
This module contains a selection of parsers for different kinds of
identifiers and literals, from which more elaborate parsers can be
assembled. These can afford the user a quick way of building parsers
for different atom types.
-}
|