File: Common.hs

package info (click to toggle)
haskell-s-cargot 0.1.6.0-2
  • links: PTS
  • area: main
  • in suites: forky, sid, trixie
  • size: 268 kB
  • sloc: haskell: 1,970; makefile: 5
file content (369 lines) | stat: -rw-r--r-- 14,961 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
module Data.SCargot.Common ( -- $intro
                           -- * Identifier Syntaxes
                             parseR5RSIdent
                           , parseR6RSIdent
                           , parseR7RSIdent
                           , parseXIDIdentStrict
                           , parseXIDIdentGeneral
                           , parseHaskellIdent
                           , parseHaskellVariable
                           , parseHaskellConstructor
                             -- * Numeric Literal Parsers
                           , signed
                           , prefixedNumber
                           , signedPrefixedNumber
                           , binNumber
                           , signedBinNumber
                           , octNumber
                           , signedOctNumber
                           , decNumber
                           , signedDecNumber
                           , dozNumber
                           , signedDozNumber
                           , hexNumber
                           , signedHexNumber
                             -- ** Numeric Literals for Arbitrary Bases
                           , commonLispNumberAnyBase
                           , gnuM4NumberAnyBase
                             -- ** Source locations
                           , Location(..), Located(..), located, dLocation
                           ) where

#if !MIN_VERSION_base(4,8,0)
import Control.Applicative hiding ((<|>), many)
#endif
import           Control.Monad (guard)
import           Data.Char
import           Data.Text (Text)
import qualified Data.Text as T
import           Text.Parsec
import           Text.Parsec.Pos  (newPos)
import           Text.Parsec.Text (Parser)

-- | Parse an identifier according to the R5RS Scheme standard. This
--   will not normalize case, even though the R5RS standard specifies
--   that all identifiers be normalized to lower case first.
--
--   An R5RS identifier is, broadly speaking, alphabetic or numeric
--   and may include various symbols, but no escapes.
parseR5RSIdent :: Parser Text
parseR5RSIdent =
  T.pack <$> ((:) <$> initial <*> many subsequent <|> peculiar)
  where initial    = letter <|> oneOf "!$%&*/:<=>?^_~"
        subsequent = initial <|> digit <|> oneOf "+-.@"
        peculiar   = string "+" <|> string "-" <|> string "..."

hasCategory :: Char -> [GeneralCategory] -> Bool
hasCategory c cs = generalCategory c `elem` cs

-- | Parse an identifier according to the R6RS Scheme standard. An
--   R6RS identifier may include inline hexadecimal escape sequences
--   so that, for example, @foo@ is equivalent to @f\\x6f;o@, and is
--   more liberal than R5RS as to which Unicode characters it may
--   accept.
parseR6RSIdent :: Parser Text
parseR6RSIdent =
  T.pack <$> ((:) <$> initial <*> many subsequent <|> peculiar)
  where initial = constituent <|> oneOf "!$%&*/:<=>?^_~" <|> inlineHex
        constituent = letter
                   <|> uniClass (\ c -> isLetter c ||
                                        isSymbol c ||
                                        hasCategory c
                                          [ NonSpacingMark
                                          , LetterNumber
                                          , OtherNumber
                                          , DashPunctuation
                                          , ConnectorPunctuation
                                          , OtherPunctuation
                                          , PrivateUse
                                          ])
        inlineHex   = (chr . fromIntegral) <$> (string "\\x" *> hexNumber <* char ';')
        subsequent  = initial <|> digit <|> oneOf "+-.@"
                   <|> uniClass (\ c -> hasCategory c
                                          [ DecimalNumber
                                          , SpacingCombiningMark
                                          , EnclosingMark
                                          ])
        peculiar    = string "+" <|> string "-" <|> string "..." <|>
                      ((++) <$> string "->" <*> many subsequent)
        uniClass :: (Char -> Bool) -> Parser Char
        uniClass sp = satisfy (\ c -> c > '\x7f' && sp c)

-- | Parse an identifier according to the R7RS Scheme standard. An
--   R7RS identifier, in addition to a typical identifier format,
--   can also be a chunk of text surrounded by vertical bars that
--   can contain spaces and other characters. Unlike R6RS, it does
--   not allow escapes to be included in identifiers unless those
--   identifiers are surrounded by vertical bars.
parseR7RSIdent :: Parser Text
parseR7RSIdent =  T.pack <$>
          (  (:) <$> initial <*> many subsequent
         <|> char '|' *> many1 symbolElement <* char '|'
         <|> peculiar
          )
  where initial = letter <|> specInit
        specInit = oneOf "!$%&*/:<=>?^_~"
        subsequent = initial <|> digit <|> specSubsequent
        specSubsequent = expSign <|> oneOf ".@"
        expSign = oneOf "+-"
        symbolElement =  noneOf "\\|"
                     <|> hexEscape
                     <|> mnemEscape
                     <|> ('|' <$ string "\\|")
        hexEscape = chr . fromIntegral <$> (string "\\x" *> hexNumber <* char ';')
        mnemEscape =  '\a' <$ string "\\a"
                  <|> '\b' <$ string "\\b"
                  <|> '\t' <$ string "\\t"
                  <|> '\n' <$ string "\\n"
                  <|> '\r' <$ string "\\r"
        peculiar =  (:[]) <$> expSign
                <|> cons2 <$> expSign <*> signSub <*> many subsequent
                <|> cons3 <$> expSign
                          <*> char '.'
                          <*> dotSub
                          <*> many subsequent
                <|> cons2 <$> char '.' <*> dotSub <*> many subsequent
        dotSub = signSub <|> char '.'
        signSub = initial <|> expSign <|> char '@'
        cons2 a b cs   = a : b : cs
        cons3 a b c ds = a : b : c : ds

-- | Parse a Haskell variable identifier: a sequence of alphanumeric
--   characters, underscores, or single quote that begins with a
--   lower-case letter.
parseHaskellVariable :: Parser Text
parseHaskellVariable =
  T.pack <$> ((:) <$> small <*> many (small <|>
                                      large <|>
                                      digit' <|>
                                      char '\'' <|>
                                      char '_'))
  where small = satisfy isLower
        large = satisfy isUpper
        digit' = satisfy isDigit

-- | Parse a Haskell constructor: a sequence of alphanumeric
--   characters, underscores, or single quote that begins with an
--   upper-case letter.
parseHaskellConstructor :: Parser Text
parseHaskellConstructor =
  T.pack <$> ((:) <$> large <*> many (small <|>
                                      large <|>
                                      digit' <|>
                                      char '\'' <|>
                                      char '_'))
  where small = satisfy isLower
        large = satisfy isUpper
        digit' = satisfy isDigit

-- | Parse a Haskell identifer: a sequence of alphanumeric
--   characters, underscores, or a single quote. This matches both
--   variable and constructor names.
parseHaskellIdent :: Parser Text
parseHaskellIdent =
  T.pack <$> ((:) <$> (large <|> small)
                  <*> many (small <|>
                            large <|>
                            digit' <|>
                            char '\'' <|>
                            char '_'))
  where small = satisfy isLower
        large = satisfy isUpper
        digit' = satisfy isDigit

-- Ensure that a given character has the given Unicode category
hasCat :: [GeneralCategory] -> Parser Char
hasCat cats = satisfy (flip hasCategory cats)

xidStart :: [GeneralCategory]
xidStart = [ UppercaseLetter
           , LowercaseLetter
           , TitlecaseLetter
           , ModifierLetter
           , OtherLetter
           , LetterNumber
           ]

xidContinue :: [GeneralCategory]
xidContinue = xidStart ++ [ NonSpacingMark
                          , SpacingCombiningMark
                          , DecimalNumber
                          , ConnectorPunctuation
                          ]

-- | Parse an identifier of unicode characters of the form
--   @<XID_Start> <XID_Continue>*@, which corresponds strongly
--   to the identifiers found in most C-like languages. Note that
--   the @XID_Start@ category does not include the underscore,
--   so @__foo@ is not a valid XID identifier. To parse
--   identifiers that may include leading underscores, use
--   'parseXIDIdentGeneral'.
parseXIDIdentStrict :: Parser Text
parseXIDIdentStrict = T.pack <$> ((:) <$> hasCat xidStart
                                  <*> many (hasCat xidContinue))

-- | Parse an identifier of unicode characters of the form
--   @(<XID_Start> | '_') <XID_Continue>*@, which corresponds
--   strongly to the identifiers found in most C-like languages.
--   Unlike 'parseXIDIdentStrict', this will also accept an
--   underscore as leading character, which corresponds more
--   closely to programming languages like C and Java, but
--   deviates somewhat from the
--   <http://unicode.org/reports/tr31/ Unicode Identifier and
--   Pattern Syntax standard>.
parseXIDIdentGeneral :: Parser Text
parseXIDIdentGeneral = T.pack <$> ((:) <$> (hasCat xidStart <|> char '_')
                                       <*> many (hasCat xidContinue))

-- | A helper function for defining parsers for arbitrary-base integers.
--   The first argument will be the base, and the second will be the
--   parser for the individual digits.
number :: Integer -> Parser Char -> Parser Integer
number base digits = foldl go 0 <$> many1 digits
  where go x d = base * x + toInteger (value d)
        value c
          | c >= 'a' && c <= 'z' = 0xa + (fromEnum c - fromEnum 'a')
          | c >= 'A' && c <= 'Z' = 0xa + (fromEnum c - fromEnum 'A')
          | c >= '0' && c <= '9' = fromEnum c - fromEnum '0'
          | c == '\x218a' = 0xa
          | c == '\x218b' = 0xb
          | otherwise = error ("Unknown letter in number: " ++ show c)

digitsFor :: Int -> [Char]
digitsFor n
  | n <= 10   = take n ['0'..'9']
  | n <= 36   = take (n-10) ['A'..'Z'] ++ take (n-10) ['a'..'z'] ++ ['0'..'9']
  | otherwise = error ("Invalid base for parser: " ++ show n)

anyBase :: Integer -> Parser Integer
anyBase n = number n (oneOf (digitsFor (fromIntegral n)))

-- | A parser for Common Lisp's arbitrary-base number syntax, of
--   the form @#[base]r[number]@, where the base is given in
--   decimal. Note that this syntax begins with a @#@, which
--   means it might conflict with defined reader macros.
commonLispNumberAnyBase :: Parser Integer
commonLispNumberAnyBase = do
  _ <- char '#'
  n <- decNumber
  guard (n >= 2 && n <= 36)
  _ <- char 'r'
  signed (anyBase n)

-- | A parser for GNU m4's arbitrary-base number syntax, of
--   the form @0r[base]:[number]@, where the base is given in
--   decimal.
gnuM4NumberAnyBase :: Parser Integer
gnuM4NumberAnyBase = do
  _ <- string "0r"
  n <- decNumber
  guard (n >= 2 && n <= 36)
  _ <- char ':'
  signed (anyBase n)

sign :: Num a => Parser (a -> a)
sign =  (pure id     <* char '+')
    <|> (pure negate <* char '-')
    <|> pure id

-- | Given a parser for some kind of numeric literal, this will attempt to
--   parse a leading @+@ or a leading @-@ followed by the numeric literal,
--   and if a @-@ is found, negate that literal.
signed :: Num a => Parser a -> Parser a
signed p = ($) <$> sign <*> p

-- | Parses a number in the same way as 'prefixedNumber', with an optional
--   leading @+@ or @-@.
signedPrefixedNumber :: Parser Integer
signedPrefixedNumber = signed prefixedNumber

-- | Parses a number, determining which numeric base to use by examining
--   the literal's prefix: @0x@ for a hexadecimal number, @0z@ for a
--   dozenal number, @0o@ for an octal number, and @0b@ for a binary
--   number (as well as the upper-case versions of the same.) If the
--   base is omitted entirely, then it is treated as a decimal number.
prefixedNumber :: Parser Integer
prefixedNumber =  (string "0x" <|> string "0X") *> hexNumber
              <|> (string "0o" <|> string "0O") *> octNumber
              <|> (string "0z" <|> string "0Z") *> dozNumber
              <|> (string "0b" <|> string "0B") *> binNumber
              <|> decNumber

-- | A parser for non-signed binary numbers
binNumber :: Parser Integer
binNumber = number 2 (char '0' <|> char '1')

-- | A parser for signed binary numbers, with an optional leading @+@ or @-@.
signedBinNumber :: Parser Integer
signedBinNumber = signed binNumber

-- | A parser for non-signed octal numbers
octNumber :: Parser Integer
octNumber = number 8 (oneOf "01234567")

-- | A parser for signed octal numbers, with an optional leading @+@ or @-@.
signedOctNumber :: Parser Integer
signedOctNumber = ($) <$> sign <*> octNumber

-- | A parser for non-signed decimal numbers
decNumber :: Parser Integer
decNumber = number 10 digit

-- | A parser for signed decimal numbers, with an optional leading @+@ or @-@.
signedDecNumber :: Parser Integer
signedDecNumber = ($) <$> sign <*> decNumber

dozDigit :: Parser Char
dozDigit = digit <|> oneOf "AaBb\x218a\x218b"

-- | A parser for non-signed duodecimal (dozenal) numbers. This understands both
--   the ASCII characters @'a'@ and @'b'@ and the Unicode characters @'\x218a'@ (↊)
--   and @'\x218b'@ (↋) as digits with the decimal values @10@ and @11@
--   respectively.
dozNumber :: Parser Integer
dozNumber = number 12 dozDigit

-- | A parser for signed duodecimal (dozenal) numbers, with an optional leading @+@ or @-@.
signedDozNumber :: Parser Integer
signedDozNumber = ($) <$> sign <*> dozNumber

-- | A parser for non-signed hexadecimal numbers
hexNumber :: Parser Integer
hexNumber = number 16 hexDigit

-- | A parser for signed hexadecimal numbers, with an optional leading @+@ or @-@.
signedHexNumber :: Parser Integer
signedHexNumber = ($) <$> sign <*> hexNumber


-- |
data Location = Span !SourcePos !SourcePos
  deriving (Eq, Ord, Show)

-- | Add support for source locations while parsing S-expressions, as described in this
--   <https://www.reddit.com/r/haskell/comments/4x22f9/labelling_ast_nodes_with_locations/d6cmdy9/ Reddit>
-- thread.
data Located a = At !Location a
  deriving (Eq, Ord, Show)

-- | Adds a source span to a parser.
located :: Parser a -> Parser (Located a)
located parser = do
  begin <- getPosition
  result <- parser
  end <- getPosition
  return $ At (Span begin end) result

-- | A default location value
dLocation :: Location
dLocation = Span dPos dPos
  where dPos = newPos "" 0 0

{- $intro

This module contains a selection of parsers for different kinds of
identifiers and literals, from which more elaborate parsers can be
assembled. These can afford the user a quick way of building parsers
for different atom types.

-}