File: SAX.hs

package info (click to toggle)
haskell-hexpat 0.20.13-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 916 kB
  • sloc: ansic: 12,303; haskell: 3,457; xml: 1,109; makefile: 5; sh: 5
file content (356 lines) | stat: -rw-r--r-- 15,634 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
{-# LANGUAGE DeriveDataTypeable, TypeSynonymInstances, CPP, ScopedTypeVariables, FlexibleInstances, GADTs #-}
{-# OPTIONS_GHC -fno-cse -fno-full-laziness #-}

-- hexpat, a Haskell wrapper for expat
-- Copyright (C) 2008 Evan Martin <martine@danga.com>
-- Copyright (C) 2009 Stephen Blackheath <http://blacksapphire.com/antispam>

-- | This module provides functions to parse an XML document to a lazy
-- stream of SAX events.
module Text.XML.Expat.SAX (
  -- * XML primitives
  Encoding(..),
  XMLParseError(..),
  XMLParseLocation(..),

  -- * SAX-style parse
  ParseOptions(..),
  SAXEvent(..),

  parse,
  parseG,
  parseLocations,
  parseLocationsG,
  parseLocationsThrowing,
  parseThrowing,
  defaultParseOptions,

  -- * Variants that throw exceptions
  XMLParseException(..),

  -- * Abstraction of string types
  GenericXMLString(..)
  ) where

import Control.Concurrent.MVar
import Control.Exception as Exc
import Text.XML.Expat.Internal.IO
import Data.Bits
import Data.ByteString (ByteString)
import qualified Data.ByteString as B
import qualified Data.ByteString.Lazy as L
import qualified Data.ByteString.Internal as I
import Data.Int
import Data.ByteString.Internal (c2w, w2c, c_strlen)
import qualified Data.Monoid as M
import qualified Data.Text as T
import qualified Data.Text.Encoding as TE
import qualified Codec.Binary.UTF8.String as U8
import Data.List.Class (List(..), ListItem(..), cons, fromList, mapL)
import Data.Typeable
import Data.Word
import Control.Applicative
import Control.DeepSeq
import Control.Monad
import System.IO.Unsafe
import Foreign.C
import Foreign.ForeignPtr
import Foreign.Marshal.Array
import Foreign.Ptr
import Foreign.Storable


data ParseOptions tag text = ParseOptions
    { overrideEncoding :: Maybe Encoding
          -- ^ The encoding parameter, if provided, overrides the document's
          -- encoding declaration.
    , entityDecoder  :: Maybe (tag -> Maybe text)
          -- ^ If provided, entity references (i.e. @&nbsp;@ and friends) will
          -- be decoded into text using the supplied lookup function
    }

defaultParseOptions :: ParseOptions tag text
defaultParseOptions = ParseOptions Nothing Nothing


-- | An abstraction for any string type you want to use as xml text (that is,
-- attribute values or element text content). If you want to use a
-- new string type with /hexpat/, you must make it an instance of
-- 'GenericXMLString'.
class (M.Monoid s, Eq s) => GenericXMLString s where
    gxNullString :: s -> Bool
    gxToString :: s -> String
    gxFromString :: String -> s
    gxFromChar :: Char -> s
    gxHead :: s -> Char
    gxTail :: s -> s
    gxBreakOn :: Char -> s -> (s, s)
    gxFromByteString :: B.ByteString -> s
    gxToByteString :: s -> B.ByteString

instance GenericXMLString String where
    gxNullString = null
    gxToString = id
    gxFromString = id
    gxFromChar c = [c]
    gxHead = head
    gxTail = tail
    gxBreakOn c = break (==c)
    gxFromByteString = U8.decode . B.unpack
    gxToByteString = B.pack . map c2w . U8.encodeString

instance GenericXMLString B.ByteString where
    gxNullString = B.null
    gxToString = U8.decodeString . map w2c . B.unpack
    gxFromString = B.pack . map c2w . U8.encodeString
    gxFromChar = B.singleton . c2w
    gxHead = w2c . B.head
    gxTail = B.tail
    gxBreakOn c = B.break (== c2w c)
    gxFromByteString = id
    gxToByteString = id

instance GenericXMLString T.Text where
    gxNullString = T.null
    gxToString = T.unpack
    gxFromString = T.pack
    gxFromChar = T.singleton
    gxHead = T.head
    gxTail = T.tail
#if MIN_VERSION_text(0,11,0)
    gxBreakOn c = T.break (==c)
#elif MIN_VERSION_text(0,10,0)
    -- breakBy gets renamed to break between 0.10.0.0 and 0.10.0.1.
    -- There's no 'break' function that is consistent between these two
    -- versions so we work around it using other functions.
    gxBreakOn c t = (T.takeWhile (/=c) t, T.dropWhile (/=c) t)
#else
    gxBreakOn c = T.breakBy (==c)
#endif
    gxFromByteString = TE.decodeUtf8
    gxToByteString = TE.encodeUtf8

data SAXEvent tag text =
    XMLDeclaration text (Maybe text) (Maybe Bool) |
    StartElement tag [(tag, text)] |
    EndElement tag |
    CharacterData text |
    StartCData |
    EndCData |
    ProcessingInstruction text text |
    Comment text |
    FailDocument XMLParseError
    deriving (Eq, Show)

instance (NFData tag, NFData text) => NFData (SAXEvent tag text) where
    rnf (XMLDeclaration ver mEnc mSD) = rnf ver `seq` rnf mEnc `seq` rnf mSD
    rnf (StartElement tag atts) = rnf tag `seq` rnf atts
    rnf (EndElement tag) = rnf tag
    rnf (CharacterData text) = rnf text
    rnf StartCData = ()
    rnf EndCData = ()
    rnf (ProcessingInstruction target text) = rnf target `seq` rnf text
    rnf (Comment text) = rnf text
    rnf (FailDocument err) = rnf err

-- | Parse a generalized list of ByteStrings containing XML to SAX events.
-- In the event of an error, FailDocument is the last element of the output list.
parseG :: forall tag text l . (GenericXMLString tag, GenericXMLString text, List l) =>
          ParseOptions tag text -- ^ Parse options
       -> l ByteString          -- ^ Input text (a lazy ByteString)
       -> l (SAXEvent tag text)
{-# NOINLINE parseG #-}
parseG opts inputBlocks = mapL (return . fst) $ parseImpl opts inputBlocks False noExtra failureA
  where noExtra _ offset = return ((), offset)
        failureA _ = return ()

-- | Parse a generalized list of ByteStrings containing XML to SAX events.
-- In the event of an error, FailDocument is the last element of the output list.
parseLocationsG :: forall tag text l . (GenericXMLString tag, GenericXMLString text, List l) =>
                   ParseOptions tag text -- ^ Parse options
                -> l ByteString          -- ^ Input text (a lazy ByteString)
                -> l (SAXEvent tag text, XMLParseLocation)
{-# NOINLINE parseLocationsG #-}
parseLocationsG opts inputBlocks = parseImpl opts inputBlocks True fetchLocation id
  where
    fetchLocation pBuf offset = do
        [a, b, c, d] <- peekArray 4 (pBuf `plusPtr` offset :: Ptr Int64)
        return (XMLParseLocation a b c d, offset + 32)

parseImpl :: forall a tag text l . (GenericXMLString tag, GenericXMLString text, List l) =>
             ParseOptions tag text -- ^ Parse options
          -> l ByteString          -- ^ Input text (a lazy ByteString)
          -> Bool                  -- ^ True to add locations to binary output
          -> (Ptr Word8 -> Int -> IO (a, Int)) -- ^ Fetch extra data
          -> (IO XMLParseLocation -> IO a)     -- ^ Fetch a value for failure case 
          -> l (SAXEvent tag text, a)
parseImpl opts inputBlocks addLocations extra failureA = runParser inputBlocks parse cacheRef
  where
    (parse, getLocation, cacheRef) = unsafePerformIO $ do
        (parse, getLocation) <- hexpatNewParser
            (overrideEncoding opts)
            ((\decode -> fmap gxToByteString . decode . gxFromByteString) <$> entityDecoder opts)
            addLocations

        cacheRef <- newMVar Nothing
        return (parse, getLocation, cacheRef)

    runParser iblks parse cacheRef = joinL $ do
        li <- runList iblks
        return $ unsafePerformIO $ do
            mCached <- takeMVar cacheRef
            case mCached of
                Just l -> do
                    putMVar cacheRef mCached
                    return l
                Nothing -> do
                    (saxen, rema) <- case li of
                        Nil         -> do
                            (buf, len, mError) <- parse B.empty True
                            saxen <- parseBuf buf len extra
                            rema <- handleFailure mError mzero
                            return (saxen, rema)
                        Cons blk t -> {-unsafeInterleaveIO $-} do
                            (buf, len, mError) <- parse blk False
                            saxen <- parseBuf buf len extra
                            cacheRef' <- newMVar Nothing
                            rema <- handleFailure mError (runParser t parse cacheRef')
                            return (saxen, rema)
                    let l = fromList saxen `mplus` rema
                    putMVar cacheRef (Just l)
                    return l
      where
        handleFailure (Just err) _ = do a <- failureA getLocation
                                        return $ (FailDocument err, a) `cons` mzero
        handleFailure Nothing    l = return l

parseBuf :: (GenericXMLString tag, GenericXMLString text) =>
            ForeignPtr Word8 -> CInt -> (Ptr Word8 -> Int -> IO (a, Int)) -> IO [(SAXEvent tag text, a)]
parseBuf buf _ processExtra = withForeignPtr buf $ \pBuf -> doit [] pBuf 0
  where
    roundUp32 offset = (offset + 3) .&. complement 3
    doit acc pBuf offset0 = offset0 `seq` do
        typ <- peek (pBuf `plusPtr` offset0 :: Ptr Word32)
        (a, offset) <- processExtra pBuf (offset0 + 4)
        case typ of
            0 -> return (reverse acc)
            1 -> do
                nAtts <- peek (pBuf `plusPtr` offset :: Ptr Word32)
                let pName = pBuf `plusPtr` (offset + 4)
                lName <- fromIntegral <$> c_strlen pName
                let name = gxFromByteString $ I.fromForeignPtr buf (offset + 4) lName
                (atts, offset') <- foldM (\(atts, offset) _ -> do
                        let pAtt = pBuf `plusPtr` offset
                        lAtt <- fromIntegral <$> c_strlen pAtt
                        let att = gxFromByteString $ I.fromForeignPtr buf offset lAtt
                            offset' = offset + lAtt + 1
                            pValue = pBuf `plusPtr` offset'
                        lValue <- fromIntegral <$> c_strlen pValue
                        let value = gxFromByteString $ I.fromForeignPtr buf offset' lValue
                        return ((att, value):atts, offset' + lValue + 1)
                    ) ([], offset + 4 + lName + 1) [1,3..nAtts]
                doit ((StartElement name (reverse atts), a) : acc) pBuf (roundUp32 offset')
            2 -> do
                let pName = pBuf `plusPtr` offset
                lName <- fromIntegral <$> c_strlen pName
                let name = gxFromByteString $ I.fromForeignPtr buf offset lName
                    offset' = offset + lName + 1
                doit ((EndElement name, a) : acc) pBuf (roundUp32 offset')
            3 -> do
                len <- fromIntegral <$> peek (pBuf `plusPtr` offset :: Ptr Word32)
                let text = gxFromByteString $ I.fromForeignPtr buf (offset + 4) len
                    offset' = offset + 4 + len
                doit ((CharacterData text, a) : acc) pBuf (roundUp32 offset')
            4 -> do
                let pEnc = pBuf `plusPtr` offset
                lEnc <- fromIntegral <$> c_strlen pEnc
                let enc = gxFromByteString $ I.fromForeignPtr buf offset lEnc
                    offset' = offset + lEnc + 1
                    pVer = pBuf `plusPtr` offset'
                pVerFirst <- peek (castPtr pVer :: Ptr Word8)
                (mVer, offset'') <- case pVerFirst of
                    0 -> return (Nothing, offset' + 1)
                    1 -> do
                        lVer <- fromIntegral <$> c_strlen (pVer `plusPtr` 1)
                        return (Just $ gxFromByteString $ I.fromForeignPtr buf (offset' + 1) lVer, offset' + 1 + lVer + 1)
                    _ -> error "hexpat: bad data from C land"
                cSta <- peek (pBuf `plusPtr` offset'' :: Ptr Int8)
                let sta = if cSta < 0  then Nothing else
                          if cSta == 0 then Just False else
                                            Just True
                doit ((XMLDeclaration enc mVer sta, a) : acc) pBuf (roundUp32 (offset'' + 1))
            5 -> doit ((StartCData, a) : acc) pBuf offset
            6 -> doit ((EndCData, a) : acc) pBuf offset
            7 -> do
                let pTarget = pBuf `plusPtr` offset
                lTarget <- fromIntegral <$> c_strlen pTarget
                let target = gxFromByteString $ I.fromForeignPtr buf offset lTarget
                    offset' = offset + lTarget + 1
                    pData = pBuf `plusPtr` offset'
                lData <- fromIntegral <$> c_strlen pData
                let dat = gxFromByteString $ I.fromForeignPtr buf offset' lData
                doit ((ProcessingInstruction target dat, a) : acc) pBuf (roundUp32 (offset' + lData + 1))
            8 -> do
                let pText = pBuf `plusPtr` offset
                lText <- fromIntegral <$> c_strlen pText
                let text = gxFromByteString $ I.fromForeignPtr buf offset lText
                doit ((Comment text, a) : acc) pBuf (roundUp32 (offset + lText + 1))
            _ -> error "hexpat: bad data from C land"

-- | Lazily parse XML to SAX events. In the event of an error, FailDocument is
-- the last element of the output list.
parse :: (GenericXMLString tag, GenericXMLString text) =>
         ParseOptions tag text  -- ^ Parse options
      -> L.ByteString           -- ^ Input text (a lazy ByteString)
      -> [SAXEvent tag text]
parse opts input = parseG opts (L.toChunks input)


-- | An exception indicating an XML parse error, used by the /..Throwing/ variants.
data XMLParseException = XMLParseException XMLParseError
    deriving (Eq, Show, Typeable)

instance Exception XMLParseException where

-- | A variant of parseSAX that gives a document location with each SAX event.
parseLocations :: (GenericXMLString tag, GenericXMLString text) =>
                  ParseOptions tag text  -- ^ Parse options
               -> L.ByteString            -- ^ Input text (a lazy ByteString)
               -> [(SAXEvent tag text, XMLParseLocation)]
parseLocations opts input = parseLocationsG opts (L.toChunks input)


-- | Lazily parse XML to SAX events. In the event of an error, throw
-- 'XMLParseException'.
--
-- @parseThrowing@ can throw an exception from pure code, which is generally a bad
-- way to handle errors, because Haskell\'s lazy evaluation means it\'s hard to
-- predict where it will be thrown from.  However, it may be acceptable in
-- situations where it's not expected during normal operation, depending on the
-- design of your program.
parseThrowing :: (GenericXMLString tag, GenericXMLString text) =>
                 ParseOptions tag text  -- ^ Parse options
              -> L.ByteString            -- ^ input text (a lazy ByteString)
              -> [SAXEvent tag text]
parseThrowing opts bs = map freakOut $ parse opts bs
  where
    freakOut (FailDocument err) = Exc.throw $ XMLParseException err
    freakOut other = other


-- | A variant of parseSAX that gives a document location with each SAX event.
-- In the event of an error, throw 'XMLParseException'.
--
-- @parseLocationsThrowing@ can throw an exception from pure code, which is generally a bad
-- way to handle errors, because Haskell\'s lazy evaluation means it\'s hard to
-- predict where it will be thrown from.  However, it may be acceptable in
-- situations where it's not expected during normal operation, depending on the
-- design of your program.
parseLocationsThrowing :: (GenericXMLString tag, GenericXMLString text) =>
                          ParseOptions tag text  -- ^ Optional encoding override
                       -> L.ByteString            -- ^ Input text (a lazy ByteString)
                       -> [(SAXEvent tag text, XMLParseLocation)]
parseLocationsThrowing opts bs = map freakOut $ parseLocations opts bs
  where
    freakOut (FailDocument err, _) = Exc.throw $ XMLParseException err
    freakOut other = other