File: IO.hs

package info (click to toggle)
haskell-hexpat 0.20.13-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 916 kB
  • sloc: ansic: 12,303; haskell: 3,457; xml: 1,109; makefile: 5; sh: 5
file content (221 lines) | stat: -rw-r--r-- 8,465 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
{-# LANGUAGE ForeignFunctionInterface, EmptyDataDecls #-}
{-# OPTIONS_GHC -fno-cse -fno-full-laziness #-}


-- | Low-level interface to Expat. Unless speed is paramount, this should
-- normally be avoided in favour of the interfaces provided by
-- 'Text.XML.Expat.SAX' and 'Text.XML.Expat.Tree', etc.
module Text.XML.Expat.Internal.IO (
  HParser,
  hexpatNewParser,
  encodingToString,
  Encoding(..),
  XMLParseError(..),
  XMLParseLocation(..)
  ) where

import Control.Applicative
import Control.DeepSeq
import qualified Data.ByteString as B
import qualified Data.ByteString.Internal as I
import Data.Int
import Data.Word
import Foreign
import Foreign.C


data Parser_struct
type ParserPtr = Ptr Parser_struct

data Encoding = ASCII | UTF8 | UTF16 | ISO88591
encodingToString :: Encoding -> String
encodingToString ASCII    = "US-ASCII"
encodingToString UTF8     = "UTF-8"
encodingToString UTF16    = "UTF-16"
encodingToString ISO88591 = "ISO-8859-1"

withOptEncoding :: Maybe Encoding -> (CString -> IO a) -> IO a
withOptEncoding Nothing    f = f nullPtr
withOptEncoding (Just enc) f = withCString (encodingToString enc) f

-- ByteString.useAsCStringLen is almost what we need, but C2HS wants a CInt
-- instead of an Int.
withBStringLen :: B.ByteString -> ((CString, CInt) -> IO a) -> IO a
withBStringLen bs f = do
  B.useAsCStringLen bs $ \(str, len) -> f (str, fromIntegral len)

unStatus :: CInt -> Bool
unStatus 0 = False
unStatus _ = True

getError :: ParserPtr -> IO XMLParseError
getError pp = do
    code <- xmlGetErrorCode pp
    cerr <- xmlErrorString code
    err <- peekCString cerr
    loc <- getParseLocation pp
    return $ XMLParseError err loc

-- |Obtain C value from Haskell 'Bool'.
--
cFromBool :: Num a => Bool -> a
cFromBool = fromBool

-- | Parse error, consisting of message text and error location
data XMLParseError = XMLParseError String XMLParseLocation deriving (Eq, Show)

instance NFData XMLParseError where
    rnf (XMLParseError msg loc) = rnf (msg, loc)

-- | Specifies a location of an event within the input text
data XMLParseLocation = XMLParseLocation {
        xmlLineNumber   :: Int64,  -- ^ Line number of the event
        xmlColumnNumber :: Int64,  -- ^ Column number of the event
        xmlByteIndex    :: Int64,  -- ^ Byte index of event from start of document
        xmlByteCount    :: Int64   -- ^ The number of bytes in the event
    }
    deriving (Eq, Show)

instance NFData XMLParseLocation where
    rnf (XMLParseLocation lin col ind cou) = rnf (lin, col, ind, cou)

getParseLocation :: ParserPtr -> IO XMLParseLocation
getParseLocation pp = do
    line <- xmlGetCurrentLineNumber pp
    col <- xmlGetCurrentColumnNumber pp
    index <- xmlGetCurrentByteIndex pp
    count <- xmlGetCurrentByteCount pp
    return $ XMLParseLocation {
            xmlLineNumber = fromIntegral line,
            xmlColumnNumber = fromIntegral col,
            xmlByteIndex = fromIntegral index,
            xmlByteCount = fromIntegral count
        }

-- Note on word sizes:
--
-- on expat 2.0:
-- XML_GetCurrentLineNumber returns XML_Size
-- XML_GetCurrentColumnNumber returns XML_Size
-- XML_GetCurrentByteIndex returns XML_Index
-- These are defined in expat_external.h
--
-- debian-i386 says XML_Size and XML_Index are 4 bytes.
-- ubuntu-amd64 says XML_Size and XML_Index are 8 bytes.
-- These two systems do NOT define XML_LARGE_SIZE, which would force these types
-- to be 64-bit.
--
-- If we guess the word size too small, it shouldn't matter: We will just discard
-- the most significant part.  If we get the word size too large, we will get
-- garbage (very bad).
--
-- So - what I will do is use CLong and CULong, which correspond to what expat
-- is using when XML_LARGE_SIZE is disabled, and give the correct sizes on the
-- two machines mentioned above.  At the absolute worst the word size will be too
-- short.

foreign import ccall unsafe "expat.h XML_GetErrorCode" xmlGetErrorCode
    :: ParserPtr -> IO CInt
foreign import ccall unsafe "expat.h XML_GetCurrentLineNumber" xmlGetCurrentLineNumber
    :: ParserPtr -> IO CULong
foreign import ccall unsafe "expat.h XML_GetCurrentColumnNumber" xmlGetCurrentColumnNumber
    :: ParserPtr -> IO CULong
foreign import ccall unsafe "expat.h XML_GetCurrentByteIndex" xmlGetCurrentByteIndex
    :: ParserPtr -> IO CLong
foreign import ccall unsafe "expat.h XML_GetCurrentByteCount" xmlGetCurrentByteCount
    :: ParserPtr -> IO CInt
foreign import ccall unsafe "expat.h XML_ErrorString" xmlErrorString
    :: CInt -> IO CString

type HParser = B.ByteString -> Bool -> IO (ForeignPtr Word8, CInt, Maybe XMLParseError)

foreign import ccall unsafe "hexpatNewParser"
  _hexpatNewParser :: Ptr CChar -> CInt -> IO MyParserPtr

foreign import ccall unsafe "hexpatGetParser"
  _hexpatGetParser :: MyParserPtr -> ParserPtr

data MyParser_struct
type MyParserPtr = Ptr MyParser_struct

foreign import ccall "&hexpatFreeParser" hexpatFreeParser :: FunPtr (MyParserPtr -> IO ())

hexpatNewParser :: Maybe Encoding
                -> Maybe (B.ByteString -> Maybe B.ByteString)  -- ^ Entity decoder
                -> Bool        -- ^ Whether to include input locations
                -> IO (HParser, IO XMLParseLocation)
hexpatNewParser enc mDecoder locations =
    withOptEncoding enc $ \cEnc -> do
        parser <- newForeignPtr hexpatFreeParser =<< _hexpatNewParser cEnc (cFromBool locations)
        return (parse parser, withForeignPtr parser $ \mp -> getParseLocation $ _hexpatGetParser mp)
  where
    parse parser = case mDecoder of
        Nothing -> \text final ->
            alloca $ \ppData ->
            alloca $ \pLen ->
            withBStringLen text $ \(textBuf, textLen) ->
            withForeignPtr parser $ \pp -> do
                ok <- unStatus <$> _hexpatParseUnsafe pp textBuf textLen (cFromBool final) ppData pLen
                pData <- peek ppData
                len <- peek pLen
                err <- if ok
                    then return Nothing
                    else Just <$> getError (_hexpatGetParser pp)
                fpData <- newForeignPtr funPtrFree pData
                return (fpData, len, err)
        Just decoder -> \text final ->
            alloca $ \ppData ->
            alloca $ \pLen ->
            withBStringLen text $ \(textBuf, textLen) ->
            withForeignPtr parser $ \pp -> do
                eh <- mkCEntityHandler . wrapCEntityHandler $ decoder
                _hexpatSetEntityHandler pp eh
                ok <- unStatus <$> _hexpatParseSafe pp textBuf textLen (cFromBool final) ppData pLen
                freeHaskellFunPtr eh
                pData <- peek ppData
                len <- peek pLen
                err <- if ok
                    then return Nothing
                    else Just <$> getError (_hexpatGetParser pp)
                fpData <- newForeignPtr funPtrFree pData
                return (fpData, len, err)

foreign import ccall unsafe "hexpatParse"
  _hexpatParseUnsafe :: MyParserPtr -> Ptr CChar -> CInt -> CInt -> Ptr (Ptr Word8) -> Ptr CInt -> IO CInt

foreign import ccall safe "hexpatParse"
  _hexpatParseSafe :: MyParserPtr -> Ptr CChar -> CInt -> CInt -> Ptr (Ptr Word8) -> Ptr CInt -> IO CInt

type CEntityHandler = Ptr CChar -> IO (Ptr CChar)

foreign import ccall safe "wrapper"
  mkCEntityHandler :: CEntityHandler
                   -> IO (FunPtr CEntityHandler)

peekByteStringLen :: CStringLen -> IO B.ByteString
{-# INLINE peekByteStringLen #-}
peekByteStringLen (cstr, len) =
    I.create (fromIntegral len) $ \ptr ->
        I.memcpy ptr (castPtr cstr) (fromIntegral len)

wrapCEntityHandler :: (B.ByteString -> Maybe B.ByteString) -> CEntityHandler
wrapCEntityHandler handler = h
  where
    h cname = do
        sz <- fromIntegral <$> I.c_strlen cname
        name <- peekByteStringLen (cname, sz)
        case handler name of
            Just text -> do
                let (fp, offset, len) = I.toForeignPtr text
                withForeignPtr fp $ \ctextBS -> do
                    ctext <- mallocBytes (len + 1) :: IO CString
                    I.memcpy (castPtr ctext) (ctextBS `plusPtr` offset) (fromIntegral len)
                    poke (ctext `plusPtr` len) (0 :: CChar)
                    return ctext
            Nothing -> return nullPtr

foreign import ccall unsafe "hexpatSetEntityHandler"
  _hexpatSetEntityHandler :: MyParserPtr -> FunPtr CEntityHandler -> IO ()

foreign import ccall "&free" funPtrFree :: FunPtr (Ptr Word8 -> IO ())