File: SpeedBigFiles.hs

package info (click to toggle)
haskell-xeno 0.6-4
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 212 kB
  • sloc: haskell: 1,324; xml: 120; makefile: 7
file content (122 lines) | stat: -rw-r--r-- 4,258 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
{-# LANGUAGE BangPatterns       #-}
{-# LANGUAGE CPP                #-}
{-# LANGUAGE DeriveGeneric      #-}
{-# LANGUAGE StandaloneDeriving #-}
{-# OPTIONS_GHC -fno-warn-orphans -Wno-unused-imports #-}

-- | Benchmark speed with big files

module Main where


import           Codec.Compression.BZip
import           Control.DeepSeq
import           Criterion
import           Criterion.Main
import           Data.ByteString (ByteString)
import qualified Data.ByteString.Lazy as L
import           Data.List (delete)
import           GHC.Generics
import           System.FilePath.Posix
import qualified Text.XML.Expat.SAX as Hexpat
import qualified Text.XML.Expat.Tree as HexpatTree
import qualified Text.XML.Hexml as Hexml
import           Text.XML.Light as XML
import           Text.XML.Light.Input as XML
import qualified Xeno.Types
import qualified Xeno.SAX
import qualified Xeno.DOM
import qualified Xeno.DOM.Robust
import qualified Data.ByteString as S
#ifdef LIBXML2
import qualified Text.XML.LibXML.Parser as Libxml2
#endif


-- | Entry point: run one benchmark group per corpus file.
--
-- Each table row gives the set of enabled parsers, the group label shown in
-- the report, and the name of the bzip2 archive to load.
main :: IO ()
main = defaultMain $ map runCorpus corpora
  where
    -- Unpack one table row into the corresponding benchmark group.
    runCorpus (tests, label, archive) = benchFile tests label archive

    corpora =
        [ (allTests,       "46MB",  "enwiki-20190901-pages-articles14.xml-p7697599p7744799.bz2")
        , (allTests,       "624MB", "enwiki-20190901-pages-articles-multistream1.xml-p10p30302.bz2")
        , (allTests,       "921MB", "1HTQ.xml.bz2")
        , (allTests,       "1.6Gb", "enwiki-20190901-pages-meta-current6.xml-p565314p892912.bz2")
        -- The largest enabled input is run with the hexml-dom benchmark excluded.
        , (allExceptHexml, "4Gb",   "enwiki-20190901-pages-meta-current24.xml-p30503451p32003451.bz2")
        -- , (allExceptHexml, "21Gb",  "enwiki-20190901-pages-meta-history2.xml-p31255p31720.bz2")
        ]


-- | Names of the benchmarks that are run by default, in report order.
--
-- Disabled entries (kept here for reference, not part of the list):
--
--   * "xeno-sax-ex", "xeno-sax-ex-z"
--   * "hexpat-sax", "hexpat-dom", "xml-dom", "libxml2-dom"
--
-- XXX: the "hexpat" and "xml-dom" libraries do not work with big files;
-- they require too much memory.
allTests :: [String]
allTests = hexmlTests ++ xenoSaxTests ++ xenoDomTests
  where
    hexmlTests   = ["hexml-dom"]
    xenoSaxTests = ["xeno-sax", "xeno-sax-z"]
    xenoDomTests = ["xeno-dom", "xeno-dom-with-recovery"]


-- | The default benchmark names with the first "hexml-dom" entry removed.
allExceptHexml :: [String]
allExceptHexml = delete "hexml-dom" allTests


-- | Build the benchmark group for a single compressed input file.
--
-- The file is loaded through @env@, and the resulting pair of inputs is
-- handed to 'benchMethods'; @size@ becomes the name of the group.
benchFile :: [String] -> String -> FilePath -> Benchmark
benchFile enabledTests size fn = env loadInputs mkGroup
  where
    loadInputs = readBZip2File fn

    -- The tuple pattern is deliberately lazy (irrefutable): the criterion
    -- documentation for env requires that the environment is not forced
    -- while the benchmark structure is being built.
    mkGroup ~(plain, zeroTerminated) =
        bgroup size (benchMethods enabledTests plain zeroTerminated)


-- | Assemble the benchmarks for one input, keeping only those whose name
-- appears in @enabledTests@.
--
-- @input@ is the document handed to every parser except the @-z@ variants,
-- which receive @inputz@ instead. Every benchmark evaluates its result with
-- @whnf@, except @xml-dom@ (which uses @nf@) and @libxml2-dom@ (which uses
-- @whnfIO@ and is only compiled when @LIBXML2@ is defined).
--
-- The resulting list preserves the order in which the parsers are listed
-- below, regardless of the order of @enabledTests@.
benchMethods :: [String] -> ByteString -> Xeno.Types.ByteStringZeroTerminated -> [Benchmark]
benchMethods enabledTests input inputz =
       runBench "hexml-dom" (whnf Hexml.parse input)
    ++ runBench "xeno-sax"      (whnf Xeno.SAX.validate input)
    ++ runBench "xeno-sax-z"    (whnf Xeno.SAX.validate inputz)
    -- The label has to match the requested name exactly: with trailing
    -- padding inside the string this benchmark could never be selected.
    ++ runBench "xeno-sax-ex"   (whnf Xeno.SAX.validateEx input)
    ++ runBench "xeno-sax-ex-z" (whnf Xeno.SAX.validateEx inputz)
    ++ runBench "xeno-dom" (whnf Xeno.DOM.parse input)
    ++ runBench "xeno-dom-with-recovery" (whnf Xeno.DOM.Robust.parse input)
    -- The type annotations below pin the tag and text types of the hexpat
    -- parsers to strict ByteString.
    ++ runBench
        "hexpat-sax"
        (whnf
            ((Hexpat.parseThrowing Hexpat.defaultParseOptions :: L.ByteString -> [Hexpat.SAXEvent ByteString ByteString]) .
             L.fromStrict)
            input)
    ++ runBench
        "hexpat-dom"
        (whnf
            ((HexpatTree.parse' HexpatTree.defaultParseOptions :: ByteString -> Either HexpatTree.XMLParseError (HexpatTree.Node ByteString ByteString)))
            input)
    ++ runBench "xml-dom" (nf XML.parseXMLDoc input)
#ifdef LIBXML2
    ++ runBench "libxml2-dom" (whnfIO (Libxml2.parseMemory input))
#endif
  where
    -- Yield a singleton benchmark when the name is enabled, nothing otherwise.
    runBench name act
        | name `elem` enabledTests = [bench name act]
        | otherwise                = []


-- | Load and decompress a bzip2 archive from the @data/ex@ directory.
--
-- Returns the decompressed contents twice: once as a plain strict
-- 'ByteString' and once wrapped in @BSZT@ with a single zero byte appended.
-- Both values are forced with bang patterns before the action returns.
readBZip2File :: FilePath -> IO (ByteString, Xeno.Types.ByteStringZeroTerminated)
readBZip2File fn = do
    compressed <- L.readFile archivePath
    let !plain      = L.toStrict (decompress compressed)
        !terminated = Xeno.Types.BSZT (S.snoc plain 0)
    pure (plain, terminated)
  where
    archivePath = "data" </> "ex" </> fn


-- Orphan instances for the Text.XML.Light data types.
--
-- The "xml-dom" benchmark evaluates its result with nf, which needs an
-- NFData instance for the parsed document. Each type gets a standalone
-- derived Generic instance, and the NFData instance uses the generic default.

deriving instance Generic Content
instance NFData Content

deriving instance Generic Element
instance NFData Element

deriving instance Generic CData
instance NFData CData

deriving instance Generic CDataKind
instance NFData CDataKind

deriving instance Generic QName
instance NFData QName

deriving instance Generic Attr
instance NFData Attr