1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45
|
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE QuasiQuotes #-}
import Control.Monad
import Data.Time.Clock
import System.IO.Posix.MMap
import System.Mem
import Xeno.DOM
import qualified Data.ByteString as BS
-- | Benchmark driver for the Xeno DOM parser.
--
-- Runs 'parse' over a fixed list of XML files under @ex-data/@ (each file is
-- visited five times), prints the wall-clock time taken for every run and
-- finally the sum of all measured times.
--
-- A file that fails to parse aborts the benchmark with an 'IOError' that
-- names the offending file and shows the parser's error value.
main :: IO ()
main = do
  let prefix = "ex-data/"
      files' = map (prefix ++)
        [ {- 921 Mb -} "1htq.xml"
        , {- 190 Mb -} "enwiki-20190901-abstract10.xml"
        , {- 1.6 Gb -} "enwiki-20190901-pages-logging1.xml"
        , {- 4.0 Gb -} "enwiki-20190901-pages-meta-current24.xml-p30503451p32003451.xml"
        -- , {- 21 Gb -} "enwiki-20190901-pages-meta-history2.xml"
        ]
      -- The whole file list is repeated five times.
      files = concat $ replicate 5 files'
  --
  deltas <- forM files $ \fn -> do
    -- 'fn' is already a String; using 'show' here would wrap the path in an
    -- extra pair of double quotes inside the single quotes.
    putStrLn $ "Processing file '" ++ fn ++ "'"
    --
    -- NOTE: The file needs to be cached in memory BEFORE the test starts.
    --       This can be done with the `vmtouch` utility, for example
    --       (`vmtouch -vtL *`).
    --
    bs <- unsafeMMapFile fn
    -- bs <- BS.readFile fn
    putStrLn $ " size: " ++ show (BS.length bs `div` (1024*1024)) ++ " Mb"
    -- Run a major GC before starting the clock so that garbage left over
    -- from the previous iteration is not collected inside the timed region.
    performGC
    start <- getCurrentTime
    -- SAX:
    -- let res = validate bs
    -- putStrLn [qc| process result: {res}|]
    -- DOM:
    -- The bang pattern forces the parsed root node to WHNF inside the timed
    -- region. A parse failure is reported explicitly rather than crashing
    -- with an uninformative pattern-match failure.
    case parse bs of
      Right !_node -> putStrLn " processed!"
      Left err ->
        ioError . userError $
          "failed to parse '" ++ fn ++ "': " ++ show err
    finish <- getCurrentTime
    let delta = finish `diffUTCTime` start
    putStrLn $ " processing time: " ++ show delta
    return delta
  --
  putStrLn "------"
  putStrLn $ "Total: " ++ show (sum deltas)
|