File: NormalizeFile.hs

package info (click to toggle)
haskell-unicode-transforms 0.4.0.1-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 3,292 kB
  • sloc: haskell: 786; sh: 15; makefile: 7
file content (31 lines) | stat: -rw-r--r-- 844 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
{-# LANGUAGE TemplateHaskell #-}

-- |
-- Copyright   : (c) 2016 Harendra Kumar
--
-- License     : BSD-3-Clause
-- Maintainer  : harendra.kumar@gmail.com
-- Stability   : experimental
-- Portability : GHC
--

import           Control.DeepSeq        (deepseq)
import           System.Environment     (getArgs)

import qualified Data.Text.Normalize as UT
import Data.Text (pack, Text)

-- | Number of characters every input dataset is resized to before it is
-- measured. Shorter inputs are repeated and longer ones are cut off, so
-- that all datasets are compared at the same size and small datasets do
-- not produce noisy measurements.
dataSetSize :: Int
dataSetSize = 1000 * 1000

-- | Read @file@ and resize its contents to exactly 'dataSetSize' characters:
-- the contents are repeated end-to-end as often as needed and then cut off
-- at 'dataSetSize' characters before being packed into a 'Text'.
--
-- An empty file cannot be repeated to fill the dataset ('cycle' is undefined
-- on the empty list), so that case is reported up front as an 'IOError'
-- naming the offending file instead of surfacing later as an opaque
-- @Prelude.cycle: empty list@ error when the result is forced.
txtInput :: FilePath -> IO Text
txtInput file = do
    contents <- readFile file
    if null contents
        then ioError (userError ("txtInput: input file is empty: " ++ file))
        else return (pack (take dataSetSize (cycle contents)))

-- | Entry point: expects exactly one command line argument, the path of the
-- input file. The file is loaded through 'txtInput', normalized to NFD and
-- the result is fully evaluated with 'deepseq' so that the normalization
-- work is actually performed; the normalized text itself is discarded.
--
-- Any other number of arguments is rejected with a descriptive 'IOError'
-- rather than a generic pattern match failure.
main :: IO ()
main = do
    args <- getArgs
    case args of
        [file] -> do
            input <- txtInput file
            -- Force the whole normalized text; without this the lazy result
            -- would never be computed.
            UT.normalize UT.NFD input `deepseq` return ()
        _ -> ioError (userError
                "expected exactly one argument: the path of the input file")