File: AllNonAsciiChars.hs

package info (click to toggle)
agda-stdlib 2.1-4
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 9,196 kB
  • sloc: haskell: 375; makefile: 32; sh: 28; lisp: 1
file content (42 lines) | stat: -rw-r--r-- 1,476 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
{-# LANGUAGE OverloadedStrings #-}
-- | This module extracts all the non-ASCII characters used by the
-- library code (along with how many times they are used).

module Main where

import qualified Data.List as List (sortBy, sort)
import qualified Data.List.NonEmpty as List1 (group, head)
import Data.Char (isAscii, ord)
import Data.Function (on)
import Numeric (showHex)
import System.FilePath.Find (find, always, extension, (||?), (==?))
import System.IO (openFile, hSetEncoding, utf8, IOMode(ReadMode))
import qualified Data.Text as T (Text, pack, unpack, concat)
import qualified Data.Text.IO as T (putStrLn, hGetContents)

-- | Read the whole contents of the file at the given path as 'T.Text',
-- with the handle's encoding explicitly set to UTF-8.
readUTF8File :: FilePath -> IO T.Text
readUTF8File path =
  openFile path ReadMode >>= \handle ->
    -- The encoding is set before anything is read from the handle.
    hSetEncoding handle utf8 >> T.hGetContents handle

-- | Entry point: scan every @.agda@ and @.lagda@ file below @src@, count
-- the occurrences of each non-ASCII character, and print one line per
-- distinct character (most frequent first) of the form
-- @<char> (U+<hex code point>): <count>@.
main :: IO ()
main = do
  -- Locate the Agda sources (plain and literate) under the "src" directory.
  sources <- find always
                  (extension ==? ".agda" ||? extension ==? ".lagda")
                  "src"
  contents <- mapM readUTF8File sources

  let -- Every non-ASCII character of every file, repetitions included.
      exotic :: String
      exotic = filter (not . isAscii) (T.unpack (T.concat contents))

      -- Distinct characters paired with their number of occurrences,
      -- ordered by descending count. Sorting first makes equal characters
      -- adjacent so that grouping yields one run per distinct character.
      frequencies :: [(Char, Int)]
      frequencies =
        List.sortBy (flip (compare `on` snd))
          [ (List1.head run, length run)
          | run <- List1.group (List.sort exotic)
          ]

      -- One line of output: the character itself, its code point in
      -- hexadecimal, and how many times it was seen.
      render :: (Char, Int) -> T.Text
      render (ch, n) =
        T.concat
          [ T.pack [ch]
          , " "
          , "(U+", T.pack (showHex (ord ch) ""), ")"
          , ": "
          , T.pack (show n)
          ]

  mapM_ (T.putStrLn . render) frequencies