1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269
|
{--
This is a script to generate the necessary tables to support Windows code page
encoding/decoding.
License: see libraries/base/LICENSE
The code page tables are available from :
http://www.unicode.org/Public/MAPPINGS/
To run this script, use e.g.
runghc MakeTable.hs <module-name> <output-file> <codepage-dir>/*.TXT
Currently, this script only supports single-byte encodings, since the lookup
tables required for the CJK double-byte codepages are too large to be
statically linked into every executable. We plan to add support for them once
GHC is able to produce Windows DLLs.
--}
module Main where
import System.FilePath
import qualified Data.Map as Map
import System.IO
import Data.Maybe (mapMaybe)
import Data.List (intersperse)
import Data.Word
import Numeric
import Control.Monad.State
import System.Environment
import Control.Exception(evaluate)
main :: IO ()
main = do
moduleName:outFile:files <- getArgs
let badFiles = -- These fail with an error like
-- MakeTable: Enum.toEnum{Word8}: tag (33088) is outside of bounds (0,255)
-- I have no idea what's going on, so for now we just
-- skip them.
["CPs/CP932.TXT",
"CPs/CP936.TXT",
"CPs/CP949.TXT",
"CPs/CP950.TXT"]
let files' = filter (`notElem` badFiles) files
sbes <- mapM readMapAndIx files'
putStrLn "Writing output"
withBinaryFile outFile WriteMode $ flip hPutStr
$ unlines $ makeTableFile moduleName files' sbes
where
readMapAndIx f = do
putStrLn ("Reading " ++ f)
m <- readMap f
return (codePageNum f, m)
-- filenames are assumed to be of the form "CP1250.TXT"
codePageNum :: FilePath -> Int
codePageNum = read . drop 2 . takeBaseName
readMap :: (Ord a, Enum a) => FilePath -> IO (Map.Map a Char)
readMap f = withBinaryFile f ReadMode $ \h -> do
contents <- hGetContents h
let ms = Map.fromList $ mapMaybe parseLine $ lines contents
evaluate $ Map.size ms
return ms
parseLine :: Enum a => String -> Maybe (a,Char)
parseLine s = case words s of
('#':_):_ -> Nothing
bs:"#DBCS":_ -> Just (readHex' bs, toEnum 0xDC00)
bs:"#UNDEFINED":_ -> Just (readHex' bs, toEnum 0)
bs:cs:('#':_):_ -> Just (readHex' bs, readCharHex cs)
_ -> Nothing
readHex' :: Enum a => String -> a
readHex' ('0':'x':s) = case readHex s of
[(n,"")] -> toEnum n -- explicitly call toEnum to catch overflow errors.
_ -> errorWithoutStackTrace $ "Can't read hex: " ++ show s
readHex' s = errorWithoutStackTrace $ "Can't read hex: " ++ show s
readCharHex :: String -> Char
readCharHex s = if c > fromEnum (maxBound :: Word16)
then errorWithoutStackTrace "Can't handle non-BMP character."
else toEnum c
where c = readHex' s
-------------------------------------------
-- Writing out the main data values.
makeTableFile :: String -> [FilePath] -> [(Int,Map.Map Word8 Char)] -> [String]
makeTableFile moduleName files maps = concat
[ languageDirectives, firstComment files, header,
theImports, theTypes, blockSizeText, tablePart]
where
header = [ "module " ++ moduleName ++ " where"
, ""
]
tablePart = [ "codePageMap :: [(Word32, CodePageArrays)]"
, "codePageMap = ["
] ++ (intersperse "\n ," $ map mkTableEntry maps)
++ [" ]"]
mkTableEntry (i,m) = " (" ++ show i ++ ", " ++ makeSBE m ++ " )"
blockSizeText = ["blockBitSize :: Int", "blockBitSize = " ++ show blockBitSize]
makeSBE :: Map.Map Word8 Char -> String
makeSBE m = unlines
[ "SingleByteCP {"
, " decoderArray = " ++ mkConvArray es
, " , encoderArray = " ++ mkCompactArray (swapMap m)
, " }"
]
where
es = [Map.findWithDefault '\0' x m | x <- [minBound..maxBound]]
swapMap :: (Ord a, Ord b, Enum a, Enum b) => Map.Map a b -> Map.Map b a
swapMap = Map.insert (toEnum 0) (toEnum 0) . Map.fromList . map swap . Map.toList
where
swap (x,y) = (y,x)
mkConvArray :: Embed a => [a] -> String
mkConvArray xs = "ConvArray \"" ++ concatMap mkHex xs ++ "\"#"
-------------------------------------------
-- Compact arrays
--
-- The decoding map (from Word8 to Char) can be implemented with a simple array
-- of 256 Word16's. Bytes which do not belong to the code page are mapped to
-- '\0'.
--
-- However, a naive table mapping Char to Word8 would require 2^16 Word8's. We
-- can use much less space with the right data structure, since at most 256 of
-- those entries are nonzero.
--
-- We use "compact arrays", as described in "Unicode Demystified" by Richard
-- Gillam.
--
-- Fix a block size S which is a power of two. We compress an array of N
-- entries (where N>>S) as follows. First, split the array into blocks of size
-- S, then remove all repeated blocks to form the "value" array. Then construct
-- a separate "index" array which maps the position of blocks in the old array
-- to a position in the value array.
--
-- For example, assume that S=32 we have six blocks ABABCA, each with 32
-- elements.
--
-- Then the compressed table consists of two arrays:
-- 1) An array "values", concatenating the unique blocks ABC
-- 2) An array "indices" which equals [0,1,0,1,2,0].
--
-- To look up '\100', first calculate divMod 100 32 = (3,4). Since
-- indices[3]=1, we look at the second unique block B; thus the encoded byte is
-- B[4].
--
-- The upshot of this representation is that the lookup is very quick as it only
-- requires two array accesses plus some bit masking/shifting.
-- From testing, this is an optimal size.
blockBitSize :: Int
blockBitSize = 6
mkCompactArray :: (Embed a, Embed b) => Map.Map a b -> String
mkCompactArray m = unlines [
""
, " CompactArray {"
, " encoderIndices = " ++ mkConvArray is'
, " , encoderValues = "
++ mkConvArray (concat $ Map.elems vs)
, " , encoderMax = " ++ show (fst $ Map.findMax m)
, " }"
]
where
blockSize = 2 ^ blockBitSize
(is,(vs,_)) = compress blockSize $ m
is' = map (* blockSize) is
type CompressState b = (Map.Map Int [b], Map.Map [b] Int)
-- each entry in the list corresponds to a block of size n.
compress :: (Bounded a, Enum a, Ord a, Bounded b, Ord b) => Int -> Map.Map a b
-> ([Int], CompressState b)
compress n ms = runState (mapM lookupOrAdd chunks) (Map.empty, Map.empty)
where
chunks = mkChunks $ map (\i -> Map.findWithDefault minBound i ms)
$ [minBound..fst (Map.findMax ms)]
mkChunks [] = []
mkChunks xs = take n xs : mkChunks (drop n xs)
lookupOrAdd xs = do
(m,rm) <- get
case Map.lookup xs rm of
Just i -> return i
Nothing -> do
let i = if Map.null m
then 0
else 1 + fst (Map.findMax m)
put (Map.insert i xs m, Map.insert xs i rm)
return i
-------------------------------------------
-- Static parts of the generated module.
languageDirectives :: [String]
languageDirectives = ["{-# LANGUAGE MagicHash, NoImplicitPrelude #-}"]
firstComment :: [FilePath] -> [String]
firstComment files = map ("-- " ++) $
[ "Do not edit this file directly!"
, "It was generated by the MakeTable.hs script using the files below."
, "To regenerate it, run \"make\" in ../../../../codepages/"
, ""
, "Files:"
] ++ map takeFileName files
theImports :: [String]
theImports = map ("import " ++ )
["GHC.Prim", "GHC.Base", "GHC.Word"]
theTypes :: [String]
theTypes = [ "data ConvArray a = ConvArray Addr#"
, "data CompactArray a b = CompactArray {"
, " encoderMax :: !a,"
, " encoderIndices :: !(ConvArray Int),"
, " encoderValues :: !(ConvArray b)"
, " }"
, ""
, "data CodePageArrays = SingleByteCP {"
, " decoderArray :: !(ConvArray Char),"
, " encoderArray :: !(CompactArray Char Word8)"
, " }"
, ""
]
-------------------------------------------
-- Embed class and associated functions
class (Ord a, Enum a, Bounded a, Show a) => Embed a where
mkHex :: a -> String
-- | @since 4.2.0.0
instance Embed Word8 where
mkHex = showHex'
-- | @since 4.2.0.0
instance Embed Word16 where
mkHex = repDualByte
-- | @since 4.2.0.0
instance Embed Char where
mkHex = repDualByte
-- this is used for the indices of the compressed array.
-- | @since 4.2.0.0
instance Embed Int where
mkHex = repDualByte
showHex' :: Integral a => a -> String
showHex' s = "\\x" ++ showHex s ""
repDualByte :: Enum c => c -> String
repDualByte c
| n >= 2^(16::Int) = errorWithoutStackTrace "value is too high!"
-- NOTE : this assumes little-endian architecture. But we're only using this on Windows,
-- so it's probably OK.
| otherwise = showHex' (n `mod` 256) ++ showHex' (n `div` 256)
where
n = fromEnum c
|