File: Collate.hsc

package info (click to toggle)
haskell-text-icu 0.6.3.7-2
  • links: PTS, VCS
  • area: main
  • in suites: jessie, jessie-kfreebsd
  • size: 284 kB
  • ctags: 82
  • sloc: haskell: 794; ansic: 535; makefile: 3
file content (328 lines) | stat: -rw-r--r-- 13,448 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
{-# LANGUAGE CPP, DeriveDataTypeable, ForeignFunctionInterface #-}
-- |
-- Module      : Data.Text.ICU.Collate
-- Copyright   : (c) 2010 Bryan O'Sullivan
--
-- License     : BSD-style
-- Maintainer  : bos@serpentine.com
-- Stability   : experimental
-- Portability : GHC
--
-- String collation functions for Unicode, implemented as bindings to
-- the International Components for Unicode (ICU) libraries.

module Data.Text.ICU.Collate
    (
    -- * Unicode collation API
    -- $api
    -- * Types
      MCollator
    , Attribute(..)
    , AlternateHandling(..)
    , CaseFirst(..)
    , Strength(..)
    -- * Functions
    , open
    , collate
    , collateIter
    -- ** Utility functions
    , equals
    , getAttribute
    , setAttribute
    , sortKey
    , clone
    , freeze
    ) where

#include <unicode/ucol.h>

import Data.ByteString (empty)
import Data.ByteString.Internal (ByteString(..), create, mallocByteString,
                                 memcpy)
import Data.Int (Int32)
import Data.Text (Text)
import qualified Data.Text as T
import Data.Text.Foreign (useAsPtr)
import Data.Text.ICU.Collate.Internal (Collator(..), MCollator, UCollator,
                                       equals, withCollator, wrap)
import Data.Text.ICU.Error.Internal (UErrorCode, handleError)
import Data.Text.ICU.Internal
    (LocaleName, UChar, CharIterator, UCharIterator,
     asOrdering, withCharIterator, withLocaleName)
import Data.Typeable (Typeable)
import Data.Word (Word8)
import Foreign.C.String (CString)
import Foreign.C.Types (CInt(..))
import Foreign.ForeignPtr (withForeignPtr)
import Foreign.Marshal.Utils (with)
import Foreign.Ptr (Ptr, nullPtr)

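-- $api
--
-- The functions in this module operate on a mutable 'MCollator' in the
-- 'IO' monad.  Use 'freeze' to obtain an immutable copy that can be used
-- from pure code.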

-- | Control the handling of variable weight elements.
--
-- This is the payload of the 'AlternateHandling' attribute.
data AlternateHandling = NonIgnorable
                       -- ^ Treat all codepoints with non-ignorable primary
                       -- weights in the same way.  Passed to ICU as
                       -- @UCOL_NON_IGNORABLE@.
                       | Shifted
                         -- ^ Cause codepoints with primary weights that are
                         -- equal to or below the variable top value to be
                         -- ignored on primary level and moved to the
                         -- quaternary level.  Passed to ICU as
                         -- @UCOL_SHIFTED@.
                         deriving (Eq, Bounded, Enum, Show, Typeable)

-- | Control the ordering of upper and lower case letters.
--
-- This is the payload of the 'CaseFirst' attribute, where it is wrapped in
-- a 'Maybe'; 'Nothing' there is passed to ICU as @UCOL_OFF@.
data CaseFirst = UpperFirst     -- ^ Force upper case letters to sort before
                                -- lower case.  Passed to ICU as
                                -- @UCOL_UPPER_FIRST@.
               | LowerFirst     -- ^ Force lower case letters to sort before
                                -- upper case.  Passed to ICU as
                                -- @UCOL_LOWER_FIRST@.
                deriving (Eq, Bounded, Enum, Show, Typeable)

-- | The strength attribute. The usual strength for most locales (except
-- Japanese) is tertiary. Quaternary strength is useful when combined with
-- the shifted setting for the alternate handling attribute, and for
-- JIS X 4061 collation, where it is used to distinguish between Katakana
-- and Hiragana (this is achieved by setting 'HiraganaQuaternaryMode' to
-- 'True'). Otherwise, the quaternary level is affected only by the number
-- of non-ignorable code points in the string. Identical strength is rarely
-- useful, as it amounts to comparing the codepoints of the @NFD@ form of
-- the string.
data Strength = Primary
              -- ^ Passed to ICU as @UCOL_PRIMARY@.
              | Secondary
              -- ^ Passed to ICU as @UCOL_SECONDARY@.
              | Tertiary
              -- ^ Passed to ICU as @UCOL_TERTIARY@.
              | Quaternary
              -- ^ Passed to ICU as @UCOL_QUATERNARY@.
              | Identical
              -- ^ Passed to ICU as @UCOL_IDENTICAL@.
                deriving (Eq, Bounded, Enum, Show, Typeable)

-- | A collation attribute together with its value, as accepted by
-- 'setAttribute' and returned by 'getAttribute'.
data Attribute = French Bool
               -- ^ Direction of secondary weights, used in French.  'True'
               -- results in secondary weights being considered backwards,
               -- while 'False' treats secondary weights in the order in
               -- which they appear.
               | AlternateHandling AlternateHandling
                 -- ^ For handling variable elements.  'NonIgnorable' is
                 -- the default.
               | CaseFirst (Maybe CaseFirst)
               -- ^ Control the ordering of upper and lower case letters.
               -- 'Nothing' (the default) orders upper and lower case
               -- letters in accordance with their tertiary weights.
               | CaseLevel Bool
                 -- ^ Controls whether an extra case level (positioned
                 -- before the third level) is generated or not.  When
                 -- 'False' (default), case level is not generated; when
                 -- 'True', the case level is generated. Contents of the
                 -- case level are affected by the value of the 'CaseFirst'
                 -- attribute. A simple way to ignore accent differences in
                 -- a string is to set the strength to 'Primary' and enable
                 -- case level.
               | NormalizationMode Bool
               -- ^ Controls whether the normalization check and necessary
               -- normalizations are performed. When 'False' (default) no
               -- normalization check is performed. The correctness of the
               -- result is guaranteed only if the input data is in
               -- so-called @FCD@ form (see the ICU user manual for more
               -- info).  When 'True', an incremental check is performed to
               -- see whether the input data is in @FCD@ form. If the data
               -- is not in @FCD@ form, incremental @NFD@ normalization is
               -- performed.
               | Strength Strength
               -- ^ The comparison strength; see the 'Strength' type for a
               -- description of the available levels.
               | HiraganaQuaternaryMode Bool
                 -- ^ When turned on, this attribute positions Hiragana
                 -- before all non-ignorables on quaternary level. This is a
                 -- sneaky way to produce JIS sort order.
               | Numeric Bool
                 -- ^ When enabled, this attribute generates a collation key
                 -- for the numeric value of substrings of digits.  This is
                 -- a way to get @100@ to sort /after/ @2@.
                 deriving (Eq, Show, Typeable)

-- | Raw attribute key as exchanged with the C side: one of the attribute
-- @UCOL_*@ constants from @unicode/ucol.h@ (see 'toUAttribute').
type UColAttribute = CInt
-- | Raw attribute value as exchanged with the C side: one of the value
-- @UCOL_*@ constants from @unicode/ucol.h@ (see 'toOO', 'toAH', 'toCF'
-- and 'toS').
type UColAttributeValue = CInt

-- | Split an 'Attribute' into the raw ICU attribute key and the raw ICU
-- value encoding its payload.
--
-- The payload is only demanded by the second component of the pair, so the
-- key can be extracted even when the payload is 'undefined'.
toUAttribute :: Attribute -> (UColAttribute, UColAttributeValue)
toUAttribute attr =
  case attr of
    French flag                 -> ((#const UCOL_FRENCH_COLLATION), toOO flag)
    AlternateHandling mode      -> ((#const UCOL_ALTERNATE_HANDLING), toAH mode)
    CaseFirst order             -> ((#const UCOL_CASE_FIRST), toCF order)
    CaseLevel flag              -> ((#const UCOL_CASE_LEVEL), toOO flag)
    NormalizationMode flag      -> ((#const UCOL_NORMALIZATION_MODE), toOO flag)
    Strength level              -> ((#const UCOL_STRENGTH), toS level)
    HiraganaQuaternaryMode flag -> ((#const UCOL_HIRAGANA_QUATERNARY_MODE), toOO flag)
    Numeric flag                -> ((#const UCOL_NUMERIC_COLLATION), toOO flag)

-- | Encode an on/off flag as the raw ICU value @UCOL_ON@ or @UCOL_OFF@.
toOO :: Bool -> UColAttributeValue
toOO enabled
    | enabled   = #const UCOL_ON
    | otherwise = #const UCOL_OFF

-- | Encode an 'AlternateHandling' mode as its raw ICU value.
toAH :: AlternateHandling -> UColAttributeValue
toAH mode =
  case mode of
    NonIgnorable -> #const UCOL_NON_IGNORABLE
    Shifted      -> #const UCOL_SHIFTED

-- | Encode an optional 'CaseFirst' ordering as its raw ICU value; 'Nothing'
-- becomes @UCOL_OFF@.
toCF :: Maybe CaseFirst -> UColAttributeValue
toCF order =
  case order of
    Just UpperFirst -> #const UCOL_UPPER_FIRST
    Just LowerFirst -> #const UCOL_LOWER_FIRST
    Nothing         -> #const UCOL_OFF

-- | Encode a 'Strength' level as its raw ICU value.
toS :: Strength -> UColAttributeValue
toS level =
  case level of
    Primary    -> #const UCOL_PRIMARY
    Secondary  -> #const UCOL_SECONDARY
    Tertiary   -> #const UCOL_TERTIARY
    Quaternary -> #const UCOL_QUATERNARY
    Identical  -> #const UCOL_IDENTICAL

-- | Decode a raw ICU on/off value.  Any value other than @UCOL_OFF@ or
-- @UCOL_ON@ is reported through 'valueError'.
fromOO :: UColAttributeValue -> Bool
fromOO raw =
  case raw of
    (#const UCOL_OFF) -> False
    (#const UCOL_ON)  -> True
    _                 -> valueError "fromOO" raw

-- | Decode a raw ICU alternate-handling value.  Unrecognised values are
-- reported through 'valueError'.
fromAH :: UColAttributeValue -> AlternateHandling
fromAH raw =
  case raw of
    (#const UCOL_NON_IGNORABLE) -> NonIgnorable
    (#const UCOL_SHIFTED)       -> Shifted
    _                           -> valueError "fromAH" raw

-- | Decode a raw ICU case-first value; @UCOL_OFF@ decodes to 'Nothing'.
-- Unrecognised values are reported through 'valueError'.
fromCF :: UColAttributeValue -> Maybe CaseFirst
fromCF raw =
  case raw of
    (#const UCOL_OFF)         -> Nothing
    (#const UCOL_UPPER_FIRST) -> Just UpperFirst
    (#const UCOL_LOWER_FIRST) -> Just LowerFirst
    _                         -> valueError "fromCF" raw

-- | Decode a raw ICU strength value.  Unrecognised values are reported
-- through 'valueError'.
fromS :: UColAttributeValue -> Strength
fromS raw =
  case raw of
    (#const UCOL_PRIMARY)    -> Primary
    (#const UCOL_SECONDARY)  -> Secondary
    (#const UCOL_TERTIARY)   -> Tertiary
    (#const UCOL_QUATERNARY) -> Quaternary
    (#const UCOL_IDENTICAL)  -> Identical
    _                        -> valueError "fromS" raw

-- | Rebuild an 'Attribute' from a raw ICU attribute key and the raw value
-- read for it.  The key selects the constructor and the decoder applied to
-- the value; an unrecognised key is reported through 'valueError'.
fromUAttribute :: UColAttribute -> UColAttributeValue -> Attribute
fromUAttribute key val = decode key
  where
    -- Every on/off attribute shares the same payload decoder.
    onOff = fromOO val

    decode (#const UCOL_FRENCH_COLLATION)         = French onOff
    decode (#const UCOL_ALTERNATE_HANDLING)       = AlternateHandling (fromAH val)
    decode (#const UCOL_CASE_FIRST)               = CaseFirst (fromCF val)
    decode (#const UCOL_CASE_LEVEL)               = CaseLevel onOff
    decode (#const UCOL_NORMALIZATION_MODE)       = NormalizationMode onOff
    decode (#const UCOL_STRENGTH)                 = Strength (fromS val)
    decode (#const UCOL_HIRAGANA_QUATERNARY_MODE) = HiraganaQuaternaryMode onOff
    decode (#const UCOL_NUMERIC_COLLATION)        = Numeric onOff
    decode unknown = valueError "fromUAttribute" unknown

-- | Abort with an 'error' describing a raw value that a decoder in this
-- module did not recognise.
--
-- The first argument is the name of the reporting function and the second
-- is the offending value; the message is prefixed with this module's name
-- ("Data.Text.ICU.Collate") so that the failure can be traced to its source.
valueError :: Show a => String -> a -> z
valueError func bad = error ("Data.Text.ICU.Collate." ++ func ++
                             ": invalid value " ++ show bad)

-- | Raw comparison result returned by the C collation functions; it is
-- converted to an 'Ordering' with 'asOrdering'.
type UCollationResult = CInt

-- | Open a mutable collator for comparing strings according to the
-- collation rules of a locale.  Failures reported by ICU are surfaced
-- through 'handleError'.
open :: LocaleName
     -- ^ The locale containing the required collation rules.
     -> IO MCollator
open loc = do
  raw <- withLocaleName loc $ \name -> handleError (ucol_open name)
  wrap raw

-- | Set the value of an 'MCollator' attribute.  The attribute to change and
-- its new value are both carried by the 'Attribute' argument.
setAttribute :: MCollator -> Attribute -> IO ()
setAttribute c a =
  withCollator c $ \cptr ->
    handleError (ucol_setAttribute cptr key val)
  where
    (key, val) = toUAttribute a

-- | Get the value of an 'MCollator' attribute.
--
-- Only the constructor of the 'Attribute' argument is inspected, to select
-- which attribute to read; its payload is never evaluated.  It is
-- therefore safe to supply a dummy payload:
--
-- > getAttribute mcol (NormalizationMode undefined)
getAttribute :: MCollator -> Attribute -> IO Attribute
getAttribute c a = do
  val <- withCollator c $ \cptr ->
           handleError (ucol_getAttribute cptr key)
  return $! fromUAttribute key val
  where
    key = fst (toUAttribute a)

-- | Compare two strings using the given collator.
collate :: MCollator -> Text -> Text -> IO Ordering
collate c a b =
  withCollator c $ \cptr ->
    useAsPtr a $ \aptr alen ->
      useAsPtr b $ \bptr blen -> do
        let alen32 = fromIntegral alen
            blen32 = fromIntegral blen
        res <- handleError (ucol_strcoll cptr aptr alen32 bptr blen32)
        return (asOrdering res)

-- | Compare the text produced by two 'CharIterator's using the given
-- collator.
--
-- If either iterator was constructed from a 'ByteString', it does not need
-- to be copied or converted internally, so this function can be quite
-- cheap.
collateIter :: MCollator -> CharIterator -> CharIterator -> IO Ordering
collateIter c a b = do
  res <- withCollator c $ \cptr ->
           withCharIterator a $ \ai ->
             withCharIterator b $ \bi ->
               handleError (ucol_strcollIter cptr ai bi)
  return (asOrdering res)

-- | Create a key for sorting the 'Text' using the given 'Collator'.
-- The result of comparing two 'ByteString's that have been
-- transformed with 'sortKey' will be the same as the result of
-- 'collate' on the two untransformed 'Text's.
--
-- An empty 'Text' yields an empty 'ByteString' without calling into ICU.
sortKey :: MCollator -> Text -> IO ByteString
sortKey c t
    | T.null t = return empty
    | otherwise = do
  withCollator c $ \cptr ->
    useAsPtr t $ \tptr tlen -> do
      let len = fromIntegral tlen
          -- Try to build the key in a buffer of @n@ bytes.
          loop n = do
            fp <- mallocByteString (fromIntegral n)
            i <- withForeignPtr fp $ \p -> ucol_getSortKey cptr tptr len p n
            let j = fromIntegral i
            case () of
              _ | i == 0         -> error "Data.Text.ICU.Collate.sortKey: internal error"
                  -- The reported size exceeds the buffer: retry with a
                  -- buffer of exactly the reported size.
                | i > n          -> loop i
                  -- At most half of the buffer was used: copy the key into
                  -- a right-sized buffer rather than retaining the slack.
                | i <= n `div` 2 -> create j $ \p -> withForeignPtr fp $ \op ->
                                    memcpy p op (fromIntegral i)
                  -- Otherwise hand out the buffer itself, limited to the
                  -- first @j@ bytes.
                | otherwise      -> return $! PS fp 0 j
      -- Start with a generous guess (at least 8 bytes) so that the key
      -- usually fits on the first attempt.  Using 'min' here would cap the
      -- guess at 8 bytes and force every longer key to be computed twice.
      loop (max (len * 4) 8)

-- | Take a snapshot of a mutable 'MCollator' for use in pure code.
-- The snapshot is an independent copy made with 'clone', so subsequent
-- changes to the 'MCollator' will not affect the state of the returned
-- 'Collator'.
freeze :: MCollator -> IO Collator
freeze mc = do
  copy <- clone mc
  return (C copy)

-- | Make an independent copy of a mutable 'MCollator'.
-- Subsequent changes to the input 'MCollator' will not affect the state of
-- the returned 'MCollator'.
clone :: MCollator -> IO MCollator
clone c = wrap =<< withCollator c duplicate
  where
    -- No caller-supplied buffer is passed (null pointer); the size cell is
    -- initialised to ICU's U_COL_SAFECLONE_BUFFERSIZE constant.
    duplicate cptr =
      with (#const U_COL_SAFECLONE_BUFFERSIZE) $ \sizePtr ->
        handleError (ucol_safeClone cptr nullPtr sizePtr)

-- Raw FFI bindings.  Each one imports a @__hs_ucol_*@ symbol declared in
-- @hs_text_icu.h@ (presumably thin C wrappers around the ICU functions of
-- the same name -- confirm against the C sources).  All of them are
-- imported as @unsafe@ calls.

-- | Open a collator for the locale named by the C string.
foreign import ccall unsafe "hs_text_icu.h __hs_ucol_open" ucol_open
    :: CString -> Ptr UErrorCode -> IO (Ptr UCollator)

-- | Read the raw value of the attribute identified by the given raw key.
foreign import ccall unsafe "hs_text_icu.h __hs_ucol_getAttribute" ucol_getAttribute
    :: Ptr UCollator -> UColAttribute -> Ptr UErrorCode -> IO UColAttributeValue

-- | Set the attribute identified by the given raw key to the given raw
-- value.
foreign import ccall unsafe "hs_text_icu.h __hs_ucol_setAttribute" ucol_setAttribute
    :: Ptr UCollator -> UColAttribute -> UColAttributeValue -> Ptr UErrorCode -> IO ()

-- | Compare two UTF-16 buffers, each given as a pointer and a length.
foreign import ccall unsafe "hs_text_icu.h __hs_ucol_strcoll" ucol_strcoll
    :: Ptr UCollator -> Ptr UChar -> Int32 -> Ptr UChar -> Int32
    -> Ptr UErrorCode -> IO UCollationResult

-- | Write the sort key of a UTF-16 buffer (pointer and length) into a byte
-- buffer (pointer and capacity), returning a size that 'sortKey' compares
-- against the capacity.  Unlike the other bindings it takes no error-code
-- pointer.
foreign import ccall unsafe "hs_text_icu.h __hs_ucol_getSortKey" ucol_getSortKey
    :: Ptr UCollator -> Ptr UChar -> Int32 -> Ptr Word8 -> Int32
    -> IO Int32

-- | Compare the text produced by two character iterators.
foreign import ccall unsafe "hs_text_icu.h __hs_ucol_strcollIter" ucol_strcollIter
    :: Ptr UCollator -> Ptr UCharIterator -> Ptr UCharIterator -> Ptr UErrorCode
    -> IO UCollationResult

-- | Clone a collator.  The second argument is an optional caller-supplied
-- buffer ('clone' passes a null pointer) and the third points to its size.
foreign import ccall unsafe "hs_text_icu.h __hs_ucol_safeClone" ucol_safeClone
        :: Ptr UCollator -> Ptr a -> Ptr Int32 -> Ptr UErrorCode
        -> IO (Ptr UCollator)