File: UTF8.hs

package info (click to toggle)
haskell-utf8-string 1.0.2-3
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 128 kB
  • sloc: haskell: 929; makefile: 2
file content (180 lines) | stat: -rw-r--r-- 6,383 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
{-# LANGUAGE CPP #-}
#if __GLASGOW_HASKELL__ >= 701
{-# LANGUAGE Trustworthy #-}
#endif
--
-- |
-- Module      :  Data.String.UTF8
-- Copyright   :  (c) Iavor S. Diatchki 2009
-- License     :  BSD3-style (see LICENSE)
--
-- Maintainer  :  emertens@galois.com
-- Stability   :  experimental
-- Portability :  portable
--
{-# LANGUAGE MultiParamTypeClasses, UndecidableInstances #-}
module Data.String.UTF8
  ( -- * Representation
    UTF8
  , UTF8Bytes()
  , fromString
  , toString
  , fromRep
  , toRep
  , G.replacement_char

  -- * Character based operations
  , uncons
  , splitAt
  , take
  , drop
  , span
  , break
  , foldl
  , foldr
  , length
  , lines
  , lines'

  -- * Representation based operations
  , null
  , decode
  , byteSplitAt
  , byteTake
  , byteDrop
  ) where

import Prelude hiding (null,take,drop,span,break
                      ,foldl,foldr,length,lines,splitAt)
import qualified Codec.Binary.UTF8.Generic as G
import Codec.Binary.UTF8.Generic (UTF8Bytes)
import qualified Data.String as S

-- | The type of strings that are represented using the UTF8 encoding.
-- The parameter is the type of the container for the representation.
newtype UTF8 string = Str string deriving (Eq,Ord)   -- XXX: Is this OK?

instance UTF8Bytes string index => Show (UTF8 string) where
  show x = show (toString x)

instance UTF8Bytes string index => S.IsString (UTF8 string) where
  fromString = fromString

fromRep :: string -> UTF8 string
fromRep = Str

toRep :: UTF8 string -> string
toRep (Str x) = x

-- | Converts a Haskell string into a UTF8 encoded string.
-- Complexity: linear.
fromString :: UTF8Bytes string index => String -> UTF8 string
fromString xs = Str (G.fromString xs)

-- | Convert a UTF8 encoded string into a Haskell string.
-- Invalid characters are replaced by 'G.replacement_char'.
-- Complexity: linear.
toString :: UTF8Bytes string index => UTF8 string -> String
toString (Str xs) = G.toString xs

-- | Checks if there are no more bytes in the underlying representation.
null :: UTF8Bytes string index => UTF8 string -> Bool
null (Str x) = G.null x

-- | Split after a given number of characters.
-- Negative values are treated as if they are 0.
splitAt :: UTF8Bytes string index
        => index -> UTF8 string -> (UTF8 string, UTF8 string)
splitAt x (Str bs)  = case G.splitAt x bs of
                        (s1,s2) -> (Str s1, Str s2)

-- | Split after a given number of bytes in the underlying representation.
-- See also 'splitAt'.
byteSplitAt :: UTF8Bytes string index
             => index -> UTF8 string -> (UTF8 string, UTF8 string)
byteSplitAt n (Str x) = case G.bsplit n x of
                          (as,bs) -> (Str as, Str bs)

-- | Take only the given number of bytes from the underlying representation.
-- See also 'take'.
byteTake :: UTF8Bytes string index => index -> UTF8 string -> UTF8 string
byteTake n (Str x) = Str (fst (G.bsplit n x))

-- | Drop the given number of bytes from the underlying representation.
-- See also 'drop'.
byteDrop :: UTF8Bytes string index => index -> UTF8 string -> UTF8 string
byteDrop n (Str x) = Str (G.bdrop n x)


-- | @take n s@ returns the first @n@ characters of @s@.
-- If @s@ has less than @n@ characters, then we return the whole of @s@.
take :: UTF8Bytes string index => index -> UTF8 string -> UTF8 string
take n (Str bs) = Str (G.take n bs)

-- | @drop n s@ returns the @s@ without its first @n@ characters.
-- If @s@ has less than @n@ characters, then we return an empty string.
drop :: UTF8Bytes string index => index -> UTF8 string -> UTF8 string
drop n (Str bs) = Str (G.drop n bs)

-- | Split a string into two parts:  the first is the longest prefix
-- that contains only characters that satisfy the predicate; the second
-- part is the rest of the string.
-- Invalid characters are passed as @\'\\0xFFFD\'@ to the predicate.
span :: UTF8Bytes string index
     => (Char -> Bool) -> UTF8 string -> (UTF8 string, UTF8 string)
span p (Str bs) = case G.span p bs of
                    (s1,s2) -> (Str s1, Str s2)

-- | Split a string into two parts:  the first is the longest prefix
-- that contains only characters that do not satisfy the predicate; the second
-- part is the rest of the string.
-- Invalid characters are passed as 'G.replacement_char' to the predicate.
break :: UTF8Bytes string index
      => (Char -> Bool) -> UTF8 string -> (UTF8 string, UTF8 string)
break p (Str bs)  = case G.break p bs of
                      (s1,s2) -> (Str s1, Str s2)

-- | Get the first character of a byte string, if any.
-- Invalid characters are replaced by 'G.replacement_char'.
uncons :: UTF8Bytes string index
       => UTF8 string -> Maybe (Char, UTF8 string)
uncons (Str x)  = do (c,y) <- G.uncons x
                     return (c, Str y)

-- | Extract the first character for the underlying representation,
-- if one is available.  It also returns the number of bytes used
-- in the representation of the character.
-- See also 'uncons'.
decode :: UTF8Bytes string index => UTF8 string -> Maybe (Char, index)
decode (Str x)  = G.decode x

-- | Traverse a bytestring (right biased).
foldr :: UTF8Bytes string index => (Char -> a -> a) -> a -> UTF8 string -> a
foldr cons nil (Str cs) = G.foldr cons nil cs

-- | Traverse a bytestring (left biased).
-- This function is strict in the accumulator.
foldl :: UTF8Bytes string index => (a -> Char -> a) -> a -> UTF8 string -> a
foldl add acc (Str cs)  = G.foldl add acc cs

-- | Counts the number of characters encoded in the bytestring.
-- Note that this includes replacement characters.
-- The function is linear in the number of bytes in the representation.
length :: UTF8Bytes string index => UTF8 string -> index
length (Str b) = G.length b

-- | Split a string into a list of lines.
-- Lines are terminated by @\'\\n\'@ or the end of the string.
-- Empty lines may not be terminated by the end of the string.
-- See also 'lines''.
lines :: UTF8Bytes string index => UTF8 string -> [UTF8 string]
lines (Str b) = map Str (G.lines b)   -- XXX: unnecessary map

-- | Split a string into a list of lines.
-- Lines are terminated by @\'\\n\'@ or the end of the string.
-- Empty lines may not be terminated by the end of the string.
-- This function preserves the terminators.
-- See also 'lines'.
lines' :: UTF8Bytes string index => UTF8 string -> [UTF8 string]
lines' (Str x)  = map Str (G.lines' x)  -- XXX: unnecessary map