File: TextBuilderCore.hs

package info (click to toggle)
haskell-text-builder-core 0.1.1.1-1
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 96 kB
  • sloc: haskell: 738; makefile: 3
file content (473 lines) | stat: -rw-r--r-- 15,245 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
{-# LANGUAGE CPP #-}
{-# OPTIONS_GHC -Wno-unused-top-binds #-}

module TextBuilderCore
  ( TextBuilder (..),

    -- * Destructors
    isEmpty,
    toText,

    -- * Constructors

    -- ** Text
    string,
    text,
    lazyText,

    -- ** Character
    char,
    unicodeCodepoint,

    -- ** Primitives
    unsafeChars,
    unsafeSeptets,
    unsafeReverseSeptets,
  )
where

import qualified Data.Text as Text
import qualified Data.Text.Array as TextArray
import qualified Data.Text.Internal as TextInternal
import qualified Data.Text.Lazy as TextLazy
import TextBuilderCore.Prelude

#if MIN_VERSION_text(2,0,0)
import qualified TextBuilderCore.Utf8View as Utf8View
#else
import qualified TextBuilderCore.Utf16View as Utf16View
#endif

-- |
-- Composable specification of how to efficiently construct strict 'Text'.
--
-- Provides instances of 'Semigroup' and 'Monoid', which have complexity of /O(1)/.
data TextBuilder
  = TextBuilder
      -- | Estimated maximum size of the byte array to allocate.
      --
      -- If the builder is empty it must be 0.
      -- Otherwise it must be greater than or equal to the amount of bytes to be written.
      --
      -- __Warning:__ Due to \"text\" switching from UTF-16 to UTF-8 since version 2, 'Word16' is used as the byte when the \"text\" version is @<2@ and 'Word8' is used when it's @>=2@.
      Int
      -- | Function that populates a preallocated byte array of the estimated maximum size specified above provided an offset into it and producing the offset after.
      --
      -- __Warning:__ The function must not write outside of the allocated array or bad things will happen to the running app.
      --
      -- __Warning:__ Keep in mind that the array is operating on 'Word8' values starting from @text-2.0@, but prior to it it operates on 'Word16'. This is due to the \"text\" library switching from UTF-16 to UTF-8 after version 2. To deal with this you have the following options:
      --
      -- 1. Restrict the version of the \"text\" library in your package to @>=2@.
      --
      -- 2. Use helpers provided by this library, such as 'unsafeSeptets' and 'unsafeReverseSeptets', which abstract over the differences in the underlying representation.
      --
      -- 3. Use CPP to conditionally compile your code for different versions of \"text\".
      (forall s. TextArray.MArray s -> Int -> ST s Int)

instance IsString TextBuilder where
  fromString = string

instance Show TextBuilder where
  show = show . toText

instance Eq TextBuilder where
  (==) = on (==) toText

instance Semigroup TextBuilder where
  {-# INLINE (<>) #-}
  (<>) (TextBuilder estimatedArraySizeL writeL) (TextBuilder estimatedArraySizeR writeR) =
    TextBuilder
      (estimatedArraySizeL + estimatedArraySizeR)
      ( \array offset -> do
          offsetAfter1 <- writeL array offset
          writeR array offsetAfter1
      )
  stimes n (TextBuilder maxSize write) =
    TextBuilder
      (maxSize * fromIntegral n)
      ( \array ->
          let go n offset =
                if n > 0
                  then do
                    offset <- write array offset
                    go (pred n) offset
                  else return offset
           in go n
      )

instance Monoid TextBuilder where
  {-# INLINE mempty #-}
  mempty = TextBuilder 0 (const return)
  {-# INLINE mconcat #-}
  mconcat list =
    TextBuilder
      (foldl' (\acc (TextBuilder maxSize _) -> acc + maxSize) 0 list)
      ( \array ->
          let go [] offset = return offset
              go (TextBuilder _ write : xs) offset = do
                offsetAfter <- write array offset
                go xs offsetAfter
           in go list
      )

instance Arbitrary TextBuilder where
  arbitrary = text . Text.pack <$> arbitrary
  shrink a = text . Text.pack <$> shrink (Text.unpack (toText a))

-- * Destructors

-- | Execute the builder producing a strict text.
toText :: TextBuilder -> Text
toText (TextBuilder maxSize write) =
  runST $ do
    array <- TextArray.new maxSize
    offsetAfter <- write array 0
    frozenArray <- TextArray.unsafeFreeze array
    return $ TextInternal.text frozenArray 0 offsetAfter

-- | Check whether the builder is empty.
{-# INLINE isEmpty #-}
isEmpty :: TextBuilder -> Bool
isEmpty (TextBuilder maxSize _) = maxSize == 0

-- * Constructors

-- | Construct from a list of characters.
{-# INLINE string #-}
string :: String -> TextBuilder
string string =
  unsafeChars (length string) string

-- | Strict text.
{-# INLINEABLE text #-}
text :: Text -> TextBuilder
#if MIN_VERSION_text(2,0,0)
text (TextInternal.Text array offset length) =
  TextBuilder length \builderArray builderOffset -> do
    TextArray.copyI length builderArray builderOffset array offset
    return $ builderOffset + length
#else
text (TextInternal.Text array offset length) =
  TextBuilder length \builderArray builderOffset -> do
    let builderOffsetAfter = builderOffset + length
    TextArray.copyI builderArray builderOffset array offset builderOffsetAfter
    return builderOffsetAfter
#endif

-- | Lazy text.
{-# INLINE lazyText #-}
lazyText :: TextLazy.Text -> TextBuilder
lazyText =
  TextLazy.foldrChunks (mappend . text) mempty

-- ** Codepoint

-- | Unicode character.
{-# INLINE char #-}
char :: Char -> TextBuilder
char = unicodeCodepoint . ord

-- | Safe Unicode codepoint with invalid values replaced by the @�@ char (codepoint @0xfffd@),
-- which is the same as what @Data.Text.'Data.Text.pack'@ does.
{-# INLINE unicodeCodepoint #-}
unicodeCodepoint :: Int -> TextBuilder
unicodeCodepoint = unsafeUnicodeCodepoint . project
  where
    project x =
      if x .&. 0x1ff800 /= 0xd800
        then x
        else 0xfffd

-- | Unicode codepoint.
--
-- __Warning:__ It is your responsibility to ensure that the codepoint is in proper range,
-- otherwise the produced text will be broken.
-- It must be in the range of 0x0000 to 0x10FFFF.
{-# INLINE unsafeUnicodeCodepoint #-}
unsafeUnicodeCodepoint :: Int -> TextBuilder
#if MIN_VERSION_text(2,0,0)
unsafeUnicodeCodepoint x =
  Utf8View.unicodeCodepoint x unsafeUtf8CodeUnits1 unsafeUtf8CodeUnits2 unsafeUtf8CodeUnits3 unsafeUtf8CodeUnits4
#else
unsafeUnicodeCodepoint x =
  Utf16View.unicodeCodepoint x unsafeUtf16CodeUnits1 unsafeUtf16CodeUnits2
#endif

-- | Single code-unit UTF-8 character.
unsafeUtf8CodeUnits1 :: Word8 -> TextBuilder
#if MIN_VERSION_text(2,0,0)
{-# INLINEABLE unsafeUtf8CodeUnits1 #-}
unsafeUtf8CodeUnits1 unit1 =
  TextBuilder 1 \array offset ->
    TextArray.unsafeWrite array offset unit1
      $> succ offset
#else
{-# INLINE unsafeUtf8CodeUnits1 #-}
unsafeUtf8CodeUnits1 unit1 =
  Utf16View.utf8CodeUnits1 unit1 unsafeUtf16CodeUnits1 unsafeUtf16CodeUnits2
#endif

-- | Double code-unit UTF-8 character.
unsafeUtf8CodeUnits2 :: Word8 -> Word8 -> TextBuilder
#if MIN_VERSION_text(2,0,0)
{-# INLINEABLE unsafeUtf8CodeUnits2 #-}
unsafeUtf8CodeUnits2 unit1 unit2 =
  TextBuilder 2 \array offset -> do
    TextArray.unsafeWrite array offset unit1
    TextArray.unsafeWrite array (offset + 1) unit2
    return $ offset + 2
#else
{-# INLINE unsafeUtf8CodeUnits2 #-}
unsafeUtf8CodeUnits2 unit1 unit2 =
  Utf16View.utf8CodeUnits2 unit1 unit2 unsafeUtf16CodeUnits1 unsafeUtf16CodeUnits2
#endif

-- | Triple code-unit UTF-8 character.
unsafeUtf8CodeUnits3 :: Word8 -> Word8 -> Word8 -> TextBuilder
#if MIN_VERSION_text(2,0,0)
{-# INLINEABLE unsafeUtf8CodeUnits3 #-}
unsafeUtf8CodeUnits3 unit1 unit2 unit3 =
  TextBuilder 3 \array offset -> do
    TextArray.unsafeWrite array offset unit1
    TextArray.unsafeWrite array (offset + 1) unit2
    TextArray.unsafeWrite array (offset + 2) unit3
    return $ offset + 3
#else
{-# INLINE unsafeUtf8CodeUnits3 #-}
unsafeUtf8CodeUnits3 unit1 unit2 unit3 =
  Utf16View.utf8CodeUnits3 unit1 unit2 unit3 unsafeUtf16CodeUnits1 unsafeUtf16CodeUnits2
#endif

-- | UTF-8 character out of 4 code units.
unsafeUtf8CodeUnits4 :: Word8 -> Word8 -> Word8 -> Word8 -> TextBuilder
#if MIN_VERSION_text(2,0,0)
{-# INLINEABLE unsafeUtf8CodeUnits4 #-}
unsafeUtf8CodeUnits4 unit1 unit2 unit3 unit4 =
  TextBuilder 4 \array offset -> do
    TextArray.unsafeWrite array offset unit1
    TextArray.unsafeWrite array (offset + 1) unit2
    TextArray.unsafeWrite array (offset + 2) unit3
    TextArray.unsafeWrite array (offset + 3) unit4
    return $ offset + 4
#else
{-# INLINE unsafeUtf8CodeUnits4 #-}
unsafeUtf8CodeUnits4 unit1 unit2 unit3 unit4 =
  Utf16View.utf8CodeUnits4 unit1 unit2 unit3 unit4 unsafeUtf16CodeUnits1 unsafeUtf16CodeUnits2
#endif

-- | Single code-unit UTF-16 character.
unsafeUtf16CodeUnits1 :: Word16 -> TextBuilder
#if MIN_VERSION_text(2,0,0)
{-# INLINE unsafeUtf16CodeUnits1 #-}
unsafeUtf16CodeUnits1 = unsafeUnicodeCodepoint . fromIntegral
#else
{-# INLINEABLE unsafeUtf16CodeUnits1 #-}
unsafeUtf16CodeUnits1 unit =
  TextBuilder 1 \array offset ->
    TextArray.unsafeWrite array offset unit
      $> succ offset
#endif

-- | Double code-unit UTF-16 character.
unsafeUtf16CodeUnits2 :: Word16 -> Word16 -> TextBuilder
#if MIN_VERSION_text(2,0,0)
{-# INLINE unsafeUtf16CodeUnits2 #-}
unsafeUtf16CodeUnits2 unit1 unit2 = unsafeUnicodeCodepoint cp
  where
    cp = (((fromIntegral unit1 .&. 0x3FF) `shiftL` 10) .|. (fromIntegral unit2 .&. 0x3FF)) + 0x10000
#else
{-# INLINEABLE unsafeUtf16CodeUnits2 #-}
unsafeUtf16CodeUnits2 unit1 unit2 =
  TextBuilder 2 \array offset -> do
    TextArray.unsafeWrite array offset unit1
    TextArray.unsafeWrite array (succ offset) unit2
    return $ offset + 2
#endif

-- * Basic Unsafe Primitives

-- |
-- Helper for constructing from char producers a bit more efficiently than via @(text . fromString)@.
--
-- >>> unsafeChars 3 "123"
-- "123"
--
-- >>> unsafeChars 4 "123"
-- "123"
{-# INLINE unsafeChars #-}
unsafeChars ::
  -- | Maximum size of the provided list of characters.
  --
  -- __Warning__: Must be greater than or equal to the length of the list.
  Int ->
  [Char] ->
  TextBuilder
#if MIN_VERSION_text(2,0,0)
unsafeChars maxChars chars =
  TextBuilder
    (maxChars * 4)
    ( \array ->
        foldr
          ( \char next offset ->
              Utf8View.unicodeCodepoint
                (ord char)
                ( \byte -> do
                    TextArray.unsafeWrite array offset byte
                    next (succ offset)
                )
                ( \byte1 byte2 -> do
                    TextArray.unsafeWrite array offset byte1
                    TextArray.unsafeWrite array (succ offset) byte2
                    next (offset + 2)
                )
                ( \byte1 byte2 byte3 -> do
                    TextArray.unsafeWrite array offset byte1
                    TextArray.unsafeWrite array (succ offset) byte2
                    TextArray.unsafeWrite array (offset + 2) byte3
                    next (offset + 3)
                )
                ( \byte1 byte2 byte3 byte4 -> do
                    TextArray.unsafeWrite array offset byte1
                    TextArray.unsafeWrite array (succ offset) byte2
                    TextArray.unsafeWrite array (offset + 2) byte3
                    TextArray.unsafeWrite array (offset + 3) byte4
                    next (offset + 4)
                )
          )
          return
          chars
    )
#else
unsafeChars maxChars chars =
  TextBuilder
    (maxChars * 2)
    ( \array ->
        foldr
          ( \char next offset ->
              Utf16View.unicodeCodepoint
                (ord char)
                ( \byte -> do
                    TextArray.unsafeWrite array offset byte
                    next (succ offset)
                )
                ( \byte1 byte2 -> do
                    TextArray.unsafeWrite array offset byte1
                    TextArray.unsafeWrite array (succ offset) byte2
                    next (offset + 2)
                )
          )
          return
          chars
    )
#endif

-- |
-- Provides a unified way to deal with the byte array regardless of the version of the @text@ library.
--
-- Keep in mind that prior to @text-2.0@, the array was operating on 'Word16' values due to the library abstracting over @UTF-16@.
-- Starting from @text-2.0@, the array operates on 'Word8' values and the library abstracts over @UTF-8@.
--
-- This function is useful for building ASCII values.
--
-- >>> unsafeSeptets 3 (fmap (+48) [1, 2, 3])
-- "123"
--
-- >>> unsafeSeptets 4 (fmap (+48) [1, 2, 3])
-- "123"
{-# INLINE unsafeSeptets #-}
unsafeSeptets ::
  -- | Maximum size of the byte array to allocate.
  --
  -- Must be greater than or equal to the length of the list.
  --
  -- __Warning:__ If it is smaller, bad things will happen.
  -- We'll be writing outside of the allocated array.
  Int ->
  -- | List of bytes to write.
  --
  -- __Warning:__ It is your responsibility to ensure that the bytes are smaller than 128.
  -- Otherwise the produced text will have a broken encoding.
  --
  -- To ensure of optimization kicking in it is advised to construct the list using 'GHC.List.build'.
  [Word8] ->
  TextBuilder
#if MIN_VERSION_text(2,0,0)
unsafeSeptets maxSize bytes =
  TextBuilder
    maxSize
    ( \array ->
        foldr
          ( \byte next offset -> do
              TextArray.unsafeWrite array offset byte
              next (succ offset)
          )
          return
          bytes
    )
#else
unsafeSeptets maxSize bytes =
  TextBuilder
    maxSize
    ( \array ->
        foldr
          ( \byte next offset -> do
              TextArray.unsafeWrite array offset (fromIntegral byte)
              next (succ offset)
          )
          return
          bytes
    )
#endif

-- | Same as 'unsafeSeptets', but writes the bytes in reverse order and requires the size to be precise.
--
-- >>> unsafeReverseSeptets 3 (fmap (+48) [1, 2, 3])
-- "321"
{-# INLINE unsafeReverseSeptets #-}
unsafeReverseSeptets ::
  -- | Precise amount of bytes in the list.
  --
  -- Needs to be precise, because writing happens in reverse order.
  --
  -- __Warning:__ If it is smaller, bad things will happen.
  -- We'll be writing outside of the allocated array.
  Int ->
  -- | List of bytes to write in reverse order.
  --
  -- __Warning:__ It is your responsibility to ensure that the bytes are smaller than 128.
  -- Otherwise the produced text will have a broken encoding.
  --
  -- To ensure of optimization kicking in it is advised to construct the list using 'GHC.List.build'.
  [Word8] ->
  TextBuilder
#if MIN_VERSION_text(2,0,0)
unsafeReverseSeptets preciseSize bytes =
  TextBuilder
    preciseSize
    ( \array startOffset ->
        let endOffset = startOffset + preciseSize
         in foldr
              ( \byte next offset -> do
                  TextArray.unsafeWrite array offset byte
                  next (pred offset)
              )
              (\_ -> return endOffset)
              bytes
              (pred endOffset)
    )
#else
unsafeReverseSeptets preciseSize bytes =
  TextBuilder
    preciseSize
    ( \array startOffset ->
        let endOffset = startOffset + preciseSize
         in foldr
              ( \byte next offset -> do
                  TextArray.unsafeWrite array offset (fromIntegral byte)
                  next (pred offset)
              )
              (\_ -> return endOffset)
              bytes
              (pred endOffset)
    )
#endif