File: String.hs

package info (click to toggle)
haskell-basement 0.0.16-3
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 1,048 kB
  • sloc: haskell: 11,336; ansic: 63; makefile: 5
file content (1474 lines) | stat: -rw-r--r-- 60,796 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
-- |
-- Module      : Basement.String
-- License     : BSD-style
-- Maintainer  : Vincent Hanquez <vincent@snarc.org>
-- Stability   : experimental
-- Portability : portable
--
-- A String type backed by a UTF8 encoded byte array and all the necessary
-- functions to manipulate the string.
--
-- You can think of String as a specialization of a byte array that
-- have element of type Char.
--
-- The String data must contain UTF8 valid data.
--
{-# LANGUAGE BangPatterns               #-}
{-# LANGUAGE MagicHash                  #-}
{-# LANGUAGE NoImplicitPrelude          #-}
{-# LANGUAGE TypeFamilies               #-}
{-# LANGUAGE UnboxedTuples              #-}
{-# LANGUAGE FlexibleContexts           #-}
{-# LANGUAGE CPP                        #-}
module Basement.String
    ( String(..)
    , MutableString(..)
    , create
    , replicate
    , length
    -- * Binary conversion
    , Encoding(..)
    , fromBytes
    , fromChunkBytes
    , fromBytesUnsafe
    , fromBytesLenient
    , toBytes
    , mutableValidate
    , copy
    , ValidationFailure(..)
    , index
    , null
    , drop
    , take
    , splitAt
    , revDrop
    , revTake
    , revSplitAt
    , splitOn
    , sub
    , elem
    , indices
    , intersperse
    , span
    , spanEnd
    , break
    , breakEnd
    , breakElem
    , breakLine
    , dropWhile
    , singleton
    , charMap
    , snoc
    , cons
    , unsnoc
    , uncons
    , find
    , findIndex
    , sortBy
    , filter
    , reverse
    , replace
    , builderAppend
    , builderBuild
    , builderBuild_
    , readInteger
    , readIntegral
    , readNatural
    , readDouble
    , readRational
    , readFloatingExact
    , upper
    , lower
    , caseFold
    , isPrefixOf
    , isSuffixOf
    , isInfixOf
    , stripPrefix
    , stripSuffix
    , all
    , any
    -- * Legacy utility
    , lines
    , words
    , toBase64
    , toBase64URL
    , toBase64OpenBSD
    ) where

import           Basement.UArray           (UArray)
import qualified Basement.UArray           as Vec
import qualified Basement.UArray           as C
import qualified Basement.UArray.Mutable   as MVec
import           Basement.Block.Mutable (Block(..), MutableBlock(..))
import qualified Basement.Block.Mutable    as MBLK
import           Basement.Compat.Bifunctor
import           Basement.Compat.Base
import           Basement.Compat.Natural
import           Basement.Compat.MonadTrans
import           Basement.Compat.Primitive
import           Basement.Types.OffsetSize
import           Basement.Numerical.Additive
import           Basement.Numerical.Subtractive
import           Basement.Numerical.Multiplicative
import           Basement.Numerical.Number
import           Basement.Cast
import           Basement.Monad
import           Basement.PrimType
import           Basement.FinalPtr
import           Basement.IntegralConv
import           Basement.Floating
import           Basement.MutableBuilder
import           Basement.String.CaseMapping (upperMapping, lowerMapping, foldMapping)
import           Basement.UTF8.Table
import           Basement.UTF8.Helper
import           Basement.UTF8.Base
import           Basement.UTF8.Types
import           Basement.UArray.Base as C (onBackendPrim, onBackend, onBackendPure, offset, ValidRange(..), offsetsValidRange, MUArray(..), MUArrayBackend(..))
import           Basement.Alg.Class (Indexable)
import qualified Basement.Alg.UTF8 as UTF8
import qualified Basement.Alg.String as Alg
import           Basement.Types.Char7 (Char7(..), c7Upper, c7Lower)
import qualified Basement.Types.Char7 as Char7
import           GHC.Prim
import           GHC.ST
import           GHC.Types
import           GHC.Word
#if MIN_VERSION_base(4,9,0)
import           GHC.Char
#endif

 -- temporary
import qualified Data.List
import           Data.Ratio
import           Data.Char (toUpper, toLower)
import qualified Prelude

import qualified Basement.String.Encoding.Encoding   as Encoder
import qualified Basement.String.Encoding.ASCII7     as Encoder
import qualified Basement.String.Encoding.UTF16      as Encoder
import qualified Basement.String.Encoding.UTF32      as Encoder
import qualified Basement.String.Encoding.ISO_8859_1 as Encoder

-- | UTF8 Encoder
data EncoderUTF8 = EncoderUTF8

instance Encoder.Encoding EncoderUTF8 where
    type Unit EncoderUTF8 = Word8
    type Error EncoderUTF8 = ValidationFailure
    encodingNext  _ = \ofs -> Right . nextWithIndexer ofs
    encodingWrite _ = writeWithBuilder

-- | Validate a bytearray for UTF8'ness
--
-- On success Nothing is returned
-- On Failure the position along with the failure reason
validate :: UArray Word8
         -> Offset8
         -> CountOf Word8
         -> (Offset8, Maybe ValidationFailure)
validate array ofsStart sz = C.unsafeDewrap goBa goAddr array
  where
    unTranslateOffset start = first (\e -> e `offsetSub` start)
    goBa ba start =
        unTranslateOffset start $ Alg.validate (start+end) ba (start + ofsStart)
    goAddr ptr@(Ptr !_) start =
        pure $ unTranslateOffset start $ Alg.validate (start+end) ptr (ofsStart + start)
    end = ofsStart `offsetPlusE` sz

-- | Similar to 'validate' but works on a 'MutableByteArray'
mutableValidate :: PrimMonad prim
                => MVec.MUArray Word8 (PrimState prim)
                -> Offset Word8
                -> CountOf Word8
                -> prim (Offset Word8, Maybe ValidationFailure)
mutableValidate mba ofsStart sz = do
    loop ofsStart
  where
    end = ofsStart `offsetPlusE` sz

    loop ofs
        | ofs > end  = error "mutableValidate: internal error: went pass offset"
        | ofs == end = return (end, Nothing)
        | otherwise  = do
            r <- one ofs
            case r of
                (nextOfs, Nothing)  -> loop nextOfs
                (pos, Just failure) -> return (pos, Just failure)

    one pos = do
        h <- StepASCII <$> Vec.unsafeRead mba pos
        let nbConts = getNbBytes h
        if nbConts == 0xff
            then return (pos, Just InvalidHeader)
            else if pos + 1 + Offset nbConts > end
                then return (pos, Just MissingByte)
                else do
                    case nbConts of
                        0 -> return (pos + 1, Nothing)
                        1 -> do
                            c1 <- Vec.unsafeRead mba (pos + 1)
                            if isContinuation c1
                                then return (pos + 2, Nothing)
                                else return (pos, Just InvalidContinuation)
                        2 -> do
                            c1 <- Vec.unsafeRead mba (pos + 1)
                            c2 <- Vec.unsafeRead mba (pos + 2)
                            if isContinuation c1 && isContinuation c2
                                then return (pos + 3, Nothing)
                                else return (pos, Just InvalidContinuation)
                        3 -> do
                            c1 <- Vec.unsafeRead mba (pos + 1)
                            c2 <- Vec.unsafeRead mba (pos + 2)
                            c3 <- Vec.unsafeRead mba (pos + 3)
                            if isContinuation c1 && isContinuation c2 && isContinuation c3
                                then return (pos + 4, Nothing)
                                else return (pos, Just InvalidContinuation)
                        _ -> error "internal error"

nextWithIndexer :: (Offset Word8 -> Word8)
                -> Offset Word8
                -> (Char, Offset Word8)
nextWithIndexer getter off =
    case getNbBytes# b# of
        0# -> (toChar h, off + 1)
        1# -> (toChar (decode2 (getter $ off + 1)), off + 2)
        2# -> (toChar (decode3 (getter $ off + 1) (getter $ off + 2)), off + 3)
        3# -> (toChar (decode4 (getter $ off + 1) (getter $ off + 2) (getter $ off + 3))
              , off + 4)
        r -> error ("next: internal error: invalid input: " <> show (I# r) <> " " <> show (W# h))
  where
    b@(W8# b#) = getter off
    !(W# h) = integralUpsize b

    toChar :: Word# -> Char
    toChar w = C# (chr# (word2Int# w))

    decode2 :: Word8 -> Word#
    decode2 (W8# b1) =
        or# (uncheckedShiftL# (and# h 0x1f##) 6#)
            (and# c1 0x3f##)
      where
        c1 = word8ToWord# b1

    decode3 :: Word8 -> Word8 -> Word#
    decode3 (W8# b1) (W8# b2) =
        or# (uncheckedShiftL# (and# h 0xf##) 12#)
            (or# (uncheckedShiftL# (and# c1 0x3f##) 6#)
                 (and# c2 0x3f##))
      where
        c1 = word8ToWord# b1
        c2 = word8ToWord# b2

    decode4 :: Word8 -> Word8 -> Word8 -> Word#
    decode4 (W8# b1) (W8# b2) (W8# b3) =
        or# (uncheckedShiftL# (and# h 0x7##) 18#)
            (or# (uncheckedShiftL# (and# c1 0x3f##) 12#)
                (or# (uncheckedShiftL# (and# c2 0x3f##) 6#)
                    (and# c3 0x3f##))
            )
      where
        c1 = word8ToWord# b1
        c2 = word8ToWord# b2
        c3 = word8ToWord# b3

writeWithBuilder :: (PrimMonad st, Monad st)
                 => Char
                 -> Builder (UArray Word8) (MVec.MUArray Word8) Word8 st err ()
writeWithBuilder c
    | bool# (ltWord# x 0x80##   ) = encode1
    | bool# (ltWord# x 0x800##  ) = encode2
    | bool# (ltWord# x 0x10000##) = encode3
    | otherwise = encode4
  where
    !(I# xi) = fromEnum c
    !x       = int2Word# xi

    encode1 = Vec.builderAppend (W8# (wordToWord8# x))

    encode2 = do
        let x1  = or# (uncheckedShiftRL# x 6#) 0xc0##
            x2  = toContinuation x
        Vec.builderAppend (W8# (wordToWord8# x1)) >> Vec.builderAppend (W8# (wordToWord8# x2))

    encode3 = do
        let x1  = or# (uncheckedShiftRL# x 12#) 0xe0##
            x2  = toContinuation (uncheckedShiftRL# x 6#)
            x3  = toContinuation x
        Vec.builderAppend (W8# (wordToWord8# x1)) >> Vec.builderAppend (W8# (wordToWord8# x2)) >> Vec.builderAppend (W8# (wordToWord8# x3))

    encode4 = do
        let x1  = or# (uncheckedShiftRL# x 18#) 0xf0##
            x2  = toContinuation (uncheckedShiftRL# x 12#)
            x3  = toContinuation (uncheckedShiftRL# x 6#)
            x4  = toContinuation x
        Vec.builderAppend (W8# (wordToWord8# x1)) >> Vec.builderAppend (W8# (wordToWord8# x2)) >> Vec.builderAppend (W8# (wordToWord8# x3)) >> Vec.builderAppend (W8# (wordToWord8# x4))

    toContinuation :: Word# -> Word#
    toContinuation w = or# (and# w 0x3f##) 0x80##

writeUTF8Char :: PrimMonad prim => MutableString (PrimState prim) -> Offset8 -> UTF8Char -> prim ()
writeUTF8Char (MutableString mba) i (UTF8_1 x1) =
    Vec.unsafeWrite mba i     x1
writeUTF8Char (MutableString mba) i (UTF8_2 x1 x2) = do
    Vec.unsafeWrite mba i     x1
    Vec.unsafeWrite mba (i+1) x2
writeUTF8Char (MutableString mba) i (UTF8_3 x1 x2 x3) = do
    Vec.unsafeWrite mba i     x1
    Vec.unsafeWrite mba (i+1) x2
    Vec.unsafeWrite mba (i+2) x3
writeUTF8Char (MutableString mba) i (UTF8_4 x1 x2 x3 x4) = do
    Vec.unsafeWrite mba i     x1
    Vec.unsafeWrite mba (i+1) x2
    Vec.unsafeWrite mba (i+2) x3
    Vec.unsafeWrite mba (i+3) x4
{-# INLINE writeUTF8Char #-}

unsafeFreezeShrink :: PrimMonad prim => MutableString (PrimState prim) -> CountOf Word8 -> prim String
unsafeFreezeShrink (MutableString mba) s = String <$> Vec.unsafeFreezeShrink mba s
{-# INLINE unsafeFreezeShrink #-}

------------------------------------------------------------------------
-- real functions

-- | Check if a String is null
null :: String -> Bool
null (String ba) = C.length ba == 0

-- we don't know in constant time the count of character in string,
-- however if we estimate bounds of what N characters would
-- take in space (between N and N*4). If the count is thus bigger than
-- the number of bytes, then we know for sure that it's going to
-- be out of bounds
countCharMoreThanBytes :: CountOf Char -> UArray Word8 -> Bool
countCharMoreThanBytes (CountOf chars) ba = chars >= bytes
  where (CountOf bytes) = C.length ba

-- | Create a string composed of a number @n of Chars (Unicode code points).
--
-- if the input @s contains less characters than required, then the input string is returned.
take :: CountOf Char -> String -> String
take n s@(String ba)
    | n <= 0                      = mempty
    | countCharMoreThanBytes n ba = s
    | otherwise                   = String $ Vec.unsafeTake (offsetAsSize $ indexN n s) ba

-- | Create a string with the remaining Chars after dropping @n Chars from the beginning
drop :: CountOf Char -> String -> String
drop n s@(String ba)
    | n <= 0                      = s
    | countCharMoreThanBytes n ba = mempty
    | otherwise                   = String $ Vec.drop (offsetAsSize $ indexN n s) ba

-- | Split a string at the Offset specified (in Char) returning both
-- the leading part and the remaining part.
splitAt :: CountOf Char -> String -> (String, String)
splitAt n s@(String ba)
    | n <= 0                      = (mempty, s)
    | countCharMoreThanBytes n ba = (s, mempty)
    | otherwise                   =
        let (v1,v2) = C.splitAt (offsetAsSize $ indexN n s) ba
         in (String v1, String v2)

-- | Return the offset (in bytes) of the N'th sequence in an UTF8 String
indexN :: CountOf Char -> String -> Offset Word8
indexN !n (String ba) = Vec.unsafeDewrap goVec goAddr ba
  where
    goVec :: Block Word8 -> Offset Word8 -> Offset Word8
    goVec (Block !ma) !start = loop start 0
      where
        !len = start `offsetPlusE` Vec.length ba
        loop :: Offset Word8 -> Offset Char -> Offset Word8
        loop !idx !i
            | idx >= len || i .==# n = sizeAsOffset (idx - start)
            | otherwise              = loop (idx `offsetPlusE` d) (i + Offset 1)
          where d = skipNextHeaderValue (primBaIndex ma idx)
    {-# INLINE goVec #-}

    goAddr :: Ptr Word8 -> Offset Word8 -> ST s (Offset Word8)
    goAddr (Ptr ptr) !start = return $ loop start (Offset 0)
      where
        !len = start `offsetPlusE` Vec.length ba
        loop :: Offset Word8 -> Offset Char -> Offset Word8
        loop !idx !i
            | idx >= len || i .==# n = sizeAsOffset (idx - start)
            | otherwise              = loop (idx `offsetPlusE` d) (i + Offset 1)
          where d = skipNextHeaderValue (primAddrIndex ptr idx)
    {-# INLINE goAddr #-}
{-# INLINE indexN #-}

-- inverse a CountOf that is specified from the end (e.g. take n Chars from the end)
--
-- rev{Take,Drop,SplitAt} TODO optimise:
-- we can process the string from the end using a skipPrev instead of getting the length
countFromStart :: String -> CountOf Char -> CountOf Char
countFromStart s sz@(CountOf sz')
    | sz >= len = CountOf 0
    | otherwise = CountOf (len' - sz')
  where len@(CountOf len') = length s

-- | Similar to 'take' but from the end
revTake :: CountOf Char -> String -> String
revTake n v = drop (countFromStart v n) v

-- | Similar to 'drop' but from the end
revDrop :: CountOf Char -> String -> String
revDrop n v = take (countFromStart v n) v

-- | Similar to 'splitAt' but from the end
revSplitAt :: CountOf Char -> String -> (String, String)
revSplitAt n v = (drop idx v, take idx v) where idx = countFromStart v n

-- | Split on the input string using the predicate as separator
--
-- e.g.
--
-- > splitOn (== ',') ","          == ["",""]
-- > splitOn (== ',') ",abc,"      == ["","abc",""]
-- > splitOn (== ':') "abc"        == ["abc"]
-- > splitOn (== ':') "abc::def"   == ["abc","","def"]
-- > splitOn (== ':') "::abc::def" == ["","","abc","","def"]
--
splitOn :: (Char -> Bool) -> String -> [String]
splitOn predicate s
    | sz == CountOf 0 = [mempty]
    | otherwise    = loop azero azero
  where
    !sz = size s
    end = azero `offsetPlusE` sz
    loop prevIdx idx
        | idx == end = [sub s prevIdx idx]
        | otherwise =
            let !(Step c idx') = next s idx
             in if predicate c
                    then sub s prevIdx idx : loop idx' idx'
                    else loop prevIdx idx'

-- | Internal call to make a substring given offset in bytes.
--
-- This is unsafe considering that one can create a substring
-- starting and/or ending on the middle of a UTF8 sequence.
sub :: String -> Offset8 -> Offset8 -> String
sub (String ba) start end = String $ Vec.sub ba start end

-- | Internal call to split at a given index in offset of bytes.
--
-- This is unsafe considering that one can split in the middle of a
-- UTF8 sequence, so use with care.
splitIndex :: Offset8 -> String -> (String, String)
splitIndex idx (String ba) = (String v1, String v2)
  where (v1,v2) = C.splitAt (offsetAsSize idx) ba

-- | Break a string into 2 strings at the location where the predicate return True
break :: (Char -> Bool) -> String -> (String, String)
break predicate s@(String ba) = runST $ Vec.unsafeIndexer ba go
  where
    !sz = size s
    end = azero `offsetPlusE` sz

    go :: (Offset Word8 -> Word8) -> ST st (String, String)
    go getIdx = loop (Offset 0)
      where
        !nextI = nextWithIndexer getIdx
        loop idx
            | idx == end = return (s, mempty)
            | otherwise  = do
                let (c, idx') = nextI idx
                case predicate c of
                    True  -> return $ splitIndex idx s
                    False -> loop idx'
        {-# INLINE loop #-}
{-# INLINE [2] break #-}

breakEnd :: (Char -> Bool) -> String -> (String, String)
breakEnd predicate s@(String arr)
    | k == end  = (s, mempty)
    | otherwise = splitIndex (k `offsetSub` start) s
  where
    k = C.onBackend goVec (\_ -> pure . goAddr) arr
    (C.ValidRange !start !end) = offsetsValidRange arr
    goVec ba@(Block !_) = let k = Alg.revFindIndexPredicate predicate ba start end
                        in if k == end then end else UTF8.nextSkip ba k
    goAddr ptr@(Ptr !_) =
        let k = Alg.revFindIndexPredicate predicate ptr start end
         in if k == end then end else UTF8.nextSkip ptr k
{-# INLINE [2] breakEnd #-}

#if MIN_VERSION_base(4,9,0)
{-# RULES "break (== 'c')" [3] forall c . break (eqChar c) = breakElem c #-}
#else
{-# RULES "break (== 'c')" [3] forall c . break (== c) = breakElem c #-}
#endif

-- | Break a string into 2 strings at the first occurence of the character
breakElem :: Char -> String -> (String, String)
breakElem !el s@(String ba)
    | sz == 0   = (mempty, mempty)
    | otherwise =
        case asUTF8Char el of
            UTF8_1 w -> let !(v1,v2) = Vec.breakElem w ba in (String v1, String v2)
            _        -> runST $ Vec.unsafeIndexer ba go
  where
    sz = size s
    end = azero `offsetPlusE` sz

    go :: (Offset Word8 -> Word8) -> ST st (String, String)
    go getIdx = loop (Offset 0)
      where
        !nextI = nextWithIndexer getIdx
        loop idx
            | idx == end = return (s, mempty)
            | otherwise  = do
                let (c, idx') = nextI idx
                case el == c of
                    True  -> return $ splitIndex idx s
                    False -> loop idx'

-- | Same as break but cut on a line feed with an optional carriage return.
--
-- This is the same operation as 'breakElem LF' dropping the last character of the
-- string if it's a CR.
--
-- Also for efficiency reason (streaming), it returns if the last character was a CR character.
breakLine :: String -> Either Bool (String, String)
breakLine (String arr) = bimap String String <$> Vec.breakLine arr

-- | Apply a @predicate@ to the string to return the longest prefix that satisfy the predicate and
-- the remaining
span :: (Char -> Bool) -> String -> (String, String)
span predicate s = break (not . predicate) s

-- | Apply a @predicate@ to the string to return the longest suffix that satisfy the predicate and
-- the remaining
spanEnd :: (Char -> Bool) -> String -> (String, String)
spanEnd predicate s = breakEnd (not . predicate) s

-- | Drop character from the beginning while the predicate is true
dropWhile :: (Char -> Bool) -> String -> String
dropWhile predicate = snd . break (not . predicate)

-- | Return whereas the string contains a specific character or not
elem :: Char -> String -> Bool
elem !el s@(String ba) =
    case asUTF8Char el of
        UTF8_1 w -> Vec.elem w ba
        _        -> runST $ Vec.unsafeIndexer ba go
  where
    sz = size s
    end = azero `offsetPlusE` sz

    go :: (Offset Word8 -> Word8) -> ST st Bool
    go getIdx = loop (Offset 0)
      where
        !nextI = nextWithIndexer getIdx
        loop !idx
            | idx == end = return False
            | otherwise  = do
                let (c, idx') = nextI idx
                case el == c of
                    True  -> return True
                    False -> loop idx'

-- | Intersperse the character @sep@ between each character in the string
--
-- > intersperse ' ' "Hello Foundation"
-- "H e l l o   F o u n d a t i o n"
intersperse :: Char -> String -> String
intersperse sep src = case length src - 1 of
    Nothing   -> src
    Just 0    -> src
    Just gaps -> runST $ unsafeCopyFrom src dstBytes go
        where
          lastSrcI :: Offset Char
          lastSrcI = 0 `offsetPlusE` gaps
          dstBytes = (size src :: CountOf Word8) + (gaps `scale` charToBytes (fromEnum sep))

          go :: String -> Offset Char -> Offset8 -> MutableString s -> Offset8 -> ST s (Offset8, Offset8)
          go src' srcI srcIdx dst dstIdx
              | srcI == lastSrcI = do
                  nextDstIdx <- write dst dstIdx c
                  return (nextSrcIdx, nextDstIdx)
              | otherwise        = do
                  nextDstIdx  <- write dst dstIdx c
                  nextDstIdx' <- write dst nextDstIdx sep
                  return (nextSrcIdx, nextDstIdx')
            where
              !(Step c nextSrcIdx) = next src' srcIdx

-- | Allocate a new @String@ with a fill function that has access to the characters of
--   the source @String@.
unsafeCopyFrom :: String -- ^ Source string
               -> CountOf Word8  -- ^ Length of the destination string in bytes
               -> (String -> Offset Char -> Offset8 -> MutableString s -> Offset8 -> ST s (Offset8, Offset8))
               -- ^ Function called for each character in the source String
               -> ST s String -- ^ Returns the filled new string
unsafeCopyFrom src dstBytes f = new dstBytes >>= fill (Offset 0) (Offset 0) (Offset 0) f >>= freeze
  where
    srcLen = length src
    end = Offset 0 `offsetPlusE` srcLen
    fill srcI srcIdx dstIdx f' dst'
        | srcI == end = return dst'
        | otherwise = do (nextSrcIdx, nextDstIdx) <- f' src srcI srcIdx dst' dstIdx
                         fill (srcI + Offset 1) nextSrcIdx nextDstIdx f' dst'

-- | Length of a String using CountOf
--
-- this size is available in o(n)
length :: String -> CountOf Char
length (String arr)
    | start == end = 0
    | otherwise    = C.onBackend goVec (\_ -> pure . goAddr) arr
  where
    (C.ValidRange !start !end) = offsetsValidRange arr
    goVec ma = UTF8.length ma start end
    goAddr ptr = UTF8.length ptr start end

-- | Replicate a character @c@ @n@ times to create a string of length @n@
replicate :: CountOf Char -> Char -> String
replicate (CountOf n) c = runST (new nbBytes >>= fill)
  where
    nbBytes   = scale (cast n :: Word) sz
    sz = charToBytes (fromEnum c)
    fill :: PrimMonad prim => MutableString (PrimState prim) -> prim String
    fill ms = loop (Offset 0)
      where
        loop idx
            | idx .==# nbBytes = freeze ms
            | otherwise        = write ms idx c >>= loop

-- | Copy the String
--
-- The slice of memory is copied to a new slice, making the new string
-- independent from the original string..
copy :: String -> String
copy (String s) = String (Vec.copy s)

-- | Create a single element String
singleton :: Char -> String
singleton c = runST $ do
    ms <- new nbBytes
    _  <- write ms (Offset 0) c
    freeze ms
  where
    !nbBytes = charToBytes (fromEnum c)

-- | Unsafely create a string of up to @sz@ bytes.
--
-- The callback @f@ needs to return the number of bytes filled in the underlaying
-- bytes buffer. No check is made on the callback return values, and if it's not
-- contained without the bounds, bad things will happen.
create :: PrimMonad prim => CountOf Word8 -> (MutableString (PrimState prim) -> prim (Offset Word8)) -> prim String
create sz f = do
    ms     <- new sz
    filled <- f ms
    if filled .==# sz
        then freeze ms
        else do
            s <- freeze ms
            let (String ba) = s
            pure $ String $ C.take (offsetAsSize filled) ba

-- | Monomorphically map the character in a string and return the transformed one
charMap :: (Char -> Char) -> String -> String
charMap f src
    | srcSz == 0 = mempty
    | otherwise  =
        let !(elems, nbBytes) = allocateAndFill [] (Offset 0) (CountOf 0)
         in runST $ do
                dest <- new nbBytes
                copyLoop dest elems (Offset 0 `offsetPlusE` nbBytes)
                freeze dest
  where
    !srcSz = size src
    srcEnd = azero `offsetPlusE` srcSz

    allocateAndFill :: [(String, CountOf Word8)]
                    -> Offset8
                    -> CountOf Word8
                    -> ([(String,CountOf Word8)], CountOf Word8)
    allocateAndFill acc idx bytesWritten
        | idx == srcEnd = (acc, bytesWritten)
        | otherwise     =
            let (el@(_,addBytes), idx') = runST $ do
                    -- make sure we allocate at least 4 bytes for the destination for the last few bytes
                    -- otherwise allocating less would bring the danger of spinning endlessly
                    -- and never succeeding.
                    let !diffBytes = srcEnd - idx
                        !allocatedBytes = if diffBytes <= CountOf 4 then CountOf 4 else diffBytes
                    ms <- new allocatedBytes
                    (dstIdx, srcIdx) <- fill ms allocatedBytes idx
                    s <- freeze ms
                    return ((s, dstIdx), srcIdx)
             in allocateAndFill (el : acc) idx' (bytesWritten + addBytes)

    fill :: PrimMonad prim
         => MutableString (PrimState prim)
         -> CountOf Word8
         -> Offset8
         -> prim (CountOf Word8, Offset8)
    fill mba dsz srcIdxOrig =
        loop (Offset 0) srcIdxOrig
      where
        endDst = (Offset 0) `offsetPlusE` dsz
        loop dstIdx srcIdx
            | srcIdx == srcEnd = return (offsetAsSize dstIdx, srcIdx)
            | dstIdx == endDst = return (offsetAsSize dstIdx, srcIdx)
            | otherwise        =
                let !(Step c srcIdx') = next src srcIdx
                    c' = f c -- the mapped char
                    !nbBytes = charToBytes (fromEnum c')
                 in -- check if we have room in the destination buffer
                    if dstIdx `offsetPlusE` nbBytes <= sizeAsOffset dsz
                        then do dstIdx' <- write mba dstIdx c'
                                loop dstIdx' srcIdx'
                        else return (offsetAsSize dstIdx, srcIdx)

    copyLoop _   []     (Offset 0) = return ()
    copyLoop _   []     n          = error ("charMap invalid: " <> show n)
    copyLoop ms@(MutableString mba) ((String ba, sz):xs) end = do
        let start = end `offsetMinusE` sz
        Vec.unsafeCopyAtRO mba start ba (Offset 0) sz
        copyLoop ms xs start

-- | Append a Char to the end of the String and return this new String
snoc :: String -> Char -> String
snoc s@(String ba) c
    | len == CountOf 0 = singleton c
    | otherwise     = runST $ do
        ms <- new (len + nbBytes)
        let (MutableString mba) = ms
        Vec.unsafeCopyAtRO mba (Offset 0) ba (Offset 0) len
        _ <- write ms (azero `offsetPlusE` len) c
        freeze ms
  where
    !len     = size s
    !nbBytes = charToBytes (fromEnum c)

-- | Prepend a Char to the beginning of the String and return this new String
cons :: Char -> String -> String
cons c s@(String ba)
  | len == CountOf 0 = singleton c
  | otherwise     = runST $ do
      ms <- new (len + nbBytes)
      let (MutableString mba) = ms
      idx <- write ms (Offset 0) c
      Vec.unsafeCopyAtRO mba idx ba (Offset 0) len
      freeze ms
  where
    !len     = size s
    !nbBytes = charToBytes (fromEnum c)

-- | Extract the String stripped of the last character and the last character if not empty
--
-- If empty, Nothing is returned
unsnoc :: String -> Maybe (String, Char)
unsnoc s@(String arr)
    | sz == 0   = Nothing
    | otherwise =
        let !(StepBack c idx) = prev s (sizeAsOffset sz)
         in Just (String $ Vec.take (offsetAsSize idx) arr, c)
  where
    sz = size s

-- | Extract the First character of a string, and the String stripped of the first character.
--
-- If empty, Nothing is returned
uncons :: String -> Maybe (Char, String)
uncons s@(String ba)
    | null s    = Nothing
    | otherwise =
        let !(Step c idx) = next s azero
         in Just (c, String $ Vec.drop (offsetAsSize idx) ba)

-- | Look for a predicate in the String and return the matched character, if any.
find :: (Char -> Bool) -> String -> Maybe Char
find predicate s = loop (Offset 0)
  where
    !sz = size s
    end = Offset 0 `offsetPlusE` sz
    loop idx
        | idx == end = Nothing
        | otherwise =
            let !(Step c idx') = next s idx
             in case predicate c of
                    True  -> Just c
                    False -> loop idx'

-- | Sort the character in a String using a specific sort function
--
-- TODO: optimise not going through a list
sortBy :: (Char -> Char -> Ordering) -> String -> String
sortBy sortF s = fromList $ Data.List.sortBy sortF $ toList s -- FIXME for tests

-- | Filter characters of a string using the predicate
filter :: (Char -> Bool) -> String -> String
filter predicate (String arr) = runST $ do
    (finalSize, dst) <- newNative sz $ \(MutableBlock mba) ->
        C.onBackendPrim (\ba@(Block !_) -> Alg.copyFilter predicate sz mba ba start)
                        (\fptr -> withFinalPtr fptr $ \ptr@(Ptr !_) -> Alg.copyFilter predicate sz mba ptr start)
                        arr
    freezeShrink finalSize dst
  where
    !sz    = C.length arr
    !start = C.offset arr

-- | Reverse a string
reverse :: String -> String
reverse (String arr) = runST $ do
    s <- newNative_ (C.length arr) $ \(MutableBlock mba) ->
            C.onBackendPrim
                (\ba@(Block !_) -> UTF8.reverse mba 0 ba start end)
                (\fptr -> withFinalPtr fptr $ \ptr@(Ptr !_) -> UTF8.reverse mba 0 ptr start end)
                arr
    freeze s
  where
    !(C.ValidRange start end) = C.offsetsValidRange arr

-- | Finds where are the insertion points when we search for a `needle`
-- within an `haystack`.
indices :: String -> String -> [Offset8]
indices (String ned) (String hy) = Vec.indices ned hy

-- | Replace all the occurrencies of `needle` with `replacement` in
-- the `haystack` string.
replace :: String -> String -> String -> String
replace (String needle) (String replacement) (String haystack) =
  String $ Vec.replace needle replacement haystack

-- | Return the nth character in a String
--
-- Compared to an array, the string need to be scanned from the beginning
-- since the UTF8 encoding is variable.
index :: String -> Offset Char -> Maybe Char
index s n
    | ofs >= end = Nothing
    | otherwise  =
        let (Step !c _) = next s ofs
         in Just c
  where
    !nbBytes = size s
    end = 0 `offsetPlusE` nbBytes
    ofs = indexN (offsetAsSize n) s

-- | Return the index in unit of Char of the first occurence of the predicate returning True
--
-- If not found, Nothing is returned
findIndex :: (Char -> Bool) -> String -> Maybe (Offset Char)
findIndex predicate s = loop 0 0
  where
    !sz = size s
    loop ofs idx
        | idx .==# sz = Nothing
        | otherwise   =
            let !(Step c idx') = next s idx
             in case predicate c of
                    True  -> Just ofs
                    False -> loop (ofs+1) idx'

-- | Various String Encoding that can be use to convert to and from bytes
data Encoding
    = ASCII7
    | UTF8
    | UTF16
    | UTF32
    | ISO_8859_1
    deriving (Typeable, Data, Eq, Ord, Show, Enum, Bounded)

fromEncoderBytes :: ( Encoder.Encoding encoding
                    , PrimType (Encoder.Unit encoding)
                    )
                 => encoding
                 -> UArray Word8
                 -> (String, Maybe ValidationFailure, UArray Word8)
fromEncoderBytes enc bytes =
    case runST $ Encoder.convertFromTo enc EncoderUTF8 (Vec.recast bytes) of
        -- TODO: Don't swallow up specific error (second element of pair)
        -- TODO: Confused why all this recasting is necessary. I "typed hole"-ed my way to get this function to compile.  Feels like there should be a cleaner method.
        Left (off, _) ->
            let (b1, b2) = Vec.splitAt (offsetAsSize off) (Vec.recast bytes)
            in (String $ Vec.recast b1, Just BuildingFailure, Vec.recast b2)
        Right converted -> (String converted, Nothing, mempty)

-- | Convert a ByteArray to a string assuming a specific encoding.
--
-- It returns a 3-tuple of:
--
-- * The string that has been succesfully converted without any error
-- * An optional validation error
-- * The remaining buffer that hasn't been processed (either as a result of an error, or because the encoded sequence is not fully available)
--
-- Considering a stream of data that is fetched chunk by chunk, it's valid to assume
-- that some sequence might fall in a chunk boundary. When converting chunks,
-- if the error is Nothing and the remaining buffer is not empty, then this buffer
-- need to be prepended to the next chunk
fromBytes :: Encoding -> UArray Word8 -> (String, Maybe ValidationFailure, UArray Word8)
fromBytes ASCII7     bytes = fromEncoderBytes Encoder.ASCII7     bytes
fromBytes ISO_8859_1 bytes = fromEncoderBytes Encoder.ISO_8859_1 bytes
fromBytes UTF16      bytes = fromEncoderBytes Encoder.UTF16      bytes
fromBytes UTF32      bytes = fromEncoderBytes Encoder.UTF32      bytes
fromBytes UTF8       bytes
    | C.null bytes = (mempty, Nothing, mempty)
    | otherwise    =
        case validate bytes (Offset 0) (C.length bytes) of
            (_, Nothing)  -> (fromBytesUnsafe bytes, Nothing, mempty)
            (pos, Just vf) ->
                let (b1, b2) = C.splitAt (offsetAsSize pos) bytes
                 in (fromBytesUnsafe b1, toErr vf, b2)
  where
    toErr MissingByte         = Nothing
    toErr InvalidHeader       = Just InvalidHeader
    toErr InvalidContinuation = Just InvalidContinuation
    toErr BuildingFailure     = Just BuildingFailure

-- | Convert a UTF8 array of bytes to a String.
--
-- If there's any error in the stream, it will automatically
-- insert replacement bytes to replace invalid sequences.
--
-- In the case of sequence that fall in the middle of 2 chunks,
-- the remaining buffer is supposed to be preprended to the
-- next chunk, and resume the parsing.
fromBytesLenient :: UArray Word8 -> (String, UArray Word8)
fromBytesLenient bytes
    | C.null bytes = (mempty, mempty)
    | otherwise    =
        case validate bytes (Offset 0) (C.length bytes) of
            (_, Nothing)                   -> (fromBytesUnsafe bytes, mempty)
            -- TODO: Should anything be done in the 'BuildingFailure' case?
            (_, Just BuildingFailure) -> error "fromBytesLenient: FIXME!"
            (pos, Just MissingByte) ->
                let (b1,b2) = C.splitAt (offsetAsSize pos) bytes
                 in (fromBytesUnsafe b1, b2)
            (pos, Just InvalidHeader) ->
                let (b1,b2) = C.splitAt (offsetAsSize pos) bytes
                    (_,b3)  = C.splitAt 1 b2
                    (s3, r) = fromBytesLenient b3
                 in (mconcat [fromBytesUnsafe b1,replacement, s3], r)
            (pos, Just InvalidContinuation) ->
                let (b1,b2) = C.splitAt (offsetAsSize pos) bytes
                    (_,b3)  = C.splitAt 1 b2
                    (s3, r) = fromBytesLenient b3
                 in (mconcat [fromBytesUnsafe b1,replacement, s3], r)
  where
    -- This is the replacement character U+FFFD used for any invalid header or continuation
    replacement :: String
    !replacement = fromBytesUnsafe $ fromList [0xef,0xbf,0xbd]

-- | Decode a stream of binary chunks containing UTF8 encoding in a list of valid String
--
-- Chunk not necessarily contains a valid string, as
-- a UTF8 sequence could be split over 2 chunks.
fromChunkBytes :: [UArray Word8] -> [String]
fromChunkBytes l = loop l
  where
    loop []         = []
    loop [bytes]    =
        case validate bytes (Offset 0) (C.length bytes) of
            (_, Nothing)  -> [fromBytesUnsafe bytes]
            (_, Just err) -> doErr err
    loop (bytes:cs@(c1:c2)) =
        case validate bytes (Offset 0) (C.length bytes) of
            (_, Nothing) -> fromBytesUnsafe bytes : loop cs
            (pos, Just MissingByte) ->
                let (b1,b2) = C.splitAt (offsetAsSize pos) bytes
                 in fromBytesUnsafe b1 : loop ((b2 `mappend` c1) : c2)
            (_, Just err) -> doErr err
    doErr err = error ("fromChunkBytes: " <> show err)

-- | Convert a Byte Array representing UTF8 data directly to a string without checking for UTF8 validity
--
-- If the input contains invalid sequences, it will trigger runtime async errors when processing data.
--
-- In doubt, use 'fromBytes'
fromBytesUnsafe :: UArray Word8 -> String
fromBytesUnsafe = String

toEncoderBytes :: ( Encoder.Encoding encoding
                  , PrimType (Encoder.Unit encoding)
                  , Exception (Encoder.Error encoding)
                  )
               => encoding
               -> UArray Word8
               -> UArray Word8
toEncoderBytes enc bytes = Vec.recast $
  case runST $ Encoder.convertFromTo EncoderUTF8 enc bytes of
    Left _ -> error "toEncoderBytes: FIXME!"
    Right converted -> converted

-- | Convert a String to a bytearray in a specific encoding
--
-- if the encoding is UTF8, the underlying buffer is returned without extra allocation or any processing
--
-- In any other encoding, some allocation and processing are done to convert.
toBytes :: Encoding -> String -> UArray Word8
toBytes UTF8       (String bytes) = bytes
toBytes ASCII7     (String bytes) = toEncoderBytes Encoder.ASCII7     bytes
toBytes ISO_8859_1 (String bytes) = toEncoderBytes Encoder.ISO_8859_1 bytes
toBytes UTF16      (String bytes) = toEncoderBytes Encoder.UTF16      bytes
toBytes UTF32      (String bytes) = toEncoderBytes Encoder.UTF32      bytes

-- | Split lines in a string using newline as separation.
--
-- Note that carriage return preceding a newline are also strip for
-- maximum compatibility between Windows and Unix system.
lines :: String -> [String]
lines s =
    case breakLine s of
        Left _         -> [s]
        Right (line,r) -> line : lines r

-- | Split words in a string using spaces as separation
--
-- > words "Hello Foundation"
-- [ "Hello", "Foundation" ]
words :: String -> [String]
words = fmap fromList . Prelude.words . toList

-- | Append a character to a String builder
builderAppend :: PrimMonad state => Char -> Builder String MutableString Word8 state err ()
builderAppend c = Builder $ State $ \(i, st, e) ->
    if offsetAsSize i + nbBytes >= chunkSize st
        then do
            cur      <- unsafeFreezeShrink (curChunk st) (offsetAsSize i)
            newChunk <- new (chunkSize st)
            writeUTF8Char newChunk (Offset 0) utf8Char
            return ((), (sizeAsOffset nbBytes, st { prevChunks     = cur : prevChunks st
                                                  , prevChunksSize = offsetAsSize i + prevChunksSize st
                                                  , curChunk       = newChunk
                                                  }, e))
        else do
            writeUTF8Char (curChunk st) i utf8Char
            return ((), (i + sizeAsOffset nbBytes, st, e))
  where
    utf8Char = asUTF8Char c
    nbBytes  = numBytes utf8Char

-- | Create a new String builder using chunks of @sizeChunksI@
builderBuild :: PrimMonad m => Int -> Builder String MutableString Word8 m err () -> m (Either err String)
builderBuild sizeChunksI sb
    | sizeChunksI <= 3 = builderBuild 64 sb
    | otherwise        = do
        firstChunk <- new sizeChunks
        (i, st, e) <- snd <$> runState (runBuilder sb) (Offset 0, BuildingState [] (CountOf 0) firstChunk sizeChunks, Nothing)
        case e of
          Just err -> return (Left err)
          Nothing -> do
            cur <- unsafeFreezeShrink (curChunk st) (offsetAsSize i)
            -- Build final array
            let totalSize = prevChunksSize st + offsetAsSize i
            final <- Vec.new totalSize >>= fillFromEnd totalSize (cur : prevChunks st) >>= Vec.unsafeFreeze
            return . Right . String $ final
  where
    sizeChunks = CountOf sizeChunksI

    fillFromEnd _    []            mba = return mba
    fillFromEnd !end (String x:xs) mba = do
        let sz = Vec.length x
        let start = end `sizeSub` sz
        Vec.unsafeCopyAtRO mba (sizeAsOffset start) x (Offset 0) sz
        fillFromEnd start xs mba

builderBuild_ :: PrimMonad m => Int -> Builder String MutableString Word8 m () () -> m String
builderBuild_ sizeChunksI sb = either (\() -> internalError "impossible output") id <$> builderBuild sizeChunksI sb

stringDewrap :: (Block Word8 -> Offset Word8 -> a)
             -> (Ptr Word8 -> Offset Word8 -> ST s a)
             -> String
             -> a
stringDewrap withBa withPtr (String ba) = C.unsafeDewrap withBa withPtr ba
{-# INLINE stringDewrap #-}

-- | Read an Integer from a String
--
-- Consume an optional minus sign and many digits until end of string.
readIntegral :: (HasNegation i, IntegralUpsize Word8 i, Additive i, Multiplicative i, IsIntegral i) => String -> Maybe i
readIntegral str
    | sz == 0   = Nothing
    | otherwise = stringDewrap withBa (\ptr@(Ptr !_) -> pure . withPtr ptr) str
  where
    !sz = size str
    withBa ba ofs =
        let negativeSign = UTF8.expectAscii ba ofs 0x2d
            startOfs     = if negativeSign then succ ofs else ofs
         in case decimalDigitsBA 0 ba endOfs startOfs of
                (# acc, True, endOfs' #) | endOfs' > startOfs -> Just $! if negativeSign then negate acc else acc
                _                                             -> Nothing
      where !endOfs = ofs `offsetPlusE` sz
    withPtr addr ofs =
        let negativeSign = UTF8.expectAscii addr ofs 0x2d
            startOfs     = if negativeSign then succ ofs else ofs
         in case decimalDigitsPtr 0 addr endOfs startOfs of
                (# acc, True, endOfs' #) | endOfs' > startOfs -> Just $! if negativeSign then negate acc else acc
                _                                             -> Nothing
      where !endOfs = ofs `offsetPlusE` sz
{-# SPECIALISE readIntegral :: String -> Maybe Integer #-}
{-# SPECIALISE readIntegral :: String -> Maybe Int #-}

readInteger :: String -> Maybe Integer
readInteger = readIntegral

-- | Read a Natural from a String
--
-- Consume many digits until end of string.
readNatural :: String -> Maybe Natural
readNatural str
    | sz == 0   = Nothing
    | otherwise = stringDewrap withBa (\ptr@(Ptr !_) -> pure . withPtr ptr) str
  where
    !sz = size str
    withBa ba stringStart =
        case decimalDigitsBA 0 ba eofs stringStart of
            (# acc, True, endOfs #) | endOfs > stringStart -> Just acc
            _                                              -> Nothing
      where eofs = stringStart `offsetPlusE` sz
    withPtr addr stringStart =
        case decimalDigitsPtr 0 addr eofs stringStart of
            (# acc, True, endOfs #) | endOfs > stringStart -> Just acc
            _                                              -> Nothing
      where eofs = stringStart `offsetPlusE` sz

-- | Try to read a Double
readDouble :: String -> Maybe Double
readDouble s =
    readFloatingExact s $ \isNegative integral floatingDigits mExponant ->
        Just $ applySign isNegative $ case (floatingDigits, mExponant) of
            (0, Nothing)              ->                         naturalToDouble integral
            (0, Just exponent)        -> withExponant exponent $ naturalToDouble integral
            (floating, Nothing)       ->                         applyFloating floating $ naturalToDouble integral
            (floating, Just exponent) -> withExponant exponent $ applyFloating floating $ naturalToDouble integral
  where
    applySign True = negate
    applySign False = id
    withExponant e v = v * doubleExponant 10 e
    applyFloating digits n = n / (10 Prelude.^ digits)

-- | Try to read a floating number as a Rational
--
-- Note that for safety reason, only exponent between -10000 and 10000 is allowed
-- as otherwise DoS/OOM is very likely. if you don't want this behavior,
-- switching to a scientific type (not provided yet) that represent the
-- exponent separately is the advised solution.
readRational :: String -> Maybe Prelude.Rational
readRational s =
    readFloatingExact s $ \isNegative integral floatingDigits mExponant ->
        case mExponant of
            Just exponent
                | exponent < -10000 || exponent > 10000 -> Nothing
                | otherwise                             -> Just $ modF isNegative integral % (10 Prelude.^ (cast floatingDigits - exponent))
            Nothing                                     -> Just $ modF isNegative integral % (10 Prelude.^ floatingDigits)
  where
    modF True  = negate . integralUpsize
    modF False = integralUpsize


type ReadFloatingCallback a = Bool      -- sign
                           -> Natural   -- integral part
                           -> Word      -- number of digits in floating section
                           -> Maybe Int -- optional integer representing exponent in base 10
                           -> Maybe a

-- | Read an Floating like number of the form:
--
--   [ '-' ] <numbers> [ '.' <numbers> ] [ ( 'e' | 'E' ) [ '-' ] <number> ]
--
-- Call a function with:
--
-- * A boolean representing if the number is negative
-- * The digits part represented as a single natural number (123.456 is represented as 123456)
-- * The number of digits in the fractional part (e.g. 123.456 => 3)
-- * The exponent if any
--
-- The code is structured as a simple state machine that:
--
-- * Optionally Consume a '-' sign
-- * Consume number for the integral part
-- * Optionally
--   * Consume '.'
--   * Consume remaining digits if not already end of string
-- * Optionally Consume a 'e' or 'E' follow by an optional '-' and a number
--
readFloatingExact :: String -> ReadFloatingCallback a -> Maybe a
readFloatingExact str f
    | sz == 0   = Nothing
    | otherwise = stringDewrap withBa withPtr str
  where
    !sz = size str

    withBa ba stringStart =
        let !isNegative = UTF8.expectAscii ba stringStart 0x2d
         in consumeIntegral isNegative (if isNegative then stringStart+1 else stringStart)
      where
        eofs = stringStart `offsetPlusE` sz
        consumeIntegral !isNegative startOfs =
            case decimalDigitsBA 0 ba eofs startOfs of
                (# acc, True , endOfs #) | endOfs > startOfs -> f isNegative acc 0 Nothing -- end of stream and no '.'
                (# acc, False, endOfs #) | endOfs > startOfs ->
                    if UTF8.expectAscii ba endOfs 0x2e
                        then consumeFloat isNegative acc (endOfs + 1)
                        else consumeExponant isNegative acc 0 endOfs
                _                                            -> Nothing

        consumeFloat isNegative integral startOfs =
            case decimalDigitsBA integral ba eofs startOfs of
                (# acc, True, endOfs #) | endOfs > startOfs -> let (CountOf !diff) = endOfs - startOfs
                                                                in f isNegative acc (cast diff) Nothing
                (# acc, False, endOfs #) | endOfs > startOfs -> let (CountOf !diff) = endOfs - startOfs
                                                                in consumeExponant isNegative acc (cast diff) endOfs
                _                                           -> Nothing

        consumeExponant !isNegative !integral !floatingDigits !startOfs
            | startOfs == eofs = f isNegative integral floatingDigits Nothing
            | otherwise        =
                -- consume 'E' or 'e'
                case UTF8.nextAscii ba startOfs of
                    StepASCII 0x45 -> consumeExponantSign (startOfs+1)
                    StepASCII 0x65 -> consumeExponantSign (startOfs+1)
                    _              -> Nothing
          where
            consumeExponantSign ofs
                | ofs == eofs = Nothing
                | otherwise   = let exponentNegative = UTF8.expectAscii ba ofs 0x2d
                                 in consumeExponantNumber exponentNegative (if exponentNegative then ofs + 1 else ofs)

            consumeExponantNumber exponentNegative ofs =
                case decimalDigitsBA 0 ba eofs ofs of
                    (# acc, True, endOfs #) | endOfs > ofs -> f isNegative integral floatingDigits (Just $! if exponentNegative then negate acc else acc)
                    _                                      -> Nothing
    withPtr ptr@(Ptr !_) stringStart = pure $
        let !isNegative = UTF8.expectAscii ptr stringStart 0x2d
         in consumeIntegral isNegative (if isNegative then stringStart+1 else stringStart)
      where
        eofs = stringStart `offsetPlusE` sz
        consumeIntegral !isNegative startOfs =
            case decimalDigitsPtr 0 ptr eofs startOfs of
                (# acc, True , endOfs #) | endOfs > startOfs -> f isNegative acc 0 Nothing -- end of stream and no '.'
                (# acc, False, endOfs #) | endOfs > startOfs ->
                    if UTF8.expectAscii ptr endOfs 0x2e
                        then consumeFloat isNegative acc (endOfs + 1)
                        else consumeExponant isNegative acc 0 endOfs
                _                                            -> Nothing

        consumeFloat isNegative integral startOfs =
            case decimalDigitsPtr integral ptr eofs startOfs of
                (# acc, True, endOfs #) | endOfs > startOfs -> let (CountOf !diff) = endOfs - startOfs
                                                                in f isNegative acc (cast diff) Nothing
                (# acc, False, endOfs #) | endOfs > startOfs -> let (CountOf !diff) = endOfs - startOfs
                                                                in consumeExponant isNegative acc (cast diff) endOfs
                _                                           -> Nothing

        consumeExponant !isNegative !integral !floatingDigits !startOfs
            | startOfs == eofs = f isNegative integral floatingDigits Nothing
            | otherwise        =
                -- consume 'E' or 'e'
                case UTF8.nextAscii ptr startOfs of
                    StepASCII 0x45 -> consumeExponantSign (startOfs+1)
                    StepASCII 0x65 -> consumeExponantSign (startOfs+1)
                    _              -> Nothing
          where
            consumeExponantSign ofs
                | ofs == eofs = Nothing
                | otherwise   = let exponentNegative = UTF8.expectAscii ptr ofs 0x2d
                                 in consumeExponantNumber exponentNegative (if exponentNegative then ofs + 1 else ofs)

            consumeExponantNumber exponentNegative ofs =
                case decimalDigitsPtr 0 ptr eofs ofs of
                    (# acc, True, endOfs #) | endOfs > ofs -> f isNegative integral floatingDigits (Just $! if exponentNegative then negate acc else acc)
                    _                                      -> Nothing

-- | Take decimal digits and accumulate it in `acc`
--
-- The loop starts at the offset specified and finish either when:
--
-- * It reach the end of the string
-- * It reach a non-ASCII character
-- * It reach an ASCII character that is not a digit (0 to 9)
--
-- Otherwise each iterations:
--
-- * Transform the ASCII digits into a number
-- * scale the accumulator by 10
-- * Add the number (between 0 and 9) to the accumulator
--
-- It then returns:
--
-- * The new accumulated value
-- * Whether it stop by end of string or not
-- * The end offset when the loop stopped
--
-- If end offset == start offset then no digits have been consumed by
-- this function
decimalDigitsBA :: (IntegralUpsize Word8 acc, Additive acc, Multiplicative acc, Integral acc)
                => acc
                -> Block Word8
                -> Offset Word8 -- end offset
                -> Offset Word8 -- start offset
                -> (# acc, Bool, Offset Word8 #)
decimalDigitsBA startAcc ba !endOfs !startOfs = loop startAcc startOfs
  where
    loop !acc !ofs
        | ofs == endOfs = (# acc, True, ofs #)
        | otherwise     =
            case UTF8.nextAsciiDigit ba ofs of
                sg@(StepDigit d) | isValidStepDigit sg -> loop (10 * acc + integralUpsize d) (succ ofs)
                                 | otherwise           -> (# acc, False, ofs #)
{-# SPECIALIZE decimalDigitsBA :: Integer -> Block Word8 -> Offset Word8 -> Offset Word8 -> (# Integer, Bool, Offset Word8 #) #-}
{-# SPECIALIZE decimalDigitsBA :: Natural -> Block Word8 -> Offset Word8 -> Offset Word8 -> (# Natural, Bool, Offset Word8 #) #-}
{-# SPECIALIZE decimalDigitsBA :: Int -> Block Word8 -> Offset Word8 -> Offset Word8 -> (# Int, Bool, Offset Word8 #) #-}
{-# SPECIALIZE decimalDigitsBA :: Word -> Block Word8 -> Offset Word8 -> Offset Word8 -> (# Word, Bool, Offset Word8 #) #-}

-- | same as decimalDigitsBA specialized for ptr #
decimalDigitsPtr :: (IntegralUpsize Word8 acc, Additive acc, Multiplicative acc, Integral acc)
                 => acc
                 -> Ptr Word8
                 -> Offset Word8 -- end offset
                 -> Offset Word8 -- start offset
                 -> (# acc, Bool, Offset Word8 #)
decimalDigitsPtr startAcc ptr !endOfs !startOfs = loop startAcc startOfs
  where
    loop !acc !ofs
        | ofs == endOfs = (# acc, True, ofs #)
        | otherwise     =
            case UTF8.nextAsciiDigit ptr ofs of
                sg@(StepDigit d) | isValidStepDigit sg -> loop (10 * acc + integralUpsize d) (succ ofs)
                                 | otherwise           -> (# acc, False, ofs #)
{-# SPECIALIZE decimalDigitsPtr :: Integer -> Ptr Word8 -> Offset Word8 -> Offset Word8 -> (# Integer, Bool, Offset Word8 #) #-}
{-# SPECIALIZE decimalDigitsPtr :: Natural -> Ptr Word8 -> Offset Word8 -> Offset Word8 -> (# Natural, Bool, Offset Word8 #) #-}
{-# SPECIALIZE decimalDigitsPtr :: Int -> Ptr Word8 -> Offset Word8 -> Offset Word8 -> (# Int, Bool, Offset Word8 #) #-}
{-# SPECIALIZE decimalDigitsPtr :: Word -> Ptr Word8 -> Offset Word8 -> Offset Word8 -> (# Word, Bool, Offset Word8 #) #-}

-- | Convert a 'String' 'Char' by 'Char' using a case mapping function.
caseConvert :: (Char7 -> Char7) -> (Char -> CM) -> String -> String
caseConvert opASCII op s@(String arr) = runST $ do
  mba <- MBLK.new iLen
  nL <- C.onBackendPrim
        (\blk  -> go mba blk (Offset 0) start)
        (\fptr -> withFinalPtr fptr $ \ptr -> go mba ptr (Offset 0) start)
        arr
  freeze . MutableString $ MVec.MUArray 0 nL (C.MUArrayMBA mba)
  where
    !(C.ValidRange start end) = C.offsetsValidRange arr
    !iLen = 1 + C.length arr
    go :: (Indexable container Word8, PrimMonad prim)
       => MutableBlock Word8 (PrimState prim)
       -> container
       -> Offset Word8
       -> Offset Word8
       -> prim (CountOf Word8)
    go !dst !src = loop dst iLen 0
      where
        eSize !e = if e == '\0' then 0 else charToBytes (fromEnum e)
        loop !dst !allocLen !nLen !dstIdx !srcIdx
          | srcIdx == end    = return nLen
          | nLen == allocLen = realloc
          | headerIsAscii h  = do
                UTF8.writeASCII dst dstIdx (opASCII $ Char7 $ stepAsciiRawValue h)
                loop dst allocLen (nLen + 1) (dstIdx+Offset 1) (srcIdx+Offset 1)
          | otherwise = do
              let !(CM c1 c2 c3) = op c
                  !(Step c nextSrcIdx) = UTF8.nextWith h src (srcIdx+Offset 1)
              nextDstIdx <- UTF8.writeUTF8 dst dstIdx c1
              if c2 == '\0' -- We keep the most common case loop as short as possible.
                then loop dst allocLen (nLen + charToBytes (fromEnum c1)) nextDstIdx nextSrcIdx
                else do
                  let !cSize = eSize c1 + eSize c2 + eSize c3
                  nextDstIdx <- UTF8.writeUTF8 dst nextDstIdx c2
                  nextDstIdx <- if c3 == '\0' then return nextDstIdx else UTF8.writeUTF8 dst nextDstIdx c3
                  loop dst allocLen (nLen + cSize) nextDstIdx nextSrcIdx
          where
            {-# NOINLINE realloc #-}
            realloc = do
              let nAll = allocLen + allocLen + 1
              nDst <- MBLK.new nAll
              MBLK.unsafeCopyElements nDst 0 dst 0 nLen
              loop nDst nAll nLen dstIdx srcIdx
            h = UTF8.nextAscii src srcIdx

-- | Convert a 'String' to the upper-case equivalent.
upper :: String -> String
upper = caseConvert c7Upper upperMapping

-- | Convert a 'String' to the upper-case equivalent.
lower :: String -> String
lower = caseConvert c7Lower lowerMapping

-- | Convert a 'String' to the unicode case fold equivalent.
--
-- Case folding is mostly used for caseless comparison of strings.
caseFold :: String -> String
caseFold = caseConvert c7Upper foldMapping

-- | Check whether the first string is a prefix of the second string.
isPrefixOf :: String -> String -> Bool
isPrefixOf (String needle) (String haystack) = C.isPrefixOf needle haystack

-- | Check whether the first string is a suffix of the second string.
isSuffixOf :: String -> String -> Bool
isSuffixOf (String needle) (String haystack)
    | needleLen > hayLen = False
    | otherwise          = needle == C.revTake needleLen haystack
  where
    needleLen = C.length needle
    hayLen    = C.length haystack

-- | Check whether the first string is contains within the second string.
--
-- TODO: implemented the naive way and thus terribly inefficient, reimplement properly
isInfixOf :: String -> String -> Bool
isInfixOf (String needle) (String haystack)
    = loop (hayLen - needleLen) haystack
    where
      needleLen = C.length needle
      hayLen    = C.length haystack
      loop Nothing    _         = False
      loop (Just cnt) haystack' = needle == C.take needleLen haystack' || loop (cnt-1) (C.drop 1 haystack')

-- | Try to strip a prefix from the start of a String.
--
-- If the prefix is not starting the string, then Nothing is returned,
-- otherwise the striped string is returned
stripPrefix :: String -> String -> Maybe String
stripPrefix (String suffix) (String arr)
    | C.isPrefixOf suffix arr = Just $ String $ C.drop (C.length suffix) arr
    | otherwise               = Nothing

-- | Try to strip a suffix from the end of a String.
--
-- If the suffix is not ending the string, then Nothing is returned,
-- otherwise the striped string is returned
stripSuffix :: String -> String -> Maybe String
stripSuffix (String prefix) (String arr)
    | C.isSuffixOf prefix arr = Just $ String $ C.revDrop (C.length prefix) arr
    | otherwise               = Nothing

all :: (Char -> Bool) -> String -> Bool
all predicate (String arr) = C.onBackend goBA (\_ -> pure . goAddr) arr
  where
    !(C.ValidRange start end) = C.offsetsValidRange arr
    goBA ba   = UTF8.all predicate ba start end
    goAddr addr = UTF8.all predicate addr start end

any :: (Char -> Bool) -> String -> Bool
any predicate (String arr) = C.onBackend goBA (\_ -> pure . goAddr) arr
  where
    !(C.ValidRange start end) = C.offsetsValidRange arr
    goBA ba   = UTF8.any predicate ba start end
    goAddr addr = UTF8.any predicate addr start end

-- | Transform string @src@ to base64 binary representation.
toBase64 :: String -> String
toBase64 (String src) = fromBytesUnsafe . Vec.toBase64Internal set src $ True
  where
    !set = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"#

-- | Transform string @src@ to URL-safe base64 binary representation.
-- The result will be either padded or unpadded, depending on the boolean
-- @padded@ argument.
toBase64URL :: Bool -> String -> String
toBase64URL padded (String src) = fromBytesUnsafe . Vec.toBase64Internal set src $ padded
  where
    !set = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"#

-- | Transform string @src@ to OpenBSD base64 binary representation.
toBase64OpenBSD :: String -> String
toBase64OpenBSD (String src) = fromBytesUnsafe . Vec.toBase64Internal set src $ False
  where
    !set = "./ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"#