File: Shape.hsc

package info (click to toggle)
haskell-text-icu 0.8.0.5-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 536 kB
  • sloc: haskell: 1,210; ansic: 1,147; makefile: 4
file content (105 lines) | stat: -rw-r--r-- 5,218 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
{-# LANGUAGE CPP, ForeignFunctionInterface #-}

-- |
-- Module      : Data.Text.ICU.Shape
-- Copyright   : (c) 2018 Ondrej Palkovsky
--
-- License     : BSD-style
-- Maintainer  : bos@serpentine.com
-- Stability   : experimental
-- Portability : GHC
--
-- Bindings for arabic shaping, implemented as bindings to
-- the International Components for Unicode (ICU) libraries.
--

module Data.Text.ICU.Shape
  (
      shapeArabic
    , ShapeOption(..)
  ) where

#include <unicode/ushape.h>

import Data.List (foldl')
import Data.Text.ICU.Error.Internal (UErrorCode, handleOverflowError)
import Data.Bits ((.|.))
import Data.Int (Int32)
import Foreign.Ptr (Ptr)
import Data.Text.ICU.Internal (UChar, useAsUCharPtr, fromUCharPtr)
import Data.Text (Text)
import System.IO.Unsafe (unsafePerformIO)

-- | Options for the 'shapeArabic' function.
data ShapeOption =
    AggregateTaskheel
  -- ^ Tashkeel aggregation option: Replaces any combination of U+0651 with one of U+064C, U+064D, U+064E, U+064F, U+0650 with U+FC5E, U+FC5F, U+FC60, U+FC61, U+FC62 consecutively.
  | DigitTypeAnExtended
  -- ^ Digit type option: Use Eastern (Extended) Arabic-Indic digits (U+06f0...U+06f9).
  | DigitsAlen2AnInitAl
  -- ^ Replace European digits (U+0030...) by Arabic-Indic digits if the most recent strongly
  -- directional character is an Arabic letter (u_charDirection() result U_RIGHT_TO_LEFT_ARABIC [AL]).
  | DigitsAlen2AnInitLr
  -- ^ Digit shaping option: Replace European digits (U+0030...) by Arabic-Indic digits if the most recent strongly directional character is an Arabic letter (u_charDirection() result U_RIGHT_TO_LEFT_ARABIC [AL]).
  | DigitsAn2En
  -- ^ Digit shaping option: Replace Arabic-Indic digits by European digits (U+0030...).
  | DigitsEn2An
  -- ^ Digit shaping option: Replace European digits (U+0030...) by Arabic-Indic digits.
  | LengthFixedSpacesAtBeginning
  -- ^ If more room is necessary, then try to consume spaces at the beginning of the text.
  | LengthFixedSpacesAtEnd
  -- ^ If more room is necessary, then try to consume spaces at the end of the text.
  | LengthFixedSpacesNear
  -- ^ If more room is necessary, then try to consume spaces next to modified characters.
  | LettersShape
  -- ^ Letter shaping option: replace abstract letter characters by "shaped" ones.
  | LettersUnshape
  -- ^ Letter shaping option: replace "shaped" letter characters by abstract ones.
  | LettersShapeTashkeelIsolated
  -- ^ The only difference with LettersShape is that Tashkeel letters are always "shaped" into the isolated form instead of the medial form (selecting codepoints from the Arabic Presentation Forms-B block).
  | PreservePresentation
  -- ^ Presentation form option: Don't replace Arabic Presentation Forms-A and Arabic Presentation Forms-B characters with 0+06xx characters, before shaping.
  | TextDirectionVisualLTR
  -- ^ Direction indicator: the source is in visual LTR order, the leftmost displayed character stored first.
  deriving (Show)

reduceShapeOpts :: [ShapeOption] -> Int32
reduceShapeOpts = foldl' orO 0
    where a `orO` b = a .|. fromShapeOption b

fromShapeOption :: ShapeOption -> Int32
fromShapeOption AggregateTaskheel   = #const U_SHAPE_AGGREGATE_TASHKEEL
fromShapeOption DigitTypeAnExtended   = #const U_SHAPE_DIGIT_TYPE_AN_EXTENDED
fromShapeOption DigitsAlen2AnInitAl = #const U_SHAPE_DIGITS_ALEN2AN_INIT_AL
fromShapeOption DigitsAlen2AnInitLr = #const U_SHAPE_DIGITS_ALEN2AN_INIT_LR
fromShapeOption DigitsAn2En = #const U_SHAPE_DIGITS_AN2EN
fromShapeOption DigitsEn2An = #const U_SHAPE_DIGITS_EN2AN
fromShapeOption LengthFixedSpacesAtBeginning = #const U_SHAPE_LENGTH_FIXED_SPACES_AT_BEGINNING
fromShapeOption LengthFixedSpacesAtEnd = #const U_SHAPE_LENGTH_FIXED_SPACES_AT_END
fromShapeOption LengthFixedSpacesNear = #const U_SHAPE_LENGTH_FIXED_SPACES_NEAR
fromShapeOption LettersShape = #const U_SHAPE_LETTERS_SHAPE
fromShapeOption LettersUnshape = #const U_SHAPE_LETTERS_UNSHAPE
fromShapeOption LettersShapeTashkeelIsolated = #const U_SHAPE_LETTERS_SHAPE_TASHKEEL_ISOLATED
fromShapeOption PreservePresentation = #const U_SHAPE_PRESERVE_PRESENTATION
fromShapeOption TextDirectionVisualLTR = #const U_SHAPE_TEXT_DIRECTION_VISUAL_LTR

-- | Shape Arabic text on a character basis.
--
-- Text-based shaping means that some character codepoints in the text are replaced by
-- others depending on the context. It transforms one kind of text into another.
-- In comparison, modern displays for Arabic text select appropriate, context-dependent font
-- glyphs for each text element, which means that they transform text into a glyph vector.
--
-- You probably want to call this with the LettersShape option in the default case.
shapeArabic :: [ShapeOption] -> Text -> Text
shapeArabic options t = unsafePerformIO . useAsUCharPtr t $ \sptr slen ->
  let slen' = fromIntegral slen
      options' = reduceShapeOpts options
  in handleOverflowError (fromIntegral slen)
      (\dptr dlen -> u_shapeArabic sptr slen' dptr (fromIntegral dlen) options')
      (\dptr dlen -> fromUCharPtr dptr (fromIntegral dlen))

foreign import ccall unsafe "hs_text_icu.h __hs_u_shapeArabic" u_shapeArabic
  :: Ptr UChar -> Int32
  -> Ptr UChar -> Int32
  -> Int32 -> Ptr UErrorCode -> IO Int32