File: English.hs

package info (click to toggle)
haskell-minimorph 0.3.0.1-3
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 88 kB
  • sloc: haskell: 320; makefile: 6
file content (280 lines) | stat: -rw-r--r-- 9,148 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
{-# LANGUAGE ViewPatterns #-}
-- TODO : learn how to use Functional Morphology instead
-- |Simple default rules for English morphology
module NLP.Minimorph.English where

#if !(MIN_VERSION_base(4,11,0))
  -- this is redundant starting with base-4.11 / GHC 8.4
import Data.Semigroup
#endif

import           Data.Char (isSpace, isUpper, toLower)
import           Data.Text (Text)
import qualified Data.Text as T

import NLP.Minimorph.Util

-- ---------------------------------------------------------------------
-- ** Punctuation
-- ---------------------------------------------------------------------

-- | No Oxford commas, alas.
--
-- > commas "and" "foo bar"       == "foo and bar"
-- > commas "and" "foo, bar, baz" == "foo, bar and baz"
commas :: Text -> [Text] -> Text
commas _ []  = ""
commas _ [x] = x
commas et xs = T.intercalate ", " (init xs) <+> et <+> last xs

-- ---------------------------------------------------------------------
-- ** Numbers
-- ---------------------------------------------------------------------

-- | > cardinal 0 == "zero"
--   > cardinal 1 == "one"
--   > cardinal 2 == "two"
--   > cardinal 10 == "ten"
--   > cardinal 11 == "11"
cardinal :: Int -> Text
cardinal n = case n of
    0  -> "zero"
    1  -> "one"
    2  -> "two"
    3  -> "three"
    4  -> "four"
    5  -> "five"
    6  -> "six"
    7  -> "seven"
    8  -> "eight"
    9  -> "nine"
    10 -> "ten"
    _ -> tshow n

-- | > ordinalNotSpelled 1 == "1st"
--   > ordinalNotSpelled 2 == "2nd"
--   > ordinalNotSpelled 11 == "11th"
ordinalNotSpelled :: Int -> Text
ordinalNotSpelled k = case abs k `rem` 100 of
  n | n > 3 && n < 21 -> k `suf` "th"
    | n `rem` 10 == 1 -> k `suf` "st"
    | n `rem` 10 == 2 -> k `suf` "nd"
    | n `rem` 10 == 3 -> k `suf` "rd"
    | otherwise       -> k `suf` "th"
 where
  num `suf` s = tshow num <> s

-- | > ordinal 1 == "first"
--   > ordinal 2 == "second"
--   > ordinal 3 == "third"
--   > ordinal 11 == "11th"
--   > ordinal 42 == "42nd"
ordinal :: Int -> Text
ordinal n = case n of
    0  -> "zeroth"
    1  -> "first"
    2  -> "second"
    3  -> "third"
    4  -> "fourth"
    5  -> "fifth"
    6  -> "sixth"
    7  -> "seventh"
    8  -> "eighth"
    9  -> "ninth"
    10 -> "tenth"
    k  -> ordinalNotSpelled k

-- ---------------------------------------------------------------------
-- ** Nouns and verbs
-- ---------------------------------------------------------------------

-- | Heuristics for English plural for an unknown noun.
--
-- > defaultNounPlural "egg"    == "eggs"
-- > defaultNounPlural "patch"  == "patches"
-- > defaultNounPlural "boy"    == "boys"
-- > defaultNounPlural "spy"    == "spies"
-- > defaultNounPlural "thesis" == "theses"
--
-- http://www.paulnoll.com/Books/Clear-English/English-plurals-1.html
--
-- http://en.wikipedia.org/wiki/English_plural
defaultNounPlural :: Text -> Text
defaultNounPlural x
    | "is" `T.isSuffixOf` x = thesis
    | hasSibilantSuffix x   = sibilant_o
    | hasCoSuffix x         = sibilant_o
    | hasCySuffix x         = y_final
    | "ff" `T.isSuffixOf` x = ff_final  -- quite often not the case
    | "f" `T.isSuffixOf` x  = f_final   -- but this one as well, so both needed
    | otherwise             = plain
  where
    plain      = x            <> "s"
    sibilant_o = x            <> "es"
    y_final    = T.init x     <> "ies"
    f_final    = T.init x     <> "ves"
    ff_final   = T.dropEnd 2 x <> "ves"
    thesis     = T.dropEnd 2 x <> "es"

-- | Heuristics for 3rd person singular and past participle
--   for an unknown regular verb. Doubling of final consonants
--   can be handled via a table of (partially) irregular verbs.
--
-- > defaultVerbStuff "walk"  == ("walks",  "walked")
-- > defaultVerbStuff "push"  == ("pushes", "pushed")
-- > defaultVerbStuff "play"  == ("plays",  "played")
-- > defaultVerbStuff "cry"   == ("cries",  "cried")
defaultVerbStuff :: Text -> (Text, Text)
defaultVerbStuff x
    | hasSibilantSuffix x   = sibilant_o
    | hasCoSuffix x         = sibilant_o
    | hasCySuffix x         = y_final
    | "e" `T.isSuffixOf` x  = e_final
    | otherwise             = plain
  where
    plain      = (x <> "s"         , x <> "ed")
    sibilant_o = (x <> "es"        , x <> "ed")
    e_final    = (x <> "s"         , x <> "d")
    y_final    = (T.init x <> "ies", T.init x <> "ied")

-- | Heuristics for a possesive form for an unknown noun.
--
-- > defaultPossesive "pass"        == "pass'"
-- > defaultPossesive "SOS"         == "SOS'"
-- > defaultPossesive "Mr Blinkin'" == "Mr Blinkin's"
-- > defaultPossesive "cry"         == "cry's"
defaultPossesive :: Text -> Text
defaultPossesive t =
  case T.last t of
    's'  -> t <> "'"
    'S'  -> t <> "'"
    '\'' -> t <> "s"
    _    -> t <> "'s"

-- ---------------------------------------------------------------------
-- ** Determiners
-- ---------------------------------------------------------------------

anNumerals :: [Text]
anNumerals = [ "11", "11th", "18", "18th" ]

-- | > indefiniteDet "dog"  == "a"
--   > indefiniteDet "egg"  == "an"
--   > indefiniteDet "ewe"  == "a"
--   > indefiniteDet "ewok" == "an"
--   > indefiniteDet "8th"  == "an"
indefiniteDet :: Text -> Text
indefiniteDet t = if wantsAn t then "an" else "a"

-- | True if the indefinite determiner for a word would normally be
--   \'an\' as opposed to \'a\'.
wantsAn :: Text -> Bool
wantsAn t_ =
    if startsWithAcronym t_
       then acronymWantsAn t_
       else useAn0 || useAn1
  where
    t      = fst $ T.break isSep $ T.toLower t_
    useAn0 = t `elem` anNumerals
    useAn1 = case T.uncons t of
                Just (h, "") -> isLetterWithInitialVowelSound h
                Just ('8',_) -> True
                Just ('u',_) -> hasVowel_U_Prefix t
                Just (h, _)  -> isVowel h `butNot` hasSemivowelPrefix t
                Nothing      -> False
    x `butNot` y = x && not y
    isSep c = isSpace c || c `elem` ("-" :: String)

-- | Variant of 'wantsAn' that assumes the input string is pronounced
--   one letter at a time.
--
--   > wantsAn        "x-ray" == False
--   > acronymWantsAn "x-ray" == True
--
--   Note that this won't do the right thing for words like \"SCUBA\".
--   You really have to reserve it for those separate-letter acronyms.
acronymWantsAn :: Text -> Bool
acronymWantsAn (T.toLower -> t) =
    useAn0 || useAn1
  where
    useAn0 = t `elem` anNumerals
    useAn1 = case T.uncons t of
                Just ('8',_) -> True
                Just (h,_)   -> isLetterWithInitialVowelSound h
                Nothing      -> False

-- ---------------------------------------------------------------------
-- ** Acronyms
-- ---------------------------------------------------------------------

-- | True if all upper case from second letter and up.
--
--   > looksLikeAcronym "DNA"  == True
--   > looksLikeAcronym "tRNA" == True
--   > looksLikeAcronym "x"    == False
--   > looksLikeAcronym "DnA"  == False
looksLikeAcronym :: Text -> Bool
looksLikeAcronym "" = False
looksLikeAcronym x = T.all isUpper (if T.length x > 1 then T.drop 1 x else x)

-- | True if the first word (separating on either hyphen or space)
--   looks like an acronym.
startsWithAcronym :: Text -> Bool
startsWithAcronym =
    looksLikeAcronym . firstWord
  where
    firstWord = fst . T.break isSep
    isSep c   = isSpace c || c `elem` ("-" :: String)

-- ---------------------------------------------------------------------
-- ** Sounds
-- ---------------------------------------------------------------------

-- | Ends with a \'sh\' sound.
hasSibilantSuffix :: Text -> Bool
hasSibilantSuffix x = any (`T.isSuffixOf` x) ["x","s","ch","sh","z","j"]

-- | Starts with a semivowel.
hasSemivowelPrefix :: Text -> Bool
hasSemivowelPrefix ls = any (`T.isPrefixOf` ls) ["y","w","eu","ewe"]

-- | Starts with a vowel-y \'U\' sound
hasVowel_U_Prefix :: Text -> Bool
hasVowel_U_Prefix t =
    case T.unpack t of
        ['u']       -> False
        ['u',_]     -> True
        ('u':c:v:_) -> not (isConsonant c && isVowel v)
        _           -> False

-- | Last two letters are a consonant and \'y\'.
hasCySuffix :: Text -> Bool
hasCySuffix (T.unpack . T.takeEnd 2 -> [x, 'y']) = isConsonant x
hasCySuffix _ = False

-- | Last two letters are a consonant and \'o\'.
hasCoSuffix :: Text -> Bool
hasCoSuffix (T.unpack . T.takeEnd 2 -> [x, 'o']) = isConsonant x
hasCoSuffix _ = False

-- | Is a vowel.
isVowel :: Char -> Bool
isVowel = (`elem` ("aeiou" :: String)) . toLower

-- | Letters that when pronounced independently in English sound like they
--   begin with vowels.
--
--   > isLetterWithInitialVowelSound 'r' == True
--   > isLetterWithInitialVowelSound 'k' == False
--
--   (In the above, \'r\' is pronounced \"are\", but \'k\' is pronounced
--   \"kay\".)
isLetterWithInitialVowelSound :: Char -> Bool
isLetterWithInitialVowelSound = (`elem` ("aeiofhlmnrsx" :: String)) . toLower

-- | Is a consonant.
--
--   Note that not every `Char` is either a vowel or a consonant.
--   We consider numbers, spaces and symbols to be neither vowel or consonants
isConsonant :: Char -> Bool
isConsonant = (`elem` ("bcdfghjklmnpqrstvwxyz" :: String)) . toLower