1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222
|
<title>The Haskell 1.4 Library Report: Character Utilities</title>
<body bgcolor="#ffffff"> <i>The Haskell 1.4 Library Report</i><br> <a href="index.html">top</a> | <a href="maybe.html">back</a> | <a href="monad.html">next</a> | <a href="libindex.html">contents</a> <br><hr>
<a name="sect9"></a>
<h2>9<tt> </tt>Character Utilities</h2><p>
<table border=2 cellpadding=3>
<tr><td>
<tt><br>
module Char ( <br>
isAscii, isLatin1, isControl, isPrint, isSpace, isUpper, isLower, <br>
isAlpha, isDigit, isOctDigit, isHexDigit, isAlphanum, <br>
digitToInt, intToDigit,<br>
toUpper, toLower,<br>
ord, chr,<br>
readLitChar, showLitChar, lexLitChar<br>
) where<br>
<br>
isAscii, isLatin1, isControl, isPrint, isSpace, isUpper, isLower, <br>
isAlpha, isDigit, isOctDigit, isHexDigit, isAlphanum :: Char -> Bool<br>
<br>
toUpper, toLower :: Char -> Char<br>
<br>
digitToInt :: Char -> Int<br>
intToDigit :: Int -> Char<br>
<br>
ord :: Char -> Int<br>
chr :: Int -> Char<br>
<br>
readLitChar :: ReadS Char<br>
showLitChar :: Char -> ShowS<br>
lexLitChar :: ReadS String<br>
</tt></td></tr></table>
<p>
This library provides a limited set of operations on the Unicode
character set. The first 128 entries of this character set are
identical to the ASCII set; together with the next 128 entries they
form the Latin-1 character set. This module offers only a limited view of the
full Unicode character set; the full set of Unicode character
attributes is not accessible in this library.<p>
Unicode characters may be divided into five general categories:
non-printing, lower case alphabetic, other alphabetic, numeric digits, and
other printable characters. For the purposes of Haskell, any
alphabetic character which is not lower case is treated as upper case
(Unicode actually has three cases: upper, lower, and title). Numeric
digits may be part of identifiers but digits outside the ASCII range are not
used by the reader to represent numbers. <p>
For each sort of Unicode character, here are the predicates which
return <tt>True</tt>:<br>
<table >
<tr><td>Character Type </td><td> Predicates </td></tr><tr><td>Lower Case </td><td> <tt>isPrint</tt> </td><td> <tt>isAlphanum</tt> </td><td> <tt>isAlpha</tt> </td><td> <tt>isLower</tt> </td></tr><tr><td>Other Alphabetic </td><td> <tt>isPrint</tt> </td><td> <tt>isAlphanum</tt> </td><td> <tt>isAlpha</tt> </td><td> </td><td> <tt>isUpper</tt> </td></tr><tr><td>Digits </td><td> <tt>isPrint</tt> </td><td> <tt>isAlphanum</tt> </td></tr><tr><td>Other Printable </td><td> <tt>isPrint</tt> </td></tr><tr><td>Non-printing </td><td> </td></tr></table>
<p>
The <tt>isDigit</tt>, <tt>isOctDigit</tt>, and <tt>isHexDigit</tt> functions select only
ASCII characters; <tt>digitToInt</tt> accepts only characters satisfying
<tt>isHexDigit</tt>, and <tt>intToDigit</tt> accepts only integers in the range 0 to 15. <p>
The <tt>isSpace</tt> function recognizes only white space characters in the Latin-1
range.<p>
The <tt>readLitChar</tt> and <tt>showLitChar</tt> functions leave characters
outside the Latin-1 range unchanged. <p>
Function <tt>toUpper</tt> converts a letter to the corresponding
upper-case letter, leaving any other character unchanged. Any
Unicode letter which has an upper-case equivalent is transformed.
Similarly, <tt>toLower</tt> converts a letter to the
corresponding lower-case letter, leaving any other character
unchanged.<p>
The <tt>ord</tt> and <tt>chr</tt> functions are <tt>fromEnum</tt> and <tt>toEnum
</tt>restricted to the type <tt>Char</tt>.<a name="Char"></a><p>
<a name="sect9.1"></a>
<h3>9.1<tt> </tt>Library <tt>Char</tt></h3>
<tt><br>
module Char ( <br>
isAscii, isLatin1, isControl, isPrint, isSpace, isUpper, isLower,<br>
isAlpha, isDigit, isOctDigit, isHexDigit, isAlphanum,<br>
digitToInt, intToDigit,<br>
toUpper, toLower,<br>
ord, chr,<br>
readLitChar, showLitChar, lexLitChar<br>
) where<br>
<br>
import Array -- used for character name table.<br>
<br>
import UnicodePrims -- source of primitive Unicode functions.<br>
<br>
-- Character-testing operations<br>
-- Every predicate of the module shares the type Char -> Bool; isLatin1 is<br>
-- listed here as well because it is exported and defined alongside the others.<br>
isAscii, isLatin1, isControl, isPrint, isSpace, isUpper, isLower,<br>
  isAlpha, isDigit, isOctDigit, isHexDigit, isAlphanum :: Char -> Bool<br>
<br>
-- True exactly for the characters below '\x80'.<br>
isAscii = (< '\x80')<br>
<br>
-- True exactly for the characters up to and including '\xff'.<br>
isLatin1 = (<= '\xff')<br>
<br>
-- Controls are the characters below space together with the range from<br>
-- '\DEL' to '\x9f'; no character beyond that range is a control.<br>
isControl c = c < ' ' || ('\DEL' <= c && c <= '\x9f')<br>
<br>
-- Printability is decided entirely by the Unicode primitive.<br>
isPrint c = primUnicodeIsPrint c<br>
<br>
-- White space is limited to the seven Latin-1 characters listed here.<br>
isSpace = (`elem` " \t\n\r\f\v\xA0")<br>
<br>
-- Upper-case test, answered by the Unicode primitive.<br>
isUpper c = primUnicodeIsUpper c<br>
<br>
-- Lower-case test, answered by the Unicode primitive. The definition takes no<br>
-- explicit argument: binding one without passing it on to the primitive would<br>
-- not have the declared type Char -> Bool.<br>
isLower = primUnicodeIsLower<br>
<br>
-- A character is alphabetic when it is upper case or lower case.<br>
isAlpha c<br>
  | isUpper c = True<br>
  | otherwise = isLower c<br>
<br>
-- Decimal digits: the characters from '0' to '9' only.<br>
isDigit c = '0' <= c && c <= '9'<br>
<br>
-- Octal digits: the characters from '0' to '7' only.<br>
isOctDigit c = '0' <= c && c <= '7'<br>
<br>
-- Hexadecimal digits: the decimal digits plus the letters A to F in either case.<br>
isHexDigit c = isDigit c || between 'A' 'F' || between 'a' 'f'<br>
  where between lo hi = lo <= c && c <= hi<br>
<br>
-- Alphanumeric test, answered by the Unicode primitive. The definition takes<br>
-- no explicit argument: binding one without passing it on to the primitive<br>
-- would not have the declared type Char -> Bool.<br>
-- NOTE(review): the primitive is spelled "Alphnum" here; confirm the name against UnicodePrims.<br>
isAlphanum = primUnicodeIsAlphnum<br>
<br>
<br>
-- Digit conversion operations<br>
-- digitToInt maps a hexadecimal digit of either case to its value from 0 to 15<br>
-- and calls error for every other character.<br>
digitToInt :: Char -> Int<br>
digitToInt c<br>
  | isDigit c = offsetFrom '0'<br>
  | 'a' <= c && c <= 'f' = offsetFrom 'a' + 10<br>
  | 'A' <= c && c <= 'F' = offsetFrom 'A' + 10<br>
  | otherwise = error "Char.digitToInt: not a digit"<br>
  where offsetFrom base = fromEnum c - fromEnum base<br>
<br>
-- intToDigit maps a value from 0 to 15 to its digit, using lower-case letters<br>
-- for 10 to 15, and calls error for every other value.<br>
intToDigit :: Int -> Char<br>
intToDigit i<br>
  | 0 <= i && i <= 9 = digitAt '0' i<br>
  | 10 <= i && i <= 15 = digitAt 'a' (i - 10)<br>
  | otherwise = error "Char.intToDigit: not a digit"<br>
  where digitAt base k = toEnum (fromEnum base + k)<br>
<br>
<br>
-- Case-changing operations<br>
-- Both conversions are carried out by the Unicode primitives.<br>
toUpper :: Char -> Char<br>
toUpper c = primUnicodeToUpper c<br>
<br>
toLower :: Char -> Char<br>
toLower c = primUnicodeToLower c<br>
<br>
-- Character code functions<br>
-- ord and chr are fromEnum and toEnum used at the type Char.<br>
ord :: Char -> Int<br>
ord c = fromEnum c<br>
<br>
chr :: Int -> Char<br>
chr n = toEnum n<br>
<br>
-- Text functions<br>
-- readLitChar reads one character from the front of a string, decoding a<br>
-- backslash escape when there is one. It yields [] when the escape is not<br>
-- recognised and also when the input is empty, so it never fails on "".<br>
readLitChar :: ReadS Char<br>
readLitChar ('\\':s) = readEsc s<br>
  where<br>
    -- Single-character escapes<br>
    readEsc ('a':s) = [('\a',s)]<br>
    readEsc ('b':s) = [('\b',s)]<br>
    readEsc ('f':s) = [('\f',s)]<br>
    readEsc ('n':s) = [('\n',s)]<br>
    readEsc ('r':s) = [('\r',s)]<br>
    readEsc ('t':s) = [('\t',s)]<br>
    readEsc ('v':s) = [('\v',s)]<br>
    readEsc ('\\':s) = [('\\',s)]<br>
    readEsc ('"':s) = [('"',s)]<br>
    readEsc ('\'':s) = [('\'',s)]<br>
    -- Caret escapes, for the characters from '@' to '_'<br>
    readEsc ('^':c:s) | c >= '@' && c <= '_'<br>
      = [(chr (ord c - ord '@'), s)]<br>
    -- Numeric escapes: decimal, then octal and hexadecimal<br>
    readEsc s@(d:_) | isDigit d<br>
      = [(chr n, t) | (n,t) <- readDec s]<br>
    readEsc ('o':s) = [(chr n, t) | (n,t) <- readOct s]<br>
    readEsc ('x':s) = [(chr n, t) | (n,t) <- readHex s]<br>
    -- Named escapes: DEL and every asciiTab mnemonic are tried in order and<br>
    -- the first entry for which match returns an empty first component wins<br>
    -- (presumably meaning the whole mnemonic was found; match is defined elsewhere).<br>
    readEsc s@(c:_) | isUpper c<br>
      = let table = ('\DEL' := "DEL") : assocs asciiTab<br>
        in case [(c,s') | (c := mne) <- table,<br>
                          ([],s') <- [match mne s]]<br>
           of (pr:_) -> [pr]<br>
              [] -> []<br>
    readEsc _ = []<br>
readLitChar (c:s) = [(c,s)]<br>
readLitChar "" = []<br>
<br>
-- showLitChar renders a character as it is written inside a literal.<br>
-- Characters from space up to '~' are shown as themselves, except for the<br>
-- backslash, which is doubled; everything else becomes an escape sequence.<br>
showLitChar :: Char -> ShowS<br>
-- Above DEL: decimal escape; protectEsc inserts "\&" when a digit follows<br>
showLitChar c | c > '\DEL' = showChar '\\' . <br>
    protectEsc isDigit (shows (ord c))<br>
showLitChar '\DEL' = showString "\\DEL"<br>
showLitChar '\\' = showString "\\\\"<br>
showLitChar c | c >= ' ' = showChar c<br>
showLitChar '\a' = showString "\\a"<br>
showLitChar '\b' = showString "\\b"<br>
showLitChar '\f' = showString "\\f"<br>
showLitChar '\n' = showString "\\n"<br>
showLitChar '\r' = showString "\\r"<br>
showLitChar '\t' = showString "\\t"<br>
showLitChar '\v' = showString "\\v"<br>
-- "\SO" followed by 'H' would otherwise read back as the SOH mnemonic<br>
showLitChar '\SO' = protectEsc (== 'H') (showString "\\SO")<br>
-- Remaining characters below space use their asciiTab mnemonic<br>
showLitChar c = showString ('\\' : asciiTab!c)<br>
<br>
-- protectEsc p f behaves like f, except that "\&" is placed between the text<br>
-- f produces and the rest of the output whenever that rest starts with a<br>
-- character satisfying p.<br>
protectEsc p f = f . guardNext<br>
  where<br>
    guardNext rest = case rest of<br>
      (c:_) | p c -> "\\&" ++ rest<br>
      _ -> rest<br>
-- Mnemonics for the characters from '\NUL' to ' ', indexed by the character itself.<br>
asciiTab :: Array Char String<br>
asciiTab = listArray ('\NUL', ' ')<br>
  [ "NUL", "SOH", "STX", "ETX"<br>
  , "EOT", "ENQ", "ACK", "BEL"<br>
  , "BS", "HT", "LF", "VT"<br>
  , "FF", "CR", "SO", "SI"<br>
  , "DLE", "DC1", "DC2", "DC3"<br>
  , "DC4", "NAK", "SYN", "ETB"<br>
  , "CAN", "EM", "SUB", "ESC"<br>
  , "FS", "GS", "RS", "US"<br>
  , "SP"<br>
  ]<br>
<br>
-- lexLitChar splits off the text of one character, escaped or not, without<br>
-- decoding it; for an escape the result keeps the leading backslash.<br>
-- It yields [] for an unrecognised escape and for empty input.<br>
lexLitChar :: ReadS String<br>
lexLitChar ('\\':s) = [('\\':esc, t) | (esc,t) <- lexEsc s]<br>
  where<br>
    lexEsc (c:s) | c `elem` "abfnrtv\\\"'" = [([c],s)]<br>
    lexEsc s@(d:_) | isDigit d = lexDigits s<br>
    -- Octal and hexadecimal escapes, which readLitChar accepts too;<br>
    -- at least one digit has to follow the prefix letter.<br>
    lexEsc ('o':s@(d:_)) | isOctDigit d = [('o':ds, t) | (ds,t) <- [span isOctDigit s]]<br>
    lexEsc ('x':s@(d:_)) | isHexDigit d = [('x':ds, t) | (ds,t) <- [span isHexDigit s]]<br>
    lexEsc ('^':c:s) | c >= '@' && c <= '_' = [(['^',c],s)]<br>
    -- Very crude approximation to \XYZ. Let readers work this out.<br>
    lexEsc s@(c:_) | isUpper c = [span isCharName s]<br>
    lexEsc _ = []<br>
    isCharName c = isUpper c || isDigit c<br>
<br>
lexLitChar (c:s) = [([c],s)]<br>
lexLitChar "" = []<br>
<br>
<p>
<hr><i>The Haskell 1.4 Library Report</i><br><a href="index.html">top</a> | <a href="maybe.html">back</a> | <a href="monad.html">next</a> | <a href="libindex.html">contents</a> <br><font size=2>April 4, 1997</font>
<p>
</tt>
|