
|
<title>The Haskell 1.4 Library Report: Character Utilities</title>
<body bgcolor="#ffffff"> <i>The Haskell 1.4 Library Report</i><br> <a href="index.html">top</a> | <a href="maybe.html">back</a> | <a href="monad.html">next</a> | <a href="libindex.html">contents</a> <br><hr>
<a name="sect9"></a>
<h2>9<tt> </tt>Character Utilities</h2><p>
<table border=2 cellpadding=3>
<tr><td>
<tt><br>
module Char ( <br>
isAscii, isLatin1, isControl, isPrint, isSpace, isUpper, isLower, <br>
isAlpha, isDigit, isOctDigit, isHexDigit, isAlphanum, <br>
digitToInt, intToDigit,<br>
toUpper, toLower,<br>
ord, chr,<br>
readLitChar, showLitChar<br>
) where<br>
<br>
isAscii, isLatin1, isControl, isPrint, isSpace, isUpper, isLower, <br>
isAlpha, isDigit, isOctDigit, isHexDigit, isAlphanum :: Char -> Bool<br>
<br>
toUpper, toLower :: Char -> Char<br>
<br>
digitToInt :: Char -> Int<br>
intToDigit :: Int -> Char<br>
<br>
ord :: Char -> Int<br>
chr :: Int -> Char<br>
<br>
readLitChar :: ReadS Char<br>
showLitChar :: Char -> ShowS<br>
</tt></td></tr></table>
<p>
This library provides a limited set of operations on the Unicode
character set. The first 128 entries of this character set are
identical to the ASCII set; with the next 128 entries comes the
Latin-1 character set. This module offers only a limited view of the
full Unicode character set; the full set of Unicode character
attributes is not accessible in this library.<p>
Unicode characters may be divided into five general categories:
non-printing, lower case alphabetic, other alphabetic, numeric digits, and
other printable characters. For the purposes of Haskell, any
alphabetic character which is not lower case is treated as upper case
(Unicode actually has three cases: upper, lower, and title). Numeric
digits may be part of identifiers but digits outside the ASCII range are not
used by the reader to represent numbers. <p>
For each sort of Unicode character, here are the predicates which
return <tt>True</tt>:<br>
<table >
<tr><td>Character Type </td><td> Predicates </td></tr><tr><td>Lower Case </td><td> <tt>isPrint</tt> </td><td> <tt>isAlphanum</tt> </td><td> <tt>isAlpha</tt> </td><td> <tt>isLower</tt> </td></tr><tr><td>Other Alphabetic </td><td> <tt>isPrint</tt> </td><td> <tt>isAlphanum</tt> </td><td> <tt>isAlpha</tt> </td><td> </td><td> <tt>isUpper</tt> </td></tr><tr><td>Digits </td><td> <tt>isPrint</tt> </td><td> <tt>isAlphanum</tt> </td></tr><tr><td>Other Printable </td><td> <tt>isPrint</tt> </td></tr><tr><td>Non-printing </td><td> </td></tr></table>
<p>
The <tt>isDigit</tt>, <tt>isOctDigit</tt>, and <tt>isHexDigit</tt> functions select only
ASCII characters; <tt>intToDigit</tt> and <tt>digitToInt</tt> operate only for
characters satisfying these predicates. <p>
The <tt>isSpace</tt> function recognizes only white space characters in the Latin-1
range.<p>
The <tt>readLitChar</tt> and <tt>showLitChar</tt> functions leave characters
outside the Latin-1 range unchanged. <p>
Function <tt>toUpper</tt> converts a letter to the corresponding
upper-case letter, leaving any other character unchanged. Any
Unicode letter which has an upper-case equivalent is transformed.
Similarly, <tt>toLower</tt> converts a letter to the
corresponding lower-case letter, leaving any other character
unchanged.<p>
The <tt>ord</tt> and <tt>chr</tt> functions are <tt>fromEnum</tt> and <tt>toEnum
</tt>restricted to the type <tt>Char</tt>.<a name="Char"></a><p>
<a name="sect9.1"></a>
<h3>9.1<tt> </tt>Library <tt>Char</tt></h3>
<tt><br>
module Char ( <br>
isAscii, isLatin1, isControl, isPrint, isSpace, isUpper, isLower,<br>
isAlpha, isDigit, isOctDigit, isHexDigit, isAlphanum,<br>
digitToInt, intToDigit,<br>
toUpper, toLower,<br>
ord, chr,<br>
readLitChar, showLitChar, lexLitChar<br>
) where<br>
<br>
import Array -- used for character name table.<br>
<br>
import UnicodePrims -- source of primitive Unicode functions.<br>
<br>
-- Character-testing operations<br>
isAscii, isLatin1, isControl, isPrint, isSpace, isUpper, isLower,<br>
isAlpha, isDigit, isOctDigit, isHexDigit, isAlphanum :: Char -> Bool<br>
<br>
isAscii c = c < '\x80'<br>
<br>
isLatin1 c = c <= '\xff'<br>
<br>
-- Only Latin-1 Chars can be controls <br>
<br>
isControl c = c < ' ' || c >= '\DEL' && c <= '\x9f'<br>
<br>
-- This function does not<br>
<br>
isPrint = primUnicodeIsPrint<br>
<br>
-- Only Latin-1 spaces recognized<br>
<br>
isSpace c = c `elem` " \t\n\r\f\v\xA0"<br>
<br>
isUpper = primUnicodeIsUpper<br>
<br>
isLower = primUnicodeIsLower<br>
<br>
isAlpha c = isUpper c || isLower c<br>
<br>
isDigit c = c >= '0' && c <= '9'<br>
<br>
isOctDigit c = c >= '0' && c <= '7'<br>
<br>
isHexDigit c = isDigit c || c >= 'A' && c <= 'F' ||<br>
c >= 'a' && c <= 'f'<br>
<br>
isAlphanum = primUnicodeIsAlphnum<br>
<br>
<br>
-- Digit conversion operations<br>
digitToInt :: Char -> Int<br>
digitToInt c<br>
| isDigit c = fromEnum c - fromEnum '0'<br>
| c >= 'a' && c <= 'f' = fromEnum c - fromEnum 'a' + 10<br>
| c >= 'A' && c <= 'F' = fromEnum c - fromEnum 'A' + 10<br>
| otherwise = error "Char.digitToInt: not a digit"<br>
<br>
intToDigit :: Int -> Char<br>
intToDigit i<br>
| i >= 0 && i <= 9 = toEnum (fromEnum '0' + i)<br>
| i >= 10 && i <= 15 = toEnum (fromEnum 'a' + i - 10)<br>
| otherwise = error "Char.intToDigit: not a digit"<br>
<br>
<br>
-- Case-changing operations<br>
toUpper :: Char -> Char<br>
toUpper = primUnicodeToUpper<br>
<br>
toLower :: Char -> Char<br>
toLower = primUnicodeToLower<br>
<br>
-- Character code functions<br>
ord :: Char -> Int<br>
ord = fromEnum<br>
<br>
chr :: Int -> Char<br>
chr = toEnum<br>
<br>
-- Text functions<br>
readLitChar :: ReadS Char<br>
readLitChar ('\\':s) = readEsc s<br>
where<br>
readEsc ('a':s) = [('\a',s)]<br>
readEsc ('b':s) = [('\b',s)]<br>
readEsc ('f':s) = [('\f',s)]<br>
readEsc ('n':s) = [('\n',s)]<br>
readEsc ('r':s) = [('\r',s)]<br>
readEsc ('t':s) = [('\t',s)]<br>
readEsc ('v':s) = [('\v',s)]<br>
readEsc ('\\':s) = [('\\',s)]<br>
readEsc ('"':s) = [('"',s)]<br>
readEsc ('\'':s) = [('\'',s)]<br>
readEsc ('^':c:s) | c >= '@' && c <= '_'<br>
= [(chr (ord c - ord '@'), s)]<br>
readEsc s@(d:_) | isDigit d<br>
= [(chr n, t) | (n,t) <- readDec s]<br>
readEsc ('o':s) = [(chr n, t) | (n,t) <- readOct s]<br>
readEsc ('x':s) = [(chr n, t) | (n,t) <- readHex s]<br>
readEsc s@(c:_) | isUpper c<br>
= let table = ('\DEL' := "DEL") : assocs asciiTab<br>
in case [(c,s') | (c := mne) <- table,<br>
([],s') <- [match mne s]]<br>
of (pr:_) -> [pr]<br>
[] -> []<br>
readEsc _ = []<br>
readLitChar (c:s) = [(c,s)]<br>
<br>
showLitChar :: Char -> ShowS<br>
showLitChar c | c > '\DEL' = showChar '\\' . <br>
protectEsc isDigit (shows (ord c))<br>
showLitChar '\DEL' = showString "\\DEL"<br>
showLitChar '\\' = showString "\\\\"<br>
showLitChar c | c >= ' ' = showChar c<br>
showLitChar '\a' = showString "\\a"<br>
showLitChar '\b' = showString "\\b"<br>
showLitChar '\f' = showString "\\f"<br>
showLitChar '\n' = showString "\\n"<br>
showLitChar '\r' = showString "\\r"<br>
showLitChar '\t' = showString "\\t"<br>
showLitChar '\v' = showString "\\v"<br>
showLitChar '\SO' = protectEsc (== 'H') (showString "\\SO")<br>
showLitChar c = showString ('\\' : asciiTab!c)<br>
<br>
protectEsc p f = f . cont<br>
where cont s@(c:_) | p c = "\\&" ++ s<br>
cont s = s<br>
asciiTab = listArray ('\NUL', ' ')<br>
["NUL", "SOH", "STX", "ETX", "EOT", "ENQ", "ACK", "BEL",<br>
"BS", "HT", "LF", "VT", "FF", "CR", "SO", "SI", <br>
"DLE", "DC1", "DC2", "DC3", "DC4", "NAK", "SYN", "ETB",<br>
"CAN", "EM", "SUB", "ESC", "FS", "GS", "RS", "US", <br>
"SP"] <br>
<br>
lexLitChar :: ReadS String<br>
lexLitChar ('\\':s) = [('\\':esc, t) | (esc,t) <- lexEsc s]<br>
where<br>
lexEsc (c:s) | c `elem` "abfnrtv\\\"'" = [([c],s)]<br>
lexEsc s@(d:_) | isDigit d = lexDigits s<br>
lexEsc ('^':c:s) | c >= '@' && c <= '_' = [(['^',c],s)]<br>
-- Very crude approximation to \XYZ. Let readers work this out.<br>
lexEsc s@(c:_) | isUpper c = [span isCharName s]<br>
lexEsc _ = []<br>
isCharName c = isUpper c || isDigit c<br>
<br>
lexLitChar (c:s) = [([c],s)]<br>
lexLitChar "" = []<br>
<br>
<p>
<hr><i>The Haskell 1.4 Library Report</i><br><a href="index.html">top</a> | <a href="maybe.html">back</a> | <a href="monad.html">next</a> | <a href="libindex.html">contents</a> <br><font size=2>April 4, 1997</font>
<p>
</tt>
|