1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331
|
/** -*-C-*-ish
Kaya standard library
Copyright (C) 2004, 2005 Edwin Brady
This file is distributed under the terms of the GNU Lesser General
Public Licence. See COPYING for licence.
*/
"<summary>Regular expressions</summary>
<prose>This module provides functions to interface to libpcre (perl-compatible regular expressions) for constructing and matching with regular expressions.</prose>
<prose>Regular expression support is provided by the PCRE library package,
which is open source software, written by Philip Hazel, and copyright
by the University of Cambridge, England.</prose>
<prose>The <code>pcrepattern</code> or <code>perlre</code> manual pages explain the regular expression syntax in detail.</prose>"
module Regex;
import Builtins;
import Prelude;
%include "regex_glue.h";
%imported "regex_glue";
%link "pcre";
"<summary>The regular expression could not be compiled.</summary>
<prose>This Exception is thrown by <functionref>compile</functionref> if the regular expression cannot be compiled. It will also be thrown by functions such as <functionref>grep</functionref> that call <code>compile</code>.</prose>"
Exception RegexCompilationFailed;
"<summary>UTF-8 support unavailable</summary>
<prose>This Exception is thrown if the string being matched contains non-ASCII characters, but the PCRE library is compiled without UTF-8 support. For web applications, this exception is very easy for a user to trigger, so you should always use a library with UTF-8 support for those.</prose>"
Exception NoUTF8Support;
"<summary>Compiled regular expression</summary>
<prose>This data type represents a compiled regular expression. It is significantly more efficient to use <functionref>compile</functionref> to compile a regular expression once, and then to pass it to functions as needed.</prose>"
abstract data Regex = Regex(Ptr pcre);
"<summary>Match object.</summary>
<prose>A match operation returns <code>matches</code> if a match is
found. The <code>matches</code> field is an array of sub matches,
where <code>matches[0]</code> is the entire matched
string. <code>before</code> and <code>after</code> are the strings
before and after the matched string.</prose>
<related><functionref>match</functionref></related>"
public data Match = matches([String] matches, String before, String after)
| noMatch;
/* Documentation for Extended option taken from PCRE
manual page written by Philip Hazel, copyright
1997-2003 University of Cambridge. */
"<summary>Flags for regular expression compilation</summary>
<prose>Flags for regular expression compilation</prose>
<list>
<item><code>IgnoreCase</code> makes the expression case-insensitive (so \"^a\" would match \"All\" and \"all\"</item>
<item><code>Extended</code> when set causes whitespace in the pattern (other than in a character class) and characters between a '#' outside a character class and the next newline character to be ignored. An escaping backslash can be used to include a whitespace or '#' character as part of the pattern.</item>
<item><code>Multiline</code> changes the meaning of the start '^' and end '$' characters so that as well as matching the start and end of the string, they will also match immediately after and immediately before a newline '\\n' character.</item>
<item><code>Ungreedy</code> causes the expression and sub-expressions to find the smallest possible match, rather than the largest possible, unless a '?' is placed after the expression. For example, the pattern \"a.*a\" would match \"aca\" if ungreedy, and \"acada\" if greedy in the string \"bacadaf\".</item>
</list>
<related><functionref>compile</functionref></related>"
public data REFlags = IgnoreCase | Extended | Multiline | Ungreedy;
// _CASELESS, _EXTENDED, _MULTILINE, _UNGREEDY
"<summary>Replacement options</summary>
<prose>Options to use when replacing strings using regular expressions. Currently only one flag exists, <code>Global</code>, which causes the matched string to be replaced everywhere it appears rather than just the first occurence.</prose>"
public data ReplaceFlags = Global;
foreign "regex_glue.o" {
Ptr re_compile(Ptr vm, String pattern, Int len, Int flagcode) = re_compile;
Ptr re_match(Ptr vm, Ptr pcre, String str, Int len) = re_match;
Bool re_quickmatch(Ptr vm, Ptr pcre, String str, Int len) = re_quickmatch;
Bool matched(Ptr mo) = matched;
[String] getmatches(Ptr mo) = getmatches;
String getBefore(Ptr mo) = getBefore;
String getAfter(Ptr mo) = getAfter;
}
r"<argument name='pattern'>The regular expression pattern to compile</argument>
<argument name='fl'>A list of <dataref>REFlags</dataref> options for the regular expression. This parameter is optional and defaults to the empty list.</argument>
<summary>Compile a regular expression</summary>
<prose>Compile a regular expression and return the compiled <dataref>Regex</dataref>. This can then be re-used in several function calls for greater efficiency.</prose>
<prose>Note that if a literal backslash needs to be matched, it must be written as \"\\\\\" as both Kaya and PCRE will interpret one layer of backslashes. You can avoid this from Kaya 0.2.6 onwards by using a raw string instead.</prose>
<example>// these two are equivalent
test1 = quickMatch(\"\\\\\",string);
test2 = quickMatch(R\"\\\",string);</example>
<related><functionref>match</functionref></related>
<related><functionref>quickMatch</functionref></related>
<related><functionref>replace</functionref></related>
<related><functionref>split</functionref></related>"
public Regex compile(String pattern, [REFlags] fl = createArray(1))
{
flagcode = 0;
for flag in fl {
case flag of {
IgnoreCase -> flagcode = flagcode | 1;
| Extended -> flagcode = flagcode | 2;
| Multiline -> flagcode = flagcode | 4;
| Ungreedy -> flagcode = flagcode | 8;
}
}
try {
Ptr re = re_compile(getVM,pattern,length(pattern),flagcode);
return Regex(re);
} catch(InternalError(e)) {
if (e==0) {
throw(RegexCompilationFailed);
} else if (e==1) {
throw(NoUTF8Support);
} else {
throw(InternalError(e));
}
}
}
"<argument name='x'>A compiled regular expression</argument>
<argument name='str'>The String to match</argument>
<summary>Attempt to match a regular expression.</summary>
<prose>Attempt to match a compiled regular expression to a string, returning a <dataref>Match</dataref> object.</prose>
<example>r = compile(\"s$\");
s1 = \"sausages\";
m1 = match(r,s1); // matches([\"s\"],\"sausage\",\"\")
s2 = \"a sausage\";
m2 = match(r,s2); // noMatch</example>
<related><functionref>compile</functionref></related>
<related><functionref>quickMatch</functionref></related>
<related><functionref index='1'>quickMatch</functionref></related>"
public Match match(Regex x, String str)
{
mo = re_match(getVM, x.pcre, str, length(str));
try {
if (matched(mo)) {
return matches(getmatches(mo),
getBefore(mo),
getAfter(mo));
}
else {
return noMatch;
}
} catch(InternalError(e)) {
if (e==1) {
throw(NoUTF8Support);
}
}
return noMatch;
}
"<argument name='patt'>A regular expression in String form</argument>
<argument name='str'>The string to be matched</argument>
<argument name='fl'>A list of <dataref>REFlags</dataref> options for the regular expression. This parameter is optional and defaults to the empty list.</argument>
<summary>Test whether a string matches the pattern</summary>
<prose>Test whether <code>str</code> matches <code>patt</code>, returning true if it does and false otherwise.</prose>
<example>if (!quickMatch(\".@.*\\.[a-z]+$\",email,[IgnoreCase])) {
throw(NotAnEmailAddress);
}</example>
<related><functionref>match</functionref></related>
<related><functionref>quickMatch</functionref></related>"
public Bool quickMatch(String patt, String str, [REFlags] fl = createArray(1))
{
return quickMatch(compile(patt,fl),str);
}
"<argument name='re'>A compiled regular expression</argument>
<argument name='str'>The String to match</argument>
<summary>Test a String with a compiled regular expression.</summary>
<prose>This function is identical to <functionref>quickMatch</functionref> except that it uses a compiled regular expression.</prose>
<related><dataref>Regex</dataref></related>
<related><functionref>compile</functionref></related>
<related><functionref>match</functionref></related>
<related><functionref index='1'>quickMatch</functionref></related>"
public Bool quickMatch(Regex re, String str)
{
return re_quickmatch(getVM,re.pcre,str,length(str));
}
"Replace all instances.
Replaces all instances of key with new in the String str"
Void replaceAll(Regex keyre, String new, var String str, [ReplaceFlags] fl = createArray(1))
{
newstr = "";
repeat case match(keyre,str) of {
matches(xs,before,str) -> newstr += before +
(substVars(xs,new));
| noMatch() -> newstr += str;
break; // Stop matching
}
str = newstr;
}
"Replace all instances.
Replaces all instances of key with new in the String str"
Void replaceAll(String key, String new, var String str, [ReplaceFlags] fl = createArray(1), [REFlags] mfl = createArray(1))
{
keyre = compile(key,mfl);
replaceAll(keyre,new,str,fl);
}
"<argument name='keyre'>A compiled regular expression</argument>
<argument name='new'>The string to replace matches with</argument>
<argument name='str'>The string to search for matches (will be edited in place).</argument>
<argument name='fl'>A list of <dataref>ReplaceFlags</dataref> options. This parameter may be omitted and defaults to the empty list.</argument>
<summary>Replace a compiled pattern with a string.</summary>
<prose>Replace the first match (or all matches if <code>fl</code> contains
<code>Global</code>)
of <code>key</code> with <code>new</code> in the String <code>str</code>.
<code>new</code> may contain back references into the pattern, $1, $2, etc,
which stand for sub-matches.</prose>
<related><functionref>compile</functionref></related>
<related><functionref index='1'>replace</functionref></related>"
public Void replace(Regex keyre, String new, var String str,
[ReplaceFlags] fl = createArray(1))
{
if (elem(Global,fl)) {
replaceAll(keyre,new,str,fl);
}
else
{
mo = match(keyre,str);
case mo of {
matches(xs,before,after) -> str = before + (substVars(xs,new))
+ after;
| noMatch() -> ; // Do nothing
}
}
}
"<argument name='key'>A pattern string</argument>
<argument name='new'>The string to replace matches with</argument>
<argument name='str'>The string to search for matches (will be edited in place).</argument>
<argument name='fl'>A list of <dataref>ReplaceFlags</dataref> options. This parameter may be omitted and defaults to the empty list.</argument>
<argument name='mfl'>A list of <dataref>REFlags</dataref> options for compiling the pattern. Ths parameter may be omitted and defaults to the empty list.</argument>
<summary>Replace a compiled pattern with a string.</summary>
<prose>Replace the first match (or all matches if <code>fl</code> contains
<code>Global</code>)
of <code>key</code> with <code>new</code> in the String <code>str</code>.
<code>new</code>> may contain back references into the pattern, $1, $2, etc,
which stand for sub-matches.</prose>
<related><functionref>compile</functionref></related>
<related><functionref index='0'>replace</functionref></related>"
public Void replace(String key, String new, var String str,
[ReplaceFlags] fl = createArray(1), [REFlags] mfl = createArray(1))
{
if (elem(Global,fl)) {
replaceAll(key,new,str,fl,mfl);
}
else
{
keyre = compile(key,mfl);
mo = match(keyre,str);
case mo of {
matches(xs,before,after) -> str = before + (substVars(xs,new))
+ after;
| noMatch() -> ; // Do nothing
}
}
}
"Replace $1, $2, etc in new with elements of vars.
Helper function for <em>replace</em>; avoid using this, as it will be
made private when that is implemented!"
String substVars([String] vars, var String new)
{
i=1;
newstr=copy(new);
fst = shift(vars);
for x in vars {
// FIXME: Make "\$" not do any substitution. Somehow.
replaceAll("\\$"+String(i),x,newstr);
i++;
}
return newstr;
}
"<argument name='patt'>The pattern string</argument>
<argument name='xs'>The strings to match</argument>
<argument name='inverse'>If this is set to true, return only strings <emphasis>not</emphasis> matching the pattern. This parameter is optional and defaults to false.</argument>
<summary>Search for patterns.</summary>
<prose>Searches for patterns and returns the entries in <em>xs</em> which match the given pattern.</prose>
<related><functionref>match</functionref></related>"
public [String] grep(String patt, [String] xs, Bool inverse=false) {
out = [];
re = compile(patt);
for x in xs {
if (quickMatch(re,x)!=inverse) {
push(out,x);
}
}
return out;
}
"<argument name='patt'>The pattern string</argument>
<argument name='str'>The String to split</argument>
<argument name='limit'>The maximum number of pieces (from the left) to split <code>str</code> into. If this is parameter is omitted or set to zero an unlimited number of pieces will be allowed.</argument>
<argument name='allowempty'>If this is true, then consecutive delimeters will be treated as having an empty string between them. If it is false (or omitted) then consecutive delimeters will be treated as one.</argument>
<summary>Split a string into substrings.</summary>
<prose>Using <code>patt</code> as a delimeter, split <code>str</code> into substrings.</prose>
<example>x = \"a,b,c\";
xs = split(\",\",x); // [\"a\",\"b\",\"c\"]
xs = split(\",\",x,1); // [\"a\",\"b,c\"]</example>
<related><functionref index='0'>split</functionref></related>"
public [String] split(String patt, String str, Int limit=0, Bool allowempty = false) {
pattre = compile(patt);
return split(pattre,str,limit,allowempty);
}
"<argument name='pattre'>A compiled regular expression</argument>
<argument name='str'>The String to split</argument>
<argument name='limit'>The maximum number of pieces (from the left) to split <code>str</code> into. If this is parameter is omitted or set to zero an unlimited number of pieces will be allowed.</argument>
<argument name='allowempty'>If this is true, then consecutive delimeters will be treated as having an empty string between them. If it is false (or omitted) then consecutive delimeters will be treated as one.</argument>
<summary>Split a string into substrings.</summary>
<prose>Using <code>pattre</code> as a delimeter, split <code>str</code> into substrings.</prose>
<related><functionref>compile</functionref></related>
<related><functionref index='1'>split</functionref></related>"
public [String] split(Regex pattre, String str, Int limit=0, Bool allowempty = false) {
strs = [];
repeat case match(pattre,str) of {
matches(xs,before,str) -> if (before!="" || allowempty) {
push(strs,before);
}
limit--;
if (limit == 0) {
if (allowempty || str != "") {
push(strs,str);
}
break;
} // stop matching
| noMatch() -> if (allowempty || str != "") {
push(strs,str);
}
break; // Stop matching
}
return strs;
}
|