File: Regex.k

package info (click to toggle)
kaya 0.4.2-4
  • links: PTS
  • area: main
  • in suites: lenny
  • size: 4,448 kB
  • ctags: 1,694
  • sloc: cpp: 9,536; haskell: 7,461; sh: 3,013; yacc: 910; makefile: 816; perl: 90
file content (331 lines) | stat: -rw-r--r-- 15,369 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
/** -*-C-*-ish
    Kaya standard library
    Copyright (C) 2004, 2005 Edwin Brady

    This file is distributed under the terms of the GNU Lesser General
    Public Licence. See COPYING for licence.
*/

"<summary>Regular expressions</summary>
<prose>This module provides functions to interface to libpcre (perl-compatible regular expressions) for constructing and matching with regular expressions.</prose>
<prose>Regular expression support is provided by the PCRE library package,
     which is open source software, written by Philip Hazel, and copyright
     by the University of Cambridge, England.</prose>
<prose>The <code>pcrepattern</code> or <code>perlre</code> manual pages explain the regular expression syntax in detail.</prose>"

module Regex;

import Builtins;
import Prelude;

%include "regex_glue.h";
%imported "regex_glue";
%link "pcre";

"<summary>The regular expression could not be compiled.</summary>
<prose>This Exception is thrown by <functionref>compile</functionref> if the regular expression cannot be compiled. It will also be thrown by functions such as <functionref>grep</functionref> that call <code>compile</code>.</prose>"
Exception RegexCompilationFailed;
"<summary>UTF-8 support unavailable</summary>
<prose>This Exception is thrown if the string being matched contains non-ASCII characters, but the PCRE library is compiled without UTF-8 support. For web applications, this exception is very easy for a user to trigger, so you should always use a library with UTF-8 support for those.</prose>"
Exception NoUTF8Support;

"<summary>Compiled regular expression</summary>
<prose>This data type represents a compiled regular expression. It is significantly more efficient to use <functionref>compile</functionref> to compile a regular expression once, and then to pass it to functions as needed.</prose>"
abstract data Regex = Regex(Ptr pcre);

"<summary>Match object.</summary>
<prose>A match operation returns <code>matches</code> if a match is
found. The <code>matches</code> field is an array of sub matches,
where <code>matches[0]</code> is the entire matched
string. <code>before</code> and <code>after</code> are the strings
before and after the matched string.</prose>
<related><functionref>match</functionref></related>"
public data Match = matches([String] matches, String before, String after)
                  | noMatch;

		  /* Documentation for Extended option taken from PCRE
		     manual page written by Philip Hazel, copyright
		     1997-2003 University of Cambridge. */
"<summary>Flags for regular expression compilation</summary>
<prose>Flags for regular expression compilation</prose>
<list>
<item><code>IgnoreCase</code> makes the expression case-insensitive (so \"^a\" would match \"All\" and \"all\"</item>
<item><code>Extended</code> when set causes whitespace in the pattern (other than in a character class) and characters between a '#' outside a character class and the next newline character to be ignored. An escaping backslash can be used to include a whitespace or '#' character as part of the pattern.</item>
<item><code>Multiline</code> changes the meaning of the start '^' and end '$' characters so that as well as matching the start and end of the string, they will also match immediately after and immediately before a newline '\\n' character.</item>
<item><code>Ungreedy</code> causes the expression and sub-expressions to find the smallest possible match, rather than the largest possible, unless a '?' is placed after the expression. For example, the pattern \"a.*a\" would match \"aca\" if ungreedy, and \"acada\" if greedy in the string \"bacadaf\".</item>
</list>
<related><functionref>compile</functionref></related>"
public data REFlags = IgnoreCase | Extended | Multiline | Ungreedy;
// _CASELESS, _EXTENDED, _MULTILINE, _UNGREEDY

"<summary>Replacement options</summary>
<prose>Options to use when replacing strings using regular expressions. Currently only one flag exists, <code>Global</code>, which causes the matched string to be replaced everywhere it appears rather than just the first occurence.</prose>"
public data ReplaceFlags = Global;

foreign "regex_glue.o" {
    Ptr re_compile(Ptr vm, String pattern, Int len, Int flagcode) = re_compile;
    Ptr re_match(Ptr vm, Ptr pcre, String str, Int len) = re_match;
    Bool re_quickmatch(Ptr vm, Ptr pcre, String str, Int len) = re_quickmatch;
    Bool matched(Ptr mo) = matched;
    [String] getmatches(Ptr mo) = getmatches;
    String getBefore(Ptr mo) = getBefore;
    String getAfter(Ptr mo) = getAfter;
}

r"<argument name='pattern'>The regular expression pattern to compile</argument>
<argument name='fl'>A list of <dataref>REFlags</dataref> options for the regular expression. This parameter is optional and defaults to the empty list.</argument>
<summary>Compile a regular expression</summary>
<prose>Compile a regular expression and return the compiled <dataref>Regex</dataref>. This can then be re-used in several function calls for greater efficiency.</prose>
<prose>Note that if a literal backslash needs to be matched, it must be written as \"\\\\\" as both Kaya and PCRE will interpret one layer of backslashes. You can avoid this from Kaya 0.2.6 onwards by using a raw string instead.</prose>
<example>// these two are equivalent
test1 = quickMatch(\"\\\\\",string);
test2 = quickMatch(R\"\\\",string);</example>
<related><functionref>match</functionref></related>
<related><functionref>quickMatch</functionref></related>
<related><functionref>replace</functionref></related>
<related><functionref>split</functionref></related>"
public Regex compile(String pattern, [REFlags] fl = createArray(1))
{
    flagcode = 0;
    for flag in fl {
      case flag of {
	IgnoreCase -> flagcode = flagcode | 1;
	| Extended -> flagcode = flagcode | 2;
	| Multiline -> flagcode = flagcode | 4;
	| Ungreedy -> flagcode = flagcode | 8;
      }
    }

    try {
	Ptr re = re_compile(getVM,pattern,length(pattern),flagcode);
	return Regex(re);
    } catch(InternalError(e)) {
	if (e==0) {
	    throw(RegexCompilationFailed);
	} else if (e==1) {
  	    throw(NoUTF8Support);
	} else {
	    throw(InternalError(e));
	}
    }
}

"<argument name='x'>A compiled regular expression</argument>
<argument name='str'>The String to match</argument>
<summary>Attempt to match a regular expression.</summary>
<prose>Attempt to match a compiled regular expression to a string, returning a <dataref>Match</dataref> object.</prose>
<example>r = compile(\"s$\");
s1 = \"sausages\";
m1 = match(r,s1); // matches([\"s\"],\"sausage\",\"\")
s2 = \"a sausage\";
m2 = match(r,s2); // noMatch</example>
<related><functionref>compile</functionref></related>
<related><functionref>quickMatch</functionref></related>
<related><functionref index='1'>quickMatch</functionref></related>"
public Match match(Regex x, String str)
{
    mo = re_match(getVM, x.pcre, str, length(str));
    try {
	if (matched(mo)) {
	    return matches(getmatches(mo),
			   getBefore(mo),
			   getAfter(mo));
	}
	else {
	    return noMatch;
	}
    } catch(InternalError(e)) {
	if (e==1) {
	    throw(NoUTF8Support);
	}
    }
    return noMatch;
}

"<argument name='patt'>A regular expression in String form</argument>
<argument name='str'>The string to be matched</argument>
<argument name='fl'>A list of <dataref>REFlags</dataref> options for the regular expression. This parameter is optional and defaults to the empty list.</argument>
<summary>Test whether a string matches the pattern</summary>
<prose>Test whether <code>str</code> matches <code>patt</code>, returning true if it does and false otherwise.</prose>
<example>if (!quickMatch(\".@.*\\.[a-z]+$\",email,[IgnoreCase])) {
    throw(NotAnEmailAddress);
}</example>
<related><functionref>match</functionref></related>
<related><functionref>quickMatch</functionref></related>"
public Bool quickMatch(String patt, String str, [REFlags] fl = createArray(1))
{
    return quickMatch(compile(patt,fl),str);
}


"<argument name='re'>A compiled regular expression</argument>
<argument name='str'>The String to match</argument>
<summary>Test a String with a compiled regular expression.</summary>
<prose>This function is identical to <functionref>quickMatch</functionref> except that it uses a compiled regular expression.</prose>
<related><dataref>Regex</dataref></related>
<related><functionref>compile</functionref></related>
<related><functionref>match</functionref></related>
<related><functionref index='1'>quickMatch</functionref></related>"
public Bool quickMatch(Regex re, String str)
{
  return re_quickmatch(getVM,re.pcre,str,length(str));
}

"Replace all instances.
Replaces all instances of key with new in the String str"
Void replaceAll(Regex keyre, String new, var String str, [ReplaceFlags] fl = createArray(1))
{
    newstr = "";
    repeat case match(keyre,str) of {
	    matches(xs,before,str) -> newstr += before + 
		                      (substVars(xs,new));
	    | noMatch() -> newstr += str; 
	                   break; // Stop matching
    }
    str = newstr;
}

"Replace all instances.
Replaces all instances of key with new in the String str"
Void replaceAll(String key, String new, var String str, [ReplaceFlags] fl = createArray(1), [REFlags] mfl = createArray(1))
{
    keyre = compile(key,mfl);
    replaceAll(keyre,new,str,fl);
}

"<argument name='keyre'>A compiled regular expression</argument>
<argument name='new'>The string to replace matches with</argument>
<argument name='str'>The string to search for matches (will be edited in place).</argument>
<argument name='fl'>A list of <dataref>ReplaceFlags</dataref> options. This parameter may be omitted and defaults to the empty list.</argument>
<summary>Replace a compiled pattern with a string.</summary>
<prose>Replace the first match (or all matches if <code>fl</code> contains
<code>Global</code>)
of <code>key</code> with <code>new</code> in the String <code>str</code>.
<code>new</code> may contain back references into the pattern, $1, $2, etc,
which stand for sub-matches.</prose>
<related><functionref>compile</functionref></related>
<related><functionref index='1'>replace</functionref></related>"
public Void replace(Regex keyre, String new, var String str, 
		    [ReplaceFlags] fl = createArray(1))
{
    if (elem(Global,fl)) {
	replaceAll(keyre,new,str,fl);
    } 
    else
    {
	mo = match(keyre,str);
	case mo of {
	    matches(xs,before,after) -> str = before + (substVars(xs,new)) 
		                              + after;
	    | noMatch() -> ; // Do nothing
	}
    }
}

"<argument name='key'>A pattern string</argument>
<argument name='new'>The string to replace matches with</argument>
<argument name='str'>The string to search for matches (will be edited in place).</argument>
<argument name='fl'>A list of <dataref>ReplaceFlags</dataref> options. This parameter may be omitted and defaults to the empty list.</argument>
<argument name='mfl'>A list of <dataref>REFlags</dataref> options for compiling the pattern. Ths parameter may be omitted and defaults to the empty list.</argument>
<summary>Replace a compiled pattern with a string.</summary>
<prose>Replace the first match (or all matches if <code>fl</code> contains
<code>Global</code>)
of <code>key</code> with <code>new</code> in the String <code>str</code>.
<code>new</code>> may contain back references into the pattern, $1, $2, etc,
which stand for sub-matches.</prose>
<related><functionref>compile</functionref></related>
<related><functionref index='0'>replace</functionref></related>"
public Void replace(String key, String new, var String str, 
		    [ReplaceFlags] fl = createArray(1), [REFlags] mfl = createArray(1))
{
    if (elem(Global,fl)) {
	replaceAll(key,new,str,fl,mfl);
    } 
    else
    {
	keyre = compile(key,mfl);
	mo = match(keyre,str);
	case mo of {
	    matches(xs,before,after) -> str = before + (substVars(xs,new)) 
		                              + after;
	    | noMatch() -> ; // Do nothing
	}
    }
}

"Replace $1, $2, etc in new with elements of vars.
Helper function for <em>replace</em>; avoid using this, as it will be
made private when that is implemented!"
String substVars([String] vars, var String new)
{
    i=1;
    newstr=copy(new);
    fst = shift(vars);
    for x in vars {
	// FIXME: Make "\$" not do any substitution. Somehow.
	replaceAll("\\$"+String(i),x,newstr);
	i++;
    }
    return newstr;
}

"<argument name='patt'>The pattern string</argument>
<argument name='xs'>The strings to match</argument>
<argument name='inverse'>If this is set to true, return only strings <emphasis>not</emphasis> matching the pattern. This parameter is optional and defaults to false.</argument>
<summary>Search for patterns.</summary>
<prose>Searches for patterns and returns the entries in <em>xs</em> which match the given pattern.</prose>
<related><functionref>match</functionref></related>"
public [String] grep(String patt, [String] xs, Bool inverse=false) {
    out = [];
    re = compile(patt);
    for x in xs {
	if (quickMatch(re,x)!=inverse) {
	    push(out,x);
	}
    }
    return out;
}

"<argument name='patt'>The pattern string</argument>
<argument name='str'>The String to split</argument>
<argument name='limit'>The maximum number of pieces (from the left) to split <code>str</code> into. If this is parameter is omitted or set to zero an unlimited number of pieces will be allowed.</argument>
<argument name='allowempty'>If this is true, then consecutive delimeters will be treated as having an empty string between them. If it is false (or omitted) then consecutive delimeters will be treated as one.</argument>
<summary>Split a string into substrings.</summary>
<prose>Using <code>patt</code> as a delimeter, split <code>str</code> into substrings.</prose>
<example>x = \"a,b,c\";
xs = split(\",\",x); // [\"a\",\"b\",\"c\"]
xs = split(\",\",x,1); // [\"a\",\"b,c\"]</example>
<related><functionref index='0'>split</functionref></related>"
public [String] split(String patt, String str, Int limit=0, Bool allowempty = false) {
    pattre = compile(patt);
    return split(pattre,str,limit,allowempty);
}

"<argument name='pattre'>A compiled regular expression</argument>
<argument name='str'>The String to split</argument>
<argument name='limit'>The maximum number of pieces (from the left) to split <code>str</code> into. If this is parameter is omitted or set to zero an unlimited number of pieces will be allowed.</argument>
<argument name='allowempty'>If this is true, then consecutive delimeters will be treated as having an empty string between them. If it is false (or omitted) then consecutive delimeters will be treated as one.</argument>
<summary>Split a string into substrings.</summary>
<prose>Using <code>pattre</code> as a delimeter, split <code>str</code> into substrings.</prose>
<related><functionref>compile</functionref></related>
<related><functionref index='1'>split</functionref></related>"
public [String] split(Regex pattre, String str, Int limit=0, Bool allowempty = false) {
    strs = [];
    repeat case match(pattre,str) of {
      matches(xs,before,str) -> if (before!="" || allowempty) { 
	push(strs,before);
      }
      limit--; 
      if (limit == 0) { 
	if (allowempty || str != "") {
	  push(strs,str); 
	}
	break;
      } // stop matching
	| noMatch() -> 	if (allowempty || str != "") {
	  push(strs,str); 
	}
	               break; // Stop matching
    }
    return strs;
}