1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322
|
;; Module for regular expression searching/matching in CLISP
;; Bruno Haible 14.4.1995, 18.4.1995
;; Sam Steingold 1999-10-28
(defpackage "REGEXP"
(:documentation
"POSIX Regular Expressions - matching, compiling, executing.")
(:use "LISP" "FFI")
(:export "MATCH" "MATCH-START" "MATCH-END" "MATCH-STRING" "REGEXP-QUOTE"
"REGEXP-COMPILE" "REGEXP-EXEC" "REGEXP-SPLIT" "WITH-LOOP-SPLIT"))
(in-package "REGEXP")
; Common OS definitions:
(def-c-type size_t uint)
#|
; Intermediate types not actually exported by regex.h:
(def-c-type reg_syntax_t uint)
(def-c-struct re_pattern_buffer
(buffer c-pointer)
(allocated ulong)
(used ulong)
(syntax reg_syntax_t)
(fastmap c-pointer)
(translate c-pointer)
(re_nsub size_t)
(flags uint8)
)
(def-c-type %regex_t re_pattern_buffer)
(eval-when (load compile eval) (defconstant sizeof-%regex_t (sizeof '%regex_t)))
(def-c-type regex_t (c-array uchar #.sizeof-%regex_t))
|#
(def-c-type regex_t-ptr c-pointer)
; Types exported by regex.h:
(def-c-type regoff_t int)
(def-c-struct regmatch_t
(rm_so regoff_t)
(rm_eo regoff_t)
)
;; Functions exported by regex.h:
#| ;; This documentation comes from regex.h and regex.c.
extern int regcomp (regex_t *preg, const char *pattern, int cflags);
regcomp takes a regular expression as a string and compiles it.
PREG is a regex_t *. We do not expect any fields to be initialized,
since POSIX says we shouldn't. Thus, we set
`buffer' to the compiled pattern;
`used' to the length of the compiled pattern;
`syntax' to RE_SYNTAX_POSIX_EXTENDED if the
REG_EXTENDED bit in CFLAGS is set; otherwise, to
RE_SYNTAX_POSIX_BASIC;
`newline_anchor' to REG_NEWLINE being set in CFLAGS;
`fastmap' and `fastmap_accurate' to zero;
`re_nsub' to the number of subexpressions in PATTERN.
PATTERN is the address of the pattern string.
CFLAGS is a series of bits which affect compilation.
If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
use POSIX basic syntax.
If REG_NEWLINE is set, then . and [^...] don't match newline.
Also, regexec will try a match beginning after every newline.
If REG_ICASE is set, then we considers upper- and lowercase
versions of letters to be equivalent when matching.
If REG_NOSUB is set, then when PREG is passed to regexec, that
routine will report only success or failure, and nothing about the
registers.
It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
the return codes and their meanings.)
extern int regexec (const regex_t *preg, const char *string, size_t nmatch,
regmatch_t pmatch[], int eflags);
regexec searches for a given pattern, specified by PREG, in the
string STRING.
If NMATCH is zero or REG_NOSUB was set in the cflags argument to
`regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
least NMATCH elements, and we set them to the offsets of the
corresponding matched substrings.
EFLAGS specifies `execution flags' which affect matching: if
REG_NOTBOL is set, then ^ does not match at the beginning of the
string; if REG_NOTEOL is set, then $ does not match at the end.
We return 0 if we find a match and REG_NOMATCH if not.
extern size_t regerror (int errcode, const regex_t *preg,
char *errbuf, size_t errbuf_size);
Returns a message corresponding to an error code, ERRCODE, returned
from either regcomp or regexec. We don't use PREG here.
extern void regfree (regex_t *preg);
Free dynamically allocated space used by PREG.
(def-c-call-out regcomp (:arguments (preg (c-ptr regex_t) :out)
(pattern c-string)
(cflags int)
)
(:return-type int)
)
(def-c-call-out regexec (:arguments (preg (c-ptr regex_t))
(string c-string)
(nmatch size_t)
(pmatch (c-ptr (c-array regmatch_t 0)))
(eflags int)
)
(:return-type int)
)
(def-c-call-out regerror (:arguments (errcode int)
(preg (c-ptr regex_t))
(errbuf (c-ptr character))
(errbuf_size size_t)
)
(:return-type size_t)
)
(def-c-call-out regfree (:arguments (preg (c-ptr regex_t)))
(:return-type nil)
)
|#
;; This interface is not exactly adapted to our needs. We introduce
;; slightly modified functions.
#|
extern int mregcomp (regex_t **ppreg, const char *pattern, int cflags);
extern int regexec (const regex_t *preg, const char *string, size_t nmatch,
regmatch_t pmatch[], int eflags);
extern const char *mregerror (int errcode, const regex_t *preg);,
extern void mregfree (regex_t *preg);
|#
(eval-when (compile load eval) (defconstant num-matches 10))
(def-c-call-out mregcomp (:arguments (ppreg (c-ptr regex_t-ptr) :out)
(pattern c-string)
(cflags int)
)
(:return-type int)
)
(def-c-call-out regexec (:arguments (preg regex_t-ptr)
(string c-string)
(nmatch size_t)
(pmatch (c-ptr (c-array regmatch_t #.num-matches)) :out)
(eflags int)
)
(:return-type int)
)
(def-c-call-out mregerror (:arguments (errcode int)
(preg regex_t-ptr)
)
(:return-type c-string :malloc-free)
)
(def-c-call-out mregfree (:arguments (preg regex_t-ptr))
(:return-type nil)
)
; cflags values
(defconstant REG_EXTENDED 1)
(defconstant REG_ICASE 2)
(defconstant REG_NEWLINE 4)
(defconstant REG_NOSUB 8)
; eflags values
(defconstant REG_NOTBOL 1)
(defconstant REG_NOTEOL 2)
(defun mregfree-finally (compiled-pattern)
;; beware: compiled-pattern could come from a previous session
(when (validp compiled-pattern)
(mregfree compiled-pattern)))
(defun regexp-compile (pattern &optional case-insensitive)
(let (errcode compiled-pattern)
(assert (zerop (setf (values errcode compiled-pattern)
(mregcomp pattern (if case-insensitive REG_ICASE 0))))
(pattern)
"~s: ~a" 'regexp-compile (mregerror errcode compiled-pattern))
;; Arrange that when compiled-pattern is garbage-collected,
;; mregfree will be called.
(ext:finalize compiled-pattern #'mregfree-finally)
compiled-pattern))
(setf (fdefinition 'match-start) (fdefinition 'regmatch_t-rm_so))
(setf (fdefinition '(setf match-start))
(lambda (new-value match) (setf (regmatch_t-rm_so match) new-value)))
(setf (fdefinition 'match-end) (fdefinition 'regmatch_t-rm_eo))
(setf (fdefinition '(setf match-end))
(lambda (new-value match) (setf (regmatch_t-rm_eo match) new-value)))
(defun regexp-exec (compiled-pattern string &key (start 0) (end nil))
(assert (stringp string) (string)
"~s: the second argument must be a string, not ~s"
'regexp-exec string)
(let* ((len (length string))
(end (or end len))
;; Prepare the string.
(string
(if (and (eql start 0) (eql end len))
string
(make-array (- end start)
:element-type 'character
:displaced-to string
:displaced-index-offset start))))
(declare (string string))
(multiple-value-bind (errcode matches)
(regexec compiled-pattern string #.num-matches 0)
;; Compute return values.
(if (zerop errcode)
(values-list ; the first value will be non-NIL
(map 'list (if (eql start 0)
#'identity
(lambda (match)
(incf (match-start match) start)
(incf (match-end match) start)
match))
(delete-if #'minusp matches :key #'match-start)))
nil))))
;; The following implementation of MATCH compiles the pattern
;; once for every search.
(defun match-once (pattern string &key (start 0) (end nil) (case-insensitive nil))
(regexp-exec (regexp-compile pattern case-insensitive)
string :start start :end end))
;; The following implementation of MATCH compiles the pattern
;; only once per Lisp session, if it is a literal string.
(defmacro match (pattern string &rest more-forms)
(if (stringp pattern)
`(%MATCH (MATCHER ,pattern) ,string ,@more-forms)
`(MATCH-ONCE ,pattern ,string ,@more-forms)
) )
(defmacro matcher (pattern)
(declare (string pattern))
`(LOAD-TIME-VALUE (%MATCHER ,pattern))
)
(defun %matcher (pattern)
(list* pattern nil nil)
; car = pattern,
; cadr = compiled pattern, case sensitive,
; cddr = compiled pattern, case insensitive.
)
(defun %match (patternbox string &key (start 0) (end nil) (case-insensitive nil))
;; Compile the pattern, if not already done.
(let ((compiled-pattern
(if case-insensitive (cddr patternbox) (cadr patternbox))))
(unless (and compiled-pattern (validp compiled-pattern))
(setq compiled-pattern (regexp-compile (car patternbox) case-insensitive))
(if case-insensitive
(setf (cddr patternbox) compiled-pattern)
(setf (cadr patternbox) compiled-pattern)))
(regexp-exec compiled-pattern string :start start :end end)))
; Convert a match (of type regmatch_t) to a substring.
(defun match-string (string match)
(let ((start (match-start match))
(end (match-end match)))
(make-array (- end start)
:element-type 'character
:displaced-to string
:displaced-index-offset start
) ) )
; Utility function
(defun regexp-quote (string)
(let ((qstring (make-array 10 :element-type 'character
:adjustable t :fill-pointer 0)))
(map nil (lambda (c)
(case c
((#\$ #\^ #\. #\* #\[ #\] #\\) ; #\+ #\?
(vector-push-extend #\\ qstring)))
(vector-push-extend c qstring))
string)
qstring))
(defun regexp-split (pattern string &key (start 0) end case-insensitive)
"Split the STRING by the regexp PATTERN."
(loop
:with compiled =
(if (stringp pattern)
(regexp-compile pattern case-insensitive)
pattern)
:for match = (regexp-exec compiled string :start start :end end)
:collect
(make-array (- (if match (match-start match) (length string)) start)
:element-type 'character
:displaced-to string
:displaced-index-offset start)
:while match
:do (setq start (match-end match))))
(defmacro with-loop-split ((var stream pattern &optional case-insensitive)
&body forms)
"Read from STREAM one line at a time, binding VAR to the split line."
(let ((compiled-pattern (gensym "WLS-")) (line (gensym "WLS-")))
`(loop
:with ,compiled-pattern =
(if (stringp ,pattern)
(regexp-compile ,pattern ,case-insensitive)
,pattern)
:and ,var
:for ,line = (read-line ,stream nil nil)
:while ,line
:do (setq ,var (regexp-split ,compiled-pattern ,line)) ,@forms)))
|