File: regexp.lisp

package info (click to toggle)
clisp 1%3A2.27-0.5
  • links: PTS
  • area: main
  • in suites: woody
  • size: 49,860 kB
  • ctags: 20,752
  • sloc: ansic: 123,781; lisp: 67,533; asm: 19,633; xml: 11,766; sh: 9,788; fortran: 8,307; makefile: 3,570; objc: 2,481; perl: 1,744; java: 341; yacc: 318; sed: 117
file content (322 lines) | stat: -rw-r--r-- 11,772 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
;; Module for regular expression searching/matching in CLISP
;; Bruno Haible 14.4.1995, 18.4.1995
;; Sam Steingold 1999-10-28

(defpackage "REGEXP"
  (:documentation
   "POSIX Regular Expressions - matching, compiling, executing.")
  (:use "LISP" "FFI")
  (:export "MATCH" "MATCH-START" "MATCH-END" "MATCH-STRING" "REGEXP-QUOTE"
           "REGEXP-COMPILE" "REGEXP-EXEC" "REGEXP-SPLIT" "WITH-LOOP-SPLIT"))

(in-package "REGEXP")

; Common OS definitions:
(def-c-type size_t uint)

#|
; Intermediate types not actually exported by regex.h:
(def-c-type reg_syntax_t uint)
(def-c-struct re_pattern_buffer
  (buffer c-pointer)
  (allocated ulong)
  (used ulong)
  (syntax reg_syntax_t)
  (fastmap c-pointer)
  (translate c-pointer)
  (re_nsub size_t)
  (flags uint8)
)
(def-c-type %regex_t re_pattern_buffer)
(eval-when (load compile eval) (defconstant sizeof-%regex_t (sizeof '%regex_t)))
(def-c-type regex_t (c-array uchar #.sizeof-%regex_t))
|#
(def-c-type regex_t-ptr c-pointer)

; Types exported by regex.h:
(def-c-type regoff_t int)
(def-c-struct regmatch_t
  (rm_so regoff_t)
  (rm_eo regoff_t)
)

;; Functions exported by regex.h:

#| ;; This documentation comes from regex.h and regex.c.

extern int regcomp (regex_t *preg, const char *pattern, int cflags);

   regcomp takes a regular expression as a string and compiles it.

   PREG is a regex_t *.  We do not expect any fields to be initialized,
   since POSIX says we shouldn't.  Thus, we set

     `buffer' to the compiled pattern;
     `used' to the length of the compiled pattern;
     `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
       REG_EXTENDED bit in CFLAGS is set; otherwise, to
       RE_SYNTAX_POSIX_BASIC;
     `newline_anchor' to REG_NEWLINE being set in CFLAGS;
     `fastmap' and `fastmap_accurate' to zero;
     `re_nsub' to the number of subexpressions in PATTERN.

   PATTERN is the address of the pattern string.

   CFLAGS is a series of bits which affect compilation.

     If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
     use POSIX basic syntax.

     If REG_NEWLINE is set, then . and [^...] don't match newline.
     Also, regexec will try a match beginning after every newline.

     If REG_ICASE is set, then we considers upper- and lowercase
     versions of letters to be equivalent when matching.

     If REG_NOSUB is set, then when PREG is passed to regexec, that
     routine will report only success or failure, and nothing about the
     registers.

   It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
   the return codes and their meanings.)


extern int regexec (const regex_t *preg, const char *string, size_t nmatch,
                    regmatch_t pmatch[], int eflags);

   regexec searches for a given pattern, specified by PREG, in the
   string STRING.

   If NMATCH is zero or REG_NOSUB was set in the cflags argument to
   `regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
   least NMATCH elements, and we set them to the offsets of the
   corresponding matched substrings.

   EFLAGS specifies `execution flags' which affect matching: if
   REG_NOTBOL is set, then ^ does not match at the beginning of the
   string; if REG_NOTEOL is set, then $ does not match at the end.

   We return 0 if we find a match and REG_NOMATCH if not.


extern size_t regerror (int errcode, const regex_t *preg,
                        char *errbuf, size_t errbuf_size);

   Returns a message corresponding to an error code, ERRCODE, returned
   from either regcomp or regexec.   We don't use PREG here.


extern void regfree (regex_t *preg);

   Free dynamically allocated space used by PREG.


(def-c-call-out regcomp (:arguments (preg (c-ptr regex_t) :out)
                                    (pattern c-string)
                                    (cflags int)
                        )
                        (:return-type int)
)
(def-c-call-out regexec (:arguments (preg (c-ptr regex_t))
                                    (string c-string)
                                    (nmatch size_t)
                                    (pmatch (c-ptr (c-array regmatch_t 0)))
                                    (eflags int)
                        )
                        (:return-type int)
)
(def-c-call-out regerror (:arguments (errcode int)
                                     (preg (c-ptr regex_t))
                                     (errbuf (c-ptr character))
                                     (errbuf_size size_t)
                         )
                         (:return-type size_t)
)
(def-c-call-out regfree (:arguments (preg (c-ptr regex_t)))
                        (:return-type nil)
)

|#

;; This interface is not exactly adapted to our needs. We introduce
;; slightly modified functions.
#|
extern int mregcomp (regex_t **ppreg, const char *pattern, int cflags);
extern int regexec (const regex_t *preg, const char *string, size_t nmatch,
                    regmatch_t pmatch[], int eflags);
extern const char *mregerror (int errcode, const regex_t *preg);,
extern void mregfree (regex_t *preg);
|#

(eval-when (compile load eval) (defconstant num-matches 10))
(def-c-call-out mregcomp (:arguments (ppreg (c-ptr regex_t-ptr) :out)
                                     (pattern c-string)
                                     (cflags int)
                         )
                         (:return-type int)
)
(def-c-call-out regexec (:arguments (preg regex_t-ptr)
                                    (string c-string)
                                    (nmatch size_t)
                                    (pmatch (c-ptr (c-array regmatch_t #.num-matches)) :out)
                                    (eflags int)
                        )
                        (:return-type int)
)
(def-c-call-out mregerror (:arguments (errcode int)
                                      (preg regex_t-ptr)
                          )
                          (:return-type c-string :malloc-free)
)
(def-c-call-out mregfree (:arguments (preg regex_t-ptr))
                         (:return-type nil)
)
; cflags values
(defconstant REG_EXTENDED 1)
(defconstant REG_ICASE    2)
(defconstant REG_NEWLINE  4)
(defconstant REG_NOSUB    8)
; eflags values
(defconstant REG_NOTBOL   1)
(defconstant REG_NOTEOL   2)

(defun mregfree-finally (compiled-pattern)
  ;; beware: compiled-pattern could come from a previous session
  (when (validp compiled-pattern)
    (mregfree compiled-pattern)))

(defun regexp-compile (pattern &optional case-insensitive)
  (let (errcode compiled-pattern)
    (assert (zerop (setf (values errcode compiled-pattern)
                         (mregcomp pattern (if case-insensitive REG_ICASE 0))))
            (pattern)
            "~s: ~a" 'regexp-compile (mregerror errcode compiled-pattern))
    ;; Arrange that when compiled-pattern is garbage-collected,
    ;; mregfree will be called.
    (ext:finalize compiled-pattern #'mregfree-finally)
    compiled-pattern))

(setf (fdefinition 'match-start) (fdefinition 'regmatch_t-rm_so))
(setf (fdefinition '(setf match-start))
      (lambda (new-value match) (setf (regmatch_t-rm_so match) new-value)))

(setf (fdefinition 'match-end) (fdefinition 'regmatch_t-rm_eo))
(setf (fdefinition '(setf match-end))
      (lambda (new-value match) (setf (regmatch_t-rm_eo match) new-value)))

(defun regexp-exec (compiled-pattern string &key (start 0) (end nil))
  (assert (stringp string) (string)
          "~s: the second argument must be a string, not ~s"
          'regexp-exec string)
  (let* ((len (length string))
         (end (or end len))
         ;; Prepare the string.
         (string
           (if (and (eql start 0) (eql end len))
             string
             (make-array (- end start)
                         :element-type 'character
                         :displaced-to string
                         :displaced-index-offset start))))
    (declare (string string))
    (multiple-value-bind (errcode matches)
        (regexec compiled-pattern string #.num-matches 0)
      ;; Compute return values.
      (if (zerop errcode)
        (values-list          ; the first value will be non-NIL
         (map 'list (if (eql start 0)
                      #'identity
                      (lambda (match)
                        (incf (match-start match) start)
                        (incf (match-end match) start)
                        match))
              (delete-if #'minusp matches :key #'match-start)))
        nil))))

;; The following implementation of MATCH compiles the pattern
;; once for every search.
(defun match-once (pattern string &key (start 0) (end nil) (case-insensitive nil))
  (regexp-exec (regexp-compile pattern case-insensitive)
               string :start start :end end))

;; The following implementation of MATCH compiles the pattern
;; only once per Lisp session, if it is a literal string.
(defmacro match (pattern string &rest more-forms)
  (if (stringp pattern)
    `(%MATCH (MATCHER ,pattern) ,string ,@more-forms)
    `(MATCH-ONCE ,pattern ,string ,@more-forms)
) )

(defmacro matcher (pattern)
  (declare (string pattern))
  `(LOAD-TIME-VALUE (%MATCHER ,pattern))
)
(defun %matcher (pattern)
  (list* pattern nil nil)
  ; car = pattern,
  ; cadr = compiled pattern, case sensitive,
  ; cddr = compiled pattern, case insensitive.
)

(defun %match (patternbox string &key (start 0) (end nil) (case-insensitive nil))
  ;; Compile the pattern, if not already done.
  (let ((compiled-pattern
          (if case-insensitive (cddr patternbox) (cadr patternbox))))
    (unless (and compiled-pattern (validp compiled-pattern))
      (setq compiled-pattern (regexp-compile (car patternbox) case-insensitive))
      (if case-insensitive
        (setf (cddr patternbox) compiled-pattern)
        (setf (cadr patternbox) compiled-pattern)))
    (regexp-exec compiled-pattern string :start start :end end)))

; Convert a match (of type regmatch_t) to a substring.
(defun match-string (string match)
  (let ((start (match-start match))
        (end (match-end match)))
    (make-array (- end start)
                :element-type 'character
                :displaced-to string
                :displaced-index-offset start
) ) )

; Utility function
(defun regexp-quote (string)
  (let ((qstring (make-array 10 :element-type 'character
                                :adjustable t :fill-pointer 0)))
    (map nil (lambda (c)
               (case c
                 ((#\$ #\^ #\. #\* #\[ #\] #\\) ; #\+ #\?
                   (vector-push-extend #\\ qstring)))
               (vector-push-extend c qstring))
         string)
    qstring))

(defun regexp-split (pattern string &key (start 0) end case-insensitive)
  "Split the STRING by the regexp PATTERN."
  (loop
    :with compiled =
            (if (stringp pattern)
              (regexp-compile pattern case-insensitive)
              pattern)
    :for match = (regexp-exec compiled string :start start :end end)
    :collect
      (make-array (- (if match (match-start match) (length string)) start)
                  :element-type 'character
                  :displaced-to string
                  :displaced-index-offset start)
    :while match
    :do (setq start (match-end match))))

(defmacro with-loop-split ((var stream pattern &optional case-insensitive)
                           &body forms)
  "Read from STREAM one line at a time, binding VAR to the split line."
  (let ((compiled-pattern (gensym "WLS-")) (line (gensym "WLS-")))
    `(loop
       :with ,compiled-pattern =
         (if (stringp ,pattern)
           (regexp-compile ,pattern ,case-insensitive)
           ,pattern)
       :and ,var
       :for ,line = (read-line ,stream nil nil)
       :while ,line
       :do (setq ,var (regexp-split ,compiled-pattern ,line)) ,@forms)))