File: sregex.lisp

package info (click to toggle)
maxima-sage 5.45.1-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, forky, sid, trixie
  • size: 113,788 kB
  • sloc: lisp: 440,833; fortran: 14,665; perl: 14,369; tcl: 10,997; sh: 4,475; makefile: 2,520; ansic: 447; python: 262; xml: 59; awk: 37; sed: 17
file content (187 lines) | stat: -rw-r--r-- 7,174 bytes parent folder | download | duplicates (7)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
#|
                              ~*~  sregex  ~*~
  
    Maxima interface to pregexp.lisp (a portable regex parser by Dorai Sitaram)
  
    Copyright : 2008 - 2016 Volker van Nek

--------------------------------------------------------------------------------

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    
    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
    MA 02110-1301, USA.

--------------------------------------------------------------------------------

   Functions: 
   
   regex_compile - returns a Lisp structure containing a compiled regex.
   
   regex_match_pos, regex_match, regex_split, regex_subst_first, regex_subst -
      the regular expression used as an argument may be a string or 
      compiled by regex_compile.
   
   string_to_regex - masking special characters.
   
   Examples:
   
   str : "his hay needle stack -- my hay needle stack -- her hay needle stack"$
   
   regex : regex_compile("ne{2}dle")$
   
   regex_match_pos(regex, str);
                                               [[9, 15]]
   regex_match_pos("ne{2}dle", str);
                                               [[9, 15]]
   regex_match_pos("ne{2}dle", str, 25, 44);
                                               [[32, 38]]
   regex_match("ne{2}dle", "hay needle stack");
                                               ["needle"]
   regex_match("ne{2}dle", "hay needle stack", 10);
                                                 false
   regex_split("[,;]+", "split,pea;;;soup");
                                        ["split", "pea", "soup"]
   regex_subst_first("ty", "t.", "liberte egalite fraternite");
                                      "liberty egalite fraternite"
   regex_subst("ty", "t.\\b", "liberte egalite fraternite");
                                      "liberty egality fraternity"
   string_to_regex(". :");
                                                 "\. :"

--------------------------------------------------------------------------------

   The pregexp manual by Dorai Sitaram is at
   http://ds26gte.github.io/pregexp/index.html.

--------------------------------------------------------------------------------

   Like in stringproc.lisp we use 1-indexed position specifications. 
   
   When the external format is not utf-8 (unicode) positions are counted in 
   octets at Lisp level and in Maxima characters at Maxima level. 
   See remarks in stringproc.lisp.
   Without unicode support non-us-ascii Maxima characters are not recognized 
   by regular expressions, e.g. the regex "." doesn't match to an umlaut.

|#

(in-package :maxima)


(declare-top (special *parse-utf-8-input*))



(defstruct (compiled-regex (:print-function compiled-regex-print))
  parse-tree )

(defun compiled-regex-print (struct stream i) 
  (declare (ignore struct i))
  (format stream "Structure [COMPILED-REGEX]") ) ;; wxMaxima prints this
                                                 ;; terminal should print this too

(defun $regex_compile (regex)
  (make-compiled-regex
    :parse-tree (pregexp regex) ))


(defun regex-check-and-maybe-coerce (name regex &rest args)
  (cond
    ((compiled-regex-p regex)
      (setq regex (compiled-regex-parse-tree regex)) )
    ((not (stringp regex))
      (gf-merror (intl:gettext "`~m': first arg must be a compiled regex or a string.") name) ))
  (unless (every #'stringp args)
    (gf-merror (intl:gettext "Unsuitable arguments to `~m'.") name) )
  regex )


(defun regex-index-error (name)
  (gf-merror (intl:gettext "`~m': improper start or end index.") name) )


;; When the external format is not utf-8 (unicode) positions are counted in octets.
;; We want them in numbers of characters to find the right position in a string.
;; utf-8-pos-dec returns the decrement we need to adjust.
;;   (string position = octet position - decrement)
(defun regex-utf-8-pos-dec (ov off pos) ;; begin to count at a given offset
  (do ((i off (1+ i))
       (n 0)) 
     ((= i pos) n)
    (when (= (logand (aref ov i) 192.) 128.)
      (incf n) )))


(defun $regex_match_pos (regex str &optional (start 1) (end nil)) ;; 1-based indexing!
  (setq regex (regex-check-and-maybe-coerce "regex_match_pos" regex str))
  (decf start)
  (when end (decf end))
  (let (ov)
    (or (ignore-errors 
          (when *parse-utf-8-input*
            (setq ov (string-to-raw-bytes str))
            (let ((args (utf-8-fix-start-end ov (list nil start end))))
              (setq start (cadr args)
                    end (caddr args) )))
          (let ((pos-list (pregexp-match-positions regex str start end))
                (pos-mlist nil) )
            (if pos-list 
              (dolist (pos pos-list (cons '(mlist simp) (nreverse pos-mlist)))
                (when *parse-utf-8-input*
                  (let ((dec (regex-utf-8-pos-dec ov 0 (car pos))))
                    (decf (cdr pos) (+ dec (regex-utf-8-pos-dec ov (car pos) (cdr pos))))
                    (decf (car pos) dec) ))
                (push `((mlist simp) ,(1+ (car pos)) ,(1+ (cdr pos))) pos-mlist) )
              (return-from $regex_match_pos nil) )))
        (regex-index-error "regex_match_pos") )))


(defun $regex_match (regex str &optional (start 1) (end nil))
  (setq regex (regex-check-and-maybe-coerce "regex_match" regex str))
  (or (ignore-errors 
        (when *parse-utf-8-input*
          (let* ((ov (string-to-raw-bytes str))
                 (args (utf-8-fix-start-end ov (list nil start end))) )
            (setq start (cadr args)
                  end (caddr args) )))
        (let ((match 
                (pregexp-match regex str (1- start) (if end (1- end) nil)) ))
          (if match 
            (cons '(mlist simp) match)
            (return-from $regex_match nil) )))
      (regex-index-error "regex_match") ))


(defun $regex_split (regex str)
  (setq regex (regex-check-and-maybe-coerce "regex_split" regex str))
  (cons '(mlist simp) (pregexp-split regex str)) )


(defun $regex_subst_first (replacement regex str)
  (setq regex (regex-check-and-maybe-coerce "regex_subst_first" regex str replacement))
  (pregexp-replace regex str replacement) )
;;
;; Argument order different to the order of pregexp-replace.
;; Use order like in $ssubst or substitute: new, old, str.
;;
(defun $regex_subst (replacement regex str)
  (setq regex (regex-check-and-maybe-coerce "regex_subst" regex str replacement))
  (pregexp-replace* regex str replacement) )


(defun $string_to_regex (str)
  (unless (stringp str)
    (gf-merror (intl:gettext "`string_to_regex': Argument must be a string.")) )
  (pregexp-quote str) )