1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152
|
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
;;; $Header: /usr/local/cvsrep/cl-ppcre/charmap.lisp,v 1.19 2009/09/17 19:17:30 edi Exp $
;;; An optimized representation of sets of characters.
;;; Copyright (c) 2008-2009, Dr. Edmund Weitz. All rights reserved.
;;; Redistribution and use in source and binary forms, with or without
;;; modification, are permitted provided that the following conditions
;;; are met:
;;; * Redistributions of source code must retain the above copyright
;;; notice, this list of conditions and the following disclaimer.
;;; * Redistributions in binary form must reproduce the above
;;; copyright notice, this list of conditions and the following
;;; disclaimer in the documentation and/or other materials
;;; provided with the distribution.
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
(in-package :cl-ppcre)
(defstruct (charmap (:constructor make-charmap%))
;; a bit vector mapping char codes to "booleans" (1 for set members,
;; 0 for others)
(vector #*0 :type simple-bit-vector)
;; the smallest character code of all characters in the set
(start 0 :type fixnum)
;; the upper (exclusive) bound of all character codes in the set
(end 0 :type fixnum)
;; the number of characters in the set, or NIL if this is unknown
(count nil :type (or fixnum null))
;; whether the charmap actually represents the complement of the set
(complementp nil :type boolean))
;; seems to be necessary for some Lisps like ClozureCL
(defmethod make-load-form ((map charmap) &optional environment)
(make-load-form-saving-slots map :environment environment))
(declaim (inline in-charmap-p))
(defun in-charmap-p (char charmap)
"Tests whether the character CHAR belongs to the set represented by CHARMAP."
(declare #.*standard-optimize-settings*)
(declare (character char) (charmap charmap))
(let* ((char-code (char-code char))
(char-in-vector-p
(let ((charmap-start (charmap-start charmap)))
(declare (fixnum charmap-start))
(and (<= charmap-start char-code)
(< char-code (the fixnum (charmap-end charmap)))
(= 1 (sbit (the simple-bit-vector (charmap-vector charmap))
(- char-code charmap-start)))))))
(cond ((charmap-complementp charmap) (not char-in-vector-p))
(t char-in-vector-p))))
(defun charmap-contents (charmap)
"Returns a list of all characters belonging to a character map.
Only works for non-complement charmaps."
(declare #.*standard-optimize-settings*)
(declare (charmap charmap))
(and (not (charmap-complementp charmap))
(loop for code of-type fixnum from (charmap-start charmap) to (charmap-end charmap)
for i across (the simple-bit-vector (charmap-vector charmap))
when (= i 1)
collect (code-char code))))
(defun make-charmap (start end test-function &optional complementp)
"Creates and returns a charmap representing all characters with
character codes in the interval [start end) that satisfy
TEST-FUNCTION. The COMPLEMENTP slot of the charmap is set to the
value of the optional argument, but this argument doesn't have an
effect on how TEST-FUNCTION is used."
(declare #.*standard-optimize-settings*)
(declare (fixnum start end))
(let ((vector (make-array (- end start) :element-type 'bit))
(count 0))
(declare (fixnum count))
(loop for code from start below end
for char = (code-char code)
for index from 0
when char do
(incf count)
(setf (sbit vector index) (if (funcall test-function char) 1 0)))
(make-charmap% :vector vector
:start start
:end end
;; we don't know for sure if COMPLEMENTP is true as
;; there isn't a necessary a character for each
;; integer below *REGEX-CHAR-CODE-LIMIT*
:count (and (not complementp) count)
;; make sure it's boolean
:complementp (not (not complementp)))))
(defun create-charmap-from-test-function (test-function start end)
"Creates and returns a charmap representing all characters with
character codes between START and END which satisfy TEST-FUNCTION.
Tries to find the smallest interval which is necessary to represent
the character set and uses the complement representation if that
helps."
(declare #.*standard-optimize-settings*)
(let (start-in end-in start-out end-out)
;; determine the smallest intervals containing the set and its
;; complement, [start-in, end-in) and [start-out, end-out) - first
;; the lower bound
(loop for code from start below end
for char = (code-char code)
until (and start-in start-out)
when (and char
(not start-in)
(funcall test-function char))
do (setq start-in code)
when (and char
(not start-out)
(not (funcall test-function char)))
do (setq start-out code))
(unless start-in
;; no character satisfied the test, so return a "pseudo" charmap
;; where IN-CHARMAP-P is always false
(return-from create-charmap-from-test-function
(make-charmap% :count 0)))
(unless start-out
;; no character failed the test, so return a "pseudo" charmap
;; where IN-CHARMAP-P is always true
(return-from create-charmap-from-test-function
(make-charmap% :complementp t)))
;; now determine upper bound
(loop for code from (1- end) downto start
for char = (code-char code)
until (and end-in end-out)
when (and char
(not end-in)
(funcall test-function char))
do (setq end-in (1+ code))
when (and char
(not end-out)
(not (funcall test-function char)))
do (setq end-out (1+ code)))
;; use the smaller interval
(cond ((<= (- end-in start-in) (- end-out start-out))
(make-charmap start-in end-in test-function))
(t (make-charmap start-out end-out (complement* test-function) t)))))
|