File: read.lisp

package info (click to toggle)
cl-unicode 0.1.4-3
  • links: PTS
  • area: main
  • in suites: jessie, jessie-kfreebsd
  • size: 2,124 kB
  • ctags: 210
  • sloc: lisp: 1,706; makefile: 18
file content (282 lines) | stat: -rw-r--r-- 14,103 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-UNICODE; Base: 10 -*-
;;; $Header: /usr/local/cvsrep/cl-unicode/build/read.lisp,v 1.34 2012-05-04 21:17:45 edi Exp $

;;; Copyright (c) 2008-2012, Dr. Edmund Weitz. All rights reserved.

;;; Redistribution and use in source and binary forms, with or without
;;; modification, are permitted provided that the following conditions
;;; are met:

;;;   * Redistributions of source code must retain the above copyright
;;;     notice, this list of conditions and the following disclaimer.

;;;   * Redistributions in binary form must reproduce the above
;;;     copyright notice, this list of conditions and the following
;;;     disclaimer in the documentation and/or other materials
;;;     provided with the distribution.

;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
;;; ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

(in-package :cl-unicode)

(defmacro with-unicode-file (((&rest bindings) file-name &optional two-line-ranges) &body body)
  "Utility macro to parse a file which is formatted as described in
<http://unicode.org/Public/UNIDATA/UCD.html#UCD_File_Format>.  The
file named FILE-NAME is searched for in the directory \"data/\"
relative to this source file.  The code then iterates through the file
and executes BODY for each non-comment line binding the variables in
BINDINGS to the parsed fields of the line.  For the details of
BINDINGS see the EXTRACT-FOO functions in util.lisp or the usage of
this macro below.  If TWO-LINE-RANGES is true, then the macro expects
a file like \"UnicodeData.txt\" where ranges aren't denoted as usual
but rather using <..., First> and <..., Last>."
  (let ((variables (extract-variables bindings))
        (types (extract-types bindings)))
    `(let ((pathname (merge-pathnames ,file-name (merge-pathnames "data/" *this-file*))))
       (when *compile-verbose*
         (format t "~&;;; Parsing Unicode file ~A" (file-namestring pathname))
         (force-output))
       (with-open-file (binary-in pathname :element-type 'flex:octet)
         ;; Unicode data files must be read as UTF-8
         (let ((in (flex:make-flexi-stream binary-in :external-format '(:utf-8 :eol-style :lf))))
           (loop
            (flet ((get-line-contents ()
                     (let ((line (or (read-line in nil) (return))))
                       (and (not (ppcre:scan "^\\s*(?:#|$)" line))
                            (ppcre:split "\\s*#.*$|\\s*;\\s*" line :limit most-positive-fixnum)))))
              (let ((contents (get-line-contents)))
                (when contents
                  (destructuring-bind ,variables
                      (parse-one-line contents ',types (list ,@(extract-defaults bindings)))
                    ,@(when two-line-ranges
                        `((when (ppcre:scan "^<.*, First>$" ,(second variables))
                            (let ((range-end (first (parse-one-line (list (first (get-line-contents)))))))
                              (setq ,(first variables) (cons ,(first variables) range-end))))))
                    ,@body))))))))))

(defmacro with-code-point-range ((var range) &body body)
  "Utility macro which executes BODY with VAR bound to each code point
in RANGE in turn.  VAR can either be an integer \(for one code point)
or a cons of two integers \(for an inclusive range)."
  (with-rebinding (range)
    `(flet ((thunk (,var) ,@body))
       (cond ((atom ,range) (thunk ,range))
             (t (loop for point from (car ,range) to (cdr ,range)
                      do (thunk point)))))))

(defun read-character-data ()
  "Parses the file \"UnicodeData.txt\" and generates one CHAR-INFO
entry per code point which is stored in *CHAR-DATABASE*."
  ;; by definition, we'll never see this property in the file, so we
  ;; have to add it to *GENERAL-CATEGORIES* explicitly
  (setq *general-categories* (list '#.(property-symbol "Cn")))
  (with-unicode-file ((code-point-range
                       name
                       (general-category symbol)
                       (combining-class integer)
                       (bidi-class symbol)
                       ;; decomposition mapping, ignored for now
                       _
                       (decimal-digit integer nil)
                       (digit integer nil)
                       (numeric rational nil)
                       (bidi-mirrored boolean)
                       (unicode1-name string nil)
                       ;; ISO comment, ignored
                       _
                       (uppercase-mapping hex nil)
                       (lowercase-mapping hex nil)
                       (titlecase-mapping hex nil))
                      "UnicodeData.txt" t)
    (pushnew general-category *general-categories* :test #'eq)
    (pushnew bidi-class *bidi-classes* :test #'eq)
    ;; if the name starts with #\<, it's not really a name but denotes
    ;; a range - some of these names (CJK unified ideographs and
    ;; Hangul syllables) will be computed later, the others are NIL
    (when (char= (char name 0) #\<)
      (setq name nil))
    (with-code-point-range (code-point code-point-range)
      (setf (aref *char-database* code-point)
            (make-instance 'char-info
                           :code-point code-point
                           :name name
                           :general-category general-category
                           :combining-class combining-class
                           :bidi-class bidi-class
                           :numeric-type (cond (decimal-digit '#.(property-symbol "Decimal"))
                                               (digit '#.(property-symbol "Digit"))
                                               (numeric '#.(property-symbol "Numeric")))
                           :numeric-value numeric
                           :binary-props (and bidi-mirrored
                                              (list '#.(property-symbol "BidiMirrored")))
                           :unicode1-name unicode1-name
                           :uppercase-mapping uppercase-mapping
                           :lowercase-mapping lowercase-mapping
                           :titlecase-mapping titlecase-mapping)))))

(defun read-scripts ()
  "Parses the file \"Scripts.txt\" and adds the information about the
script to the corresponding entries in *CHAR-DATABASE*."
  (with-unicode-file ((code-point-range (script symbol)) "Scripts.txt")
    (pushnew script *scripts* :test #'eq)
    (with-code-point-range (code-point code-point-range)
      (let ((char-info (aref *char-database* code-point)))
        (when char-info
          (setf (script* char-info) script))))))

(defun read-code-blocks ()
  "Parses the file \"Blocks.txt\" and adds the information about the
code block to the corresponding entries in *CHAR-DATABASE*."
  (with-unicode-file ((code-point-range (code-block symbol)) "Blocks.txt")
    (pushnew code-block *code-blocks* :test #'eq)
    (with-code-point-range (code-point code-point-range)
      (let ((char-info (aref *char-database* code-point)))
        (when char-info
          (setf (code-block* char-info) code-block))))))

(defun read-binary-properties ()
  "Parses the file \"PropList.txt\" and adds information about binary
properties to the corresponding entries in *CHAR-DATABASE*."
  ;; this property was derived from UnicodeData.txt already
  (setq *binary-properties* (list '#.(property-symbol "BidiMirrored")))
  (with-unicode-file ((code-point-range (property symbol)) "PropList.txt")
    ;; we don't need this information as we derive it from a code
    ;; point not being mentioned in UnicodeData.txt - see also the
    ;; initform for GENERAL-CATEGORY in the definition of CHAR-INFO
    (unless (eq property '#.(property-symbol "NoncharacterCodePoint"))
      (pushnew property *binary-properties* :test #'eq)
      (with-code-point-range (code-point code-point-range)
        (let ((char-info (aref *char-database* code-point)))
          (unless char-info
            ;; this file actually contains some information for
            ;; unassigned (but reserved) code points, like e.g. #xfff0
            (setf char-info (make-instance 'char-info :code-point code-point)
                  (aref *char-database* code-point) char-info))
          (push property (binary-props* char-info)))))))

(defun read-derived-age ()
  "Parses the file \"DerivedAge.txt\" and adds information about the
\"age\" to the corresponding entries in *CHAR-DATABASE*."
  (with-unicode-file ((code-point-range (age age)) "DerivedAge.txt")
    (with-code-point-range (code-point code-point-range)
      (let ((char-info (aref *char-database* code-point)))
        (when char-info
          (setf (age* char-info) age))))))

(defun read-mirroring-glyphs ()
  "Parses the file \"BidiMirroring.txt\" and adds information about
mirroring glyphs to the corresponding entries in *CHAR-DATABASE*."
  (with-unicode-file ((code-point-range (mirroring-glyph hex)) "BidiMirroring.txt")
    (with-code-point-range (code-point code-point-range)
      (let ((char-info (aref *char-database* code-point)))
        (when char-info
          (setf (bidi-mirroring-glyph* char-info) mirroring-glyph))))))

(defun read-jamo ()
  "Parses the file \"Jamo.txt\" and stores information about Jamo
short names in the *JAMO-SHORT-NAMES* hash table.  This information is
later used to compute Hangul syllable names."
  (clrhash *jamo-short-names*)
  (with-unicode-file ((code-point-range (short-name string "")) "Jamo.txt")
    (with-code-point-range (code-point code-point-range)
      (setf (gethash code-point *jamo-short-names*) short-name))))

(defun default-bidi-class (char-info)
  "Returns the default Bidi class for the character described by the
CHAR-INFO object CHAR-INFO.  The default is computed as explained in
<http://unicode.org/Public/UNIDATA/extracted/DerivedBidiClass.txt>."
  (let ((code-point (code-point char-info)))
    (cond ((and (or (<= #x0600 code-point #x07BF)
                    (<= #xFB50 code-point #xFDFF)
                    (<= #xFE70 code-point #xFEFF))
                (not (find '#.(property-symbol "NoncharacterCodePoint")
                           (binary-props* code-point))))
           '#.(property-symbol "AL"))
          ((or (<= #x0590 code-point #x05FF)
               (<= #x07C0 code-point #x08ff)
               (<= #xFB1D code-point #xFB4F)
               (<= #x10800 code-point #x10FFF))
           '#.(property-symbol "R"))
          (t '#.(property-symbol "L")))))

(defun set-default-bidi-classes ()
  "Loops through all assigned characters in *CHAR-DATABASE* and
defaults their Bidi class if it wasn't set already."
  (loop for char-info across *char-database*
        when (and char-info (not (bidi-class* char-info)))
        do (let ((default-bidi-class (default-bidi-class char-info)))
             (pushnew default-bidi-class *bidi-classes* :test #'eq)
             (setf (bidi-class* char-info) default-bidi-class))))

(defun fill-database ()
  "Initializes all relevant datastructures and parses all Unicode data
files in the \"data/\" directory to build up enough information in
memory \(specifically the *CHAR-DATABASE* array) to write the missing
source code files for CL-UNICODE."
  (setq *char-database* (make-empty-char-database)
        *general-categories* nil
        *scripts* nil
        *code-blocks* nil
        *binary-properties* nil
        *bidi-classes* nil)
  (initialize-property-symbols)
  (read-character-data)
  (read-scripts)
  (read-code-blocks)
  (read-binary-properties)
  (read-derived-age)
  (read-mirroring-glyphs)
  (read-jamo)
  (set-default-bidi-classes))

(defun build-name-mappings ()
  "Initializes and fills the hash tables which map code points to
\(Unicode 1.0) names and vice versa using the information in
*CHAR-DATABASE*."
  (clrhash *names-to-code-points*)
  (clrhash *unicode1-names-to-code-points*)
  (clrhash *code-points-to-names*)
  (clrhash *code-points-to-unicode1-names*)
  (loop for char-info across *char-database*
        for name = (and char-info (name char-info))
        for unicode1-name = (and char-info (unicode1-name char-info))
        for code-point = (and char-info (code-point char-info))
        when name
        do (setf (gethash code-point *code-points-to-names*) name
                 (gethash (canonicalize-name name) *names-to-code-points*) code-point)
        when unicode1-name
        do (setf (gethash code-point *code-points-to-unicode1-names*) unicode1-name
                 (gethash (canonicalize-name unicode1-name) *unicode1-names-to-code-points*) code-point)))

(defun build-case-mapping ()
  "Initializes and filles the *CASE-MAPPINGS* hash table from
*CHAR-DATABASE*."
  (clrhash *case-mappings*)
  (loop for char-info across *char-database*
        for mappings = (and char-info
                            (list (uppercase-mapping* char-info)
                                  (lowercase-mapping* char-info)
                                  (titlecase-mapping* char-info)))
        when (and mappings (some #'identity mappings))
        do (setf (gethash (code-point char-info) *case-mappings*) mappings)))

(defun build-data-structures ()
  "One function to combine the complete process of parsing all Unicode
data files and building the corresponding Lisp datastructures in
memory."
  (fill-database)
  (when *compile-verbose*
    (format t "~&;;; Building hash tables")
    (force-output))
  (build-name-mappings)
  (build-case-mapping))