File: unicodedata.lisp

package info (click to toggle)
maxima 5.47.0-9
  • links: PTS
  • area: main
  • in suites: forky
  • size: 193,104 kB
  • sloc: lisp: 434,678; fortran: 14,665; tcl: 10,990; sh: 4,577; makefile: 2,763; ansic: 447; java: 328; python: 262; perl: 201; xml: 60; awk: 28; sed: 15; javascript: 2
file content (130 lines) | stat: -rw-r--r-- 5,127 bytes parent folder | download | duplicates (12)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
;; -*- mode: lisp; coding: utf-8 -*-
;; Copyright Leo Butler (leo.butler@member.fsf.org) 2015
;; Released under the terms of GPLv3+

#|

Rewrite Maxima's alphabetp function to handle wide-characters.
Add the ability for user to alter the table of alphabetic characters at runtime.

To do:
-detect if unicode characters are characters

|#

(defpackage :unicodedata
  (:use :common-lisp :maxima)
  (:import-from :maxima merror alphabetp *alphabet* $file_search1 $file_search_lisp $done $all mfuncall mlist)
  )

(in-package :unicodedata)

(let  ((unicode-data-hashtable (make-hash-table :test #'eql))
       (alphabetp-hashtable  (make-hash-table :test #'equal)))


  (labels ((lookup (i &optional (unicode-data-hashtable unicode-data-hashtable)) (symbol-name (car (gethash i unicode-data-hashtable))))
	   (description (x) (symbol-name (second x)))
	   (create-selector (regexp)
	     (cond ((stringp regexp)
		    (let ((s (string-downcase regexp)))
		      (lambda (x) (search s (string-downcase x)))))
		   ((eql regexp '$all)
		    (lambda (x) (declare (ignore x)) t))
		   ((null regexp)
		    (lambda (x) (declare (ignore x)) nil))
		   (t
		    (merror "regexp must be a string, the symbol `all' or empty."))))
	   (create-adder (lookup-char)
	     (labels ((this (regexp append)
			(let ((selector (create-selector regexp)))
			  (unless append (clrhash alphabetp-hashtable))
			  (maphash (lambda (k v)
				     (if (and k (funcall selector (funcall lookup-char v)))
					 (maxima::$set_alpha_char k)))
				   unicode-data-hashtable))
			'$done))
	       #'this))
	   #-sbcl (char-sym (x) (first x))
	   (category (x) (symbol-name (third x))))

    (let ((stack '()))
      (defun unicode-alphabetp (c)
	(cond ((< (char-code c) 128.)				;; this character is ascii and must be non-alphabetic
	       (setf stack '()))				;; there are no characters on the stack nor known to be part of a wide-character
	      ((null stack)					;; len=0 
	       (push c stack)
	       (if (gethash stack alphabetp-hashtable)
		   stack
		   (setf stack '())))
	      (t						;; 1 or more characters in stack
	       (push c stack)
	       (cond ((gethash stack alphabetp-hashtable)	;; c is part of wide-character
		      stack)
		     (t
		      (setf stack '())				;; c is not end of wide-character
		      (unicode-alphabetp c))))))		;; but it may start new one.
      ;; redefine alphabetp from src/nparse.lisp
      ;; to use unicode-alphabetp
      (defun alphabetp (n)
	(and (characterp n)
	     (cond ((or (alpha-char-p n)
			(member n *alphabet*))
		    (setf stack '())
		    t)
		   (t
		    (unicode-alphabetp n))))))
    
    (defun maxima::$set_alpha_char (char-sym)
      "A user-level function to add a wide character to the hashtable of
known alphabetical characters."
      (let ((char-sym-list (coerce (cond ((stringp char-sym) char-sym)
					 ((symbolp char-sym) (symbol-name char-sym))
					 ((and (integerp char-sym) (< 127. char-sym) (> 917999. char-sym))
					  #+(or sbcl clisp)
					  (format nil "~c" (code-char char-sym))
					  #-(or sbcl clisp)
					  (lookup char-sym unicode-data-hashtable))
					 (t (merror "first argument must be a string, symbol or integer")))
				   'list)))
	(do ((x (reverse char-sym-list) (cdr x))) ((null x) '$done)
	  (setf (gethash x alphabetp-hashtable)
		#-t (push (list char-sym-list (gethash char-sym unicode-data-hashtable)) (gethash x alphabetp-hashtable))
		#+t t))))

    (defun maxima::$unicode_init (&optional (regexp nil) file)
      (let ((data-file (mfuncall '$file_search1 (or file "unicodedata-txt.lisp") '((mlist) $file_search_lisp))))
	(loop for (n char-sym description category) in (with-open-file (instr data-file :direction :input) (read instr t nil nil))
	   do
	     (setf (gethash n unicode-data-hashtable) (list char-sym description category)))
	(if regexp (maxima::$unicode_add regexp))))

    
    (defun maxima::$unicode_add_category (category &optional append)
      (funcall (create-adder #'category) category append))
    
    (defun maxima::$unicode_add (&optional (regexp nil) (append nil))
      "Select the wide characters via a MAXIMA-NREGEX regexp. If
	   REGEXP is the symbol `all', this is equivalent to \".\"; if
	   REGEXP is NIL, then no matches are made, i.e. the hash
	   table UNICODE-DATA-HASHTABLE is emptied. Example:
	   unicode_add(\"greek .+ letter [^ ]+$\");"
      (cond ((and (listp regexp) (not (null regexp)))
	     (let ((r (loop for regex in (cdr regexp)
			 for append = append then t
			 collect (maxima::$unicode_add regex append))))
	       (cons '(mlist) r)))
	    (t
	     (let ((n (hash-table-count alphabetp-hashtable)))
	       (funcall (create-adder #'description) regexp append)
	       (list '(mlist) regexp (- (hash-table-count alphabetp-hashtable) n))))))

    (defun print-hashtable (&optional (ht unicode-data-hashtable))
      (let ((*print-base* 16.)
	    (*print-readably* t)
	    (*print-radix* t))
	(maphash (lambda(k v) (format t "(~a  ~{~s ~})~%" k (if (listp v) v (list v)))) ht)))
    (defun print-alphabetp-hashtable ()
      (print-hashtable alphabetp-hashtable))))

; end of unicodedata.lisp