1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115
|
;; unicode-sniffer.lisp -- attempt to determine Unicode encoding by inspecting bytes
;; copyright 2018 by Robert Dodier
;; I release this work under terms of the GNU General Public License, version 2
;; Adapted from: https://en.wikipedia.org/wiki/Byte_order_mark
(defparameter unicode-signatures
;; Sort signatures in order of decreasing length,
;; so longer signatures are tested first.
;; This really only makes a difference for UTF-16le vs UTF-32le,
;; but it is harmless in other cases.
(sort
'(((#xEF #xBB #xBF) . #+clisp charset:utf-8 #-clisp :utf-8)
((#xFE #xFF) . #+clisp charset:unicode-16-big-endian #+ecl :UCS-2BE #+cmucl :utf-16-be #-(or clisp ecl cmucl) :utf-16be)
((#xFF #xFE) . #+clisp charset:unicode-16-little-endian #+ecl :UCS-2LE #+cmucl :utf-16-le #-(or clisp ecl cmucl) :utf-16le)
((#x00 #x00 #xFE #xFF) . #+clisp charset:unicode-32-big-endian #+ecl :UCS-4BE #+cmucl :utf-32-be #-(or clisp ecl cmucl) :utf-32be)
((#xFF #xFE #x00 #x00) . #+clisp charset:unicode-32-little-endian #+ecl :UCS-4LE #+cmucl :utf-32-le #-(or clisp ecl cmucl) :utf-32le)
;; UTF-7 not known to SBCL, CCL, ECL, or CMUCL
((#x2B #x2F #x76 #x38) . #+clisp charset:utf-7 #-clisp :utf-7)
((#x2B #x2F #x76 #x39) . #+clisp charset:utf-7 #-clisp :utf-7)
((#x2B #x2F #x76 #x2B) . #+clisp charset:utf-7 #-clisp :utf-7)
((#x2B #x2F #x76 #x2F) . #+clisp charset:utf-7 #-clisp :utf-7)
((#x2B #x2F #x76 #x38 #x2D) . #+clisp charset:utf-7 #-clisp :utf-7)
;; UTF-1 not known to Clisp, SBCL, CCL, ECL, or CMUCL
((#xF7 #x64 #x4C) . :utf-1)
;; UTF-EBCDIC not known to Clisp, SBCL, CCL, ECL, or CMUCL
;; SBCL knows "US-EBCDIC" but UTF-EBCDIC is different (right?) so not known to SBCL either
((#xDD #x73 #x66 #x73) . :utf-ebcdic)
;; SCSU not known to Clisp, SBCL, CCL, ECL, or CMUCL
((#x0E #xFE #xFF) . :scsu)
;; BOCU not known to Clisp, SBCL, CCL, ECL, or CMUCL
((#xFB #xEE #x28) . :bocu-1)
;; :CP936 is a subset of :GB-18030 according to Wikipedia, so this is a "best fit"
;; GB-18030 and CP936 not known to CMUCL
((#x84 #x31 #x95 #x33) . #+clisp charset:cp936 #+(or ccl sbcl) :cp936 #+ecl :|cp936| #+abcl :gb18030 #-(or clisp ccl sbcl ecl abcl) :gb-18030))
#'(lambda (a b) (> (length (car a)) (length (car b))))))
(defun sniffer-match (initial-bytes signature-bytes)
(let*
((m (length signature-bytes))
(initial-bytes-subseq (subseq initial-bytes 0 m))
(byte-pairs (mapcar #'(lambda (a b) (list a b)) initial-bytes-subseq signature-bytes)))
(loop for p in byte-pairs
do (if (not (equal (first p) (second p)))
(return-from sniffer-match nil)))
t))
(defun sniffer-match-search (initial-bytes)
(loop for x in unicode-signatures
do (if (sniffer-match initial-bytes (car x))
(return-from sniffer-match-search (cdr x)))))
;; Given a file name F, returns a Unicode encoding designator
;; if the initial bytes of F match any signature in the UNICODE-SIGNATURES table,
;; otherwise NIL.
(defun unicode-sniffer (f)
(with-open-file (s f :element-type '(unsigned-byte 8))
(let*
((signature-length-max (apply #'max (mapcar #'(lambda (x) (length (car x))) unicode-signatures)))
(initial-bytes (loop repeat signature-length-max collect (read-byte s nil))))
(sniffer-match-search initial-bytes))))
;; Expose UNICODE-SNIFFER to Maxima user.
;; Returns the symbol name (i.e., a string) of the encoding,
;; if any was found, otherwise false.
(defun $inferred_encoding (f)
(let ((e (unicode-sniffer f)))
(if e (symbol-name e) "DEFAULT")))
;; Try to verify that the inferred encoding is among
;; the encodings known to this Lisp implementation.
;; If there is no known method to check the encoding
;; for this Lisp implementation, return 'UNKNOWN.
;; Otherwise this function returns a generalized Boolean.
(defun check-encoding (e)
;; work around ECL bug #435: "UCS-4LE not on list of basic encodings"
#+ecl (or (eq e ':ucs-4le) (member e (ext:all-encodings)))
#+ccl (ccl:lookup-character-encoding e)
#+clisp (equal (symbol-package e) (find-package :charset))
#+cmucl (assoc e (ext:list-all-external-formats))
#+sbcl (gethash e sb-impl::*external-formats*)
#+gcl nil ;; GCL 2.6.12 does not recognize :external-format in OPEN
;; work around ABCL bug: "SYSTEM:AVAILABLE-ENCODINGS symbols strangeness" (https://github.com/armedbear/abcl/issues/82)
#+abcl (member (symbol-name e) (mapcar #'symbol-name (system:available-encodings)) :test #'string=)
#-(or ecl ccl clisp cmucl sbcl gcl abcl) 'unknown)
;; Expose CHECK-ENCODING to Maxima user.
;; Argument is an encoding symbol name, such as that returned by $INFERRED_ENCODING.
;; Returns true if the encoding is recognized by the Lisp implementation,
;; false if the encoding is not recognized or the argument is null;
;; if there is no known method to check the encoding, print an error message.
(defun $recognized_encoding_p (e)
(setq e (string-upcase e))
(or
(not (null (string= e "DEFAULT")))
(let ((s (find-symbol e #+clisp :charset #-clisp :keyword)))
(if (not s)
(merror (intl:gettext "recognized_encoding_p: ~M can't be the name of an encoding for this Lisp implementation.") e)
(let ((x (check-encoding s)))
(cond
((eq x 'unknown)
(merror (intl:gettext "recognized_encoding_p: I don't know how to verify encoding for this Lisp implementation.")))
(t
(not (null x)))))))))
|