1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168
|
(defpackage dawg.bintrie-builder
(:use :common-lisp :dawg.global)
(:export build-from-file
build-from-list
collect-children
node-label
node-terminal?
node-sibling-total
node-child
node-options
element-count))
(in-package :dawg.bintrie-builder)
(package-alias :dawg.octet-stream :stream)
;;;;;;;;;;;;;;;
;;; declamation
(declaim #.*fastest*
(inline make-node collect-children calc-child-total calc-sibling-total
node-options element-count))
;;;;;;;;
;;; node
(defstruct node
(label 0 :type octet)
(terminal? nil :type boolean)
(child nil :type (or null node))
(sibling nil :type (or null node))
(child-total 0 :type positive-fixnum) ; amount of child side nodes
(sibling-total 0 :type positive-fixnum) ; amount of sibling side nodes
(value -1 :type fixnum)
(hash -1 :type fixnum))
;;;;;;;;;;;;;;;;;;;;;;
;;; auxiliary function
(macrolet ((calc-xxx-total (node slot)
`(with-slots (,slot) (the node ,node)
(if (null ,slot)
0
(the positive-fixnum
(+ (if (node-terminal? ,slot) 1 0)
(node-child-total ,slot) (node-sibling-total ,slot)))))))
(defun calc-child-total (node) (calc-xxx-total node child))
(defun calc-sibling-total (node) (calc-xxx-total node sibling)))
;;;;;;;;;;;;;;;;;
;;; hash function
(defun node= (n1 n2)
(and (eq (node-child n1) (node-child n2))
(eq (node-sibling n1) (node-sibling n2))
(= (node-value n1) (node-value n2))
(= (node-label n1) (node-label n2))
(eq (node-terminal? n1) (node-terminal? n2))))
(defun sxhash-node (node)
(if (null node)
#.(sxhash nil)
(with-slots (hash child-total sibling-total) (the node node)
(when (= -1 hash)
(setf hash (logxor (sxhash (node-label node))
(sxhash (node-value node))
(sxhash (node-terminal? node))
(fixnumize (* (sxhash-node (node-child node)) 7))
(fixnumize (* (sxhash-node (node-sibling node)) 13))))
(setf child-total (calc-child-total node)
sibling-total (calc-sibling-total node)))
hash)))
;;;;;;;;;;;;;;;;;;
;;; build function
(defun share (node memo)
(if (null node)
nil
(or (dict:get node memo)
(progn
(setf (node-child node) (share (node-child node) memo)
(node-sibling node) (share (node-sibling node) memo))
(dict:get node memo))
(setf (dict:get node memo) node))))
(defun push-child (in parent value)
(if (stream:eos? in)
(setf (node-terminal? parent) t
(node-value parent) value)
(let ((new-node (make-node :label (stream:read in))))
(shiftf (node-sibling new-node) (node-child parent) new-node)
(push-child in new-node value))))
(defun insert (in parent memo value)
(let ((node (node-child parent)))
(if (or (null node)
(stream:eos? in)
(/= (stream:peek in) (node-label node)))
(progn
(setf (node-child parent) (share node memo))
(push-child in parent value))
(insert (stream:eat in) node memo value))))
(defun build-impl (key-generator show-progress)
(loop WITH trie = (make-node)
WITH memo = (dict:make :test #'node= :hash #'sxhash-node)
FOR num fixnum FROM 0
FOR (key . value) = (funcall key-generator)
WHILE key
DO
(when (and show-progress (zerop (mod num 100000)))
(format t "~&; ~A~%" num))
(let ((in (stream:make key)))
(declare (dynamic-extent in))
(insert in trie memo value))
FINALLY
(return (share trie memo))))
(defun build-from-list (keyset &key show-progress)
(when show-progress
(format t "~&; build trie list (size ~A):~%" (length keyset)))
(build-impl (lambda () (prog1 (car keyset)
(setf keyset (cdr keyset))))
show-progress))
(defun build-from-file (filepath &key show-progress)
(when show-progress
(format t "~&; build trie from ~A:~%" filepath))
(with-open-file (in filepath)
(build-impl (lambda () (read-line in nil nil))
show-progress)))
;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; other external function
(defun node-options (node)
"Encode terminal? and sibling-total fields into fixnum"
(with-slots (terminal? sibling-total) (the node node)
(fixnumize
(+ (if terminal? 1 0)
(ash sibling-total 1)))))
(defun element-count (node)
(with-slots (terminal? child-total) (the node node)
(the fixnum (+ (if terminal? 1 0) child-total))))
(defun collect-children (node)
(loop WITH acc = '()
FOR child = (node-child node)
THEN (node-sibling child)
WHILE child
DO
(push child acc)
FINALLY
(return acc)))
;;;;;;;;;;;;;
;;; for debug
(defun member? (key trie)
(declare #.*interface*
(simple-characters key)
(node trie))
(let ((in (stream:make key)))
(declare (dynamic-extent in))
(nlet recur ((in in) (node (node-child trie)) (parent trie))
(cond ((stream:eos? in) (node-terminal? parent))
((null node) nil)
((= (stream:peek in) (node-label node))
(recur (stream:eat in) (node-child node) node))
((< (stream:peek in) (node-label node))
(recur in (node-sibling node) parent))))))
(package-alias :dawg.octet-stream)
|