File: bintrie-builder.lisp

package info (click to toggle)
ruby-unf-ext 0.0.7.7-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, trixie
  • size: 5,472 kB
  • sloc: cpp: 14,118; lisp: 1,180; ruby: 94; makefile: 4
file content (168 lines) | stat: -rw-r--r-- 5,615 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
(defpackage dawg.bintrie-builder
  (:use :common-lisp :dawg.global)
  (:export build-from-file
           build-from-list
           collect-children
           node-label
           node-terminal?
           node-sibling-total
           node-child
           node-options
           element-count))
(in-package :dawg.bintrie-builder)

(package-alias :dawg.octet-stream :stream)

;;;;;;;;;;;;;;;
;;; declamation
(declaim #.*fastest*
         (inline make-node collect-children calc-child-total calc-sibling-total 
                 node-options element-count))

;;;;;;;;
;;; node
(defstruct node
  (label         0 :type octet)
  (terminal?   nil :type boolean)
  (child       nil :type (or null node))
  (sibling     nil :type (or null node))
  (child-total   0 :type positive-fixnum) ; amount of child side nodes
  (sibling-total 0 :type positive-fixnum) ; amount of sibling side nodes
  (value -1 :type fixnum)
  (hash         -1 :type fixnum))

;;;;;;;;;;;;;;;;;;;;;;
;;; auxiliary function
(macrolet ((calc-xxx-total (node slot)
             `(with-slots (,slot) (the node ,node)
                (if (null ,slot)
                    0
                  (the positive-fixnum
                       (+ (if (node-terminal? ,slot) 1 0)
                          (node-child-total ,slot) (node-sibling-total ,slot)))))))
  (defun calc-child-total (node) (calc-xxx-total node child))
  (defun calc-sibling-total (node) (calc-xxx-total node sibling)))

;;;;;;;;;;;;;;;;;
;;; hash function
(defun node= (n1 n2)
  (and (eq (node-child n1) (node-child n2))
       (eq (node-sibling n1) (node-sibling n2))
       (= (node-value n1) (node-value n2))
       (= (node-label n1) (node-label n2))
       (eq (node-terminal? n1) (node-terminal? n2))))

(defun sxhash-node (node)
  (if (null node)
      #.(sxhash nil)
    (with-slots (hash child-total sibling-total) (the node node)
      (when (= -1 hash)
        (setf hash (logxor (sxhash (node-label node))
                           (sxhash (node-value node))
                           (sxhash (node-terminal? node))
                           (fixnumize (* (sxhash-node (node-child node)) 7))
                           (fixnumize (* (sxhash-node (node-sibling node)) 13))))
        (setf child-total (calc-child-total node)
              sibling-total (calc-sibling-total node)))
      hash)))

;;;;;;;;;;;;;;;;;;
;;; build function
(defun share (node memo)
  (if (null node)
      nil
    (or (dict:get node memo)
        (progn 
          (setf (node-child node) (share (node-child node) memo)
                (node-sibling node) (share (node-sibling node) memo))
          (dict:get node memo))
        (setf (dict:get node memo) node))))

(defun push-child (in parent value)
  (if (stream:eos? in)
      (setf (node-terminal? parent) t
            (node-value parent) value)
    (let ((new-node (make-node :label (stream:read in))))
      (shiftf (node-sibling new-node) (node-child parent) new-node)
      (push-child in new-node value))))

(defun insert (in parent memo value)
  (let ((node (node-child parent)))
    (if (or (null node)
            (stream:eos? in)
            (/= (stream:peek in) (node-label node)))
        (progn
          (setf (node-child parent) (share node memo))
          (push-child in parent value))
      (insert (stream:eat in) node memo value))))

(defun build-impl (key-generator show-progress)
    (loop WITH trie = (make-node)
          WITH memo = (dict:make :test #'node= :hash #'sxhash-node)
          FOR num fixnum FROM 0
          FOR (key . value) = (funcall key-generator)
          WHILE key
      DO
      (when (and show-progress (zerop (mod num 100000)))
        (format t "~&;  ~A~%" num))
      (let ((in (stream:make key)))
        (declare (dynamic-extent in))
        (insert in trie memo value))

      FINALLY
      (return (share trie memo))))

(defun build-from-list (keyset &key show-progress)
  (when show-progress
    (format t "~&; build trie list (size ~A):~%" (length keyset)))
  (build-impl (lambda () (prog1 (car keyset)
                           (setf keyset (cdr keyset))))
              show-progress))
  
(defun build-from-file (filepath &key show-progress)
  (when show-progress
    (format t "~&; build trie from ~A:~%" filepath))
  (with-open-file (in filepath)
    (build-impl (lambda () (read-line in nil nil)) 
                show-progress)))

;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; other external function
(defun node-options (node)
  "Encode terminal? and sibling-total fields into fixnum"
  (with-slots (terminal? sibling-total) (the node node)
    (fixnumize
     (+ (if terminal? 1 0)
        (ash sibling-total 1)))))

(defun element-count (node)
  (with-slots (terminal? child-total) (the node node)
    (the fixnum (+ (if terminal? 1 0) child-total))))
        
(defun collect-children (node)
  (loop WITH acc = '()
        FOR child = (node-child node)
               THEN (node-sibling child)
        WHILE child
    DO
    (push child acc)
    FINALLY
    (return acc)))

;;;;;;;;;;;;;
;;; for debug
(defun member? (key trie)
  (declare #.*interface*
           (simple-characters key)
           (node trie))
  (let ((in (stream:make key)))
    (declare (dynamic-extent in))
    (nlet recur ((in in) (node (node-child trie)) (parent trie))
      (cond ((stream:eos? in) (node-terminal? parent))
            ((null node) nil)
            ((= (stream:peek in) (node-label node))
             (recur (stream:eat in) (node-child node) node))
            ((< (stream:peek in) (node-label node))
             (recur in (node-sibling node) parent))))))

(package-alias :dawg.octet-stream)