1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85
|
#!/usr/bin/guile -s
!#
;; sxw2words -- extract the words from an .sxw document
;; Copyright (C) 2004 Andy Wingo <wingo at pobox dot com>
;; This program is free software: you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation, either version 3 of the License, or
;; (at your option) any later version.
;;
;; This program is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;; GNU General Public License for more details.
;;
;; You should have received a copy of the GNU General Public License
;; along with this program. If not, see <http://www.gnu.org/licenses/>.
(use-modules (sxml ssax)
(os process)
(ice-9 rdelim)
(srfi srfi-14))
(or (= (length (program-arguments)) 2)
(begin
(display "usage: sxw2words SXW-FILE\n" (current-error-port))
(exit 1)))
(define sxw-file (cadr (program-arguments)))
(define (get-dict-words)
(let ((port (open-input-file "/usr/share/dict/words")))
(let lp ((words '()) (line (read-line port)))
(if (eof-object? line)
(sort! (reverse! words) string-ci<?)
(lp (cons line words) (read-line port))))))
(define (uniq l)
(let lp ((last-word "") (in l) (out '()))
(cond ((null? in) (reverse! out))
((string-ci=? last-word (car in)) (lp last-word (cdr in) out))
(else (lp (car in) (cdr in) (cons (car in) out))))))
(define trim-char-set (char-set-complement char-set:letter))
(define (get-sxw-words)
((ssax:make-parser
NEW-LEVEL-SEED
(lambda (elem-gi attributes namespaces
expected-content seed)
seed)
FINISH-ELEMENT
(lambda (elem-gi attributes namespaces parent-seed seed)
seed)
CHAR-DATA-HANDLER
(lambda (string1 string2 seed)
(let* ((strs (map
(lambda (x) (string-trim-both x trim-char-set))
(remove!
string-null?
(append-map
(lambda (x) (string-split x #\space))
(string-split string1 #\newline)))))
(seed (append! strs seed)))
(if (string-null? string2) seed
(cons string2 seed)))))
(cdr (run-with-pipe ; "r" for read-only
"r" "unzip" "-p" sxw-file "content.xml"))
'()))
(let lp ((words (uniq (sort! (get-sxw-words) string-ci<?)))
(dict-words (get-dict-words))
(out '()))
(cond
((null? words)
(for-each (lambda (x) (display x) (newline)) (reverse! out)))
((string-ci=? (car words) (car dict-words))
(lp (cdr words) (cdr dict-words) out))
((string-ci>? (car words) (car dict-words))
(lp words (cdr dict-words) out))
(else
(lp (cdr words) dict-words (cons (car words) out)))))
;;; arch-tag: 6c2617d3-32a4-4a4d-8914-48c7ee1b5ad8
|