File: sxw2words

package info (click to toggle)
guile-lib 0.2.8.1-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 1,940 kB
  • sloc: lisp: 7,099; sh: 4,177; makefile: 190
file content (85 lines) | stat: -rwxr-xr-x 2,989 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#!/usr/bin/guile -s
!#
;; sxw2words -- extract the words from an .sxw document
;; Copyright (C) 2004  Andy Wingo <wingo at pobox dot com>

;; This program is free software: you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation, either version 3 of the License, or
;; (at your option) any later version.
;;
;; This program is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;; GNU General Public License for more details.
;;
;; You should have received a copy of the GNU General Public License
;; along with this program.  If not, see <http://www.gnu.org/licenses/>.

(use-modules (sxml ssax)
             (os process)
             (ice-9 rdelim)
             (srfi srfi-14))

(or (= (length (program-arguments)) 2)
    (begin
      (display "usage: sxw2words SXW-FILE\n" (current-error-port))
      (exit 1)))

(define sxw-file (cadr (program-arguments)))

(define (get-dict-words)
  (let ((port (open-input-file "/usr/share/dict/words")))
    (let lp ((words '()) (line (read-line port)))
      (if (eof-object? line)
          (sort! (reverse! words) string-ci<?)
          (lp (cons line words) (read-line port))))))

(define (uniq l)
  (let lp ((last-word "") (in l) (out '()))
    (cond ((null? in) (reverse! out))
          ((string-ci=? last-word (car in)) (lp last-word (cdr in) out))
          (else (lp (car in) (cdr in) (cons (car in) out))))))

(define trim-char-set (char-set-complement char-set:letter))
(define (get-sxw-words)
  ((ssax:make-parser
    NEW-LEVEL-SEED 
    (lambda (elem-gi attributes namespaces
                     expected-content seed)
      seed)
    
    FINISH-ELEMENT
    (lambda (elem-gi attributes namespaces parent-seed seed)
      seed)

    CHAR-DATA-HANDLER
    (lambda (string1 string2 seed)
      (let* ((strs (map
                    (lambda (x) (string-trim-both x trim-char-set))
                    (remove!
                     string-null? 
                     (append-map
                      (lambda (x) (string-split x #\space))
                      (string-split string1 #\newline)))))
             (seed (append! strs seed)))
        (if (string-null? string2) seed
            (cons string2 seed)))))
   (cdr (run-with-pipe ; "r" for read-only
         "r" "unzip" "-p" sxw-file "content.xml"))
   '()))

(let lp ((words (uniq (sort! (get-sxw-words) string-ci<?)))
         (dict-words (get-dict-words))
         (out '()))
  (cond
   ((null? words)
    (for-each (lambda (x) (display x) (newline)) (reverse! out)))
   ((string-ci=? (car words) (car dict-words))
    (lp (cdr words) (cdr dict-words) out))
   ((string-ci>? (car words) (car dict-words))
    (lp words (cdr dict-words) out))
   (else
    (lp (cdr words) dict-words (cons (car words) out)))))

;;; arch-tag: 6c2617d3-32a4-4a4d-8914-48c7ee1b5ad8