1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264
|
;;;; HTML scan calls procedures for word, tag, whitespac, and newline.
;;; Copyright 2002 Aubrey Jaffer
;
;Permission to copy this software, to modify it, to redistribute it,
;to distribute modified versions, and to use it for any purpose is
;granted, subject to the following restrictions and understandings.
;
;1. Any copy made of this software must include this copyright notice
;in full.
;
;2. I have made no warranty or representation that the operation of
;this software will be error-free, and I am under no obligation to
;provide any services, by way of maintenance, update, or otherwise.
;
;3. In conjunction with products arising from the use of this
;material, there shall be no use of my name in any advertising,
;promotional, or sales literature without prior written consent in
;each case.
(require 'line-i/o)
(require 'string-port)
(require 'scanf)
(require-if 'compiling 'string-case)
;;@code{(require 'html-for-each)}
;;@ftindex html-for-each
;;@body
;;@1 is an input port or a string naming an existing file containing
;;HTML text.
;;@2 is a procedure of one argument or #f.
;;@3 is a procedure of one argument or #f.
;;@4 is a procedure of one argument or #f.
;;@5 is a procedure of no arguments or #f.
;;
;;@0 opens and reads characters from port @1 or the file named by
;;string @1. Sequential groups of characters are assembled into
;;strings which are either
;;
;;@itemize @bullet
;;@item
;;enclosed by @samp{<} and @samp{>} (hypertext markups or comments);
;;@item
;;end-of-line;
;;@item
;;whitespace; or
;;@item
;;none of the above (words).
;;@end itemize
;;
;;Procedures are called according to these distinctions in order of
;;the string's occurrence in @1.
;;
;;@5 is called with no arguments for end-of-line @emph{not within a
;;markup or comment}.
;;
;;@4 is called with strings of non-newline whitespace.
;;
;;@3 is called with hypertext markup strings (including @samp{<} and
;;@samp{>}).
;;
;;@2 is called with the remaining strings.
;;
;;@0 returns an unspecified value.
(define (html-for-each file word-proc markup-proc white-proc newline-proc)
(define nl (string #\newline))
(define (string-index str . chrs)
(define len (string-length str))
(do ((pos 0 (+ 1 pos)))
((or (>= pos len) (memv (string-ref str pos) chrs))
(and (< pos len) pos))))
(define (proc-words line edx)
(let loop ((idx 0))
(define ldx idx)
(do ((idx idx (+ 1 idx)))
((or (>= idx edx)
(not (char-whitespace? (string-ref line idx))))
(do ((jdx idx (+ 1 jdx)))
((or (>= jdx edx)
(char-whitespace? (string-ref line jdx)))
(and white-proc (not (= ldx idx))
(white-proc (substring line ldx idx)))
(and word-proc (not (= idx jdx))
(word-proc (substring line idx jdx)))
(if (< jdx edx) (loop jdx))))))))
((if (input-port? file) call-with-open-ports call-with-input-file)
file
(lambda (iport)
(do ((line (read-line iport) (read-line iport)))
((eof-object? line))
(do ((idx (string-index line #\<) (string-index line #\<)))
((not idx) (proc-words line (string-length line)))
; seen '<'
(proc-words line idx)
(let ((trm (if (and (<= (+ 4 idx) (string-length line))
(string=? "<!--" (substring line idx (+ 4 idx))))
"-->" #\>)))
(let loop ((lne (substring line idx (string-length line)))
(tag "")
(quot #f))
(define edx (or (eof-object? lne)
(if quot
(string-index lne quot)
(if (char? trm)
(string-index lne #\" #\' #\>)
(string-index lne #\>)))))
(cond
((not edx) ; still inside tag
;;(print quot trm 'within-tag lne)
(loop (read-line iport)
(and markup-proc (string-append tag lne nl))
quot))
((eqv? #t edx) ; EOF
;;(print quot trm 'eof lne)
(slib:error 'unterminated 'HTML 'entity file)
(and markup-proc (markup-proc tag)))
((eqv? quot (string-ref lne edx)) ; end of quoted string
;;(print quot trm 'end-quote lne)
(set! edx (+ 1 edx))
(loop (substring lne edx (string-length lne))
(and markup-proc
(string-append tag (substring lne 0 edx)))
#f))
((not (eqv? #\> (string-ref lne edx))) ; start of quoted
;;(print quot trm 'start-quote lne)
(set! edx (+ 1 edx))
(loop (substring lne edx (string-length lne))
(and markup-proc
(string-append tag (substring lne 0 edx)))
(string-ref lne (+ -1 edx))))
((or (and (string? trm) ; found matching '>' or '-->'
(<= 2 edx)
(equal? trm (substring lne (+ -2 edx) (+ 1 edx))))
(eqv? (string-ref lne edx) trm))
;;(print quot trm 'end-> lne)
(set! edx (+ 1 edx))
(and markup-proc
(markup-proc (string-append tag (substring lne 0 edx))))
; process words after '>'
(set! line (substring lne edx (string-length lne))))
(else
;;(print quot trm 'within-comment lne)
(set! edx (+ 1 edx))
(loop (substring lne edx (string-length lne))
(and markup-proc
(string-append tag (substring lne 0 edx)))
#f))))))
(and newline-proc (newline-proc))))))
;;@args file limit
;;@args file
;;@1 is an input port or a string naming an existing file containing
;;HTML text. If supplied, @2 must be an integer. @2 defaults to
;;1000.
;;
;;@0 opens and reads HTML from port @1 or the file named by string @1,
;;until reaching the (mandatory) @samp{TITLE} field. @0 returns the
;;title string with adjacent whitespaces collapsed to one space. @0
;;returns #f if the title field is empty, absent, if the first
;;character read from @1 is not @samp{#\<}, or if the end of title is
;;not found within the first (approximately) @2 words.
(define (html:read-title file . limit)
(set! limit (if (null? limit) 1000 (* 2 (car limit))))
((if (input-port? file) call-with-open-ports call-with-input-file)
file
(lambda (port)
(and (eqv? #\< (peek-char port))
(call-with-current-continuation
(lambda (return)
(define (cnt . args)
(if (negative? limit)
(return #f)
(set! limit (+ -1 limit))))
(define capturing? #f)
(define text '())
(html-for-each
port
(lambda (str)
(cnt)
(if capturing? (set! text (cons " " (cons str text)))))
(lambda (str)
(cnt)
(cond ((prefix-ci? "<title" str)
(set! capturing? #t))
((prefix-ci? "</title" str)
(return (and (not (null? text))
(apply string-append
(reverse (cdr text))))))
((or (prefix-ci? "</head" str)
(prefix-ci? "<body" str))
(return #f))))
cnt
cnt)
#f))))))
(define (prefix-ci? pre str)
(define prelen (string-length pre))
(and (< prelen (string-length str))
(string-ci=? pre (substring str 0 prelen))))
;;@body
;;@1 is a hypertext markup string.
;;
;;If @1 is a (hypertext) comment or DTD, then @0 returns #f.
;;Otherwise @0 returns the hypertext element string consed onto an
;;association list of the attribute name-symbols and values. If the
;;tag ends with "/>", then "/" is appended to the hypertext element
;;string. The name-symbols are created by @code{string-ci->symbol}.
;;Each value is a string; or #t if the name had no value
;;assigned within the markup.
(define (htm-fields htm)
(require 'string-case)
(and
(not (and (> (string-length htm) 3) (equal? "<!" (substring htm 0 2))))
(call-with-input-string htm
(lambda (port)
(define element #f)
(define fields '())
(cond ((not (eqv? 1 (fscanf port "<%s" element)))
(slib:error 'htm-fields 'strange htm)))
(let loop ((chr (peek-char port)))
(define name #f)
(define junk #f)
(define value #t)
(cond
((eof-object? chr)
(cond ((and element
(eqv? (string-ref element
(+ -1 (string-length element)))
#\>))
(cons (substring element 0 (+ -1 (string-length element)))
fields))
(else
(slib:warn 'htm-fields 'missing '> htm)
(if element
(cons element (reverse fields))
(reverse fields)))))
((eqv? #\> chr) (cons element (reverse fields)))
((eqv? #\/ chr)
(set! element (string-append element (string (read-char port))))
(loop (peek-char port)))
((char-whitespace? chr) (read-char port) (loop (peek-char port)))
((case (fscanf port "%[-a-zA-Z0-9:] %[=] %[-.a-zA-Z0-9]"
name junk value)
((3 1) #t)
((2)
(case (peek-char port)
((#\") (cond ((eqv? 1 (fscanf port "\"%[^\"]\"" value)))
((eqv? #\" (peek-char port))
(read-char port)
(set! value ""))
(else #f)))
((#\') (cond ((eqv? 1 (fscanf port "'%[^']'" value)))
((eqv? #\' (peek-char port))
(read-char port)
(set! value ""))
(else #f)))
(else #f)))
(else #f))
(set! fields (cons (cons (string-ci->symbol name) value)
fields))
(loop (peek-char port)))
(else (slib:warn 'htm-fields 'bad 'field htm)
(reverse fields))))))))
|