1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
|
#!/usr/bin/env newlisp
; Retrieves a page with 20 links via Google containing
; the search word, then spawns 20 child processes, each
; retrieving one page from a link. All pages are then
; HTML-cleaned, tokenized and words are counted using
; bayes-train.
; USAGE: query <search-word>
; or rename to query.cgi and run on a web server
; using the following query.html:
; NOTE!!! Google changes the formatting of links (see LINK_REGEX) once
; in a while and then the whole thing breaks!
;<html>
;<head><title>Query</title></head>
;<body><font face="Verdana">
;<p>Parallel processing support on newLISP using 'spawn'</p>
;<form name="query" action="query.cgi" method="POST">
;Input a search word: <input type="text" name="query" size="48">
; <input type="submit" value="Submit" name="submit">
;</form>
;</font></body></html>
(set 'cgi-mode nil)
;(set 'cgi-mode true) ; run as CGI get question from query.html
(when cgi-mode
(module "cgi.lsp")
(print "Content-Type: text/html\n\n"))
;; globals and constants in this namespace
(set 'LINK_REGEX {q=(http://[^>& ]+)&})
(set 'START_TIME 0)
(set 'SITES '())
(set 'CONTENT "")
(constant 'BRK (if cgi-mode "<br>" ""))
(define (url-decode str)
(replace "%([0-9A-F][0-9A-F])" str (char (int $1 0 16)) 1))
(define (url-encode str)
(replace {([^a-zA-Z0-9])} str (format "%%%2X" (char $1)) 0))
(define (clean-html text)
(replace "<script.*?</script>" text "" 4)
(replace "<style.*?</style>" text "" 4)
(replace "<head.*?</head>" text "" 4)
(replace "<[^>]*>" text " " 0)
(replace "." text " . ")
(replace "," text " , ")
(replace "!" text " ! ")
(replace "?" text " ? ")
(replace "\n" text " ")
(replace "\r" text "")
(replace "\t" text " ")
(replace " +" text " " 4)
; this should be a list of all ISO 8859-1 Symbols
; see http://www.w3schools.com/tags/ref_entities.asp
; HTML entities should be replaced with UTF-8
(replace " " text " ")
(replace "—" text " - ")
(replace "–" text " - ")
(replace """ text "'")
(replace "&" text "&")
;(replace "‎" text (char 8206))
;(replace {\&#(\d+);} text (char (int $1)) 0)
)
(if cgi-mode
(set 'question (CGI:get "query"))
(set 'question (last (parse (main-args -1) "/"))) ; default is 'query'
)
(set 'page (get-url
(string "http://www.google.com/search?q=" (url-encode question)
"&num=22&ie=UTF-8&oe=UTF-8")))
(set 'links (find-all LINK_REGEX page $1))
(when cgi-mode
(println {<html><font face="Verdana"><head>})
(println {<META http-equiv="Content-Type" CONTENT="text/html; charset=utf-8" />})
(println {</head><body>}))
(println "query term: " (url-decode question) BRK)
(set 'START_TIME (time-of-day))
######################### this is, where all the pages are retrieved #################
; spawn a childprocess for each link
(dolist (lnk (0 20 links))
(set 'pid (spawn 'page (get-url lnk 4000)))
(push (list pid lnk) SITES -1))
; this gets executed whenever a page has been retrieved
(define (page-ready-event pid)
(let (link (0 80 (lookup pid SITES)))
(set 'link (url-decode link))
(println (inc cnt) " pid:" pid " " (- (time-of-day) START_TIME) " ms " link BRK)
(push (lower-case (clean-html page)) CONTENT -1)
(inc xferred (length page)))
)
; start waiting for pages
(println "waiting: ..." BRK BRK)
(unless (sync 10000 page-ready-event)
(println BRK "timeout" BRK))
#####################################################################################
(println BRK "bytes transferred: " xferred BRK)
(println "total time: " (- (time-of-day) START_TIME) " ms" BRK)
######################### all the counting is done here using bayes-train ###########
(catch (load "Lex") 'result)
(println "training: " (time (bayes-train (find-all {[a-zA-Z]+} CONTENT) 'Lex)) " ms" BRK)
(println "total words processed: " (Lex:total 0) BRK)
(println BRK)
#####################################################################################
; sort by frequency and print in four columns
(set 'items (sort (Lex) (fn (x y) (> (x 1 0) (y 1 0)))))
(set 'items (transpose (explode items (ceil (div (length items) 4)))))
(when cgi-mode (println "<pre>"))
(dolist (line items)
(dolist (item line)
(if (and item (< (length (item 0)) 19))
(print (format "%-18s %5d, " (item 0) (item 1 0)))
(print (format "%-18s , " " ")))) ; nil
(println))
(when cgi-mode
(println "</pre>")
(println "</body></font></html>"))
; if accumulating results over various queries
;(save "Lex" 'Lex)
(exit)
|