; File: query  --  from the Debian source package "newlisp" 10.7.5-2
; (area: main; suites: bookworm, bullseye, forky, sid, trixie;
;  size: 6,248 kB; sloc: ansic 33,280, lisp 4,181, sh 609, makefile 215)
; NOTE: the source-browser page header and its line-number gutter that
; were captured together with this file have been collapsed into this
; comment block; the script proper begins at the shebang line below.
#!/usr/bin/env newlisp

; Retrieves a page with 20 links via Google containing 
; the search word, then spawns 20 child processes, each
; retrieving one page from a link. All pages are then 
; HTML-cleaned, tokenized and words are counted using
; bayes-train.

; USAGE: query <search-word>

; or rename to query.cgi and run on a web server
; using the following query.html:

; NOTE!!! Google changes the formatting of links (see LINK_REGEX) once
; in a while and then the whole thing breaks! 


;<html> 
;<head><title>Query</title></head> 
;<body><font face="Verdana">
;<p>Parallel processing support on newLISP using 'spawn'</p>
;<form name="query" action="query.cgi" method="POST"> 
;Input a search word:&nbsp;<input type="text" name="query" size="48"> 
;&nbsp;<input type="submit" value="Submit" name="submit"> 
;</form> 
;</font></body></html> 

; Operation mode: nil = command line; true = CGI (rename the script
; to query.cgi and serve it together with query.html, see above).
(set 'cgi-mode nil)
;(set 'cgi-mode true) ; run as CGI get question from query.html

; In CGI mode load the CGI helper module and emit the HTTP header
; before producing any other output.
(when cgi-mode 
    (module "cgi.lsp")
    (print "Content-Type: text/html\n\n"))

;; globals and constants in this namespace
(set 'LINK_REGEX {q=(http://[^>& ]+)&})  ; capture group 1 = target URL inside a Google result link
(set 'START_TIME 0)  ; ms-of-day timestamp taken just before spawning the children
(set 'SITES '())     ; list of (pid url) pairs, one entry per spawned child
(set 'CONTENT "")    ; accumulated cleaned text of all retrieved pages
(constant 'BRK (if cgi-mode "<br>" ""))  ; line break suited to the output medium

(define (url-decode str)
  ; Decode a URL/form-encoded string in place and return it:
  ; '+' becomes a space (application/x-www-form-urlencoded), then every
  ; %XX hex escape becomes the character with that code. Regex option 1
  ; (PCRE caseless) lets [0-9A-F] also match lowercase hex digits;
  ; (int $1 0 16) parses the two captured digits as base-16.
  (replace "+" str " ")
  (replace "%([0-9A-F][0-9A-F])" str (char (int $1 0 16)) 1))

(define (url-encode str)  
  ; Percent-encode every non-alphanumeric character of str as %XX and
  ; return the result (replace works destructively on str).
  ; BUGFIX: use "%02X" so codes below 0x10 are zero-padded ("%0A" for
  ; newline); the original "%2X" space-padded them, yielding the
  ; invalid escape "% A".
  (replace {([^a-zA-Z0-9])} str (format "%%%02X" (char $1)) 0))

(define (clean-html text)
	; Strip markup from text (destructively) and normalize whitespace
	; and punctuation so the result can be tokenized word-by-word for
	; bayes-train below.
	; BUGFIX: regex option 5 = PCRE caseless(1) + dotall(4), so the
	; container tags match in any letter case AND across line breaks;
	; the original option 4 missed <SCRIPT>, <Style>, <HEAD>, etc.
	(replace "<script.*?</script>" text "" 5)
	(replace "<style.*?</style>" text "" 5)
	(replace "<head.*?</head>" text "" 5)
	; drop every remaining tag
	(replace "<[^>]*>" text " " 0)
	; surround punctuation with blanks so it tokenizes as its own word
	; (no regex option given, so these are literal string replacements)
	(replace "." text " . ")
	(replace "," text " , ")
	(replace "!" text " ! ")
	(replace "?" text " ? ")
	(replace "\n" text " ")
	(replace "\r" text "")
	(replace "\t" text " ")
	; collapse runs of blanks into one
	(replace " +" text " " 4)
	; this should be a list of all ISO 8859-1 Symbols
	; see http://www.w3schools.com/tags/ref_entities.asp
	; HTML entities should be replaced with UTF-8
	(replace "&nbsp;" text " ")
	(replace "&mdash;" text " - ")
	(replace "&ndash;" text " - ")
	(replace "&quot;" text "'")
	(replace "&amp;" text "&")
	;(replace "&lrm;" text (char 8206))
	;(replace {\&#(\d+);} text (char (int $1)) 0)
)

; Determine the search term: in CGI mode it comes from the POSTed form
; field "query"; otherwise from the last command-line argument, keeping
; only the part after the final "/" -- so running "./query" with no
; extra argument searches for the word "query" itself.
(if cgi-mode
    (set 'question (CGI:get "query"))
    (set 'question (last (parse (main-args -1) "/"))) ; default is 'query'
)
        

; retrieve one Google result page, asking for up to 22 hits
(set 'page (get-url 
	(string "http://www.google.com/search?q=" (url-encode question)
			"&num=22&ie=UTF-8&oe=UTF-8")))

; pull the target URLs out of the result page; $1 is the first capture
; group of LINK_REGEX (defined above) -- the http:// link itself
(set 'links (find-all LINK_REGEX page $1))

; in CGI mode emit the HTML page header before the result lines
(when cgi-mode
    (println {<html><font face="Verdana"><head>})
    (println {<META http-equiv="Content-Type" CONTENT="text/html; charset=utf-8" />})
    (println {</head><body>}))

(println "query term: " (url-decode question) BRK)
; reference point for the per-page and total timing printouts
(set 'START_TIME (time-of-day))

######################### this is, where all the pages are retrieved #################
; spawn a childprocess for each link; (0 20 links) takes at most the
; first 20 entries. Each child fetches one page with a 4000 ms timeout
; and its result is delivered into the symbol 'page when 'sync' reaps
; it; the (pid url) pair is pushed onto SITES so the callback below
; can map a finished pid back to its URL.
(dolist (lnk (0 20 links))
	(set 'pid (spawn 'page (get-url lnk 4000)))
	(push (list pid lnk) SITES -1))

; this gets executed whenever a page has been retrieved: 'sync' calls
; it with the pid of the finished child, whose fetched page text has
; already been assigned to the global symbol 'page (the symbol that
; was passed to 'spawn' above).
(define (page-ready-event pid)
    ; map the pid back to its URL (SITES holds (pid url) pairs) and
    ; keep at most the first 80 characters for display
    (let (link (0 80 (lookup pid SITES)))
        (set 'link (url-decode link))
        ; cnt is never initialized; (inc nil) starts counting at 1
        (println (inc cnt) " pid:" pid " " (- (time-of-day) START_TIME) " ms " link BRK)
        ; append the cleaned, lower-cased page text to the corpus
        (push (lower-case (clean-html page)) CONTENT -1)
        ; tally raw bytes transferred (xferred also starts out nil)
        (inc xferred (length page)))
)

; start waiting for pages: 'sync' blocks for up to 10 s, invoking
; page-ready-event once per finished child, and returns nil on timeout
(println "waiting: ..." BRK BRK)
(unless (sync 10000  page-ready-event)
	(println BRK "timeout" BRK))
#####################################################################################

(println BRK "bytes transferred: " xferred BRK)
(println "total time: " (- (time-of-day) START_TIME) " ms" BRK)

######################### all the counting is done here using bayes-train ###########
; load a previously saved "Lex" context if one exists; 'catch' keeps a
; missing file from aborting the script. bayes-train then adds this
; run's word counts into the Lex context.
(catch (load "Lex") 'result)
(println "training: " (time (bayes-train (find-all {[a-zA-Z]+} CONTENT) 'Lex)) " ms" BRK)
(println "total words processed: " (Lex:total 0) BRK)
(println BRK)
#####################################################################################

; sort by frequency and print in four columns
; (Lex) lists ("word" (count)) pairs; (x 1 0) selects the count
(set 'items (sort (Lex) (fn (x y) (> (x 1 0) (y 1 0)))))
; split into four roughly equal chunks, then transpose so the loop
; below prints row by row across the four columns
; NOTE(review): ceil returns a float that is passed as explode's chunk
; size -- presumably coerced to int; confirm against the newLISP manual
(set 'items (transpose (explode items (ceil (div (length items) 4)))))

(when cgi-mode (println "<pre>"))
(dolist (line items)
    (dolist (item line)
        ; transpose pads short rows with nil; nil cells and words of 19+
        ; characters are printed as an empty cell to keep columns aligned
        (if (and item (< (length (item 0)) 19)) 
            (print (format "%-18s %5d, " (item 0) (item 1 0)))
            (print (format "%-18s      , " " ")))) ; nil
    (println))
		
(when cgi-mode
    (println "</pre>")
    (println "</body></font></html>"))

; if accumulating results over various queries
;(save "Lex" 'Lex)
(exit)