File: cleanHTML.kbs

package info (click to toggle)
basic256 1.1.4.0-2
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 24,460 kB
  • sloc: cpp: 10,148; yacc: 3,023; java: 1,091; lex: 1,051; sh: 117; xml: 33; makefile: 15
file content (138 lines) | stat: -rw-r--r-- 4,576 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
# cleanHTML.kbs - Clean the HTML downloaded from the documentation WIKI
#
# modification History
# date...... programmer... description
# 2014-08-06 j.m.reneau    original coding
#
# READ THE README file in this folder for a description of how to use
# this utility
#
## BE SURE THE WGET COMPLETED BEFORE RUNNING THIS PROGRAM


# Refuse to run on an interpreter older than the one named in the message
# (version() is compared against 1010207).  Stop right after reporting the
# problem instead of carrying on with the rest of the script.
if version() < 1010207 then
   print "requires version 1.1.2.7 or higher"
   end
endif
# set debug to true to trace every match / delete / skip decision made by
# the processing loop further down
debug = false

# d$ - regular expressions to remove from the downloaded HTML files.
# A match is removed only if it does not contain a match of any other
# of the expressions.  This way nested tags get removed in a better
# order using simple regex.
#
# These regex statements require the minimal (non-greedy)
# setting in the QRegExp library (see regexminimal statement)
#
# nd is incremented before each assignment, so once the table is built
# it holds the index of the last rule; the processing loop walks the
# rules from 0 to nd.  The array is dimensioned for 100 entries.
#
dim d$[100]
nd=-1
# HTML comments, the literal doku.php@id= text, forms and scripts
nd++: d$[nd]= '<!--.*-->'
nd++: d$[nd]= 'doku\.php@id='
nd++: d$[nd]= '<form.*<\/form>'
nd++: d$[nd]= '<script.*<\/script>'
# div, ul and span elements selected by their class or id attribute
nd++: d$[nd]= '<div class=\"no\".*<\/div>'
nd++: d$[nd]= '<div id=\"dokuwiki__usertools\">.*<\/div>'
nd++: d$[nd]= '<div id=\"dokuwiki__sitetools\">.*<\/div>'
nd++: d$[nd]= '<div id=\"dokuwiki__pagetools\">.*<\/div>'
nd++: d$[nd]= '<div class=\"tools group\">.*<\/div>'
nd++: d$[nd]= '<div class=\"mobileTools\">.*<\/div>'
nd++: d$[nd]= '<div class=\"trace\">.*<\/div>'
nd++: d$[nd]= '<div class=\"tools\">.*<\/div>'
nd++: d$[nd]= '<div class=\"buttons\">.*<\/div>'
nd++: d$[nd]= '<div class=\"inclmeta.*<\/div>'
nd++: d$[nd]= '<ul class=\"a11y skip\">.*<\/ul>'
nd++: d$[nd]= '<span class=\"tooltip\">.*<\/span>'
# meta tags and link tags with the listed rel values
# (no rule here names rel="stylesheet")
nd++: d$[nd]= '<meta name=\".*>'
nd++: d$[nd]= '<link rel=\"search.*>'
nd++: d$[nd]= '<link rel=\"alternate.*>'
nd++: d$[nd]= '<link rel=\"canonical.*>'
nd++: d$[nd]= '<link rel=\"start.*>'
nd++: d$[nd]= '<link rel=\"contents.*>'
nd++: d$[nd]= '<link rel=\"shortcut.*>'
nd++: d$[nd]= '<link rel=\"apple-touch.*>'
# text running from fetch.php up to media= and from .php up to tseed=
nd++: d$[nd]= 'fetch\.php.*media='
nd++: d$[nd]= '\.php.*tseed='
# anchors whose href starts with one of these external http:// addresses
nd++: d$[nd]= '<a href=\"http:\/\/www\.dokuwiki\.org.*<\/a>'
nd++: d$[nd]= '<a href=\"http:\/\/www\.php\.net.*<\/a>'
nd++: d$[nd]= '<a href=\"http:\/\/validator.*<\/a>'
nd++: d$[nd]= '<a href=\"http:\/\/jigsaw.*<\/a>'
nd++: d$[nd]= '<a href=\"http:\/\/dokuwiki\.org.*<\/a>'

# Main loop: every file in ./wiki whose name matches ^doku.*html$ is read
# into html$, stripped of everything matched by the d$ rules, tidied up
# and written to ./help under a cleaned-up name.
file$ = dir('./wiki')
while file$ <> ""
   if instrx(file$,"^doku.*html$") <> 0 then
      print "processing " + file$
      #
      # read the whole file into one string, with a chr(13) after each line
      open "./wiki/" + file$
      html$ = ""
      while not eof
         line$ = readline
         # strip trailing characters with a code below 32; the string is
         # tested for emptiness first so asc() and left() are never
         # applied to an empty line
         while line$ <> ""
            if asc(right(line$,1)) >= 32 then exit while
            line$ = left(line$,length(line$)-1)
         end while
         html$ += line$ + chr(13)
      end while
      close
      # size before cleaning (no newline, the size after cleaning follows)
      print length(html$);
      #
      # find all occurrences of the expressions in the d$ array
      # and delete them - if one contains another skip it until the
      # inner expressions are deleted
      #
      regexminimal true
      #
      # Repeat full passes over the rules for as long as a pass deletes
      # something.  needchange is set only when text is really removed:
      # a pass that merely skips matches leaves html$ untouched, so
      # running it again could never make progress.
      do
         needchange = false
         for t = 0 to nd
            pos = instrx(html$,d$[t])
            while pos <> 0
               part$ = midx(html$, d$[t], pos)
               if debug then print pos + " " + d$[t] + " " + left(part$,10) + "..." + right(part$,10) + " " + length(part$)
               # the match may only be deleted if no other rule matches
               # inside it
               doit = true
               for u = 0 to nd
                  if u <> t then
                     if instrx(part$,d$[u]) <> 0 then
                        doit = false
                        exit for
                     endif
                  endif
               next u
               if doit then
                  if debug then print "doing " + t + " " + left(part$,10) + "..." + right(part$,10) + " " + length(part$)
                  html$ = replace(html$,part$,"")
                  needchange = true
                  # the text has shifted left, so search again from pos
                  pos = instrx(html$, d$[t], pos)
                  # sanity check: stop the whole run once the page no
                  # longer contains the text stylesheet
                  # NOTE(review): presumably guards against a rule that
                  # swallowed the stylesheet link - confirm intent
                  if instr(html$,"stylesheet") = 0 then
                     print "stopping - no stylesheet text left in " + file$ + " after rule " + t
                     end
                  endif
               else
                  if debug then print "skipping " + left(part$,10) + "..." + right(part$,10) + " " + length(part$) + " rule " + u
                  # leave this match in place and look for the next one
                  pos = instrx(html$, d$[t], pos+1)
               endif
            end while
         next t
      until not needchange
      #
      # do some character translations and cleaning up
      html$ = replace(html$,"%3A","_")
      html$ = replace(html$,"%253A","_")
      # drop blanks in front of a line end
      while instr(html$, " "+chr(13)) <> 0
         html$ = replace(html$," "+chr(13),chr(13))
      end while
      # collapse runs of empty lines
      while instr(html$, chr(13)+chr(13)) <> 0
         html$ = replace(html$,chr(13)+chr(13),chr(13))
      end while
      #
      # write file out into HTML FOLDER with good name
      print " " + length(html$)
      # keep the name from its 13th character on (presumably this drops
      # the 12 character doku.php@id= prefix - confirm against the wget
      # output) and apply the same %3A translations as in the page body
      f$ = mid(file$,13,999)
      f$ = replace(f$,"%3A","_")
      f$ = replace(f$,"%253A","_")
      print f$
      open "./help/" + f$
      # NOTE(review): reset presumably empties an already existing output
      # file before the new content is written - confirm
      reset
      write html$
      close
      #
   endif
   # an argument-less dir() fetches the next name from the listing
   file$ = dir()
end while