1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225
|
#!/usr/bin/env ruby
# vim:encoding=UTF-8:
$KCODE = "u" if RUBY_VERSION < "1.9" # only applies to Ruby < 1.9; per the original note, the json library relies on $KCODE
require 'uri'
require 'net/http'
require 'stringio'
require 'zlib'
require 'nkf'
# Client for a single thread on a 2ch-style bulletin board.
#
# The thread is addressed by its read.cgi URI; the board name and thread
# number are taken from the URI path and used to download the raw ".dat"
# file (one post per line, fields separated by "<>").  Downloaded lines are
# cached in memory and parsed lazily into Line structs by #[].
class ThreadData
  # Raised by #retrieve when the server answers with HTTP 302.
  class UnknownThread < StandardError; end

  # User-Agent header sent with every request.
  USER_AGENT = 'Monazilla/1.00 (2ig.rb/0.0e)'.freeze

  # Distance between a full-width alphanumeric code point and its ASCII
  # counterpart (e.g. U+FF21 - 0xFEE0 == "A").
  FULLWIDTH_OFFSET = 0xFEE0

  # Code point ranges of full-width digits, upper-case and lower-case letters.
  FULLWIDTH_RANGES = [0xFF10..0xFF19, 0xFF21..0xFF3A, 0xFF41..0xFF5A].freeze

  # Character entities decoded in post bodies.
  ENTITIES = { 'gt' => ">", 'lt' => "<", 'amp' => "&", 'nbsp' => " " }.freeze

  # Post numbers scanned for links when guessing the next thread.
  RECENT_POSTS = (900..999)

  attr_accessor :uri
  attr_accessor :last_modified, :size

  # One parsed post.  +n+ is the 1-based post number, +opts+ is the fifth
  # "<>"-separated field (used as the thread subject on post 1) and +id+ is
  # the poster ID extracted from +misc+ (nil when absent).
  Line = Struct.new(:n, :name, :mail, :misc, :body, :opts, :id) do
    # Heuristically decides whether the body is ASCII art: it must contain
    # at least three newlines and more than 60% of its characters must be
    # outside the "ordinary text" set (digits, Latin letters, kana, kanji,
    # ">" and newline).
    def aa?
      text = body
      return false if text.count("\n") < 3

      significants = text.scan(/[>\n0-9a-zA-Zぁ-んァ-ン一-龠]/u).size.to_f
      # "." does not match newlines, so the denominator excludes them.
      body_length = text.scan(/./u).size
      (1 - significants / body_length) > 0.6
    end
  end

  # thread_uri:: URI of the thread, expected to look like
  #              http://host/test/read.cgi/BOARD/NUMBER/
  def initialize(thread_uri)
    @uri = URI(thread_uri)
    # Path segments: "", "test", "read.cgi", board, thread number.
    _, _, _, @board, @num, = *@uri.path.split('/')
    @dat = []
  end

  # Number of raw lines (posts) currently cached.
  def length
    @dat.length
  end

  # Subject of the thread, taken from the first post.  Triggers a full
  # download when nothing has been fetched yet.  Returns "" when the subject
  # is unavailable.
  def subject
    retrieve(true) if @dat.empty?
    first = self[1]
    (first && first.opts) || ""
  end

  # Returns post number +n+ (1-based) as a Line, or nil when that post is not
  # cached.  HTML in the body is reduced to plain text: <br> becomes a
  # newline, other tags are dropped, per-line surrounding whitespace is
  # stripped and a few character entities are decoded.
  def [](n)
    # Guard against n <= 0: a negative array index would silently return a
    # line counted from the end.
    return nil if n < 1

    raw = @dat[n - 1]
    return nil unless raw

    name, mail, misc, body, opts = *raw.split(/<>/)
    # Malformed lines may lack trailing fields; treat them as empty.
    id = misc.to_s[/ID:([^\s]+)/, 1]
    text = body.to_s.gsub(/<br>/, "\n")
    text = text.gsub(/<[^>]+>/, "")
    text = text.gsub(/^\s+|\s+$/, "")
    text = text.gsub(/&(gt|lt|amp|nbsp);/) { ENTITIES[$1] }
    Line.new(n, name, mail, misc, text, opts, id)
  end

  # Thread number (the dat file's base name) as parsed from the URI.
  def dat
    @num
  end

  # Downloads the dat file and returns the newly received posts as an Array
  # of Line.
  #
  # force:: when true, discard the cache and download the whole file;
  #         otherwise request only what changed since the previous call
  #         (If-Modified-Since / Range).
  #
  # Returns [] when nothing new is available or the status is unexpected.
  # Raises UnknownThread on HTTP 302.
  def retrieve(force=false)
    # A forced download starts over; without this reset the complete file
    # would be appended to the lines already cached.
    @dat = [] if force

    res = Net::HTTP.start(@uri.host, @uri.port) do |http|
      req = Net::HTTP::Get.new('/%s/dat/%d.dat' % [@board, @num])
      req['User-Agent'] = USER_AGENT
      # gzip is only requested for the initial download (no size known yet).
      req['Accept-Encoding'] = 'gzip' unless @size
      unless force
        req['If-Modified-Since'] = @last_modified if @last_modified
        req['Range'] = "bytes=%d-" % @size if @size
      end
      http.request(req)
    end

    case res.code.to_i
    when 200, 206
      body = res.body
      if res['Content-Encoding'] == 'gzip'
        body = StringIO.open(body, 'rb') {|io| Zlib::GzipReader.new(io).read }
      end
      @last_modified = res['Last-Modified']
      # Track the byte offset for the next Range request, measured before
      # the text is converted to UTF-8.
      if res.code == '206'
        @size += body.bytesize
      else
        @size = body.bytesize
      end
      body = NKF.nkf('-w', body)
      first_new = @dat.size + 1
      @dat.concat(body.split(/\n/))
      (first_new..@dat.size).map {|n| self[n] }
    when 416 # requested range not satisfiable: probably a post was deleted
      p ['416']
      retrieve(true)
      []
    when 304 # Not modified
      []
    when 302 # redirected: the thread has dropped out of the live board
      p ['302', res['Location']]
      raise UnknownThread
    else
      p ['Unknown Status:', res.code]
      []
    end
  end

  # Returns a copy of +subject+ in which full-width digits and Latin letters
  # are replaced by their ASCII equivalents; all other characters are kept.
  def canonicalize_subject(subject)
    subject.unpack("U*").map {|cp|
      fullwidth_alnum?(cp) ? cp - FULLWIDTH_OFFSET : cp
    }.pack("U*")
  end

  # Ranks the threads listed in the board's subject.txt by how likely each
  # one is to be the successor of this thread.
  #
  # Returns an Array of Hashes (:uri, :dat, :subject, :distance,
  # :continuous_num, :appear_recent, :score) sorted by ascending :score, so
  # the best candidate comes first.  Lines of subject.txt that cannot be
  # parsed are skipped.
  def guess_next_thread
    res = Net::HTTP.start(@uri.host, @uri.port) do |http|
      req = Net::HTTP::Get.new('/%s/subject.txt' % @board)
      req['User-Agent'] = USER_AGENT
      http.request(req)
    end

    # Resolve the subject before scanning posts: it triggers the initial
    # download when nothing is cached, so the link scan sees real data.
    own_subject = self.subject
    current_subject = canonicalize_subject(own_subject)
    current_thread_rev = current_subject.scan(/\d+/).map {|d| d.to_i }
    current = current_subject.scan(/./u)
    recent_posted_threads = recent_thread_links

    listing = NKF.nkf('-w', res.body)
    candidates = listing.split(/\n/).map {|l|
      dat, rest = *l.split(/<>/)
      # Each line is expected to be "NUMBER.dat<>SUBJECT (POSTCOUNT)".
      m = rest && /\A(.+) \((\d+)\)\s*\z/.match(rest)
      next nil unless dat && m

      dat = dat.sub(/\.dat\z/, "")
      subject = m[1]
      uri = "http://#{@uri.host}/test/read.cgi/#{@board}/#{dat}/"
      canonical_subject = canonicalize_subject(subject)
      thread_rev = canonical_subject[/\d+/].to_i

      distance =
        if dat == self.dat
          Float::MAX # this very thread can never be its own successor
        elsif subject == own_subject
          0
        else
          levenshtein(canonical_subject.scan(/./u), current)
        end
      # A number in our subject that is exactly one less than the
      # candidate's first number suggests a "part N+1" thread.
      continuous_num = current_thread_rev.find {|rev| rev == thread_rev - 1 }
      appear_recent = recent_posted_threads[uri]

      score = distance
      score -= 10 if continuous_num
      score -= 10 if appear_recent
      score += 10 if dat.to_i < self.dat.to_i # older than this thread
      {
        :uri => uri,
        :dat => dat,
        :subject => subject,
        :distance => distance,
        :continuous_num => continuous_num,
        :appear_recent => appear_recent,
        :score => score.to_f
      }
    }.compact

    candidates.sort_by {|o| o[:score] }
  end

  # Levenshtein edit distance between two indexable sequences +a+ and +b+
  # (insertions, deletions and substitutions each cost 1).
  def levenshtein(a, b)
    return b.length if a.empty?
    return a.length if b.empty?
    return 0 if a == b

    # d[i][j] is the distance between the first i items of a and the first
    # j items of b.
    d = Array.new(a.length + 1) { Array.new(b.length + 1, 0) }
    (0..a.length).each {|i| d[i][0] = i }
    (0..b.length).each {|j| d[0][j] = j }

    (1..a.length).each do |i|
      (1..b.length).each do |j|
        cost = (a[i - 1] == b[j - 1]) ? 0 : 1
        d[i][j] = [
          d[i - 1][j] + 1,
          d[i][j - 1] + 1,
          d[i - 1][j - 1] + cost
        ].min
      end
    end
    d[a.length][b.length]
  end

  private

  # True when +codepoint+ is a full-width digit or Latin letter.
  def fullwidth_alnum?(codepoint)
    FULLWIDTH_RANGES.any? {|range| range.include?(codepoint) }
  end

  # Maps each thread URI linked from posts RECENT_POSTS on this host to the
  # number of the last post mentioning it.  Links written as "ttp://..."
  # (without the leading "h") are recognised as well.
  def recent_thread_links
    link = %r|ttp://#{Regexp.escape(@uri.host)}/test/read\.cgi/[^/]+/\d+/|
    links = {}
    RECENT_POSTS.each do |i|
      line = self[i]
      next unless line
      line.body.scan(link) {|found| links["h#{found}"] = i }
    end
    links
  end
end
# Command-line entry point: prints the candidate successor threads for the
# thread URI given as the first argument (best candidate last), followed by
# the thread's own subject.
if __FILE__ == $0
  require 'pp'
  # Fail with a readable message instead of an exception from URI(nil).
  abort "usage: #{File.basename($0)} THREAD_URI" if ARGV.empty?
  thread = ThreadData.new(ARGV[0])
  pp thread.guess_next_thread.reverse
  p thread.subject
end
|