1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451
|
# encoding: UTF-8
#
## Herein lies all the code responsible for threading messages. It's
## basically an online version of the JWZ threading algorithm:
## http://www.jwz.org/doc/threading.html
##
## I didn't implement it for efficiency, but thanks to our search
## engine backend, it's typically not applied to very many messages at
## once.
##
## At the top level, we have a ThreadSet, which represents a set of
## threads, e.g. a message folder or an inbox. Each ThreadSet contains
## zero or more Threads. A Thread represents all the message related
## to a particular subject. Each Thread has one or more Containers. A
## Container is a recursive structure that holds the message tree as
## determined by the references: and in-reply-to: headers. Each
## Container holds zero or one messages. In the case of zero messages,
## it means we've seen a reference to the message but haven't (yet)
## seen the message itself.
##
## A Thread can have multiple top-level Containers if we decide to
## group them together independent of tree structure, typically if
## (e.g. due to someone using a primitive MUA) the messages have the
## same subject but we don't have evidence from in-reply-to: or
## references: headers. In this case Thread#each can optionally yield
## a faked root object tying them all together into one tree
## structure.
require 'set'
module Redwood
class Thread
include Enumerable
attr_reader :containers
def initialize
## ah, the joys of a multithreaded application with a class called
## "Thread". i keep instantiating the wrong one...
raise "wrong Thread class, buddy!" if block_given?
@containers = []
end
def << c
@containers << c
end
def empty?; @containers.empty?; end
def empty!; @containers.clear; end
def drop c; @containers.delete(c) or raise "bad drop"; end
## unused
def dump f=$stdout
f.puts "=== start thread with #{@containers.length} trees ==="
@containers.each { |c| c.dump_recursive f; f.puts }
f.puts "=== end thread ==="
end
## yields each message, its depth, and its parent. the message yield
## parameter can be a Message object, or :fake_root, or nil (no
## message found but the presence of one deduced from other
## messages).
def each fake_root=false
adj = 0
root = @containers.find_all { |c| c.message && !Message.subj_is_reply?(c.message.subj) }.argmin { |c| c.date }
if root
adj = 1
root.first_useful_descendant.each_with_stuff do |c, d, par|
yield c.message, d, (par ? par.message : nil)
end
elsif @containers.length > 1 && fake_root
adj = 1
yield :fake_root, 0, nil
end
@containers.each do |cont|
next if cont == root
fud = cont.first_useful_descendant
fud.each_with_stuff do |c, d, par|
## special case here: if we're an empty root that's already
## been joined by a fake root, don't emit
yield c.message, d + adj, (par ? par.message : nil) unless
fake_root && c.message.nil? && root.nil? && c == fud
end
end
end
def first; each { |m, *| return m if m }; nil; end
def has_message?; any? { |m, *| m.is_a? Message }; end
def dirty?; any? { |m, *| m && m.dirty? }; end
def date; map { |m, *| m.date if m }.compact.max; end
def snippet
with_snippets = select { |m, *| m && m.snippet && !m.snippet.empty? }
first_unread, * = with_snippets.select { |m, *| m.has_label?(:unread) }.sort_by { |m, *| m.date }.first
return first_unread.snippet if first_unread
last_read, * = with_snippets.sort_by { |m, *| m.date }.last
return last_read.snippet if last_read
""
end
def authors; map { |m, *| m.from if m }.compact.uniq; end
def apply_label t; each { |m, *| m && m.add_label(t) }; end
def remove_label t; each { |m, *| m && m.remove_label(t) }; end
def toggle_label label
if has_label? label
remove_label label
false
else
apply_label label
true
end
end
def set_labels l; each { |m, *| m && m.labels = l }; end
def has_label? t; any? { |m, *| m && m.has_label?(t) }; end
def each_dirty_message; each { |m, *| m && m.dirty? && yield(m) }; end
def direct_participants
map { |m, *| [m.from] + m.to if m }.flatten.compact.uniq
end
def participants
map { |m, *| [m.from] + m.to + m.cc + m.bcc if m }.flatten.compact.uniq
end
def size; map { |m, *| m ? 1 : 0 }.sum; end
def subj; argfind { |m, *| m && m.subj }; end
def labels; inject(Set.new) { |s, (m, *)| m ? s | m.labels : s } end
def labels= l
raise ArgumentError, "not a set" unless l.is_a?(Set)
each { |m, *| m && m.labels = l.dup }
end
def latest_message
inject(nil) do |a, b|
b = b.first
if a.nil?
b
elsif b.nil?
a
else
b.date > a.date ? b : a
end
end
end
def to_s
"<thread containing: #{@containers.join ', '}>"
end
def sort_key
m = latest_message
m ? [-m.date.to_i, m.id] : [-Time.now.to_i, ""]
end
end
## recursive structure used internally to represent message trees as
## described by reply-to: and references: headers.
##
## the 'id' field is the same as the message id. but the message might
## be empty, in the case that we represent a message that was referenced
## by another message (as an ancestor) but never received.
class Container
attr_accessor :message, :parent, :children, :id, :thread
def initialize id
raise "non-String #{id.inspect}" unless id.is_a? String
@id = id
@message, @parent, @thread = nil, nil, nil
@children = []
end
def each_with_stuff parent=nil
yield self, 0, parent
@children.sort_by(&:sort_key).each do |c|
c.each_with_stuff(self) { |cc, d, par| yield cc, d + 1, par }
end
end
def descendant_of? o
if o == self
true
else
@parent && @parent.descendant_of?(o)
end
end
def == o; Container === o && id == o.id; end
def empty?; @message.nil?; end
def root?; @parent.nil?; end
def root; root? ? self : @parent.root; end
## skip over any containers which are empty and have only one child. we use
## this make the threaded display a little nicer, and only stick in the
## "missing message" line when it's graphically necessary, i.e. when the
## missing message has more than one descendent.
def first_useful_descendant
if empty? && @children.size == 1
@children.first.first_useful_descendant
else
self
end
end
def find_attr attr
if empty?
@children.argfind { |c| c.find_attr attr }
else
@message.send attr
end
end
def subj; find_attr :subj; end
def date; find_attr :date; end
def is_reply?; subj && Message.subj_is_reply?(subj); end
def to_s
[ "<#{id}",
(@parent.nil? ? nil : "parent=#{@parent.id}"),
(@children.empty? ? nil : "children=#{@children.map { |c| c.id }.inspect}"),
].compact.join(" ") + ">"
end
def dump_recursive f=$stdout, indent=0, root=true, parent=nil
raise "inconsistency" unless parent.nil? || parent.children.include?(self)
unless root
f.print " " * indent
f.print "+->"
end
line = "[#{thread.nil? ? ' ' : '*'}] " + #"[#{useful? ? 'U' : ' '}] " +
if @message
message.subj ##{@message.refs.inspect} / #{@message.replytos.inspect}"
else
"<no message>"
end
f.puts "#{id} #{line}"#[0 .. (105 - indent)]
indent += 3
@children.each { |c| c.dump_recursive f, indent, false, self }
end
def sort_key
empty? ? [Time.now.to_i, ""] : [@message.date.to_i, @message.id]
end
end
## A set of threads, so a forest. Is integrated with the index and
## builds thread structures by reading messages from it.
##
## If 'thread_by_subj' is true, puts messages with the same subject in
## one thread, even if they don't reference each other. This is
## helpful for crappy MUAs that don't set In-reply-to: or References:
## headers, but means that messages may be threaded unnecessarily.
##
## The following invariants are maintained: every Thread has at least one
## Container tree, and every Container tree has at least one Message.
class ThreadSet
attr_reader :num_messages
bool_reader :thread_by_subj
def initialize index, thread_by_subj=true
@index = index
@num_messages = 0
## map from message ids to container objects
@messages = SavingHash.new { |id| Container.new id }
## map from subject strings or (or root message ids) to thread objects
@threads = SavingHash.new { Thread.new }
@thread_by_subj = thread_by_subj
end
def thread_for_id mid; @messages.member?(mid) && @messages[mid].root.thread end
def contains_id? id; @messages.member?(id) && !@messages[id].empty? end
def thread_for m; thread_for_id m.id end
def contains? m; contains_id? m.id end
def threads; @threads.values end
def size; @threads.size end
def dump f=$stdout
@threads.each do |s, t|
f.puts "**********************"
f.puts "** for subject #{s} **"
f.puts "**********************"
t.dump f
end
end
## link two containers
def link p, c, overwrite=false
if p == c || p.descendant_of?(c) || c.descendant_of?(p) # would create a loop
#puts "*** linking parent #{p.id} and child #{c.id} would create a loop"
return
end
#puts "in link for #{p.id} to #{c.id}, perform? #{c.parent.nil?} || #{overwrite}"
return unless c.parent.nil? || overwrite
remove_container c
p.children << c
c.parent = p
## if the child was previously a top-level container, it now ain't,
## so ditch our thread and kill it if necessary
prune_thread_of c
end
private :link
def remove_container c
c.parent.children.delete c if c.parent # remove from tree
end
private :remove_container
def prune_thread_of c
return unless c.thread
c.thread.drop c
@threads.delete_if { |k, v| v == c.thread } if c.thread.empty?
c.thread = nil
end
private :prune_thread_of
def remove_id mid
return unless @messages.member?(mid)
c = @messages[mid]
remove_container c
prune_thread_of c
end
def remove_thread_containing_id mid
return unless @messages.member?(mid)
c = @messages[mid]
t = c.root.thread
@threads.delete_if { |key, thread| t == thread }
end
## load in (at most) num number of threads from the index
def load_n_threads num, opts={}
@index.each_id_by_date opts do |mid, builder|
break if size >= num unless num == -1
next if contains_id? mid
m = builder.call
load_thread_for_message m, :skip_killed => opts[:skip_killed], :load_deleted => opts[:load_deleted], :load_spam => opts[:load_spam]
yield size if block_given?
end
end
## loads in all messages needed to thread m
## may do nothing if m's thread is killed
def load_thread_for_message m, opts={}
good = @index.each_message_in_thread_for m, opts do |mid, builder|
next if contains_id? mid
add_message builder.call
end
add_message m if good
end
## merges in a pre-loaded thread
def add_thread t
raise "duplicate" if @threads.values.member? t
t.each { |m, *| add_message m }
end
## merges two threads together. both must be members of this threadset.
## does its best, heuristically, to determine which is the parent.
def join_threads threads
return if threads.size < 2
containers = threads.map do |t|
c = @messages.member?(t.first.id) ? @messages[t.first.id] : nil
raise "not in threadset: #{t.first.id}" unless c && c.message
c
end
## use subject headers heuristically
parent = containers.find { |c| !c.is_reply? }
## no thread was rooted by a non-reply, so make a fake parent
parent ||= @messages["joining-ref-" + containers.map { |c| c.id }.join("-")]
containers.each do |c|
next if c == parent
c.message.add_ref parent.id
link parent, c
end
true
end
def is_relevant? m
m.refs.any? { |ref_id| @messages.member? ref_id }
end
def delete_message message
el = @messages[message.id]
return unless el.message
el.message = nil
end
## the heart of the threading code
def add_message message
el = @messages[message.id]
return if el.message # we've seen it before
#puts "adding: #{message.id}, refs #{message.refs.inspect}"
el.message = message
## link via references:
(message.refs + [el.id]).inject(nil) do |prev, ref_id|
ref = @messages[ref_id]
link prev, ref if prev
ref
end
## link via in-reply-to:
message.replytos.each do |ref_id|
ref = @messages[ref_id]
link ref, el, true
break # only do the first one
end
root = el.root
key =
if thread_by_subj?
Message.normalize_subj root.subj
else
root.id
end
## check to see if the subject is still the same (in the case
## that we first added a child message with a different
## subject)
if root.thread
if @threads.member?(key) && @threads[key] != root.thread
@threads.delete key
end
else
thread = @threads[key]
thread << root
root.thread = thread
end
## last bit
@num_messages += 1
end
end
end
|