1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679
|
// Package jwz is an implementation of the email threading algorithm created by Jamie Zawinski and explained by him
// at: https://www.jwz.org/doc/threading.html
//
// This package was created by cribbing from the code at:
// https://www.jwz.org/doc/threading.html#:~:text=grendel-1999-05-14.tar.gz
// from the Java source code in view/Threader.java - it contains no ham and cheese sandwiches.
//
// The code, interface etc. was obviously adapted in to Go form, though where possible, the code reflects the
// original Java if it is not too ungolike.
//
// Author: Jim Idle - jimi@idle.ws / jimi@gatherstars.com
//
// See the LICENSE file, sit down, have a scone.
//
package jwz
import (
"errors"
"fmt"
)
// Threader arranges a set of messages into a thread hierarchy, by references.
//
type Threader struct {
rootNode *threadContainer
idTable map[string]*threadContainer
bogusIDCount int
}
// NewThreader returns an instance of the Threader struct, that is ready to attack
// your Threadable
//
//goland:noinspection GoUnusedExportedFunction
func NewThreader() *Threader {
t := &Threader{
idTable: make(map[string]*threadContainer),
}
return t
}
// Thread will create a threadable organized so that the root node
// is the original reference, creating dummy placeholders for the emails
// we don't have yet
//
func (t *Threader) Thread(threadable Threadable) (Threadable, error) {
if threadable == nil {
return nil, nil
}
// Build a thread container from this single email
//
if !threadable.IsDummy() {
if err := t.buildContainer(threadable); err != nil {
return nil, err
}
} else {
return nil, errors.New("cannot thread a single email with a dummy root")
}
var err error
// Organize the root set from what we have
//
t.rootNode, err = t.findRootSet()
if err != nil {
return nil, err
}
// We no longer need the map - probably no real need to blank it here, but the original Java code did that,
// and it won't harm to let the GC reclaim this in case our caller keeps the *Threader around for some reason
//
t.idTable = nil
// We do this to avoid flipping the input order each time through.
//
t.rootNode.reverseChildren()
// There should not be a next in the root of a conversation thread
//
if t.rootNode.next != nil {
return nil, fmt.Errorf("root node contains a next and should not: %#v", t.rootNode)
}
// Because the result of this function is a tree that does actually contain dummies for missing references
// we need to add a dummy threadable for any node that does not yet have one. Then we can flush the chain
// of containers in to the threadable
//
t.rootNode.fillDummy(threadable)
var result Threadable
if t.rootNode.child != nil {
result = t.rootNode.child.threadable
}
// Flush the tree structure of each element of the root set down into
// their underlying Threadables
//
_ = t.rootNode.flush()
t.rootNode = nil
return result, nil
}
// ThreadSlice will thread the set of messages contained within threadableSlice.
// The Threadable returned is the new first element of the root set.
//
func (t *Threader) ThreadSlice(threadableSlice []Threadable) (Threadable, error) {
if threadableSlice == nil || len(threadableSlice) == 0 {
return nil, nil
}
// Iterate all the Threadable represented by the root and build the
// threadContainer from them
//
for _, nt := range threadableSlice {
if !nt.IsDummy() {
if err := t.buildContainer(nt); err != nil {
return nil, err
}
}
}
return t.threadRoot()
}
// ThreadRoot will thread the set of messages provided by ThreadableRoot.
// The Threadable returned is the new first element of the root set.
//
func (t *Threader) ThreadRoot(threadableRoot ThreadableRoot) (Threadable, error) {
if threadableRoot == nil {
return nil, nil
}
// Iterate all the Threadable represented by the root and build the
// threadContainer from them
//
for threadableRoot.Next() == true {
nt := threadableRoot.Get()
if !nt.IsDummy() {
if err := t.buildContainer(nt); err != nil {
return nil, err
}
}
}
return t.threadRoot()
}
func (t *Threader) threadRoot() (Threadable, error) {
var err error
// Organize the root set from what we have
//
t.rootNode, err = t.findRootSet()
if err != nil {
return nil, err
}
// We no longer need the map - probably no real need to blank it here, but the original Java code did that,
// and it won't harm to let the GC reclaim this in case our caller keeps the *Threader around for some reason
//
t.idTable = nil
// Get rid of any empty containers. They should no longer needed
//
t.pruneEmptyContainers(t.rootNode)
// We do this so to avoid flipping the input order each time through.
//
t.rootNode.reverseChildren()
// We might need to sort on subjects, so let's process them
//
t.gatherSubjects()
// There should not be a next in the root of a conversation thread
//
if t.rootNode.next != nil {
return nil, fmt.Errorf("root node contains a next and should not: %#v", t.rootNode)
}
for r := t.rootNode.child; r != nil; r = r.next {
// If this direct child of the root node has no threadable in it,
// manufacture a dummy container to bind its children together.
// Note that these dummies can only ever occur as elements of
// the root set.
//
if r.threadable == nil {
r.threadable = r.child.threadable.MakeDummy(r.forID)
}
}
var result Threadable
if t.rootNode.child != nil {
result = t.rootNode.child.threadable
}
// Flush the tree structure of each element of the root set down into
// their underlying Threadables
//
_ = t.rootNode.flush()
t.rootNode = nil
return result, nil
}
// buildContainer() does three things:
//
// - It walks the tree of Threadable, and wraps each in a
// threadContainer object.
// - It indexes each threadContainer object in the idTable, under
// the message ID of the contained Threadable.
// - For each of the references within Threadable, it ensures that there
// is a threadContainer in the table (an empty one, if necessary.)
//
func (t *Threader) buildContainer(threadable Threadable) error {
var present bool
// See if we already have a container for this threadable
//
id := threadable.MessageThreadID()
tid := id
c, present := t.idTable[id]
if present {
// There is already a ThreadContainer in the table for this ID.
// Under normal circumstances, there will be no IThreadable in it
// (since it was a forward reference from a References field.)
//
// If there is already a threadable in it, then that means there
// are two IThreadables with the same ID. Generate a new ID for
// this one, sigh... This ID is only used to cause the two entries
// in the hash table to not stomp each other.
//
if c.threadable != nil {
id = fmt.Sprintf("<Bogus-id:%d>", t.bogusIDCount)
t.bogusIDCount++
c = nil
} else {
c.threadable = threadable
}
}
// Create a ThreadContainer for this Threadable, and store it in
// the map
//
if c == nil {
c = &threadContainer{forID: tid}
c.threadable = threadable
c.forID = tid
t.idTable[id] = c
}
// Create ThreadContainers for each of the references which don't
// have them. Link each of the referenced messages together in the
// order implied by the references field, unless they are already
// linked.
//
var parentRef, ref *threadContainer
// Iterate through the references field of the threadable and see if we
// already have a reference to them in our map. Create one if not
//
refs := threadable.MessageThreadReferences()
for _, refString := range refs {
ref, present = t.idTable[refString]
if !present {
ref = &threadContainer{forID: refString}
t.idTable[refString] = ref
}
// If we have references A B C D, make D be a child of C, etc.,
// except if they have parents already.
//
if parentRef != nil && // there is a parent
ref.parent == nil && // don't have a parent already
parentRef != ref && // not a tight loop
!ref.findChild(parentRef) && // already linked
!parentRef.findChild(ref) { // not a wide loop
// Ok, link it into the parent's child list.
//
ref.parent = parentRef
ref.next = parentRef.child
parentRef.child = ref
}
parentRef = ref
}
// At this point `parentRef' is set to the container of the last element
// in the references field. Make that be the parent of this container,
// unless doing so would introduce a circularity.
//
if parentRef != nil &&
(parentRef == c ||
c.findChild(parentRef)) {
parentRef = nil
}
if c.parent != nil {
// If it has a parent already, that's there because we saw this message
// in a references field, and presumed a parent based on the other
// entries in that field. Now that we have the actual message, we can
// be more definitive, so throw away the old parent and use this new one.
// Find this container in the parent's child-list, and unlink it.
//
// Note that this could cause this message to now have no parent, if it
// has no references field, but some message referred to it as the
// non-first element of its references. (Which would have been some
// kind of lie...)
//
var rest, prev *threadContainer
for prev, rest = nil, c.parent.child; rest != nil; {
if rest == c {
break
}
prev = rest
rest = rest.next
}
if rest == nil {
return fmt.Errorf("didn't find %#v in parent %#v", c, c.parent)
}
if prev == nil {
c.parent.child = c.next
} else {
prev.next = c.next
}
c.next = nil
c.parent = nil
}
// If we have a parent, link c into the parent's child list.
//
if parentRef != nil {
c.parent = parentRef
c.next = parentRef.child
parentRef.child = c
}
// No error
//
return nil
}
// findRootSet finds the root set of the threadContainers, and returns a root node.
//
// NB: A container is in the root set if it has no parents.
//
func (t *Threader) findRootSet() (*threadContainer, error) {
root := &threadContainer{}
for _, c := range t.idTable {
if c.parent == nil {
if c.next != nil {
return nil, fmt.Errorf("container has no parent, but has a next value: %#v", c.next)
}
c.next = root.child
root.child = c
}
}
return root, nil
}
// Walk through the threads and discard any empty container objects.
// After calling this, there will only be any empty container objects
// at depth 0, and those will all have at least two kids.
//
func (t *Threader) pruneEmptyContainers(parent *threadContainer) {
var prev *threadContainer
var container = parent.child
var next *threadContainer
if container != nil {
next = container.next
}
for container != nil {
if container.threadable == nil && container.child == nil {
// This is an empty container with no kids. Nuke it.
//
// Normally such containers won't occur, but they can show up when
// two messages have References lines that disagree. For example,
// assuming A and B are messages, and 1, 2, and 3 are references for
// messages we haven't seen:
//
// A has refs: 1 2 3
// B has refs: 1 3
//
// There is ambiguity whether 3 is a child of 1 or 2. So,
// depending on the processing order, we might end up with either
//
// -- 1
// |-- 2
// |-- 3
// |-- A
// |-- B
// or
// -- 1
// |-- 2 <--- non-root childless container
// |-- 3
// |-- A
// |-- B
//
if prev == nil {
parent.child = container.next
} else {
prev.next = container.next
}
// Set container to prev so that prev keeps its same value
// the next time through the loop.
//
container = prev
} else if container.threadable == nil && // expired, and
container.child != nil && // has kids, and
(container.parent != nil || // not at root, or
container.child.next == nil) { // only one kid
// Expired message with kids. Promote the kids to this level.
// Don't do this if we would be promoting them to the root level,
// unless there is only one kid.
//
var tail *threadContainer
var kids = container.child
// Remove this container from the list, replacing it with `kids'
//
if prev == nil {
parent.child = kids
} else {
prev.next = kids
}
// Make each child's parent be this level's parent.
// Make the last child's next be this container's next
// - splicing `kids' into the list in place of `container'
//
for tail = kids; tail.next != nil; tail = tail.next {
tail.parent = container.parent
}
tail.parent = container.parent
tail.next = container.next
// Since we've inserted items in the chain, `next' currently points
// to the item after them (tail.next); reset that so that we process
// the newly promoted items the very next time around.
//
next = kids
// Set container to prev so that prev keeps its same value
// the next time through the loop.
//
container = prev
} else if container.child != nil {
// A real message with kids.
// Iterate over its children, and try to strip out the junk.
//
t.pruneEmptyContainers(container)
}
// Set up for the next iteration
//
prev = container
container = next
if container == nil {
next = nil
} else {
next = container.next
}
}
}
// If any two members of the root set have the same subject, merge them.
// This is so that messages which don't have References headers at all
// still get threaded (to the extent possible, at least.)
//
func (t *Threader) gatherSubjects() {
var count int
subjTable := make(map[string]*threadContainer)
for c := t.rootNode.child; c != nil; c = c.next {
threadable := c.threadable
// If there is no threadable, this is a dummy node in the root set.
// Only root set members may be dummies, and they always have at least
// two kids. Take the first kid as representative of the subject.
//
if threadable == nil {
threadable = c.child.threadable
}
subj := threadable.SimplifiedSubject()
if subj == "" {
continue
}
old := subjTable[subj]
// Add this container to the table if:
// - There is no container in the table with this subject, or
// - This one is a dummy container and the old one is not: the dummy
// one is more interesting as a root, so put it in the table instead.
// - The container in the table has a "Re:" version of this subject,
// and this container has a non-"Re:" version of this subject.
// The non-re version is the more interesting of the two.
//
if old == nil ||
(c.threadable == nil && old.threadable != nil) ||
(old.threadable != nil && old.threadable.SubjectIsReply() &&
c.threadable != nil && !c.threadable.SubjectIsReply()) {
subjTable[subj] = c
count++
}
}
// We are done if the table is empty
//
if count == 0 {
return
}
// The subj_table is now populated with one entry for each subject which
// occurs in the root set. Now iterate over the root set, and gather
// together the difference.
//
var prev, c, rest *threadContainer
prev = nil
c = t.rootNode.child
rest = c.next
for c != nil {
threadable := c.threadable
// might be a dummy -- see above
//
if threadable == nil {
threadable = c.child.threadable
}
subj := threadable.SimplifiedSubject()
// Don't thread together all subject-less messages; let them dangle.
//
if subj != "" {
old := subjTable[subj]
if old != c { // Avoid processing ourselves
// Ok, so now we have found another container in the root set with
// the same subject. There are a few possibilities:
//
// - If both are dummies, append one's children to the other, and remove
// the now-empty container.
//
// - If one container is a dummy and the other is not, make the non-dummy
// one be a child of the dummy, and a sibling of the other "real"
// messages with the same subject (the dummy's children.)
//
// - If that container is a non-dummy, and that message's subject does
// not begin with "Re:", but *this* message's subject does, then
// make this be a child of the other.
//
// - If that container is a non-dummy, and that message's subject begins
// with "Re:", but *this* message's subject does *not*, then make that
// be a child of this one -- they were mis-ordered. (This happens
// somewhat implicitly, since if there are two messages, one with Re:
// and one without, the one without will be in the hash table,
// regardless of the order in which they were seen.)
//
// - Otherwise, make a new dummy container and make both messages be a
// child of it. This catches the both-are-replies and neither-are-
// replies cases, and makes them be siblings instead of asserting a
// hierarchical relationship which might not be true.
//
// (People who reply to a message without using "Re:" and without using
// a References line will break this slightly. Those people suck.)
//
// (It has occurred to me that taking the date or message number into
// account would be one way of resolving some ambiguous cases,
// but that's not altogether straightforward either.)
// JI: You cannot rely on the clock settings being correct on a server/client that sent a message
//
// Remove the "second" message from the root set.
if prev == nil {
t.rootNode.child = c.next
} else {
prev.next = c.next
}
c.next = nil
if old.threadable == nil && c.threadable == nil {
// They're both dummies; merge them.
//
var tail *threadContainer
for tail = old.child; tail != nil && tail.next != nil; tail = tail.next {
}
tail.next = c.child
for tail = c.child; tail != nil; tail = tail.next {
tail.parent = old
}
c.child = nil
} else if old.threadable == nil || // old is empty, or
(c.threadable != nil &&
c.threadable.SubjectIsReply() && // c has Re, and
!old.threadable.SubjectIsReply()) { // old does not.
// Make this message be a child of the other.
c.parent = old
c.next = old.child
old.child = c
} else {
// Make the old and new messages be children of a new dummy container.
// We do this by creating a new container object for old->msg and
// transforming the old container into a dummy (by merely emptying it),
// so that the table still points to the one that is at depth 0
// instead of depth 1.
//
newC := &threadContainer{}
newC.threadable = old.threadable
newC.child = old.child
for tail := newC.child; tail != nil; tail = tail.next {
tail.parent = newC
}
old.threadable = nil
old.child = nil
c.parent = old
newC.parent = old
// old is now a dummy; make it have exactly two kids, c and newC.
//
old.child = c
c.next = newC
}
// we've done a merge, so keep the same `prev' next time around.
//
c = prev
}
}
prev = c
c = rest
if rest != nil {
rest = rest.next
}
}
}
|