File: jwz.go

package info (click to toggle)
golang-github-gatherstars-com-jwz 1.4.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 13,744 kB
  • sloc: makefile: 3
file content (679 lines) | stat: -rw-r--r-- 19,225 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
// Package jwz is an implementation of the email threading algorithm created by Jamie Zawinski and explained by him
// at: https://www.jwz.org/doc/threading.html
//
// This package was created by cribbing from the code at:
//     https://www.jwz.org/doc/threading.html#:~:text=grendel-1999-05-14.tar.gz
// from the Java source code in view/Threader.java - it contains no ham and cheese sandwiches.
//
// The code, interface etc. was obviously adapted in to Go form, though where possible, the code reflects the
// original Java if it is not too ungolike.
//
// Author: Jim Idle - jimi@idle.ws / jimi@gatherstars.com
//
// See the LICENSE file, sit down, have a scone.
//
package jwz

import (
	"errors"
	"fmt"
)

// Threader arranges a set of messages into a thread hierarchy, by references.
//
type Threader struct {
	rootNode     *threadContainer
	idTable      map[string]*threadContainer
	bogusIDCount int
}

// NewThreader returns an instance of the Threader struct, that is ready to attack
// your Threadable
//
//goland:noinspection GoUnusedExportedFunction
func NewThreader() *Threader {

	t := &Threader{
		idTable: make(map[string]*threadContainer),
	}
	return t
}

// Thread will create a threadable organized  so that the root node
// is the original reference, creating dummy placeholders for the emails
// we don't have yet
//
func (t *Threader) Thread(threadable Threadable) (Threadable, error) {

	if threadable == nil {
		return nil, nil
	}

	// Build a thread container from this single email
	//
	if !threadable.IsDummy() {
		if err := t.buildContainer(threadable); err != nil {
			return nil, err
		}
	} else {
		return nil, errors.New("cannot thread a single email with a dummy root")
	}

	var err error

	// Organize the root set from what we have
	//
	t.rootNode, err = t.findRootSet()
	if err != nil {
		return nil, err
	}

	// We no longer need the map - probably no real need to blank it here, but the original Java code did that,
	// and it won't harm to let the GC reclaim this in case our caller keeps the *Threader around for some reason
	//
	t.idTable = nil

	// We do this to avoid flipping the input order each time through.
	//
	t.rootNode.reverseChildren()

	// There should not be a next in the root of a conversation thread
	//
	if t.rootNode.next != nil {
		return nil, fmt.Errorf("root node contains a next and should not: %#v", t.rootNode)
	}

	// Because the result of this function is a tree that does actually contain dummies for missing references
	// we need to add a dummy threadable for any node that does not yet have one. Then we can flush the chain
	// of containers in to the threadable
	//
	t.rootNode.fillDummy(threadable)

	var result Threadable
	if t.rootNode.child != nil {
		result = t.rootNode.child.threadable
	}

	// Flush the tree structure of each element of the root set down into
	// their underlying Threadables
	//
	_ = t.rootNode.flush()
	t.rootNode = nil

	return result, nil
}

// ThreadSlice will thread the set of messages contained within threadableSlice.
// The Threadable returned is the new first element of the root set.
//
func (t *Threader) ThreadSlice(threadableSlice []Threadable) (Threadable, error) {

	if threadableSlice == nil || len(threadableSlice) == 0 {
		return nil, nil
	}

	// Iterate all the Threadable represented by the root and build the
	// threadContainer from them
	//
	for _, nt := range threadableSlice {
		if !nt.IsDummy() {
			if err := t.buildContainer(nt); err != nil {
				return nil, err
			}
		}
	}
	return t.threadRoot()
}

// ThreadRoot will thread the set of messages provided by ThreadableRoot.
// The Threadable returned is the new first element of the root set.
//
func (t *Threader) ThreadRoot(threadableRoot ThreadableRoot) (Threadable, error) {

	if threadableRoot == nil {
		return nil, nil
	}

	// Iterate all the Threadable represented by the root and build the
	// threadContainer from them
	//
	for threadableRoot.Next() == true {
		nt := threadableRoot.Get()
		if !nt.IsDummy() {
			if err := t.buildContainer(nt); err != nil {
				return nil, err
			}
		}
	}
	return t.threadRoot()
}

func (t *Threader) threadRoot() (Threadable, error) {

	var err error

	// Organize the root set from what we have
	//
	t.rootNode, err = t.findRootSet()
	if err != nil {
		return nil, err
	}

	// We no longer need the map - probably no real need to blank it here, but the original Java code did that,
	// and it won't harm to let the GC reclaim this in case our caller keeps the *Threader around for some reason
	//
	t.idTable = nil

	// Get rid of any empty containers. They should no longer needed
	//
	t.pruneEmptyContainers(t.rootNode)

	// We do this so to avoid flipping the input order each time through.
	//
	t.rootNode.reverseChildren()

	// We might need to sort on subjects, so let's process them
	//
	t.gatherSubjects()

	// There should not be a next in the root of a conversation thread
	//
	if t.rootNode.next != nil {
		return nil, fmt.Errorf("root node contains a next and should not: %#v", t.rootNode)
	}

	for r := t.rootNode.child; r != nil; r = r.next {

		// If this direct child of the root node has no threadable in it,
		// manufacture a dummy container to bind its children together.
		// Note that these dummies can only ever occur as elements of
		// the root set.
		//
		if r.threadable == nil {
			r.threadable = r.child.threadable.MakeDummy(r.forID)
		}
	}

	var result Threadable
	if t.rootNode.child != nil {
		result = t.rootNode.child.threadable
	}

	// Flush the tree structure of each element of the root set down into
	// their underlying Threadables
	//
	_ = t.rootNode.flush()
	t.rootNode = nil

	return result, nil
}

// buildContainer() does three things:
//
//   - It walks the tree of Threadable, and wraps each in a
//     threadContainer object.
//   - It indexes each threadContainer object in the idTable, under
//     the message ID of the contained Threadable.
//    - For each of the references within Threadable, it ensures that there
//     is a threadContainer in the table (an empty one, if necessary.)
//
func (t *Threader) buildContainer(threadable Threadable) error {

	var present bool

	// See if we already have a container for this threadable
	//
	id := threadable.MessageThreadID()
	tid := id

	c, present := t.idTable[id]
	if present {

		// There is already a ThreadContainer in the table for this ID.
		// Under normal circumstances, there will be no IThreadable in it
		// (since it was a forward reference from a References field.)
		//
		// If there is already a threadable in it, then that means there
		// are two IThreadables with the same ID.  Generate a new ID for
		// this one, sigh...  This ID is only used to cause the two entries
		// in the hash table to not stomp each other.
		//
		if c.threadable != nil {
			id = fmt.Sprintf("<Bogus-id:%d>", t.bogusIDCount)
			t.bogusIDCount++
			c = nil
		} else {
			c.threadable = threadable
		}
	}

	// Create a ThreadContainer for this Threadable, and store it in
	// the map
	//
	if c == nil {
		c = &threadContainer{forID: tid}
		c.threadable = threadable
		c.forID = tid
		t.idTable[id] = c
	}

	// Create ThreadContainers for each of the references which don't
	// have them.  Link each of the referenced messages together in the
	// order implied by the references field, unless they are already
	// linked.
	//
	var parentRef, ref *threadContainer

	// Iterate through the references field of the threadable and see if we
	// already have a reference to them in our map. Create one if not
	//
	refs := threadable.MessageThreadReferences()
	for _, refString := range refs {

		ref, present = t.idTable[refString]
		if !present {

			ref = &threadContainer{forID: refString}

			t.idTable[refString] = ref
		}

		// If we have references A B C D, make D be a child of C, etc.,
		// except if they have parents already.
		//
		if parentRef != nil && // there is a parent
			ref.parent == nil && // don't have a parent already
			parentRef != ref && // not a tight loop
			!ref.findChild(parentRef) && // already linked
			!parentRef.findChild(ref) { // not a wide loop

			// Ok, link it into the parent's child list.
			//
			ref.parent = parentRef
			ref.next = parentRef.child
			parentRef.child = ref
		}
		parentRef = ref
	}

	// At this point `parentRef' is set to the container of the last element
	// in the references field.  Make that be the parent of this container,
	// unless doing so would introduce a circularity.
	//
	if parentRef != nil &&
		(parentRef == c ||
			c.findChild(parentRef)) {
		parentRef = nil
	}

	if c.parent != nil {

		// If it has a parent already, that's there because we saw this message
		// in a references field, and presumed a parent based on the other
		// entries in that field.  Now that we have the actual message, we can
		// be more definitive, so throw away the old parent and use this new one.
		// Find this container in the parent's child-list, and unlink it.
		//
		// Note that this could cause this message to now have no parent, if it
		// has no references field, but some message referred to it as the
		// non-first element of its references.  (Which would have been some
		// kind of lie...)
		//
		var rest, prev *threadContainer
		for prev, rest = nil, c.parent.child; rest != nil; {
			if rest == c {
				break
			}
			prev = rest
			rest = rest.next
		}

		if rest == nil {
			return fmt.Errorf("didn't find %#v in parent %#v", c, c.parent)
		}

		if prev == nil {
			c.parent.child = c.next
		} else {
			prev.next = c.next
		}

		c.next = nil
		c.parent = nil
	}

	// If we have a parent, link c into the parent's child list.
	//
	if parentRef != nil {
		c.parent = parentRef
		c.next = parentRef.child
		parentRef.child = c
	}

	// No error
	//
	return nil
}

// findRootSet finds the root set of the threadContainers, and returns a root node.
//
// NB: A container is in the root set if it has no parents.
//
func (t *Threader) findRootSet() (*threadContainer, error) {

	root := &threadContainer{}
	for _, c := range t.idTable {
		if c.parent == nil {
			if c.next != nil {
				return nil, fmt.Errorf("container has no parent, but has a next value: %#v", c.next)
			}
			c.next = root.child
			root.child = c
		}
	}
	return root, nil
}

// Walk through the threads and discard any empty container objects.
// After calling this, there will only be any empty container objects
// at depth 0, and those will all have at least two kids.
//
func (t *Threader) pruneEmptyContainers(parent *threadContainer) {

	var prev *threadContainer
	var container = parent.child
	var next *threadContainer

	if container != nil {
		next = container.next
	}

	for container != nil {

		if container.threadable == nil && container.child == nil {
			// This is an empty container with no kids.  Nuke it.
			//
			// Normally such containers won't occur, but they can show up when
			// two messages have References lines that disagree.  For example,
			// assuming A and B are messages, and 1, 2, and 3 are references for
			// messages we haven't seen:
			//
			//        A has refs: 1 2 3
			//        B has refs: 1 3
			//
			// There is ambiguity whether 3 is a child of 1 or 2.  So,
			// depending on the processing order, we might end up with either
			//
			//        -- 1
			//           |-- 2
			//               |-- 3
			//                   |-- A
			//                   |-- B
			// or
			//        -- 1
			//           |-- 2            <--- non-root childless container
			//           |-- 3
			//               |-- A
			//               |-- B
			//
			if prev == nil {
				parent.child = container.next
			} else {
				prev.next = container.next
			}

			// Set container to prev so that prev keeps its same value
			// the next time through the loop.
			//
			container = prev

		} else if container.threadable == nil && // expired, and
			container.child != nil && // has kids, and
			(container.parent != nil || // not at root, or
				container.child.next == nil) { // only one kid

			// Expired message with kids.  Promote the kids to this level.
			// Don't do this if we would be promoting them to the root level,
			// unless there is only one kid.
			//
			var tail *threadContainer
			var kids = container.child

			// Remove this container from the list, replacing it with `kids'
			//
			if prev == nil {
				parent.child = kids
			} else {
				prev.next = kids
			}

			// Make each child's parent be this level's parent.
			// Make the last child's next be this container's next
			//  - splicing `kids' into the list in place of `container'
			//
			for tail = kids; tail.next != nil; tail = tail.next {
				tail.parent = container.parent
			}
			tail.parent = container.parent
			tail.next = container.next

			// Since we've inserted items in the chain, `next' currently points
			// to the item after them (tail.next); reset that so that we process
			// the newly promoted items the very next time around.
			//
			next = kids

			// Set container to prev so that prev keeps its same value
			// the next time through the loop.
			//
			container = prev

		} else if container.child != nil {

			// A real message with kids.
			// Iterate over its children, and try to strip out the junk.
			//
			t.pruneEmptyContainers(container)
		}

		// Set up for the next iteration
		//
		prev = container
		container = next
		if container == nil {
			next = nil
		} else {
			next = container.next
		}
	}
}

// If any two members of the root set have the same subject, merge them.
// This is so that messages which don't have References headers at all
// still get threaded (to the extent possible, at least.)
//
func (t *Threader) gatherSubjects() {

	var count int

	subjTable := make(map[string]*threadContainer)

	for c := t.rootNode.child; c != nil; c = c.next {

		threadable := c.threadable

		// If there is no threadable, this is a dummy node in the root set.
		// Only root set members may be dummies, and they always have at least
		// two kids.  Take the first kid as representative of the subject.
		//
		if threadable == nil {
			threadable = c.child.threadable
		}

		subj := threadable.SimplifiedSubject()
		if subj == "" {
			continue
		}

		old := subjTable[subj]

		// Add this container to the table if:
		//  - There is no container in the table with this subject, or
		//  - This one is a dummy container and the old one is not: the dummy
		//    one is more interesting as a root, so put it in the table instead.
		//  - The container in the table has a "Re:" version of this subject,
		//    and this container has a non-"Re:" version of this subject.
		//    The non-re version is the more interesting of the two.
		//
		if old == nil ||
			(c.threadable == nil && old.threadable != nil) ||
			(old.threadable != nil && old.threadable.SubjectIsReply() &&
				c.threadable != nil && !c.threadable.SubjectIsReply()) {
			subjTable[subj] = c
			count++
		}
	}

	// We are done if the table is empty
	//
	if count == 0 {
		return
	}

	// The subj_table is now populated with one entry for each subject which
	// occurs in the root set.  Now iterate over the root set, and gather
	// together the difference.
	//
	var prev, c, rest *threadContainer

	prev = nil
	c = t.rootNode.child
	rest = c.next

	for c != nil {

		threadable := c.threadable

		// might be a dummy -- see above
		//
		if threadable == nil {
			threadable = c.child.threadable
		}

		subj := threadable.SimplifiedSubject()

		// Don't thread together all subject-less messages; let them dangle.
		//
		if subj != "" {

			old := subjTable[subj]

			if old != c { // Avoid processing ourselves

				// Ok, so now we have found another container in the root set with
				// the same subject.  There are a few possibilities:
				//
				// - If both are dummies, append one's children to the other, and remove
				//   the now-empty container.
				//
				// - If one container is a dummy and the other is not, make the non-dummy
				//   one be a child of the dummy, and a sibling of the other "real"
				//   messages with the same subject (the dummy's children.)
				//
				// - If that container is a non-dummy, and that message's subject does
				//   not begin with "Re:", but *this* message's subject does, then
				//   make this be a child of the other.
				//
				// - If that container is a non-dummy, and that message's subject begins
				//   with "Re:", but *this* message's subject does *not*, then make that
				//   be a child of this one -- they were mis-ordered.  (This happens
				//   somewhat implicitly, since if there are two messages, one with Re:
				//   and one without, the one without will be in the hash table,
				//   regardless of the order in which they were seen.)
				//
				// - Otherwise, make a new dummy container and make both messages be a
				//   child of it.  This catches the both-are-replies and neither-are-
				//   replies cases, and makes them be siblings instead of asserting a
				//   hierarchical relationship which might not be true.
				//
				//   (People who reply to a message without using "Re:" and without using
				//   a References line will break this slightly.  Those people suck.)
				//
				// (It has occurred to me that taking the date or message number into
				// account would be one way of resolving some ambiguous cases,
				// but that's not altogether straightforward either.)
				// JI: You cannot rely on the clock settings being correct on a server/client that sent a message
				//

				// Remove the "second" message from the root set.
				if prev == nil {
					t.rootNode.child = c.next
				} else {
					prev.next = c.next
				}
				c.next = nil

				if old.threadable == nil && c.threadable == nil {

					// They're both dummies; merge them.
					//
					var tail *threadContainer
					for tail = old.child; tail != nil && tail.next != nil; tail = tail.next {
					}

					tail.next = c.child
					for tail = c.child; tail != nil; tail = tail.next {
						tail.parent = old
					}
					c.child = nil

				} else if old.threadable == nil || // old is empty, or
					(c.threadable != nil &&
						c.threadable.SubjectIsReply() && //   c has Re, and
						!old.threadable.SubjectIsReply()) { //   old does not.

					// Make this message be a child of the other.
					c.parent = old
					c.next = old.child
					old.child = c

				} else {

					// Make the old and new messages be children of a new dummy container.
					// We do this by creating a new container object for old->msg and
					// transforming the old container into a dummy (by merely emptying it),
					// so that the  table still points to the one that is at depth 0
					// instead of depth 1.
					//
					newC := &threadContainer{}

					newC.threadable = old.threadable
					newC.child = old.child
					for tail := newC.child; tail != nil; tail = tail.next {
						tail.parent = newC
					}

					old.threadable = nil
					old.child = nil

					c.parent = old
					newC.parent = old

					// old is now a dummy; make it have exactly two kids, c and newC.
					//
					old.child = c
					c.next = newC
				}

				// we've done a merge, so keep the same `prev' next time around.
				//
				c = prev
			}
		}
		prev = c
		c = rest
		if rest != nil {
			rest = rest.next
		}
	}
}