File: Wikidiff2.cpp

package info (click to toggle)
wikidiff2 1.2%2Bgit03ea59f-1
  • links: PTS, VCS
  • area: main
  • in suites: jessie, jessie-kfreebsd
  • size: 276 kB
  • ctags: 240
  • sloc: cpp: 1,357; php: 172; makefile: 13
file content (343 lines) | stat: -rw-r--r-- 8,768 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
/**
 * Diff formatter, based on code by Steinar H. Gunderson, converted to work with the
 * Dairiki diff engine by Tim Starling
 *
 * GPL.
 */

#include <stdio.h>
#include <string.h>
#include "Wikidiff2.h"
#include <thai/thailib.h>
#include <thai/thwchar.h>
#include <thai/thbrk.h>


void Wikidiff2::diffLines(const StringVector & lines1, const StringVector & lines2,
		int numContextLines)
{
	// first do line-level diff
	StringDiff linediff(lines1, lines2);

	int from_index = 1, to_index = 1;

	// Should a line number be printed before the next context line?
	// Set to true initially so we get a line number on line 1
	bool showLineNumber = true;

	for (int i = 0; i < linediff.size(); ++i) {
		int n, j, n1, n2;
		// Line 1 changed, show heading with no leading context
		if (linediff[i].op != DiffOp<String>::copy && i == 0) {
			printBlockHeader(1, 1);
		}

		switch (linediff[i].op) {
			case DiffOp<String>::add:
				// inserted lines
				n = linediff[i].to.size();
				for (j=0; j<n; j++) {
					printAdd(*linediff[i].to[j]);
				}
				to_index += n;
				break;
			case DiffOp<String>::del:
				// deleted lines
				n = linediff[i].from.size();
				for (j=0; j<n; j++) {
					printDelete(*linediff[i].from[j]);
				}
				from_index += n;
				break;
			case DiffOp<String>::copy:
				// copy/context
				n = linediff[i].from.size();
				for (j=0; j<n; j++) {
					if ((i != 0 && j < numContextLines) /*trailing*/
							|| (i != linediff.size() - 1 && j >= n - numContextLines)) /*leading*/ {
						if (showLineNumber) {
							printBlockHeader(from_index, to_index);
							showLineNumber = false;
						}
						printContext(*linediff[i].from[j]);
					} else {
						showLineNumber = true;
					}
					from_index++;
					to_index++;
				}
				break;
			case DiffOp<String>::change:
				// replace, i.e. we do a word diff between the two sets of lines
				n1 = linediff[i].from.size();
				n2 = linediff[i].to.size();
				n = std::min(n1, n2);
				for (j=0; j<n; j++) {
					printWordDiff(*linediff[i].from[j], *linediff[i].to[j]);
				}
				from_index += n;
				to_index += n;
				if (n1 > n2) {
					for (j=n2; j<n1; j++) {
						printDelete(*linediff[i].from[j]);
					}
				} else {
					for (j=n1; j<n2; j++) {
						printAdd(*linediff[i].to[j]);
					}
				}
				break;
		}
		// Not first line anymore, don't show line number by default
		showLineNumber = false;
	}
}

void Wikidiff2::debugPrintWordDiff(WordDiff & worddiff)
{
	for (unsigned i = 0; i < worddiff.size(); ++i) {
		DiffOp<Word> & op = worddiff[i];
		switch (op.op) {
			case DiffOp<Word>::copy:
				result += "Copy\n";
				break;
			case DiffOp<Word>::del:
				result += "Delete\n";
				break;
			case DiffOp<Word>::add:
				result += "Add\n";
				break;
			case DiffOp<Word>::change:
				result += "Change\n";
				break;
		}
		result += "From: ";
		bool first = true;
		for (int j=0; j<op.from.size(); j++) {
			if (first) {
				first = false;
			} else {
				result += ", ";
			}
			result += "(";
			result += op.from[j]->whole() + ")";
		}
		result += "\n";
		result += "To: ";
		first = true;
		for (int j=0; j<op.to.size(); j++) {
			if (first) {
				first = false;
			} else {
				result += ", ";
			}
			result += "(";
			result += op.to[j]->whole() + ")";
		}
		result += "\n\n";
	}
}

void Wikidiff2::printText(const String & input)
{
	size_t start = 0;
	size_t end = input.find_first_of("<>&");
	while (end != String::npos) {
		if (end > start) {
			result.append(input, start, end - start);
		}
		switch (input[end]) {
			case '<':
				result.append("&lt;");
				break;
			case '>':
				result.append("&gt;");
				break;
			default /*case '&'*/:
				result.append("&amp;");
		}
		start = end + 1;
		end = input.find_first_of("<>&", start);
	}
	// Append the rest of the string after the last special character
	if (start < input.size()) {
		result.append(input, start, input.size() - start);
	}
}

// Weak UTF-8 decoder
// Will return garbage on invalid input (overshort sequences, overlong sequences, etc.)
int Wikidiff2::nextUtf8Char(String::const_iterator & p, String::const_iterator & charStart,
		String::const_iterator end)
{
	int c = 0;
	unsigned char byte;
	int seqLength = 0;
	charStart = p;
	if (p == end) {
		return 0;
	}
	do {
		byte = (unsigned char)*p;
		if (byte < 0x80) {
			c = byte;
			seqLength = 0;
		} else if (byte >= 0xc0) {
			// Start of UTF-8 character
			// If this is unexpected, due to an overshort sequence, we ignore the invalid
			// sequence and resynchronise here
			if (byte < 0xe0) {
				seqLength = 1;
				c = byte & 0x1f;
			} else if (byte < 0xf0) {
				seqLength = 2;
				c = byte & 0x0f;
			} else {
				seqLength = 3;
				c = byte & 7;
			}
		} else if (seqLength) {
			c <<= 6;
			c |= byte & 0x3f;
			--seqLength;
		} else {
			// Unexpected continuation, ignore
		}
		++p;
	} while (seqLength && p != end);
	return c;
}

// Split a string into words
//
// TODO: I think the best way to do this would be to use ICU BreakIterator
// instead of libthai + DIY. Basically you'd run BreakIterators from several
// different locales (en, th, ja) and merge the results, i.e. if a break occurs
// in any locale at a given position, split the string. I don't know if the
// quality of the Thai dictionary in ICU matches the one in libthai, we would
// have to check this somehow.
void Wikidiff2::explodeWords(const String & text, WordVector &words)
{
	// Don't try to do a word-level diff on very long lines
	if (text.size() > MAX_DIFF_LINE) {
		words.push_back(Word(text.begin(), text.end(), text.end()));
		return;
	}

	// Decode the UTF-8 in the string.
	// * Save the character sizes (in bytes)
	// * Convert the string to TIS-620, which is the internal character set of libthai.
	// * Save the character offsets of any break positions (same format as libthai).

	String tisText, charSizes;
	String::const_iterator suffixEnd, charStart, p;
	IntSet breaks;

	tisText.reserve(text.size());
	charSizes.reserve(text.size());
	wchar_t ch, lastChar;
	thchar_t thaiChar;
	bool hasThaiChars = false;

	p = text.begin();
	ch = nextUtf8Char(p, charStart, text.end());
	lastChar = 0;
	int charIndex = 0;
	while (ch) {
		thaiChar = th_uni2tis(ch);
		if (thaiChar >= 0x80 && thaiChar != THCHAR_ERR) {
			hasThaiChars = true;
		}
		tisText += (char)thaiChar;
		charSizes += (char)(p - charStart);

		if (isLetter(ch)) {
			if (lastChar && !isLetter(lastChar)) {
				breaks.insert(charIndex);
			}
		} else {
			breaks.insert(charIndex);
		}
		charIndex++;
		lastChar = ch;
		ch = nextUtf8Char(p, charStart, text.end());
	}

	// If there were any Thai characters in the string, run th_brk on it and add
	// the resulting break positions
	if (hasThaiChars) {
		IntVector thaiBreakPositions;
		tisText += '\0';
		thaiBreakPositions.resize(tisText.size());
		int numBreaks = th_brk((const thchar_t*)(tisText.data()),
				&thaiBreakPositions[0], thaiBreakPositions.size());
		thaiBreakPositions.resize(numBreaks);
		breaks.insert(thaiBreakPositions.begin(), thaiBreakPositions.end());
	}

	// Add a fake end-of-string character and have a break on it, so that the
	// last word gets added without special handling
	breaks.insert(charSizes.size());
	charSizes += (char)0;

	// Now make the word array by traversing the breaks set
	p = text.begin();
	IntSet::iterator pBrk = breaks.begin();
	String::const_iterator wordStart = text.begin();
	String::const_iterator suffixStart = text.end();

	// If there's a break at the start of the string, skip it
	if (pBrk != breaks.end() && *pBrk == 0) {
		pBrk++;
	}

	for (charIndex = 0; charIndex < charSizes.size(); p += charSizes[charIndex++]) {
		// Assume all spaces are ASCII
		if (isSpace(*p)) {
			suffixStart = p;
		}
		if (pBrk != breaks.end() && charIndex == *pBrk) {
			if (suffixStart == text.end()) {
				words.push_back(Word(wordStart, p, p));
			} else {
				words.push_back(Word(wordStart, suffixStart, p));
			}
			pBrk++;
			suffixStart = text.end();
			wordStart = p;
		}
	}
}

void Wikidiff2::explodeLines(const String & text, StringVector &lines)
{
	String::const_iterator ptr = text.begin();
	while (ptr != text.end()) {
		String::const_iterator ptr2 = std::find(ptr, text.end(), '\n');
		lines.push_back(String(ptr, ptr2));

		ptr = ptr2;
		if (ptr != text.end()) {
			++ptr;
		}
	}
}

const Wikidiff2::String & Wikidiff2::execute(const String & text1, const String & text2, int numContextLines)
{
	// Allocate some result space to avoid excessive copying
	result.clear();
	result.reserve(text1.size() + text2.size() + 10000);

	// Split input strings into lines
	StringVector lines1;
	StringVector lines2;
	explodeLines(text1, lines1);
	explodeLines(text2, lines2);

	// Do the diff
	diffLines(lines1, lines2, numContextLines);

	// Return a reference to the result buffer
	return result;
}