File: comment_test.go

package info (click to toggle)
golang-golang-x-net 1%3A0.7.0%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, bookworm-backports, bookworm-proposed-updates, experimental, sid
  • size: 7,060 kB
  • sloc: asm: 18; makefile: 14
file content (270 lines) | stat: -rw-r--r-- 9,256 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
// Copyright 2023 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package html

import (
	"bytes"
	"testing"
)

// TestComments checks, exhaustively, that every 'interesting' N-byte input is
// tokenized as a comment the way the HTML spec prescribes. Every input is the
// 4-byte "<!--" opener of an HTML comment followed by a suffix of between 1
// and maxSuffixLen bytes, so N ranges from 4+1 to 4+maxSuffixLen inclusive.
//
// An input is 'interesting' when every byte of that suffix is drawn from
// interestingCommentBytes, below. Together those bytes exercise all of the
// possible state transitions out of the comment-related parser states listed
// in the HTML spec (https://html.spec.whatwg.org/#comment-start-state and
// subsequent sections).
//
// The spec describes an explicit state machine that, as a side effect,
// accumulates "the comment token's data" in a separate buffer.
// Tokenizer.readComment in this package has no explicit state machine and
// usually returns the comment text as a sub-slice of the input, between the
// opening '<' and the closing '>' or EOF. This test confirms that the two
// algorithms agree.
func TestComments(t *testing.T) {
	const (
		commentOpen  = "<!--"
		maxSuffixLen = 6
	)
	// Allocate room for the longest input once; testAllComments grows the
	// slice in place, one suffix byte per recursion level.
	seed := make([]byte, len(commentOpen), len(commentOpen)+maxSuffixLen)
	copy(seed, commentOpen)
	testAllComments(t, seed)
}

// interestingCommentBytes holds the bytes from which every tested comment
// suffix is built.
//
// NUL is deliberately absent, even though sections 13.2.5.43 - 13.2.5.52 of
// the HTML spec mention it. It causes no state transition of its own: it
// behaves like any other non-interesting byte, apart from being replaced by
// U+FFFD REPLACEMENT CHARACTER.
//
// EOF is absent too. The HTML spec treats EOF as "an input character", but
// testOneComment below simply breaks out of its loop instead.
var interestingCommentBytes = [5]byte{
	'!', // Part of "<!" and of "--!".
	'-', // Part of every dash run, including the closing "--".
	'<', // May begin a nested "<!--".
	'>', // May close the comment.
	'x', // Stands in for all other, "non-interesting" comment bytes.
}

// testAllComments extends buffer by one interesting byte at a time, running
// testOneComment on each extension and then recursing until the slice's
// capacity is used up. In effect it enumerates every combination of
// interesting bytes that fits in buffer[len(buffer):cap(buffer)] and checks
// that this package's tokenization of each one matches the HTML spec.
//
// All recursion levels share buffer's backing array, so no input is ever
// copied; each level overwrites only the byte at its own depth.
//
// Precondition: len(buffer) < cap(buffer)
// Precondition: string(buffer[:4]) == "<!--"
func testAllComments(t *testing.T, buffer []byte) {
	for k := 0; k < len(interestingCommentBytes); k++ {
		candidate := append(buffer, interestingCommentBytes[k])
		testOneComment(t, candidate)
		if len(candidate) >= cap(candidate) {
			// The input has reached its maximum length; nothing longer to try.
			continue
		}
		testAllComments(t, candidate)
	}
}

// testOneComment tokenizes b with this package's Tokenizer and compares the
// outcome with a literal, state-by-state transcription of the HTML spec's
// comment states (sections 13.2.5.43 - 13.2.5.52).
//
// b is expected to start with "<!--": the reference state machine begins
// scanning immediately after those four bytes, in state 43. Two things are
// compared:
//   - the comment text: z.Text() against the bytes the reference machine
//     accumulates in wantBuffer (the spec's "comment token's data"), and
//   - the leftover input: b past len(z.Raw()) against b past the index at
//     which the reference machine stopped.
//
// The integer state values are the last component of the spec's section
// numbers, so "case 45" implements section 13.2.5.45.
func testOneComment(t *testing.T, b []byte) {
	z := NewTokenizer(bytes.NewReader(b))
	// The first token has to be a comment; the comparisons below are
	// meaningless otherwise, hence Fatalf rather than Errorf.
	if next := z.Next(); next != CommentToken {
		t.Fatalf("Next(%q): got %v, want %v", b, next, CommentToken)
	}
	// Whatever follows the first len(z.Raw()) bytes of b is treated as input
	// that the comment token did not consume.
	gotRemainder := string(b[len(z.Raw()):])
	gotComment := string(z.Text())

	// i indexes the next unread byte of b; wantBuffer is the comment data
	// according to the spec.
	i := len("<!--")
	wantBuffer := []byte(nil)
	// The loop has no condition: it ends either at EOF (the plain break below)
	// or when a state emits the comment token on '>' (break loop).
loop:
	for state := 43; ; {
		// Consume the next input character, handling EOF.
		if i >= len(b) {
			// At EOF the comment ends with wantBuffer as accumulated so far;
			// a partially matched terminator such as "-" or "--!" is not
			// appended.
			break
		}
		nextInputCharacter := b[i]
		i++

		// "Reconsume" in the spec means the same character is processed again
		// in the new state; that is implemented by undoing the i++ above.
		switch state {
		case 43: // 13.2.5.43 Comment start state.
			switch nextInputCharacter {
			case '-':
				state = 44
			case '>':
				// "<!-->": an empty comment. The '>' is consumed but is not
				// comment data.
				break loop
			default:
				i-- // Reconsume.
				state = 45
			}

		case 44: // 13.2.5.44 Comment start dash state.
			switch nextInputCharacter {
			case '-':
				state = 51
			case '>':
				// "<!--->": also an empty comment.
				break loop
			default:
				// The single '-' seen earlier turns out to be ordinary data.
				wantBuffer = append(wantBuffer, '-')
				i-- // Reconsume.
				state = 45
			}

		case 45: // 13.2.5.45 Comment state.
			switch nextInputCharacter {
			case '-':
				// Possibly the start of "-->"; hold the dash back until the
				// following byte is known.
				state = 50
			case '<':
				wantBuffer = append(wantBuffer, '<')
				state = 46
			default:
				wantBuffer = append(wantBuffer, nextInputCharacter)
			}

		case 46: // 13.2.5.46 Comment less-than sign state.
			switch nextInputCharacter {
			case '!':
				wantBuffer = append(wantBuffer, '!')
				state = 47
			case '<':
				wantBuffer = append(wantBuffer, '<')
				state = 46
			default:
				i-- // Reconsume.
				state = 45
			}

		case 47: // 13.2.5.47 Comment less-than sign bang state.
			switch nextInputCharacter {
			case '-':
				state = 48
			default:
				i-- // Reconsume.
				state = 45
			}

		case 48: // 13.2.5.48 Comment less-than sign bang dash state.
			switch nextInputCharacter {
			case '-':
				state = 49
			default:
				// Only one dash followed "<!"; reconsuming in state 50 lets
				// that state account for it.
				i-- // Reconsume.
				state = 50
			}

		case 49: // 13.2.5.49 Comment less-than sign bang dash dash state.
			switch nextInputCharacter {
			case '>':
				// "<!-->" inside a comment closes it.
				break loop
			default:
				i-- // Reconsume.
				state = 51
			}

		case 50: // 13.2.5.50 Comment end dash state.
			switch nextInputCharacter {
			case '-':
				state = 51
			default:
				// The held-back dash was not part of a terminator after all.
				wantBuffer = append(wantBuffer, '-')
				i-- // Reconsume.
				state = 45
			}

		case 51: // 13.2.5.51 Comment end state.
			switch nextInputCharacter {
			case '!':
				state = 52
			case '-':
				// Three or more dashes in a row: the oldest becomes data and
				// the machine keeps two pending.
				wantBuffer = append(wantBuffer, '-')
			case '>':
				// "-->": the regular comment terminator.
				break loop
			default:
				wantBuffer = append(wantBuffer, "--"...)
				i-- // Reconsume.
				state = 45
			}

		case 52: // 13.2.5.52 Comment end bang state.
			switch nextInputCharacter {
			case '-':
				// "--!-": the "--!" is data and the new dash is pending.
				wantBuffer = append(wantBuffer, "--!"...)
				state = 50
			case '>':
				// "--!>" also closes the comment.
				break loop
			default:
				wantBuffer = append(wantBuffer, "--!"...)
				i-- // Reconsume.
				state = 45
			}

		default:
			// Guards against a typo in one of the state assignments above.
			t.Fatalf("input=%q: unexpected state %d", b, state)
		}
	}

	// Anything the reference machine left unread must also be left unread by
	// the Tokenizer.
	wantRemainder := ""
	if i < len(b) {
		wantRemainder = string(b[i:])
	}
	wantComment := string(wantBuffer)
	if (gotComment != wantComment) || (gotRemainder != wantRemainder) {
		t.Errorf("input=%q\ngot:  %q + %q\nwant: %q + %q",
			b, gotComment, gotRemainder, wantComment, wantRemainder)
	}
}

// This table below summarizes the HTML-comment-related state machine from
// 13.2.5.43 "Comment start state" and subsequent sections.
// https://html.spec.whatwg.org/#comment-start-state
//
// Get to state 13.2.5.43 after seeing "<!--". Specifically, starting from the
// initial 13.2.5.1 "Data state":
//   - "<"  moves to 13.2.5.6  "Tag open state",
//   - "!"  moves to 13.2.5.42 "Markup declaration open state",
//   - "--" moves to 13.2.5.43 "Comment start state".
// Each of these transitions is the only way to get to the 6/42/43 states.
//
// State   !         -         <         >         NUL       EOF       default   HTML spec section
// 43      ...       s44       ...       s01.T.E0  ...       ...       r45       13.2.5.43 Comment start state
// 44      ...       s51       ...       s01.T.E0  ...       T.Z.E1    r45.A-    13.2.5.44 Comment start dash state
// 45      ...       s50       s46.A<    ...       t45.A?.E2 T.Z.E1    t45.Ax    13.2.5.45 Comment state
// 46      s47.A!    ...       t46.A<    ...       ...       ...       r45       13.2.5.46 Comment less-than sign state
// 47      ...       s48       ...       ...       ...       ...       r45       13.2.5.47 Comment less-than sign bang state
// 48      ...       s49       ...       ...       ...       ...       r50       13.2.5.48 Comment less-than sign bang dash state
// 49      ...       ...       ...       s01.T     ...       T.Z.E1    r51.E3    13.2.5.49 Comment less-than sign bang dash dash state
// 50      ...       s51       ...       ...       ...       T.Z.E1    r45.A-    13.2.5.50 Comment end dash state
// 51      s52       t51.A-    ...       s01.T     ...       T.Z.E1    r45.A--   13.2.5.51 Comment end state
// 52      ...       s50.A--!  ...       s01.T.E4  ...       T.Z.E1    r45.A--!  13.2.5.52 Comment end bang state
//
// State 43 is the "Comment start state" meaning that we've only seen "<!--"
// and nothing else. Similarly, state 44 means that we've only seen "<!---",
// with three dashes, and nothing else. For the other states, we deduce
// (working backwards) that the immediate prior input must be:
//   - 45  something that's not '-'
//   - 46  "<"
//   - 47  "<!"
//   - 48  "<!-"
//   - 49  "<!--"  not including the opening "<!--"
//   - 50  "-"     not including the opening "<!--" and also not "--"
//   - 51  "--"    not including the opening "<!--"
//   - 52  "--!"
//
// The table cell actions:
//   - ...   do the default action
//   - A!    append "!"      to the comment token's data.
//   - A-    append "-"      to the comment token's data.
//   - A--   append "--"     to the comment token's data.
//   - A--!  append "--!"    to the comment token's data.
//   - A<    append "<"      to the comment token's data.
//   - A?    append "\uFFFD" to the comment token's data.
//   - Ax    append the current input character to the comment token's data.
//   - E0    parse error (abrupt-closing-of-empty-comment).
//   - E1    parse error (eof-in-comment).
//   - E2    parse error (unexpected-null-character).
//   - E3    parse error (nested-comment).
//   - E4    parse error (incorrectly-closed-comment).
//   - T     emit the current comment token.
//   - Z     emit an end-of-file token.
//   - rNN   reconsume in the 13.2.5.NN     state (after any A* or E* operations).
//   - s01   switch to the    13.2.5.1 Data state (after any A* or E* operations).
//   - sNN   switch to the    13.2.5.NN     state (after any A* or E* operations).
//   - tNN   stay in the      13.2.5.NN     state (after any A* or E* operations).
//
// The E* actions are called errors in the HTML spec but they are not fatal
// (https://html.spec.whatwg.org/#parse-errors says "may [but not must] abort
// the parser"). They are warnings that, in practice, browsers simply ignore.