File: ParsoidParser.php

package info (click to toggle)
mediawiki 1%3A1.43.3%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 417,464 kB
  • sloc: php: 1,062,949; javascript: 664,290; sql: 9,714; python: 5,458; xml: 3,489; sh: 1,131; makefile: 64
file content (355 lines) | stat: -rw-r--r-- 13,076 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
<?php

namespace MediaWiki\Parser\Parsoid;

use MediaWiki\Content\TextContent;
use MediaWiki\Content\WikitextContent;
use MediaWiki\Languages\LanguageConverterFactory;
use MediaWiki\MainConfigNames;
use MediaWiki\MediaWikiServices;
use MediaWiki\Page\PageReference;
use MediaWiki\Parser\ParserFactory;
use MediaWiki\Parser\ParserOptions;
use MediaWiki\Parser\ParserOutput;
use MediaWiki\Parser\Parsoid\Config\PageConfigFactory;
use MediaWiki\Revision\MutableRevisionRecord;
use MediaWiki\Revision\RevisionRecord;
use MediaWiki\Revision\SlotRecord;
use MediaWiki\Title\Title;
use MediaWiki\WikiMap\WikiMap;
use Wikimedia\Assert\Assert;
use Wikimedia\Parsoid\Config\PageConfig;
use Wikimedia\Parsoid\Parsoid;

/**
 * Parser implementation which uses Parsoid.
 *
 * Currently incomplete; see T236809 for the long-term plan.
 *
 * @since 1.41
 * @unstable since 1.41; see T236809 for plan.
 */
class ParsoidParser /* eventually this will extend \Parser */ {
	/**
	 * @unstable
	 * This should not be used widely right now since this may go away.
	 * This is being added to support DiscussionTools with Parsoid HTML
	 * and after initial exploration, this may be implemented differently.
	 */
	public const PARSOID_TITLE_KEY = "parsoid:title-dbkey";
	private Parsoid $parsoid;
	private PageConfigFactory $pageConfigFactory;
	private LanguageConverterFactory $languageConverterFactory;
	private ParserFactory $legacyParserFactory;

	/**
	 * Store the collaborating services on the instance; no other work is done.
	 *
	 * @param Parsoid $parsoid Used by genParserOutput() to run wikitext2html()
	 * @param PageConfigFactory $pageConfigFactory Creates the PageConfig objects
	 *  that are handed to Parsoid
	 * @param LanguageConverterFactory $languageConverterFactory Used to look up the
	 *  language converter when the HTML variant is computed for page views
	 * @param ParserFactory $legacyParserFactory Kept on the instance; no method of
	 *  this class reads it at present
	 */
	public function __construct(
		Parsoid $parsoid,
		PageConfigFactory $pageConfigFactory,
		LanguageConverterFactory $languageConverterFactory,
		ParserFactory $legacyParserFactory
	) {
		// Plain dependency capture; the order of assignment is irrelevant.
		$this->legacyParserFactory = $legacyParserFactory;
		$this->languageConverterFactory = $languageConverterFactory;
		$this->pageConfigFactory = $pageConfigFactory;
		$this->parsoid = $parsoid;
	}

	/**
	 * Internal helper to avoid code duplication across parse() and
	 * parseFakeRevision().
	 *
	 * Runs Parsoid's wikitext2html() on the given page config, converts the
	 * resulting page bundle into a ParserOutput, and decorates that output
	 * with the limit report, timing statistics, the Parsoid skinning module
	 * and version metadata.
	 *
	 * @param PageConfig $pageConfig Page (and revision content) to render
	 * @param ParserOptions $options Options for this parse; every option
	 *  accessed during the parse is recorded on the returned ParserOutput
	 * @param ?ParserOutput $previousOutput Result of a previous parse of this
	 *  page, or null. It is only handed to Parsoid when this request is
	 *  selected by the ParsoidSelectiveUpdateSampleRate sampling and the
	 *  previous output carries a cache revision id.
	 * @return ParserOutput
	 */
	private function genParserOutput(
		PageConfig $pageConfig, ParserOptions $options, ?ParserOutput $previousOutput
	): ParserOutput {
		$parserOutput = new ParserOutput();

		// Parsoid itself does not vary output by parser options right now.
		// But, ensure that any option use by extensions, parser functions,
		// recursive parses, or (in the unlikely future scenario) Parsoid itself
		// are recorded as used.
		$options->registerWatcher( [ $parserOutput, 'recordOption' ] );

		// The enable/disable logic here matches that in Parser::internalParseHalfParsed(),
		// although __NOCONTENTCONVERT__ is handled internal to Parsoid.
		//
		// T349137: It might be preferable to handle __NOCONTENTCONVERT__ here rather than
		// by inspecting the DOM inside Parsoid. That will come in a separate patch.
		$htmlVariantLanguage = null;
		if ( !( $options->getDisableContentConversion() || $options->getInterfaceMessage() ) ) {
			// NOTES (some of these are TODOs for read views integration)
			// 1. This html variant conversion is a pre-cache transform. HtmlOutputRendererHelper
			//    has another variant conversion that is a post-cache transform based on the
			//    'Accept-Language' header. If that header is set, there is really no reason to
			//    do this conversion here. So, eventually, we are likely to either not pass in
			//    the htmlVariantLanguage option below OR disable language conversion from the
			//    wt2html path in Parsoid and this and the Accept-Language variant conversion
			//    both would have to be handled as post-cache transforms.
			//
			// 2. Parser.php calls convert() which computes a preferred variant from the
			//    target language. But, we cannot do that unconditionally here because REST API
			//    requests specify the exact variant via the 'Content-Language' header.
			//
			//    For Parsoid page views, either the callers will have to compute the
			//    preferred variant and set it in ParserOptions OR the REST API will have
			//    to set some other flag indicating that the preferred variant should not
			//    be computed. For now, I am adding a temporary hack, but this should be
			//    replaced with something more sensible (T267067).
			//
			// 3. Additionally, Parsoid's callers will have to set targetLanguage in ParserOptions
			//    to mimic the logic in Parser.php (missing right now).
			$langCode = $pageConfig->getPageLanguageBcp47();
			if ( $options->getRenderReason() === 'page-view' ) { // TEMPORARY HACK
				$langFactory = MediaWikiServices::getInstance()->getLanguageFactory();
				$lang = $langFactory->getLanguage( $langCode );
				$langConv = $this->languageConverterFactory->getLanguageConverter( $lang );
				$htmlVariantLanguage = $langFactory->getLanguage( $langConv->getPreferredVariant() );
			} else {
				$htmlVariantLanguage = $langCode;
			}
		}
		$oldPageConfig = null;
		$oldPageBundle = null;

		// T371713: Temporary statistics collection code to determine
		// feasibility of Parsoid selective update
		$sampleRate = MediaWikiServices::getInstance()->getMainConfig()->get(
			MainConfigNames::ParsoidSelectiveUpdateSampleRate
		);
		// A falsy sample rate disables sampling entirely; otherwise roughly
		// one request in $sampleRate is sampled.
		$doSample = ( $sampleRate && mt_rand( 1, $sampleRate ) === 1 );
		if ( $doSample && $previousOutput !== null && $previousOutput->getCacheRevisionId() ) {
			// Allow fetching the old wikitext corresponding to the
			// $previousOutput
			$oldPageConfig = $this->pageConfigFactory->create(
				Title::newFromLinkTarget( $pageConfig->getLinkTarget() ),
				$options->getUserIdentity(),
				$previousOutput->getCacheRevisionId(),
				null,
				$previousOutput->getLanguage(),
			);
			$oldPageBundle =
				PageBundleParserOutputConverter::pageBundleFromParserOutput(
					$previousOutput
				);
		}

		$defaultOptions = [
			'pageBundle' => true,
			'wrapSections' => true,
			'logLinterData' => true,
			'body_only' => false,
			'htmlVariantLanguage' => $htmlVariantLanguage,
			'offsetType' => 'byte',
			'outputContentVersion' => Parsoid::defaultHTMLVersion(),
			'previousOutput' => $oldPageBundle,
			'previousInput' => $oldPageConfig,
			'sampleStats' => $doSample,
			'renderReason' => $options->getRenderReason(),
		];

		$parserOutput->resetParseStartTime();

		// Explicitly initialise the headers argument instead of passing a
		// variable that was never defined. Its value is not used afterwards.
		// NOTE(review): presumably an out-parameter of wikitext2html() -- confirm
		// against the Parsoid API.
		$headers = null;

		// This can throw ClientError or ResourceLimitExceededException.
		// Callers are responsible for figuring out how to handle them.
		$pageBundle = $this->parsoid->wikitext2html(
			$pageConfig,
			$defaultOptions,
			$headers,
			$parserOutput );

		$parserOutput = PageBundleParserOutputConverter::parserOutputFromPageBundle( $pageBundle, $parserOutput );

		// Record the page title in dbkey form so that post-cache transforms
		// have access to the title.
		$parserOutput->setExtensionData(
			self::PARSOID_TITLE_KEY,
			Title::newFromLinkTarget( $pageConfig->getLinkTarget() )->getPrefixedDBkey()
		);

		// Register a watcher again because the $parserOutput arg
		// and $parserOutput return value above are different objects!
		$options->registerWatcher( [ $parserOutput, 'recordOption' ] );

		$parserOutput->setFromParserOptions( $options );

		$parserOutput->recordTimeProfile();
		$this->makeLimitReport( $options, $parserOutput );

		// T371713: Collect statistics on parsing time -vs- presence of
		// $previousOutput
		$stats = MediaWikiServices::getInstance()->getStatsFactory();
		$labels = [
			'type' => $previousOutput === null ? 'full' : 'selective',
			'wiki' => WikiMap::getCurrentWikiId(),
			'reason' => $options->getRenderReason() ?: 'unknown',
		];
		// makeLimitReport() treats the CPU time profile as possibly null, so
		// apply the same guard here rather than handing null to the counter.
		$cpuSeconds = $parserOutput->getTimeProfile( 'cpu' );
		if ( $cpuSeconds !== null ) {
			$stats
				->getCounter( 'Parsoid_parse_cpu_seconds' )
				->setLabels( $labels )
				->incrementBy( $cpuSeconds );
		}
		$stats
			->getCounter( 'Parsoid_parse_total' )
			->setLabels( $labels )
			->increment();

		// Add Parsoid skinning module
		$parserOutput->addModuleStyles( [ 'mediawiki.skinning.content.parsoid' ] );

		// Record Parsoid version in extension data; this allows
		// us to use the onRejectParserCacheValue hook to selectively
		// expire "bad" generated content in the event of a rollback.
		$parserOutput->setExtensionData(
			'core:parsoid-version', Parsoid::version()
		);
		$parserOutput->setExtensionData(
			'core:html-version', Parsoid::defaultHTMLVersion()
		);

		return $parserOutput;
	}

	/**
	 * Convert wikitext to HTML using Parsoid.
	 * Do not call this function recursively.
	 *
	 * If $revId names a stored revision whose main content is identical to
	 * $text, that revision's page config is parsed directly. Otherwise a
	 * temporary revision record wrapping $text is built and parsed instead.
	 *
	 * @param string|TextContent $text Text we want to parse
	 * @param-taint $text escapes_htmlnoent
	 * @param PageReference $page
	 * @param ParserOptions $options
	 * @param bool $linestart Must be true; false is not yet supported
	 * @param bool $clearState Must be true; false is not yet supported
	 * @param int|null $revId ID of the revision being rendered. This is used to render
	 *  REVISION* magic words. 0 means that any current revision will be used. Null means
	 *  that {{REVISIONID}}/{{REVISIONUSER}} will be empty and {{REVISIONTIMESTAMP}} will
	 *  use the current timestamp. No stored revision is looked up for null or 0.
	 * @param ?ParserOutput $previousOutput The (optional) result of a
	 *  previous parse of this page, which can be used for selective update.
	 * @return ParserOutput
	 * @return-taint escaped
	 * @unstable since 1.41
	 */
	public function parse(
		$text, PageReference $page, ParserOptions $options,
		bool $linestart = true, bool $clearState = true, ?int $revId = null,
		?ParserOutput $previousOutput = null
	): ParserOutput {
		Assert::invariant( $linestart, '$linestart=false is not yet supported' );
		Assert::invariant( $clearState, '$clearState=false is not yet supported' );

		$title = Title::newFromPageReference( $page );

		// Interface messages without an explicit target language are
		// rendered in the user's language.
		$targetLang = $options->getTargetLanguage();
		if ( $targetLang === null && $options->getInterfaceMessage() ) {
			$targetLang = $options->getUserLangObj();
		}

		// Only consult the stored revision when a real revision id was given.
		$pageConfig = null;
		if ( $revId !== null && $revId !== 0 ) {
			$pageConfig = $this->pageConfigFactory->create(
				$title,
				$options->getUserIdentity(),
				$revId,
				null, // unused
				$targetLang // defaults to title page language if null
			);
		}

		// Unwrap content objects, remembering the original so that it can be
		// reused as the main slot content below.
		$content = null;
		if ( $text instanceof TextContent ) {
			$content = $text;
			$text = $content->getText();
		}

		if ( $pageConfig && $pageConfig->getPageMainContent() === $text ) {
			// The stored revision already holds exactly the text to parse.
			return $this->genParserOutput( $pageConfig, $options, $previousOutput );
		}

		// This is a bit awkward! But we really need to parse $text, which
		// may or may not correspond to the $revId provided!
		// T332928 suggests one solution: splitting the "have revid"
		// callers from the "bare text, no associated revision" callers.
		$tempRevision = new MutableRevisionRecord( $title );
		if ( $revId !== null ) {
			$tempRevision->setId( $revId );
		}
		$mainContent = $content ?? new WikitextContent( $text );
		$tempRevision->setSlot( SlotRecord::newUnsaved( SlotRecord::MAIN, $mainContent ) );

		$pageConfig = $this->pageConfigFactory->create(
			$title,
			$options->getUserIdentity(),
			$tempRevision,
			null, // unused
			$targetLang // defaults to title page language if null
		);

		return $this->genParserOutput( $pageConfig, $options, $previousOutput );
	}

	/**
	 * Convert custom wikitext (stored in the main slot of the $fakeRev arg)
	 * to HTML. No previous output is passed along, so this is always a
	 * parse without selective update.
	 *
	 * Callers are expected NOT to stuff the result into ParserCache.
	 *
	 * @internal
	 * @deprecated since 1.43; calling this emits a deprecation notice
	 * @unstable since 1.41
	 *
	 * @param RevisionRecord $fakeRev Revision to parse
	 * @param PageReference $page
	 * @param ParserOptions $options
	 * @return ParserOutput
	 */
	public function parseFakeRevision(
		RevisionRecord $fakeRev, PageReference $page, ParserOptions $options
	): ParserOutput {
		wfDeprecated( __METHOD__, '1.43' );

		$title = Title::newFromPageReference( $page );

		// Interface messages without an explicit target language are
		// rendered in the user's language.
		$targetLang = $options->getTargetLanguage();
		if ( $targetLang === null ) {
			if ( $options->getInterfaceMessage() ) {
				$targetLang = $options->getUserLangObj();
			}
		}

		$fakeRevConfig = $this->pageConfigFactory->create(
			$title,
			$options->getUserIdentity(),
			$fakeRev,
			null, // unused
			$targetLang // defaults to title page language if null
		);

		return $this->genParserOutput( $fakeRevConfig, $options, null );
	}

	/**
	 * Fill in the limit report entries of the given ParserOutput: CPU and
	 * wall-clock time, a placeholder timing profile, and cache metadata.
	 *
	 * This is ported from Parser::makeLimitReport() and should eventually
	 * use the method from the superclass directly.
	 *
	 * @param ParserOptions $parserOptions Options used for the parse
	 * @param ParserOutput $parserOutput Output that receives the report data
	 */
	protected function makeLimitReport(
		ParserOptions $parserOptions, ParserOutput $parserOutput
	) {
		// The value is not used here. The call is kept because reading an
		// option may be observed by watchers registered on ParserOptions
		// (see registerWatcher() in genParserOutput) -- TODO confirm whether
		// this access is still wanted.
		$parserOptions->getMaxIncludeSize();

		// CPU time may be unavailable; only report it when it was measured.
		$cpuSeconds = $parserOutput->getTimeProfile( 'cpu' );
		if ( $cpuSeconds !== null ) {
			$formattedCpu = sprintf( "%.3f", $cpuSeconds );
			$parserOutput->setLimitReportData( 'limitreport-cputime', $formattedCpu );
		}

		// Wall time is always reported.
		$wallSeconds = $parserOutput->getTimeProfile( 'wall' );
		$formattedWall = sprintf( "%.3f", $wallSeconds );
		$parserOutput->setLimitReportData( 'limitreport-walltime', $formattedWall );

		$parserOutput->setLimitReportData( 'limitreport-timingprofile', [ 'not yet supported' ] );

		// Add other cache related metadata
		$cacheTimestamp = $parserOutput->getCacheTime();
		$parserOutput->setLimitReportData( 'cachereport-timestamp', $cacheTimestamp );

		$cacheTtl = $parserOutput->getCacheExpiry();
		$parserOutput->setLimitReportData( 'cachereport-ttl', $cacheTtl );

		$isTransient = $parserOutput->hasReducedExpiry();
		$parserOutput->setLimitReportData( 'cachereport-transientcontent', $isTransient );
	}

}