<?php
namespace MediaWiki\Parser\Parsoid;
use MediaWiki\Content\TextContent;
use MediaWiki\Content\WikitextContent;
use MediaWiki\Languages\LanguageConverterFactory;
use MediaWiki\MainConfigNames;
use MediaWiki\MediaWikiServices;
use MediaWiki\Page\PageReference;
use MediaWiki\Parser\ParserFactory;
use MediaWiki\Parser\ParserOptions;
use MediaWiki\Parser\ParserOutput;
use MediaWiki\Parser\Parsoid\Config\PageConfigFactory;
use MediaWiki\Revision\MutableRevisionRecord;
use MediaWiki\Revision\RevisionRecord;
use MediaWiki\Revision\SlotRecord;
use MediaWiki\Title\Title;
use MediaWiki\WikiMap\WikiMap;
use Wikimedia\Assert\Assert;
use Wikimedia\Parsoid\Config\PageConfig;
use Wikimedia\Parsoid\Parsoid;
/**
 * Parser implementation which uses Parsoid.
 *
 * Currently incomplete; see T236809 for the long-term plan.
 *
 * @since 1.41
 * @unstable since 1.41; see T236809 for plan.
 */
class ParsoidParser /* eventually this will extend \Parser */ {
    /**
     * Extension-data key under which genParserOutput() stores the prefixed
     * DB key of the parsed page's title on the resulting ParserOutput.
     *
     * @unstable
     * This should not be used widely right now since this may go away.
     * This is being added to support DiscussionTools with Parsoid HTML
     * and after initial exploration, this may be implemented differently.
     */
    public const PARSOID_TITLE_KEY = "parsoid:title-dbkey";

    private Parsoid $parsoid;
    private PageConfigFactory $pageConfigFactory;
    private LanguageConverterFactory $languageConverterFactory;
    /** Stored by the constructor; not read by any method of this class yet. */
    private ParserFactory $legacyParserFactory;

    /**
     * @param Parsoid $parsoid
     * @param PageConfigFactory $pageConfigFactory
     * @param LanguageConverterFactory $languageConverterFactory
     * @param ParserFactory $legacyParserFactory
     */
    public function __construct(
        Parsoid $parsoid,
        PageConfigFactory $pageConfigFactory,
        LanguageConverterFactory $languageConverterFactory,
        ParserFactory $legacyParserFactory
    ) {
        $this->parsoid = $parsoid;
        $this->pageConfigFactory = $pageConfigFactory;
        $this->languageConverterFactory = $languageConverterFactory;
        $this->legacyParserFactory = $legacyParserFactory;
    }

    /**
     * Pick the language used for parsing: the target language from the
     * parser options, falling back to the user language object when no
     * target language is set and the text is an interface message.
     *
     * Shared by parse() and parseFakeRevision().
     *
     * @param ParserOptions $options
     * @return mixed Whatever ParserOptions::getTargetLanguage() or
     *   ParserOptions::getUserLangObj() returned; may be null, in which case
     *   the page config is created with a null language.
     */
    private function getEffectiveLanguage( ParserOptions $options ) {
        $lang = $options->getTargetLanguage();
        if ( $lang === null && $options->getInterfaceMessage() ) {
            $lang = $options->getUserLangObj();
        }
        return $lang;
    }

    /**
     * Decide which value to pass to Parsoid as the 'htmlVariantLanguage' option.
     *
     * The enable/disable logic here matches that in Parser::internalParseHalfParsed(),
     * although __NOCONTENTCONVERT__ is handled internal to Parsoid.
     *
     * T349137: It might be preferable to handle __NOCONTENTCONVERT__ here rather than
     * by inspecting the DOM inside Parsoid. That will come in a separate patch.
     *
     * @param PageConfig $pageConfig
     * @param ParserOptions $options
     * @return mixed Null when content conversion is disabled or the text is an
     *   interface message. Otherwise the value of
     *   PageConfig::getPageLanguageBcp47(), or (for 'page-view' renders) the
     *   language object of the preferred variant of that language.
     */
    private function getHtmlVariantLanguage( PageConfig $pageConfig, ParserOptions $options ) {
        if ( $options->getDisableContentConversion() || $options->getInterfaceMessage() ) {
            return null;
        }

        // NOTES (some of these are TODOs for read views integration)
        // 1. This html variant conversion is a pre-cache transform. HtmlOutputRendererHelper
        //    has another variant conversion that is a post-cache transform based on the
        //    'Accept-Language' header. If that header is set, there is really no reason to
        //    do this conversion here. So, eventually, we are likely to either not pass in
        //    the htmlVariantLanguage option below OR disable language conversion from the
        //    wt2html path in Parsoid and this and the Accept-Language variant conversion
        //    both would have to be handled as post-cache transforms.
        //
        // 2. Parser.php calls convert() which computes a preferred variant from the
        //    target language. But, we cannot do that unconditionally here because REST API
        //    requests specify the exact variant via the 'Content-Language' header.
        //
        //    For Parsoid page views, either the callers will have to compute the
        //    preferred variant and set it in ParserOptions OR the REST API will have
        //    to set some other flag indicating that the preferred variant should not
        //    be computed. For now, I am adding a temporary hack, but this should be
        //    replaced with something more sensible (T267067).
        //
        // 3. Additionally, Parsoid's callers will have to set targetLanguage in ParserOptions
        //    to mimic the logic in Parser.php (missing right now).
        $langCode = $pageConfig->getPageLanguageBcp47();
        if ( $options->getRenderReason() !== 'page-view' ) {
            return $langCode;
        }

        // TEMPORARY HACK: compute the preferred variant for page views only.
        $langFactory = MediaWikiServices::getInstance()->getLanguageFactory();
        $lang = $langFactory->getLanguage( $langCode );
        $langConv = $this->languageConverterFactory->getLanguageConverter( $lang );
        return $langFactory->getLanguage( $langConv->getPreferredVariant() );
    }

    /**
     * T371713: Collect statistics on parsing time -vs- presence of
     * a previous output.
     *
     * @param ParserOptions $options Source of the 'reason' label.
     * @param ParserOutput $parserOutput Output whose recorded CPU time is reported.
     * @param bool $hadPreviousOutput Whether the caller supplied a previous
     *   output; selects the 'selective' vs. 'full' value of the 'type' label.
     */
    private function recordParseStats(
        ParserOptions $options, ParserOutput $parserOutput, bool $hadPreviousOutput
    ) {
        $stats = MediaWikiServices::getInstance()->getStatsFactory();
        $labels = [
            'type' => $hadPreviousOutput ? 'selective' : 'full',
            'wiki' => WikiMap::getCurrentWikiId(),
            'reason' => $options->getRenderReason() ?: 'unknown',
        ];

        // makeLimitReport() treats the CPU time as possibly null, so do the
        // same here rather than handing null to the counter.
        $cpuSeconds = $parserOutput->getTimeProfile( 'cpu' );
        if ( $cpuSeconds !== null ) {
            $stats
                ->getCounter( 'Parsoid_parse_cpu_seconds' )
                ->setLabels( $labels )
                ->incrementBy( $cpuSeconds );
        }
        $stats
            ->getCounter( 'Parsoid_parse_total' )
            ->setLabels( $labels )
            ->increment();
    }

    /**
     * Internal helper to avoid code duplication across two methods
     * (parse() and parseFakeRevision()): runs Parsoid's wikitext2html on the
     * given page config and turns the result into a ParserOutput.
     *
     * @param PageConfig $pageConfig
     * @param ParserOptions $options
     * @param ?ParserOutput $previousOutput Result of a previous parse of this
     *   page, or null. When the request is sampled (T371713) and the previous
     *   output has a cache revision id, it is handed to Parsoid as
     *   'previousOutput'/'previousInput'; it also selects the 'type' stats label.
     * @return ParserOutput
     */
    private function genParserOutput(
        PageConfig $pageConfig, ParserOptions $options, ?ParserOutput $previousOutput
    ): ParserOutput {
        $parserOutput = new ParserOutput();

        // Parsoid itself does not vary output by parser options right now.
        // But, ensure that any option use by extensions, parser functions,
        // recursive parses, or (in the unlikely future scenario) Parsoid itself
        // are recorded as used.
        $options->registerWatcher( [ $parserOutput, 'recordOption' ] );

        $htmlVariantLanguage = $this->getHtmlVariantLanguage( $pageConfig, $options );

        $oldPageConfig = null;
        $oldPageBundle = null;

        // T371713: Temporary statistics collection code to determine
        // feasibility of Parsoid selective update
        $sampleRate = (int)MediaWikiServices::getInstance()->getMainConfig()->get(
            MainConfigNames::ParsoidSelectiveUpdateSampleRate
        );
        // Only sample for rates >= 1: mt_rand() throws when max < min, so a
        // zero or negative configured rate must simply disable sampling.
        $doSample = $sampleRate >= 1 && mt_rand( 1, $sampleRate ) === 1;
        if ( $doSample && $previousOutput !== null && $previousOutput->getCacheRevisionId() ) {
            // Allow fetching the old wikitext corresponding to the
            // $previousOutput
            $oldPageConfig = $this->pageConfigFactory->create(
                Title::newFromLinkTarget( $pageConfig->getLinkTarget() ),
                $options->getUserIdentity(),
                $previousOutput->getCacheRevisionId(),
                null,
                $previousOutput->getLanguage(),
            );
            $oldPageBundle =
                PageBundleParserOutputConverter::pageBundleFromParserOutput(
                    $previousOutput
                );
        }

        $defaultOptions = [
            'pageBundle' => true,
            'wrapSections' => true,
            'logLinterData' => true,
            'body_only' => false,
            'htmlVariantLanguage' => $htmlVariantLanguage,
            'offsetType' => 'byte',
            'outputContentVersion' => Parsoid::defaultHTMLVersion(),
            'previousOutput' => $oldPageBundle,
            'previousInput' => $oldPageConfig,
            'sampleStats' => $doSample,
            'renderReason' => $options->getRenderReason(),
        ];

        $parserOutput->resetParseStartTime();

        // Nothing in this method reads $headers afterwards; it is initialised
        // explicitly so that an undefined variable is never passed.
        // NOTE(review): presumably an out-parameter of wikitext2html() - confirm.
        $headers = null;

        // This can throw ClientError or ResourceLimitExceededException.
        // Callers are responsible for figuring out how to handle them.
        $pageBundle = $this->parsoid->wikitext2html(
            $pageConfig,
            $defaultOptions,
            $headers,
            $parserOutput
        );
        $parserOutput = PageBundleParserOutputConverter::parserOutputFromPageBundle(
            $pageBundle, $parserOutput
        );

        // Record the page title in dbkey form so that post-cache transforms
        // have access to the title.
        $parserOutput->setExtensionData(
            self::PARSOID_TITLE_KEY,
            Title::newFromLinkTarget( $pageConfig->getLinkTarget() )->getPrefixedDBkey()
        );

        // Register a watcher again because the $parserOutput arg
        // and $parserOutput return value above are different objects!
        $options->registerWatcher( [ $parserOutput, 'recordOption' ] );

        $parserOutput->setFromParserOptions( $options );
        $parserOutput->recordTimeProfile();
        $this->makeLimitReport( $options, $parserOutput );

        $this->recordParseStats( $options, $parserOutput, $previousOutput !== null );

        // Add Parsoid skinning module
        $parserOutput->addModuleStyles( [ 'mediawiki.skinning.content.parsoid' ] );

        // Record Parsoid version in extension data; this allows
        // us to use the onRejectParserCacheValue hook to selectively
        // expire "bad" generated content in the event of a rollback.
        $parserOutput->setExtensionData(
            'core:parsoid-version', Parsoid::version()
        );
        $parserOutput->setExtensionData(
            'core:html-version', Parsoid::defaultHTMLVersion()
        );

        return $parserOutput;
    }

    /**
     * Convert wikitext to HTML
     * Do not call this function recursively.
     *
     * @param string|TextContent $text Text we want to parse
     * @param-taint $text escapes_htmlnoent
     * @param PageReference $page
     * @param ParserOptions $options
     * @param bool $linestart Must be true; false is not yet supported and
     *   fails an assertion.
     * @param bool $clearState Must be true; false is not yet supported and
     *   fails an assertion.
     * @param int|null $revId ID of the revision being rendered. This is used to render
     *  REVISION* magic words. 0 means that any current revision will be used. Null means
     *  that {{REVISIONID}}/{{REVISIONUSER}} will be empty and {{REVISIONTIMESTAMP}} will
     *  use the current timestamp.
     * @param ?ParserOutput $previousOutput The (optional) result of a
     *  previous parse of this page, which can be used for selective update.
     * @return ParserOutput
     * @return-taint escaped
     * @unstable since 1.41
     */
    public function parse(
        $text, PageReference $page, ParserOptions $options,
        bool $linestart = true, bool $clearState = true, ?int $revId = null,
        ?ParserOutput $previousOutput = null
    ): ParserOutput {
        Assert::invariant( $linestart, '$linestart=false is not yet supported' );
        Assert::invariant( $clearState, '$clearState=false is not yet supported' );

        $title = Title::newFromPageReference( $page );
        $lang = $this->getEffectiveLanguage( $options );

        // Only look the revision up when a concrete (non-null, non-zero)
        // revision id was supplied.
        $pageConfig = null;
        if ( $revId !== null && $revId !== 0 ) {
            $pageConfig = $this->pageConfigFactory->create(
                $title,
                $options->getUserIdentity(),
                $revId,
                null, // unused
                $lang // defaults to title page language if null
            );
        }

        $content = null;
        if ( $text instanceof TextContent ) {
            $content = $text;
            $text = $content->getText();
        }

        if ( !( $pageConfig && $pageConfig->getPageMainContent() === $text ) ) {
            // This is a bit awkward!  But we really need to parse $text, which
            // may or may not correspond to the $revId provided!
            // T332928 suggests one solution: splitting the "have revid"
            // callers from the "bare text, no associated revision" callers.
            $revisionRecord = new MutableRevisionRecord( $title );
            if ( $revId !== null ) {
                $revisionRecord->setId( $revId );
            }
            $revisionRecord->setSlot(
                SlotRecord::newUnsaved(
                    SlotRecord::MAIN,
                    $content ?? new WikitextContent( $text )
                )
            );
            $pageConfig = $this->pageConfigFactory->create(
                $title,
                $options->getUserIdentity(),
                $revisionRecord,
                null, // unused
                $lang // defaults to title page language if null
            );
        }

        return $this->genParserOutput( $pageConfig, $options, $previousOutput );
    }

    /**
     * @internal
     *
     * Convert custom wikitext (stored in main slot of the $fakeRev arg) to HTML.
     * Callers are expected NOT to stuff the result into ParserCache.
     *
     * @deprecated since 1.43
     * @param RevisionRecord $fakeRev Revision to parse
     * @param PageReference $page
     * @param ParserOptions $options
     * @return ParserOutput
     * @unstable since 1.41
     */
    public function parseFakeRevision(
        RevisionRecord $fakeRev, PageReference $page, ParserOptions $options
    ): ParserOutput {
        wfDeprecated( __METHOD__, '1.43' );

        $title = Title::newFromPageReference( $page );
        $lang = $this->getEffectiveLanguage( $options );
        $pageConfig = $this->pageConfigFactory->create(
            $title,
            $options->getUserIdentity(),
            $fakeRev,
            null, // unused
            $lang // defaults to title page language if null
        );

        // No previous output is available on this path.
        return $this->genParserOutput( $pageConfig, $options, null );
    }

    /**
     * Set the limit report data in the current ParserOutput.
     * This is ported from Parser::makeLimitReport() and should eventually
     * use the method from the superclass directly.
     *
     * @param ParserOptions $parserOptions
     * @param ParserOutput $parserOutput Output that receives the report entries.
     */
    protected function makeLimitReport(
        ParserOptions $parserOptions, ParserOutput $parserOutput
    ) {
        // The value is not used by this port yet. The call itself is kept
        // because option reads may be observed by watchers registered on
        // ParserOptions - NOTE(review): confirm before removing.
        $parserOptions->getMaxIncludeSize();

        $cpuTime = $parserOutput->getTimeProfile( 'cpu' );
        if ( $cpuTime !== null ) {
            $parserOutput->setLimitReportData( 'limitreport-cputime',
                sprintf( "%.3f", $cpuTime )
            );
        }

        $wallTime = $parserOutput->getTimeProfile( 'wall' );
        $parserOutput->setLimitReportData( 'limitreport-walltime',
            sprintf( "%.3f", $wallTime )
        );

        $parserOutput->setLimitReportData( 'limitreport-timingprofile', [ 'not yet supported' ] );

        // Add other cache related metadata
        $parserOutput->setLimitReportData( 'cachereport-timestamp',
            $parserOutput->getCacheTime() );
        $parserOutput->setLimitReportData( 'cachereport-ttl',
            $parserOutput->getCacheExpiry() );
        $parserOutput->setLimitReportData( 'cachereport-transientcontent',
            $parserOutput->hasReducedExpiry() );
    }
}