File: TextExtractor.php

package info (click to toggle)
mediawiki 1%3A1.43.3%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: trixie
  • size: 417,464 kB
  • sloc: php: 1,062,949; javascript: 664,290; sql: 9,714; python: 5,458; xml: 3,489; sh: 1,131; makefile: 64
file content (96 lines) | stat: -rw-r--r-- 2,795 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
<?php

namespace MediaWiki\Extension\AbuseFilter;

use MediaWiki\Content\Content;
use MediaWiki\Content\TextContent;
use MediaWiki\Extension\AbuseFilter\Hooks\AbuseFilterHookRunner;
use MediaWiki\Permissions\Authority;
use MediaWiki\Revision\RevisionRecord;

/**
 * This service provides an interface to convert RevisionRecord and Content objects to some text
 * suitable for running abuse filters.
 *
 * @internal No external code should rely on this representation
 */
class TextExtractor {
	public const SERVICE_NAME = 'AbuseFilterTextExtractor';

	/** @var AbuseFilterHookRunner Runner consulted before the default content-to-text conversion */
	private $hookRunner;

	/**
	 * @param AbuseFilterHookRunner $hookRunner hook runner used by contentToString()
	 */
	public function __construct( AbuseFilterHookRunner $hookRunner ) {
		$this->hookRunner = $hookRunner;
	}

	/**
	 * Build some textual representation of a revision.
	 *
	 * This really is just *some* text: there is no guarantee whatsoever that
	 * it resembles what the user actually sees, or that it is fit for any
	 * particular purpose.
	 *
	 * When no revision is given (null), an empty string is returned.
	 *
	 * Currently every slot of the revision is converted with contentToString()
	 * and the results are joined with a blank line between them. Slots whose
	 * content is not available to $performer (getContent() returns null) are
	 * left out. This is meant to be replaced by a better solution in future;
	 * see T208769 for discussion.
	 *
	 * @param RevisionRecord|null $revision the revision to convert, or null
	 * @param Authority $performer to check for privileged access
	 * @return string the text of the revision's slots, or an empty string
	 *        if there is no revision
	 * @return-taint none
	 */
	public function revisionToString( ?RevisionRecord $revision, Authority $performer ): string {
		if ( $revision === null ) {
			return '';
		}

		// Keyed by slot role; implode() below only uses the values, in insertion order.
		$textByRole = [];

		foreach ( $revision->getSlotRoles() as $slotRole ) {
			$slotContent = $revision->getContent(
				$slotRole,
				RevisionRecord::FOR_THIS_USER,
				$performer
			);
			if ( $slotContent !== null ) {
				$textByRole[$slotRole] = $this->contentToString( $slotContent );
			}
		}

		return implode( "\n\n", $textByRole );
	}

	/**
	 * Turn a Content object into a string.
	 *
	 * By default the result is TextContent::getText() when $content is a
	 * TextContent, and Content::getTextForSearchIndex() for anything else.
	 *
	 * Before doing so, the hook runner's onAbuseFilter_contentToString() is
	 * called with the content and a $text variable. If that call returns a
	 * falsy value, the default conversion is skipped and whatever $text holds
	 * at that point is used instead (null becomes an empty string).
	 *
	 * In either case line endings are normalized before returning.
	 *
	 * @param Content $content
	 *
	 * @return string a suitable string representation of the content.
	 */
	public function contentToString( Content $content ): string {
		$text = null;

		$useDefaultConversion = $this->hookRunner->onAbuseFilter_contentToString(
			$content,
			$text
		);

		if ( $useDefaultConversion ) {
			if ( $content instanceof TextContent ) {
				$text = $content->getText();
			} else {
				$text = $content->getTextForSearchIndex();
			}
		}

		// T22310
		return TextContent::normalizeLineEndings( (string)$text );
	}
}