File: ExtractFormatter.php

package info (click to toggle)
mediawiki 1%3A1.35.13-1%2Bdeb11u2
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 274,932 kB
  • sloc: php: 677,563; javascript: 572,709; sql: 11,565; python: 4,447; xml: 3,145; sh: 892; perl: 788; ruby: 496; pascal: 365; makefile: 128
file content (96 lines) | stat: -rw-r--r-- 2,260 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
<?php

namespace TextExtracts;

use DOMElement;
use HtmlFormatter\HtmlFormatter;

/**
 * Provides text-only or limited-HTML extracts of page HTML
 *
 * @license GPL-2.0-or-later
 */
class ExtractFormatter extends HtmlFormatter {
	public const SECTION_MARKER_START = "\1\2";
	public const SECTION_MARKER_END = "\2\1";

	/**
	 * @var bool
	 */
	private $plainText;

	/**
	 * @param string $text Text to convert
	 * @param bool $plainText Whether extract should be plaintext
	 */
	public function __construct( $text, $plainText ) {
		parent::__construct( HtmlFormatter::wrapHTML( $text ) );
		$this->plainText = $plainText;

		$this->setRemoveMedia( true );

		if ( $plainText ) {
			$this->flattenAllTags();
		} else {
			$this->flatten( [ 'a' ] );
		}
	}

	/**
	 * Performs final transformations (such as newline replacement for plaintext
	 * option) and returns resulting HTML.
	 *
	 * @param DOMElement|string|null $element ID of element to get HTML from.
	 * Ignored
	 * @return string Processed HTML
	 */
	public function getText( $element = null ) {
		$this->filterContent();
		$text = parent::getText();
		if ( $this->plainText ) {
			$text = html_entity_decode( $text );
			// replace nbsp with space
			$text = str_replace( "\u{00A0}", ' ', $text );
			// for Windows
			$text = str_replace( "\r", "\n", $text );
			// normalise newlines
			$text = preg_replace( "/\n{3,}/", "\n\n", $text );
		}
		return trim( $text );
	}

	/**
	 * @param string $html HTML string to process
	 * @return string Processed HTML
	 */
	public function onHtmlReady( $html ) {
		if ( $this->plainText ) {
			$html = preg_replace( '/\s*(<h([1-6])\b)/i',
				"\n\n" . self::SECTION_MARKER_START . '$2' . self::SECTION_MARKER_END . '$1',
				$html
			);
		}
		return $html;
	}

	/**
	 * Removes content we've chosen to remove then removes class and style
	 * attributes from the remaining span elements.
	 *
	 * @return array Array of removed DOMElements
	 */
	public function filterContent() {
		$removed = parent::filterContent();

		$doc = $this->getDoc();
		$spans = $doc->getElementsByTagName( 'span' );

		/** @var DOMElement $span */
		foreach ( $spans as $span ) {
			$span->removeAttribute( 'class' );
			$span->removeAttribute( 'style' );
		}

		return $removed;
	}
}