File: TextTruncator.php

package info (click to toggle)
mediawiki 1%3A1.35.13-1%2Bdeb11u2
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 274,932 kB
  • sloc: php: 677,563; javascript: 572,709; sql: 11,565; python: 4,447; xml: 3,145; sh: 892; perl: 788; ruby: 496; pascal: 365; makefile: 128
file content (108 lines) | stat: -rw-r--r-- 2,544 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
<?php

namespace TextExtracts;

use MWTidy;

/**
 * This class needs to understand HTML as well as plain text. It tries to not break HTML tags, but
 * might break pairs of tags, leaving unclosed tags behind. We can tidy the output to fix
 * this.
 *
 * @license GPL-2.0-or-later
 */
class TextTruncator {
	/**
	 * @var bool Whether to tidy the output
	 */
	private $useTidy;

	/**
	 * @param bool $useTidy Whether truncated output is passed through MWTidy::tidy()
	 */
	public function __construct( bool $useTidy ) {
		$this->useTidy = $useTidy;
	}

	/**
	 * Returns no more than the given number of sentences
	 *
	 * Sentence ends are detected with a regular expression. When the text contains no
	 * recognizable sentence end at all, the first line of the text is returned instead
	 * (trimmed, but not tidied).
	 *
	 * @param string $text Source text to extract from
	 * @param int $requestedSentenceCount Maximum number of sentences to extract
	 * @return string Empty string when $requestedSentenceCount is zero or negative
	 */
	public function getFirstSentences( $text, $requestedSentenceCount ) {
		if ( $requestedSentenceCount <= 0 ) {
			return '';
		}

		// Based on code from OpenSearchXml by Brion Vibber
		//
		// The non-ASCII terminators are written as \x{...} code point escapes rather than as
		// literal characters. A literal that gets normalized to its ASCII look-alike would turn
		// into a regex metacharacter ("." matches anything, a bare "?" makes the pattern
		// invalid), silently disabling sentence detection.
		$endchars = [
			// regular ASCII: a full stop preceded by anything but an uppercase letter, or a
			// "!" / "?", each followed by a space, a newline or the end of the text
			'\P{Lu}\.(?=[ \n]|$)',
			'[!?](?=[ \n]|$)',
			// full-width ideographic full-stop (U+3002)
			'\x{3002}',
			// double-width roman forms: full stop (U+FF0E), exclamation mark (U+FF01),
			// question mark (U+FF1F)
			'\x{FF0E}',
			'\x{FF01}',
			'\x{FF1F}',
			// half-width ideographic full stop (U+FF61)
			'\x{FF61}',
		];

		// The trailing "+" merges directly adjacent terminators into a single sentence end
		$regexp = '/(?:' . implode( '|', $endchars ) . ')+/u';
		$res = preg_match_all( $regexp, $text, $matches, PREG_OFFSET_CAPTURE );

		if ( !$res ) {
			// No sentence end found (or matching failed): just return the first line
			$lines = explode( "\n", $text, 2 );
			return trim( $lines[0] );
		}

		// Pick the last requested sentence end, or the last one available if there are fewer
		$index = min( $requestedSentenceCount, $res ) - 1;
		// Each match is a pair of [ matched terminator, byte offset where it starts ]
		list( $tail, $length ) = $matches[0][$index];
		// PCRE returns raw offsets, so using substr() instead of mb_substr()
		$text = substr( $text, 0, $length ) . $tail;

		return $this->tidy( $text );
	}

	/**
	 * Returns no more than a requested number of characters, preserving words
	 *
	 * The cut is extended to the end of a word that would otherwise be split, so the result
	 * can be slightly longer than requested. Text that already fits is returned unchanged
	 * (neither tidied nor trimmed).
	 *
	 * @param string $text Source text to extract from
	 * @param int $requestedLength Maximum number of characters to return
	 * @return string Empty string when $requestedLength is zero or negative
	 */
	public function getFirstChars( $text, $requestedLength ) {
		if ( $requestedLength <= 0 ) {
			return '';
		}

		$length = mb_strlen( $text );
		if ( $length <= $requestedLength ) {
			return $text;
		}

		// Every part of this pattern is optional, so it always matches, but possibly only an
		// empty string. It picks up the remainder of a word or tag name that the cut would
		// split, plus a directly following ">".
		$pattern = '/^[\w\/]*>?/su';
		preg_match( $pattern, mb_substr( $text, $requestedLength ), $m );
		// Fall back to no extension if preg_match() failed and left no match behind
		$text = mb_substr( $text, 0, $requestedLength ) . ( $m[0] ?? '' );

		return $this->tidy( $text );
	}

	/**
	 * Optionally tidies the given text, then strips surrounding whitespace
	 *
	 * @param string $text
	 * @return string
	 */
	private function tidy( $text ) {
		if ( $this->useTidy ) {
			$text = MWTidy::tidy( $text );
		}

		return trim( $text );
	}

}