1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177
|
<?php
declare(strict_types=1);
/*
* This file is part of the league/commonmark package.
*
* (c) Colin O'Dell <colinodell@gmail.com>
*
* Original code based on the CommonMark JS reference parser (https://bitly.com/commonmark-js)
* - (c) John MacFarlane
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/
namespace League\CommonMark\Parser;
use League\CommonMark\Environment\EnvironmentInterface;
use League\CommonMark\Node\Block\AbstractBlock;
use League\CommonMark\Node\Inline\AdjacentTextMerger;
use League\CommonMark\Node\Inline\Text;
use League\CommonMark\Parser\Inline\InlineParserInterface;
use League\CommonMark\Reference\ReferenceMapInterface;
/**
 * Drives inline parsing for the contents of a single block.
 *
 * Every inline parser registered in the environment exposes a regular expression describing the text it is
 * interested in. The engine runs all of those expressions against the block contents up front, then walks the
 * resulting positions in order, letting the interested parsers attempt to consume text at each one. Anything no
 * parser claims is appended to the block as plain text.
 *
 * @internal
 */
final class InlineParserEngine implements InlineParserEngineInterface
{
    /** @psalm-readonly */
    private EnvironmentInterface $environment;

    /** @psalm-readonly */
    private ReferenceMapInterface $referenceMap;

    /**
     * One entry per registered inline parser: the parser itself, the regex from its match definition, and whether
     * that regex contains multibyte characters (in which case it must always be run with the "u" modifier).
     *
     * @var array<int, InlineParserInterface|string|bool>
     * @psalm-var list<array{0: InlineParserInterface, 1: non-empty-string, 2: bool}>
     * @phpstan-var array<int, array{0: InlineParserInterface, 1: non-empty-string, 2: bool}>
     */
    private array $parsers = [];

    /**
     * Collects the inline parsers from the environment and pre-computes the data needed to run their regexes.
     *
     * @param EnvironmentInterface  $environment  Source of the inline parsers, delimiter processors and configuration
     * @param ReferenceMapInterface $referenceMap Reference map handed to the inline parser context
     */
    public function __construct(EnvironmentInterface $environment, ReferenceMapInterface $referenceMap)
    {
        $this->environment  = $environment;
        $this->referenceMap = $referenceMap;

        foreach ($environment->getInlineParsers() as $parser) {
            \assert($parser instanceof InlineParserInterface);
            $regex = $parser->getMatchDefinition()->getRegex();

            // A regex whose byte length differs from its character length contains multibyte characters
            $this->parsers[] = [$parser, $regex, \strlen($regex) !== \mb_strlen($regex, 'UTF-8')];
        }
    }

    /**
     * Parses the given (raw) contents into inline nodes which are appended to the given block.
     *
     * @param string        $contents Text to parse; surrounding whitespace is trimmed before parsing
     * @param AbstractBlock $block    Block that receives the resulting inline nodes as children
     */
    public function parse(string $contents, AbstractBlock $block): void
    {
        $contents = \trim($contents);
        $cursor   = new Cursor($contents);

        $inlineParserContext = new InlineParserContext($cursor, $block, $this->referenceMap, $this->environment->getConfiguration()->get('max_delimiters_per_line'));

        // Have all parsers look at the line to determine what they might want to parse and what positions they exist at
        foreach ($this->matchParsers($contents) as $matchPosition => $parsers) {
            $currentPosition = $cursor->getPosition();

            // We've already gone past this point
            if ($currentPosition > $matchPosition) {
                continue;
            }

            // We've skipped over some uninteresting text that should be added as a plain text node
            if ($currentPosition < $matchPosition) {
                $cursor->advanceBy($matchPosition - $currentPosition);
                $this->addPlainText($cursor->getPreviousText(), $block);
            }

            // We're now at a potential start - see which of the current parsers can handle it
            $parsed = false;
            foreach ($parsers as [$parser, $matches]) {
                \assert($parser instanceof InlineParserInterface);
                if ($parser->parse($inlineParserContext->withMatches($matches))) {
                    // A parser has successfully handled the text at the given position; don't consider any others at this position
                    $parsed = true;
                    break;
                }
            }

            if ($parsed) {
                continue;
            }

            // Despite potentially being interested, nothing actually parsed text here, so add the current character and continue onwards
            $this->addPlainText((string) $cursor->getCurrentCharacter(), $block);
            $cursor->advance();
        }

        // Add any remaining text that wasn't parsed
        if (! $cursor->isAtEnd()) {
            $this->addPlainText($cursor->getRemainder(), $block);
        }

        // Process any delimiters that were found
        $delimiterStack = $inlineParserContext->getDelimiterStack();
        $delimiterStack->processDelimiters(null, $this->environment->getDelimiterProcessors());
        $delimiterStack->removeAll();

        // Combine adjacent text nodes into one
        AdjacentTextMerger::mergeChildNodes($block);
    }

    /**
     * Appends plain text to the container, extending the trailing Text node when possible.
     *
     * A trailing Text node carrying "delim" data is left untouched and a new Text node is created instead.
     *
     * @param string        $text      Text to add
     * @param AbstractBlock $container Block receiving the text
     */
    private function addPlainText(string $text, AbstractBlock $container): void
    {
        $lastInline = $container->lastChild();
        if ($lastInline instanceof Text && ! $lastInline->data->has('delim')) {
            $lastInline->append($text);
        } else {
            $container->appendChild(new Text($text));
        }
    }

    /**
     * Given the current line, ask all the parsers which parts of the text they would be interested in parsing.
     *
     * The resulting array provides a list of character positions, which parsers are interested in trying to parse
     * the text at those points, and (for convenience/optimization) what the matching text happened to be.
     *
     * The positions are relative to the start of $contents, so the caller must pass exactly the same string that
     * its cursor iterates over (parse() passes the already-trimmed contents).
     *
     * @param string $contents The text the cursor is iterating over
     *
     * @return array<array<int, InlineParserInterface|string>>
     *
     * @psalm-return array<int, list<array{0: InlineParserInterface, 1: non-empty-array<string>}>>
     *
     * @phpstan-return array<int, array<int, array{0: InlineParserInterface, 1: non-empty-array<string>}>>
     */
    private function matchParsers(string $contents): array
    {
        $isMultibyte = ! \mb_check_encoding($contents, 'ASCII');

        $ret = [];

        foreach ($this->parsers as [$parser, $regex, $isRegexMultibyte]) {
            if ($isMultibyte || $isRegexMultibyte) {
                $regex .= 'u';
            }

            // See if the parser's InlineParserMatch regex matched against any part of the string
            if (! \preg_match_all($regex, $contents, $matches, \PREG_OFFSET_CAPTURE | \PREG_SET_ORDER)) {
                continue;
            }

            // PREG_OFFSET_CAPTURE always returns the byte offset, not the char offset, which is annoying.
            // Matches are reported from left to right, so rather than re-counting the characters from the start
            // of the line for every match, only the bytes since the previous match are counted.
            $countedBytes = 0;
            $countedChars = 0;

            // For each part that matched...
            foreach ($matches as $match) {
                $byteOffset = (int) $match[0][1];

                if ($isMultibyte) {
                    if ($byteOffset < $countedBytes) {
                        // Not expected, but if an offset ever goes backwards, fall back to counting from the start
                        $countedBytes = 0;
                        $countedChars = 0;
                    }

                    if ($byteOffset > $countedBytes) {
                        $countedChars += \mb_strlen(\substr($contents, $countedBytes, $byteOffset - $countedBytes), 'UTF-8');
                        $countedBytes  = $byteOffset;
                    }

                    $offset = $countedChars;
                } else {
                    // Pure ASCII: byte offsets and character offsets are the same thing
                    $offset = $byteOffset;
                }

                // Remove the offsets, keeping only the matched text
                $m = \array_column($match, 0);

                if ($m === []) {
                    continue;
                }

                // Add this match to the list of character positions to stop at
                $ret[$offset][] = [$parser, $m];
            }
        }

        // Sort matches by position so we visit them in order
        \ksort($ret);

        return $ret;
    }
}
|